I have a bunch of URLs (over 50k) in a CSV file from different newspapers. I'm primarily looking for the main headline (<h1>) and the main paragraphs (<p>).
I'm getting an exception that I'm not quite familiar with and don't know how to handle. Here is the message I get back:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/util/connection.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/socket.py", line 745, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 601, in urlopen
chunked=chunked)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 346, in _make_request
self._validate_conn(conn)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 850, in _validate_conn
conn.connect()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 284, in connect
conn = self._new_conn()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/adapters.py", line 440, in send
timeout=timeout
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/util/retry.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.cnn.com', port=443): Max retries exceeded with url: /2019/02/01/us/chicago-volunteer-homeless-cold-trnd/index.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+CNN+-+Top+Stories%29 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Volumes/FELIPE/english_news/pass_news.py", line 24, in <module>
request_to_url = requests.get(urls).text
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 640, in send
history = [resp for resp in gen] if allow_redirects else []
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 640, in <listcomp>
history = [resp for resp in gen] if allow_redirects else []
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 218, in resolve_redirects
**adapter_kwargs
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/adapters.py", line 508, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.cnn.com', port=443): Max retries exceeded with url: /2019/02/01/us/chicago-volunteer-homeless-cold-trnd/index.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+CNN+-+Top+Stories%29 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)))
Here is the code:
import uuid
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

cwd = os.path.dirname(os.path.realpath(__file__))
csv_file = os.path.join(cwd, "csv_data", "data.csv")
text_data = os.path.join(cwd, "raw_text2")

if not os.path.exists(text_data):
    os.makedirs(text_data)

df = pd.read_csv(csv_file)
for link, source in df.iterrows():
    urls = source['Link']
    source_name = source["Source"]
    request_to_url = requests.get(urls).text
    soup = BeautifulSoup(request_to_url, 'html.parser')
    try:
        h = soup.find_all('h1')
        try:
            text_h = h.get_text()
        except AttributeError:
            text_h = ""
        p = soup.find_all('p')
        text_p = ([p.get_text() for p in soup('p')])
        text_bb = str(" ".join(repr(e) for e in text_p))
        source_dir = os.path.join(text_data, source_name)
        try:
            os.makedirs(source_dir)
        except FileExistsError as e:
            pass
        filename = str(uuid.uuid4())
        write = open(os.path.join(source_dir, filename + ".txt"), "w+", encoding="utf-8")
        write.write(text_h + "\n" + text_bb)
        write.close()
        data = pd.Series(text_h + text_bb)
        with open("raw_text.csv", "a") as f:
            data.to_csv(f, encoding="utf-8", header=False, index=None)
    except:
        # Removes all <div> with id "sponsor-slug"
        for child_div in soup.find_all("div", id="sponsor-slug"):
            child_div.decompose()
        # Removes all <p> with class "copyright"
        for child_p in soup.find_all('p', attrs={'class': "copyright"}):
            child_p.decompose()
        # Removes all <a> tags and keeps the content, if any
        a_remove = soup.find_all("a")
        for unwanted_tag in a_remove:
            unwanted_tag.replaceWithChildren()
        # Removes all <span> tags and keeps the content, if any
        span_remove = soup.find_all("span")
        for unwanted_tag in span_remove:
            unwanted_tag.replaceWithChildren()
        # Removes all <em> tags and keeps the content, if any
        em_remove = soup.find_all("em")
        for unwanted_tag in em_remove:
            unwanted_tag.replaceWithChildren()
What is the best way to handle these exceptions? Is it possible to skip a URL when the connection fails and go on to the next one?
I want to crawl and add the content to another CSV file, or add it to the current CSV if possible. At the same time, I want to create different folders for the different sources and add the corresponding text to each folder.
That is basically what this code is doing:
filename = str(uuid.uuid4())
write = open(os.path.join(source_dir, filename + ".txt"), "w+", encoding="utf-8")
write.write(text_h + "\n" + text_bb)
write.close()
data = pd.Series(text_h + text_bb)
with open("raw_text.csv", "a") as f:
    data.to_csv(f, encoding="utf-8", header=False, index=None)
I want to use NLP on each text and later try some sentiment-analysis tools on it.

Before getting the text value of the response in this line:
request_to_url = requests.get(urls).text
you can check whether the link is available or not. I wrote a simple function for this:
import requests

# Open session
s = requests.Session()

page_url = "http://wp.meQ/testBadUrl" # example of a bad URL

def get_response(page_url):
    """Get a good or bad response from page_url."""
    # Create a 'bad' Response object
    bad_resp = requests.Response()
    bad_resp.status_code = 404
    try:
        # By default 'allow_redirects' = True
        good_resp = s.get(page_url, timeout=(3, 10))
        if good_resp.ok:
            return good_resp
        else:
            return bad_resp
    except requests.exceptions.ConnectionError:
        print("Exception! Bad Request for URL: " + page_url)
        return bad_resp
    except requests.exceptions.Timeout:
        print("Exception! Timeout for URL: " + page_url)
        return bad_resp
    except:
        print("Unknown Exception!: " + page_url)
        return bad_resp

page_resp = get_response(page_url)
if page_resp.ok:
    # Your code for good URLs
    print("Append URL into 'GOOD' list")
else:
    # Your code for bad URLs
    print("Skip BAD url here...")
You can also add and handle other requests exceptions (full list here) if you need to.
I hope this helps.
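To tie this back to the original loop: a minimal sketch (assuming the df, the "Link" column, and the parsing code from the question) that simply skips unreachable URLs and moves on to the next row:

import requests

# df is the DataFrame loaded from the CSV, as in the question
for _, source in df.iterrows():
    url = source["Link"]
    try:
        # Fail fast instead of hanging on dead hosts
        resp = requests.get(url, timeout=(3, 10))
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Base class covering ConnectionError, Timeout, HTTPError, etc.
        print("Skipping " + url + ": " + str(e))
        continue  # go to the next URL
    request_to_url = resp.text
    # ... parse with BeautifulSoup and write the files as before ...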
Related
So I keep running into an issue where my code gets an HTTP Error 502: Bad Gateway. I have essentially the same code that works fine, just with a smaller data file. Is it just the amount of data that is causing the problem, or have I created another issue? Thanks for your help!
import pandas
from geopy.geocoders import ArcGIS
nom = ArcGIS(timeout=300)
info = pandas.read_csv('file here')
info['Address'] = info['city'] + ', ' + info['state'] + ', ' + 'USA'
info['Coordinates'] = info['Address'].apply(nom.geocode)
print(info)
Traceback (most recent call last):
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\site-packages\geopy\geocoders\base.py", line 367, in _call_geocoder
page = requester(req, timeout=timeout, **kwargs)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 640, in http_response
response = self.parent.error(
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 502: Bad Gateway
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/roger/PycharmProjects/", line 9, in <module>
info['Coordinates'] = info['Address'].apply(nom.geocode)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 3848, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
File "pandas\_libs\lib.pyx", line 2329, in pandas._libs.lib.map_infer
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\site-packages\geopy\geocoders\arcgis.py", line 197, in geocode
response = self._call_geocoder(url, timeout=timeout)
File "C:\Users\roger\AppData\Local\Programs\Python\Python38\lib\site-packages\geopy\geocoders\base.py", line 389, in _call_geocoder
raise ERROR_CODE_MAP[http_code](message)
geopy.exc.GeocoderServiceError: HTTP Error 502: Bad Gateway
I wound up solving the problem by batching out the data in groups of 50. This was the most I could do without ArcGIS giving me a 502 error. Hope this helps.

import time

info['Address'] = info['city'] + ', ' + info['state'] + ', ' + 'USA'
total_length = len(info)

def make_file(data):
    data.to_csv('file here', mode='a')

query_start = 0
query_end = query_start + 50

while query_start < total_length + 1:
    brevinfo = info[query_start:query_end]
    brevinfo['Coordinates'] = brevinfo['Address'].apply(nom.geocode)
    brevinfo["Latitude"] = brevinfo["Coordinates"].apply(
        lambda x: x.latitude if x is not None else None)
    brevinfo["Longitude"] = brevinfo["Coordinates"].apply(
        lambda x: x.longitude if x is not None else None)
    print(brevinfo)
    make_file(brevinfo)
    query_start += 50
    query_end += 50
    time.sleep(1)
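For what it's worth, the same batching can be written with range stepping instead of tracking query_start/query_end by hand; this is only a sketch of the approach above, assuming info, nom, and the 'file here' placeholder path from the question:

import time

BATCH = 50  # the largest batch size that avoided 502 errors above

for start in range(0, len(info), BATCH):
    chunk = info.iloc[start:start + BATCH].copy()
    chunk['Coordinates'] = chunk['Address'].apply(nom.geocode)
    chunk['Latitude'] = chunk['Coordinates'].apply(
        lambda c: c.latitude if c is not None else None)
    chunk['Longitude'] = chunk['Coordinates'].apply(
        lambda c: c.longitude if c is not None else None)
    chunk.to_csv('file here', mode='a', header=(start == 0))  # append each batch
    time.sleep(1)  # brief pause between batches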
import requests
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
links = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", links)
driver = webdriver.Chrome(chrome_options=chrome_options,
                          executable_path="F:\\automation\\chromedriver.exe")
driver.maximize_window()
driver.get('https://google.com/')

links = driver.find_elements_by_tag_name("a")
for link in links:
    r = requests.head(link.get_attribute('href'))
    print(link.get_attribute('href'), r.status_code, 'front_page')
driver.quit()
I am getting this error:
Traceback (most recent call last):
File "F:/automation/frontpages.py", line 15, in <module>
r = requests.head(link.get_attribute('href'))
File "F:\automation\venv\lib\site-packages\requests\api.py", line 101, in head
return request('head', url, **kwargs)
File "F:\automation\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 524, in request
resp = self.send(prep, **send_kwargs)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 631, in send
adapter = self.get_adapter(url=request.url)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 722, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'mailto:care@pushengage.com'
I also want to export all the links into an HTML sheet when the test case passes.
Why am I getting this error?

Some links are mailto links, which are not HTTP URLs. You are passing a mailto link (mailto:some@somepage.com) to requests.head, which is not an HTTP request, so you get this error.
Filter out those invalid links in find_elements itself:
links = driver.find_elements_by_xpath("//a[not(contains(@href,'mailto'))][contains(@href,'pushengage.com')]")
for link in links:
    r = requests.head(link.get_attribute('href'))
    print(link.get_attribute('href'), r.status_code, 'front_page')
driver.quit()
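Alternatively, you can keep the plain tag-name lookup and filter by URL scheme in Python before calling requests.head; a sketch, assuming links is the element list collected in the question:

from urllib.parse import urlparse

for link in links:
    href = link.get_attribute('href')
    # Skip mailto:, tel:, javascript: and anything else requests can't fetch
    if href is None or urlparse(href).scheme not in ('http', 'https'):
        continue
    r = requests.head(href)
    print(href, r.status_code, 'front_page')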
I'm running the code in a Jupyter notebook. I modified the code from this link so that it runs from the notebook instead of taking console arguments, and iterates over a list of files.
"""Demonstrates how to make a simple call to the Natural Language API."""
import argparse
import requests
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
def print_result(annotations, movie_review_filename):
score = annotations.document_sentiment.score
magnitude = annotations.document_sentiment.magnitude
file_path_split = movie_review_filename.split("/")
fileName = file_path_split[len(file_path_split) - 1][:-4]
sentencelist = []
statuslist = []
for index, sentence in enumerate(annotations.sentences):
sentence_sentiment = sentence.sentiment.score
singlesentence = [fileName, sentence.text.content, sentence.sentiment.magnitude, sentence_sentiment]
sentencelist.append(singlesentence)
outputdf = pd.DataFrame(sentencelist, columns = ['status_id', 'sentence', 'sentence_magnitude', 'sentence_sentiment'])
outputdf.to_csv("/Users/abhi/Desktop/RetrySentenceCSVs/" + fileName + ".csv", index = False)
return 0
def analyze(movie_review_filename):
"""Run a sentiment analysis request on text within a passed filename."""
client = language.LanguageServiceClient()
with open(movie_review_filename, 'r') as review_file:
# Instantiates a plain text document.
content = review_file.read()
document = types.Document(
content=content,
type=enums.Document.Type.PLAIN_TEXT)
annotations = client.analyze_sentiment(document=document)
# Print the results
print_result(annotations, movie_review_filename)
if __name__ == '__main__':
import glob
csv_file_list = glob.glob("/Users/abhi/Desktop/mytxtfilepath/*.txt")
for file in csv_file_list: #Iterate through a list of file paths
analyze(file)
The code runs fine for the first 10% or so of the text files (I have 687), but after a while it starts to throw errors:
ERROR:root:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x113b76588>" raised exception!
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/site-packages/urllib3/connection.py", line 171, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw)
File "/anaconda3/lib/python3.6/site-packages/urllib3/util/connection.py", line 56, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "/anaconda3/lib/python3.6/socket.py", line 745, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py", line 343, in _make_request
self._validate_conn(conn)
File "/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py", line 849, in _validate_conn
conn.connect()
File "/anaconda3/lib/python3.6/site-packages/urllib3/connection.py", line 314, in connect
conn = self._new_conn()
File "/anaconda3/lib/python3.6/site-packages/urllib3/connection.py", line 180, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x113b840b8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/site-packages/requests/adapters.py", line 445, in send
timeout=timeout
File "/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "/anaconda3/lib/python3.6/site-packages/urllib3/util/retry.py", line 398, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='accounts.google.com', port=443): Max retries exceeded with url: /o/oauth2/token (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x113b840b8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/site-packages/google/auth/transport/requests.py", line 120, in __call__
**kwargs)
File "/anaconda3/lib/python3.6/site-packages/requests/sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "/anaconda3/lib/python3.6/site-packages/requests/sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "/anaconda3/lib/python3.6/site-packages/requests/adapters.py", line 513, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='accounts.google.com', port=443): Max retries exceeded with url: /o/oauth2/token (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x113b840b8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/site-packages/grpc/_plugin_wrapping.py", line 77, in __call__
callback_state, callback))
File "/anaconda3/lib/python3.6/site-packages/google/auth/transport/grpc.py", line 77, in __call__
callback(self._get_authorization_headers(context), None)
File "/anaconda3/lib/python3.6/site-packages/google/auth/transport/grpc.py", line 65, in _get_authorization_headers
headers)
File "/anaconda3/lib/python3.6/site-packages/google/auth/credentials.py", line 122, in before_request
self.refresh(request)
File "/anaconda3/lib/python3.6/site-packages/google/oauth2/service_account.py", line 322, in refresh
request, self._token_uri, assertion)
File "/anaconda3/lib/python3.6/site-packages/google/oauth2/_client.py", line 145, in jwt_grant
response_data = _token_endpoint_request(request, token_uri, body)
File "/anaconda3/lib/python3.6/site-packages/google/oauth2/_client.py", line 106, in _token_endpoint_request
method='POST', url=token_uri, headers=headers, body=body)
File "/anaconda3/lib/python3.6/site-packages/google/auth/transport/requests.py", line 124, in __call__
six.raise_from(new_exc, caught_exc)
File "<string>", line 3, in raise_from
google.auth.exceptions.TransportError: HTTPSConnectionPool(host='accounts.google.com', port=443): Max retries exceeded with url: /o/oauth2/token (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x113b840b8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
ERROR:root:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x113b76588>" raised exception!
(...the same chain of tracebacks as above repeats...)
The error repeats itself, then the sentiment analysis runs on a file, then the errors show up again, it runs on another file, and finally it stops with a RendezVous error (I forgot to capture this message). What I'm wondering is: how is it that the code worked on some files, threw error messages, worked a little more, threw error messages again, and then completely stopped working after a point?
I reran the code, only to find that it returns socket.gaierror after some random number of files in a folder. So one can say with a reasonable level of confidence that it is not the file contents that are the issue.
EDIT 1: The files are simply .txt files that have words in them.
Can someone help me resolve this? I can also assure you that all the text in the 680 files amounts to a total of 1400 requests; I've been very meticulous in that calculation, based on the definition of a request according to the Cloud Natural Language API, so I am WELL within my limits.
EDIT 2: I've tried sleep(10), which seems to work fine for a while but then begins throwing errors again.

I figured it out. You're going to have to not read all 600 files at once; instead, read them in batches of 50 (create 12 folders with 50 files each) and manually run the code each time it finishes scanning a folder. I'm not sure WHY this works, but it does.
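If splitting the files into 12 folders by hand gets tedious, the same idea can be scripted; a sketch that feeds analyze() (as defined in the question) the files in batches of 50 with a pause in between, assuming the same glob path:

import glob
import time

csv_file_list = glob.glob("/Users/abhi/Desktop/mytxtfilepath/*.txt")

BATCH = 50
for start in range(0, len(csv_file_list), BATCH):
    for path in csv_file_list[start:start + BATCH]:
        analyze(path)  # analyze() from the question
    # Pause between batches so the auth/token endpoint isn't hammered
    time.sleep(10)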
Hello everyone, I am a beginner Python programmer and I need help.
This is my Python code; it gives an error, please help me fix it:
urllib.error.URLError: urlopen error [Errno 11001] getaddrinfo failed
Python:
# -*- coding: utf-8 -*-
import urllib.request
from lxml.html import parse

WEBSITE = 'http://allrecipes.com'
URL_PAGE = 'http://allrecipes.com/recipes/110/appetizers-and-snacks/deviled-eggs/?page='
START_PAGE = 1
END_PAGE = 5

def correct_str(s):
    return s.encode('utf-8').decode('ascii', 'ignore').strip()

for i in range(START_PAGE, END_PAGE+1):
    URL = URL_PAGE + str(i)
    HTML = urllib.request.urlopen(URL)
    page = parse(HTML).getroot()
    for elem in page.xpath('//*[@id="grid"]/article[not(contains(@class, "video-card"))]/a[1]'):
        href = WEBSITE + elem.get('href')
        title = correct_str(elem.find('h3').text)
        recipe_page = parse(urllib.request.urlopen(href)).getroot()
        print(correct_str(href))
        photo_url = recipe_page.xpath('//img[@class="rec-photo"]')[0].get('src')
        print('\nName: |', title)
        print('Photo: |', photo_url)
When I run this from the command prompt with python, I get this error:
Traceback (most recent call last):
http://allrecipes.com/recipe/236225/crab-stuffed-deviled-eggs/
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1240, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
Name: | Crab-Stuffed Deviled Eggs
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1083, in request
Photo: | http://images.media-allrecipes.com/userphotos/720x405/1091564.jpg
self._send_request(method, url, body, headers)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1128, in _send_request
self.endheaders(body)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1079, in endheaders
self._send_output(message_body)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 911, in _send_output
self.send(msg)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 854, in send
self.connect()
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 826, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\socket.py", line 693, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\socket.py", line 732, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/Ivan/Dropbox/parser/test.py", line 27, in <module>
recipe_page = parse(urllib.request.urlopen(href)).getroot()
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\Ivan\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1242, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
Process finished with exit code 1
I'll attempt to explain three main ways to dig into a programming problem:
(1) Use a debugger. You could walk through your code and examine variables before they are used and before they throw an exception. Python comes with pdb. In this problem you would step through the code and print out the href before urlopen().
(2) Assertions. Use Python's assert to assert assumptions in your code. You could, for example, assert not href.startswith('http')
(3) Logging. Log relevant variables before they are used. This is what I used:
I added the following to your code...
href = WEBSITE + elem.get('href')
print(href)
And got...
Photo: | http://images.media-allrecipes.com/userphotos/720x405/1091564.jpg
http://allrecipes.comhttp://dish.allrecipes.com/how-to-boil-an-egg/
From here you can see your getaddrinfo problem: your system is trying to open a URL at a host named allrecipes.comhttp.
This looks to be a problem with your assumption that WEBSITE must be prepended to every href you pull from the HTML.
You can handle the case of an absolute vs. relative href with something like this, using a function to determine whether the URL is absolute:
from urllib.parse import urlparse

def is_absolute(url):
    # See https://stackoverflow.com/questions/8357098/how-can-i-check-if-a-url-is-absolute-using-python
    return bool(urlparse(url).netloc)

href = elem.get('href')
if not is_absolute(href):
    href = WEBSITE + href
A better way to do this is to use urljoin:
from urllib import parse

href = parse.urljoin(WEBSITE, href)
This will return a complete URL for the href in case it is not complete.
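For example, a quick sanity check of how urljoin behaves with relative vs. absolute hrefs:

from urllib import parse

WEBSITE = 'http://allrecipes.com'

# Relative href: joined onto the base
print(parse.urljoin(WEBSITE, '/recipe/236225/crab-stuffed-deviled-eggs/'))
# http://allrecipes.com/recipe/236225/crab-stuffed-deviled-eggs/

# Absolute href: returned unchanged, no double prefix
print(parse.urljoin(WEBSITE, 'http://dish.allrecipes.com/how-to-boil-an-egg/'))
# http://dish.allrecipes.com/how-to-boil-an-egg/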
I am not a Python guru; I am just writing code to check my API authentication and URL access status. I want to make sure my API and domain URLs are accessible to users.
For that reason I am writing a Python script that can check them, so a cron job can send an alert to me.
Here is my code:
import argparse
import logging
import sys
import urllib.error
import urllib.request
from socket import timeout

# NOTE: the 'status' map is not shown in the original post; a Nagios-style
# mapping is assumed here so the functions below have something to return.
status = {'OK': 0, 'WARNING': 1, 'CRITICAL': 2, 'UNKNOWN': 3}

def check(argv):
    # I'm going to use argparse. It makes
    # command-line args a breeze.
    parser = argparse.ArgumentParser()
    parser.add_argument('-H', '--hostname', dest='hostname', required=True)
    parser.add_argument('-a', '--auth_id', dest='authenticationid')
    parser.add_argument('-t', '--auth_token', dest='authenticationtoken')
    parser.add_argument('-r', '--dest_url', dest='dest_url', help="""Path to report relative to root, like /v1/ OR /""", required=True)
    parser.add_argument("-q", "--quiet", action="store_false", dest="verbose", default=True,
                        help="don't print status messages to stdout")
    args = vars(parser.parse_args())
    if args['authenticationid'] and args['authenticationtoken'] and not len(sys.argv) == 7:
        authurl = urllib.request.Request('https://{%s}:{%s}@%s%s/%s/' % (args['authenticationid'], args['authenticationtoken'], args['hostname'], args['dest_url'], args['authenticationid']))
        return (getAuthResponseCode(authurl))
    else:
        url = urllib.request.Request("https://%s%s" % (args['hostname'], args['dest_url']))
        return (getResponseCode(url))

def getResponseCode(url):
    try:
        conn = urllib.request.urlopen(url, timeout=10)
        code = conn.getcode()
        return (status['OK'], code)
    except timeout:
        return (status['WARNING'], logging.error('socket timed out - URL %s', url))
    except urllib.error.URLError as e:
        return (status['CRITICAL'], e.reason)
    else:
        return (status['UNKNOWN'])

def getAuthResponseCode(authurl):
    try:
        authconn = urllib.request.urlopen(authurl, timeout=10)
        authcode = authconn.getcode()
        return (status['OK'], authcode)
    except timeout:
        return (status['WARNING'], logging.error('socket timed out - URL %s'))
    except urllib.error.URLError as err:
        return (status['CRITICAL'], err.reason)
    else:
        return (status['UNKNOWN'])
ERROR Message:
G:\Python>python check_http.py -H api.mydomain.com -r /API/Function/ -a 'MAMZMZZGVLMGFMNTHIYTREETBESSS' -t 'DafniisfnsifnsifsbANBBDSDNBISDExODZlODAwMmZm'
Traceback (most recent call last):
File "C:\Python33\lib\http\client.py", line 770, in _set_hostport
port = int(host[i+1:])
ValueError: invalid literal for int() with base 10: "{'DafniisfnsifnsifsbANBBDSDNBISDExODZlODAwMmZm'}@api.mydomain.com"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "check_http.py", line 76, in <module>
print (check(sys.argv[1:]))
File "check_http.py", line 41, in check
return (getAuthResponseCode(authurl))
File "check_http.py", line 61, in getAuthResponseCode
authconn = urllib.request.urlopen(authurl, timeout=10)
File "C:\Python33\lib\urllib\request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "C:\Python33\lib\urllib\request.py", line 469, in open
response = self._open(req, data)
File "C:\Python33\lib\urllib\request.py", line 487, in _open
'_open', req)
File "C:\Python33\lib\urllib\request.py", line 447, in _call_chain
result = func(*args)
File "C:\Python33\lib\urllib\request.py", line 1283, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Python33\lib\urllib\request.py", line 1219, in do_open
h = http_class(host, timeout=req.timeout, **http_conn_args)
File "C:\Python33\lib\http\client.py", line 1172, in __init__
source_address)
File "C:\Python33\lib\http\client.py", line 749, in __init__
self._set_hostport(host, port)
File "C:\Python33\lib\http\client.py", line 775, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
http.client.InvalidURL: nonnumeric port: '{'DafniisfnsifnsifsbANBBDSDNBISDExODZlODAwMmZm'}@api.mydomain.com'
I know this is not a "write my code for me" forum, but I am helpless and looking for some help. If someone can tell me where exactly I went wrong, I can fix it.
I am using Python 3.
You're passing 'https://user:pass@whatever' as the URL.
Python doesn't understand that you're trying to authenticate, and thinks you're passing 'https://domain:port...'.
To do basic auth with urllib, you need to use a urllib.request.HTTPBasicAuthHandler.
Sorry I'm not posting links and/or example code, but I'm typing this on my phone, and that makes them a pain.
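For reference, a minimal sketch of what that looks like (the hostname, path, and credentials here are placeholders, not values from the question):

import urllib.request

host = 'api.mydomain.com'  # placeholder
url = 'https://%s/API/Function/' % host

# Register the credentials instead of embedding them in the URL
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, 'AUTH_ID', 'AUTH_TOKEN')  # placeholders

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(handler)

with opener.open(url, timeout=10) as conn:
    print(conn.getcode())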