Why I am getting the below mentioned error? - python-3.x

import requests
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
links = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", links)
driver = webdriver.Chrome(chrome_options=chrome_options,
executable_path="F:\\automation\\chromedriver.exe")
driver.maximize_window()
driver.get('https://google.com/')
links = driver.find_elements_by_tag_name("a")
for link in links:
r = requests.head(link.get_attribute('href'))
print(link.get_attribute('href'), r.status_code, 'front_page')
driver.quit()
I am getting this error:
Traceback (most recent call last):
File "F:/automation/frontpages.py", line 15, in <module>
r = requests.head(link.get_attribute('href'))
File "F:\automation\venv\lib\site-packages\requests\api.py", line 101, in head return request('head', url, **kwargs)
File "F:\automation\venv\lib\site-packages\requests\api.py", line 60, in request return session.request(method=method, url=url, **kwargs)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 524, in request resp = self.send(prep, **send_kwargs)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 631, in send adapter = self.get_adapter(url=request.url)
File "F:\automation\venv\lib\site-packages\requests\sessions.py", line 722, in get_adapter raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'mailto:care#pushengage.com'
and I want to export all the links into the HTML sheet when the test case passed
Why I am getting this error?

Some link has mailto link which is not http url .You are trying to use an mailto link(mailto:some#somepage.com) into r = requests.head. Which is not an http request, so your getting this error.
Filter the those invalid links in findelements itself.
links = driver.find_elements_by_xpath("//a[not(contains(#href,'mailto'))][contains(#href,'pushengage.com')]")
for link in links:
r = requests.head(link.get_attribute('href'))
print(link.get_attribute('href'), r.status_code, 'front_page')
driver.quit()

Related

How to get url instead of base64 value from img tags

I'm quite new regarding to the web realm and in an attempt to make my way into it today I started using Beautiful Soup and requests module. Program was going well until code execution gets on (line 78) whereon an error is raised because the source value of img tags is retrieved as base64 format. So far, I'm aware that this could be overcome by just encoding it to ascii then decoding it with base64 decoder, but the thing is that i want it as an URL value. How should i go about it?
NOTES:
(Just in case, one never knows)
python version: 3.8.5
lxml version 4.6.2
beautifulsoup4 4.9.3
ERROR
Traceback (most recent call last):
File "/home/l0new0lf/Documents/Projects/Catalog Scraping/scrape.py", line 72, in <module>
webdata['images'].append(requests.get(game_tables[j].select_one(IMG_TAG_SELECTOR)['src']).content)
File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for
''
Any help is welcome, many thanks in advance!!!
CODE
#CSS SELECTORS FOR LOOKUPS
TITLE_TAG_SELECTOR = 'tr:first-child td.ta14b.t11 div a strong'
IMG_TAG_SELECTOR = 'tr:last-child td:first-child a img'
DESCRIPTION_TAG_SELECTOR = 'tr:last-child td:last-child p'
GENRES_TAG_SELECTOR = 'tr:last-child td:last-child div.mt05 p :last-child'
GAME_SEARCH_RESULTS_TABLE_SELECTOR = 'table.mt1.tablestriped4.froboto_real.blanca'
#CSS CLASS ATTRIBUTE
GAME_TABLE_CLASS = 'table transparente tablasinbordes'
rq = requests.get(
f'https://vandal.elespanol.com/juegos/13/pc/letra/a/inicio/1')
soup = BeautifulSoup(rq.content, 'lxml')
main_table = soup.select_one(GAME_SEARCH_RESULTS_TABLE_SELECTOR)
print('main_table:', main_table)
game_tables = main_table.find_all('table', {'class': GAME_TABLE_CLASS})
print('game_tables', game_tables)
#help(game_tables)
webdata = {
'titles' : [],
'images' : [],
'descriptions' : [],
'genres': [],
}
for j in range(len(game_tables)):
webdata['titles'].append(game_tables[j].select_one(TITLE_TAG_SELECTOR).text)
print(game_tables[j].select_one(IMG_TAG_SELECTOR)['src'])
webdata['images'].append(requests.get(game_tables[j].select_one(IMG_TAG_SELECTOR)['src']).content)
webdata['descriptions'].append(game_tables[j].select_one(DESCRIPTION_TAG_SELECTOR))
webdata['genres'].append(game_tables[j].select_one(GENRES_TAG_SELECTOR))
print(titles, images, descriptions, genres, sep='\n')

Downloading a csv file with python

I'm trying to download historical stock prices from Yahoo Finance using Python using the following code:
import urllib.request
import ssl
import os
url = 'https://query1.finance.yahoo.com/v7/finance/download/%5ENSEI?period1=1537097203&period2=1568633203&interval=1d&events=history&crumb=0PVssBOEZBk'
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
connection = urllib.request.urlopen(url,context = ctx)
data = connection.read()
with urllib.request.urlopen(url) as testfile, open('data.csv', 'w') as f:
f.write(testfile.read().decode())
however, I'm getting a traceback as mentioned below:
Traceback (most recent call last):
File "C:/Users/jmirand/Desktop/test.py", line 11, in <module>
connection = urllib.request.urlopen(url,context = ctx)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 401: Unauthorized
I assume this has to do with the fact that it's because of the HTTPS and Python doesn't have enough certificates to put into it by default.
The webpage im on is here Yahoo Finance NSEI historical prices and its on the 'Download Data' tab that you click on where the data automatically gets downloaded through a csv file.
Can you please help in rectifying the code?
The yahoo api expects cookies from your browser to authenticate. I have copied the cookies from my browser and passed them through python requests
import requests
import csv
url = "https://query1.finance.yahoo.com/v7/finance/download/%5ENSEI?period1=1537099135&period2=1568635135&interval=1d&events=history&crumb=MMDwV5mvf2J"
cookies = {
"APID":"UP26c2bef4-bc0b-11e9-936a-066776ea83e8",
"APIDTS":"1568635136",
"B":"d10v5dhekvhhg&b=3&s=m6",
"GUC":"AQEBAQFda2VeMUIeqgS6&s=AQAAAICdZvpJ&g=XWoXdg",
"PRF":"t%3D%255ENSEI",
"cmp":"t=1568635133&j=0",
"thamba":"2"
}
with requests.Session() as s:
download = s.get(url,cookies=cookies)
decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
for row in my_list:
print(row)

Request Error when trying to download images from Netflix

Ok, I'm new with python and I'm intended to learn a little bit about webscraping which is even harder for me since I don't know anything about Web, JS, HTML etc. My idea was to download some images available on the netflix catalogue. And that even works fine for the first 5 or 6 images
import requests, os
from bs4 import BeautifulSoup
url = "https://www.netflix.com/br/browse/genre/839338"
page = requests.get(url)
page.raise_for_status()
soup = BeautifulSoup(page.text)
img_element_list= soup.select('a img')
print(f'Images avalaible (?) : {len(img_element_list)} ')
quantity = int(input('How many images: '))
for c in range(quantity):
name = img_element_list[c].get('alt')
print('Downloading ' + name + ' image...')
img_response= img_element_list[c].get('src')
print('SCR: ' + img_response + '\n\n')
img = requests.get(img_response)
file = os.path.join('Images', name)
img_file = open(file+'.jpg', 'wb')
for chunk in img.iter_content(100000):
img_file.write(chunk)
img_file.close()
But in the end after the forth or fifth image's been downloaded, the scr for the subsequent images gets like this '' and then it raises this error:
Traceback (most recent call last):
File "C:\Users\1513 IRON\PycharmProjects\DownloadNetflixImg.py", line 20, in <module>
img = requests.get(img_response)
File "C:\Users\1513 IRON\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\1513 IRON\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\1513 IRON\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\1513 IRON\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "C:\Users\1513 IRON\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for ''

Handling Exceptions on requests

I have a bunch of URL's (over 50k) in a CSV file from different Newspapers. I primarily looking for the main headline <h1> and the main paragraphs <p>.
I'm getting an exception that I'm not quite familiar with or don't know how to handle. Her is the message I get back:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/util/connection.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/socket.py", line 745, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 601, in urlopen
chunked=chunked)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 346, in _make_request
self._validate_conn(conn)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 850, in _validate_conn
conn.connect()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 284, in connect
conn = self._new_conn()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connection.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/adapters.py", line 440, in send
timeout=timeout
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/urllib3/util/retry.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.cnn.com', port=443): Max retries exceeded with url: /2019/02/01/us/chicago-volunteer-homeless-cold-trnd/index.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+CNN+-+Top+Stories%29 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Volumes/FELIPE/english_news/pass_news.py", line 24, in <module>
request_to_url = requests.get(urls).text
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 640, in send
history = [resp for resp in gen] if allow_redirects else []
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 640, in <listcomp>
history = [resp for resp in gen] if allow_redirects else []
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 218, in resolve_redirects
**adapter_kwargs
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/adapters.py", line 508, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.cnn.com', port=443): Max retries exceeded with url: /2019/02/01/us/chicago-volunteer-homeless-cold-trnd/index.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+CNN+-+Top+Stories%29 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x118e1a6a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)))
Her is the code:
import uuid
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
cwd = os.path.dirname(os.path.realpath(__file__))
csv_file = os.path.join(cwd, "csv_data", "data.csv")
text_data = os.path.join(cwd, "raw_text2")
if not os.path.exists(text_data):
os.makedirs(text_data)
df = pd.read_csv(csv_file)
for link, source in df.iterrows():
urls = source['Link']
source_name = source["Source"]
request_to_url = requests.get(urls).text
soup = BeautifulSoup(request_to_url, 'html.parser')
try:
h = soup.find_all('h1')
try:
text_h = h.get_text()
except AttributeError:
text_h = ""
p = soup.find_all('p')
text_p = ([p.get_text() for p in soup('p')])
text_bb = str(" ".join(repr(e) for e in text_p))
source_dir = os.path.join(text_data, source_name)
try:
os.makedirs(source_dir)
except FileExistsError as e:
pass
filename = str(uuid.uuid4())
write = open(os.path.join(source_dir, filename + ".txt"), "w+", encoding="utf-8")
write.write(text_h + "\n" + text_bb)
write.close()
data = pd.Series(text_h + text_bb)
with open("raw_text.csv", "a") as f:
data.to_csv(f, encoding="utf-8", header=False, index=None)
except:
# Removes all <div> with id "sponsor-slug"
for child_div in soup.find_all("div", id="sponsor-slug"):
child_div.decompose()
# Remove all <p> with class "copyright"
for child_p in soup.find_all('p', attrs={'class': "copyright"}):
child_p.decompose()
# Removes all <a> tags an keeps the content if any
a_remove = soup.find_all("a")
for unwanted_tag in a_remove:
unwanted_tag.replaceWithChildren()
# Removes all <span> content and keeps content if any
span_remove = soup.find_all("span")
for unwanted_tag in span_remove:
unwanted_tag.replaceWithChildren()
# Removes all <em> content and keeps content if any
span_remove = soup.find_all("em")
for unwanted_tag in span_remove:
unwanted_tag.replaceWithChildren()
What is the best way of handling these exceptions?
Is it possible to ignore the connection if not possible and go to the next URL?
I want to crawl and add the content into another CSV file or add them to the current CSV if possible. At the same time create different folders with the different sources and add the corresponding text to that folder.
Its basically what this code is doing:
filename = str(uuid.uuid4())
write = open(os.path.join(source_dir, filename + ".txt"), "w+", encoding="utf-8")
write.write(text_h + "\n" + text_bb)
write.close()
data = pd.Series(text_h + text_bb)
with open("raw_text.csv", "a") as f:
data.to_csv(f, encoding="utf-8", header=False, index=None)
I want to use NLP on each text and later try to use some sentiment analyzing tools on the text.
Before getting the text value of response, in this line:
request_to_url = requests.get(urls).text
You can check if link is available or NOT. I wrote simple function for this action:
import requests
# Open session
s = requests.Session()
page_url = "http://wp.meQ/testBadUrl" # example of bad URL
def get_response(page_url):
""" Get good or bad response from page_url"""
# Create 'bad' Response object
bad_resp = requests.Response()
bad_resp.status_code = 404
try:
# By default 'allow_redirects' = True
good_resp = s.get(page_url, timeout=(3, 10))
if good_resp.ok:
return good_resp
else:
return bad_resp
except requests.exceptions.ConnectionError:
print("Exception! Bad Request for URL: " + page_url)
return bad_resp
except requests.exceptions.Timeout:
print("Exception! Timeout for URL: " + page_url)
return bad_resp
except:
print("Unknown Exception!: " + page_url)
return bad_resp
page_resp = get_response(page_url)
if page_resp.ok:
# Your code for good URLs
print("Append URL into 'GOOD' list")
else:
# Your code for bad URLs
print("Skip BAD url here...")
You can also add and handle different requests exceptions (full list here), if you need.
I hope it will help you.

No connection adapters were found for Python3-Requests

I am using beautiful soup with requests package in python3 for web scraping. This is my code.
import csv
from datetime import datetime
import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup
quote_page = ['http://10.69.161.179:8080'];
data = []
page = requests.get(quote_page)
soup = BeautifulSoup(page.content,'html.parser')
name_box = soup.find('div', attrs={'class':'caption span10'})
name= name_box.text.strip() #strip() is used to remove starting and ending
print(name);
data.append(name)
with open('sample.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([name])
print ("Success");
When I execute the above code I'm getting the following error.
Traceback (most recent call last):
File "first_try.py", line 21, in <module>
page = requests.get(quote_page);
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\api.py", line 70, in get
return request('get', url, params=params, **kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 488, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 603, in send
adapter = self.get_adapter(url=request.url)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 685, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '['http://10.69.161.179:8080/#/main/dashboard/metrics']'
Can anyone help me with this? :(
Because requests.get() only accept url schema in string format. You need to unpack string inside the list [] .
quote_page = ['http://10.69.161.179:8080']
for url in quote_page:
page = requests.get(url)
.....
By the way , though semicolon is harmless under following statement, you should avoid it unless you need it for some reason
quote_page = ['http://10.69.161.179:8080'];

Resources