WinError 10060 when trying to save scraped data to csv file - python-3.x

I am trying to save a buch of scraped data to a csv file with python, but I am keep getting time out errors, don't really know how to go about it, what Should I do?
what I have tried
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup as b
npo_codici ={}
codici_no = 0
df = pd.read_excel("led-italia_lampadine.xlsx","foglio1")
sku = df["codice SKU"].tolist()
rounded_sku = [round(x) for x in sku]
request_length = len(rounded_sku)
for e in range(request_length):
request = str(rounded_sku[e])
e+=1
base_url = "https://v-tac.it/led-products-results-page/?q="
url = base_url + request
html = urllib.request.urlopen(url).read()
soup = b(html, "html.parser")
these are the errors that I am getting
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\antonella\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1319, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [WinError 10060]

because of network problem , maybe try with headers or check out this Why can't I get Python's urlopen() method to work on Windows?

Related

How to Generate Blob SAS URL for a excel file stored in Blob Container

I am trying to generate Blob SAS URL for a excel file to read its data in Data frames.
I am using below python code which throws an error while passing the URL value to read_excel function
"HTTPError: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature."
Code :
from azure.storage.blob import generate_blob_sas
from azure.storage.blob import BlobServiceClient, ResourceTypes, AccountSasPermissions
from datetime import datetime, timedelta,date
import pandas as pd
blob_name=<Blobname>
account_name=<accountname>
account_key=<accountkey>
container_name=<blobname>
sas_blob = generate_blob_sas(account_name=account_name,
container_name=container_name,
blob_name=blob_name,
account_key=account_key,
resource_types=ResourceTypes(object=True),
permission=AccountSasPermissions(read=True),
expiry=datetime.utcnow() + timedelta(hours=1))
blob = generate_blob_sas(account_name,account_key, container_name, blob_name,sas_blob)
blob_service_client = BlobServiceClient(account_url="https://<account_name>.blob.core.windows.net", credential=sas_blob)
url = 'https://'+account_name+'.blob.core.windows.net/'+container_name+'/'+blob_name+'?'+sas_blob
print(url)
df=pd.read_excel(url, sheet_name='test',usecols=(cols),header=6)
Error
Failed
C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\azure\storage\blob\baseblobservice.py:1009: SyntaxWarning: "is not" with a literal. Did you mean "!="? if lease_duration is not -1 and \C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\azure\storage\blob\baseblobservice.py:2660: SyntaxWarning: "is not" with a literal. Did you mean "!="? if lease_duration is not -1 and \C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\azure\storage\common_connection.py:82: SyntaxWarning: "is" with a literal. Did you mean "=="? self.protocol = self.protocol if parsed_url.scheme is '' else parsed_url.schemeTraceback (most recent call last): File "C:\Temp\rid04ztb.tl0\005b3440-f226-432b-b554-d625411fdb58", line 26, in df=pd.read_excel(url, sheet_name='test',usecols=(cols),header=6) File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\util_decorators.py", line 299, in wrapper return func(*args, **kwargs) File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\excel_base.py", line 336, in read_excel io = ExcelFile(io, storage_options=storage_options, engine=engine) File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\excel_base.py", line 1071, in init ext = inspect_excel_format( File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\excel_base.py", line 949, in inspect_excel_format with get_handle( File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\common.py", line 558, in get_handle ioargs = _get_filepath_or_buffer( File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\common.py", line 289, in _get_filepath_or_buffer req = urlopen(filepath_or_buffer) File "C:\WPy64-3800\python-3.8.0.amd64\lib\site-packages\pandas\io\common.py", line 195, in urlopen return urllib.request.urlopen(*args, **kwargs) File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 222, in urlopen return opener.open(url, data, timeout) File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 531, in open response = meth(req, response) File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 640, in http_response response = self.parent.error( File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 569, in error return self._call_chain(*args) File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 502, in _call_chain result = func(*args) File "C:\WPy64-3800\python-3.8.0.amd64\lib\urllib\request.py", line 649, in http_error_default raise HTTPError(req.full_url, code, msg, hdrs, fp)urllib.error.HTTPError: HTTP Error 403: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
Any help appreciated. Thanks in advance.
I believe you're getting this error is because you're mixing a service SAS with account SAS. You don't need resource_types in your generate_blob_sas method and also the permission type should be BlobSasPermissions.
Please try the following code:
from azure.storage.blob import generate_blob_sas
from azure.storage.blob import BlobServiceClient, ResourceTypes, BlobSasPermissions
from datetime import datetime, timedelta,date
import pandas as pd
blob_name=<Blobname>
account_name=<accountname>
account_key=<accountkey>
container_name=<blobname>
sas_blob = generate_blob_sas(account_name=account_name,
container_name=container_name,
blob_name=blob_name,
account_key=account_key,
permission=BlobSasPermissions(read=True),
expiry=datetime.utcnow() + timedelta(hours=1))

Reading a "special" URL

The task is simple- I want to tranfer a html file from an URL to a variable and read the feed below:
How can I read the contents of an URL with Python?
All that works well except with the url = "https://www.goyax.de/"
with
import urllib
#fp = urllib.request.urlopen("https://www.spiegel.de/")
fp = urllib.request.urlopen("https://www.goyax.de/")
print("Result code: " + str(fp.getcode()))
print("Returned data: -----------------")
data = fp.read().decode("utf-8")
print(data)
I get only "403" and "Forbidden". Also with
import requests
url = 'https://www.goyax.de/'
#url = 'https://www.spiegel.de'
r = requests.get(url)
tt = r.text
print(tt)
I don't get an improvement. With other URLs both solutions work well so far.
Until now I was using an Autohotkey script (UrlDownloadToFile) (Windows only) and tried it also with Octave (s = urlread("https://www.goyax.de/")) where I get the right result and no error message. the scripts ae running sicne years on a PC but I want to move this task to a Raspberry Pi. Because of that I was learning Python
The output / error messages:
fp = urllib.request.urlopen("http://www.goyax.de/")
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 640, in http_response
response = self.parent.error(
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 563, in error
result = self._call_chain(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 755, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 640, in http_response
response = self.parent.error(
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: Forbidden
Well, I found the answer myself or with some help of a friend: The key is to the soluition is to set the user agent.
Solution 1) (with "Requests")
import requests
r = requests.get(url2, headers={"User-Agent":'Mozilla/5.0'}, timeout=25)
mystr=r.text # mybytes=r.content
print(mystr)
Solution 2) (with "urllib.request" + "CookieJar")
import urllib.request
from http.cookiejar import CookieJar
req = urllib.request.Request(url2, None, {"User-Agent":'Mozilla/5.0'})
# instead of #req = urllib.request.Request(url2)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(CookieJar()))
response = opener.open(req)
content = response.read()
mystr2 = content.decode("utf8")
print(mystr2)
Usually the user agent 'Mozilla/5.0' is sufficient. Or check
https://manytools.org/http-html-text/user-agent-string/
and
https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/
for a real user agent string.

urllib.error.HTTPError: HTTP Error 404: Not Found when using request.urlopen()

I was following a tutorial and when using request.urlopen(url) I get an error, I have tried checking the URL
(https://www.wsj.com/market-data/quotes/PH/XPHS/JFC/historical-prices/download?MOD_VIEW=page&num_rows=150&range_days=150&startDate=06/01/2020&endDate=07/05/2020)
and it's fine.
Here is my code:
from urllib import request
import datetime
def download_stock_from_day_until_today(stock_code, start_date):
current_day = datetime.date.today()
formatted_current_day = datetime.date.strftime(current_day, "%m/%d/%Y") #formats today's date for links
#formatted url
url = "https://www.wsj.com/market-data/quotes/PH/XPHS/"+ stock_code +"/historical-prices/download?MOD_VIEW=page&num_rows=150&range_days=150&startDate="+ start_date +"&endDate=" + formatted_current_day
print(url)
response = request.urlopen(url) #requests the csv file
csv = response.read() #reads the csv file
csv_str = str(csv)
lines = csv_str.split("\\n")
dest_url = r'asd.csv'
fx = open(dest_url, "w")
for line in lines:
fx.write(line + "\n")
fx.close()
download_stock_from_day_until_today("JFC", "06/01/2020")
and the error I get in the console is:
Traceback (most recent call last):
File "C:/Users/Lathrix/PycharmProject/StockExcelDownloader/main.py", line 23, in <module>
download_stock_from_day_until_today("JFC", "06/01/2020")
File "C:/Users/Lathrix/PycharmProject/StockExcelDownloader/main.py", line 12, in download_stock_from_day_until_today
response = request.urlopen(url) #requests the csv file
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 640, in http_response
response = self.parent.error(
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\Lathrix\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Looks like wsj.com does not like urllib's User-Agent.
With the line   
response = request.urlopen(request.Request(url,headers={'User-Agent': 'Mozilla/5.0'}))
your code works correctly

Downloading a csv file with python

I'm trying to download historical stock prices from Yahoo Finance using Python using the following code:
import urllib.request
import ssl
import os
url = 'https://query1.finance.yahoo.com/v7/finance/download/%5ENSEI?period1=1537097203&period2=1568633203&interval=1d&events=history&crumb=0PVssBOEZBk'
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
connection = urllib.request.urlopen(url,context = ctx)
data = connection.read()
with urllib.request.urlopen(url) as testfile, open('data.csv', 'w') as f:
f.write(testfile.read().decode())
however, I'm getting a traceback as mentioned below:
Traceback (most recent call last):
File "C:/Users/jmirand/Desktop/test.py", line 11, in <module>
connection = urllib.request.urlopen(url,context = ctx)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\jmirand\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 401: Unauthorized
I assume this has to do with the fact that it's because of the HTTPS and Python doesn't have enough certificates to put into it by default.
The webpage im on is here Yahoo Finance NSEI historical prices and its on the 'Download Data' tab that you click on where the data automatically gets downloaded through a csv file.
Can you please help in rectifying the code?
The yahoo api expects cookies from your browser to authenticate. I have copied the cookies from my browser and passed them through python requests
import requests
import csv
url = "https://query1.finance.yahoo.com/v7/finance/download/%5ENSEI?period1=1537099135&period2=1568635135&interval=1d&events=history&crumb=MMDwV5mvf2J"
cookies = {
"APID":"UP26c2bef4-bc0b-11e9-936a-066776ea83e8",
"APIDTS":"1568635136",
"B":"d10v5dhekvhhg&b=3&s=m6",
"GUC":"AQEBAQFda2VeMUIeqgS6&s=AQAAAICdZvpJ&g=XWoXdg",
"PRF":"t%3D%255ENSEI",
"cmp":"t=1568635133&j=0",
"thamba":"2"
}
with requests.Session() as s:
download = s.get(url,cookies=cookies)
decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
for row in my_list:
print(row)

Unable to run this Python script in Command Prompt for web scraping

Been following a web scraping tutorial on youtube and I'm getting this error due to the url. It's working perfectly in Spyder but all hell breaks loose when I try to run it in the Command Prompt.
I tried various answers from here (quote() that converts ":" to "%3A" and a few others like urlencode()) but it didn't work. Or may be I don't know how to make them work.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
#from urllib.parse import quote
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
# opening connection, grabbing information and closing connection
uClient = uReq(my_url)
This is the error message -
Traceback (most recent call last):
File "my_first_webscrape.py", line 8, in <module>
uClient = uReq(my_url)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 548, in _open
'unknown_open', req)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: https>

Resources