Unable to take input from a text file in Python crawler - python-3.x

I have created a basic crawler in Python, and I want it to take its input from a text file.
I used open/raw_input, but there was an error.
When I used the input("") function, it prompted for input and worked fine.
The problem is only with reading from a file.
import re
import urllib.request
url = open('input.txt', 'r')
data = urllib.request.urlopen(url).read()
data1 = data.decode("utf8")
print(data1)
file = open('output.txt', 'w')
file.write(data1)
file.close()
Error output below:
Traceback (most recent call last):
File "scrape.py", line 8, in <module>
data = urllib.request.urlopen(url).read()
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 518, in open
protocol = req.type
AttributeError: '_io.TextIOWrapper' object has no attribute 'type'

The open method returns a file object, not the content of the file as a string. If you want url to contain the content as a string, change the line to:
url = open('input.txt', 'r').read()
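A slightly fuller sketch of the same idea (assuming input.txt holds a single URL on its first line; the strip() is there because the trailing newline read() keeps would break the request):
import urllib.request

with open('input.txt', 'r') as f:
    url = f.read().strip()  # drop the trailing newline

data = urllib.request.urlopen(url).read().decode('utf8')

with open('output.txt', 'w') as out:
    out.write(data)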

Related

AttributeError: 'list' object has no attribute 'decode'. I'm getting this error while downloading files from URLs read out of a CSV file into an array. How do I solve it?

import csv
import requests
from bs4 import BeautifulSoup
import wget

with open('__memes_magic_thumbnails.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile))

print(data)
k = 0
for link in data:
    print(k)
    wget.download(link, "vid/logo.jpg")
    k += 1
print("succes")
For this code I'm getting the following error:
Traceback (most recent call last):
File "i:\Meme Channel\channel1\automated_youtube_channel-master\automated_youtube_channel-master\scraper.py", line 12, in <module>
wget.download(link , "vid/logo.jpg")
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\site-packages\wget.py", line 505, in download
prefix = detect_filename(url, out)
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\site-packages\wget.py", line 484, in detect_filename
names["url"] = filename_from_url(url) or ''
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\site-packages\wget.py", line 230, in filename_from_url
fname = os.path.basename(urlparse.urlparse(url).path)
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 392, in urlparse
url, scheme, _coerce_result = _coerce_args(url, scheme)
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 128, in _coerce_args return _decode_args(args) + (_encode_result,)
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 112, in _decode_args return tuple(x.decode(encoding, errors) if x else '' for x in args)
File "C:\Users\Sambhaji Karbhari\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 112, in <genexpr>
return tuple(x.decode(encoding, errors) if x else '' for x in args)
AttributeError: 'list' object has no attribute 'decode'
Here I am downloading an image from a link taken from the array, which is populated from a CSV file.
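Judging from the traceback, csv.reader yields each row as a list, so link is a list and urlparse ends up calling .decode on it. A minimal sketch of a fix, assuming the URL sits in the first column of each row:
for k, row in enumerate(data):
    print(k)
    wget.download(row[0], "vid/logo.jpg")  # row is a list; the URL is its first column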

Python Requests.Get - Giving Invalid Schema Error

Another one for you.
Trying to scrape a list of URLs from a CSV file. This is my code:
from bs4 import BeautifulSoup
import requests
import csv

with open('TeamRankingsURLs.csv', newline='') as f_urls, open('TeamRankingsOutput.csv', 'w', newline='') as f_output:
    csv_urls = csv.reader(f_urls)
    csv_output = csv.writer(f_output)
    for line in csv_urls:
        page = requests.get(line[0]).text
        soup = BeautifulSoup(page, 'html.parser')
        results = soup.findAll('div', {'class': 'LineScoreCard__lineScoreColumnElement--1byQk'})
        for r in range(len(results)):
            csv_output.writerow([results[r].text])
...Which gives me the following error:
Traceback (most recent call last):
File "TeamRankingsScraper.py", line 11, in <module>
page = requests.get(line[0]).text
File "C:\Users\windowshopr\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\windowshopr\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\windowshopr\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\windowshopr\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 616, in send
adapter = self.get_adapter(url=request.url)
File "C:\Users\windowshopr\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 707, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'https://www.teamrankings.com/mlb/stat/runs-per-game?date=2018-04-15'
My CSV file is just a list of several URLs in column A (i.e. https://www...)
(The div class I'm trying to scrape doesn't exist on that page, but that's not where the problem is. At least I don't think. I just need to update that when I can get it to read from the CSV file.)
Any suggestions? Because this code works on another project, but for some reason I'm having issues with this new URL list. Thanks a lot!
From the traceback: requests.exceptions.InvalidSchema: No connection adapters were found for 'https://www.teamrankings.com/mlb/stat/runs-per-game?date=2018-04-15'
There is a stray character at the start of that URL (it is invisible when printed; commonly a UTF-8 BOM that spreadsheet software writes at the beginning of a CSV file). The URL should start with https://www.teamrankings.com/mlb/stat/runs-per-game?date=2018-04-15
So first parse the CSV and remove any stray characters before the http/https using a regex (or open the file with encoding='utf-8-sig' so a BOM is consumed). That should solve your problem.
If you want to fix this particular URL while reading the CSV, do:
import re

strin = "https://www.teamrankings.com/mlb/stat/runs-per-game?date=2018-04-15"
re.sub(r'.*http', 'http', strin)
This will give you the correct URL, which requests can handle.
Since you asked for the full fix applied inside the loop, here's what you could do:
from bs4 import BeautifulSoup
import requests
import csv
import re

with open('TeamRankingsURLs.csv', newline='') as f_urls, open('TeamRankingsOutput.csv', 'w', newline='') as f_output:
    csv_urls = csv.reader(f_urls)
    csv_output = csv.writer(f_output)
    for line in csv_urls:
        url = re.sub(r'.*http', 'http', line[0])  # strip anything before http/https
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'html.parser')
        results = soup.findAll('div', {'class': 'LineScoreCard__lineScoreColumnElement--1byQk'})
        for r in range(len(results)):
            csv_output.writerow([results[r].text])

How to respond with a PIL image in CherryPy dynamically (Python 3)?

It seems the task is easy but...
I have a simple PIL.Image object. How do I make CherryPy respond with this image dynamically?
def get_image(self, data_id):
    cherrypy.response.headers['Content-Type'] = 'image/png'
    img = PIL.Image.frombytes(...)
    buffer = io.StringIO()
    img.save(buffer, 'PNG')
    return buffer.getvalue()
This code gives me:
500 Internal Server Error
The server encountered an unexpected condition which prevented it from fulfilling the request.
Traceback (most recent call last):
File "C:\Users\Serge\AppData\Local\Programs\Python\Python36\lib\site-packages\cherrypy\_cprequest.py", line 631, in respond
self._do_respond(path_info)
File "C:\Users\Serge\AppData\Local\Programs\Python\Python36\lib\site-packages\cherrypy\_cprequest.py", line 690, in _do_respond
response.body = self.handler()
File "C:\Users\Serge\AppData\Local\Programs\Python\Python36\lib\site-packages\cherrypy\_cpdispatch.py", line 60, in __call__
return self.callable(*self.args, **self.kwargs)
File "D:\Dev\Bf\webapp\controllers\calculation.py", line 69, in get_image
img.save(buffer, 'PNG')
File "C:\Users\Serge\AppData\Local\Programs\Python\Python36\lib\site-packages\PIL\Image.py", line 1930, in save
save_handler(self, fp, filename)
File "C:\Users\Serge\AppData\Local\Programs\Python\Python36\lib\site-packages\PIL\PngImagePlugin.py", line 731, in _save
fp.write(_MAGIC)
TypeError: string argument expected, got 'bytes'
Can someone help me please?
Use io.BytesIO() instead of io.StringIO(). (From this answer.)
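A minimal, self-contained sketch of the difference, with Image.new standing in for the real frombytes(...) image:
import io
from PIL import Image

img = Image.new('RGB', (64, 64), 'red')  # stand-in for PIL.Image.frombytes(...)
buffer = io.BytesIO()                    # PNG encoders write bytes, so a bytes buffer is required
img.save(buffer, 'PNG')
png_bytes = buffer.getvalue()            # this is what the handler should return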

Uploading CSV files to Fusion Tables through Python

I am trying to grab data from Looker and insert it directly into Google Fusion Tables using MediaFileUpload, so as to not download any files and instead upload from memory. My current code below raises a TypeError. Any help would be appreciated. Thanks!
Error returned to me:
Traceback (most recent call last):
File "csvpython.py", line 96, in <module>
main()
File "csvpython.py", line 88, in main
media = MediaFileUpload(dataq, mimetype='application/octet-stream', resumable=True)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/oauth2client/_helpers.py", line 133, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 548, in __init__
fd = open(self._filename, 'rb')
TypeError: expected str, bytes or os.PathLike object, not NoneType
Code in question:
for x, y, z in zip(look, destination, fusion):
    look_data = lc.run_look(x)
    df = pd.DataFrame(look_data)
    stream = io.StringIO()
    dataq = df.to_csv(path_or_buf=stream, sep=";", index=False)
    media = MediaFileUpload(dataq, mimetype='application/octet-stream', resumable=True)
    replace = ftserv.table().replaceRows(tableId=z, media_body=media, startLine=None, isStrict=False, encoding='UTF-8', media_mime_type='application/octet-stream', delimiter=';', endLine=None).execute()
After switching dataq to stream in MediaFileUpload, I have had the following returned to me:
Traceback (most recent call last):
File "quicktestbackup.py", line 96, in <module>
main()
File "quicktestbackup.py", line 88, in main
media = MediaFileUpload(stream, mimetype='application/octet-stream', resumable=True)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/oauth2client/_helpers.py", line 133, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 548, in __init__
fd = open(self._filename, 'rb')
TypeError: expected str, bytes or os.PathLike object, not _io.StringIO
DataFrame.to_csv is a void call here: when path_or_buf is given, the CSV text is written into stream and the return value is None. That is, dataq is None and has no data; your CSV data is in stream.
Note, though, that MediaFileUpload treats its first argument as a filename, so passing the stream (or the string from its getvalue() method) will not work either. To upload from memory, wrap the CSV text in a bytes buffer and hand it to MediaIoBaseUpload, which accepts file-like objects:
df.to_csv(path_or_buf=stream, sep=";", index=False)
media = MediaIoBaseUpload(io.BytesIO(stream.getvalue().encode('utf-8')), mimetype='application/octet-stream', resumable=True)
The call to FusionTables looks to be perfectly valid.
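A sketch of the whole loop with that change applied (hedged: lc, look, destination, fusion, and ftserv are the objects from the question):
import io
import pandas as pd
from googleapiclient.http import MediaIoBaseUpload

for x, y, z in zip(look, destination, fusion):
    look_data = lc.run_look(x)
    df = pd.DataFrame(look_data)
    stream = io.StringIO()
    df.to_csv(path_or_buf=stream, sep=";", index=False)   # writes into stream, returns None
    buf = io.BytesIO(stream.getvalue().encode('utf-8'))   # re-encode the CSV text as bytes
    media = MediaIoBaseUpload(buf, mimetype='application/octet-stream', resumable=True)
    replace = ftserv.table().replaceRows(tableId=z, media_body=media, startLine=None, isStrict=False, encoding='UTF-8', media_mime_type='application/octet-stream', delimiter=';', endLine=None).execute()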

No connection adapters were found for Python3-Requests

I am using Beautiful Soup with the requests package in Python 3 for web scraping. This is my code:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

quote_page = ['http://10.69.161.179:8080'];
data = []
page = requests.get(quote_page)
soup = BeautifulSoup(page.content, 'html.parser')
name_box = soup.find('div', attrs={'class': 'caption span10'})
name = name_box.text.strip()  # strip() is used to remove leading and trailing whitespace
print(name);
data.append(name)
with open('sample.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([name])
print("Success");
When I execute the above code I'm getting the following error.
Traceback (most recent call last):
File "first_try.py", line 21, in <module>
page = requests.get(quote_page);
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\api.py", line 70, in get
return request('get', url, params=params, **kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 488, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 603, in send
adapter = self.get_adapter(url=request.url)
File "C:\Python\lib\site-packages\requests-2.13.0-py3.6.egg\requests\sessions.py", line 685, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '['http://10.69.161.179:8080/#/main/dashboard/metrics']'
Can anyone help me with this? :(
Because requests.get() only accepts a URL as a string, you need to unpack the string from inside the list []:
quote_page = ['http://10.69.161.179:8080']
for url in quote_page:
    page = requests.get(url)
    .....
By the way, though the semicolon is harmless in the following statement, you should avoid it unless you need it for some reason:
quote_page = ['http://10.69.161.179:8080'];
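Putting it together, a sketch of the corrected loop (with a small guard added, since find() returns None when the div is absent from a page):
quote_page = ['http://10.69.161.179:8080']
data = []
for url in quote_page:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    name_box = soup.find('div', attrs={'class': 'caption span10'})
    if name_box is not None:  # skip pages without the target div
        name = name_box.text.strip()
        print(name)
        data.append(name)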
