Python BeautifulSoup "weird" errors - python-3.x

Somewhere in the mist I have tangled myself up: running this code gives me "weird" errors, and it seems like I am missing a module, but I can't seem to get it to work even after reading the error messages many times.
Does anyone have a clue what's wrong here?
Happy new year and thanks in advance!
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    try:
        product = soup.find('span',{'class':'a-size-large product-title-word-break'}).text
    except:
        product = ''
    try:
        price = soup.find('span',{'class':'a-size-medium a-color-price priceBlockBuyingPriceString'}).text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    try:
        amount = soup.find('span', class_='a-size-medium a-color-state').find('a').text.strip()
    except:
        amount = ''
    data = {
        'product': product,
        'price': price,
        'currency': currency,
        'amount': amount,
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='a-link-normal a-text-normal')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('hardware.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['amount'], url]
        writer.writerow(row)

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page(link))
        write_csv(data, link)

if __name__ == '__main__':
    main()
And the error messages:
Traceback (most recent call last):
File "scrp.py", line 75, in <module>
main()
File "scrp.py", line 71, in main
data = get_detail_data(get_page(link))
File "scrp.py", line 7, in get_page
response = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 452, in prepare_request
p.prepare(
File "/usr/lib/python3/dist-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3/dist-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1': No schema supplied. Perhaps you meant http:///ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1?

What is happening here is that you are only getting the URL suffixes (relative paths) from your products, as seen for instance with /ASUS-NVIDIA-GeForce-grafikkort-kylning.
A quick solution is to prepend 'https://www.amazon.se' to all your URLs:
def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page('https://www.amazon.se' + link))
        write_csv(data, link)
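A slightly more robust variant (a sketch of my own, not part of the original answer) is to build the absolute URL with urllib.parse.urljoin, which handles relative paths and leaves already-absolute hrefs untouched:

from urllib.parse import urljoin

BASE_URL = 'https://www.amazon.se'

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        full_link = urljoin(BASE_URL, link)   # '/ASUS-...' -> 'https://www.amazon.se/ASUS-...'
        data = get_detail_data(get_page(full_link))
        write_csv(data, full_link)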

Related

Reddit and Twitter bot in Python using PRAW

I am a beginner in Python, trying to make a bot which automatically tweets anything that is posted on a subreddit I have made.
I took help from some online tutorials, which have the following code:
import praw
import json
import requests
import tweepy
import time

access_token = '************************************'
access_token_secret = '************************************'
consumer_key = '************************************'
consumer_secret = '************************************'

def strip_title(title):
    if len(title) == 94:
        return title
    else:
        return title[:93] + "..."

def tweet_creator(subreddit_info):
    post_dict = {}
    post_ids = []
    print("[bot] Getting posts from Reddit")
    for submission in subreddit_info.get_hot(limit=20):
        post_dict[strip_title(submission.title)] = submission.url
        post_ids.append(submission.id)
    print("[bot] Generating short link using goo.gl")
    mini_post_dict = {}
    for post in post_dict:
        post_title = post
        post_link = post_dict[post]
        short_link = shorten(post_link)
        mini_post_dict[post_title] = short_link
    return mini_post_dict, post_ids

def setup_connection_reddit(subreddit):
    print("[bot] setting up connection with Reddit")
    r = praw.Reddit(' %s' % (subreddit))
    subreddit = r.get_subreddit(subreddit)
    return subreddit

def shorten(url):
    headers = {'content-type': 'application/json'}
    payload = {"longUrl": url}
    url = "https://www.googleapis.com/urlshortener/v1/url"
    r = requests.post(url, data=json.dumps(payload), headers=headers)
    link = json.loads(r.text)['id']
    return link

def duplicate_check(id):
    found = 0
    with open('posted_posts.txt', 'r') as file:
        for line in file:
            if id in line:
                found = 1
    return found

def add_id_to_file(id):
    with open('posted_posts.txt', 'a') as file:
        file.write(str(id) + "\n")

def main():
    subreddit = setup_connection_reddit('*Name of the subreddit*')
    post_dict, post_ids = tweet_creator(subreddit)
    tweeter(post_dict, post_ids)

def tweeter(post_dict, post_ids):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    for post, post_id in zip(post_dict, post_ids):
        found = duplicate_check(post_id)
        if found == 0:
            print("[bot] Posting this link on twitter")
            print(post + " " + post_dict[post] + " #Python #reddit #bot")
            api.update_status(post + " " + post_dict[post] + " #Python #reddit #bot")
            add_id_to_file(post_id)
            time.sleep(30)
        else:
            print("[bot] Already posted")

if __name__ == '__main__':
    main()
The code seems fine in PyCharm; however, when I try to run it directly from the folder in Terminal with the following command (reddit_bot2.py is my file name):
python3 reddit_bot2.py
I get this error:
mahesh#Maheshs-MacBook-Air Atoms % python3 reddit_bot2.py
[bot] setting up connection with Reddit
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/configparser.py", line 846, in items
d.update(self._sections[section])
KeyError: '**Name of the subreddit to fetch posts from**'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 82, in <module>
main()
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 62, in main
subreddit = setup_connection_reddit('Bot167')
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 36, in setup_connection_reddit
r = praw.Reddit(' %s' %(subreddit))
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/praw/reddit.py", line 227, in __init__
self.config = Config(
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/praw/config.py", line 85, in __init__
self.custom = dict(Config.CONFIG.items(site_name), **settings)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/configparser.py", line 849, in items
raise NoSectionError(section)
configparser.NoSectionError: No section: ' Bot167'
You provided the name of a praw.ini configuration which does not exist.
For help with creating a Reddit instance, visit
https://praw.readthedocs.io/en/latest/code_overview/reddit_instance.html
For help on configuring PRAW, visit
https://praw.readthedocs.io/en/latest/getting_started/configuration.html
Any help in this regard would be highly appreciated.
Thanks :)
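The NoSectionError means PRAW treated ' Bot167' as the name of a praw.ini site section and could not find one. A minimal sketch of the alternative is to pass credentials directly to praw.Reddit; the client ID, secret, user agent, and subreddit name below are placeholders for your own values from https://www.reddit.com/prefs/apps. Note also that the tutorial's get_subreddit/get_hot calls belong to the older PRAW 3 API, while a recent PRAW (which the traceback suggests is installed) uses subreddit()/hot():

import praw

# Placeholder credentials - replace with the values from your own Reddit app.
# Passing them as keyword arguments bypasses the praw.ini site-name lookup
# that raised NoSectionError.
reddit = praw.Reddit(
    client_id='YOUR_CLIENT_ID',
    client_secret='YOUR_CLIENT_SECRET',
    user_agent='reddit_bot2 by u/YourRedditUsername',
)

subreddit = reddit.subreddit('YourSubredditName')   # PRAW 4+ equivalent of get_subreddit()
for submission in subreddit.hot(limit=20):          # PRAW 4+ equivalent of get_hot()
    print(submission.title, submission.url)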

Python3 beautifulsoup4 Multiple url request and save data

I'm new to Python, and I am having problems using BeautifulSoup to scrape multiple URLs, whether from a text file list or hard-coded into the program. Here is an example of my code.
import requests
from bs4 import BeautifulSoup
import re

url = 'https://0.0.0.0/directory/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open("1.txt", "w") as f:
    for name, date in zip(
        soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
    ):
        f.write(name.text.strip() + " ")
        f.write(date.text.strip() + "\n")
This works great for one URL, but it fails when I add two. It also fails when trying to load a list from a text file. I have about 25 URLs in a file that I would like the program to run through and collect from daily.
The failing multiple-URL code:
url = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']
Error message:
┌──(c4㉿ib)-[~/Desktop/dev]
└─$ python3 test.py
Traceback (most recent call last):
File "crime.py", line 9, in <module>
r = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 637, in send
adapter = self.get_adapter(url=request.url)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 728, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
requests.exceptions.InvalidSchema: No connection adapters were found for "['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']"
┌──(c4㉿ib)-[~/Desktop/dev]
└─$
Clearly I am not scraping 0.0.0.0; I renamed the domain for the question. Any advice on what I am doing wrong would be helpful. I would rather read from a list so my code doesn't have 25 URLs stuffed into it. Thank you.
Try looping through the URLs and requesting each one separately:
import requests
from bs4 import BeautifulSoup

urls = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']

with open("output.txt", "w") as f:
    for url in urls:
        print(url)
        resp = requests.get(url).content
        soup = BeautifulSoup(resp, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " ")
            f.write(date.text.strip() + "\n")
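Since the asker would rather keep the 25 URLs out of the code, the same loop can read them from a plain text file with one URL per line (a minimal sketch; the file name urls.txt is just an assumption):

import requests
from bs4 import BeautifulSoup

# Assumed input file: one URL per line, blank lines skipped.
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

with open("output.txt", "w") as f:
    for url in urls:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " " + date.text.strip() + "\n")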

How to get request with the url that contained Korean language?

I wrote a function to get the full source of a Korean dictionary website and then cut out the URL of the mp3 file. It fails with errors because of the Korean characters in the URL.
How do I fix it?
import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word == None:
        return None
    link = 'https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord=' + word
    x = requests.get(link)
    soup = BeautifulSoup(x.content, "html.parser")
    url = ''
    for link in soup.find_all('img'):
        str_onClick = link.get('onclick')
        if str_onClick != None:
            if str_onClick.endswith(".mp3');"):
                url = str_onClick[len("javascript:fnSoundPlay('"): len(str_onClick)-len("');")]
                print(url)
    return url

cut_to_get_mp3_url('오')
The error:
Traceback (most recent call last):
File "/home/linh/Desktop/python/in_link.py", line 36, in <module>
save_file(cut_to_get_mp3_url(korean_word), str(count))
File "/home/linh/Desktop/python/in_link.py", line 24, in save_file
x = requests.get(mp3_url)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/usr/lib/python3.7/site-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3.7/site-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
I found a site related to my question here:
https://www.reddit.com/r/Korean/comments/60fzyq/download_sound_files_from_naver_dictionary/
I applied it to my case and it works :D
import urllib.request
import urllib.parse
import re

#koreanWords = input('Enter words: ').split()
koreanWords = ['한국']

for x in range(0, len(koreanWords)):
    url = ('https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord'
           + urllib.parse.urlencode({'': koreanWords[x], 'kind': 'keyword'}))
    print(url)
    response = urllib.request.urlopen(url)
    html = response.read()
    html = html.decode("utf8")
    response.close()
    regexSearch = re.search('mySound = soundManager.createSound\({url:\'(.*?)(\'|$)', html)
    mp3Page = regexSearch.group(1)
    print(mp3Page)
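For the original requests-based function, a similar effect can be achieved by letting requests build the query string: values passed through the params argument are percent-encoded automatically, so the Korean word never appears raw in the URL. A sketch along the lines of the asker's first version (untested against the site, so treat it as an illustration of the encoding fix rather than a drop-in replacement):

import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word is None:
        return None
    # requests percent-encodes non-ASCII parameter values for us.
    params = {
        'nation': 'vie',
        'nationCode': '2',
        'ParaWordNo': '',
        'mainSearchWord': word,
    }
    x = requests.get('https://krdict.korean.go.kr/vie/dicSearch/search', params=params)
    soup = BeautifulSoup(x.content, 'html.parser')
    url = ''
    for img in soup.find_all('img'):
        onclick = img.get('onclick')
        if onclick and onclick.endswith(".mp3');"):
            url = onclick[len("javascript:fnSoundPlay('"):-len("');")]
    return url

print(cut_to_get_mp3_url('오'))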

json error dump during web scraping in python

I am trying to download the thumbnails from the Digital Commonwealth website in order to make an ImageJ visualization. Everything prints up until the JSON dump; at that point it gives me the error "Object of type Tag is not JSON serializable". I have code written by my friend to download the images, but I need a JSON file of the URLs before I continue.
Sorry for the spacing; I'm new to Stack Overflow and when I copy and paste from Sublime it gets messed up.
from bs4 import BeautifulSoup
import requests
import re
import json

all_my_data = []

url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50"
results_page = requests.get(url)
page_html = results_page.text
soup = BeautifulSoup(page_html, "html.parser")

all_labels = soup.find_all("div", attrs={'class': 'document'})
for items in all_labels:
    my_data = {
        "caption": None,
        "url": None,
        "image url": None,
    }
    item_link = items.find('a')
    abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + item_link["href"]
    my_data["url"] = abs_url
    #print(abs_url)
    item_request = requests.get(abs_url)
    item_html = item_request.text
    item_soup = BeautifulSoup(item_html, "html.parser")
    all_field_divs = item_soup.find_all("div", attrs={'class': 'caption'})
    for field in all_field_divs:
        caption = field.find("a")
        cpation = caption.text
        my_data["caption"] = caption
        #print(caption)
    all_photo_urls = item_soup.find_all("div", attrs={'class': 'thumbnail'})
    for photo_url in all_photo_urls:
        photo = photo_url.find('img')
        photo_abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + photo['src']
        my_data['image url'] = photo_abs_url
        #print(photo_abs_url)
    all_my_data.append(my_data)
#print(all_my_data)

with open('fruit_crate_labels.json', 'w') as file_object:
    json.dump(all_my_data, file_object, indent=2)

print('Your file is now ready')
It prints this:
Traceback (most recent call last):
File "dh.py", line 54, in <module>
json.dump(all_my_data, file_object, indent=2)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py", line 179, in dump
for chunk in iterable:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 429, in _iterencode
yield from _iterencode_list(o, _current_indent_level)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 325, in _iterencode_list
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 438, in _iterencode
o = _default(o)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type Tag is not JSON serializable
thanks for the help!
The following code on line 35:
cpation = caption.text
should be:
caption = caption.text
Then your code appears to work as you intended.
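For clarity: because of the typo, my_data["caption"] ends up holding a bs4 Tag object rather than a string, and json.dump only knows how to encode built-in types. The corrected inner loop would look like this:

for field in all_field_divs:
    caption = field.find("a")
    caption = caption.text          # extract plain text from the Tag
    my_data["caption"] = caption    # now a str, which json.dump can encode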

AttributeError: 'module' object has no attribute 'ensure_future'

Hi, I am writing a network-bound server application using Python asyncio which can accept a POST request.
In the POST request I am accepting a symbols parameter.
Please tell me the best way to deal with a network-bound application where I am collecting the data from other web APIs by sending POST requests to them.
Following is the code:
import asyncio
import aiohttp
import json
import logging

# async def fetch_content(url, symbols):
#     yield from aiohttp.post(url, symbols=symbols)

@asyncio.coroutine
def fetch_page(writer, url, data):
    response = yield from aiohttp.post(url, data=data)
    resp = yield from response.read_and_close()
    print(resp)
    writer.write(resp)
    return

@asyncio.coroutine
def process_payload(writer, data, scale):
    tasks = []
    data = data.split('\r\n\r\n')[1]
    data = data.split('\n')
    data = [x.split(':') for x in data]
    print(data)
    data = {x[0]: x[1] for x in data}
    print(data)
    # data = data[0].split(':')[1]
    data = data['symbols']
    print(data)
    data = data.split(',')
    data_len = len(data)
    data_first = 0
    data_last = scale
    url = 'http://xxxxxx.xxxxxx.xxx/xxxx/xxxx'
    while data_last < data_len:
        tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
        data_first += scale
        data_last += scale
    tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
    loop.run_until_complete(tasks)
    return

@asyncio.coroutine
def process_url(url):
    pass

@asyncio.coroutine
def echo_server():
    yield from asyncio.start_server(handle_connection, 'xxxxxx.xxxx.xxx', 3000)

@asyncio.coroutine
def handle_connection(reader, writer):
    data = yield from reader.read(8192)
    if data:
        message = data.decode('utf-8')
        print(message)
        yield from process_payload(writer, message, 400)
    writer.write_eof()
    writer.close()

#url = 'http://XXXXXXX.xxxxx.xxx/xxxx/xxxxxx/xxx'
data = {'symbols': 'GD-US,14174T10,04523Y10,88739910,03209R10,46071F10,77543110,92847N10'}

loop = asyncio.get_event_loop()
loop.run_until_complete(echo_server())
try:
    loop.run_forever()
finally:
    loop.close()
But I am receiving the following error:
future: <Task finished coro=<handle_connection() done, defined at fql_server_async_v2.py:53> exception=AttributeError("'module' object has no attribute 'ensure_future'",)>
Traceback (most recent call last):
File "/home/user/anugupta/lib/python3.4/asyncio/tasks.py", line 234, in _step
result = coro.send(value)
File "fql_server_async_v2.py", line 60, in handle_connection
yield from process_payload(writer, message, 400)
File "/home/user/anugupta/lib/python3.4/asyncio/coroutines.py", line 141, in coro
res = func(*args, **kw)
File "fql_server_async_v2.py", line 41, in process_payload
tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols':",".join(data[data_first:data_last])})))
AttributeError: 'module' object has no attribute 'ensure_future'
^CTraceback (most recent call last):
File "fql_server_async_v2.py", line 72, in <module>
loop.run_forever()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 236, in run_forever
self._run_once()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 1017, in _run_once
event_list = self._selector.select(timeout)
File "/home/user/anugupta/lib/python3.4/selectors.py", line 424, in select
fd_event_list = self._epoll.poll(timeout, max_ev)
ensure_future was added in asyncio 3.4.4; use asyncio.async() for earlier versions.
While asyncio.async() is deprecated now, it will be supported for the foreseeable future.
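If the code has to run on both old and new interpreters, one option (a sketch of my own, not from the original answer) is to resolve the scheduling function once at import time; getattr is used because async later became a reserved keyword, so writing asyncio.async literally would not even parse on modern Python. The demo coroutine below uses the generator-based style of the question:

import asyncio

# Prefer ensure_future (asyncio 3.4.4+); fall back to the older asyncio.async()
# on Python 3.4.0-3.4.3.
schedule_task = getattr(asyncio, 'ensure_future', None) or getattr(asyncio, 'async')

@asyncio.coroutine                      # generator-based coroutine, as in the question
def demo():
    yield from asyncio.sleep(0.1)
    return 'done'

loop = asyncio.get_event_loop()
task = schedule_task(demo())            # wraps the coroutine in a Task
print(loop.run_until_complete(task))    # -> done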
