Python BeautifulSoup "weird" errors - python-3.x

Somewhere in the mist I have tangled myself up: running this code gives me "weird" errors, and it seems like I am missing a module, but I can't seem to get it to work even after reading the error messages many times.
Does anyone have a clue what's wrong here?
Happy new year and thanks in advance!
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    try:
        product = soup.find('span',{'class':'a-size-large product-title-word-break'}).text
    except:
        product = ''
    try:
        price = soup.find('span',{'class':'a-size-medium a-color-price priceBlockBuyingPriceString'}).text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    try:
        amount = soup.find('span', class_='a-size-medium a-color-state').find('a').text.strip()
    except:
        amount = ''
    data = {
        'product': product,
        'price': price,
        'currency': currency,
        'amount': amount,
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='a-link-normal a-text-normal')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('hardware.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['amount'], url]
        writer.writerow(row)

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page(link))
        write_csv(data, link)

if __name__ == '__main__':
    main()
And the error messages:
Traceback (most recent call last):
File "scrp.py", line 75, in <module>
main()
File "scrp.py", line 71, in main
data = get_detail_data(get_page(link))
File "scrp.py", line 7, in get_page
response = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 452, in prepare_request
p.prepare(
File "/usr/lib/python3/dist-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3/dist-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1': No schema supplied. Perhaps you meant http:///ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1?

What is happening here is that you are only getting the URL suffixes (relative paths) from your products, as seen for instance with /ASUS-NVIDIA-GeForce-grafikkort-kylning.
A quick solution is to prepend 'https://www.amazon.se' to all your URLs:
def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page('https://www.amazon.se' + link))
        write_csv(data, link)
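A slightly more robust variant (a sketch of my own, not part of the original answer) is to build the absolute URL with urllib.parse.urljoin, which handles relative paths and leaves already-absolute hrefs untouched:

from urllib.parse import urljoin

BASE_URL = 'https://www.amazon.se'

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        full_link = urljoin(BASE_URL, link)   # '/ASUS-...' -> 'https://www.amazon.se/ASUS-...'
        data = get_detail_data(get_page(full_link))
        write_csv(data, full_link)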

Related

Reddit and Twitter bot in Python using PRAW

I am a beginner in Python, trying to make a bot which automatically tweets anything that is posted on a subreddit I have made.
I took help from some online tutorials, which have the following code:
import praw
import json
import requests
import tweepy
import time

access_token = '************************************'
access_token_secret = '************************************'
consumer_key = '************************************'
consumer_secret = '************************************'

def strip_title(title):
    if len(title) == 94:
        return title
    else:
        return title[:93] + "..."

def tweet_creator(subreddit_info):
    post_dict = {}
    post_ids = []
    print("[bot] Getting posts from Reddit")
    for submission in subreddit_info.get_hot(limit=20):
        post_dict[strip_title(submission.title)] = submission.url
        post_ids.append(submission.id)
    print("[bot] Generating short link using goo.gl")
    mini_post_dict = {}
    for post in post_dict:
        post_title = post
        post_link = post_dict[post]
        short_link = shorten(post_link)
        mini_post_dict[post_title] = short_link
    return mini_post_dict, post_ids

def setup_connection_reddit(subreddit):
    print("[bot] setting up connection with Reddit")
    r = praw.Reddit(' %s' % (subreddit))
    subreddit = r.get_subreddit(subreddit)
    return subreddit

def shorten(url):
    headers = {'content-type': 'application/json'}
    payload = {"longUrl": url}
    url = "https://www.googleapis.com/urlshortener/v1/url"
    r = requests.post(url, data=json.dumps(payload), headers=headers)
    link = json.loads(r.text)['id']
    return link

def duplicate_check(id):
    found = 0
    with open('posted_posts.txt', 'r') as file:
        for line in file:
            if id in line:
                found = 1
    return found

def add_id_to_file(id):
    with open('posted_posts.txt', 'a') as file:
        file.write(str(id) + "\n")

def main():
    subreddit = setup_connection_reddit('*Name of the subreddit*')
    post_dict, post_ids = tweet_creator(subreddit)
    tweeter(post_dict, post_ids)

def tweeter(post_dict, post_ids):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    for post, post_id in zip(post_dict, post_ids):
        found = duplicate_check(post_id)
        if found == 0:
            print("[bot] Posting this link on twitter")
            print(post + " " + post_dict[post] + " #Python #reddit #bot")
            api.update_status(post + " " + post_dict[post] + " #Python #reddit #bot")
            add_id_to_file(post_id)
            time.sleep(30)
        else:
            print("[bot] Already posted")

if __name__ == '__main__':
    main()
The code seems fine in PyCharm; however, when I try to run it directly from the folder in Terminal with the following command (reddit_bot2.py is my file name):
python3 reddit_bot2.py
I get this error:
mahesh#Maheshs-MacBook-Air Atoms % python3 reddit_bot2.py
[bot] setting up connection with Reddit
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/configparser.py", line 846, in items
d.update(self._sections[section])
KeyError: '**Name of the subreddit to fetch posts from**'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 82, in <module>
main()
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 62, in main
subreddit = setup_connection_reddit('Bot167')
File "/Users/mahesh/Python_Bot/Atoms/reddit_bot2.py", line 36, in setup_connection_reddit
r = praw.Reddit(' %s' %(subreddit))
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/praw/reddit.py", line 227, in __init__
self.config = Config(
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/praw/config.py", line 85, in __init__
self.custom = dict(Config.CONFIG.items(site_name), **settings)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/configparser.py", line 849, in items
raise NoSectionError(section)
configparser.NoSectionError: No section: ' Bot167'
You provided the name of a praw.ini configuration which does not exist.
For help with creating a Reddit instance, visit
https://praw.readthedocs.io/en/latest/code_overview/reddit_instance.html
For help on configuring PRAW, visit
https://praw.readthedocs.io/en/latest/getting_started/configuration.html
Any help in this regard would be highly appreciated.
Thanks :)
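The NoSectionError means PRAW treated ' Bot167' as the name of a praw.ini site section and could not find one. A minimal sketch of the alternative is to pass credentials directly to praw.Reddit; the client ID, secret, user agent, and subreddit name below are placeholders for your own values from https://www.reddit.com/prefs/apps. Note also that the tutorial's get_subreddit/get_hot calls belong to the older PRAW 3 API, while a recent PRAW (which the traceback suggests is installed) uses subreddit()/hot():

import praw

# Placeholder credentials - replace with the values from your own Reddit app.
# Passing them as keyword arguments bypasses the praw.ini site-name lookup
# that raised NoSectionError.
reddit = praw.Reddit(
    client_id='YOUR_CLIENT_ID',
    client_secret='YOUR_CLIENT_SECRET',
    user_agent='reddit_bot2 by u/YourRedditUsername',
)

subreddit = reddit.subreddit('YourSubredditName')   # PRAW 4+ equivalent of get_subreddit()
for submission in subreddit.hot(limit=20):          # PRAW 4+ equivalent of get_hot()
    print(submission.title, submission.url)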

Python3 beautifulsoup4 Multiple url request and save data

I'm new to Python, and I am having problems using BeautifulSoup to scrape multiple URLs, whether from a text file list or hard-coded into the program. Here is an example of my code.
import requests
from bs4 import BeautifulSoup
import re

url = 'https://0.0.0.0/directory/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open("1.txt", "w") as f:
    for name, date in zip(
        soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
    ):
        f.write(name.text.strip() + " ")
        f.write(date.text.strip() + "\n")
This works great for one URL, but it fails when I add two. It also fails when trying to load a list from a text file. I have about 25 URLs in a file that I would like the program to run through and collect from daily.
The failing multiple-URL code:
url = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']
Error message:
┌──(c4㉿ib)-[~/Desktop/dev]
└─$ python3 test.py
Traceback (most recent call last):
File "crime.py", line 9, in <module>
r = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 637, in send
adapter = self.get_adapter(url=request.url)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 728, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
requests.exceptions.InvalidSchema: No connection adapters were found for "['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']"
┌──(c4㉿ib)-[~/Desktop/dev]
└─$
Clearly I am not scraping 0.0.0.0; I renamed the domain for the question. Any advice on what I am doing wrong would be helpful. I would rather read from a list so my code doesn't have 25 URLs stuffed into it. Thank you.
Try looping through the URLs and requesting each one separately:
import requests
from bs4 import BeautifulSoup

urls = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']

with open("output.txt", "w") as f:
    for url in urls:
        print(url)
        resp = requests.get(url).content
        soup = BeautifulSoup(resp, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " ")
            f.write(date.text.strip() + "\n")
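Since the asker would rather keep the 25 URLs out of the code, the same loop can read them from a plain text file with one URL per line (a minimal sketch; the file name urls.txt is just an assumption):

import requests
from bs4 import BeautifulSoup

# Assumed input file: one URL per line, blank lines skipped.
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

with open("output.txt", "w") as f:
    for url in urls:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " " + date.text.strip() + "\n")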

How to get request with the url that contained Korean language?

I wrote a function to get the full source of a Korean dictionary website and then cut out the URL of the mp3 file. It fails with errors because of the Korean characters in the URL.
How do I fix it?
import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word == None:
        return None
    link = 'https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord=' + word
    x = requests.get(link)
    soup = BeautifulSoup(x.content, "html.parser")
    url = ''
    for link in soup.find_all('img'):
        str_onClick = link.get('onclick')
        if str_onClick != None:
            if str_onClick.endswith(".mp3');"):
                url = str_onClick[len("javascript:fnSoundPlay('"): len(str_onClick)-len("');")]
                print(url)
    return url

cut_to_get_mp3_url('오')
The error:
Traceback (most recent call last):
File "/home/linh/Desktop/python/in_link.py", line 36, in <module>
save_file(cut_to_get_mp3_url(korean_word), str(count))
File "/home/linh/Desktop/python/in_link.py", line 24, in save_file
x = requests.get(mp3_url)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/usr/lib/python3.7/site-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3.7/site-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
I found a site related to my question here:
https://www.reddit.com/r/Korean/comments/60fzyq/download_sound_files_from_naver_dictionary/
I applied it to my case and it works :D
import urllib.request
import urllib.parse
import re

#koreanWords = input('Enter words: ').split()
koreanWords = ['한국']

for x in range(0, len(koreanWords)):
    url = ('https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord'
           + urllib.parse.urlencode({'': koreanWords[x], 'kind': 'keyword'}))
    print(url)
    response = urllib.request.urlopen(url)
    html = response.read()
    html = html.decode("utf8")
    response.close()
    regexSearch = re.search('mySound = soundManager.createSound\({url:\'(.*?)(\'|$)', html)
    mp3Page = regexSearch.group(1)
    print(mp3Page)
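For the original requests-based function, a similar effect can be achieved by letting requests build the query string: values passed through the params argument are percent-encoded automatically, so the Korean word never appears raw in the URL. A sketch along the lines of the asker's first version (untested against the site, so treat it as an illustration of the encoding fix rather than a drop-in replacement):

import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word is None:
        return None
    # requests percent-encodes non-ASCII parameter values for us.
    params = {
        'nation': 'vie',
        'nationCode': '2',
        'ParaWordNo': '',
        'mainSearchWord': word,
    }
    x = requests.get('https://krdict.korean.go.kr/vie/dicSearch/search', params=params)
    soup = BeautifulSoup(x.content, 'html.parser')
    url = ''
    for img in soup.find_all('img'):
        onclick = img.get('onclick')
        if onclick and onclick.endswith(".mp3');"):
            url = onclick[len("javascript:fnSoundPlay('"):-len("');")]
    return url

print(cut_to_get_mp3_url('오'))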

json error dump during web scraping in python

I am trying to download the thumbnails from the Digital Commonwealth website in order to make an ImageJ visualization. Everything prints up until the JSON dump; at that point it gives me the error "Object of type Tag is not JSON serializable". I have code written by my friend to download the images, but I need a JSON file of the URLs before I continue.
Sorry for the spacing; I'm new to Stack Overflow and when I copy and paste from Sublime it gets messed up.
from bs4 import BeautifulSoup
import requests
import re
import json

all_my_data = []

url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50"
results_page = requests.get(url)
page_html = results_page.text
soup = BeautifulSoup(page_html, "html.parser")

all_labels = soup.find_all("div", attrs={'class': 'document'})
for items in all_labels:
    my_data = {
        "caption": None,
        "url": None,
        "image url": None,
    }
    item_link = items.find('a')
    abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + item_link["href"]
    my_data["url"] = abs_url
    #print(abs_url)
    item_request = requests.get(abs_url)
    item_html = item_request.text
    item_soup = BeautifulSoup(item_html, "html.parser")
    all_field_divs = item_soup.find_all("div", attrs={'class': 'caption'})
    for field in all_field_divs:
        caption = field.find("a")
        cpation = caption.text
        my_data["caption"] = caption
        #print(caption)
    all_photo_urls = item_soup.find_all("div", attrs={'class': 'thumbnail'})
    for photo_url in all_photo_urls:
        photo = photo_url.find('img')
        photo_abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + photo['src']
        my_data['image url'] = photo_abs_url
        #print(photo_abs_url)
    all_my_data.append(my_data)
#print(all_my_data)

with open('fruit_crate_labels.json', 'w') as file_object:
    json.dump(all_my_data, file_object, indent=2)

print('Your file is now ready')
It prints this:
Traceback (most recent call last):
File "dh.py", line 54, in <module>
json.dump(all_my_data, file_object, indent=2)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py", line 179, in dump
for chunk in iterable:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 429, in _iterencode
yield from _iterencode_list(o, _current_indent_level)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 325, in _iterencode_list
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 438, in _iterencode
o = _default(o)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type Tag is not JSON serializable
thanks for the help!
The following code on line 35:
cpation = caption.text
should be:
caption = caption.text
Then your code appears to work as you intended.
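For clarity: because of the typo, my_data["caption"] ends up holding a bs4 Tag object rather than a string, and json.dump only knows how to encode built-in types. The corrected inner loop would look like this:

for field in all_field_divs:
    caption = field.find("a")
    caption = caption.text          # extract plain text from the Tag
    my_data["caption"] = caption    # now a str, which json.dump can encode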

AttributeError: 'module' object has no attribute 'ensure_future'

Hi, I am writing a network-bound server application using Python asyncio which can accept a POST request.
In the POST request I am accepting a symbols parameter.
Please tell me the best way to deal with a network-bound application where I am collecting the data from other web APIs by sending POST requests to them.
Following is the code:
import asyncio
import aiohttp
import json
import logging

# async def fetch_content(url, symbols):
#     yield from aiohttp.post(url, symbols=symbols)

@asyncio.coroutine
def fetch_page(writer, url, data):
    response = yield from aiohttp.post(url, data=data)
    resp = yield from response.read_and_close()
    print(resp)
    writer.write(resp)
    return

@asyncio.coroutine
def process_payload(writer, data, scale):
    tasks = []
    data = data.split('\r\n\r\n')[1]
    data = data.split('\n')
    data = [x.split(':') for x in data]
    print(data)
    data = {x[0]: x[1] for x in data}
    print(data)
    # data = data[0].split(':')[1]
    data = data['symbols']
    print(data)
    data = data.split(',')
    data_len = len(data)
    data_first = 0
    data_last = scale
    url = 'http://xxxxxx.xxxxxx.xxx/xxxx/xxxx'
    while data_last < data_len:
        tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
        data_first += scale
        data_last += scale
    tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols': ",".join(data[data_first:data_last])})))
    loop.run_until_complete(tasks)
    return

@asyncio.coroutine
def process_url(url):
    pass

@asyncio.coroutine
def echo_server():
    yield from asyncio.start_server(handle_connection, 'xxxxxx.xxxx.xxx', 3000)

@asyncio.coroutine
def handle_connection(reader, writer):
    data = yield from reader.read(8192)
    if data:
        message = data.decode('utf-8')
        print(message)
        yield from process_payload(writer, message, 400)
    writer.write_eof()
    writer.close()

#url = 'http://XXXXXXX.xxxxx.xxx/xxxx/xxxxxx/xxx'
data = {'symbols': 'GD-US,14174T10,04523Y10,88739910,03209R10,46071F10,77543110,92847N10'}

loop = asyncio.get_event_loop()
loop.run_until_complete(echo_server())
try:
    loop.run_forever()
finally:
    loop.close()
But I am receiving the following error:
future: <Task finished coro=<handle_connection() done, defined at fql_server_async_v2.py:53> exception=AttributeError("'module' object has no attribute 'ensure_future'",)>
Traceback (most recent call last):
File "/home/user/anugupta/lib/python3.4/asyncio/tasks.py", line 234, in _step
result = coro.send(value)
File "fql_server_async_v2.py", line 60, in handle_connection
yield from process_payload(writer, message, 400)
File "/home/user/anugupta/lib/python3.4/asyncio/coroutines.py", line 141, in coro
res = func(*args, **kw)
File "fql_server_async_v2.py", line 41, in process_payload
tasks.append(asyncio.ensure_future(fetch_page(writer, url, {'symbols':",".join(data[data_first:data_last])})))
AttributeError: 'module' object has no attribute 'ensure_future'
^CTraceback (most recent call last):
File "fql_server_async_v2.py", line 72, in <module>
loop.run_forever()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 236, in run_forever
self._run_once()
File "/home/user/anugupta/lib/python3.4/asyncio/base_events.py", line 1017, in _run_once
event_list = self._selector.select(timeout)
File "/home/user/anugupta/lib/python3.4/selectors.py", line 424, in select
fd_event_list = self._epoll.poll(timeout, max_ev)
ensure_future was added in asyncio 3.4.4; use asyncio.async() for earlier versions.
While asyncio.async() is deprecated now, it will be supported for the foreseeable future.
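If the code has to run on both old and new interpreters, one option (a sketch of my own, not from the original answer) is to resolve the scheduling function once at import time; getattr is used because async later became a reserved keyword, so writing asyncio.async literally would not even parse on modern Python. The demo coroutine below uses the generator-based style of the question:

import asyncio

# Prefer ensure_future (asyncio 3.4.4+); fall back to the older asyncio.async()
# on Python 3.4.0-3.4.3.
schedule_task = getattr(asyncio, 'ensure_future', None) or getattr(asyncio, 'async')

@asyncio.coroutine                      # generator-based coroutine, as in the question
def demo():
    yield from asyncio.sleep(0.1)
    return 'done'

loop = asyncio.get_event_loop()
task = schedule_task(demo())            # wraps the coroutine in a Task
print(loop.run_until_complete(task))    # -> done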
