I'm new to Python, and I'm having problems using BeautifulSoup to scrape multiple URLs, whether they come from a text file or are coded into the program. Here is an example of my code.
import requests
from bs4 import BeautifulSoup
import re

url = 'https://0.0.0.0/directory/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open("1.txt", "w") as f:
    for name, date in zip(
        soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
    ):
        f.write(name.text.strip() + " ")
        f.write(date.text.strip() + "\n")
This works great for one URL, but it fails when I add a second one. It also fails when I try to load a list from a text file. I have about 25 URLs in a file that I would like the program to run through and collect from daily.
Failing multiple-URL code:
url = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']
Error message:
┌──(c4㉿ib)-[~/Desktop/dev]
└─$ python3 test.py
Traceback (most recent call last):
File "crime.py", line 9, in <module>
r = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 637, in send
adapter = self.get_adapter(url=request.url)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 728, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
requests.exceptions.InvalidSchema: No connection adapters were found for "['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']"
┌──(c4㉿ib)-[~/Desktop/dev]
└─$
Clearly I am not scraping 0.0.0.0; I renamed the domain for the question. Any advice on what I am doing wrong would be helpful. I would rather grab the URLs from a file so my code doesn't have 25 URLs stuffed into it. Thank you.
Try looping through the URLs and requesting each one separately:
import requests
from bs4 import BeautifulSoup

urls = ['https://0.0.0.0/directory/', 'https://0.0.0.0/directory/']

with open("output.txt", "w") as f:
    for url in urls:
        print(url)
        resp = requests.get(url).content
        soup = BeautifulSoup(resp, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " ")
            f.write(date.text.strip() + "\n")
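Since you said you would rather keep the ~25 URLs in a file, here is a minimal sketch of the same loop reading them from a plain-text file with one URL per line (the file name urls.txt is just an assumption; use whatever your list is called):

import requests
from bs4 import BeautifulSoup

# "urls.txt" is an assumed name: one URL per line, blank lines ignored
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

with open("output.txt", "w") as f:
    for url in urls:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "html.parser")
        for name, date in zip(
            soup.find_all("a", {"class": "name"}), soup.find_all("span", {"class": "date"})
        ):
            f.write(name.text.strip() + " " + date.text.strip() + "\n")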
Somewhere along the way I have tangled myself up: running this code gives me "weird" errors, and it seems like I am missing a module, but I can't seem to get it to work even after reading the error messages many times.
Does anyone have a clue what's wrong here?
Happy new year and thanks in advance!
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    try:
        product = soup.find('span', {'class': 'a-size-large product-title-word-break'}).text
    except:
        product = ''
    try:
        price = soup.find('span', {'class': 'a-size-medium a-color-price priceBlockBuyingPriceString'}).text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    try:
        amount = soup.find('span', class_='a-size-medium a-color-state').find('a').text.strip()
    except:
        amount = ''
    data = {
        'product': product,
        'price': price,
        'currency': currency,
        'amount': amount,
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='a-link-normal a-text-normal')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('hardware.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['amount'], url]
        writer.writerow(row)

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page(link))
        write_csv(data, link)

if __name__ == '__main__':
    main()
And the error message:
Traceback (most recent call last):
File "scrp.py", line 75, in <module>
main()
File "scrp.py", line 71, in main
data = get_detail_data(get_page(link))
File "scrp.py", line 7, in get_page
response = requests.get(url)
File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 452, in prepare_request
p.prepare(
File "/usr/lib/python3/dist-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3/dist-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1': No schema supplied. Perhaps you meant http:///ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1?
What is happening here is that you are only getting the relative URL paths of your products, as seen for instance with /ASUS-NVIDIA-GeForce-grafikkort-kylning.
A quick solution is to prepend 'https://www.amazon.se' to all your URLs:
def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        data = get_detail_data(get_page('https://www.amazon.se' + link))
        write_csv(data, link)
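If you want something a bit sturdier than string concatenation (for example, if some of the hrefs ever come back as absolute URLs), urllib.parse.urljoin from the standard library handles both cases. A small sketch of the same main(), assuming the rest of your functions stay as they are:

from urllib.parse import urljoin

BASE_URL = 'https://www.amazon.se'

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'
    products = get_index_data(get_page(url))
    for link in products:
        # urljoin resolves relative paths against BASE_URL and
        # leaves already-absolute URLs untouched
        full_link = urljoin(BASE_URL, link)
        data = get_detail_data(get_page(full_link))
        write_csv(data, full_link)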
I'm doing a Python course on Coursera.
There is an assignment where I have to scrape an HTML web page and use it in my code.
Here is the code:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('http://py4e-data.dr-chuck.net/comments_828036.html')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('span')
sum = 0
for tag in tags:
    sum = sum + int(tag.contents[0])
print(sum)
I'm using OnlineGDB to run my code.
On compiling and running, this error comes up:
Traceback (most recent call last):
File "main.py", line 11, in <module>
html = urllib.request.urlopen(url).read()
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 448, in open
req = Request(fullurl, data)
File "/usr/lib/python3.4/urllib/request.py", line 266, in __init__
self.full_url = url
File "/usr/lib/python3.4/urllib/request.py", line 292, in full_url
self._parse()
File "/usr/lib/python3.4/urllib/request.py", line 321, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Now, can anyone explain what this problem is and how to solve it?
It would seem that the problem lies in this line:
url = input('http://py4e-data.dr-chuck.net/comments_828036.html')
input() in Python lets the user type something in for your code to use. The argument you pass to input() (in this case the URL) is only the prompt text that is displayed when asking for the input. For example, age = input('Enter your age -> ') would prompt the user like so:
Enter your age -> # the user types the age here, and the age variable is assigned whatever was entered
Anyway, it doesn't look like you need any input at all. So all you have to do to fix the code is remove the input() call and assign the URL to the url variable directly, like so:
url = 'http://py4e-data.dr-chuck.net/comments_828036.html'
Final code:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/comments_828036.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the span tags and sum their contents
tags = soup('span')
sum = 0
for tag in tags:
    sum = sum + int(tag.contents[0])
print(sum)
Output: 2525
Furthermore, your code can be simplified a bit by using the requests module:
import requests
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_828036.html'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')

tags = soup('span')
sum = 0
for tag in tags:
    sum += int(tag.contents[0])
print(sum)
I'm new to coding and I've been trying to scrape a page for practice. I have almost everything ready, but I don't know why it gives an error.
from variables import MY_URL, OUT_FILE
import requests
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import csv

def agarrar_pagina():
    for i in range(1, 22):
        uclient = ureq(MY_URL + 'categorias/todas/?page={}'.format(i))
        page_html = uclient.read()
        page_soup = soup(page_html, "html.parser")
        contenedores = page_soup.findAll('div', {'class': 'cambur'})
        contenedor = []
        for link in contenedor:
            link = contenedor.findAll('a', ['href'])
            ulink = ureq(MY_URL + link)
            page_link = ulink.read()
            ulink = close()
        uclient.close()
    return page_link
This is the error:
Traceback (most recent call last):
File "prueba.py", line 93, in <module>
main()
File "prueba.py", line 89, in main
cajitas = seleccionar_caja(pagina)
File "prueba.py", line 30, in seleccionar_caja
page_soup = soup(html, "html.parser")
File "C:\Users\PC\AppData\Local\Programs\Python\Python37\lib\site-packages\bs4\__init__.py", line
267, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'NoneType' has no len()
contenedor = [] is an empty list, so the loop below it never runs. I think you intend to use contenedores:
for link in contenedores:
    link = contenedores.findAll('a', ['href'])
    ulink = ureq(MY_URL + link)
    page_link = ulink.read()
    ulink.close()
uclient.close()
The function agarrar_pagina is not mentioned in the stack trace. I think that is because agarrar_pagina returns None all the time.
With your code this happens because the loop runs over an empty list, so page_link is never assigned; with the corrected loop it can still happen if findAll does not find anything.
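As a rough sketch of what the corrected function could look like (keeping your names, and assuming the hrefs inside each div are relative to MY_URL and you want the HTML of every linked page rather than only the last one), collecting the pages in a list also avoids returning None when nothing matches:

def agarrar_pagina():
    paginas = []
    for i in range(1, 22):
        uclient = ureq(MY_URL + 'categorias/todas/?page={}'.format(i))
        page_soup = soup(uclient.read(), "html.parser")
        uclient.close()
        contenedores = page_soup.findAll('div', {'class': 'cambur'})
        for contenedor in contenedores:
            enlace = contenedor.find('a')  # first link inside the div
            if enlace is None or not enlace.get('href'):
                continue
            ulink = ureq(MY_URL + enlace['href'])
            paginas.append(ulink.read())
            ulink.close()
    return paginas  # an empty list instead of None when nothing is found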
I wrote a function to get the full source of a Korean dictionary web page and then cut out the URL of the mp3 file. It throws errors, which I think is because of the Korean characters.
How do I fix it?
import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word == None:
        return None
    link = 'https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord=' + word
    x = requests.get(link)
    soup = BeautifulSoup(x.content, "html.parser")
    url = ''
    for link in soup.find_all('img'):
        str_onClick = link.get('onclick')
        if str_onClick != None:
            if str_onClick.endswith(".mp3');"):
                url = str_onClick[len("javascript:fnSoundPlay('"): len(str_onClick) - len("');")]
                print(url)
    return url

cut_to_get_mp3_url('오')
The error:
Traceback (most recent call last):
File "/home/linh/Desktop/python/in_link.py", line 36, in <module>
save_file(cut_to_get_mp3_url(korean_word), str(count))
File "/home/linh/Desktop/python/in_link.py", line 24, in save_file
x = requests.get(mp3_url)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 519, in request
prep = self.prepare_request(req)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/usr/lib/python3.7/site-packages/requests/models.py", line 313, in prepare
self.prepare_url(url, params)
File "/usr/lib/python3.7/site-packages/requests/models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
I found a page that relates to my question here:
https://www.reddit.com/r/Korean/comments/60fzyq/download_sound_files_from_naver_dictionary/
I applied it to my case and it works :D
import urllib.request
import urllib.parse
import re

#koreanWords = input('Enter words: ').split()
koreanWords = ['한국']

for x in range(0, len(koreanWords)):
    url = ('https://krdict.korean.go.kr/vie/dicSearch/search?nation=vie&nationCode=2&ParaWordNo=&mainSearchWord' + urllib.parse.urlencode({'': koreanWords[x], 'kind': 'keyword'}))
    print(url)
    response = urllib.request.urlopen(url)
    html = response.read()
    html = html.decode("utf8")
    response.close()
    regexSearch = re.search(r"mySound = soundManager.createSound\({url:'(.*?)('|$)", html)
    mp3Page = regexSearch.group(1)
    print(mp3Page)
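The same idea can be folded back into the original requests-based function. The empty-URL error in the traceback happens because cut_to_get_mp3_url returns '' when no matching img is found, and that empty string is then passed to requests.get. A minimal sketch that lets requests handle the URL-encoding of the Korean word (via the params argument) and returns None when nothing is found, so the caller can check before downloading; it assumes the page still embeds the mp3 path in the img onclick handler, as in the original code:

import requests
from bs4 import BeautifulSoup

def cut_to_get_mp3_url(word):
    if word is None:
        return None
    params = {
        'nation': 'vie',
        'nationCode': '2',
        'ParaWordNo': '',
        'mainSearchWord': word,  # requests URL-encodes the Korean characters here
    }
    x = requests.get('https://krdict.korean.go.kr/vie/dicSearch/search', params=params)
    soup = BeautifulSoup(x.content, "html.parser")
    for img in soup.find_all('img'):
        onclick = img.get('onclick') or ''
        if onclick.endswith(".mp3');"):
            # strip the surrounding "javascript:fnSoundPlay('...');" wrapper
            return onclick[len("javascript:fnSoundPlay('"):-len("');")]
    return None  # nothing found; the caller should check before calling requests.get

print(cut_to_get_mp3_url('오'))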
I am trying to download the thumbnails from the Digital Commonwealth website in order to make an ImageJ visualization. Everything prints up until the JSON dump. I have code written by a friend to download the images, but I need a JSON file of the URLs before I continue. At the end it gives me the error "Object of type Tag is not JSON serializable".
Sorry for the spacing; I'm new to Stack Overflow and the indentation gets messed up when I copy and paste from Sublime.
from bs4 import BeautifulSoup
import requests
import re
import json

all_my_data = []

url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50"
results_page = requests.get(url)
page_html = results_page.text
soup = BeautifulSoup(page_html, "html.parser")

all_labels = soup.find_all("div", attrs={'class': 'document'})
for items in all_labels:
    my_data = {
        "caption": None,
        "url": None,
        "image url": None,
    }
    item_link = items.find('a')
    abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + item_link["href"]
    my_data["url"] = abs_url
    #print(abs_url)
    item_request = requests.get(abs_url)
    item_html = item_request.text
    item_soup = BeautifulSoup(item_html, "html.parser")
    all_field_divs = item_soup.find_all("div", attrs={'class': 'caption'})
    for field in all_field_divs:
        caption = field.find("a")
        cpation = caption.text
        my_data["caption"] = caption
        #print(caption)
    all_photo_urls = item_soup.find_all("div", attrs={'class': 'thumbnail'})
    for photo_url in all_photo_urls:
        photo = photo_url.find('img')
        photo_abs_url = "https://www.digitalcommonwealth.org/search?f%5Bcollection_name_ssim%5D%5B%5D=Produce+Crate+Labels&f%5Binstitution_name_ssim%5D%5B%5D=Boston+Public+Library&per_page=50" + photo['src']
        my_data['image url'] = photo_abs_url
        #print(photo_abs_url)
    all_my_data.append(my_data)
#print(all_my_data)

with open('fruit_crate_labels.json', 'w') as file_object:
    json.dump(all_my_data, file_object, indent=2)

print('Your file is now ready')
It prints this:
Traceback (most recent call last):
File "dh.py", line 54, in
json.dump(all_my_data, file_object, indent=2)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/init.py", line 179, in dump
for chunk in iterable:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 429, in _iterencode
yield from _iterencode_list(o, _current_indent_level)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 325, in _iterencode_list
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 438, in _iterencode
o = _default(o)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type Tag is not JSON serializable
Thanks for the help!
The following code on line 35:
cpation = caption.text
should be:
caption = caption.text
Then your code appears to work as you intended.
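For context, the JSON error itself comes from the fact that, because of the typo, my_data["caption"] ended up holding a BeautifulSoup Tag object instead of a string, and json.dump can only serialize basic Python types. A tiny self-contained sketch of the difference:

import json
from bs4 import BeautifulSoup

snippet = BeautifulSoup('<div class="caption"><a>Example label</a></div>', 'html.parser')
tag = snippet.find('a')

# A Tag object cannot be serialized directly...
try:
    json.dumps({"caption": tag})
except TypeError as exc:
    print(exc)  # Object of type Tag is not JSON serializable

# ...but its .text (a plain str) serializes fine.
print(json.dumps({"caption": tag.text}))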