Webscraping with BeautifulSoup - python-3.x

I'm trying to scrape some data for practice, but I'm stuck.
I would like to scrape the full date, not just the year, but I haven't been able to figure out how to do this so far.
Here's the segment I would like to scrape:
[screenshot of the HTML segment]
And here's my script so far:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
url = "https://www.senscritique.com/films/tops/top111"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
titles = []
years = []
notes = []
synopsys = []
infos = []
dates = []
movie_div = soup.find_all('div', class_ = 'elto-flexible-column')
for container in movie_div:
    title = container.h2.a.text
    titles.append(title)
    year = container.h2.find('span', class_ = 'elco-date').text
    year = year.replace('(', '')
    year = year.replace(')', '')
    years.append(year)
    sy = container.find('p', class_ = 'elco-description').text
    synopsys.append(sy)
    note = float(container.div.a.text)
    notes.append(note)
    info = container.find('p', class_ = 'elco-baseline elco-options').text
    #type = re.sub(r'[a-z]+', '', type)
    infos.append(info)
    soup = container.find('p', class_ = 'elco-baseline elco-options')
    for i in soup:
        i = soup.find('time')
        dates.append(i)
print(dates[0])
And here are the results:
[screenshot of the printed output]
I would just like to get "1957-04-10" or "10 avril 1957", either one! But I cannot figure it out. I've tried many things, and this is the best I have so far.
Thanks :)

You can use the .text property of the <time> tag to get the date:
import requests
from bs4 import BeautifulSoup
url = 'https://www.senscritique.com/films/tops/top111'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for movie in soup.select('.elto-item'):
    title = movie.select_one('[id^="product-title"]').text
    time = movie.select_one('time')
    time = time.text if time else '-'
    print('{:<40} {}'.format(title, time))
Prints:
12 hommes en colère 10 avril 1957
Harakiri 16 septembre 1962
Barberousse 3 avril 1965
Le Bon, la Brute et le Truand 23 décembre 1966
Les Sept Samouraïs 26 avril 1954
Il était une fois dans l'Ouest 21 décembre 1968
Il était une fois en Amérique 23 mai 1984
Le Parrain 24 mars 1972
Le Trou 18 mars 1960
Dersou Ouzala 2 août 1975
Point limite 7 octobre 1964
Entre le ciel et l'enfer 1 mars 1963
...and so on.
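If you want the ISO form ("1957-04-10") instead, that value usually lives in the datetime attribute of the <time> tag rather than in its text. A minimal sketch, assuming the page's markup looks like <time datetime="1957-04-10">10 avril 1957</time>:
import requests
from bs4 import BeautifulSoup

url = 'https://www.senscritique.com/films/tops/top111'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

for movie in soup.select('.elto-item'):
    time_tag = movie.select_one('time')
    iso_date = time_tag.get('datetime', '-') if time_tag else '-'       # e.g. "1957-04-10"
    display_date = time_tag.get_text(strip=True) if time_tag else '-'   # e.g. "10 avril 1957"
    print(iso_date, display_date)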

I think something like this would do it for you, just returning the date.
tags = soup('time')
date_formatted = list()
for tag in tags:
    date_formatted.append(tag.contents[0])
print(date_formatted[0])
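One caveat: tag.contents[0] raises an IndexError if a <time> tag happens to be empty, so tag.get_text(strip=True) is a slightly more defensive way to pull the same string:
tags = soup('time')
date_formatted = [tag.get_text(strip=True) for tag in tags]
print(date_formatted[0])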

Related

Beautifulsoup - ValueError: No objects to concatenate

I'm trying to scrape multi-page Amazon reviews. My code is not capturing any of the parts I want to get.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = "https://www.amazon.fr/AmazonBasics-600-sacs-d%C3%A9jections-canines-distributeur/product-reviews/B00NABTG60/ref=cm_cr_getr_d_paging_btm_next_"
amazon_reviews = []
for page in range(2, 5):
    req = requests.get(url + str(page) + "?ie=UTF8&reviewerType=all_reviews&pageNumber=" + str(page))
    soup = BeautifulSoup(req.text, "html.parser")

    # Getting desired data from our parsed soup
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        client = item.find('a', {'data-hook': 'genome-widget'}).text.strip()
        title = item.find('a', {'data-hook': 'review-title'}).text.strip()
        date = item.find('span', {'data-hook': 'review-date'}).text.strip()
        rating = item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()
        text = item.find('span', {'data-hook': 'review-body'}).text.strip()
        amazon_reviews.append(pd.DataFrame({'title': title, 'date': date, 'text': text, 'rating': rating, 'client': client}, index = [0]))

out = pd.concat(amazon_reviews, ignore_index = True)
My output:
ValueError: No objects to concatenate
You have to send a user-agent via the headers parameter.
Don't build a DataFrame inside the for loop; collect plain dicts and create the DataFrame once at the end.
The client element selection was a bit off.
The pagination is injected using str.format.
Code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = "https://www.amazon.fr/AmazonBasics-600-sacs-d%C3%A9jections-canines-distributeur/product-reviews/B00NABTG60/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber={page}"
headers={'user-agent':'Mozilla/5.0'}
amazon_reviews = []
for page in range(1, 5):
    req = requests.get(url.format(page=page), headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")

    # Getting desired data from our parsed soup
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        client = item.find('div', {'class': 'a-profile-content'}).get_text(strip=True)
        #print(client)
        title = item.find('a', {'class': 'review-title'}).text.strip()
        #print(title)
        date = item.find('span', {'data-hook': 'review-date'}).text.strip()
        #print(date)
        rating = item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()
        #print(rating)
        text = item.find('span', {'data-hook': 'review-body'}).text.strip()
        #print(text)
        amazon_reviews.append({'title': title, 'date': date, 'text': text, 'rating': rating, 'client': client})

df = pd.DataFrame(amazon_reviews)
print(df)
Output:
title ... client
0 Parfaits ... Client d'Amazon
1 Tellement pratique ... Karen M
2 Génial ... Constance Jourdain
3 Bon ... Bernardini
4 Très bon produit ... Floriane Le Loher
5 Produit simple et facile d'utilisation ... K4rm4_Ez
6 La solidité ... thierry
7 Sacs à dejection + dévidoir ... M&N ABK
8 Bon produit ... Christophe FRANCOIS
9 Bonne qualité ... Neuray Gabriel
10 Très bien pour déjection canine ... PELEGRIN ERIC
11 Bonne idée ... Marine
12 Sac de qualité ... Jennifer A
13 conforme et pratique et solide ... G pay
14 Génial. ... Alban
15 Impeccable ... Marina C.
16 Pratique aux bonnes dimensions ... YVES CALVEZ
17 Solide et taille ok pour un labrador ... magnésium
18 Très pratique ... Client d'Amazon
19 très bon article ... berger fabienne
20 pratique ... Laetitia Hermann
21 Indispensable ... ronin
22 Pratique ... SylM
23 Top ... Emilie Ouviere
24 Bonne qualité ... Manon
25 Parfait ... Nicolas
26 Top ... Simon
27 Crochet énervant ! ... Jabousan
28 TOUJOURS LE MEILLEUR ... FRANKL FAN
29 Très bon produit ... Ludo96ci
30 Top pour le prix ! ... AlanLB
31 Très bien ! ... Client d'Amazon
32 Solide ... Lambourg
33 Sacs solides mais très difficiles à détacher l... ... Client d'Amazon
34 Bon rapport qualité prix ... GUYET
35 Top ... Client d'Amazon
36 Livraison rapide ... Yann
37 Il fait le job ... Rod
38 Bon produit ... Anais D
39 Pratique ... mario D.
[40 rows x 5 columns]
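If the DataFrame ever comes back empty again, a quick diagnostic before parsing further is to check the HTTP status code and how many review blocks were actually matched; a short sketch along those lines:
req = requests.get(url.format(page=1), headers=headers)
soup = BeautifulSoup(req.text, "html.parser")
print(req.status_code, len(soup.find_all('div', {'data-hook': 'review'})))
A 200 response with zero matched blocks usually means the page served a captcha or a different layout rather than the reviews themselves.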

Creating a CSV File from a Wikipedia table using Beautiful Soup

I am trying to use Beautiful Soup to scrape the first 3 columns from a table on this Wikipedia page.
I implemented the solution found here.
import requests
import lxml
import pandas as pd
from bs4 import BeautifulSoup
#requesting the page
url = 'https://en.wikipedia.org/wiki/List_of_winners_and_shortlisted_authors_of_the_Booker_Prize'
page = requests.get(url).text
#parsing the page
soup = BeautifulSoup(page, "lxml")
#selecting the table that matches the given class
table = soup.find('table',class_="sortable wikitable")
df = pd.read_html(str(table))
df = pd.concat(df)
print(df)
df.to_csv("booker.csv", index = False)
It worked like a charm and gave me exactly the output I was looking for:
[screenshot: Expected Output 1]
However, the solution above uses pandas, and I want to create the same output without using pandas.
I referred to the solution here, but the output I am getting looks like this:
[screenshot: Output 2]
Here is the code that generates "Output 2":
import csv
import requests
import lxml
from bs4 import BeautifulSoup
#requesting the page
url = 'https://en.wikipedia.org/wiki/List_of_winners_and_shortlisted_authors_of_the_Booker_Prize'
page = requests.get(url).text
#parsing the page
soup = BeautifulSoup(page, "lxml")
#selecting the table that matches the given class
table = soup.find('table',class_="sortable wikitable")
with open('output.csv', 'w', newline="") as file:
writer = csv.writer(file)
writer.writerow(['Year','Author','Title'])
for tr in table.find_all('tr'):
try:
td_1 = tr.find_all('td')[0].get_text(strip=True)
except IndexError:
td_1 = ""
try:
td_2 = tr.find_all('td')[1].get_text(strip=True)
except IndexError:
td_2 = ""
try:
td_3 = tr.find_all('td')[3].get_text(strip=True)
except IndexError:
td_3 = ""
writer.writerow([td_1, td_2,td_3])
So my question is: How do I get the expected output without using Pandas?
P.S: I've tried to parse the rows in the table like this:
import requests
import lxml
from bs4 import BeautifulSoup
#requesting the page
url = 'https://en.wikipedia.org/wiki/List_of_winners_and_shortlisted_authors_of_the_Booker_Prize'
page = requests.get(url).text
#parsing the page
soup = BeautifulSoup(page, "lxml")
#selecting the table that matches the given class
table = soup.find('table',class_="sortable wikitable")
rows = table.find_all('tr')
for row in rows:
    cell = row.td
    if cell is not None:
        print(cell.get_text())
        print(cell.next_sibling.next_sibling.get_text())
    else:
        print("heehee")
But the output I get looks like this:
heehee
1969
Barry England
Nicholas Mosley
Iris Murdoch
Muriel Spark
Gordon Williams
1970
A. L. Barker
Elizabeth Bowen
Iris Murdoch
William Trevor
Terence Wheeler
1970 Awarded in 2010 as the Lost Man Booker Prize[a]
Nina Bawden
Shirley Hazzard
Mary Renault
Muriel Spark
Patrick White
1971
Thomas Kilroy
Doris Lessing
Mordecai Richler
Derek Robinson
Elizabeth Taylor
1972
Susan Hill
Thomas Keneally
Try the following to get your desired results. Make sure your bs4 version is up to date, or at least 4.7.0, so that it supports the CSS pseudo-selectors used within the script.
import csv
import lxml
import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_winners_and_shortlisted_authors_of_the_Booker_Prize'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
with open('output.csv', 'w', newline="") as file:
writer = csv.writer(file)
writer.writerow(['Year','Author','Title'])
for row in soup.select('table.wikitable > tbody > tr')[1:]:
try:
year = row.select_one("td[rowspan]").get_text(strip=True)
except AttributeError: year = ""
try:
author = row.select_one("td:not([rowspan]) > a[title]").get_text(strip=True)
except AttributeError: author = ""
try:
title = row.select_one("td > i > a[title], td > i").get_text(strip=True)
except AttributeError: title = ""
writer.writerow([year,author,title])
print(year,author,title)
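Note that because the Year cell uses rowspan, only the first row of each prize year carries a year and the remaining rows get "". If you want the year repeated on every row, as in the pandas output, one option is to carry the last seen value forward; a sketch reusing the loop above:
last_year = ""
for row in soup.select('table.wikitable > tbody > tr')[1:]:
    cell = row.select_one("td[rowspan]")
    if cell:
        last_year = cell.get_text(strip=True)
    year = last_year
    # ...then extract author and title exactly as above and call writer.writerow([year, author, title])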
The easiest way is to use pandas directly:
import pandas as pd
url = "https://en.wikipedia.org/wiki/List_of_winners_and_shortlisted_authors_of_the_Booker_Prize"
df = pd.read_html(url)[0][["Year", "Author", "Title"]]
print(df)
Prints:
Year Author Title
0 1969 P. H. Newby Something to Answer For
1 1969 Barry England Figures in a Landscape
2 1969 Nicholas Mosley The Impossible Object
3 1969 Iris Murdoch The Nice and the Good
4 1969 Muriel Spark The Public Image
5 1969 Gordon Williams From Scenes Like These
6 1970 Bernice Rubens The Elected Member
7 1970 A. L. Barker John Brown's Body
8 1970 Elizabeth Bowen Eva Trout
9 1970 Iris Murdoch Bruno's Dream
10 1970 William Trevor Mrs Eckdorf in O'Neill's Hotel
11 1970 Terence Wheeler The Conjunction
12 1970 Awarded in 2010 as the Lost Man Booker Pr... J. G. Farrell Troubles
13 1970 Awarded in 2010 as the Lost Man Booker Pr... Nina Bawden The Birds on the Trees
14 1970 Awarded in 2010 as the Lost Man Booker Pr... Shirley Hazzard The Bay of Noon
15 1970 Awarded in 2010 as the Lost Man Booker Pr... Mary Renault Fire From Heaven
16 1970 Awarded in 2010 as the Lost Man Booker Pr... Muriel Spark The Driver's Seat
17 1970 Awarded in 2010 as the Lost Man Booker Pr... Patrick White The Vivisector
...
To CSV:
df.to_csv("data.csv", index=None)
This creates data.csv with the same three columns.

scrape a table in a website with python (no table tag)

I'm trying to scrape the daily stock value of a product. This is the web page: https://funds.ddns.net/f.php?isin=ES0110407097. And this is the code I'm trying:
import pandas as pd
from bs4 import BeautifulSoup
html_string = 'https://funds.ddns.net/f.php?isin=ES0110407097'
soup = BeautifulSoup(html_string, 'lxml')
new_table = pd.DataFrame(columns=range(0,2), index = [0])
row_marker = 0
column_marker = 0
for row in soup.find_all('tr'):
    columns = soup.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
print(new_table)
I would like to get the same format in Python that I see on the web page, both the date and the number. How can I do that, please?
There's a simpler way for that particular page:
import requests
import pandas as pd
url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)
new_table = pd.read_html(resp.text)[0]
print(new_table.head(5))
Output:
0 1
0 FECHA VL:EUR
1 2019-12-20 120170000
2 2019-12-19 119600000
3 2019-12-18 119420000
4 2019-12-17 119390000
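If you also want proper column names and real dates, pandas can handle that as well; a small sketch, assuming the first table row is the header as shown above:
new_table = pd.read_html(resp.text, header=0)[0]
new_table['FECHA'] = pd.to_datetime(new_table['FECHA'])   # turn the date strings into datetimes
print(new_table.head(5))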

How do I fix the AttributeError: 'NoneType' object has no attribute 'text'...when looping

I am a beginner, and answers on this forum have been invaluable. I am using Python 3 and Beautiful Soup to scrape (non-table) data from multiple pages on the same website by looping over the page number. It works, but I keep getting AttributeError: 'NoneType' object has no attribute 'text' after the first iteration.
Here is the code I have tried thus far:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []
# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)
pages = [str(i) for i in range(1,8)]
for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a.text
        addresses.append(address)
        geography = container.find('div', class_='_1dhrl').text
        geographies.append(geography)
        rent = container.find('div', class_='_3e12V').text
        rents.append(rent)
        unit = container.find('div', class_='_2tApa').text
        units.append(unit)
        availability = container.find('div', class_='_2P6xE').text
        availabilities.append(availability)

import pandas as pd

test_df = pd.DataFrame({'Street' : addresses,
                        'City-State-Zip' : geographies,
                        'Rent' : rents,
                        'BR/BA' : units,
                        'Units Available' : availabilities
                        })
print(test_df)
Here is the output:
240 Properties
<class 'bs4.element.ResultSet'>
30
Street City-State-Zip Rent BR/BA Units Available
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+ 1–2 Beds • 1–2 Baths 2 Units Available
Traceback (most recent call last):
File "renttucktabletest.py", line 60, in <module>
availability = container.find('div', class_='_2P6xE').text
AttributeError: 'NoneType' object has no attribute 'text'
The result I am looking for is all 240 listings in the pandas dataframe exactly like the first iteration shown in the output above. Can anyone help to fix this error? Would be much appreciated. Thank you!
As pointed out, the issue is that some of the containers are missing certain div elements, e.g. no 'unit' or 'availability' information.
One way to deal with this is to use if/else statements: append the text only if the element exists, otherwise append a NaN value. Something like:
import requests
import numpy as np
from bs4 import BeautifulSoup
import csv
import lxml
# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []
# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)
pages = [str(i) for i in range(1,8)]
for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a
        if address:
            addresses.append(address.text)
        else:
            addresses.append(np.nan)

        geography = container.find('div', class_='_1dhrl')
        if geography:
            geographies.append(geography.text)
        else:
            geographies.append(np.nan)

        rent = container.find('div', class_='_3e12V')
        if rent:
            rents.append(rent.text)
        else:
            rents.append(np.nan)

        unit = container.find('div', class_='_2tApa')
        if unit:
            units.append(unit.text)
        else:
            units.append(np.nan)

        availability = container.find('div', class_='_2P6xE')
        if availability:
            availabilities.append(availability.text)
        else:
            availabilities.append(np.nan)

import pandas as pd

test_df = pd.DataFrame({'Street' : addresses,
                        'City-State-Zip' : geographies,
                        'Rent' : rents,
                        'BR/BA' : units,
                        'Units Available' : availabilities
                        })
print(test_df)
Street City-State-Zip Rent \
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+
1 address not disclosed Tuckahoe, NY 10707 $2,510
2 address not disclosed Tuckahoe, NY 10707 $4,145
3 60 Washington St 1 60 Washington StTuckahoe, NY 10707 $3,500
4 269 Columbus Ave 5 269 Columbus AveTuckahoe, NY 10707 $2,700
BR/BA Units Available
0 1–2 Beds • 1–2 Baths 2 Units Available
1 1 Bed • 1 Bath NaN
2 2 Beds • 2 Bath NaN
3 3 Beds • 2 Bath NaN
4 2 Beds • 1 Bath NaN
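Since the same if/else pattern repeats for every field, you could also fold it into a small helper function (the name text_or_nan is just for illustration):
def text_or_nan(tag):
    # Return the element's text, or NaN if the element was not found
    return tag.text if tag else np.nan

for container in listing_containers:
    addresses.append(text_or_nan(container.a))
    geographies.append(text_or_nan(container.find('div', class_='_1dhrl')))
    rents.append(text_or_nan(container.find('div', class_='_3e12V')))
    units.append(text_or_nan(container.find('div', class_='_2tApa')))
    availabilities.append(text_or_nan(container.find('div', class_='_2P6xE')))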
If you pull the info from a script tag and treat it as JSON, that problem goes away: the JSON simply contains None or 0 where a lookup by class name would have raised an error.
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
def add_records(url, s):
    res = s.get(url)
    soup = bs(res.content, 'lxml')
    r = re.compile(r'window.__APPLICATION_CONTEXT__ = (.*)')
    data = soup.find('script', text=r).text
    script = r.findall(data)[0]
    items = json.loads(script)['store']['listings']['listings']

    for item in items:
        street = item['address']
        geography = ', '.join([item['city'], item['state'], item['zipCode']])
        rent = item['aggregates']['prices']['low']
        BR_BA = 'beds: ' + str(item['aggregates']['beds']['low']) + ' , ' + 'baths: ' + str(item['aggregates']['baths']['low'])
        units = item['aggregates']['totalAvailable']
        listingId = item['listingId']
        url = base_url + item['listingSeoPath']
        # all_info = item
        record = {'Street' : street,
                  'Geography' : geography,
                  'Rent' : rent,
                  'BR/BA' : BR_BA,
                  'Units Available' : units,
                  'ListingId' : listingId,
                  'Url' : url}
        results.append(record)

url = 'https://www.rent.com/new-york/tuckahoe-apartments?page={}'
base_url = 'https://www.rent.com/'
results = []

with requests.Session() as s:
    for page in range(1, 9):
        add_records(url.format(page), s)

df = pd.DataFrame(results, columns = ['Street', 'Geography', 'Rent', 'BR/BA', 'Units Available', 'ListingId', 'Url'])
print(df)
Here is another approach to achieve the same.
import pandas
import requests
from bs4 import BeautifulSoup
urls = ['https://www.rent.com/new-york/tuckahoe-apartments?page={}'.format(page) for page in range(1,9)]
def get_content(links):
    for url in links:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("._3PdAH"):
            d = {}
            d['address'] = items.select_one("[data-tid='property-title']").text
            try:
                d['geographies'] = items.select_one("[data-tid='listing-info-address']").text
            except AttributeError: d['geographies'] = ""
            try:
                d['rent'] = items.select_one("[data-tid='price']").text
            except AttributeError: d['rent'] = ""
            try:
                d['units'] = items.select_one("[data-tid='beds-baths']").text
            except AttributeError: d['units'] = ""
            try:
                d['availabilities'] = items.select_one("[data-tid='property-unitAvailText']").text
            except AttributeError: d['availabilities'] = ""
            dataframe.append(d)
    return dataframe

if __name__ == '__main__':
    dataframe = []
    item = get_content(urls)
    df = pandas.DataFrame(item)
    df.to_csv("output.csv", index=False)

Extracting data from web page to CSV file, only last row saved

I'm faced with the following challenge: I want to get all the financial data about companies, and I wrote code that does it. Let's say the result looks like this:
Unnamed: 0 I Q 2017 II Q 2017 \
0 Przychody netto ze sprzedaży (tys. zł) 137 134
1 Zysk (strata) z działal. oper. (tys. zł) -423 -358
2 Zysk (strata) brutto (tys. zł) -501 -280
3 Zysk (strata) netto (tys. zł)* -399 -263
4 Amortyzacja (tys. zł) 134 110
5 EBITDA (tys. zł) -289 -248
6 Aktywa (tys. zł) 27 845 26 530
7 Kapitał własny (tys. zł)* 22 852 22 589
8 Liczba akcji (tys. szt.) 13 921,975 13 921,975
9 Zysk na akcję (zł) -0029 -0019
10 Wartość księgowa na akcję (zł) 1641 1623
11 Raport zbadany przez audytora N N
...but repeated 464 times.
Unfortunately, when I want to save all 464 results in one CSV file, only the last result gets saved. Not all 464 results, just one... Could you help me save them all? Below is my code.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.bankier.pl/gielda/notowania/akcje'
page = requests.get(url)
soup = BeautifulSoup(page.content,'lxml')
# Find the second table on the page
t = soup.find_all('table')[0]
#Read the table into a Pandas DataFrame
df = pd.read_html(str(t))[0]
#get
names_of_company = df["Walor AD"].values
links_to_financial_date = []
#all linkt with the names of companies
links = []
for i in range(len(names_of_company)):
    new_string = 'https://www.bankier.pl/gielda/notowania/akcje/' + names_of_company[i] + '/wyniki-finansowe'
    links.append(new_string)

############################################################################

for i in links:
    url2 = f'https://www.bankier.pl/gielda/notowania/akcje/{names_of_company[0]}/wyniki-finansowe'
    page2 = requests.get(url2)
    soup = BeautifulSoup(page2.content, 'lxml')

    # Find the second table on the page
    t2 = soup.find_all('table')[0]

    df2 = pd.read_html(str(t2))[0]
    df2.to_csv('output.csv', index=False, header=None)
You've almost got it. You're just overwriting your CSV each time. Replace
df2.to_csv('output.csv', index=False, header=None)
with
with open('output.csv', 'a') as f:
    df2.to_csv(f, header=False)
in order to append to the CSV instead of overwriting it.
Also, your example doesn't work because this:
for i in links:
    url2 = f'https://www.bankier.pl/gielda/notowania/akcje/{names_of_company[0]}/wyniki-finansowe'
should be:
for i in links:
    url2 = i
When the website has no data, skip and move on to the next one:
try:
    t2 = soup.find_all('table')[0]
    df2 = pd.read_html(str(t2))[0]
    with open('output.csv', 'a') as f:
        df2.to_csv(f, header=False)
except:
    pass
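An alternative, if you'd rather end up with a single header row and one write at the end, is to collect the per-company frames in a list and concatenate them once; a sketch along those lines:
frames = []
for link in links:
    page2 = requests.get(link)
    soup = BeautifulSoup(page2.content, 'lxml')
    try:
        t2 = soup.find_all('table')[0]
        frames.append(pd.read_html(str(t2))[0])
    except (IndexError, ValueError):
        pass   # no financials table on this page, skip it

pd.concat(frames, ignore_index=True).to_csv('output.csv', index=False)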
