Web scrape specific data from IMDb - python-3.x

So I want to extract the movie scores of 8.7 and above from IMDb.
I have gotten this far, but I don't know what to do next:
import re
import requests
from bs4 import BeautifulSoup

l = list()
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    print(q)
It gives me all the scores; I just want 8.7 and above.
Thanks for answering!

Okay, so first of all:
The data presented is not consistent: there are two values, 200 and 206, which are indeed above 8.7 but don't fit a rating scale that runs from 0 to 10.
I have edited the code so it currently prints out the rating values.
Please note the expression:
if my_float >= 8.7 and my_float <= 10:
This expression not only makes sure that you get scores equal to or higher than 8.7, but also that they cannot exceed the maximum rating of 10.
import re
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    try:
        my_float = float(q)
        if my_float >= 8.7 and my_float <= 10:
            print(q)
    except ValueError as error:
        print(error)
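As a side note, Python supports chained comparisons, so the same range check can be written more compactly; the behavior is identical:
# Equivalent to: my_float >= 8.7 and my_float <= 10
if 8.7 <= my_float <= 10:
    print(q)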
Hope this helps. If you wish to do anything else with the data instead of printing it out, be sure to check out list comprehensions, or use the other code block provided below
(I noticed that you created an empty list, l = list()):
import re
import requests
from bs4 import BeautifulSoup

l = list()
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    try:
        my_float = float(q)
        if my_float >= 8.7 and my_float <= 10:
            l.append(my_float)
            print(q)
    except ValueError as error:
        print(error)
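For reference, the same filter as a list comprehension might look like the sketch below; the to_float helper is hypothetical, standing in for the try/except inside the loop:
import re
import requests
from bs4 import BeautifulSoup

def to_float(text):
    # Hypothetical helper: parse a float, returning None for non-numeric text
    try:
        return float(re.sub(r'\s+', ' ', text))
    except ValueError:
        return None

resp = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(resp.text, 'html.parser')
ratings = [to_float(x.text) for x in soup.find_all('strong')]
l = [v for v in ratings if v is not None and 8.7 <= v <= 10]
print(l)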

Just extract the rating, convert it to float, and do the comparison:
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = bs(r.content, 'lxml')
for tr in soup.select('.lister-list tr'):
    rating = float(tr.select_one('.imdbRating').text)
    if rating >= 8.7:
        print(tr.select_one('.titleColumn a').text.strip(), rating)
Less readable list comprehension:
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = bs(r.content, 'lxml')
results = [(tr.select_one('.titleColumn a').text.strip(),
            float(tr.select_one('.imdbRating').text))
           for tr in soup.select('.lister-list tr')
           if float(tr.select_one('.imdbRating').text) >= 8.7]
print(results)
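On Python 3.8+ an assignment expression avoids parsing the rating twice per row; a sketch of the same comprehension:
results = [
    (tr.select_one('.titleColumn a').text.strip(), rating)
    for tr in soup.select('.lister-list tr')
    if (rating := float(tr.select_one('.imdbRating').text)) >= 8.7
]
print(results)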

The following code shows how you can create a list of movies whose rating is greater than or equal to 8.7:
import re
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.find_all("tbody", class_="lister-list")
desired_list = []
for movie in s[0].findChildren("tr", recursive=False):
    title = movie.find_all("a")
    rating = movie.find_all("strong")
    q = re.sub(r'\s+', ' ', rating[0].text)
    try:
        rating = float(q)
        if rating >= 8.7:
            desired_list.append((rating, title[1].contents[0]))
    except Exception as e:
        print(e)
print(desired_list)

Related

How to scrape a table from a website when the BS4 selection won't find it?

I'm using the code below to scrape a table element from a URL (www.sfda.gov.sa/en/cosmetics-list), but it comes back empty.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-striped display'})
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

df = pd.DataFrame(res, columns=["ProductName", "Category", "Country", "Company"])
print(df)
Running the above code, I'm not getting any data.
Data is loaded via XHR, so you should use this endpoint to get your information:
import requests
import pandas as pd

url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
pd.DataFrame(requests.get(url).json()['results'])
Example
Loop over the number of pages in range() and collect all the data:
import requests
import pandas as pd

data = []
for i in range(1, 5):
    url = f'https://www.sfda.gov.sa/GetCosmetics.php?page={i}'
    data.extend(requests.get(url).json()['results'])

pd.DataFrame(data)
Output
A wide dataframe with the following columns:
id, cosmatics_Id, productNotificationsId, productNumber, status, productArName, productEnName, brandName, catArabic, catEnglish, counrtyAr, counrtyEn, manufactureType, packageVolume, unitAr, unitEn, barcode, manufacturearabicname, manufactureenglishname, listedNameAr, listedNameEn, imageUrl, batchNumber, country_of_manufacturing_English, country_of_manufacturing_Arabic, productCreationDate, productexpireddate, subCategory1, subCategoryAR, storageCircumstances, protectionInstructions, usageInstructions, notes, mainCommercialRecordNumber, manufacturingLicenseNumber
Sample rows as they appeared in the dump (values in column order, empty fields omitted):
Row 0: 549105, 58472, 10518, 2020-011019101291-245945, Active, ليتسيا كوبيبا, Litsea cubeba oil, MOKSHA LIFE STYLE, منتجات العناية بالبشرة, Skin products, الهند, India, Foreign, 250, ملي لتر, Milliliter (ml), 0, موكشا لايف ستايل برودكت, Moksha lifestyle products, مؤسسة شجور الارض للتجارة, shojoor alearth trading, India, الهند, 2020-09-28T09:40:46, 2025-10-05T09:40:46, Perfumes, العطور, room temperature, تاريخ انتهاء الصلاحية, الاستعمال الخارجي, 7016000957, FR555666
...
Row 9: 84386, 58481, 4031, 2016-0120132-048982, Active, جودي ثيرابي سيستيم للشعر بالبروتين, Judy protein & Silk hair therapy system, Judy, منتجات العناية بالشعر وفروة الرأس, Hair and scalp products, الولايات المتحدة, United States, Foreign, 1000, ملي لتر, Milliliter (ml), 641243925950, معامل ناتيورال كوزماتيك, natural cosmetic labs USA Inc., شركه بيت جودي الدوليه للتجارة, bait gody for trading co., United States, الولايات المتحدة, 2016-12-25T14:40:44, 2027-01-01T14:40:44, Hair styling products, منتجات تصفيف الشعر, 7007289163, FR555666
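Rather than hardcoding range(1, 5), you could read the page count from the first response; this sketch assumes the JSON carries a pageCount field, as the class-based example further below relies on:
import requests
import pandas as pd

base = 'https://www.sfda.gov.sa/GetCosmetics.php'
first = requests.get(f'{base}?page=1').json()
pages = first['pageCount']  # assumed field, also used by the concurrent version below

data = list(first['results'])
for i in range(2, pages + 1):
    data.extend(requests.get(f'{base}?page={i}').json()['results'])

df = pd.DataFrame(data)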
You can use concurrent.futures to scrape pages concurrently and, when all pages are complete, concat the results into a single dataframe:
import concurrent.futures
import json
import os
import pandas as pd
import requests

class Scrape:
    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        return self.get_data(url=self.root_url).get("pageCount")

    @staticmethod
    def get_data(url: str) -> dict:
        with requests.Session() as request:
            response = request.get(url, timeout=30)
            if response.status_code != 200:
                print(response.raise_for_status())
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        return (pd
                .json_normalize(data=data, record_path="results")
                )[["productEnName", "catEnglish", "counrtyEn", "brandName"]].rename(
            columns={"productEnName": "ProductName", "catEnglish": "Category",
                     "counrtyEn": "Country", "brandName": "Company"})

if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)
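A design note: the work here is network-bound rather than CPU-bound, so a ThreadPoolExecutor would arguably be a lighter-weight drop-in than processes; a sketch reusing the class above:
import concurrent.futures
import pandas as pd

scraper = Scrape()  # the class defined above
page_range = range(1, scraper.pages + 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    final_df = pd.concat(executor.map(scraper.parse_data, page_range)).reset_index(drop=True)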

Extracting column contents only, so that all columns for each row are in the same row, using Python's BeautifulSoup

I have the following Python snippet in Jupyter Notebook that works.
The challenge I have is to extract just the rows of columnar data only.
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear on what you are after, but I think it's each individual row of data from the URL provided.
I couldn't find a way to use Beautiful Soup to parse the data you are after, but I did find a way to separate the rows using .split():
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")

text = soup.text                 # turn soup into text
text_split = text.split('\n\n')  # split the page into 3 sections
data = text_split[2]             # rows of data

# create df column titles using variable titles on page
col_titles = text_split[1].split('\n')
df = pd.DataFrame(columns=range(14))
df.columns = col_titles[1:]

# 'try/except' to catch the end of the index;
# loop through the text data building complete rows
try:
    complete_row = []
    n1 = 0  # used to track index
    n2 = 1
    rows = data.split('\n')
    for el in range(len(rows)):
        full_row = rows[n1] + rows[n2]
        complete_row.append(full_row)
        n1 = n1 + 2
        n2 = n2 + 2
except IndexError:
    print('end of loop')

# loop through rows of data, clean whitespace and append to df
for row in complete_row:
    elem = row.split(' ')
    df.loc[len(df)] = [el for el in elem if el]

# finished dataframe
df
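Alternatively, since the page is plain text with a fixed-size header and each record wrapped across two physical lines, pandas can do the splitting directly; a sketch, assuming the file layout (22 header lines, then 11 + 3 values per record) hasn't changed:
import numpy as np
import pandas as pd

url = "http://lib.stat.cmu.edu/datasets/boston"
raw = pd.read_csv(url, sep=r"\s+", skiprows=22, header=None)

# Each record spans two physical lines: 11 values, then 3 values
data = np.hstack([raw.values[::2, :], raw.values[1::2, :3]])
df = pd.DataFrame(data)
print(df.shape)  # expect (506, 14)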

Scrape a table in a website with Python (no table tag)

I'm trying to scrape the daily stock value of a product. This is the site: https://funds.ddns.net/f.php?isin=ES0110407097. And this is the code I'm trying:
import pandas as pd
from bs4 import BeautifulSoup

html_string = 'https://funds.ddns.net/f.php?isin=ES0110407097'
soup = BeautifulSoup(html_string, 'lxml')
new_table = pd.DataFrame(columns=range(0, 2), index=[0])

row_marker = 0
column_marker = 0
for row in soup.find_all('tr'):
    columns = soup.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
print(new_table)
I would like to get the same format in Python that I can see on the web, both the dates and the numbers. How can I get it, please?
There's a simpler way for that particular page:
import requests
import pandas as pd
url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)
new_table = pd.read_html(resp.text)[0]
print(new_table.head(5))
Output:
            0          1
0       FECHA     VL:EUR
1  2019-12-20  120170000
2  2019-12-19  119600000
3  2019-12-18  119420000
4  2019-12-17  119390000
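Since read_html leaves the header in row 0 here, you may want to promote it and parse the types; a short sketch continuing from the snippet above (column names taken from the output):
new_table.columns = new_table.iloc[0]  # ['FECHA', 'VL:EUR']
new_table = new_table.drop(0).reset_index(drop=True)
new_table['FECHA'] = pd.to_datetime(new_table['FECHA'])
new_table['VL:EUR'] = pd.to_numeric(new_table['VL:EUR'])
print(new_table.dtypes)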

How to fix the indexing error and scrape the data from a webpage

I want to scrape data from a webpage from the Wayback Machine using pandas. I used a string split to split some strings when they're present.
The URL for the webpage is the one used in the code below.
Here is my code:
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm"
dfs = pd.read_html(url)
df = dfs[0]
idx = df[df[0] == '\xa0Next >>'].index[0]
# Error mentioned in comment happens on the above line.
cols = list(df.iloc[idx-1,:])
df.columns = cols
df = df[df['Const. No.'].notnull()]
df = df.loc[df['Const. No.'].str.isdigit()].reset_index(drop=True)
df = df.dropna(axis=1,how='all')
df['Leading Candidate'] = df['Leading Candidate'].str.split('i',expand=True)[0]
df['Leading Party'] = df['Leading Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Party'] = df['Trailing Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Candidate'] = df['Trailing Candidate'].str.split('iAssembly',expand=True)[0]
df.to_csv('Chhattisgarh_cand.csv', index=False)
The expected output from that webpage should be in CSV format.
You can use BeautifulSoup to extract the data. Pandas will help you process the data in an efficient way, but it's not meant for data extraction.
import pandas as pd
from bs4 import BeautifulSoup
import requests

response = requests.get('https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26')
soup = BeautifulSoup(response.text, 'lxml')

table_data = []
required_table = [table for table in soup.find_all('table') if 'Indian National Congress' in str(table)]
if required_table:
    for tr_tags in required_table[0].find_all('tr', {'style': 'font-size:12px;'}):
        td_data = []
        for td_tags in tr_tags.find_all('td'):
            td_data.append(td_tags.text.strip())
        table_data.append(td_data)

df = pd.DataFrame(table_data[1:])
# print(df.head())
df.to_csv("DataExport.csv", index=False)
You can expect a result like this in the pandas dataframe:
                0   1  ...       6                7
0        BILASPUR   5  ...  176436  Result Declared
1            DURG   7  ...   16848  Result Declared
2  JANJGIR-CHAMPA   3  ...  174961  Result Declared
3          KANKER  11  ...   35158  Result Declared
4           KORBA   4  ...    4265  Result Declared
The code below should get you the table at your URL ("Chhattisgarh Result Status") using a combination of BS and pandas; you can then save it as CSV:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26"
response = urllib.request.urlopen(url)
elect = response.read()
soup = BeautifulSoup(elect,"lxml")
res = soup.find_all('table')
df = pd.read_html(str(res[7]))
df[3]
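To then save that table in the CSV format the question asks for (reusing the filename from the question's own code):
df[3].to_csv('Chhattisgarh_cand.csv', index=False)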

How do I fix the AttributeError: 'NoneType' object has no attribute 'text'...when looping

I am a beginner and answers on this forum have been invaluable. I am using Python 3 and Beautiful Soup to scrape (non-table) data from multiple web pages on the same website by looping the page number. It works but I keep getting the AttributeError: 'NoneType' object has no attribute 'text' after the first iteration.
Here is the code I have tried thus far:
import requests
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a.text
        addresses.append(address)

        geography = container.find('div', class_='_1dhrl').text
        geographies.append(geography)

        rent = container.find('div', class_='_3e12V').text
        rents.append(rent)

        unit = container.find('div', class_='_2tApa').text
        units.append(unit)

        availability = container.find('div', class_='_2P6xE').text
        availabilities.append(availability)

import pandas as pd

test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
Here is the output:
240 Properties
<class 'bs4.element.ResultSet'>
30
Street City-State-Zip Rent BR/BA Units Available
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+ 1–2 Beds • 1–2 Baths 2 Units Available
Traceback (most recent call last):
File "renttucktabletest.py", line 60, in <module>
availability = container.find('div', class_='_2P6xE').text
AttributeError: 'NoneType' object has no attribute 'text'
The result I am looking for is all 240 listings in the pandas dataframe exactly like the first iteration shown in the output above. Can anyone help to fix this error? Would be much appreciated. Thank you!
As pointed out, the issue is that some of the containers are missing certain div elements, e.g. no 'unit' or 'availability' information.
One way to deal with this is to use if/else statements: append only if the element exists, else append a NaN value. Something like:
import requests
import numpy as np
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a
        if address:
            addresses.append(address.text)
        else:
            addresses.append(np.nan)

        geography = container.find('div', class_='_1dhrl')
        if geography:
            geographies.append(geography.text)
        else:
            geographies.append(np.nan)

        rent = container.find('div', class_='_3e12V')
        if rent:
            rents.append(rent.text)
        else:
            rents.append(np.nan)

        unit = container.find('div', class_='_2tApa')
        if unit:
            units.append(unit.text)
        else:
            units.append(np.nan)

        availability = container.find('div', class_='_2P6xE')
        if availability:
            availabilities.append(availability.text)
        else:
            availabilities.append(np.nan)

import pandas as pd

test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
                     Street                      City-State-Zip     Rent  \
0  Quarry Place at Tuckahoe  64 Midland PlaceTuckahoe, NY 10707  $2,490+
1     address not disclosed                  Tuckahoe, NY 10707   $2,510
2     address not disclosed                  Tuckahoe, NY 10707   $4,145
3        60 Washington St 1  60 Washington StTuckahoe, NY 10707   $3,500
4        269 Columbus Ave 5  269 Columbus AveTuckahoe, NY 10707   $2,700

                  BR/BA    Units Available
0  1–2 Beds • 1–2 Baths  2 Units Available
1        1 Bed • 1 Bath                NaN
2       2 Beds • 2 Bath                NaN
3       3 Beds • 2 Bath                NaN
4       2 Beds • 1 Bath                NaN
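The repeated if/else pairs could also be factored into a small helper; safe_text below is a hypothetical name, with the same behavior as the branches above:
import numpy as np

def safe_text(tag):
    # Hypothetical helper: return the tag's text, or NaN when find() returned None
    return tag.text if tag else np.nan

# e.g. inside the container loop:
# rents.append(safe_text(container.find('div', class_='_3e12V')))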
If you pull the info from a script tag and treat it as JSON, that problem goes away. None or 0 is returned from the JSON where, had you been trying for class names etc., you would have got an error.
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

def add_records(url, s):
    res = s.get(url)  # use the shared session passed in
    soup = bs(res.content, 'lxml')
    r = re.compile(r'window.__APPLICATION_CONTEXT__ = (.*)')
    data = soup.find('script', text=r).text
    script = r.findall(data)[0]
    items = json.loads(script)['store']['listings']['listings']
    for item in items:
        street = item['address']
        geography = ', '.join([item['city'], item['state'], item['zipCode']])
        rent = item['aggregates']['prices']['low']
        BR_BA = 'beds: ' + str(item['aggregates']['beds']['low']) + ' , ' + 'baths: ' + str(item['aggregates']['baths']['low'])
        units = item['aggregates']['totalAvailable']
        listingId = item['listingId']
        url = base_url + item['listingSeoPath']
        # all_info = item
        record = {'Street': street,
                  'Geography': geography,
                  'Rent': rent,
                  'BR/BA': BR_BA,
                  'Units Available': units,
                  'ListingId': listingId,
                  'Url': url}
        results.append(record)

url = 'https://www.rent.com/new-york/tuckahoe-apartments?page={}'
base_url = 'https://www.rent.com/'
results = []

with requests.Session() as s:
    for page in range(1, 9):
        add_records(url.format(page), s)

df = pd.DataFrame(results, columns=['Street', 'Geography', 'Rent', 'BR/BA', 'Units Available', 'ListingId', 'Url'])
print(df)
Here is another approach to achieve the same.
import pandas
import requests
from bs4 import BeautifulSoup

urls = ['https://www.rent.com/new-york/tuckahoe-apartments?page={}'.format(page) for page in range(1, 9)]

def get_content(links):
    for url in links:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("._3PdAH"):
            d = {}
            d['address'] = items.select_one("[data-tid='property-title']").text
            try:
                d['geographies'] = items.select_one("[data-tid='listing-info-address']").text
            except AttributeError:
                d['geographies'] = ""
            try:
                d['rent'] = items.select_one("[data-tid='price']").text
            except AttributeError:
                d['rent'] = ""
            try:
                d['units'] = items.select_one("[data-tid='beds-baths']").text
            except AttributeError:
                d['units'] = ""
            try:
                d['availabilities'] = items.select_one("[data-tid='property-unitAvailText']").text
            except AttributeError:
                d['availabilities'] = ""
            dataframe.append(d)
    return dataframe

if __name__ == '__main__':
    dataframe = []
    item = get_content(urls)
    df = pandas.DataFrame(item)
    df.to_csv("output.csv", index=False)
