I am trying to extract data from this webpage :
https://www.oddsportal.com/basketball/usa/nba-2008-2009/results/
My code works, but it does not extract data that matches what we can see on the website.
For the odds, take the first event:
Orlando Magic - Los Angeles Lakers
The result is OK, but for odds 1 and 2 there are differences.
Here is my code:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# dataframe that will be populated
df = pd.DataFrame()
# driver options: maximized UI so the page loads completely
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(options=options)
url = 'https://www.oddsportal.com/basketball/usa/nba-2008-2009/results/'
driver.get(url)
# beautiful soup object built from the rendered page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for col in soup.find_all('tr', attrs={'deactivate'}):
    df = df.append(
        {
            # match date
            'date': col.findPreviousSibling(attrs={'center nob-border'}).text[0:-6],
            # match name
            'match_name': col.find('td', attrs={'class': 'name table-participant'}).text.replace('\xa0', ''),
            # match result
            'result': col.find('td', attrs={'class': 'center bold table-odds table-score'}).text,
            # home winning odd (is_empty is a helper defined elsewhere, not shown here)
            'home_odd': is_empty(col.find('td', attrs={'class': "odds-nowrp"})),
            # away odd
            'away_odd': is_empty(col.find('td', attrs={'class': "odds-nowrp"}).findNext(attrs={'class': "odds-nowrp"}))
        },
        ignore_index=True)
driver.quit()
df.head()
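One possible cause of the mismatch is that the odds cells are filled in by JavaScript after the initial page load, so page_source may be read before they have rendered. A minimal sketch, assuming the 'odds-nowrp' class used above is the right element to wait for, would be to wait explicitly before parsing:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# wait up to 15 seconds for at least one odds cell to be present before grabbing the source
WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'odds-nowrp'))
)
html = driver.page_source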
The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup

postal_codes = ["2000", "2010", "2020", "2030", "2040"]
places_by_postal_code = {}

def get_places(postal_code):
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    return [place.find("h2").text for place in places]

for postal_code in postal_codes:
    places = get_places(postal_code)
    places_by_postal_code[postal_code] = places

df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
print(df)
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.
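The error means the DataFrame ended up with zero columns, which happens when every postcode returned an empty list (the site may not serve listings to a plain requests call). A minimal sketch, assuming get_places() can return an empty list, that builds the frame row by row so the shape always matches:

rows = []
for postal_code, names in places_by_postal_code.items():
    if names:
        for name in names:
            rows.append({"Postcode": postal_code, "GP Practice Name": name})
    else:
        # keep a row even when nothing was found, so the postcode is still visible
        rows.append({"Postcode": postal_code, "GP Practice Name": None})

df = pd.DataFrame(rows, columns=["Postcode", "GP Practice Name"])
print(df)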
from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
Year = []
CompanyName = []
Rank = []
Score = []
print('\n>>Process started please wait\n\n')

for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First','Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash','Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash','First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
For each URL, the expected output (year, rank, title and score) is 100 rows, but I'm getting only 10.
You can iterate through the years and pages like this.
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        else:
            print('page: %s' % page)
            tableReturn = tableReturn.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = results.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
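If you also want the CSV your original script wrote, the combined frame can simply be saved afterwards; no column renaming is attempted here, since the field names inside the JSON payload aren't shown above:

results.to_csv('Vault_scrap.csv', index=False)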
My web scraping script is returning duplicate results for some reason. I've tried so many alternatives, but just can't get it to work whatsoever. Can anyone help, please?
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []
csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)
            else:
                print('no-business')

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([business, names])
csv_file.close()
It's currently returning duplicate values for every URL.
What it needs to do is return one 'business' and one 'names' value per url call. If there is no 'business' or 'name', it needs to return a value of 'no-business' or 'no-name'.
Can anyone please help me?
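For reference, one way to get exactly one row per URL is to handle each page as it is fetched and fall back to the placeholder strings when a heading is missing. A minimal sketch, reusing pages, csv_writer and the selectors from the code above (whether those selectors still match the live pages is an assumption):

for item in pages:
    page_soup = bs(requests.get(item).text, 'lxml')
    business_tag = page_soup.select_one('[class^=panel]:has([class^="gp notranslate"]) h1')
    manager_tag = page_soup.select_one('[class^=panel]:has([class^="staff-title"]:contains("Practice Manager")) h4')
    business = business_tag.text.strip() if business_tag else 'no-business'
    name = manager_tag.text.strip() if manager_tag else 'no-name'
    # exactly one row per URL, with fallbacks when nothing was found
    csv_writer.writerow([business, name])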
I don't know if it's the best way of doing it, but I used a set instead of a list to remove duplicates, and just before saving the file I convert the set to a list, like this:
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []
csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = set()
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.add(tag.text)
            else:
                print('no-business')

names = set()
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.add(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([list(business), list(names)])
csv_file.close()
You could use the following to generate the initial list of lists. You could also write each row to csv rather than appending to a final list.
import requests
from bs4 import BeautifulSoup as bs

results = []
with requests.Session() as s:
    for i in range(35899, 35909):
        r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
        soup = bs(r.content, 'lxml')
        row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
        if not row:
            row = ['no practice manager']
        practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
        row.insert(0, practice)
        results.append(row)
print(results)
Not sure how you want the listing laid out when there are multiple names.
import requests
from bs4 import BeautifulSoup as bs
import csv

with open('output.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    with requests.Session() as s:
        for i in range(35899, 35909):
            r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
            soup = bs(r.content, 'lxml')
            row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
            if not row:
                row = ['no practice manager']
            practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
            row.insert(0, practice)
            w.writerow(row)
Looks like the problem stems from the fact that, in some of these pages, there is no information at all, and you get a "Profile Hidden" error. I modified your code somewhat, to cover the first 5 pages. Aside from saving to file, it looks like this:
[same imports]
pages = []
for i in range(35899, 35904):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

soup = []
for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output looks like this:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- Di Palfrey
Caversham Group Practice --- Di Palfrey
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Notice that only the 2nd and 3rd entries are duplicated; that is (somehow, not sure why) caused by the "Hidden Profile" in the third page. So if you modify the main blocks of the code to:
business = []
for items in soup:
    if "ProfileHiddenError.aspx" in str(items):
        business.append('Profile Hidden')
    else:
        h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
        for i in h1Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h1':
                    business.append(tag.text)

names = []
for items in soup:
    if "ProfileHiddenError.aspx" in str(items):
        names.append('Profile Hidden')
    elif not "Practice Manager" in str(items):
        names.append('No Practice Manager Specified')
    else:
        h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
        for i in h4Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h4':
                    names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output this time is:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- No Practice Manager Specified
Profile Hidden --- Profile Hidden
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Hopefully this would help you to troubleshoot the problem.
I am a beginner and answers on this forum have been invaluable. I am using Python 3 and Beautiful Soup to scrape (non-table) data from multiple web pages on the same website by looping the page number. It works but I keep getting the AttributeError: 'NoneType' object has no attribute 'text' after the first iteration.
Here is the code I have tried thus far:
import requests
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')
    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))
    for container in listing_containers:
        address = container.a.text
        addresses.append(address)
        geography = container.find('div', class_='_1dhrl').text
        geographies.append(geography)
        rent = container.find('div', class_='_3e12V').text
        rents.append(rent)
        unit = container.find('div', class_='_2tApa').text
        units.append(unit)
        availability = container.find('div', class_='_2P6xE').text
        availabilities.append(availability)

import pandas as pd
test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
Here is the output:
240 Properties
<class 'bs4.element.ResultSet'>
30
Street City-State-Zip Rent BR/BA Units Available
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+ 1–2 Beds • 1–2 Baths 2 Units Available
Traceback (most recent call last):
File "renttucktabletest.py", line 60, in <module>
availability = container.find('div', class_='_2P6xE').text
AttributeError: 'NoneType' object has no attribute 'text'
The result I am looking for is all 240 listings in the pandas dataframe exactly like the first iteration shown in the output above. Can anyone help to fix this error? Would be much appreciated. Thank you!
As pointed out, the issue is that some of the containers are missing certain div elements, e.g. no 'unit' or 'availability' information.
One way to deal with this would be to use if-else statements: append only if the element exists, else append a NaN value. Something like:
import requests
import numpy as np
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')
    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))
    for container in listing_containers:
        address = container.a
        if address:
            addresses.append(address.text)
        else:
            addresses.append(np.nan)
        geography = container.find('div', class_='_1dhrl')
        if geography:
            geographies.append(geography.text)
        else:
            geographies.append(np.nan)
        rent = container.find('div', class_='_3e12V')
        if rent:
            rents.append(rent.text)
        else:
            rents.append(np.nan)
        unit = container.find('div', class_='_2tApa')
        if unit:
            units.append(unit.text)
        else:
            units.append(np.nan)
        availability = container.find('div', class_='_2P6xE')
        if availability:
            availabilities.append(availability.text)
        else:
            availabilities.append(np.nan)

import pandas as pd
test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
Street City-State-Zip Rent \
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+
1 address not disclosed Tuckahoe, NY 10707 $2,510
2 address not disclosed Tuckahoe, NY 10707 $4,145
3 60 Washington St 1 60 Washington StTuckahoe, NY 10707 $3,500
4 269 Columbus Ave 5 269 Columbus AveTuckahoe, NY 10707 $2,700
BR/BA Units Available
0 1–2 Beds • 1–2 Baths 2 Units Available
1 1 Bed • 1 Bath NaN
2 2 Beds • 2 Bath NaN
3 3 Beds • 2 Bath NaN
4 2 Beds • 1 Bath NaN
If you pull the info from a script tag and treat it as JSON, that problem goes away. The JSON returns None or 0 where, had you been searching by class name etc., you would have got an error.
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

def add_records(url, s):
    res = s.get(url)
    soup = bs(res.content, 'lxml')
    r = re.compile(r'window.__APPLICATION_CONTEXT__ = (.*)')
    data = soup.find('script', text=r).text
    script = r.findall(data)[0]
    items = json.loads(script)['store']['listings']['listings']
    for item in items:
        street = item['address']
        geography = ', '.join([item['city'], item['state'], item['zipCode']])
        rent = item['aggregates']['prices']['low']
        BR_BA = 'beds: ' + str(item['aggregates']['beds']['low']) + ' , ' + 'baths: ' + str(item['aggregates']['baths']['low'])
        units = item['aggregates']['totalAvailable']
        listingId = item['listingId']
        url = base_url + item['listingSeoPath']
        # all_info = item
        record = {'Street': street,
                  'Geography': geography,
                  'Rent': rent,
                  'BR/BA': BR_BA,
                  'Units Available': units,
                  'ListingId': listingId,
                  'Url': url}
        results.append(record)

url = 'https://www.rent.com/new-york/tuckahoe-apartments?page={}'
base_url = 'https://www.rent.com/'
results = []
with requests.Session() as s:
    for page in range(1, 9):
        add_records(url.format(page), s)

df = pd.DataFrame(results, columns=['Street', 'Geography', 'Rent', 'BR/BA', 'Units Available', 'ListingId', 'Url'])
print(df)
Here is another approach to achieve the same.
import pandas
import requests
from bs4 import BeautifulSoup

urls = ['https://www.rent.com/new-york/tuckahoe-apartments?page={}'.format(page) for page in range(1, 9)]

def get_content(links):
    for url in links:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("._3PdAH"):
            d = {}
            d['address'] = items.select_one("[data-tid='property-title']").text
            try:
                d['geographies'] = items.select_one("[data-tid='listing-info-address']").text
            except AttributeError:
                d['geographies'] = ""
            try:
                d['rent'] = items.select_one("[data-tid='price']").text
            except AttributeError:
                d['rent'] = ""
            try:
                d['units'] = items.select_one("[data-tid='beds-baths']").text
            except AttributeError:
                d['units'] = ""
            try:
                d['availabilities'] = items.select_one("[data-tid='property-unitAvailText']").text
            except AttributeError:
                d['availabilities'] = ""
            dataframe.append(d)
    return dataframe

if __name__ == '__main__':
    dataframe = []
    item = get_content(urls)
    df = pandas.DataFrame(item)
    df.to_csv("output.csv", index=False)
I want to get information from two websites and display it in 'real time' in the console.
To get the information from the websites I am using BeautifulSoup 4. I have read that the bottleneck when scraping websites is the connection or the website itself, so I wanted to use multithreading so the reads can happen 'simultaneously'. Below is my code:
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import threading
import time

link_website_one = 'http://www.website1.com'
link_website_two = 'http://www.website2.com'

def request_url(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'
    header = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=header)
    try:
        response = urllib.request.urlopen(req)
        return response
    except urllib.error.HTTPError as e:
        print('The server couldn\'t fulfill the request.')
        print(e.code, ': ', e.reason)
        return None
    except urllib.error.URLError as e:
        print('We failed to reach a server.')
        print(e.reason)
        return None

def new_entry(website, lis, arr, a_id):
    if website == 0:
        info1 = arr.find()
        info2 = arr.find('div', {'class': '1'}).find('span', {'class': '7'}).get_text().strip()
        info3 = arr.find('span', {'class': '2'}).get_text().strip()
        info4 = arr.find('span', {'class': '3'}).get_text().strip()
        info5 = arr.find('span', {'class': '4'}).get_text().strip()
        info6 = arr.find('div', {'class': '5'}).get_text().strip()
        lis.append({'info1': info1, 'info2': info2, 'info3': info3, 'info4': info4, 'info5': info5, 'info6': info6})
    elif website == 1:
        info1 = a_id
        info2 = arr.find('span', {'class': '8'}).get_text()
        info3 = arr.find('div', {'class': '9'}).get_text()
        info4 = arr.find_all('div', {'class': '10'})[0].get_text()
        info5 = arr.find_all('div', {'class': '10'})[1].get_text()
        info6 = arr.a["href"]
        lis.append({'info1': info1, 'info2': info2, 'info3': info3, 'info4': info4, 'info5': info5, 'info6': info6})

class AsyncSearch(threading.Thread):
    def __init__(self, website, iter_rounds, delay_time):
        threading.Thread.__init__(self)
        self.website = website
        self.iter_rounds = iter_rounds
        self.delay_time = delay_time

    def run(self):
        if self.website == 0:
            for z in range(self.iter_rounds):
                req_1 = request_url(link_website_one)
                content_1 = req_1.read()
                soup_1 = BeautifulSoup(content_1, 'lxml')
                arr_1 = soup_1.find_all('div', {'class': 'special_class'})
                for x in range(len(arr_1)):
                    id_as = int(arr_1[x].find('class123')['class2345'].split(',')[0].split('"')[3].strip())
                    if id_as not in found_ids_1:
                        found_ids_1.add(id_as)
                        new_entry(0, all_entries_1, arr_1[x], id_as)
                    #else:
                    #    break
                req_1.close()
                time.sleep(self.delay_time)
        elif self.website == 1:
            for z in range(self.iter_rounds):
                req_2 = request_url(link_website_two)
                content_2 = req_2.read()
                soup_2 = BeautifulSoup(content_2, 'lxml')
                arr_2 = soup_2.find_all('div', {'class': 'class445'})
                for x in range(len(arr_2)):
                    if arr_2[x].a['oid'] not in found_ids_2:
                        #print('Mobile: ', test[x].a[Inserat_ID])
                        found_ids_2.add(arr_2[x].a['oid'])
                        new_entry(1, all_entries_2, arr_2[x], arr_2[x].a['oid'])
                    #else:
                    #    break
                req_2.close()
                time.sleep(self.delay_time)

all_entries_1 = []
all_entries_2 = []
found_ids_1 = set()
found_ids_2 = set()

website1 = AsyncSearch(0, 10, 1)
website2 = AsyncSearch(1, 10, 1)

website1.start()
website2.start()
website1.join()
website2.join()
First, two lists (all_entries_1/2) and two sets (found_ids_1/2) are created.
The websites I am scraping offer 20 ads per page with unique ids. With the new_entry method you can say which website you want, which list the new entry should be appended to, which array contains the BeautifulSoup element, and the id of the ad you want to append.
Now for the multithreading I created a class AsyncSearch, where you can choose the website, the number of request iterations, and how long to wait before the next request.
The two sets found_ids_1/2 are there so you don't append an ad more than once to the all_entries lists.
Now to the problem. This code works.
But: if delay_time = 1 and iterations = 10, it needs 20 seconds to finish. Is there a faster approach to this problem? The .read() duration is between 0.12 and 0.17 seconds for website 1 and between 0.03 and 0.10 seconds for website 2.
If I said anything not understandable please ask.
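For what it's worth, the fixed time.sleep(delay_time) calls appear to dominate the runtime (10 iterations × 1 s in each thread), while the reads themselves only take tenths of a second, so shrinking the delay, or sleeping only for what is left of each polling interval, is likely to help more than adding threads. A minimal sketch with concurrent.futures, where fetch_and_parse is a hypothetical stand-in for one request/parse/new_entry pass per website:

import time
from concurrent.futures import ThreadPoolExecutor

def fetch_and_parse(website):
    # hypothetical placeholder for one polling pass:
    # request_url(...), .read(), BeautifulSoup parsing and the new_entry(...) bookkeeping
    pass

iterations, delay_time = 10, 1

with ThreadPoolExecutor(max_workers=2) as pool:
    for _ in range(iterations):
        start = time.monotonic()
        # fetch both websites concurrently and wait for both passes to finish
        list(pool.map(fetch_and_parse, [0, 1]))
        # sleep only for whatever is left of the polling interval
        time.sleep(max(0.0, delay_time - (time.monotonic() - start)))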