Splitting a list into 3 columns problem - BeautifulSoup, Requests, Pandas, Itertools (python-3.x)

Python novice back again! I got a lot of great help on this, but am now stumped. The code below scrapes soccer match data and scores from the Lehigh University soccer website. I am trying to split the result format ['T', '0-0(2 OT)'] into 3 columns: 'T', '0-0', and '2 OT', but I am running into problems. The issue lies in this part of the code:
for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
    result = result.get_text(strip=True).split(',')
I tried .split(','), but that only produced ['T', '0-0(2 OT)']. Is there a way to split that into 3 columns: 1) T, 2) 0-0, and 3) 2 OT?
All help much appreciated.
Thanks
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest

d = []
n = []
res = []
op = []
yr = []
with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        r = req.get(
            f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
                d.append(date.get_text(strip=True, separator=" "))
            for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
                n.append(name.get_text(strip=True))
            for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
                result = result.get_text(strip=True)
                #result = result.get_text(strip=True).split(',')
                res.append(result)
            if len(d) != len(res):
                res.append("None")
            for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
                op.append(opp.get_text(strip=True, separator=' '))
                yr.append(year)

data = []
for items in zip_longest(yr, d, n, op, res):
    data.append(items)
df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'opponent', 'Result']).to_excel('lehigh.xlsx', index=False)

I'm going to focus here only on splitting the res list into three columns, and you can incorporate it into your code as you see fit. So let's say you have this:
res1='T, 0-0(2 OT)'
res2='W,2-1OT'
res3='T,2-2Game called '
res4='W,2-0'
scores = [res1,res2,res3,res4]
We split them like this:
print("result","score","extra")
for score in scores:
    n_str = score.split(',')
    target = n_str[1].strip()
    print(n_str[0].strip(), ' ', target[:3], ' ', target[3:])
Output:
result score extra
T 0-0 (2 OT)
W 2-1 OT
T 2-2 Game called
W 2-0
Note that this assumes that no game ends with a double-digit score (say, 11-2, or whatever), so it should work for your typical soccer game but will fail with basketball :D
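If you also need to cope with double-digit scores or stray text after the score, a regex-based split is an option. This is only a rough sketch against the sample strings above; treat the pattern as an assumption rather than something verified against every result string on the site:
import re

scores = ['T, 0-0(2 OT)', 'W,2-1OT', 'T,2-2Game called ', 'W,2-0', 'W,11-2']

print("result", "score", "extra")
for score in scores:
    outcome, rest = [part.strip() for part in score.split(',', 1)]
    # capture the numeric score first, then keep whatever trails it as the "extra" note
    match = re.match(r'(\d+-\d+)\s*(.*)', rest)
    score_part, extra = match.groups() if match else (rest, '')
    print(outcome, score_part, extra.strip('() '))
The three values can then be appended to three separate lists (or a list of tuples) before building the DataFrame.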

Related

Python code to loop through a list of postcodes and get the GP practices for those postcodes by scraping the Yellow Pages (Australia)

The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup

postal_codes = ["2000", "2010", "2020", "2030", "2040"]
places_by_postal_code = {}

def get_places(postal_code):
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    return [place.find("h2").text for place in places]

for postal_code in postal_codes:
    places = get_places(postal_code)
    places_by_postal_code[postal_code] = places

df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
print(df)
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.
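For what it's worth, the Length mismatch error itself comes from pd.DataFrame.from_dict(..., orient='index') producing one column per list element: if every scraped list is empty (likely because the Yellow Pages listings are rendered client-side or the request is being blocked), the frame has zero columns and assigning a one-element column list raises exactly that error. A rough sketch of a construction that tolerates empty or unequal-length lists, using hypothetical placeholder data and one row per (postcode, practice) pair:
import pandas as pd

# hypothetical stand-in for the scraped results
places_by_postal_code = {"2000": ["Practice A", "Practice B"], "2010": []}

rows = [
    {"Postcode": code, "GP Practice Name": name}
    for code, names in places_by_postal_code.items()
    for name in names
]
df = pd.DataFrame(rows, columns=["Postcode", "GP Practice Name"])
print(df)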

How to scrape a table from a website when the BS4 selection won't find it?

I'm using the code below to scrape the table element from the URL (www.sfda.gov.sa/en/cosmetics-list), but it comes back empty.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-striped display'})
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

df = pd.DataFrame(res, columns=["ProductName", "Category", "Country", "Company"])
print(df)
Running the above code, I'm not getting any data.
The data is loaded via XHR, so you should use that endpoint to get your information:
url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
pd.DataFrame(requests.get(url).json()['results'])
Example
Loop over number of pages in range() and collect all data.
import requests
import pandas as pd

data = []
for i in range(1, 5):
    url = f'https://www.sfda.gov.sa/GetCosmetics.php?page={i}'
    data.extend(requests.get(url).json()['results'])

pd.DataFrame(data)
Output (rows 1-8 omitted; the frame is very wide, so the two sample rows are shown transposed as column: row 0 | row 9):
id: 549105 | 84386
cosmatics_Id: 58472 | 58481
productNotificationsId: 10518 | 4031
productNumber: 2020-011019101291-245945 | 2016-0120132-048982
status: Active | Active
productArName: ليتسيا كوبيبا | جودي ثيرابي سيستيم للشعر بالبروتين
productEnName: Litsea cubeba oil | Judy protein & Silk hair therapy system
brandName: MOKSHA LIFE STYLE | Judy
catArabic: منتجات العناية بالبشرة | منتجات العناية بالشعر وفروة الرأس
catEnglish: Skin products | Hair and scalp products
counrtyAr: الهند | الولايات المتحدة
counrtyEn: India | United States
manufactureType: Foreign | Foreign
packageVolume: 250 | 1000
unitAr: ملي لتر | ملي لتر
unitEn: Milliliter (ml) | Milliliter (ml)
barcode: 0 | 641243925950
manufacturearabicname: موكشا لايف ستايل برودكت | معامل ناتيورال كوزماتيك
manufactureenglishname: Moksha lifestyle products | natural cosmetic labs USA Inc.,
listedNameAr: مؤسسة شجور الارض للتجارة | شركه بيت جودي الدوليه للتجارة
listedNameEn: shojoor alearth trading | bait gody for trading co.
imageUrl: |
batchNumber: |
country_of_manufacturing_English: India | United States
country_of_manufacturing_Arabic: الهند | الولايات المتحدة
productCreationDate: 2020-09-28T09:40:46 | 2016-12-25T14:40:44
productexpireddate: 2025-10-05T09:40:46 | 2027-01-01T14:40:44
subCategory1: Perfumes | Hair styling products
subCategoryAR: العطور | منتجات تصفيف الشعر
storageCircumstances: room temperature |
protectionInstructions: تاريخ انتهاء الصلاحية |
usageInstructions: الاستعمال الخارجي |
notes: |
mainCommercialRecordNumber: 7016000957 | 7007289163
manufacturingLicenseNumber: FR555666 | FR555666
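If you would rather not hard-code range(1, 5), the endpoint's JSON also appears to carry a pageCount field (it is what the class-based answer below reads), so the loop can be driven by it; a sketch under that assumption:
import requests
import pandas as pd

root = 'https://www.sfda.gov.sa/GetCosmetics.php'
# assumes the JSON exposes a pageCount field alongside 'results'
page_count = int(requests.get(f'{root}?page=1').json()['pageCount'])

data = []
for i in range(1, page_count + 1):
    data.extend(requests.get(f'{root}?page={i}').json()['results'])

df = pd.DataFrame(data)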
You can use concurrent.futures to concurrently scrape pages and when all pages are complete concat the results into a single dataframe:
import concurrent.futures
import json
import os

import pandas as pd
import requests


class Scrape:
    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        return self.get_data(url=self.root_url).get("pageCount")

    @staticmethod
    def get_data(url: str) -> dict:
        with requests.Session() as request:
            response = request.get(url, timeout=30)
            if response.status_code != 200:
                print(response.raise_for_status())
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        return (pd
                .json_normalize(data=data, record_path="results")
                )[["productEnName", "catEnglish", "counrtyEn", "brandName"]].rename(
            columns={"productEnName": "ProductName", "catEnglish": "Category",
                     "counrtyEn": "Country", "brandName": "Company"}
        )


if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)

Extracting column contents only, so that all the columns for each row end up in the same row, using Python's BeautifulSoup

I have the following python snippet in Jupyter Notebooks that works.
The challenge I have is to extract just the rows of columnar data.
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
page
soup = bs(page.content)
soup
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear on what you are after, but I think it's each individual row of data from the URL provided.
I couldn't find a way to use Beautiful Soup to parse the data you are after, but I did find a way to separate the rows using .split().
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")

text = soup.text                  # turn soup into text
text_split = text.split('\n\n')   # split the page into 3 sections
data = text_split[2]              # rows of data

# create df column titles using variable titles on page
col_titles = text_split[1].split('\n')
df = pd.DataFrame(columns=range(14))
df.columns = col_titles[1:]

# 'try/except' to catch end of index,
# loop through text data building complete rows
try:
    complete_row = []
    n1 = 0  # used to track index
    n2 = 1
    rows = data.split('\n')
    for el in range(len(rows)):
        full_row = rows[n1] + rows[n2]
        complete_row.append(full_row)
        n1 = n1 + 2
        n2 = n2 + 2
except IndexError:
    print('end of loop')

# loop through rows of data, clean whitespace and append to df
for row in complete_row:
    elem = row.split(' ')
    df.loc[len(df)] = [el for el in elem if el]

# finished dataframe
df
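A shorter route for this particular file, as a sketch rather than what the loop above does: since each record in the Boston data spans two physical lines after a 22-line header, pandas can read it directly and the halves can be re-joined with NumPy (the column names below follow the variable list printed on the page):
import numpy as np
import pandas as pd

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# skip the 22 header lines; each record is split across two physical lines
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

cols = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
        "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
df = pd.DataFrame(np.column_stack([features, target]), columns=cols)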

Numbers stripped of '$' and ',' won't convert from str to int

I'm pretty new to Python, but am interested in taking tables, scraping them, and then running calculations. I took an income table from Wikipedia, stripped the number columns of their dollar signs and commas (e.g. $26,400 to 26400), and then tried to convert them to integers and set conditions based on the values. While the amounts show up without '$' or ',' in the updated dataframe, I still get an amount with '$' and ',' whenever I reference an individual entry or the columns.
Here's the code — apologies if I should have separated the blocks out more — it's my first post:
import requests
import pandas as pd
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_Maine_locations_by_per_capita_income"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
all_tables = soup.find_all('table', class_="wikitable")

A = []
B = []
C = []
D = []
E = []
F = []
G = []
for row in all_tables[0].findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 7:
        A.append(cells[0].text.strip())
        B.append(cells[1].text.strip())
        C.append(cells[2].text.strip())
        D.append(cells[3].text.strip())
        E.append(cells[4].text.strip())
        F.append(cells[5].text.strip())
        G.append(cells[6].text.strip())

df = pd.DataFrame(A, columns=['Rank'])
df['County'] = B
df['Per capita income'] = C
df['Median household income'] = D
df['Median family income'] = E
df['Population'] = F
df['Number of households'] = G
df
The initial frame shows '$' and ','.
At this point, I stripped columns C through E of all '$' and ','s. Here's column C, as an example.
df['Per capita income'] = df['Per capita income'].str.replace(',', '')
df['Per capita income'] = df['Per capita income'].str.replace('$', '')
I then attempted to convert the values (sans commas and dollar signs) from "str" to "int".
df['Per capita income'] = df['Per capita income'].astype(int)
The dollar sign and comma are gone, as seen below.
While the change displays properly in the dataframe, any reference to any cell still yields a "str" with a dollar sign and comma.
Argh!
I'm assuming that I'm missing a step somewhere, because I've tried a few methods of converting "str" to "int."
This will work, addressing the doubts you raised in the comments.
import requests
import pandas as pd
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_Maine_locations_by_per_capita_income"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
all_tables = soup.find_all('table', class_="wikitable")

A = []
B = []
C = []
D = []
E = []
F = []
G = []
for row in all_tables[0].findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 7:
        A.append(cells[0].text.strip())
        B.append(cells[1].text.strip())
        # strip '$' and ',' and convert to int while collecting the cell
        C.append(int(cells[2].text.strip().replace('$', '').replace(',', '')))
        D.append(cells[3].text.strip())
        E.append(cells[4].text.strip())
        F.append(cells[5].text.strip())
        G.append(cells[6].text.strip())

df = pd.DataFrame(A, columns=['Rank'])
df['County'] = B
df['Per capita income'] = C
df['Median household income'] = D
df['Median family income'] = E
df['Population'] = F
df['Number of households'] = G
df
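If you prefer to keep collecting the raw strings (as in the question's original code), the cleanup can also be done column-wise in pandas afterwards; a short sketch, assuming the money columns still hold strings like '$26,400':
# assumes df still holds the raw scraped strings, e.g. "$26,400"
money_cols = ['Per capita income', 'Median household income', 'Median family income']
for col in money_cols:
    df[col] = (df[col]
               .str.replace(r'[$,]', '', regex=True)  # drop dollar signs and commas
               .astype(int))
print(df['Per capita income'].dtype)  # int64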

Socket Error Exceptions in Python when Scraping

I am trying to learn scraping. I use exceptions lower down in the code to pass over errors because they don't affect the writing of data to the CSV.
I keep getting a "socket.gaierror", but in handling that there is a "urllib.error.URLError", and in handling that I get "NameError: name 'socket' is not defined", which seems circuitous.
I kind of understand that using these exceptions may not be the best way to run the code, but I can't seem to get past these errors and I don't know a way around them or how to fix them.
If you have any suggestions outside of fixing the exception handling, that would be greatly appreciated as well.
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

base_url = 'http://www.fangraphs.com/'  # used below for concatenation
years = ['2017', '2016', '2015']  # for enough data to run tests

# Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
    if link.has_attr('href'):
        player_urls.append(base_url + link['href'])

# Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
    if test_for_playerlinks in i:
        player_alpha_links.append(i)

# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
    data = urlopen(l)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            ind_player_urls.append(link['href'])

# Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
    if jan in j:
        players.append(j)

# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
    if pitcher in i:
        pitchers.append(i)
    else:
        pos_players.append(i)

# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017', '2016', '2015']
for i in pos_players:
    for year in years:
        split_pp.append(splits + i[12:] + '&season=' + year)
        gamel.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)

split_pitcher = []
gl_pitcher = []
for i in pitchers:
    for year in years:
        split_pitcher.append(splits + i[12:] + '&season=' + year)
        gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)

# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
    for i in split_pitcher:
        sauce = urlopen(i)
        soup = BeautifulSoup(sauce, "html.parser")
        table1 = soup.find_all('strong', {"style": "font-size:15pt;"})
        row_sp = []
        for name in table1:
            nam = name.get_text()
            row_sp.append(nam)
        table = soup.find_all('table', {"class": "rgMasterTable"})
        for h in table:
            he = h.find_all('tr')
            for i in he:
                td = i.find_all('td')
                for j in td:
                    row_sp.append(j.get_text())
        rows_sp.append(row_sp)
except (RuntimeError, TypeError, NameError, URLError, socket.gaierror):
    pass

try:
    with open('SplitsPitchingData2.csv', 'w') as fp:
        writer = csv.writer(fp)
        writer.writerows(rows_sp)
except (RuntimeError, TypeError, NameError):
    pass
I'm guessing your main problem was that you, without any sleep whatsoever, queried the site for a huge number of invalid urls (you create 3 urls for the years 2015-2017 for 22880 pitchers in total, but most of these do not fall within that scope, so you have tens of thousands of queries that return errors).
I'm surprised your IP wasn't banned by the site admin. That said, it would be better to do some filtering so you avoid all those error queries...
The filter I applied is not perfect. It checks whether any year in the list appears at the start or the end of the year range given on the site (e.g. '2004 - 2015'). This still creates some error links, but nowhere near the number the original script did.
In code it could look like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv

base_url = 'http://www.fangraphs.com/'
years = ['2017', '2016', '2015']

# Getting Links for letters
letter_links = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
    try:
        link = base_url + link['href']
        if 'players.aspx?letter=' in link:
            letter_links.append(link)
    except:
        pass
print("[*] Retrieved {} links. Now fetching content for each...".format(len(letter_links)))

# the data resides in two different base_urls:
splits_url = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs_url = 'http://www.fangraphs.com/statsd.aspx?'

# we need (for some reason) the pitchers in two lists - pitchers_split and pitchers_game_log -
# and the rest of the players in two others, pos_players_split and pos_players_game_log
pos_players_split = []
pos_players_game_log = []
pitchers_split = []
pitchers_game_log = []

# and if we wanted to do something with the data from the letter queries, let's put that in a list for safekeeping:
ind_player_urls = []

current_letter_count = 0
for link in letter_links:
    current_letter_count += 1
    data = urlopen(link)
    soup = BeautifulSoup(data, "html.parser")
    trs = soup.find('div', class_='search').find_all('tr')
    for player in trs:
        player_data = [tr.text for tr in player.find_all('td')]
        # To prevent tons of queries to fangraphs with invalid years - check if elements from the years list exist within the player stat:
        if any(year in player_data[1] for year in years if player_data[1].startswith(year) or player_data[1].endswith(year)):
            href = player.a['href']
            player_data.append(base_url + href)
            # player_data now looks like this:
            # ['David Aardsma', '2004 - 2015', 'P', 'http://www.fangraphs.com/statss.aspx?playerid=1902&position=P']
            ind_player_urls.append(player_data)
            # build the links for game_log and split
            for year in years:
                split = '{}{}&season={}'.format(splits_url, href[12:], year)
                game_log = '{}{}&type=&gds=&gde=&season={}'.format(game_logs_url, href[12:], year)
                # checking if the player is a pitcher or not. We append both link and name (player_data[0]),
                # so we don't need to extract the name later on
                if 'P' in player_data[2]:
                    pitchers_split.append([player_data[0], split])
                    pitchers_game_log.append([player_data[0], game_log])
                else:
                    pos_players_split.append([player_data[0], split])
                    pos_players_game_log.append([player_data[0], game_log])
    print("[*] Done extracting data for players for letter {} out of {}".format(current_letter_count, len(letter_links)))
    sleep(2)

# CONSIDER INSERTING CSV-PART HERE....

# Extracting and writing pitcher data to file
with open('SplitsPitchingData2.csv', 'a') as fp:
    writer = csv.writer(fp)
    for i in pitchers_split:
        try:
            row_sp = []
            rows_sp = []
            # all elements in pitchers_split are lists: player name is i[0], link is i[1]
            data = urlopen(i[1])
            soup = BeautifulSoup(data, "html.parser")
            # append name to row_sp from pitchers_split
            row_sp.append(i[0])
            # the page has 3 tables with the class rgMasterTable: the first is Standard,
            # the second Advanced, the 3rd Batted Ball - we're only grabbing Standard
            table_standard = soup.find_all('table', {"class": "rgMasterTable"})[0]
            trs = table_standard.find_all('tr')
            for tr in trs:
                td = tr.find_all('td')
                for content in td:
                    row_sp.append(content.get_text())
            rows_sp.append(row_sp)
            writer.writerows(rows_sp)
            sleep(2)
        except Exception as e:
            print(e)
            pass
Since I'm not sure precisely how you want the data formatted on output, you'll need to do some work on that yourself.
If you want to avoid waiting for all the letter_links to be extracted before you retrieve the actual pitcher stats (and fine-tune your output), you can move the csv writer part up so it runs as part of the letter loop. If you do this, don't forget to empty the pitchers_split list before grabbing another letter_link...
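On the NameError from the question itself: socket.gaierror and URLError can only appear in an except tuple if those names are imported, so the exception handler was failing before it could do its job. A minimal sketch of the imports and a handler, with fetch being just an illustrative helper name:
import socket
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

def fetch(url):
    # return the page body, or None when the request fails for a known network reason
    try:
        return urlopen(url, timeout=30).read()
    except (HTTPError, URLError, socket.gaierror) as e:
        print(f"skipping {url}: {e}")
        return None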
