Trying to Capture the Table from Multiple Pages With For Loops - python-3.x

Good day, everyone.
I'm trying to get the table on each page from the links appended to 'player_page.'
I want the stats per game for each player in that season, and the table I want is listed on the players' individual page. Each link appended is correct, but I'm having trouble capturing the correct info when running my loops.
Any idea what I'm doing wrong here?
Any help is appreciated.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.pro-football-reference.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
year = 2018

r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')
player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})

player_page = []
for player in player_list:
    for link in player.find_all('a', href=True):
        # names = str(link['href']).strip('')
        link = str(link['href'].strip('.htm'))
        player_page.append(url + link + '/gamelog' + '/' + str(year))

for page in player_page:
    dfs = pd.read_html(page)
    yearly_stats = []
    for df in dfs:
        yearly_stats.append(df)
    final_stats = pd.concat(yearly_stats)
    final_stats.to_excel('Fantasy2018.xlsx')

This works. The table columns change according to the player's position, I believe. Not everyone has tackle information, for example.
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

url = 'https://www.pro-football-reference.com'
year = 2018

r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')
player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})

dfs = []
for player in player_list:
    for link in player.find_all('a', href=True):
        name = link.getText()
        link = str(link['href'].strip('.htm'))
        try:
            # first table on the gamelog page: regular season
            df = pd.read_html(url + link + '/gamelog' + '/' + str(year))[0]
            for i, columns_old in enumerate(df.columns.levels):
                columns_new = np.where(columns_old.str.contains('Unnamed'), '', columns_old)
                df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
            df.columns = df.columns.map('|'.join).str.strip('|')
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
            df = df.dropna(subset=['Date'])
            df.insert(0, 'Name', name)
            df.insert(1, 'Moment', 'Regular Season')
            dfs.append(df)
        except:
            pass
        try:
            # second table, if present: playoffs
            df1 = pd.read_html(url + link + '/gamelog' + '/' + str(year))[1]
            for i, columns_old in enumerate(df1.columns.levels):
                columns_new = np.where(columns_old.str.contains('Unnamed'), '', columns_old)
                df1.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
            df1.columns = df1.columns.map('|'.join).str.strip('|')
            df1['Date'] = pd.to_datetime(df1['Date'], errors='coerce')
            df1 = df1.dropna(subset=['Date'])
            df1.insert(0, 'Name', name)
            df1.insert(1, 'Moment', 'Playoffs')
            dfs.append(df1)
        except:
            pass

dfall = pd.concat(dfs)
dfall.to_excel('Fantasy2018.xlsx')
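Because the gamelog columns differ by position, pd.concat aligns the frames on column names and fills the columns a player doesn't have with NaN. A minimal illustration with made-up columns:

import pandas as pd

# hypothetical data: pd.concat aligns differing column sets by name and
# fills the gaps with NaN, which is why players without tackle stats still
# end up in the combined frame
qb = pd.DataFrame({'Name': ['QB A'], 'Passing|Yds': [300]})
lb = pd.DataFrame({'Name': ['LB B'], 'Tackles|Solo': [7]})

combined = pd.concat([qb, lb], ignore_index=True)
print(combined)
#    Name  Passing|Yds  Tackles|Solo
# 0  QB A        300.0           NaN
# 1  LB B          NaN           7.0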

Related

How to access a returned variable from function 1 to use in function 2?

The objective is to acquire stock price data for each stock ticker, then assign the relevant variable its current price, i.e. the variable 'sandp' links to ticker_symbol 'GSPC', which equals the stock's closing price. This bit works. However, I wish to return each variable's value so I can use it within another function, but how do I access that variable and its value?
Here is my code:
def live_indices():
    """Acquire stock value from Yahoo Finance using the stock ticker as key, then assign the relevant variable to the respective value.
    ie. variable 'sandp' equates to the value gathered from the 'GSPC' stock ticker.
    """
    import requests
    import bs4

    ticker_symbol_1 = ['GSPC', 'DJI', 'IXIC', 'FTSE', 'NSEI', 'FCHI', 'N225', 'GDAXI']
    ticker_symbol_2 = ['IMOEX.ME', '000001.SS']  # Assigned to a separate list as the url for web scraping is different
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}

    all_indices_values = []
    for i in range(len(ticker_symbol_1)):
        url = 'https://uk.finance.yahoo.com/quote/%5E' + ticker_symbol_1[i] + '?p=%5E' + ticker_symbol_1[i]
        tmp_res = requests.get(url, headers=headers)
        tmp_res.raise_for_status()
        soup = bs4.BeautifulSoup(tmp_res.text, 'html.parser')
        indices_price_value = soup.select(
            '#quote-header-info > div.My\(6px\).Pos\(r\).smartphone_Mt\(6px\).W\(100\%\) > div.D\(ib\).Va\(m\).Maw\('
            '65\%\).Ov\(h\) > div > fin-streamer.Fw\(b\).Fz\(36px\).Mb\(-4px\).D\(ib\)')[0].text
        all_indices_values.append(indices_price_value)

    for i in range(len(ticker_symbol_2)):
        url = 'https://uk.finance.yahoo.com/quote/' + ticker_symbol_2[i] + '?p=' + ticker_symbol_2[i]
        tmp_res = requests.get(url, headers=headers)
        tmp_res.raise_for_status()
        soup = bs4.BeautifulSoup(tmp_res.text, 'html.parser')
        indices_price_value = soup.select(
            '#quote-header-info > div.My\(6px\).Pos\(r\).smartphone_Mt\(6px\).W\(100\%\) > div.D\(ib\).Va\(m\).Maw\('
            '65\%\).Ov\(h\) > div > fin-streamer.Fw\(b\).Fz\(36px\).Mb\(-4px\).D\(ib\)')[0].text
        all_indices_values.append(indices_price_value)

    sandp, dow, nasdaq, ftse100, nifty50, cac40, nikkei, dax, moex, shanghai = [all_indices_values[i] for i in range(10)]  # 10 stock tickers in total
    return sandp, dow, nasdaq, ftse100, nifty50, cac40, nikkei, dax, moex, shanghai
I want the next function to simply be given the variable name returned from the first function and print out the stock value. I have tried the below to no avail:
def display_value(stock_name):
    print(stock_name)

display_value(live_indices(sandp))
The obvious error here is that 'sandp' is not defined.
Additionally, the bs4 code runs fairly slowly; would it be best to use threads, or is there another way to speed things up?
This looks a bit complicated in my opinion, but to focus on your question: you are not returning a variable, you are returning a tuple of values.
def live_indices():
    all_indices_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    sandp, dow, nasdaq, ftse100, nifty50, cac40, nikkei, dax, moex, shanghai = [all_indices_values[i] for i in range(10)]  # 10 stock tickers in total
    return sandp, dow, nasdaq, ftse100, nifty50, cac40, nikkei, dax, moex, shanghai

live_indices()  # -> (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
What you more likely want is the assignment below, and it needs neither the list comprehension nor range(): simply slice your list:
def live_indices():
    all_indices_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    return all_indices_values

def display_value(x):
    print(x)

sandp, dow, nasdaq, ftse100, nifty50, cac40, nikkei, dax, moex, shanghai = live_indices()[:10]
display_value(sandp)  # -> 1
You may want to work with more structured data, so return a dict:
Example
import requests
from bs4 import BeautifulSoup

# reusing the User-Agent header from the question
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}

def live_indices():
    all_indices_values = {}
    symbols = ['GSPC', 'DJI', 'IXIC', 'FTSE', 'NSEI', 'FCHI', 'N225', 'GDAXI', 'IMOEX.ME', '000001.SS']
    for ticker in symbols:
        url = f'https://uk.finance.yahoo.com/lookup/all?s={ticker}'
        tmp_res = requests.get(url, headers=headers)
        tmp_res.raise_for_status()
        soup = BeautifulSoup(tmp_res.text, 'html.parser')
        indices_price_value = soup.select('#Main tbody>tr td')[2].text
        all_indices_values[ticker] = indices_price_value
    return all_indices_values

def display_value(live_indices):
    for ticker in live_indices.items():
        print(ticker)

display_value(live_indices())
Output
('GSPC', '3,873.33')
('DJI', '30,822.42')
('IXIC', '11,448.40')
('FTSE', '7,236.68')
('NSEI', '17,530.85')
('FCHI', '6,077.30')
('N225', '27,567.65')
('GDAXI', '12,741.26')
('IMOEX.ME', '2,222.51')
('000001.SS', '3,126.40')
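As for the speed question: the per-ticker requests are independent, so a thread pool from concurrent.futures is a reasonable way to overlap the network waits. A minimal sketch, reusing the lookup URL and selector from the dict-returning version above:

import concurrent.futures
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}

def fetch_price(ticker):
    # one independent request per ticker, so the network waits can overlap
    url = f'https://uk.finance.yahoo.com/lookup/all?s={ticker}'
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    return ticker, soup.select('#Main tbody>tr td')[2].text

symbols = ['GSPC', 'DJI', 'IXIC', 'FTSE', 'NSEI', 'FCHI', 'N225', 'GDAXI', 'IMOEX.ME', '000001.SS']

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    # map preserves the input order of the tickers; dict() collects the (ticker, price) pairs
    all_indices_values = dict(pool.map(fetch_price, symbols))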

Pass url column's values one by one to web crawler code in Python

Based on the answered code from this link, I'm able to create a new column: df['url'] = 'https://www.cspea.com.cn/list/c01/' + df['projectCode'].
Next, I would like to pass the url column's values to the following code one by one and append all the scraped contents as a dataframe.
import urllib3
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"  # url column's values should be passed here one by one
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")

index, data = [], []
for th in soup.select(".project-detail-left th"):
    h = th.get_text(strip=True)
    t = th.find_next("td").get_text(strip=True)
    index.append(h)
    data.append(t)

df = pd.DataFrame(data, index=index, columns=["value"])
print(df)
How could I do that in Python? Thanks.
Updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd

df = pd.read_excel('items_scraped.xlsx')

data = []
urls = df.url.tolist()
for url_link in urls:
    url = url_link
    # url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        h = th.get_text(strip=True)
        t = th.find_next("td").get_text(strip=True)
        index.append(h)
        data.append(t)
    df = pd.DataFrame(data, index=index, columns=["value"])
    df = df.T
    df.reset_index(drop=True, inplace=True)
    print(df)

df.to_excel('result.xlsx', index=False)
But it only saved one row into the Excel file.
You need to combine the dfs generated in the loop. You could add them to a list and then call pd.concat on that list.
import requests
from bs4 import BeautifulSoup
import pandas as pd

df = pd.read_excel('items_scraped.xlsx')
# data = []
urls = df.url.tolist()

dfs = []
for url_link in urls:
    url = url_link
    # url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        h = th.get_text(strip=True)
        t = th.find_next("td").get_text(strip=True)
        index.append(h)
        data.append(t)
    df = pd.DataFrame(data, index=index, columns=["value"])
    df = df.T
    df.reset_index(drop=True, inplace=True)
    print(df)
    dfs.append(df)

df = pd.concat(dfs)
df.to_excel('result.xlsx', index=False)
Use
urls = df.url.tolist()
to create a list of URLs, then iterate through them, using an f-string to insert each one into your base URL.
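For example, a minimal sketch, assuming the projectCode column from the linked question:

# build the full URLs from the project codes with an f-string
urls = [f'https://www.cspea.com.cn/list/c01/{code}' for code in df['projectCode']]
for url in urls:
    # pass each url to the scraping code shown above
    ...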

Loop pages and save contents in Excel file from website in Python

I'm trying to loop pages from this link and extract the interesting part.
Please see the contents in the red circle in the image below.
Here's what I've tried:
url = 'http://so.eastmoney.com/Ann/s?keyword=购买物业&pageindex={}'
for page in range(10):
    r = requests.get(url.format(page))
    soup = BeautifulSoup(r.content, "html.parser")
    print(soup)
xpath for each element (might be helpful for those that don't read Chinese):
/html/body/div[3]/div/div[2]/div[2]/div[3]/h3/span --> 【润华物业】
/html/body/div[3]/div/div[2]/div[2]/div[3]/h3/a --> 润华物业:关于公司购买理财产品的公告
/html/body/div[3]/div/div[2]/div[2]/div[3]/p/label --> 2017-04-24
/html/body/div[3]/div/div[2]/div[2]/div[3]/p/span --> 公告编号:2017-019 证券代码:836007 证券简称:润华物业 主办券商:国联证券
/html/body/div[3]/div/div[2]/div[2]/div[3]/a --> http://data.eastmoney.com/notices/detail/836007/AN201704250530124271,JWU2JWI2JWE2JWU1JThkJThlJWU3JTg5JWE5JWU0JWI4JTlh.html
I need to save the output to an Excel file. How could I do that in Python? Many thanks.
BeautifulSoup won't see this stuff, as it's rendered dynamically by JS, but there's an API endpoint you can query to get what you're after.
Here's how:
import requests
import pandas as pd


def clean_up(text: str) -> str:
    return text.replace('</em>', '').replace(':<em>', '').replace('<em>', '')


def get_data(page_number: int) -> dict:
    url = f"http://searchapi.eastmoney.com/business/Web/GetSearchList?type=401&pageindex={page_number}&pagesize=10&keyword=购买物业&name=normal"
    headers = {
        "Referer": f"http://so.eastmoney.com/Ann/s?keyword=%E8%B4%AD%E4%B9%B0%E7%89%A9%E4%B8%9A&pageindex={page_number}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:83.0) Gecko/20100101 Firefox/83.0",
    }
    return requests.get(url, headers=headers).json()


def parse_response(response: dict) -> list:
    for item in response["Data"]:
        title = clean_up(item['NoticeTitle'])
        date = item['NoticeDate']
        url = item['Url']
        notice_content = clean_up(" ".join(item['NoticeContent'].split()))
        company_name = item['SecurityFullName']
        print(f"{company_name} - {title} - {date}")
        yield [title, url, date, company_name, notice_content]


def save_results(parsed_response: list):
    df = pd.DataFrame(
        parsed_response,
        columns=['title', 'url', 'date', 'company_name', 'content'],
    )
    df.to_excel("test_output.xlsx", index=False)


if __name__ == "__main__":
    output = []
    for page in range(1, 11):
        for parsed_row in parse_response(get_data(page)):
            output.append(parsed_row)
    save_results(output)
This outputs:
栖霞物业购买资产的公告 - 2019-09-03 16:00:00 - 871792
索克物业购买资产的公告 - 2020-08-17 00:00:00 - 832816
中都物业购买股权的公告 - 2019-12-09 16:00:00 - 872955
开元物业:开元物业购买银行理财产品的公告 - 2015-05-21 16:00:00 - 831971
开元物业:开元物业购买银行理财产品的公告 - 2015-04-12 16:00:00 - 831971
盛全物业:拟购买房产的公告 - 2017-10-30 16:00:00 - 834070
润华物业购买资产暨关联交易公告 - 2016-08-23 16:00:00 - 836007
润华物业购买资产暨关联交易公告 - 2017-08-14 16:00:00 - 836007
萃华珠宝:关于拟购买物业并签署购买意向协议的公告 - 2017-07-10 16:00:00 - 002731
赛意信息:关于购买办公物业的公告 - 2020-12-02 00:00:00 - 300687
And saves this to an .xlsx file that can be easily handled by Excel.
PS. I don't know Chinese (?) so you'd have to look into the response contents and pick more stuff out.
Updated code based on @baduker's solution, but the page loop is not working.
import requests
import pandas as pd

for page in range(10):
    url = "http://searchapi.eastmoney.com/business/Web/GetSearchList?type=401&pageindex={}&pagesize=10&keyword=购买物业&name=normal"
    headers = {
        "Referer": "http://so.eastmoney.com/Ann/s?keyword=%E8%B4%AD%E4%B9%B0%E7%89%A9%E4%B8%9A&pageindex={}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:83.0) Gecko/20100101 Firefox/83.0",
    }
    response = requests.get(url, headers=headers).json()

    output_data = []
    for item in response["Data"]:
        # print(item)
        # print('*' * 40)
        title = item['NoticeTitle'].replace('</em>', '').replace(':<em>', '').replace('<em>', '')
        url = item['Url']
        date = item['NoticeDate'].split(' ')[0]
        company_name = item['SecurityFullName']
        content = item['NoticeContent'].replace('</em>', '').replace(':<em>', '').replace('<em>', '')
        # url_code = item['Url'].split('/')[5]
        output_data.append([title, url, date, company_name, content])

names = ['title', 'url', 'date', 'company_name', 'content']
df = pd.DataFrame(output_data, columns=names)
df.to_excel('test.xlsx', index=False)
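The likely issue is that the {} placeholders in url and Referer are never filled in with page, and output_data is rebuilt on every pass, so only the last request survives. A minimal sketch of the fix, keeping the same fields and the answer's 1-based page index (an assumption, not verified against the API):

import requests
import pandas as pd

output_data = []
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:83.0) Gecko/20100101 Firefox/83.0"}
for page in range(1, 11):
    # fill the page index into the URL on every iteration
    url = f"http://searchapi.eastmoney.com/business/Web/GetSearchList?type=401&pageindex={page}&pagesize=10&keyword=购买物业&name=normal"
    response = requests.get(url, headers=headers).json()
    for item in response["Data"]:
        title = item['NoticeTitle'].replace('</em>', '').replace(':<em>', '').replace('<em>', '')
        content = item['NoticeContent'].replace('</em>', '').replace(':<em>', '').replace('<em>', '')
        # accumulate across pages instead of resetting the list each time
        output_data.append([title, item['Url'], item['NoticeDate'].split(' ')[0], item['SecurityFullName'], content])

df = pd.DataFrame(output_data, columns=['title', 'url', 'date', 'company_name', 'content'])
df.to_excel('test.xlsx', index=False)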

Unable to scrape all data

from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']

Year = []
CompanyName = []
Rank = []
Score = []

print('\n>>Process started please wait\n\n')

for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First', 'Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash', 'Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash', 'First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
For each URL, the expected output for year, rank, title, and score is 100 rows, but I'm only getting 10.
You can iterate through the year and pages like this.
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        else:
            print('page: %s' % page)
            tableReturn = tableReturn.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = results.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
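Note that DataFrame.append was removed in pandas 2.0. With a current pandas, the same accumulation can be written by collecting the per-page frames in a list and concatenating once; a minimal sketch of the page loop:

import requests
import pandas as pd

def page_loop(year, url):
    # collect each page's frame in a list and concatenate once at the end
    frames = []
    for page in range(1, 101):
        payload = {'rank': '2', 'year': year, 'category': 'LBACCompany', 'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if not jsonData:
            break
        frames.append(pd.DataFrame(jsonData))
    return pd.concat(frames, sort=True).reset_index(drop=True) if frames else pd.DataFrame()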

How to use python to click the “load more” to extract links of names

I want to get the links of names from all the pages by clicking "load more", and I need help with the pagination.
I've got the logic to print the links for the names, but I need help with the pagination.
for pos in positions:
    url = "https://247sports.com/Season/2021-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool"
    two = requests.get("https://247sports.com/Season/2021-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool" + pos, headers=HEADERS)
    bsObj = BeautifulSoup(two.content, 'lxml')
    main_content = urljoin(url, bsObj.select(".data-js")[1]['href'])  # extracting the link leading to the page containing everything available here
    response = requests.get(main_content)
    obj = BeautifulSoup(response.content, 'lxml')
    names = obj.findAll("div", {"class": "recruit"})
    for player_name in names:
        player_name.find('a', {'class': ' rankings-page__name-link'})
        for all_players in player_name.find_all('a', href=True):
            player_urls = site + all_players.get('href')
            # print(player_urls)
I expect output : https://247sports.com/Player/Jack-Sawyer-46049925/
(links of all player names)
You can just iterate through the parameters in the request. Since you could continue iterating forever, I had it check for when players start to repeat (essentially when the next iteration doesn't add new players). It seems to stop after 21 pages, which gives 960 players.
import requests
from bs4 import BeautifulSoup

url = 'https://247sports.com/Season/2021-Football/CompositeRecruitRankings/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

player_links = []
prior_count = 0
for page in range(1, 101):
    #print ('Page: %s' %page)
    payload = {
        'ViewPath': '~/Views/SkyNet/PlayerSportRanking/_SimpleSetForSeason.ascx',
        'InstitutionGroup': 'HighSchool',
        'Page': '%s' % page}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    recruits = soup.find_all('div', {'class': 'recruit'})
    for recruit in recruits:
        print('https://247sports.com' + recruit.find('a')['href'])
        player_links.append('https://247sports.com' + recruit.find('a')['href'])
    current_count = len(list(set(player_links)))
    if prior_count == current_count:
        print('No more players')
        break
    else:
        prior_count = current_count
Output:
print (player_links)
['https://247sports.com/Player/Korey-Foreman-46056100', 'https://247sports.com/Player/Jack-Sawyer-46049925', 'https://247sports.com/Player/Tommy-Brockermeyer-46040211', 'https://247sports.com/Player/James-Williams-46049981', 'https://247sports.com/Player/Payton-Page-46055295', 'https://247sports.com/Player/Camar-Wheaton-46050152', 'https://247sports.com/Player/Brock-Vandagriff-46050870', 'https://247sports.com/Player/JT-Tuimoloau-46048440', 'https://247sports.com/Player/Emeka-Egbuka-46048438', 'https://247sports.com/Player/Tony-Grimes-46048912', 'https://247sports.com/Player/Sam-Huard-46048437', 'https://247sports.com/Player/Amarius-Mims-46079928', 'https://247sports.com/Player/Savion-Byrd-46078964', 'https://247sports.com/Player/Jake-Garcia-46053996', 'https://247sports.com/Player/Agiye-Hall-46055274', 'https://247sports.com/Player/Caleb-Williams-46040610', 'https://247sports.com/Player/JJ-McCarthy-46042742', 'https://247sports.com/Player/Dylan-Brooks-46079585', 'https://247sports.com/Player/Nolan-Rucci-46058902', 'https://247sports.com/Player/GaQuincy-McKinstry-46052990', 'https://247sports.com/Player/Will-Shipley-46056925', 'https://247sports.com/Player/Maason-Smith-46057128', 'https://247sports.com/Player/Isaiah-Johnson-46050757', 'https://247sports.com/Player/Landon-Jackson-46049327', 'https://247sports.com/Player/Tunmise-Adeleye-46050288', 'https://247sports.com/Player/Terrence-Lewis-46058521', 'https://247sports.com/Player/Lee-Hunter-46058922', 'https://247sports.com/Player/Raesjon-Davis-46056065', 'https://247sports.com/Player/Kyle-McCord-46047962', 'https://247sports.com/Player/Beaux-Collins-46049126', 'https://247sports.com/Player/Landon-Tengwall-46048781', 'https://247sports.com/Player/Smael-Mondon-46058273', 'https://247sports.com/Player/Derrick-Davis-Jr-46049676', 'https://247sports.com/Player/Troy-Franklin-46048840', 'https://247sports.com/Player/Tywone-Malone-46081337', 'https://247sports.com/Player/Micah-Morris-46051663', 'https://247sports.com/Player/Donte-Thornton-46056489', 'https://247sports.com/Player/Bryce-Langston-46050326', 'https://247sports.com/Player/Damon-Payne-46041148', 'https://247sports.com/Player/Rocco-Spindler-46049869', 'https://247sports.com/Player/David-Daniel-46076804', 'https://247sports.com/Player/Branden-Jennings-46049721', 'https://247sports.com/Player/JaTavion-Sanders-46058800', 'https://247sports.com/Player/Chris-Hilton-46055801', 'https://247sports.com/Player/Jason-Marshall-46051367', ... ]
