Script isn't retrieving all the info - python-3.x

I tried making a python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing) and some of them appear several times (Success Tetteh for example).
The output is too big to post it here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: For some fighters the records are wrong (Vasyl Lomachenko for example appears to have 28 wins, but he has 14)
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
def main():
fighter_names = []
fighter_wins = []
fighter_losses = []
fighter_draws = []
username = "username"
password = "password"
site = "https://boxrec.com/en/login"
payload = {
'_username': username,
'_password': password,
'login[go]': None
}
with Session() as s:
s.get(site)
s.post(site, data=payload, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
pages = numpy.arange(1, 19152, 20)
for page in pages:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
if not names_a:
print("solving captcha")
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
pyautogui.click(x=118, y=1061)
time.sleep(1)
pyautogui.click(x=1035, y=619)
time.sleep(2)
pyautogui.click(x=97, y=59)
time.sleep(1)
pyautogui.click(x=834, y=247)
time.sleep(2)
if not names_a:
print("please solve captcha manually")
while not names_a:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
wins_span = soup.find_all('span', class_='textWon')
loses_span = soup.find_all('span', class_='textLost')
draws_span = soup.find_all('span', class_='textDraw')
for container in names_a:
name = container.text
print(name)
fighter_names.append(name)
for container in wins_span:
wins = container.text
fighter_wins.append(wins)
for container in loses_span:
losses = container.text
fighter_losses.append(losses)
for container in draws_span:
draws = container.text
fighter_draws.append(draws)
fighters = {
"name": fighter_names,
"wins": fighter_wins,
"loses": fighter_losses,
"draws": fighter_draws
}
df = pd.DataFrame.from_dict(fighters, orient="index")
df = df.transpose()
df.to_csv("fighters.csv")
if __name__ == '__main__':
main()

I would refrain from using the same variable name to represent 2 separate things...Looks like you have page variable being used in 2 separate instances, which can be confusing.
As far as some of the issues, I'm assuming at some point there's a mismatch in the lists so the corresponding data isn't lining up with the correct fighter name, etc. or there's something off with the sites actual data/html. Not entirely sure as I haven't debugged. Reason being, have you considered using pandas to parse the table then just split the 'w-l-d' column? I think it would be far easier to let pandas do the parsing as to not miss something in the 900+ pages you need to go through.
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math
def main():
final_df = pd.DataFrame()
username = 'username'
password = 'password'
site = "https://boxrec.com/en/login"
payload = {
'_username': username,
'_password': password,
'login[go]': None
}
with Session() as s:
s.get(site)
s.post(site, data=payload, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
pages = numpy.arange(1, 19152, 20)
for page in pages:
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
if not names_a:
print("solving captcha")
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
pyautogui.click(x=118, y=1061)
time.sleep(1)
pyautogui.click(x=1035, y=619)
time.sleep(2)
pyautogui.click(x=97, y=59)
time.sleep(1)
pyautogui.click(x=834, y=247)
time.sleep(2)
if not names_a:
print("please solve captcha manually")
while not names_a:
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
df = pd.read_html(response.text)[-1]
df = df[['name','w-l-d']]
df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")] # <--- ADD THIS LINE
df[['wins','loses','draws']] = df['w-l-d'].str.split(expand=True)
df = df.drop('w-l-d', axis=1)
print('Page: %d of %d' %(((page-1)/20)+1,math.ceil(19152/20)))
final_df = final_df.append(df, sort=False).reset_index(drop=True)
final_df.to_csv("fighters.csv")
if __name__ == '__main__':
main()

Related

How to scrape if the whole page changes?

with this can reached the link 3 days ago:
import requests
from bs4 import BeautifulSoup
html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
FIXTV = vid['src']
now there is only one "text",
what can be done at this time?
No listen URL! SID not found!
it wouldn't bother me so much if the link didn't work, but when printing, if there is a "broken" link in the links, the whole thing doesn't work
in this example the CINELIFEHD works, as soon as I add FIXTV the print no longer works because of the changed page
import requests
from bs4 import BeautifulSoup
html_url55 = "http://streamstat.net/videoplayer.cgi?sid=14358315&ext=.m3u8"
html_response = requests.get(html_url55)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
CINELIFEHD = vid['src']
html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
FIXTV = vid['src']
print(
"#EXTM3U"
+ '\n' +"#EXTINF:0,tvg-logo=https://cinelife.com/wp-content/uploads/2020/04/cinelife_logo.png, CINE LIFE HD" + '\n' +
CINELIFEHD
+ '\n' + "#EXTINF:0,tvg-logo=http://1241.hu/userfiles/image/tvcsatornak/pic_atkoto_55_fix_tv.png, Fix" + '\n' +
FIXTV
)
from bs4 import BeautifulSoup
import requests
keys = [148177550, 14358315]
params = {
'ext': '.m3u8'
}
def get_soup(content):
return BeautifulSoup(content, 'lxml')
def main(url):
with requests.Session() as req:
for k in keys:
params['sid'] = k
r = req.get(url, params=params)
soup = get_soup(r.text)
try:
goal = soup.select_one('source')['src']
except TypeError:
goal = "N/A"
print("Key: {:10}, Result: {}".format(k, goal))
main('http://streamstat.net/videoplayer.cgi')
Output:
Key: 148177550, Result: N/A
Key: 14358315, Result: https://magselect-stirr.amagi.tv/playlist1080p.m3u8

Scraping all href links using Pagination

I've to Select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall and then click on team rankings and after that I've to grab href links of each ranked team.
I've completed till team rankings part now I want get links of each ranked team from all the pages in the pagination bar right now I'm getting links of all teams available on the first page only, I don't how to navigate to the next page.(below is the code)
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
all_states = each_state.find_all('a', href=True)
for a in all_states:
domain = site + a['href'] #domain consist oflinks of states
for r in domain:
page_link = domain
page_response = requests.get(page_link)
soup = BeautifulSoup(page_response.content, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
rankings_link = site + link.get('href')
#print(rankings_link)
for ert in rankings_link:
team_link = rankings_link
page_response1 = requests.get(team_link)
soup = BeautifulSoup(page_response1.content, "html.parser")
My_table = soup.find('table',{'class':'mx-grid sortable rankings-grid'})
links = My_table.findAll('a')
print(links)
output:
Everett, Methuen,
You could just iterate through pages within the query parameters.
import requests
from bs4 import BeautifulSoup
site = "https://www.maxpreps.com"
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")
all_states = soup.find('div', {'class': 'states'})
states_list = []
for each in all_states.find_all('a'):
states_list.append(each['href'].split('=')[-1])
states_list = states_list[:-1]
team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
break_loop = False
page=1
while break_loop == False:
print ('%s: Page %s' %(state, page))
payload = {
'page': str(page),
'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
'state': state
}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find('table')
if table == None:
break_loop = True
else:
page+=1
links = table.find_all('a')
for link in links:
team_links.append('https://www.maxpreps.com' + link['href'])
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']

How to add exception to skip AttributeError in my code?

I was trying to skip AttributeError in my code because there is always atrribute error while crawling, especially when trying to get the value of title, and content. I have tried to put "except AttributeError" somewhere but never work. Could anybody help me? I am using python 3.6
from bs4 import BeautifulSoup
import requests
import pymysql.cursors
urls2 = []
result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
urls = []
for link in links:
if "www" in link.text:
url = link.attrs['href']
urls.append(url)
num1=len(urls)
b=0
while b<10:
result2 = requests.get(urls[b])
src2 = result2.content
soup = BeautifulSoup(src2, 'lxml')
links2 = soup.find_all('a')
for link in links2:
if "selengkapnya" in link.text:
url2 = link.attrs['href']
urls2.append(url2)
b+=1
num=len(urls2)
i=0
while i<num:
html = requests.get(urls2[i])
src = html.content
soup = BeautifulSoup(src, 'lxml')
recordList = soup.findAll("div", {"class": "artikel", })
recordlist = soup.find_all('div', attrs={'class':'sampul2'})
connection = pymysql.connect(host='localhost',
user='root',
password='',
db='bs4-test',
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
try:
with connection.cursor() as cursor:
for record in recordList:
#except AttributeError:
#continue #WHERE TO PUT THIS EXCEPTION,TO SKIP ATRRIBUTEERRROR?
title = record.find("h2", {"class": "judul", }).get_text().strip()
date = record.find('i').next_sibling.next_sibling.next_sibling.replace('\t\t\t\t\t\t\t', '')
content = record.find("div", {"class":"teks"}).get_text().strip()
image = record.img['src']
cover = record.img['src']
sql = "INSERT INTO `artikel` (`jdl`, `tgl`, `kon`, `gambar`, `sampul`) VALUES (%s, %s, %s, %s, %s)"
cursor.execute(sql, (title, date, content, image, cover))
connection.commit()
print ("Record inserted successfully into table")
finally:
connection.close()
print("MySQL connection is closed")
i+=1
Just an example: if it is an url then append the url, if not then append null. Usually you would want the length of all lists to be the same, so that finally you could put them in a dataframe.
import numpy as np
links = soup.find_all('a')
urls = []
for link in links:
try:
url = link['href']
urls.append(url)
except:
urls.append(np.nan)

'list' object has no attribute 'timeout' and only prints first item in the table

I am trying to pull a table from a list of URL's. When I only input one URL it only prints out the first items in the table and when I add more URL's to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and adding more URL's?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
title_container = container.findAll('td')
Product_name = title_container[0].text.strip()
product_name_list.append(Product_name)
CatNo_container = container.findAll('td')
CatNo = CatNo_container[1].text.strip()
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price_container = container.findAll('td')
Price = Price_container[4].text.strip()
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)
You are passing a list here, uClient = uReq(my_urls) as my_urls where a string is required.
You need to pass the individual element of the list i.e. the strings.
Here is the edited code that works for multiple urls.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
for url in my_urls:
print("URL using: ", url)
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
items = container.findAll('tr')
for item in items:
item = item.text.split('\n')
Product_name = item[1]
product_name_list.append(Product_name)
CatNo = item[2]
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price = item[6]
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)

How to get certain text from a url links

So im trying to get all the statistics in the statistics box page on the url page for each team. An example of what the page looks like is on the hyperlink I put below. Im trying to have if so it prints out;
month : win %
month : win %
All time: win%
But I am not to sure how to write that code, since the last piece of code I wrote in the main was giving me an error.
http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners
import time
import requests
from bs4 import BeautifulSoup
def get_all(url, base): # Well called it will print all the team links
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page, 'html.parser')
for team_links in soup.select('div.details h3 a'):
members = int(team_links.find_next('th', text='Members:').find_next_sibling('td').text.strip().split()[0])
if members < 5:
continue
yield base + team_links['href']
next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')
while next_page:
# Gives the server a break
time.sleep(0.2)
r = requests.get(BASE_URL + next_page.find_previous('a')['href'])
page = r.text
soup = BeautifulSoup(page)
for team_links in soup.select('div.details h3 a'):
yield BASE_URL + team_links['href']
next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')
if __name__ == '__main__':
BASE_URL = 'http://www.gosugamers.net'
URL = 'http://www.gosugamers.net/counterstrike/teams'
for links in get_all(URL, BASE_URL): # When run it will generate all the links for all the teams
r = requests.get(links)
page = r.content
soup = BeautifulSoup(page)
for statistics in soup.select('div.statistics tr'):
win_rate = int(statistics.find('th', text='Winrate:').find_next_sibling('td'))
print(win_rate)
Not sure exactly what you want but this will get all the team stats:
from bs4 import BeautifulSoup, Tag
import requests
soup = BeautifulSoup(requests.get("http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners").content)
table = soup.select_one("table.stats-table")
head1 = [th.text.strip() for th in table.select("tr.header th") if th.text]
head2 = [th.text.strip() for th in table.select_one("tr + tr") if isinstance(th, Tag)]
scores = [th.text.strip() for th in table.select_one("tr + tr + tr") if isinstance(th, Tag)]
print(head1, head2, scores)
Output:
([u'Jun', u'May', u'All time'], [u'Winrate:', u'0%', u'0%', u'0%'], [u'Matches played:', u'0 / 0 / 0', u'0 / 0 / 0', u'0 / 0 / 0'])

Resources