Scraping all href links using Pagination - python-3.x

I have to select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall, then click on team rankings, and after that grab the href links of each ranked team.
I've completed the team-rankings part. Now I want to get the links of each ranked team from all the pages in the pagination bar. Right now I'm only getting the links of the teams on the first page, and I don't know how to navigate to the next page. (Code below.)
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
    all_states = each_state.find_all('a', href=True)
    for a in all_states:
        domain = site + a['href']  # domain consists of the links of the states
        for r in domain:
            page_link = domain
            page_response = requests.get(page_link)
            soup = BeautifulSoup(page_response.content, "html.parser")
            for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
                rankings_link = site + link.get('href')
                #print(rankings_link)
                for ert in rankings_link:
                    team_link = rankings_link
                    page_response1 = requests.get(team_link)
                    soup = BeautifulSoup(page_response1.content, "html.parser")
                    My_table = soup.find('table', {'class': 'mx-grid sortable rankings-grid'})
                    links = My_table.findAll('a')
                    print(links)
Output:
Everett, Methuen,

You could just iterate through pages within the query parameters.
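A minimal sketch of the idea, using the ranking-list endpoint and the ssid value from the full code below (requests builds the query string from the params dict):

import requests

payload = {'page': '2', 'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65', 'state': 'al'}
r = requests.get('https://www.maxpreps.com/m/rankings/list.aspx', params=payload)
print(r.url)  # .../m/rankings/list.aspx?page=2&ssid=...&state=al

Looping the page value until a page comes back without a rankings table covers every page in the pagination bar: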
import requests
from bs4 import BeautifulSoup

site = "https://www.maxpreps.com"
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")

all_states = soup.find('div', {'class': 'states'})
states_list = []
for each in all_states.find_all('a'):
    states_list.append(each['href'].split('=')[-1])
states_list = states_list[:-1]  # drop the last entry

team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
    break_loop = False
    page = 1
    while not break_loop:
        print('%s: Page %s' % (state, page))
        payload = {
            'page': str(page),
            'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
            'state': state
        }
        response = requests.get(url, params=payload)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:  # no table means we ran past the last page
            break_loop = True
        else:
            page += 1
            links = table.find_all('a')
            for link in links:
                team_links.append('https://www.maxpreps.com' + link['href'])
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']

Related

Script isn't retrieving all the info

I tried making a Python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing) and some of them appear several times (Success Tetteh, for example).
The output is too big to post it here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: For some fighters the records are wrong (Vasyl Lomachenko, for example, appears to have 28 wins, but he has 14).
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time


def main():
    fighter_names = []
    fighter_wins = []
    fighter_losses = []
    fighter_draws = []
    username = "username"
    password = "password"
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            page = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(page.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                page = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(page.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        page = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(page.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            wins_span = soup.find_all('span', class_='textWon')
            loses_span = soup.find_all('span', class_='textLost')
            draws_span = soup.find_all('span', class_='textDraw')
            for container in names_a:
                name = container.text
                print(name)
                fighter_names.append(name)
            for container in wins_span:
                wins = container.text
                fighter_wins.append(wins)
            for container in loses_span:
                losses = container.text
                fighter_losses.append(losses)
            for container in draws_span:
                draws = container.text
                fighter_draws.append(draws)
    fighters = {
        "name": fighter_names,
        "wins": fighter_wins,
        "loses": fighter_losses,
        "draws": fighter_draws
    }
    df = pd.DataFrame.from_dict(fighters, orient="index")
    df = df.transpose()
    df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()
I would refrain from using the same variable name to represent two separate things... you have the page variable being used in two separate instances, which can be confusing.
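A sketch of what that shadowing looks like (URL shortened here for readability):

pages = numpy.arange(1, 19152, 20)
for page in pages:                 # page is an integer offset here...
    page = s.get(url + str(page))  # ...and a Response object from here on
    # the captcha retries then build the URL with str(page) again, which now
    # embeds '<Response [200]>' instead of the offset number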
As for some of the issues, I'm assuming at some point there's a mismatch in the lists, so the corresponding data isn't lining up with the correct fighter name, etc., or there's something off with the site's actual data/HTML. I'm not entirely sure, as I haven't debugged it. For that reason, have you considered using pandas to parse the table and then just splitting the 'w-l-d' column? I think it would be far easier to let pandas do the parsing, so as not to miss something in the 900+ pages you need to go through.
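The core of that idea, as a sketch on made-up data (in the real script the frame would come from pd.read_html on each page, as below):

import pandas as pd

# toy stand-in for the parsed boxrec table
df = pd.DataFrame({'name': ['Fighter A', 'Fighter B'], 'w-l-d': ['14 1 0', '28 2 1']})
df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
df = df.drop('w-l-d', axis=1)
print(df)
#         name wins loses draws
# 0  Fighter A   14     1     0
# 1  Fighter B   28     2     1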
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math


def main():
    final_df = pd.DataFrame()
    username = 'username'
    password = 'password'
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            response = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(response.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                response = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(response.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        response = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(response.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            df = pd.read_html(response.text)[-1]
            df = df[['name', 'w-l-d']]
            df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]  # <--- ADD THIS LINE
            df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
            df = df.drop('w-l-d', axis=1)
            print('Page: %d of %d' % (((page - 1) / 20) + 1, math.ceil(19152 / 20)))
            final_df = final_df.append(df, sort=False).reset_index(drop=True)
    final_df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I print it at the moment, column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row, like this:
text text text text text
so I can later chop it up into different columns in Excel.
Is there maybe a transposing command I could use or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv

with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = id['id']
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')  # "Done, you can open oslo.csv"
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it, then join it back together with just a space, i.e. ' '.join(specs.text.split()).
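For instance, on a sample string shaped like the column in the question:

specs_text = 'text\ntext\ntext\ntext\ntext'
print(' '.join(specs_text.split()))  # text text text text text

Applied to your script: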
import requests
from bs4 import BeautifulSoup
import csv

with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = id['id']
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split())  # <-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')  # "Done, you can open oslo.csv"

How to actually use a loop in BeautifulSoup?

I'm trying to make a web crawler and use some loops inside it. The first loop runs well, but not the second. I always get this message: "During handling of the above exception, another exception occurred"
import requests
from bs4 import BeautifulSoup

result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
# This url is the main web; inside it there are 270 links to
# other websites. I get into those 270 webs and open every article
# in each web
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
urls = []
for link in links:
    if "www" in link.text:
        url = link.attrs['href']
        urls.append(url)

num1 = len(urls)
b = 0
while b < num1:
    result2 = requests.get(urls[b])
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    links2 = soup.find_all('a')
    urls2 = []
    for link in links2:
        if "selengkapnya" in link.text:
            url2 = link.attrs['href']
            urls2.append(url2)
    b += 1
# the code runs well until this part. If I print urls2, it gives
# urls that take me directly to specific articles

num = len(urls2)
i = 0
while i < num:
    result2 = requests.get(urls2[i])
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    links2 = soup.find_all('a')
    artikel = []
    isi = link.attrs['href']
    artikel.append(isi)
    print(artikel)
    i += 1
I expect to get all the article links from the website and put them into a list called artikel.
The problem is that you are assigning urls2 = [] and artikel = [] inside each loop iteration, effectively replacing the list every time, so at the end of the loop you keep only the last batch (or an empty list).
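A minimal sketch of the fix on toy data: create the accumulator once, before the loop, instead of on every pass.

# Wrong: the list is recreated on every iteration, keeping only the last batch
for batch in [[1, 2], [3, 4]]:
    collected = []
    collected.extend(batch)
print(collected)  # [3, 4]

# Right: create the list once, then keep appending to it
collected = []
for batch in [[1, 2], [3, 4]]:
    collected.extend(batch)
print(collected)  # [1, 2, 3, 4]

Applying the same idea to your crawler, you could use this code as a start: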
import requests
from bs4 import BeautifulSoup

result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
# This url is the main web; inside it there are 270 links to
# other websites. I get into those 270 webs and open every article
# in each web
src = result.content
soup = BeautifulSoup(src, 'lxml')

urls = []
urls2 = []
for link in soup.select('a[href]:contains(www)'):
    urls.append(link['href'])

print('Urls:')
for url in urls:
    print('Downloading {}'.format(url))
    result2 = requests.get(url)
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    for link in soup.select('a[href]:contains(selengkapnya)'):
        print('\tFound link {}'.format(link['href']))
        urls2.append(link['href'])

print('Articles:')
articles = []
for url2 in urls2:
    print('Downloading {}'.format(url2))
    result2 = requests.get(url2)
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    for link in soup.select('a[href]'):  # select, not find_all, for a CSS selector
        articles.append(link['href'])
print(articles)
Prints:
Urls:
Downloading http://www.ancolmekar.desa.id
Found link http://www.ancolmekar.desa.id/first/artikel/423
Found link http://www.ancolmekar.desa.id/first/artikel/421
Found link http://www.ancolmekar.desa.id/first/artikel/420
Found link http://www.ancolmekar.desa.id/first/artikel/419
Found link http://www.ancolmekar.desa.id/first/artikel/414
Found link http://www.ancolmekar.desa.id/first/artikel/413
Found link http://www.ancolmekar.desa.id/first/artikel/412
Found link http://www.ancolmekar.desa.id/first/artikel/410
Found link http://www.ancolmekar.desa.id/first/artikel/410
Found link http://www.ancolmekar.desa.id/first/artikel/100
Downloading http://www.arjasari.desa.id
Found link http://www.arjasari.desa.id/first/artikel/180
Found link http://www.arjasari.desa.id/first/artikel/190
...and so on.

Scraping a list with Python and BeautifulSoup

I am new to Python and trying to write some code that scrapes information from a website. I currently have:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'product-block__info')
    for item in items:
        for val in item.find_all('a', 'product-block'):
            stock = item.find_all('class', 'count_product_stock hidden')[0].text
            brand = item.find_all('div', 'brand')[0].text
            price = item.find_all('span', 'selling_price')[0].text
            print(items)
This returns the error IndexError: list index out of range. If I put 'product-block__info' in place of 'product-block', I am able to print the full list of the content within the 'product-block__info' tag on the page, but I'd like to select just a handful of elements and return those.
Can anyone explain to me what's happening here and how I can select just the elements I want from inside 'product-block__info'?
When selecting attributes with find_all, you should either use the attrs dictionary or keyword arguments; otherwise bs4 is looking for tags.
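A minimal sketch of the difference (the HTML snippet here is made up for illustration):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="brand">Acme</div>', 'html.parser')
soup.find_all('class', 'brand')                 # [] -- searches for a <class> tag
soup.find_all('div', class_='brand')            # matches: keyword argument (note the underscore)
soup.find_all('div', attrs={'class': 'brand'})  # matches: attrs dictionary

Applied to your loop: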
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        print(stock, brand, price)

How to get certain text from URL links

So I'm trying to get all the statistics from the statistics box on the page for each team (the hyperlink below shows an example of what the page looks like). I'm trying to have it print out:
month : win %
month : win %
All time: win %
But I am not too sure how to write that code, since the last piece of code I wrote in main was giving me an error.
http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners
import time
import requests
from bs4 import BeautifulSoup


def get_all(url, base):  # when called it will yield all the team links
    r = requests.get(url)
    page = r.text
    soup = BeautifulSoup(page, 'html.parser')
    for team_links in soup.select('div.details h3 a'):
        members = int(team_links.find_next('th', text='Members:').find_next_sibling('td').text.strip().split()[0])
        if members < 5:
            continue
        yield base + team_links['href']
    next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')
    while next_page:
        # Gives the server a break
        time.sleep(0.2)
        r = requests.get(BASE_URL + next_page.find_previous('a')['href'])
        page = r.text
        soup = BeautifulSoup(page)
        for team_links in soup.select('div.details h3 a'):
            yield BASE_URL + team_links['href']
        next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')


if __name__ == '__main__':
    BASE_URL = 'http://www.gosugamers.net'
    URL = 'http://www.gosugamers.net/counterstrike/teams'
    for links in get_all(URL, BASE_URL):  # when run it will generate all the links for all the teams
        r = requests.get(links)
        page = r.content
        soup = BeautifulSoup(page)
        for statistics in soup.select('div.statistics tr'):
            win_rate = int(statistics.find('th', text='Winrate:').find_next_sibling('td'))
            print(win_rate)
Not sure exactly what you want, but this will get all the team stats:
from bs4 import BeautifulSoup, Tag
import requests
soup = BeautifulSoup(requests.get("http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners").content)
table = soup.select_one("table.stats-table")
head1 = [th.text.strip() for th in table.select("tr.header th") if th.text]
head2 = [th.text.strip() for th in table.select_one("tr + tr") if isinstance(th, Tag)]
scores = [th.text.strip() for th in table.select_one("tr + tr + tr") if isinstance(th, Tag)]
print(head1, head2, scores)
Output:
([u'Jun', u'May', u'All time'], [u'Winrate:', u'0%', u'0%', u'0%'], [u'Matches played:', u'0 / 0 / 0', u'0 / 0 / 0', u'0 / 0 / 0'])
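To get the month : win % lines from the question, one way (reusing the lists above; head2 keeps the 'Winrate:' label in front, as the output shows) would be:

head1 = ['Jun', 'May', 'All time']      # periods, as parsed above
head2 = ['Winrate:', '0%', '0%', '0%']  # label followed by the rates
for month, rate in zip(head1, head2[1:]):
    print('{}: {}'.format(month, rate))
# Jun: 0%
# May: 0%
# All time: 0%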
