How to actually use loop in beautifulsoup? - python-3.x

I'm trying to make a web crawler that uses several loops. The first loop runs fine, but the second one fails, and I always get this message: "During handling of the above exception, another exception occurred"
import requests
from bs4 import BeautifulSoup
result =
requests.get("http://desaku.bandungkab.go.id/desaonline/")
#This url is the main web, inside this web there are 270 links of
#other website. I get into that 270 webs and open every article in
#each web
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
urls = []
for link in links:
if "www" in link.text:
url = link.attrs['href']
urls.append(url)
num1=len(urls)
b=0
while b<num1:
result2 = requests.get(urls[b])
src2 = result2.content
soup = BeautifulSoup(src2, 'lxml')
links2 = soup.find_all('a')
urls2 = []
for link in links2:
if "selengkapnya" in link.text:
url2 = link.attrs['href']
urls2.append(url2)
b+=1
#the code run well until this part. If i print this, it will result
#url that take me directly to specific article
num=len(urls2)
i=0
while i<num:
result2 = requests.get(urls2[i])
src2 = result2.content
soup = BeautifulSoup(src2, 'lxml')
links2 = soup.find_all('a')
artikel=[]
isi = link.attrs['href']
artikel.append(isi)
print(artikel)
i+=1
I expect to get the links of all the articles from the website and collect them in a list called artikel.

The problem is that you assign urls2 = [] and artikel = [] inside the loops, so every iteration replaces the list with a new, empty one instead of adding to it, and whatever the earlier iterations collected is lost. Create the lists once, before the loops. You could use this code as a start:
import requests
from bs4 import BeautifulSoup

# The main page links to about 270 other websites; the script visits each
# of them and then opens every article it finds there.
result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

# Create the accumulators once, outside the loops, so that every iteration
# appends to the same list instead of replacing it.
urls = []
urls2 = []

# Step 1: collect the sites linked from the main page (anchor text
# contains "www").
# NOTE(review): recent soupsieve releases deprecate ':contains' in favour
# of ':-soup-contains' - switch if the installed version warns about it.
for link in soup.select('a[href]:contains(www)'):
    urls.append(link['href'])

# Step 2: on every site, collect the "selengkapnya" links, which lead to
# the individual articles.
print('Urls:')
for url in urls:
    print('Downloading {}'.format(url))
    result2 = requests.get(url)
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    for link in soup.select('a[href]:contains(selengkapnya)'):
        print('\tFound link {}'.format(link['href']))
        urls2.append(link['href'])

# Step 3: open every article page and collect all the links on it.
print('Articles:')
articles = []
for url2 in urls2:
    print('Downloading {}'.format(url2))
    result2 = requests.get(url2)
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    # find_all() matches tag *names*, so find_all('a[href]') would look for
    # a tag literally called "a[href]" and never match anything. CSS
    # selectors have to go through select().
    for link in soup.select('a[href]'):
        articles.append(link['href'])

print(articles)
Prints:
Urls:
Downloading http://www.ancolmekar.desa.id
Found link http://www.ancolmekar.desa.id/first/artikel/423
Found link http://www.ancolmekar.desa.id/first/artikel/421
Found link http://www.ancolmekar.desa.id/first/artikel/420
Found link http://www.ancolmekar.desa.id/first/artikel/419
Found link http://www.ancolmekar.desa.id/first/artikel/414
Found link http://www.ancolmekar.desa.id/first/artikel/413
Found link http://www.ancolmekar.desa.id/first/artikel/412
Found link http://www.ancolmekar.desa.id/first/artikel/410
Found link http://www.ancolmekar.desa.id/first/artikel/410
Found link http://www.ancolmekar.desa.id/first/artikel/100
Downloading http://www.arjasari.desa.id
Found link http://www.arjasari.desa.id/first/artikel/180
Found link http://www.arjasari.desa.id/first/artikel/190
...and so on.

Related

How To Refactor Web Scraping Code In Python

I am scraping data from the URL below and it works correctly, but I am looking for a more reliable and cleaner way to do it.
import pandas as pd
from bs4 import BeautifulSoup
import requests
pages = list(range(1, 548))
list_of_url = []
for page in pages:
URL = "https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5" + "&page=" + str(page)
#print (URL)
list_of_url.append(URL)
print(list_of_url)
list_activities = []
#page_number = 1
for url in list_of_url:
URL = url
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find('div', class_='view-content')
#print(results.prettify())
try:
activities = results.find_all("tr", class_=["views-row-first odd","even","odd","even","views-row-last odd"])
except:
print("in the activities line thisis a pad url", URL)
continue
try:
for activity in activities:
activity_section = activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip()
activity_name = activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip()
activity_code = activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
list_activities.append([activity_section,activity_name,activity_code])
except:
print("url not founf")
continue
page_number += 1
df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
I am web scraping data from the below url and was able to do it correctly but i am looking for more reliable and beautiful way to do it
Here is a shorter version for your code:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# CSS classes of the three table cells extracted from every row, in the
# same order as the DataFrame columns below.
FIELD_CLASSES = (
    'views-field views-field-field-chapter-desc-en-et',
    'views-field views-field-field-activity-description-en-et',
    'views-field views-field-field-activity-code active',
)

# range(1, 3) only fetches pages 1 and 2 as a demonstration; widen the
# range to crawl more of the listing.
URLS = [f'https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5&page={page}' for page in range(1,3)]

list_activities = []
for URL in URLS:
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "html.parser")
    results = soup.find('div', class_='view-content')
    if results is None:
        # The page has no results container (bad URL, error page, ...).
        # Skip it instead of crashing on None.find_all().
        print(f"No 'view-content' block found, skipping {URL}")
        continue

    activities = results.find_all("tr", class_=["views-row-first odd", "even", "odd", "views-row-last odd"])
    for activity in activities:
        cells = [activity.find('td', class_=field_class) for field_class in FIELD_CLASSES]
        if any(cell is None for cell in cells):
            # A row that lacks one of the expected cells is ignored so a
            # single odd row cannot abort the whole crawl.
            continue
        list_activities.append([cell.text.strip() for cell in cells])

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
However, as an engineer at WebScrapingAPI, I would recommend implementing a stealthier scraper if you want to scrape this website in the long run. In my testing it does not currently use any known bot-detection provider, but since it is a government website it might use a private detection system.

I made a list of URLs for the different pages I want to scrape. Can anyone tell me whether there is a way to automate this process?

from bs4 import BeautifulSoup
import requests
urls = ['https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=2&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=3&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=4&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=5&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=6&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=7&status=all&timeperiod=0',
'https://www.trustpilot.com/categories/restaurants_bars?
numberofreviews=0&page=8&status=all&timeperiod=0']
for url in URLs:
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'lxml')
restaurants = soup.find_all('div', class_ = 'categoryBusinessListWrapper___14CgD')
for index, restaurant in enumerate(restaurants):
tags = restaurant.find_all('a', class_ = 'internal___1jK0Z wrapper___26yB4')
for tag in tags:
restaurant_name = tag.find('div', class_ = 'businessTitle___152-c').text.split(',')[0]
ratings = tag.find('div', class_ = 'textRating___3F1NO')
location = tag.find('span', class_ = 'locationZipcodeAndCity___33EfU')
more_info = tag['href']
As you can see, I created a list of URLs by hand to store the addresses of the different pages of this website. Is there a way to automate this? I use BeautifulSoup and the requests module for scraping, and I would like to know whether the URLs of the different pages can be generated automatically.
You can look at the pagination at the bottom of the page and use list comprehension to create those links:
import requests
from bs4 import BeautifulSoup
import re

# Address of the first listing page; its pagination bar tells us how many
# pages exist.
url = 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&status=all&timeperiod=0'

# Template for the address of an individual listing page.
PAGE_URL_TEMPLATE = 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page={page}&status=all&timeperiod=0'

# Matches any anchor whose class contains "pagination".
pagination_class = re.compile('pagination')

listing_html = requests.get(url).text
soup = BeautifulSoup(listing_html, 'html.parser')

# NOTE(review): assumes the pagination bar renders one matching anchor per
# page - verify against the live markup.
pagination_anchors = soup.find_all("a", {"class": pagination_class})
pages = len(pagination_anchors)

# Page numbers start at 1, hence the 1..pages range.
links = [PAGE_URL_TEMPLATE.format(page=number) for number in range(1, pages + 1)]
Output:
print (links)
['https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=1&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=2&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=3&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=4&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=5&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=6&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=7&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=8&status=all&timeperiod=0']

Scraping all href links using Pagination

I have to select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall, then click on team rankings, and after that grab the href links of each ranked team.
I've completed everything up to the team rankings part. Now I want to get the links of each ranked team from all the pages in the pagination bar. Right now I'm only getting the links of the teams on the first page; I don't know how to navigate to the next page (the code is below).
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
all_states = each_state.find_all('a', href=True)
for a in all_states:
domain = site + a['href'] #domain consist oflinks of states
for r in domain:
page_link = domain
page_response = requests.get(page_link)
soup = BeautifulSoup(page_response.content, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
rankings_link = site + link.get('href')
#print(rankings_link)
for ert in rankings_link:
team_link = rankings_link
page_response1 = requests.get(team_link)
soup = BeautifulSoup(page_response1.content, "html.parser")
My_table = soup.find('table',{'class':'mx-grid sortable rankings-grid'})
links = My_table.findAll('a')
print(links)
output:
Everett, Methuen,
You could just iterate through pages within the query parameters.
import requests
from bs4 import BeautifulSoup

site = "https://www.maxpreps.com"

# One Session is used for every request so the underlying connection (and
# any cookies the site sets) is reused instead of being rebuilt per page.
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")
all_states = soup.find('div', {'class': 'states'})

# The state code is whatever follows the last '=' in each link.
# href=True skips anchors that carry no href attribute at all.
states_list = [each['href'].split('=')[-1] for each in all_states.find_all('a', href=True)]
# NOTE(review): the last entry is dropped - presumably it is not a real
# state link; confirm against the page.
states_list = states_list[:-1]

team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
    page = 1
    # Keep requesting successive pages until one comes back without a
    # rankings table, which is taken to mean the previous page was the last.
    while True:
        print('%s: Page %s' % (state, page))
        payload = {
            'page': str(page),
            # NOTE(review): hard-coded id copied from the site's own
            # requests; presumably selects the sport/season - confirm.
            'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
            'state': state
        }
        response = session.get(url, params=payload)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:
            break
        for link in table.find_all('a', href=True):
            team_links.append(site + link['href'])
        page += 1
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']

How can I extract the links from HTML?

I'm trying to get a link of every article in this category on the SF chronicle but I'm not sure as to where I should begin on extracting the URLs. Here is my progress so far:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.sfchronicle.com/local/'
# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
zone2_container = page_soup.findAll("div",{"class":"zone zone-2"})
zone3_container = page_soup.findAll("div",{"class":"zone zone-3"})
zone4_container = page_soup.findAll("div",{"class":"zone zone-4"})
right_rail_container = page_soup.findAll("div",{"class":"right-rail"})
All of the links I want are located in zone2-4_container and right_rail_container.
You can use the following code to get all links:
# Gather the anchors of the first matching <div> of every zone into one list.
all_zones = [zone2_container, zone3_container, zone4_container, right_rail_container]
urls = []
for zone in all_zones:
    if not zone:
        # findAll() found no matching <div> for this zone; zone[0] would
        # raise IndexError.
        continue
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on link['href'].
    for link in zone[0].findAll('a', href=True):
        urls.append(link['href'])
I have merged all the lists in one list but you can also define a function to achieve the same.
def get_urls(zone):
    """Return the href of every anchor inside the elements of *zone*.

    Args:
        zone: An iterable of parsed elements (for example the result of
            ``page_soup.findAll("div", ...)``), each offering
            ``find_all('a')``.

    Returns:
        A list with the ``href`` value of each anchor, in document order.
        Anchors without an ``href`` attribute are skipped instead of
        raising ``KeyError``.
    """
    urls = []
    for container in zone:
        for link in container.find_all('a'):
            # .get() returns None for anchors without an href attribute
            # (e.g. <a name="...">), where link['href'] would raise.
            href = link.get('href')
            if href is not None:
                urls.append(href)
    return urls
get_urls(zone2_container)
It now sounds like you basically want all the article links, in which case you can use an attribute = value css selector with contains operator to target href attributes whose value contains the substring 'article'.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

base = 'https://www.sfchronicle.com/'
url = 'https://www.sfchronicle.com/local/'

res = requests.get(url)
soup = bs(res.content, 'lxml')

# [href*=article] selects every element whose href contains the substring
# "article"; urljoin turns relative hrefs into absolute URLs under `base`.
links = []
for element in soup.select('[href*=article]'):
    absolute_url = urljoin(base, element['href'])
    links.append(absolute_url)

print(links)
print(len(links))

Why am I getting duplicate links ? And how do I fetch links on the next pages?

I am getting duplicate links among the links I am trying to obtain, and I am not sure why. I am also trying to fetch the same kind of links from all the pages, but I am not sure how to write the code to go to the next page. Could someone please help me understand how I would go about this?
import requests
from bs4 import BeautifulSoup
url = 'http://www.gosugamers.net/counterstrike/teams'
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page)
#all_teams = []
for team_links in soup.find_all('a', href=True):
if team_links['href'] == '' or team_links['href'].startswith('/counterstrike/teams'):
print (team_links.get('href').replace('/counterstrike/teams', url))
The team links are in anchor tags inside the h3 tags which are inside the div with the details class:
import requests
from bs4 import BeautifulSoup
# Python 3 location of urljoin (it lived in the `urlparse` module on Python 2).
from urllib.parse import urljoin

base = "http://www.gosugamers.net"
url = 'http://www.gosugamers.net/counterstrike/teams'

r = requests.get(url)
page = r.text
# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed and emits a warning.
soup = BeautifulSoup(page, "html.parser")

# The team links are the anchors inside the <h3> of each div.details.
for team_links in soup.select("div.details h3 a"):
    print(urljoin(base, team_links["href"]))
Which gives you:
http://www.gosugamers.net/counterstrike/teams/16338-motv
http://www.gosugamers.net/counterstrike/teams/16337-absolute-monster
http://www.gosugamers.net/counterstrike/teams/16258-immortals-cs
http://www.gosugamers.net/counterstrike/teams/16251-ireal-star-gaming
http://www.gosugamers.net/counterstrike/teams/16176-team-genesis
http://www.gosugamers.net/counterstrike/teams/16175-potadies
http://www.gosugamers.net/counterstrike/teams/16174-crowns-gg
http://www.gosugamers.net/counterstrike/teams/16173-visomvet
http://www.gosugamers.net/counterstrike/teams/16172-team-phenomenon
http://www.gosugamers.net/counterstrike/teams/16152-kriklekrakle
http://www.gosugamers.net/counterstrike/teams/16148-begenius
http://www.gosugamers.net/counterstrike/teams/16144-blubblub
http://www.gosugamers.net/counterstrike/teams/16142-team-1231
http://www.gosugamers.net/counterstrike/teams/16141-vsv
http://www.gosugamers.net/counterstrike/teams/16140-tbi
http://www.gosugamers.net/counterstrike/teams/16136-deadweight
http://www.gosugamers.net/counterstrike/teams/16135-me-myself-and-i
http://www.gosugamers.net/counterstrike/teams/16085-pur-esports
http://www.gosugamers.net/counterstrike/teams/15850-falken
http://www.gosugamers.net/counterstrike/teams/15815-team-abyssal
You are literally parsing all the links on the page, that is why you see the dupes.
To get all the teams we can parse the next page link until the span with the "Next" text is not there any more which only happens for the last page:
def get_all(url, base):
    """Yield the absolute URL of every team link, following "Next" pages.

    Starting at *url*, each page is downloaded, the anchors matching
    ``div.details h3 a`` are yielded, and the link in front of the "Next"
    span is followed. Iteration stops at the first page without a pager
    or without a "Next" span.

    Args:
        url: Address of the first listing page.
        base: Site root used to turn relative hrefs into absolute URLs.

    Yields:
        Absolute team-page URLs, in page order.
    """
    # Local import so the generator works on Python 3 regardless of how the
    # surrounding script imported urljoin.
    from urllib.parse import urljoin

    while True:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        for team_links in soup.select("div.details h3 a"):
            yield urljoin(base, team_links["href"])

        # A page without the pager div would otherwise raise
        # AttributeError on None.find().
        pager = soup.find("div", {"class": "pages"})
        nxt = pager.find("span", string="Next") if pager is not None else None
        if nxt is None:
            return

        # The "Next" span sits inside/after the anchor pointing at the
        # following page, so the closest preceding <a> carries its href.
        next_anchor = nxt.find_previous("a")
        if next_anchor is None or not next_anchor.get("href"):
            return
        url = urljoin(base, next_anchor["href"])
If we run it for a couple of seconds, you can see we get the next pages:
In [26]: for link in (get_all(url, base)):
....: print(link)
....:
http://www.gosugamers.net/counterstrike/teams/16386-cantonese-cs
http://www.gosugamers.net/counterstrike/teams/16338-motv
http://www.gosugamers.net/counterstrike/teams/16337-absolute-monster
http://www.gosugamers.net/counterstrike/teams/16258-immortals-cs
http://www.gosugamers.net/counterstrike/teams/16251-ireal-star-gaming
http://www.gosugamers.net/counterstrike/teams/16176-team-genesis
http://www.gosugamers.net/counterstrike/teams/16175-potadies
http://www.gosugamers.net/counterstrike/teams/16174-crowns-gg
http://www.gosugamers.net/counterstrike/teams/16173-visomvet
http://www.gosugamers.net/counterstrike/teams/16172-team-phenomenon
http://www.gosugamers.net/counterstrike/teams/16152-kriklekrakle
http://www.gosugamers.net/counterstrike/teams/16148-begenius
http://www.gosugamers.net/counterstrike/teams/16144-blubblub
http://www.gosugamers.net/counterstrike/teams/16142-team-1231
http://www.gosugamers.net/counterstrike/teams/16141-vsv
http://www.gosugamers.net/counterstrike/teams/16140-tbi
http://www.gosugamers.net/counterstrike/teams/16136-deadweight
http://www.gosugamers.net/counterstrike/teams/16135-me-myself-and-i
http://www.gosugamers.net/counterstrike/teams/16085-pur-esports
http://www.gosugamers.net/counterstrike/teams/15850-falken
http://www.gosugamers.net/counterstrike/teams/15815-team-abyssal
http://www.gosugamers.net/counterstrike/teams/15810-ex-deathtrap
http://www.gosugamers.net/counterstrike/teams/15808-mix123
http://www.gosugamers.net/counterstrike/teams/15651-undertake-esports
http://www.gosugamers.net/counterstrike/teams/15644-five
http://www.gosugamers.net/counterstrike/teams/15630-five
http://www.gosugamers.net/counterstrike/teams/15627-inetkoxtv
http://www.gosugamers.net/counterstrike/teams/15626-tetr-s
http://www.gosugamers.net/counterstrike/teams/15625-rozenoir-esports-white
http://www.gosugamers.net/counterstrike/teams/15619-fragment-gg
http://www.gosugamers.net/counterstrike/teams/15615-monarchs-gg
http://www.gosugamers.net/counterstrike/teams/15602-ottoman-fire
http://www.gosugamers.net/counterstrike/teams/15591-respect
http://www.gosugamers.net/counterstrike/teams/15569-moonbeam-gaming
http://www.gosugamers.net/counterstrike/teams/15563-team-tilt
http://www.gosugamers.net/counterstrike/teams/15534-dynasty-uk
http://www.gosugamers.net/counterstrike/teams/15507-urbantech
http://www.gosugamers.net/counterstrike/teams/15374-innova
http://www.gosugamers.net/counterstrike/teams/15373-g3x
http://www.gosugamers.net/counterstrike/teams/15372-cnb
http://www.gosugamers.net/counterstrike/teams/15370-intz
http://www.gosugamers.net/counterstrike/teams/15369-2kill
http://www.gosugamers.net/counterstrike/teams/15368-supernova
http://www.gosugamers.net/counterstrike/teams/15367-biggods
http://www.gosugamers.net/counterstrike/teams/15366-playzone
http://www.gosugamers.net/counterstrike/teams/15365-pride
http://www.gosugamers.net/counterstrike/teams/15359-rising-orkam
http://www.gosugamers.net/counterstrike/teams/15342-team-foxez
http://www.gosugamers.net/counterstrike/teams/15336-angels
http://www.gosugamers.net/counterstrike/teams/15331-atlando-esports
http://www.gosugamers.net/counterstrike/teams/15329-xfinity-esports
http://www.gosugamers.net/counterstrike/teams/15326-nano-reapers
http://www.gosugamers.net/counterstrike/teams/15322-erase-team
http://www.gosugamers.net/counterstrike/teams/15318-heyguys
http://www.gosugamers.net/counterstrike/teams/15317-illusory
http://www.gosugamers.net/counterstrike/teams/15285-dismay
http://www.gosugamers.net/counterstrike/teams/15284-kingdom-esports
http://www.gosugamers.net/counterstrike/teams/15283-team-rival
http://www.gosugamers.net/counterstrike/teams/15282-ze-pug-godz
http://www.gosugamers.net/counterstrike/teams/15281-unlimited-potential1
In the page source of every page except the last, you can see the span with the text Next:
When we get to the last page, there are only spans with Previous and First:

Resources