How to add an exception to skip AttributeError in my code? - python-3.x

I am trying to skip AttributeError in my code, because there is always an attribute error while crawling, especially when trying to get the values of title and content. I have tried putting "except AttributeError" in several places, but it never works. Could anybody help me? I am using Python 3.6.
from bs4 import BeautifulSoup
import requests
import pymysql.cursors

urls2 = []
result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
urls = []
for link in links:
    if "www" in link.text:
        url = link.attrs['href']
        urls.append(url)

num1 = len(urls)
b = 0
while b < 10:
    result2 = requests.get(urls[b])
    src2 = result2.content
    soup = BeautifulSoup(src2, 'lxml')
    links2 = soup.find_all('a')
    for link in links2:
        if "selengkapnya" in link.text:
            url2 = link.attrs['href']
            urls2.append(url2)
    b += 1

num = len(urls2)
i = 0
while i < num:
    html = requests.get(urls2[i])
    src = html.content
    soup = BeautifulSoup(src, 'lxml')
    recordList = soup.findAll("div", {"class": "artikel"})
    recordlist = soup.find_all('div', attrs={'class': 'sampul2'})
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='',
                                 db='bs4-test',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            for record in recordList:
                # except AttributeError:
                #     continue  # WHERE TO PUT THIS EXCEPTION, TO SKIP ATTRIBUTEERROR?
                title = record.find("h2", {"class": "judul"}).get_text().strip()
                date = record.find('i').next_sibling.next_sibling.next_sibling.replace('\t\t\t\t\t\t\t', '')
                content = record.find("div", {"class": "teks"}).get_text().strip()
                image = record.img['src']
                cover = record.img['src']
                sql = "INSERT INTO `artikel` (`jdl`, `tgl`, `kon`, `gambar`, `sampul`) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(sql, (title, date, content, image, cover))
                connection.commit()
            print("Record inserted successfully into table")
    finally:
        connection.close()
        print("MySQL connection is closed")
    i += 1

Just an example: if it is a URL, append the URL; if not, append null (NaN). Usually you want all the lists to be the same length, so that at the end you can put them into a DataFrame.
import numpy as np

links = soup.find_all('a')
urls = []
for link in links:
    try:
        url = link['href']
        urls.append(url)
    except KeyError:
        # the <a> tag has no href attribute; append NaN so list lengths stay aligned
        urls.append(np.nan)
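
Applied to the original code, the same idea means putting the try inside the for record in recordList: loop, wrapped around the .find() calls, and skipping the record on failure. A minimal sketch that slots into the question's loop (field names taken from the question's code; note that a missing img raises TypeError rather than AttributeError, since record.img is None):

for record in recordList:
    try:
        title = record.find("h2", {"class": "judul"}).get_text().strip()
        date = record.find('i').next_sibling.next_sibling.next_sibling.replace('\t\t\t\t\t\t\t', '')
        content = record.find("div", {"class": "teks"}).get_text().strip()
        image = record.img['src']
        cover = record.img['src']
    except (AttributeError, TypeError):
        # .find() returned None (element missing), so .get_text()/.next_sibling/['src'] failed
        continue  # skip this record and carry on with the next one
    sql = "INSERT INTO `artikel` (`jdl`, `tgl`, `kon`, `gambar`, `sampul`) VALUES (%s, %s, %s, %s, %s)"
    cursor.execute(sql, (title, date, content, image, cover))
    connection.commit()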

Related

How to scroll down, but at a specific location, until there is no more href to scrape?

I'm facing a problem:
I want to scrape all the watch links here: https://www.omegawatches.com/fr-fr/watchfinder
We need to scroll down to make all the links visible (1,400 in total), but not all the way to the bottom, because then it no longer scrolls.
I tried this:
initial_value = 0
next_value = 300
while next_value < 300000:
    driver.execute_script("window.scrollTo(initial_value, next_value);")
    time.sleep(0.5)
    initial_value = next_value
    next_value = next_value + 300

url2 = driver.current_url
results = requests.get(url2)
soup = BeautifulSoup(results.text, "html.parser")
all_title = soup.find_all('a', class_='ow-prod__img')
data_titles = []
for title in all_title:
    try:
        product_link = title['href']
        data_titles.append(product_link)
    except KeyError:
        pass

data = pd.DataFrame({
    'links': data_titles
})
data.to_csv("testlink4.csv", sep=';', index=False)
But it doesn't seem to work.
How could I make a loop that scrapes all the hrefs for the watches, right to the very end?
I found a solution, in case it helps someone later:
PATH = "driver\chromedriver.exe"
#path to modify if needed
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument('enable-logging')
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get('https://www.omegawatches.com/fr-fr/watchfinder')
for i in range(730):
driver.execute_script("window.scrollBy(0, 250)")
time.sleep(1)
url2 = driver.current_url
results = requests.get(url2)
soup = BeautifulSoup(results.text, "html.parser")
all_title = soup.find_all('a', class_ = 'ow-prod__img')
data_titles = []
for title in all_title:
try:
product_link = title['href']
data_titles.append(product_link)
except:
pass
data = pd.DataFrame({
'links' : data_titles
})
data.to_csv("testlink4.csv", sep=';', index=False)
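
A fixed range(730) is brittle, and re-fetching driver.current_url with requests discards everything the JavaScript loaded. One more robust alternative (a sketch, an assumption on my part and not tested against this site) is to keep scrolling until the scroll position stops advancing, then parse driver.page_source directly:

import time
from bs4 import BeautifulSoup

last_pos = -1
while True:
    driver.execute_script("window.scrollBy(0, 250)")  # small steps so lazy-loading keeps triggering
    time.sleep(0.5)  # increase if the site loads slowly
    pos = driver.execute_script("return window.pageYOffset")
    if pos == last_pos:
        break  # the page can no longer scroll: we've reached the true bottom
    last_pos = pos

# parse what the browser actually rendered instead of re-downloading the URL
soup = BeautifulSoup(driver.page_source, "html.parser")
data_titles = [a['href'] for a in soup.find_all('a', class_='ow-prod__img') if a.has_attr('href')]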

Script isn't retrieving all the info

I tried making a Python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing) and some of them appear several times (Success Tetteh, for example).
The output is too big to post here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: For some fighters the records are wrong (Vasyl Lomachenko, for example, appears to have 28 wins, but he has 14).
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time

def main():
    fighter_names = []
    fighter_wins = []
    fighter_losses = []
    fighter_draws = []
    username = "username"
    password = "password"
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            page = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(page.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                page = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(page.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        page = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(page.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            wins_span = soup.find_all('span', class_='textWon')
            loses_span = soup.find_all('span', class_='textLost')
            draws_span = soup.find_all('span', class_='textDraw')
            for container in names_a:
                name = container.text
                print(name)
                fighter_names.append(name)
            for container in wins_span:
                wins = container.text
                fighter_wins.append(wins)
            for container in loses_span:
                losses = container.text
                fighter_losses.append(losses)
            for container in draws_span:
                draws = container.text
                fighter_draws.append(draws)
    fighters = {
        "name": fighter_names,
        "wins": fighter_wins,
        "loses": fighter_losses,
        "draws": fighter_draws
    }
    df = pd.DataFrame.from_dict(fighters, orient="index")
    df = df.transpose()
    df.to_csv("fighters.csv")

if __name__ == '__main__':
    main()
I would refrain from using the same variable name to represent two separate things: you have page being used in two separate senses, which is confusing.
As for the issues themselves, I'm assuming that at some point there's a mismatch in the lists, so the data doesn't line up with the correct fighter name, or else there's something off in the site's actual data/HTML. I'm not entirely sure, as I haven't debugged it. Instead: have you considered using pandas to parse the table and then just splitting the 'w-l-d' column? I think it's far easier to let pandas do the parsing than to risk missing something across the 900+ pages you need to go through.
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math

def main():
    final_df = pd.DataFrame()
    username = 'username'
    password = 'password'
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            response = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(response.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                response = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(response.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        response = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(response.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            df = pd.read_html(response.text)[-1]
            df = df[['name', 'w-l-d']]
            df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]  # <--- ADD THIS LINE
            df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
            df = df.drop('w-l-d', axis=1)
            print('Page: %d of %d' % (((page - 1) / 20) + 1, math.ceil(19152 / 20)))
            final_df = final_df.append(df, sort=False).reset_index(drop=True)
    final_df.to_csv("fighters.csv")

if __name__ == '__main__':
    main()
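
The regex filter and the split are the crux. On a toy frame (made-up rows, purely for illustration) they behave like this:

import pandas as pd

df = pd.DataFrame({
    'name': ['Fighter A', 'Fighter B', 'name'],
    'w-l-d': ['14 1 0', '28 2 1', 'w-l-d'],  # a repeated header row, as scraped tables often contain
})
# keep only rows whose w-l-d column looks like three numbers, dropping the stray header
df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]
df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
print(df.drop('w-l-d', axis=1))
#         name wins loses draws
# 0  Fighter A   14     1     0
# 1  Fighter B   28     2     1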

Scraping all href links using Pagination

I have to select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall, then click on team rankings, and after that grab the href links of each ranked team.
I've completed the part up to team rankings. Now I want to get the links of each ranked team from all the pages in the pagination bar, but right now I'm only getting the links of the teams on the first page, and I don't know how to navigate to the next page. (Code below.)
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
    all_states = each_state.find_all('a', href=True)
    for a in all_states:
        domain = site + a['href']  # domain consists of links of states
        for r in domain:
            page_link = domain
            page_response = requests.get(page_link)
            soup = BeautifulSoup(page_response.content, "html.parser")
            for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
                rankings_link = site + link.get('href')
                # print(rankings_link)
                for ert in rankings_link:
                    team_link = rankings_link
                    page_response1 = requests.get(team_link)
                    soup = BeautifulSoup(page_response1.content, "html.parser")
                    My_table = soup.find('table', {'class': 'mx-grid sortable rankings-grid'})
                    links = My_table.findAll('a')
                    print(links)
Output:
Everett, Methuen,
You could just iterate through pages within the query parameters.
import requests
from bs4 import BeautifulSoup

site = "https://www.maxpreps.com"
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")
all_states = soup.find('div', {'class': 'states'})
states_list = []
for each in all_states.find_all('a'):
    states_list.append(each['href'].split('=')[-1])
states_list = states_list[:-1]

team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
    break_loop = False
    page = 1
    while not break_loop:
        print('%s: Page %s' % (state, page))
        payload = {
            'page': str(page),
            'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
            'state': state
        }
        response = requests.get(url, params=payload)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:
            break_loop = True
        else:
            page += 1
            links = table.find_all('a')
            for link in links:
                team_links.append('https://www.maxpreps.com' + link['href'])
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']
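
If you want the collected links on disk, a short pandas save would do (assuming pandas is available; the filename is just an example, and drop_duplicates() is a precaution in case a team shows up on more than one page):

import pandas as pd

pd.DataFrame({'links': team_links}).drop_duplicates().to_csv('team_links.csv', index=False)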

Complex python3 csv scraper

I've got the code below working great when pulling data from a row, in my case row[0]. I'm wondering how to tweak it to pull data from multiple rows.
Also, I would love to be able to specify which divTag class (see the code below) to use for a specific column.
Something like, for row[1,2] use:
divTag = soup.find("div", {"class": "productsPicture"})
and for row[4,5] use:
divTag = soup.find("div", {"class": "product_content"})
If that makes sense to you guys.
from bs4 import BeautifulSoup
import requests
import csv

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile, delimiter=';')
    writer = csv.writer(results)
    for row in reader:
        # get the url
        url = row[0]
        print(url)
        # fetch content from server
        try:
            html = requests.get(url).content
        except requests.exceptions.ConnectionError as e:
            writer.writerow([url, '', 'bad url'])
            continue
        # soup fetched content
        soup = BeautifulSoup(html, 'html.parser')
        divTag = soup.find("div", {"class": "productsPicture"})
        if divTag:
            # Return all 'a' tags that contain an href
            for a in divTag.find_all("a", href=True):
                url_sub = a['href']
                # Test that link is valid
                try:
                    r = requests.get(url_sub)
                    writer.writerow([url, url_sub, 'ok'])
                except requests.exceptions.ConnectionError as e:
                    writer.writerow([url, url_sub, 'bad link'])
        else:
            writer.writerow([url, '', 'no results'])
urls.csv sample:
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E705Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E703Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E702Y-4589;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E706Y-9093;
To add a per-column find parameter, you could create a dictionary mapping the column index to the required find parameters, as follows:
from bs4 import BeautifulSoup
import requests
import csv

class_1 = {"class": "productsPicture"}
class_2 = {"class": "product_content"}
class_3 = {"class": "id-fix"}

# map a column number to the required find parameters
class_to_find = {
    0: class_3,  # Not defined in question
    1: class_1,
    2: class_1,
    3: class_3,  # Not defined in question
    4: class_2,
    5: class_2}

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile)
    writer = csv.writer(results)
    for row in reader:
        # get the url
        output_row = []
        for index, url in enumerate(row):
            url = url.strip()
            # Skip any empty URLs
            if len(url):
                # print('col: {}\nurl: {}\nclass: {}\n\n'.format(index, url, class_to_find[index]))
                # fetch content from server
                try:
                    html = requests.get(url).content
                except requests.exceptions.ConnectionError as e:
                    output_row.extend([url, '', 'bad url'])
                    continue
                except requests.exceptions.MissingSchema as e:
                    output_row.extend([url, '', 'missing http...'])
                    continue
                # soup fetched content
                soup = BeautifulSoup(html, 'html.parser')
                divTag = soup.find("div", class_to_find[index])
                if divTag:
                    # Return all 'a' tags that contain an href
                    for a in divTag.find_all("a", href=True):
                        url_sub = a['href']
                        # Test that link is valid
                        try:
                            r = requests.get(url_sub)
                            output_row.extend([url, url_sub, 'ok'])
                        except requests.exceptions.ConnectionError as e:
                            output_row.extend([url, url_sub, 'bad link'])
                else:
                    output_row.extend([url, '', 'no results'])
        writer.writerow(output_row)
The enumerate() function returns a counter whilst iterating over a list, so index will be 0 for the first URL and 1 for the next. This can then be used with the class_to_find dictionary to get the required parameters to search on.
Each URL results in 3 columns being created: the URL, the sub-URL if successful, and the result. These can be removed if not needed.
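
For instance, a toy run of the lookup (made-up URLs, with a trimmed-down class_to_find as defined above) resolves like this:

class_to_find = {0: {"class": "id-fix"}, 1: {"class": "productsPicture"}}

row = ['http://example.com/a', 'http://example.com/b']
for index, url in enumerate(row):
    print(index, url, class_to_find[index])
# 0 http://example.com/a {'class': 'id-fix'}
# 1 http://example.com/b {'class': 'productsPicture'}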

How to return href links which do not start with "/listings" using python

I am trying to scrape the links in https://www.panpages.my/search_results?q=
I have written a Python script to get all the links on each page.
I want to keep only the links that do not start with "/listings".
Please find my script below and help me:
import requests
from bs4 import BeautifulSoup
import re
from io import StringIO
import csv

data = open("D:/Mine/Python/Projects/Freelancer/seekProgramming/rootpages.csv").read()
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
f = open('paylinks.csv', 'w', newline='')
writer = csv.writer(f)

for row in csvReader:
    myurl = row[0]

    def simple_web_scrapper(url):
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
            for link in root.findAll('a'):
                href = link.get('href')
                print(href)

    simple_web_scrapper(myurl)
To filter, add a startswith() check on each href before printing (hrefs use forward slashes, so test against '/listings'):

for row in csvReader:
    myurl = row[0]

    def simple_web_scrapper(url):
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
            for link in root.findAll('a'):
                href = link.get('href')
                if href and not href.startswith('/listings'):  # keep only the non-listings links
                    print(href)

    simple_web_scrapper(myurl)
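
A quick sanity check of the filter on made-up paths:

hrefs = ['/listings/123', '/search_results?q=', 'https://www.panpages.my/about']
print([h for h in hrefs if h and not h.startswith('/listings')])
# ['/search_results?q=', 'https://www.panpages.my/about']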
