I am trying to learn scraping,
I use exceptions lower down in the code to pass through errors because they dont affect the writing of data to csv
I keep getting a "socket.gaierror" but in the handling of that there is a "urllib.error.URLError" in the handling of that I get "NameError: name 'socket' is not defined" which seems circuitous
I kind of understand that using these exceptions may not be the best way to run the code but I cant seem to get past these errors and I dont know a way around or how to fix the errors.
If you have any suggestions outside of fixing the error exceptions that would be greatly appreciated as well.
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
base_url = 'http://www.fangraphs.com/' # used in line 27 for concatenation
years = ['2017','2016','2015'] # for enough data to run tests
#Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
player_urls.append(base_url + link['href'])
#Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
if test_for_playerlinks in i:
player_alpha_links.append(i)
# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
data = urlopen(l)
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
ind_player_urls.append(link['href'])
#Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
if jan in j:
players.append(j)
# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
if pitcher in i:
pitchers.append(i)
else:
pos_players.append(i)
# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017','2016','2015']
for i in pos_players:
for year in years:
split_pp.append(splits + i[12:]+'&season='+ year)
gamel.append(game_logs+ i[12:] + '&type=&gds=&gde=&season=' + year)
split_pitcher = []
gl_pitcher = []
for i in pitchers:
for year in years:
split_pitcher.append(splits + i[12:]+'&season=' + year)
gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)
# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
for i in split_pitcher:
sauce = urlopen(i)
soup = BeautifulSoup(sauce, "html.parser")
table1 = soup.find_all('strong', {"style":"font-size:15pt;"})
row_sp = []
for name in table1:
nam = name.get_text()
row_sp.append(nam)
table = soup.find_all('table', {"class":"rgMasterTable"})
for h in table:
he = h.find_all('tr')
for i in he:
td = i.find_all('td')
for j in td:
row_sp.append(j.get_text())
rows_sp.append(row_sp)
except(RuntimeError, TypeError, NameError, URLError, socket.gaierror):
pass
try:
with open('SplitsPitchingData2.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerows(rows_sp)
except(RuntimeError, TypeError, NameError):
pass
I'm guessing your main problem was that you - without any sleep what so ever - queried the site for a huge amount of invalid urls (you create 3 urls for the years 2015-2017 for 22880 pitchers in total, but most of these do not fall within that scope so you have tens of thousands of queries that return errors).
I'm surprised your IP wasn't banned by site admin. That said: It would be better to do some filtering so you avoid all those error queries...
The filter I applied is not perfect. It checks if the years in the list either appears in the start or end the years given on the site (e.g. '2004 - 2015'). This also creates error links but no way near the amount the original script did.
In code it could look like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv
base_url = 'http://www.fangraphs.com/'
years = ['2017','2016','2015']
# Getting Links for letters
letter_links = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
try:
link = base_url + link['href']
if 'players.aspx?letter=' in link:
letter_links.append(link)
except:
pass
print("[*] Retrieved {} links. Now fetching content for each...".format(len(letter_links)))
# the data resides in two different base_urls:
splits_url = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs_url = 'http://www.fangraphs.com/statsd.aspx?'
# we need (for some reason) players in two lists - pitchers_split and pitchers_game_log - and the rest of the players in two different, pos_players_split and pis_players_game_log
pos_players_split = []
pos_players_game_log = []
pitchers_split = []
pitchers_game_log = []
# and if we wanted to do something with the data from the letter_queries, lets put that in a list for safe keeping:
ind_player_urls = []
current_letter_count = 0
for link in letter_links:
current_letter_count +=1
data = urlopen(link)
soup = BeautifulSoup(data, "html.parser")
trs = soup.find('div', class_='search').find_all('tr')
for player in trs:
player_data = [tr.text for tr in player.find_all('td')]
# To prevent tons of queries to fangraph with invalid years - check if elements from years list exist with the player stat:
if any(year in player_data[1] for year in years if player_data[1].startswith(year) or player_data[1].endswith(year)):
href = player.a['href']
player_data.append(base_url + href)
# player_data now looks like this:
# ['David Aardsma', '2004 - 2015', 'P', 'http://www.fangraphs.com/statss.aspx?playerid=1902&position=P']
ind_player_urls.append(player_data)
# build the links for game_log and split
for year in years:
split = '{}{}&season={}'.format(splits_url,href[12:],year)
game_log = '{}{}&type=&gds=&gde=&season={}'.format(game_logs_url, href[12:], year)
# checking if the player is pitcher or not. We're append both link and name (player_data[0]), so we don't need to extract name later on
if 'P' in player_data[2]:
pitchers_split.append([player_data[0],split])
pitchers_game_log.append([player_data[0],game_log])
else:
pos_players_split.append([player_data[0],split])
pos_players_game_log.append([player_data[0],game_log])
print("[*] Done extracting data for players for letter {} out of {}".format(current_letter_count, len(letter_links)))
sleep(2)
# CONSIDER INSERTING CSV-PART HERE....
# Extracting and writing pitcher data to file
with open('SplitsPitchingData2.csv', 'a') as fp:
writer = csv.writer(fp)
for i in pitchers_split:
try:
row_sp = []
rows_sp = []
# all elements in the pitchers_split are lists. Player name is i[1]
data = urlopen(i[1])
soup = BeautifulSoup(data, "html.parser")
# append name to row_sp from pitchers_split
row_sp.append(i[0])
# the page has 3 tables with the class rgMasterTable, the first i Standard, the second Advanced, the 3rd Batted Ball
# we're only grabbing standard
table_standard = soup.find_all('table', {"class":"rgMasterTable"})[0]
trs = table_standard.find_all('tr')
for tr in trs:
td = tr.find_all('td')
for content in td:
row_sp.append(content.get_text())
rows_sp.append(row_sp)
writer.writerows(rows_sp)
sleep(2)
except Exception as e:
print(e)
pass
Since I'm not sure precisely how you wanted the data formatted on output you need some work on that.
If you want to avoid waiting for all letter_links to be extracted before you retrieve the actual pitcher stats (and fine tune your output) you can move the csv writer part up, so it runs as a part of the letter loop. If you do this don't forget to empty the pitchers_split list before grabbing another letter_link...
Related
Just doing one of my first web scraping and I already have elements I wanted to extract but I cannot find the function to print them as a numbered list. The code I have for now:
r = requests.get('https://mmazurek.dev/category/programowanie-2/page/3/', proxies={'http':'82.119.170.106'})
page = soup(r.content, "html.parser")
contents=page.findAll(None, class_="post-title-link")
for content in contents:
text_content=list(content.get_text())
first_letter=str(text_content[0])
x="".join(first_letter)
listToStr = "".join(map(str, text_content))
print(listToStr)
The purpose is to have list printed like:
P....
J...
...
Hope you dont mind it's a Polish text;)
def get_html(url, useragent=None, proxy=None):
session = requests.Session()
request = session.get(url=url, headers=useragent, proxies=proxy)
if request.status_code == 200:
soup = bs(request.text, 'lxml')
return soup
else:
print("Error " + str(request.status_code))
return request.status_code
def parse(soup):
data = []
contents = soup.findAll(None, class_="post-title-link")
for i, content in enumerate(contents):
text = content.text
href = content['href']
data.append([
i,
text,
href,
])
return data
return data
data = parse(get_html('https://mmazurek.dev/category/programowanie-2/page/3/', proxy={'http': '82.119.170.106'}))
print(data)
i am trying to scrape a table that spans multiple pages and export to a csv file. only one line of data seems to get exported and it is jumbled up.
I have looked on the web and tried many iterations and very frustrated now. As you can tell from code I am a novice at coding!
import bs4 as bs
import urllib.request
import pandas as pd
import csv
max_page_num = 14
max_page_dig = 1 # number of digits in the page number
with open('result.csv',"w") as f:
f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")
for i in range(0, max_page_num):
page_num = (max_page_dig - len(str(i))) * "0" +str(i) #gives a string in the format of 1, 01 or 001, 005 etc
print(page_num)
source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
print(source)
url = urllib.request.urlopen(source).read()
soup = bs.BeautifulSoup(url,'lxml')
table = soup.find('table')
table_rows = table.find_all('tr')
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
#final = row.strip("\n")
#final = row.replace("\n","")
with open('result.csv', 'a') as f:
f.write(row)
It seems when I write to csv it overwrites previous ones. It also pastes it on one line and the players name is concatenated with the school name . Thanks for any and all help.
I think you have a problem with your inside for loop. Try re-writing it as
with open('result.csv', 'a') as f:
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
f.write(row)
and see if it works.
More generally, this can probably be done more simply by using pandas. Try changing your for loop to:
for i in range(0, max_page_num):
page_num = ...
source = ....
df = pd.read_html(source)
df.to_csv('results.csv', header=False, index=False, mode='a') #'a' should append each table to the csv file, instead of overwriting it.
I am currently scraping through an XML API response. I am looking to gather a piece of information for each request and create a dictionary each time I find this piece of data. Each request can have several IDs. So one response can have 2 IDs while the next response might have 3 IDs. For example, let's say the first response has 2 IDs. I am storing this data in a list at the moment when the second request is done the additional 3 IDs are being stored under this same list as well.
import requests
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
import datetime as datetime
import json
import time
trackingDomain = ''
domain = ''
aIDs = []
cIDs = []
url = "https://" + domain + ""
print(url)
df = pd.read_csv('campids.csv')
for index, row in df.iterrows():
payload = {'api_key':'',
'campaign_id':'0',
'site_offer_id':row['IDs'],
'source_affiliate_id':'0',
'channel_id':'0',
'account_status_id':'0',
'media_type_id':'0',
'start_at_row':'0',
'row_limit':'0',
'sort_field':'campaign_id',
'sort_descending':'TRUE'
}
print('Campaign Payload', payload)
r = requests.get(url, params=payload)
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
success = soup.find('success').string
for affIDs in soup.select('campaign'):
affID = affIDs.find('source_affiliate_id').string
aIDs.append(affID)
dataDict = dict()
dataDict['offers'] = []
affDict = {'affliate_id':aIDs}
dataDict['offers'].append(dict(affDict))
The result ends up being as follows:
dictData = {'offers': [{'affliate_id': ['9','2','45','47','14','8','30','30','2','2','9','2']}]}
What I am looking to do is this:
dictData = {'offers':[{'affiliate_id'['9','2','45','47','14','8','30','30','2','2']},{'affiliate_id':['9','2']}]}
On the first request, I obtain the following:
IDs['9','2','45','47','14','8','30','30','2','2']
On the second request these IDs are returned:
['9','2']
I am new to Python so please bear with me as far etiquette goes and I am missing something. I'll be happy to provide any additional information.
It has to do with the order of your initializing and appending that is causing you to not get the outcome you are wanting. You are overwriting your dataDict after each iteration, and inserting the appended list which is not overwritten, thus leaving you with a final list that has appended ALL aIDs. What you want to to do is initialise that dataDict out side of your for loop, and then you can append the dictionary in the nested loop into that list:
Note: It's tough to work out/test without having the actual data, but I believe this should do it if I worked out the logic correctly in my head:
import requests
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
import datetime as datetime
import json
import time
trackingDomain = ''
domain = ''
cIDs = []
url = "https://" + domain + ""
# Initialize your dictionary
dataDict = dict()
# Initialize your list in your dictionary under key `offers`
dataDict['offers'] = []
print(url)
df = pd.read_csv('campids.csv')
for index, row in df.iterrows():
payload = {'api_key':'',
'campaign_id':'0',
'site_offer_id':row['IDs'],
'source_affiliate_id':'0',
'channel_id':'0',
'account_status_id':'0',
'media_type_id':'0',
'start_at_row':'0',
'row_limit':'0',
'sort_field':'campaign_id',
'sort_descending':'TRUE'
}
print('Campaign Payload', payload)
r = requests.get(url, params=payload)
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
success = soup.find('success').string
# Initialize your list for this iteration/row in your df.iterrows
aIDs = []
for affIDs in soup.select('campaign'):
affID = affIDs.find('source_affiliate_id').string
# Append those affIDs to the aIDs list
aIDs.append(affID)
# Create your dictionary of key:value with key 'affiliate_id' and value the aIDs list
affDict = {'affliate_id':aIDs}
# NOW append that into your list in your dictionary under key `offers`
dataDict['offers'].append(dict(affDict))
I want to scrape information from supermarket products but taking into account that some of the info (the origin of the product) isn't always available.
I am trying to iterate over a dataframe of links of a supermarket. From each of them, I want to get some information. However, the origin of the products isn't always available. I don't know how to make Python look for 'origin' only when it is available. I've tried the following code:
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
dir = ''
file = 'data.xlsx'
sheetname="Hoja1"
# create and write headers to a list
rows = []
rows.append(['Brand', 'Product', 'Product_Number', 'Gross_Weight', 'Origin'])
# Change working directory:
os.chdir(dir)
# Retrieve current working directory ('cwd'):
cwd = os.getcwd()
cwd
# Load spreadsheet:
xl = pd.ExcelFile(file)
# Load a sheet into a DataFrame by name: df1
df = xl.parse(sheetname)
for index, row in df.iterrows():
# specify the url
urlpage = row['link']
#print(urlpage)
# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')
# find results within table
results = soup.find_all('dl', attrs={'class': 'des_info clearfix'})
#print('Number of results', len(results))
for result in results:
# find all columns per result
data = result.find_all('dd')
# check that columns have data
if len(data) == 0:
continue
# write columns to variables
brand = data[0].getText()
product = data[1].getText()
number = data[2].getText()
weight = data[3].getText()
if data[4].getText() == None:
origin = 0
else:
origin = data[4].getText()
# write each result to rows
rows.append([brand, product, number, weight, origin])
I get the following error:
if data[4].getText() == None:
IndexError: list index out of range
I would like to get all the data ordered in a list and, if the origin isn't available for one item, a zero.
You can use a try statement:
# write columns to variables
brand = data[0].getText()
product = data[1].getText()
number = data[2].getText()
weight = data[3].getText()
try:
origin = data[4].getText()
except:
origin = 0
You could also use len of data
if len(data) >= 4:
#do something
else:
#do something else
I've gotten all the data I wanted from scraping this metacritc url (see below) however, I can't seem to put a value in for when I don't find the associated value for list (missing values)
I would like to have it so all the lists are even (so I can right to .csv)
Here is the code I have so far:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
#Define year
year_number = 2018
# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
genresNew = []
for element in i:
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
print(url)
year_number -= 1
# not sure about this but it works (I was getting blocked by something and this the way I found around it)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
#this grabs the all the text from the page
html_soup = BeautifulSoup(webpage, 'html5lib')
#this is for selecting all the games in from 1 to 100 (the list of them)
game_names = html_soup.find_all("div", class_="main_stats")
game_metas = html_soup.find_all("a", class_="basic_stat product_score")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
game_releases = html_soup.find_all("ul", class_='more_stats')
game_publishers = html_soup.find_all("li", class_='stat publisher')
game_ratings = html_soup.find_all("li", class_='stat maturity_rating')
game_genres = html_soup.find_all("li", class_='stat genre')
#Extract data from each game
for games in game_names:
name = games.find()
names.append(name.text.strip())
for games2 in game_metas:
metascore = games2.find()
metascores.append(metascore.text.strip())
for games3 in game_releases:
release_date = games3.find()
release_dates.append(release_date.text.strip())
for games4 in game_users:
userscore = games4.find('span', class_="data textscore textscore_favorable") or games4.find('span', class_="data textscore textscore_mixed")
if userscore:
userscores.append(userscore.text)
for games5 in game_publishers:
publisher = games5.find("span", class_ = "data")
if publisher:
publishers.append(publisher.text)
for games6 in game_ratings:
rating = games6.find("span", class_ = "data")
for games7 in game_genres:
genre = games7.find("span", class_ = "data")
if genre:
genres.append(genre.text)
for x in release_dates:
temp = str(x)
temp2 = temp.replace("Release Date:\n ", "")
release_datesNew.append(temp2)
for z in genres:
temp3 = str(z)
temp4 = temp3.strip()
temp5 = temp4.replace(" ", "")
genresNew.append(temp5)
df = pd.DataFrame({'Games:': names})
not sure how I would work that in to this code
From what I understand it's take all the data it can find but if there is a blank it doesn't know about it
can someone adivse the best solution for this situation
any help would be great
Thanks
Just add else's for the existing conditions...
if userscore:
userscores.append(userscore.text)
else:
userscores.append('na')