Web scraping script is returning duplicate values - python-3.x

My web scraping script is returning duplicate results for some reason. I've tried so many alternatives but just can't get it to work. Can anyone help, please?
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []

csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)
            else:
                print('no-business')

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([business, names])
csv_file.close()
It's currently returning duplicate values across the board.
What it needs to do is return one 'business' and one 'names' value per URL call. If there is no 'business' or 'name', it needs to return a value of 'no-business' or 'no-name'.
Can anyone please help me?
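To make the target output concrete, here is roughly the per-URL shape I'm after (just a sketch; the h1.gp and .staff-title h4 selectors are guesses at the page structure, not something I've verified):

import csv
import requests
from bs4 import BeautifulSoup as bs

with open('444.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Practice', 'Practice Manager'])
    for i in range(35899, 35909):
        url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
        page = bs(requests.get(url).text, 'lxml')
        business_tag = page.select_one('h1.gp')           # guessed selector for the practice name
        manager_tag = page.select_one('.staff-title h4')  # guessed selector for the practice manager
        business = business_tag.get_text(strip=True) if business_tag else 'no-business'
        name = manager_tag.get_text(strip=True) if manager_tag else 'no-name'
        writer.writerow([business, name])                 # exactly one row per URL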

I don't know if it's the best way of doing it, but I used a set instead of a list to remove duplicates, and just before saving the file I convert the set back to a list, like this:
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []

csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = set()
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.add(tag.text)
            else:
                print('no-business')

names = set()
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.add(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([list(business), list(names)])
csv_file.close()

You could use the following, keying off the [id] attribute, to generate the initial list of lists. You could also write each row to the csv rather than appending to a final list.
import requests
from bs4 import BeautifulSoup as bs

results = []

with requests.Session() as s:
    for i in range(35899, 35909):
        r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
        soup = bs(r.content, 'lxml')
        row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
        if not row:
            row = ['no practice manager']
        practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
        row.insert(0, practice)
        results.append(row)

print(results)
I'm not sure how you want multiple names listed out, but writing each row straight to csv looks like this:
import requests
from bs4 import BeautifulSoup as bs
import csv

with open('output.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    with requests.Session() as s:
        for i in range(35899, 35909):
            r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
            soup = bs(r.content, 'lxml')
            row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
            if not row:
                row = ['no practice manager']
            practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
            row.insert(0, practice)
            w.writerow(row)

Looks like the problem stems from the fact that, in some of these pages, there is no information at all, and you get a "Profile Hidden" error. I modified your code somewhat, to cover the first 5 pages. Aside from saving to file, it looks like this:
[same imports]

pages = []
for i in range(35899, 35904):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

soup = []
for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output looks like this:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- Di Palfrey
Caversham Group Practice --- Di Palfrey
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Notice that only the 2nd and 3rd entries are duplicated; that is (somehow, not sure why) caused by the "Hidden Profile" in the third page. So if you modify the main blocks of the code to:
business = []
for items in soup:
    if "ProfileHiddenError.aspx" in (str(items)):
        business.append('Profile Hidden')
    else:
        h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
        for i in h1Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h1':
                    business.append(tag.text)

names = []
for items in soup:
    if "ProfileHiddenError.aspx" in (str(items)):
        names.append('Profile Hidden')
    elif not "Practice Manager" in str(items):
        names.append('No Practice Manager Specified')
    else:
        h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
        for i in h4Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h4':
                    names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output this time is:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- No Practice Manager Specified
Profile Hidden --- Profile Hidden
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Hopefully this helps you troubleshoot the problem.
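One more thing to keep in mind about the zip(business, names) printing above: zip() stops at the shorter of the two lists, so if one list ends up with fewer entries than the other, the trailing rows silently disappear instead of showing up as mismatches. A tiny illustration with made-up values:

business = ['A Surgery', 'B Practice', 'C Centre']
names = ['Jane Doe', 'John Smith']  # one entry missing
print(list(zip(business, names)))
# [('A Surgery', 'Jane Doe'), ('B Practice', 'John Smith')]  <- 'C Centre' is dropped

That is why appending a placeholder such as 'Profile Hidden' or 'No Practice Manager Specified' for every page matters: it keeps both lists the same length and the pairs aligned.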

Related

Split function doesn't work for string and for list

I'm doing one of my first web scrapes and I already have the elements I want to extract, but I cannot find the function to print them as a numbered list. The code I have for now:
# imports implied by the usage below
import requests
from bs4 import BeautifulSoup as soup

r = requests.get('https://mmazurek.dev/category/programowanie-2/page/3/', proxies={'http': '82.119.170.106'})
page = soup(r.content, "html.parser")
contents = page.findAll(None, class_="post-title-link")

for content in contents:
    text_content = list(content.get_text())
    first_letter = str(text_content[0])
    x = "".join(first_letter)
    listToStr = "".join(map(str, text_content))
    print(listToStr)
The purpose is to have the list printed like:
P....
J...
...
Hope you don't mind that it's Polish text ;)
import requests
from bs4 import BeautifulSoup as bs

def get_html(url, useragent=None, proxy=None):
    session = requests.Session()
    request = session.get(url=url, headers=useragent, proxies=proxy)
    if request.status_code == 200:
        soup = bs(request.text, 'lxml')
        return soup
    else:
        print("Error " + str(request.status_code))
        return request.status_code

def parse(soup):
    data = []
    contents = soup.findAll(None, class_="post-title-link")
    for i, content in enumerate(contents):
        text = content.text
        href = content['href']
        data.append([
            i,
            text,
            href,
        ])
    return data

data = parse(get_html('https://mmazurek.dev/category/programowanie-2/page/3/', proxy={'http': '82.119.170.106'}))
print(data)
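If the end goal is the numbered listing from the question, the returned rows can be printed like this (a small usage sketch; it assumes parse() returned the [i, text, href] rows built above):

for i, text, href in data:
    print('{}. {}'.format(i + 1, text))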

Unable to scrape all data using beautiful soup

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010']
Year = []
CompanyName = []
Rank = []
Score = []
for I, Page in enumerate(My_list, start=1):
url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
print(url)
Res = requests.get(url)
soup = BeautifulSoup(Res.content , 'html.parser')
data = soup.find('div' ,{'id':'main-content'})
for Data in data:
Title = data.findAll('h3')
for title in Title:
CompanyName.append(title.text.strip())
Rank = data.findAll('div' ,{'class':'rank RankNumber'})
for rank in Rank:
Rank.append(rank)
Score = data.findAll('div' ,{'class':'rank RankNumber'})
for score in Score:
Score.append(score)
I am unable to get all the data for Title, Rank, and Score.
I don't know whether I have identified the right tags, and I am unable to extract the values from the Rank list.
To get you started: first find all the div.RankItem elements, then, within each, find the title, rank, and score.
from bs4 import BeautifulSoup
import requests

resp = requests.get('https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/2010')
soup = BeautifulSoup(resp.content, 'html.parser')

for i, item in enumerate(soup.find_all("div", {"class": "RankItem"})):
    title = item.find("h3", {"class": "MainLink"}).get_text().strip()
    rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
    score = item.find("div", {"class": "score"}).get_text().strip()
    print(i + 1, title, rank, score)

Formatting a Python generated CSV

I'm making a web scraper in Python.
I'd like to remove the blank rows from the generated csv, add a header row saying "Car make", "Car Model", "Price", and remove the [] around the names in the generated csv.
imports go here...

source = requests.get(' website link goes here...').text
soup = bs(source, 'html.parser')

csv_file = open('pyScraper_1.3_Export', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['brand_Names', 'Prices'])
csv_file.close()

# gives us the make and model of all cars
Names = []
Prices_Cars = []
for var1 in soup.find_all('h3', class_='brandModelTitle'):
    car_Names = var1.text  # var1.span.text
    test_Split = car_Names.split("\n")
    full_Names = test_Split[1:3]
    # make = test_Split[1:2]
    # model = test_Split[2:3]
    Names.append(full_Names)

# prices
for Prices in soup.find_all('span', class_='f20 bold fieldPrice'):
    Prices = Prices.span.text
    Prices = re.sub("^\s+|\s+$", "", Prices, flags=re.UNICODE)  # removing whitespace before the prices
    Prices_Cars.append(Prices)

csv_file = open('pyScraper_1.3_Export.csv', 'a')
csv_writer = csv.writer(csv_file)

i = 0
while i < len(Prices_Cars):
    csv_writer.writerow([Names[i], Prices_Cars[i]])
    i = i + 1

csv_file.close()
Here is a screenshot of the generated csv: https://i.stack.imgur.com/m7Xw1.jpg
To remove additional newlines:
csv_file = open('pyScraper_1.3_Export.csv', 'a', newline='')
("If csvfile is a file object, it should be opened with newline=''.", https://docs.python.org/3/library/csv.html#csv.writer)
To add headers:
You are actually adding headers, but to a file named pyScraper_1.3_Export (note: no .csv extension), which may be a typo. Just change the code at about line 6 to
csv_file = open('pyScraper_1.3_Export.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Car make", "Car Model", "Price"])
csv_file.close()
As for removing the nested list, unpack Names[i] with the * operator:
csv_writer.writerow([*Names[i], Prices_Cars[i]])
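For illustration, the * operator simply splices the inner list into the outer one, so the written row is flat instead of nested (made-up values):

name = ['Audi', 'A4']   # hypothetical Names[i]
price = '12000'
print([name, price])    # [['Audi', 'A4'], '12000']  <- nested, shows [] in the csv
print([*name, price])   # ['Audi', 'A4', '12000']    <- flat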

adding "na" text to an array within a loop

I've gotten all the data I wanted from scraping this Metacritic URL (see below); however, I can't seem to put a value in for the cases where I don't find the associated value for a list (missing values).
I would like to have it so all the lists are even in length (so I can write them to a .csv).
Here is the code I have so far:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# Define year
year_number = 2018

# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
genresNew = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1

    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')

    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    game_publishers = html_soup.find_all("li", class_='stat publisher')
    game_ratings = html_soup.find_all("li", class_='stat maturity_rating')
    game_genres = html_soup.find_all("li", class_='stat genre')

    # Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())

    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())

    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())

    for games4 in game_users:
        userscore = games4.find('span', class_="data textscore textscore_favorable") or games4.find('span', class_="data textscore textscore_mixed")
        if userscore:
            userscores.append(userscore.text)

    for games5 in game_publishers:
        publisher = games5.find("span", class_="data")
        if publisher:
            publishers.append(publisher.text)

    for games6 in game_ratings:
        rating = games6.find("span", class_="data")

    for games7 in game_genres:
        genre = games7.find("span", class_="data")
        if genre:
            genres.append(genre.text)

for x in release_dates:
    temp = str(x)
    temp2 = temp.replace("Release Date:\n ", "")
    release_datesNew.append(temp2)

for z in genres:
    temp3 = str(z)
    temp4 = temp3.strip()
    temp5 = temp4.replace(" ", "")
    genresNew.append(temp5)

df = pd.DataFrame({'Games:': names})
I'm not sure how I would work that into this code.
From what I understand, it takes all the data it can find, but if there is a blank it doesn't know about it.
Can someone advise the best solution for this situation?
Any help would be great.
Thanks
Just add an else branch to each of the existing conditions...
if userscore:
    userscores.append(userscore.text)
else:
    userscores.append('na')
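The same pattern applies to any of the other loops that can come up empty; for example, the genres loop from the question could become (a sketch reusing the existing names):

for games7 in game_genres:
    genre = games7.find("span", class_="data")
    if genre:
        genres.append(genre.text)
    else:
        genres.append('na')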

Socket Error Exceptions in Python when Scraping

I am trying to learn scraping.
I use exceptions lower down in the code to pass over errors, because they don't affect the writing of data to csv.
I keep getting a "socket.gaierror", but in the handling of that there is a "urllib.error.URLError", and in the handling of that I get "NameError: name 'socket' is not defined", which seems circuitous.
I kind of understand that using these exceptions may not be the best way to run the code, but I can't seem to get past these errors and I don't know a way around them or how to fix them.
If you have any suggestions outside of fixing the error exceptions, that would be greatly appreciated as well.
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

base_url = 'http://www.fangraphs.com/'  # used in line 27 for concatenation
years = ['2017', '2016', '2015']  # for enough data to run tests

# Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
    if link.has_attr('href'):
        player_urls.append(base_url + link['href'])

# Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
    if test_for_playerlinks in i:
        player_alpha_links.append(i)

# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
    data = urlopen(l)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            ind_player_urls.append(link['href'])

# Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
    if jan in j:
        players.append(j)

# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
    if pitcher in i:
        pitchers.append(i)
    else:
        pos_players.append(i)

# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017', '2016', '2015']
for i in pos_players:
    for year in years:
        split_pp.append(splits + i[12:] + '&season=' + year)
        gamel.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)

split_pitcher = []
gl_pitcher = []
for i in pitchers:
    for year in years:
        split_pitcher.append(splits + i[12:] + '&season=' + year)
        gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)

# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
    for i in split_pitcher:
        sauce = urlopen(i)
        soup = BeautifulSoup(sauce, "html.parser")
        table1 = soup.find_all('strong', {"style": "font-size:15pt;"})
        row_sp = []
        for name in table1:
            nam = name.get_text()
            row_sp.append(nam)
        table = soup.find_all('table', {"class": "rgMasterTable"})
        for h in table:
            he = h.find_all('tr')
            for i in he:
                td = i.find_all('td')
                for j in td:
                    row_sp.append(j.get_text())
        rows_sp.append(row_sp)
except(RuntimeError, TypeError, NameError, URLError, socket.gaierror):
    pass

try:
    with open('SplitsPitchingData2.csv', 'w') as fp:
        writer = csv.writer(fp)
        writer.writerows(rows_sp)
except(RuntimeError, TypeError, NameError):
    pass
I'm guessing your main problem was that you queried the site, without any sleep whatsoever, for a huge number of invalid urls (you create 3 urls for the years 2015-2017 for 22880 pitchers in total, but most of these do not fall within that scope, so you have tens of thousands of queries that return errors).
I'm surprised your IP wasn't banned by the site admin. That said, it would be better to do some filtering so you avoid all those error queries...
The filter I applied is not perfect. It checks whether any of the years in the list appears at the start or the end of the year range given on the site (e.g. '2004 - 2015'). This still creates some error links, but nowhere near the number the original script did.
In code it could look like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv

base_url = 'http://www.fangraphs.com/'
years = ['2017', '2016', '2015']

# Getting Links for letters
letter_links = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
    try:
        link = base_url + link['href']
        if 'players.aspx?letter=' in link:
            letter_links.append(link)
    except:
        pass
print("[*] Retrieved {} links. Now fetching content for each...".format(len(letter_links)))

# the data resides in two different base_urls:
splits_url = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs_url = 'http://www.fangraphs.com/statsd.aspx?'

# we need (for some reason) players in two lists - pitchers_split and pitchers_game_log -
# and the rest of the players in two other lists, pos_players_split and pos_players_game_log
pos_players_split = []
pos_players_game_log = []
pitchers_split = []
pitchers_game_log = []

# and if we wanted to do something with the data from the letter queries, let's put that in a list for safe keeping:
ind_player_urls = []
current_letter_count = 0
for link in letter_links:
    current_letter_count += 1
    data = urlopen(link)
    soup = BeautifulSoup(data, "html.parser")
    trs = soup.find('div', class_='search').find_all('tr')
    for player in trs:
        player_data = [tr.text for tr in player.find_all('td')]
        # To prevent tons of queries to fangraphs with invalid years - check if elements from the years list exist in the player stat:
        if any(year in player_data[1] for year in years if player_data[1].startswith(year) or player_data[1].endswith(year)):
            href = player.a['href']
            player_data.append(base_url + href)
            # player_data now looks like this:
            # ['David Aardsma', '2004 - 2015', 'P', 'http://www.fangraphs.com/statss.aspx?playerid=1902&position=P']
            ind_player_urls.append(player_data)
            # build the links for game_log and split
            for year in years:
                split = '{}{}&season={}'.format(splits_url, href[12:], year)
                game_log = '{}{}&type=&gds=&gde=&season={}'.format(game_logs_url, href[12:], year)
                # checking if the player is a pitcher or not. We append both link and name (player_data[0]), so we don't need to extract the name later on
                if 'P' in player_data[2]:
                    pitchers_split.append([player_data[0], split])
                    pitchers_game_log.append([player_data[0], game_log])
                else:
                    pos_players_split.append([player_data[0], split])
                    pos_players_game_log.append([player_data[0], game_log])
    print("[*] Done extracting data for players for letter {} out of {}".format(current_letter_count, len(letter_links)))
    sleep(2)

    # CONSIDER INSERTING CSV-PART HERE....

# Extracting and writing pitcher data to file
with open('SplitsPitchingData2.csv', 'a') as fp:
    writer = csv.writer(fp)
    for i in pitchers_split:
        try:
            row_sp = []
            rows_sp = []
            # all elements in pitchers_split are lists. The player name is i[0], the link is i[1]
            data = urlopen(i[1])
            soup = BeautifulSoup(data, "html.parser")
            # append name to row_sp from pitchers_split
            row_sp.append(i[0])
            # the page has 3 tables with the class rgMasterTable: the first is Standard, the second Advanced, the 3rd Batted Ball
            # we're only grabbing Standard
            table_standard = soup.find_all('table', {"class": "rgMasterTable"})[0]
            trs = table_standard.find_all('tr')
            for tr in trs:
                td = tr.find_all('td')
                for content in td:
                    row_sp.append(content.get_text())
            rows_sp.append(row_sp)
            writer.writerows(rows_sp)
            sleep(2)
        except Exception as e:
            print(e)
            pass
Since I'm not sure precisely how you wanted the data formatted on output, you'll need to do some work on that yourself.
If you want to avoid waiting for all the letter_links to be extracted before you retrieve the actual pitcher stats (and fine-tune your output), you can move the csv writer part up so it runs as part of the letter loop. If you do this, don't forget to empty the pitchers_split list before grabbing another letter_link...
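A rough sketch of that restructuring, reusing the variable names from the script above (this would sit where the # CONSIDER INSERTING CSV-PART HERE.... marker is, inside the for link in letter_links: loop):

    # still inside `for link in letter_links:`, after the `for player in trs:` loop
    with open('SplitsPitchingData2.csv', 'a', newline='') as fp:
        writer = csv.writer(fp)
        for name, split_link in pitchers_split:
            data = urlopen(split_link)
            soup = BeautifulSoup(data, "html.parser")
            row_sp = [name]
            # grab the Standard table only, as above
            table_standard = soup.find_all('table', {"class": "rgMasterTable"})[0]
            for tr in table_standard.find_all('tr'):
                for td in tr.find_all('td'):
                    row_sp.append(td.get_text())
            writer.writerow(row_sp)
            sleep(2)
    pitchers_split = []  # empty the list so the next letter starts fresh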
