I'm stumped at looping through a returned list of URLs - python-3.x

This is my first Python project: I'm trying to scrape restaurant inspection reports. One site has summaries that provide the keys to the detailed reports I want to scrape. I'm stumped at looping through the keyed list of URLs to get the details.
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError
# Download the statewide inspection CSV, keeping only the five columns used below.
try:
insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
usecols=[2,14,18,80,81])
except IOError:
print("The file is not accessible.")
# NOTE(review): if the read above fails, `insp` is never bound and this line raises NameError.
insp.columns = ["CountyName", "InspectDate",
"NumHighVio", "LicenseID", "VisitID"]
# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) &
(alachua['InspectDate'] < today)]
# takes LicenseID and VisitID, passes it into the urls for detailed reports
# NOTE(review): `urls` is rebound to one string on every pass, so once the loop
# finishes it holds only the last URL (a str), not a collection of URLs.
for index, rows in alachua.iterrows():
visitID = rows['VisitID']
licID = rows['LicenseID']
urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=
%s &licid= %s" % (visitID, licID)
# The template contains spaces around the placeholders; strip them from the final URL.
urls = urls.replace(' ', '')
print(urls)
## here's my problem:
# NOTE(review): `urls` is a single str at this point, so this loop yields one
# character at a time rather than one URL at a time.
for url in urls:
# The function is redefined on every iteration and reads `url` from the
# enclosing scope instead of taking it as a parameter.
def get_inspect_detail():
html = urlopen(url)
soup = bs4.BeautifulSoup(html.read(), 'lxml')
# Skip the first ten verdana <font> cells; the fields below are read by position.
details = soup.find_all('font', {'face':'verdana'})[10:]
# NOTE(review): the loop variable `detail` is unused; every pass reads the same
# fixed positions of `details`.
for detail in details:
siteName = details[0].text
licNum = details[2].text
siteRank = details[4].text
expDate = details[6].text
primeStatus = details[8].text
secStatus = details[10].text
siteAddress = details[12].text
inspectResult = details[20].text
# NOTE(review): `.get_text` is not called here, so this stores the bound method, not text.
observed1 = details[34].get_text
observed2 = details[36].text
observed3 = details[38].text
observed4 = details[40].text
observed5 = details[42].text
observed6 = details[44].text
observed7 = details[46].text
observed8 = details[48].text
observed9 = details[50].text
observed10 = details[52].text
detailsLib = {
'Restaurant': siteName,
'License': licNum,
'Rank': siteRank,
'Expires': expDate,
'Primary': primeStatus,
'Secondary': secStatus,
'Address': siteAddress,
'Result': inspectResult,
'Observed1': observed1,
'Observed2': observed2,
'Observed3': observed3,
'Observed4': observed4,
'Observed5': observed5,
'Observed6': observed6,
'Observed7': observed7,
'Observed8': observed8,
'Observed9': observed9,
'Observed10': observed10
}
# NOTE(review): the function has no return statement, and the repr() result is discarded.
repr(get_inspect_detail())
This is probably an obvious mistake or a gap in my knowledge: I can get the unscrubbed data for one URL, but not for all of them.

First, I don't see a reason to define your function inside the loop; you would end up with a lot of redundant definitions that way. Define it once and pass the URL in as a parameter. Second, you could just define a result list and accumulate the detailsLib objects inside it.
def get_inspect_detail(url):
    """Scrape one inspection-detail page into a list of field dicts.

    The page is parsed with lxml and every ``<font face="verdana">`` cell
    after the first ten is treated as a positional field.

    Args:
        url: Address of a single inspection-detail page.

    Returns:
        A list holding one dict for the page (keys ``Restaurant``,
        ``License``, ``Rank``, ``Expires``, ``Primary``, ``Secondary``,
        ``Address``, ``Result`` and ``Observed1`` .. ``Observed10``), or an
        empty list when the page has no matching cells. A field whose
        position is missing from the page is returned as ``''``.

    Raises:
        urllib.error.HTTPError / URLError: if the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as html:
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    if not details:
        return []

    def text_at(position):
        # Pages with fewer cells than the template would otherwise raise
        # IndexError on the later positions.
        return details[position].text if position < len(details) else ''

    # Fixed positions of the header fields within `details`.
    field_positions = {
        'Restaurant': 0,
        'License': 2,
        'Rank': 4,
        'Expires': 6,
        'Primary': 8,
        'Secondary': 10,
        'Address': 12,
        'Result': 20,
    }
    detailsLib = {key: text_at(pos) for key, pos in field_positions.items()}
    # Observations occupy every second cell from position 34 to 52.
    for number in range(1, 11):
        detailsLib['Observed%d' % number] = text_at(32 + 2 * number)

    # One record per page: the fields are read by fixed position, so looping
    # over `details` would only append identical copies.
    return [detailsLib]
# NOTE(review): `urls` must be a list of URL strings here. The script in the
# question leaves it as a single str, which this loop would walk character by
# character. The repr() result is also discarded rather than printed or stored.
for url in urls:
repr(get_inspect_detail(url))

Related

Unable to scrape all data

from bs4 import BeautifulSoup
import requests , sys ,os
import pandas as pd
# Scrape the Vault law-firm rankings for each year and write them to one CSV.
# NOTE(review): `URL` is not used below; the loop builds its own `url` per year.
URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
# Parallel column lists, filled one entry per ranked item.
Year= []
CompanyName = []
Rank = []
Score = []
print('\n>>Process started please wait\n\n')
for I, Page in enumerate(My_list, start=1):
url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
print('\nData fetching from : ',url)
Res = requests.get(url)
soup = BeautifulSoup(Res.content , 'html.parser')
data = soup.find('section',{'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
# NOTE(review): this checks the parsed document, not `data`; if the section is
# not found, `data` is None and `data.find_all` below raises AttributeError.
if len(soup) > 0:
print("\n>>Getting page source for :" , url)
else:
print("Please Check url :",url)
# NOTE(review): only the items present in the fetched HTML are seen here;
# presumably the remaining ranks are loaded separately by the page - confirm.
for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
# NOTE(review): `year` is appended as the element found (or None), not as text.
year = item.find("i",{"class":"fa-stack fa-2x"})
Year.append(year)
title = item.find("h3", {"class": "MainLink"}).get_text().strip()
CompanyName.append(title)
rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
Rank.append(rank)
score = item.find("div", {"class": "score"}).get_text().strip()
Score.append(score)
Data = pd.DataFrame({"Year":Year,"CompanyName":CompanyName,"Rank":Rank,"Score":Score})
# Split the score on the space and the rank on "#", then drop the leftover pieces.
Data[['First','Score']] = Data.Score.str.split(" " , expand =True,)
Data[['hash','Rank']] = Data.Rank.str.split("#" , expand = True,)
Data.drop(columns = ['hash','First'],inplace = True)
Data.to_csv('Vault_scrap.csv',index = False)
For each url the expected output Data for year, rank, title and score is 100 lines, but I'm getting only 10 lines.
You can iterate through the year and pages like this.
import requests
import pandas as pd
url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'
def page_loop(year, url):
    """Fetch every ranking page for one year and return them as one DataFrame.

    Pages 1..100 are requested in order from the JSON endpoint; fetching
    stops at the first page that comes back empty.

    Args:
        year: Ranking year sent as the ``year`` query parameter.
        url: Address of the JSON rankings endpoint.

    Returns:
        A DataFrame with one row per returned record, columns sorted and the
        index renumbered from 0. Empty if the first page has no records.
    """
    frames = []
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page,
        }
        jsonData = requests.get(url, params=payload).json()
        if not jsonData:
            # An empty response marks the end of the data for this year.
            break
        print('page: %s' % page)
        frames.append(pd.DataFrame(jsonData))

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated rows on every page.
    return pd.concat(frames, sort=True).reset_index(drop=True)
# Fetch every year from 2007 through 2020 and combine the per-year frames.
# DataFrame.append was removed in pandas 2.0, so the frames are collected in
# a list and concatenated once.
yearly_frames = []
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    yearly_frames.append(page_loop(year, url))
results = pd.concat(yearly_frames, sort=True).reset_index(drop=True)

how to access bestbuy item price

I want to check the price of an item on the Best Buy website; however, access is denied. Does anyone have advice on how to get access? Thanks!
My code:
import requests
import bs4 as bs
# Fetch a product page, save the parsed HTML, and compare the price to a budget.
url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
# NOTE(review): no headers are sent with this request; presumably the site
# rejects the default client User-Agent, giving the "access denied" page - confirm.
url_get = requests.get(url)
soup = bs.BeautifulSoup(url_get.content, 'lxml')
with open('url_bestbuy.txt', 'w', encoding='utf-8') as f_out:
f_out.write(soup.prettify())
js_test = soup.find('span', id ='priceblock_ourprice')
if js_test is None:
# NOTE(review): 'div.price-block' looks like a CSS selector, but it is passed as an id value.
js_test = soup.find('span', id ='div.price-block')
# NOTE(review): `str` shadows the builtin. If both lookups return None, the
# loop below raises AttributeError on `.stripped_strings`.
str = ""
for line in js_test.stripped_strings :
str = line
# convert to integer
str = str.replace(", ", "")
str = str.replace("$", "")
current_price = int(float(str))
your_price = 2000
if current_price < your_price :
print("I can afford it")
else:
print("Price is high please wait for the best deal")
You don't have permission to access "http://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?" on this server.

Problem extracting data from json format to csv using BeautifulSoup 3

I am trying to export data from JSON format to CSV, but I am getting no results.
below is the code
import requests
from bs4 import BeautifulSoup
import json
import re
# Fetch the search results page and echo the raw response for debugging.
url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.home.search.go.35e34937qjElRf"
page = requests.get(url)
print(page.status_code)
print(page.text)
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())
# The item list is read from the second JSON-LD <script> block on the page.
alpha = soup.find_all('script', {'type': 'application/ld+json'})
# The stray backticks around this call were a syntax error and are removed.
jsonObj = json.loads(alpha[1].text)
for item in jsonObj['itemListElement']:
    name = item['name']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    # Keep only the last path segment of the availability URL.
    availability = item['offers']['availability'].split('/')[-1]
    # Split the CamelCase value at capital letters and re-join with spaces.
    availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
    availability = ' '.join(availability)
    print('Availability: %s Price: %0.2f %s Name: %s' % (availability, float(price), currency, name))
Here is the code I am using to export the data to CSV, but no results are written to the file:
# Create a file to write to, add headers row
# NOTE(review): `csv` is not among the imports shown with this script.
outfile = open('products.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["name", "offers", "price", "priceCurrency", "availability" ])
# NOTE(review): the file is closed right after the header row, and the loop
# below never calls writer.writerow, so no data rows reach products.csv.
outfile.close()
alpha = soup.find_all('script',{'type':'application/ld+json'})
jsonObj = json.loads(alpha[1].text)
for item in jsonObj['itemListElement']:
name = item['name']
price = item['offers']['price']
currency = item['offers']['priceCurrency']
availability = item['offers']['availability'].split('/')[-1]
availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
availability = ' '.join(availability)
You get no results because you are not writing to the CSV inside the loop, and the file is closed before the loop runs:
import csv

# The item list is read from the second JSON-LD <script> block on the page.
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)

# The context manager closes the file even if a row fails part-way through.
with open('products.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["name", "type", "price", "priceCurrency", "availability"])
    for item in jsonObj['itemListElement']:
        offers = item['offers']
        # JSON-LD keywords start with "@"; '#type' is not a key in the data.
        item_type = item['@type']
        # Keep only the last path segment of the availability URL.
        availability = offers['availability'].split('/')[-1]
        # One CSV row per item, written inside the loop.
        writer.writerow([
            item['name'],
            item_type,
            offers['price'],
            offers['priceCurrency'],
            availability,
        ])
I personally am a fan of pandas for writing a CSV. Some might say it's excessive, but it works.
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
# Fetch the search results page and parse it.
url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.home.search.go.35e34937qjElRf"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# The item list is read from the second JSON-LD <script> block on the page.
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)

# Collect plain rows and build the frame once: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
rows = []
for item in jsonObj['itemListElement']:
    offers = item['offers']
    # Keep only the last path segment of the availability URL, then split
    # the CamelCase value at capital letters and re-join with spaces.
    availability = offers['availability'].split('/')[-1]
    availability = ' '.join(
        s for s in re.split("([A-Z][^A-Z]*)", availability) if s
    )
    rows.append([item['name'], offers['price'], offers['priceCurrency'], availability])

results = pd.DataFrame(rows, columns=['name', 'price', 'currency', 'availability'])
results.to_csv('products.csv', index=False)

Python 3 code stops at HTTP error and I can't figure out how to handle it

I'm trying to scrape links from the website https://www.usyouthsoccer.org/clubs/club-directory/. Initially, the code broke at the 30th link, so I tried to handle the exception error with urllib HTTPError. Now, the script just stops running at the 30th link. I checked that specific url and it is a bad link. I just want to move past it in the loop, but I'm having trouble with the work around. Any suggestions would be greatly appreciated...
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
from urllib.request import Request, urlopen
from urllib.error import HTTPError
executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=True)
# Page address and form locators for the club directory search.
url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
zipcode_input = 'CT_Main_0$txtLocation'
# XPath selects attributes with "@"; "#id" is CSS syntax and not valid XPath.
search_button = '//*[@id="CT_Main_0_btnSearch"]'
dropdown = '//*[@id="CT_Main_0_drpMiles"]/option[5]'
zip_codes = [64015]
# Accumulates one row per club across all zip codes.
team_df = pd.DataFrame()
# For each zip code: run the directory search, then visit every club link and
# append that club's details and contacts to `team_df`.
for x in zip_codes:
# NOTE(review): this try spans the whole body, including the inner link loop,
# so an HTTPError on one link abandons the remaining links for this zip code.
try:
print(f'\n{x}\n')
url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
browser.visit(url)
browser.fill(zipcode_input, x)
browser.find_by_xpath(dropdown).click()
browser.find_by_xpath(search_button).click()
html = browser.html
soup = bs(html, 'html.parser')
dallas_urls = soup.find_all(class_="more")
counter = 1
# NOTE(review): the loop variable `url` rebinds the `url` assigned above.
for url in dallas_urls:
print(f'Link {counter} of {len((dallas_urls))}')
counter += 1
back_url = url['href']
front_url = 'https://www.usyouthsoccer.org'
total_url = front_url + back_url
browser.visit(total_url)
# pd.read_html is given the URL, not the browser's HTML, so it fetches the page itself.
my_html = pd.read_html(total_url)
# The first table is relabelled Cols/Vals and transposed into a single row.
details_pd = pd.DataFrame(my_html[0])
details_pd.columns = ['Cols', 'Vals']
df = details_pd.T
df.columns = df.iloc[0]
df.drop('Cols', inplace = True)
# The second table is read as contacts; one, two or three rows are copied
# into numbered columns. Other row counts add no contact columns.
contacts_pd = pd.DataFrame(my_html[1])
if len(contacts_pd.index) == 1:
df['Contact_Title'] = contacts_pd.iloc[0,0]
df['Contact_Name'] = contacts_pd.iloc[0, 1]
df['Contact_Email'] = contacts_pd.iloc[0, 2]
elif len(contacts_pd.index) == 2:
df['Contact_Title'] = contacts_pd.iloc[0,0]
df['Contact_Name'] = contacts_pd.iloc[0, 1]
df['Contact_Email'] = contacts_pd.iloc[0, 2]
df['Contact_Title2'] = contacts_pd.iloc[1,0]
df['Contact_Name2'] = contacts_pd.iloc[1, 1]
df['Contact_Email2'] = contacts_pd.iloc[1, 2]
elif len(contacts_pd.index) == 3:
df['Contact_Title'] = contacts_pd.iloc[0,0]
df['Contact_Name'] = contacts_pd.iloc[0, 1]
df['Contact_Email'] = contacts_pd.iloc[0, 2]
df['Contact_Title2'] = contacts_pd.iloc[1,0]
df['Contact_Name2'] = contacts_pd.iloc[1, 1]
df['Contact_Email2'] = contacts_pd.iloc[1, 2]
df['Contact_Title3'] = contacts_pd.iloc[2,0]
df['Contact_Name3'] = contacts_pd.iloc[2, 1]
df['Contact_Email3'] = contacts_pd.iloc[2, 2]
team_df = pd.concat([team_df, df])
except HTTPError as err:
continue
Put your try statement inside the nested for loop. Right now, an HTTP error stops the entire inner loop instead of skipping the bad link and continuing with the next one.
# Bind the `urllib` package name explicitly; `from urllib.request import ...`
# alone does not make `urllib.request` / `urllib.error` usable by that name.
import urllib.error
import urllib.request

front_url = 'https://www.usyouthsoccer.org'
for counter, url in enumerate(dallas_urls, start=1):
    print(f'Link {counter} of {len(dallas_urls)}')
    total_url = front_url + url['href']
    # Keep the try body to the one call that can fail, so a bad link skips
    # only this iteration.
    try:
        urllib.request.urlretrieve(total_url)
    except urllib.error.HTTPError:
        print('Error')
        continue

adding "na" text to an array within a loop

I've gotten all the data I wanted from scraping this Metacritic URL (see below); however, I can't seem to insert a placeholder value when the associated value for a list is not found (missing values).
I would like all the lists to be the same length (so I can write them to .csv).
Here is the code I have so far:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
#Define year
# Scrape the Metacritic PC list for a year into parallel per-field lists.
# NOTE(review): several lists below are appended to only when a value is found,
# so the lists can end up with different lengths.
year_number = 2018
# Define the URL
# A single-element range: the outer loop below runs once.
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
genresNew = []
for element in i:
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
print(url)
year_number -= 1
# not sure about this but it works (I was getting blocked by something and this the way I found around it)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
#this grabs the all the text from the page
html_soup = BeautifulSoup(webpage, 'html5lib')
#this is for selecting all the games in from 1 to 100 (the list of them)
game_names = html_soup.find_all("div", class_="main_stats")
game_metas = html_soup.find_all("a", class_="basic_stat product_score")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
game_releases = html_soup.find_all("ul", class_='more_stats')
game_publishers = html_soup.find_all("li", class_='stat publisher')
game_ratings = html_soup.find_all("li", class_='stat maturity_rating')
game_genres = html_soup.find_all("li", class_='stat genre')
#Extract data from each game
for games in game_names:
name = games.find()
names.append(name.text.strip())
for games2 in game_metas:
metascore = games2.find()
metascores.append(metascore.text.strip())
for games3 in game_releases:
release_date = games3.find()
release_dates.append(release_date.text.strip())
for games4 in game_users:
userscore = games4.find('span', class_="data textscore textscore_favorable") or games4.find('span', class_="data textscore textscore_mixed")
# Appended only when one of the two span classes matched; otherwise this game is skipped.
if userscore:
userscores.append(userscore.text)
for games5 in game_publishers:
publisher = games5.find("span", class_ = "data")
if publisher:
publishers.append(publisher.text)
for games6 in game_ratings:
# NOTE(review): `rating` is looked up but never appended to `ratings`.
rating = games6.find("span", class_ = "data")
for games7 in game_genres:
genre = games7.find("span", class_ = "data")
if genre:
genres.append(genre.text)
# Strip the "Release Date:" label from each collected date string.
for x in release_dates:
temp = str(x)
temp2 = temp.replace("Release Date:\n ", "")
release_datesNew.append(temp2)
# Remove surrounding whitespace and all spaces from each genre string.
for z in genres:
temp3 = str(z)
temp4 = temp3.strip()
temp5 = temp4.replace(" ", "")
genresNew.append(temp5)
# Only the names list is placed in the DataFrame here.
df = pd.DataFrame({'Games:': names})
I'm not sure how I would work that into this code.
From what I understand, it takes all the data it can find, but if a value is missing, nothing is recorded for it, so the lists end up with different lengths.
Can someone advise on the best solution for this situation?
any help would be great
Thanks
Just add else's for the existing conditions...
# Always append exactly one entry per game: the score text when the span was
# found, otherwise the placeholder 'na', so the list stays aligned.
userscores.append(userscore.text if userscore else 'na')

Resources