Information is repeating more than five times in .csv - python-3.x

I'm trying to write the information to a .csv file properly, but my code is scraping the information more than five times. I should normally have 31 reviews, yet the file shows me 301. I tried to follow the answer to this question Data to .csv is repeating three times. I need three different scrapes exported to a csv file but I didn't understand any of it. For the answer to this question Python repeating CSV file, I tried to change my code to take that solution into account, but it doesn't work. I also tried changing the variable names, but that doesn't work either. Could you tell me what is wrong and what I have to do to get the information properly? I'm really new to coding, so if you can explain your modifications line by line, I would appreciate it!
with requests.Session() as s:
    for offset in range(10, 40):
        url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d947475-Reviews-or{offset}-Le_Bouclard-Paris_Ile_de_France.html'
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        reviews = soup.select('.reviewSelector')
        ids = [review.get('data-reviewid') for review in reviews]
        r = s.post(
            'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
            data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
            headers = {'referer': r.url}
        )
        soup = bs(r.content, 'lxml')
        if not offset:
            inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
            rest_eclf = soup.select_one('.header_links a').text.strip()
        for review in reviews:
            name_client = review.select_one('.info_text > div:first-child').text.strip()
            date_rev_cl = review.select_one('.ratingDate')['title'].strip()
            titre_rev_cl = review.select_one('.noQuotes').text.strip()
            opinion_cl = review.select_one('.partial_entry').text.replace("\n", "").strip()
            row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
            w.writerow(row)

Depending on how many ids can be posted at once, I would issue all the requests that collect the ids first, then make a single POST with all of those ids.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

ids = []
results = []

with requests.Session() as s:
    for offset in range(10, 40, 10):
        url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d947475-Reviews-or{offset}-Le_Bouclard-Paris_Ile_de_France.html'
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        if offset == 10:
            inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
            rest_eclf = soup.select_one('.header_links a').text.strip()
        reviews = soup.select('.reviewSelector')
        ids += [review.get('data-reviewid') for review in reviews]

    r = s.post(
        'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
        data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
        headers = {'referer': r.url}
    )
    soup = bs(r.content, 'lxml')
    reviews = soup.select('.reviewSelector')

    for review in reviews:
        name_client = review.select_one('.info_text > div:first-child').text.strip()
        date_rev_cl = review.select_one('.ratingDate')['title'].strip()
        titre_rev_cl = review.select_one('.noQuotes').text.strip()
        opinion_cl = review.select_one('.partial_entry').text.replace("\n", "").strip()
        row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
        results.append(row)

df = pd.DataFrame(results)
df.to_csv(r'C:\Users\User\data.csv', sep=',', encoding='utf-8-sig', index=False)

My loop was running 30 times, once for each number from 10 to 39. Since every offset from 10-19 was redirected to 10, 20-29 to 20, and so on, I was scraping each results page ten times and getting ten duplicates of every review. The third argument (10) to range makes the offset advance by ten on each iteration, so each page is requested only once.
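A quick illustration of the difference, independent of the scraper:

# Without a step, range counts by 1, so offsets 10-19 (which all redirect to the
# same results page) each trigger a request - ten requests per page in total.
print(len(list(range(10, 40))))    # 30 iterations
# With a step of 10, each results page is requested exactly once.
print(list(range(10, 40, 10)))     # [10, 20, 30]

With the step in place, the corrected script looks like this: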
import requests, csv
from bs4 import BeautifulSoup as bs

with open("bouclard.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["inf_rest_name", "rest_eclf", "name_client", "date_rev_cl", "titre_rev_cl", "opinion_cl"])
    with requests.Session() as s:
        for offset in range(10, 40, 10):
            url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d947475-Reviews-or{offset}-Le_Bouclard-Paris_Ile_de_France.html'
            r = s.get(url)
            soup = bs(r.content, 'lxml')
            reviews = soup.select('.reviewSelector')
            ids = [review.get('data-reviewid') for review in reviews]
            r = s.post(
                'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
                data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
                headers = {'referer': r.url}
            )
            soup = bs(r.content, 'lxml')
            # offset starts at 10 here, so "if not offset" would never be true;
            # test for the first page explicitly instead
            if offset == 10:
                inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
                rest_eclf = soup.select_one('.header_links a').text.strip()
            for review in soup.select('.reviewSelector'):
                name_client = review.select_one('.info_text > div:first-child').text.strip()
                date_rev_cl = review.select_one('.ratingDate')['title'].strip()
                titre_rev_cl = review.select_one('.noQuotes').text.strip()
                opinion_cl = review.select_one('.partial_entry').text.replace("\n", "").strip()
                row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
                print(row)
                w.writerow(row)  # write the row to the CSV as well as printing it

Related

Unable to scrape all data using beautiful soup

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010']
Year = []
CompanyName = []
Rank = []
Score = []
for I, Page in enumerate(My_list, start=1):
url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
print(url)
Res = requests.get(url)
soup = BeautifulSoup(Res.content , 'html.parser')
data = soup.find('div' ,{'id':'main-content'})
for Data in data:
Title = data.findAll('h3')
for title in Title:
CompanyName.append(title.text.strip())
Rank = data.findAll('div' ,{'class':'rank RankNumber'})
for rank in Rank:
Rank.append(rank)
Score = data.findAll('div' ,{'class':'rank RankNumber'})
for score in Score:
Score.append(score)
I am unable to get all the data for title, Rank, and Score.
I don't know whether I have identified the right tags, and I am unable to extract values from the Rank list.
To get you started: first find all the div.RankItem elements, then, within each, find the title, rank, and score.
from bs4 import BeautifulSoup
import requests

resp = requests.get('https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/2010')
soup = BeautifulSoup(resp.content, 'html.parser')
for i, item in enumerate(soup.find_all("div", {"class": "RankItem"})):
    title = item.find("h3", {"class": "MainLink"}).get_text().strip()
    rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
    score = item.find("div", {"class": "score"}).get_text().strip()
    print(i + 1, title, rank, score)

Unable to scrape all data

from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
Year = []
CompanyName = []
Rank = []
Score = []
print('\n>>Process started please wait\n\n')
for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First', 'Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash', 'Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash', 'First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
For each URL, the expected output for year, rank, title, and score is 100 rows, but I'm getting only 10 rows.
You can iterate through the years and pages like this, using the JSON endpoint the site calls to load more company ranks:
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        else:
            print('page: %s' % page)
            tableReturn = tableReturn.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = results.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
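If the combined table should end up in a CSV like in the question, a to_csv call can follow; the file name below simply reuses the one from the question's code.

# Write the accumulated results to disk.
results.to_csv('Vault_scrap.csv', index=False)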

Problem extracting data from json format to csv using BeautifulSoup 3

I am trying to export data from JSON format to CSV but I'm getting no results.
Below is the code:
import requests
from bs4 import BeautifulSoup
import json
import re

url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.home.search.go.35e34937qjElRf"
page = requests.get(url)
print(page.status_code)
print(page.text)
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)
for item in jsonObj['itemListElement']:
    name = item['name']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
    availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
    availability = ' '.join(availability)
    print('Availability: %s Price: %0.2f %s Name: %s' % (availability, float(price), currency, name))
Here is the code where I am trying to export the data to CSV, but I don't get any results in the CSV:
# Create a file to write to and add a header row
import csv  # needed for csv.writer; missing from the snippet above

outfile = open('products.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["name", "offers", "price", "priceCurrency", "availability"])
outfile.close()

alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)
for item in jsonObj['itemListElement']:
    name = item['name']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
    availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
    availability = ' '.join(availability)
You get no results because you are not writing to the CSV inside the loop:
outfile = open('products.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["name", "type", "price", "priceCurrency", "availability"])
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)
for item in jsonObj['itemListElement']:
    name = item['name']
    type = item['#type']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
    # forgot this?
    writer.writerow([name, type, price, currency, availability])
# and close the CSV here
outfile.close()
I personally am a fan of using pandas to write a CSV. Some might say it's overkill, but it works.
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.home.search.go.35e34937qjElRf"
page = requests.get(url)
#print(page.status_code)
#print(page.text)
soup = BeautifulSoup(page.text, 'html.parser')
#print(soup.prettify())
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)

results = pd.DataFrame()
for item in jsonObj['itemListElement']:
    name = item['name']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
    availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
    availability = ' '.join(availability)
    row = [name, price, currency, availability]
    temp_df = pd.DataFrame([row], columns=['name', 'price', 'currency', 'availability'])
    results = results.append(temp_df)

results.to_csv('products.csv', index=False)
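As a side note, DataFrame.append has been removed from recent pandas releases, so on a current install the loop above would fail. A sketch of the same idea that collects plain rows first and builds the frame once (the regex-based reformatting of availability is omitted here for brevity):

rows = []
for item in jsonObj['itemListElement']:
    rows.append([
        item['name'],
        item['offers']['price'],
        item['offers']['priceCurrency'],
        item['offers']['availability'].split('/')[-1],
    ])

# Build the DataFrame in one go and write it out.
results = pd.DataFrame(rows, columns=['name', 'price', 'currency', 'availability'])
results.to_csv('products.csv', index=False)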

adding "na" text to an array within a loop

I've gotten all the data I wanted from scraping this Metacritic URL (see below); however, I can't seem to put a placeholder value into a list when the associated value is missing.
I would like all the lists to end up the same length (so I can write them to a .csv).
Here is the code I have so far:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# Define year
year_number = 2018

# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
genresNew = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1

    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')

    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    game_publishers = html_soup.find_all("li", class_='stat publisher')
    game_ratings = html_soup.find_all("li", class_='stat maturity_rating')
    game_genres = html_soup.find_all("li", class_='stat genre')

    # Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())
    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())
    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())
    for games4 in game_users:
        userscore = games4.find('span', class_="data textscore textscore_favorable") or games4.find('span', class_="data textscore textscore_mixed")
        if userscore:
            userscores.append(userscore.text)
    for games5 in game_publishers:
        publisher = games5.find("span", class_="data")
        if publisher:
            publishers.append(publisher.text)
    for games6 in game_ratings:
        rating = games6.find("span", class_="data")
    for games7 in game_genres:
        genre = games7.find("span", class_="data")
        if genre:
            genres.append(genre.text)
    for x in release_dates:
        temp = str(x)
        temp2 = temp.replace("Release Date:\n ", "")
        release_datesNew.append(temp2)
    for z in genres:
        temp3 = str(z)
        temp4 = temp3.strip()
        temp5 = temp4.replace(" ", "")
        genresNew.append(temp5)

df = pd.DataFrame({'Games:': names})
I'm not sure how to work that into this code.
From what I understand, it takes all the data it can find, but if there is a blank it doesn't know about it.
Can someone advise the best solution for this situation?
Any help would be great.
Thanks.
Just add an else branch to each of the existing conditions...
if userscore:
    userscores.append(userscore.text)
else:
    userscores.append('na')
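The same pattern keeps the other optional fields aligned. This is just a sketch reusing the loops and class names from the question's code, so the selectors may need checking against the live page:

for games5 in game_publishers:
    publisher = games5.find("span", class_="data")
    # Append a placeholder when the publisher block is missing so this list
    # stays the same length as the others.
    publishers.append(publisher.text if publisher else 'na')

for games7 in game_genres:
    genre = games7.find("span", class_="data")
    genres.append(genre.text if genre else 'na')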

I'm stumped at looping through a returned list of URLs

This is my first Python project. I'm trying to scrape restaurant inspections. One site has summaries that provide keys to the detailed reports I want to scrape, and I'm stumped at looping through the keyed list of URLs to get the details.
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
                       usecols=[2, 14, 18, 80, 81])
except IOError:
    print("The file is not accessible.")

insp.columns = ["CountyName", "InspectDate",
                "NumHighVio", "LicenseID", "VisitID"]

# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) &
                  (alachua['InspectDate'] < today)]

# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
    visitID = rows['VisitID']
    licID = rows['LicenseID']
    urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID= %s &licid= %s" % (visitID, licID)
    urls = urls.replace(' ', '')
    print(urls)

## here's my problem:
for url in urls:
    def get_inspect_detail():
        html = urlopen(url)
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
        details = soup.find_all('font', {'face': 'verdana'})[10:]
        for detail in details:
            siteName = details[0].text
            licNum = details[2].text
            siteRank = details[4].text
            expDate = details[6].text
            primeStatus = details[8].text
            secStatus = details[10].text
            siteAddress = details[12].text
            inspectResult = details[20].text
            observed1 = details[34].get_text
            observed2 = details[36].text
            observed3 = details[38].text
            observed4 = details[40].text
            observed5 = details[42].text
            observed6 = details[44].text
            observed7 = details[46].text
            observed8 = details[48].text
            observed9 = details[50].text
            observed10 = details[52].text
            detailsLib = {
                'Restaurant': siteName,
                'License': licNum,
                'Rank': siteRank,
                'Expires': expDate,
                'Primary': primeStatus,
                'Secondary': secStatus,
                'Address': siteAddress,
                'Result': inspectResult,
                'Observed1': observed1,
                'Observed2': observed2,
                'Observed3': observed3,
                'Observed4': observed4,
                'Observed5': observed5,
                'Observed6': observed6,
                'Observed7': observed7,
                'Observed8': observed8,
                'Observed9': observed9,
                'Observed10': observed10
            }
    repr(get_inspect_detail())
It's probably an obvious mistake or a gap in my knowledge, but I can get the unscrubbed data for one URL, just not for all of them.
I don't see a reason to define your function inside the loop; you would end up with a lot of redundant definitions that way. Second, you could define a result list and accumulate the detailsLib objects in it.
def get_inspect_detail(url):
    html = urlopen(url)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        expDate = details[6].text
        primeStatus = details[8].text
        secStatus = details[10].text
        siteAddress = details[12].text
        inspectResult = details[20].text
        observed1 = details[34].get_text
        observed2 = details[36].text
        observed3 = details[38].text
        observed4 = details[40].text
        observed5 = details[42].text
        observed6 = details[44].text
        observed7 = details[46].text
        observed8 = details[48].text
        observed9 = details[50].text
        observed10 = details[52].text
        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
            'Expires': expDate,
            'Primary': primeStatus,
            'Secondary': secStatus,
            'Address': siteAddress,
            'Result': inspectResult,
            'Observed1': observed1,
            'Observed2': observed2,
            'Observed3': observed3,
            'Observed4': observed4,
            'Observed5': observed5,
            'Observed6': observed6,
            'Observed7': observed7,
            'Observed8': observed8,
            'Observed9': observed9,
            'Observed10': observed10
        }
        result.append(detailsLib)
    return result

for url in urls:
    repr(get_inspect_detail(url))
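One more note on the question's code: urls is a single string that gets rebuilt on each row of the DataFrame, so for url in urls iterates over its characters. A sketch of collecting the detail-page URLs into a real list first, reusing the column names and URL template from the question:

detail_urls = []
for index, rows in alachua.iterrows():
    visit_id = rows['VisitID']
    lic_id = rows['LicenseID']
    # Build one detail-report URL per summary row.
    detail_urls.append(
        "https://www.myfloridalicense.com/inspectionDetail.asp"
        "?InspVisitID=%s&licid=%s" % (visit_id, lic_id)
    )

# Accumulate the dictionaries from every report into one flat list.
all_details = []
for url in detail_urls:
    all_details.extend(get_inspect_detail(url))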
