Web scraping data from multiple pages, then appending it to a CSV file - python-3.x

I'm working on web scraping with Beautiful Soup to retrieve jobs from Indeed. My code works, but when it loops to the next page it overwrites the existing CSV file. I see from other posts that I need to use pandas concat, but I can't seem to get it to work or figure out where to implement it in my source code. Any suggestions to improve my code would also be greatly appreciated.
The code below scrapes pages 1-2 on Indeed.
from bs4 import BeautifulSoup
import requests, pandas as pd
from urllib.parse import urljoin

print('Getting new jobs...')

main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='

for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    home = 'https://www.indeed.com/viewjob?'
    jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []

    target = soup.find_all('div', class_=' row result')
    for div in target:
        if div:
            title = div.find('a', class_='turnstileLink').text.strip()
            jobsTitle.append(title)
            company = div.find('span', class_='company').text.strip()
            companiesName.append(company)
            city = div.find('span', class_='location').text.strip()
            citiesName.append(city)
            summary = div.find('span', class_='summary').text.strip()
            jobsSummary.append(summary)
            job_link = urljoin(home, div.find('a').get('href'))
            jobsLink.append(job_link)

    target2 = soup.find_all('div', class_='lastRow row result')
    for i in target2:
        title2 = i.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title2)
        company2 = i.find('span', class_='company').text.strip()
        companiesName.append(company2)
        city2 = i.find('span', class_='location').text.strip()
        citiesName.append(city2)
        summary2 = i.find('span', class_='summary').text.strip()
        jobsSummary.append(summary2)
        jobLink2 = urljoin(home, i.find('a').get('href'))
        jobsLink.append(jobLink2)

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})
    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])

df

You can create the list data_record outside the loop and build the DataFrame with one constructor call at the end:
data_record = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    ...

    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
Possible solution with concat:
dfs = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    ...

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})
    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
    dfs.append(df)

df_fin = pd.concat(dfs, ignore_index=True)
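Since the stated goal is a CSV file, either variant can then be written out once at the end; a short sketch, where the filename jobs.csv is just an example. If you would rather append page by page instead, open the file in append mode and emit the header only when the file does not exist yet:

import os

# one-shot write after the loop
df_fin.to_csv('jobs.csv', index=False)

# alternative: append per page inside the loop, writing the header only once
df.to_csv('jobs.csv', mode='a', index=False, header=not os.path.exists('jobs.csv'))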

Related

Unable to scrape the proper data from moneycontrol board meeting information

I am able to scrape the table from this website, but I am unable to split the records the way I want. Here is my code:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get('https://www.moneycontrol.com/stocks/marketinfo/meetings.php?opttopic=brdmeeting')
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
# print(soup)

Calendar = soup.find('table', class_='b_12 dvdtbl tbldata14')
print(Calendar.text.strip())

for Company_Name in Calendar.find_all('tr'):
    rows = Company_Name.find_all('td', class_='dvd_brdb')
    print(rows)
    for row in rows:
        pl_calender = row.find_all('b')
        print(pl_calender)
Result:
Company Name
Date
Agenda
Aplab
Add to Watchlist
Add to Portfolio
14-Sep-2020
Quarterly Results
I am looking for output in the below format:
Date,Company Name,event
2020-09-14,Divi's Laboratories Ltd,AGM 14/09/2020
2020-09-14,Grasim Industries Ltd.,AGM 14/09/2020
Thanks in advance, Jana. Stay safe and live healthy.
Here is what I tried:
r = requests.get('https://www.moneycontrol.com/stocks/marketinfo/meetings.php?opttopic=brdmeeting')
soup = BeautifulSoup(r.text, 'lxml')
mytable = soup.find('table', class_='b_12 dvdtbl tbldata14')

companyname = mytable.find_all('b')
date = mytable.find_all('td', attrs={'class': 'dvd_brdb', 'align': 'center'})
agenda = mytable.find_all('td', attrs={'class': 'dvd_brdb', 'style': 'text-align:left;'})

companyname_list = []
date_list = []
agenda_list = []

for cn in companyname:
    companyname_list.append(cn.get_text())
for dt in date:
    date_list.append(dt.get_text())
for ag in agenda:
    agenda_list.append(ag.get_text())

del companyname_list[0:2]
del date_list[0:2]

fmt = '{:<8}{:<20}{:<20}{:<20}'
print(fmt.format('', 'Date', 'Company Name', 'Agenda'))
for i, (datess, companynamess, agendass) in enumerate(zip(date_list, companyname_list, agenda_list)):
    print(fmt.format(i, datess, companynamess, agendass))
Result:
Date Company Name Agenda
0 15-Sep-2020 7NR Retail Quarterly Results
1 15-Sep-2020 Scan Projects Quarterly Results
2 15-Sep-2020 Avonmore Cap Quarterly Results
3 15-Sep-2020 Elixir Cap Quarterly Results
4 15-Sep-2020 Aarvee Denim Quarterly Results
5 15-Sep-2020 Vipul Quarterly Results
...
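Building on that second attempt, a minimal sketch of one way to get the requested Date,Company Name,event layout; the date reformatting via pd.to_datetime and the output filename board_meetings.csv are assumptions, not part of the original post:

import pandas as pd

# assemble the three lists scraped above into one frame
df = pd.DataFrame({'Date': date_list, 'Company Name': companyname_list, 'event': agenda_list})
# reformat 15-Sep-2020 -> 2020-09-15 to match the requested output
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%Y').dt.strftime('%Y-%m-%d')
df.to_csv('board_meetings.csv', index=False)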

How do I extract data from a graph on a web page?

I am trying to scrape graph data from the webpage https://cawp.rutgers.edu/women-percentage-2020-candidates.
I tried the code below to extract data from the graphs:
import requests
import pandas as pd
from bs4 import BeautifulSoup

Res = requests.get('https://cawp.rutgers.edu/women-percentage-2020-candidates').text
soup = BeautifulSoup(Res, "html.parser")
Values = [i.text for i in soup.findAll('g', {'class': 'igc-graph'}) if i]
Dates = [i.text for i in soup.findAll('g', {'class': 'igc-legend-entry'}) if i]
print(Values, Dates)  # both lists are empty
Data = pd.DataFrame({'Value': Values, 'Date': Dates})  # returns an empty DataFrame
I want to extract Date and Value from all four bar graphs. Can anyone suggest what I have to do to extract the graph data, or any other method I can try? Thanks.
The graph is actually located at this url: https://e.infogram.com/5bb50948-04b2-4113-82e6-5e5f06236538
You can find the infogram id (the path of the target url) directly on the original page by looking for the div with class infogram-embed, whose data-id attribute holds the value:
<div class="infogram-embed" data-id="5bb50948-04b2-4113-82e6-5e5f06236538" data-title="Candidate Tracker 2020_US House_Proportions" data-type="interactive"> </div>
That url loads a static JSON inside its JavaScript. You can use a regex to extract it, then parse the JSON structure to get the rows/columns of the different tables:
import requests
from bs4 import BeautifulSoup
import re
import json

original_url = "https://cawp.rutgers.edu/women-percentage-2020-candidates"
r = requests.get(original_url)
soup = BeautifulSoup(r.text, "html.parser")

infogram_url = f'https://e.infogram.com/{soup.find("div", {"class": "infogram-embed"})["data-id"]}'
r = requests.get(infogram_url)
soup = BeautifulSoup(r.text, "html.parser")

script = [
    t
    for t in soup.findAll("script")
    if "window.infographicData" in t.text
][0].text

extract = re.search(r".*window\.infographicData=(.*);$", script)
data = json.loads(extract.group(1))

entities = data["elements"]["content"]["content"]["entities"]
tables = [
    (entities[key]["props"]["chartData"]["sheetnames"], entities[key]["props"]["chartData"]["data"])
    for key in entities.keys()
    if ("props" in entities[key]) and ("chartData" in entities[key]["props"])
]

data = []
for t in tables:
    for i, sheet in enumerate(t[0]):
        data.append({
            "sheetName": sheet,
            "table": dict([(t[1][i][0][j], t[1][i][1][j]) for j in range(len(t[1][i][0]))])
        })
print(data)
Output:
[{'sheetName': 'Sheet 1',
'table': {'': '2020', 'Districts Already Filed': '435'}},
{'sheetName': 'All',
'table': {'': 'Filed', '2016': '17.8%', '2018': '24.2%', '2020': '29.1%'}},
{'sheetName': 'Democrats Only',
'table': {'': 'Filed', '2016': '25.1%', '2018': '32.5%', '2020': '37.9%'}},
{'sheetName': 'Republicans Only',
'table': {'': 'Filed', '2016': '11.5%', '2018': '13.7%', '2020': '21.3%'}}]
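Since the original goal was a DataFrame of Date/Value pairs, a small follow-up sketch that flattens the parsed tables; the column names here are assumptions:

import pandas as pd

# one row per (sheet, label, value) entry from the parsed tables
rows = [
    {'Sheet': d['sheetName'], 'Date': label, 'Value': value}
    for d in data
    for label, value in d['table'].items()
]
df = pd.DataFrame(rows)
print(df)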

Unable to scrape all data using beautiful soup

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010']
Year = []
CompanyName = []
Rank = []
Score = []
for I, Page in enumerate(My_list, start=1):
url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
print(url)
Res = requests.get(url)
soup = BeautifulSoup(Res.content , 'html.parser')
data = soup.find('div' ,{'id':'main-content'})
for Data in data:
Title = data.findAll('h3')
for title in Title:
CompanyName.append(title.text.strip())
Rank = data.findAll('div' ,{'class':'rank RankNumber'})
for rank in Rank:
Rank.append(rank)
Score = data.findAll('div' ,{'class':'rank RankNumber'})
for score in Score:
Score.append(score)
I am unable to get all the data for title, Rank, and Score. I don't know whether I have identified the right tags, and I am unable to extract values from the list Rank.
To get you started. First, find all the div.RankItem elements, then within each, find the title, rank, and score.
from bs4 import BeautifulSoup
import requests

resp = requests.get('https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/2010')
soup = BeautifulSoup(resp.content, 'html.parser')

for i, item in enumerate(soup.find_all("div", {"class": "RankItem"})):
    title = item.find("h3", {"class": "MainLink"}).get_text().strip()
    rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
    score = item.find("div", {"class": "score"}).get_text().strip()
    print(i + 1, title, rank, score)

Unable to scrape all data

from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']

Year = []
CompanyName = []
Rank = []
Score = []

print('\n>>Process started please wait\n\n')

for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First', 'Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash', 'Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash', 'First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
For each url, the expected output for year, rank, title, and score is 100 lines, but I'm getting only 10 lines.
You can iterate through the years and pages like this, calling the site's JSON endpoint directly:
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        print('page: %s' % page)
        # concatenate this page's rows onto the running table
        tableReturn = pd.concat([tableReturn, pd.DataFrame(jsonData)], ignore_index=True, sort=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = pd.concat([results, jsonData], ignore_index=True, sort=True)
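The combined frame can then be saved in one go, reusing the question's filename:

results.to_csv('Vault_scrap.csv', index=False)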

How do I scrape the product price from target.com product page?

I've recently learned about web scraping and wanted to create a program that scrapes daily product prices. I'm using requests and bs4 in Python to scrape target.com. So far this is my code:
import requests
from bs4 import BeautifulSoup
from random import choice
from time import sleep

TIMES = [2, 3, 4, 5, 6, 7]
url = 'https://www.target.com/p/dyson-ball-animal-2-upright-vacuum-iron-purple/-/A-52190951'

sleep(choice(TIMES))
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

sleep(choice(TIMES))
name = soup.find('h1').get_text().strip().replace(',', ';')
print('Product name: ', name)

sleep(choice(TIMES))
current_price = soup.find('span', {'data-test': 'product-savings'})
print('Current price: ', current_price)
When I run my code, the product name is correct, but the current price is always "None". Is there a different way I should be searching for the product price?
Thanks in advance!
As long as you have the item/product ID, you can create a session to get the local store ID and API key, and then fetch the price from the API:
import pandas as pd
import requests

s = requests.session()
s.get('https://www.target.com')
key = s.cookies['visitorId']
location = s.cookies['GuestLocation'].split('|')[0]

store_id = requests.get('https://redsky.target.com/v3/stores/nearby/%s?key=%s&limit=1&within=100&unit=mile' % (location, key)).json()
store_id = store_id[0]['locations'][0]['location_id']

product_id = '52190951'
url = 'https://redsky.target.com/web/pdp_location/v1/tcin/%s' % product_id
payload = {
    'pricing_store_id': store_id,
    'key': key}

jsonData = requests.get(url, params=payload).json()
df = pd.DataFrame(jsonData['price'], index=[0])
Output:
print (df.to_string())
tcin location_id reg_retail current_retail current_retail_start_timestamp current_retail_end_timestamp default_price formatted_current_price formatted_current_price_type is_current_price_range
0 52190951 3991 499.99 499.99 2019-10-19T07:00:00Z 9999-12-31T00:00:00Z False $499.99 reg False
You do not want to scrape the HTML; you want to scrape either the embedded microdata or the embedded 'ld+json' data. One of them contains the product ID. Once you have that value, plug it into the redsky.target.com API (see the product ID value in the URL below):
https://redsky.target.com/v2/pdp/tcin/52190951?excludes=taxonomy,promotion,bulk_ship,rating_and_review_reviews,rating_and_review_statistics,question_answer_statistics
… then parse the returned JSON to get the price. This might help.
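A minimal sketch of that idea, assuming the product page embeds a ld+json block containing a sku field; the exact shape of the redsky response is also an assumption, so inspect the raw JSON if the final lookup comes back empty:

import re
import requests
from bs4 import BeautifulSoup

url = 'https://www.target.com/p/dyson-ball-animal-2-upright-vacuum-iron-purple/-/A-52190951'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# look for a "sku" field in any of the embedded ld+json blocks (structure assumed)
product_id = None
for tag in soup.find_all('script', {'type': 'application/ld+json'}):
    match = re.search(r'"sku"\s*:\s*"?(\d+)', tag.string or '')
    if match:
        product_id = match.group(1)
        break

api = ('https://redsky.target.com/v2/pdp/tcin/%s'
       '?excludes=taxonomy,promotion,bulk_ship,rating_and_review_reviews,'
       'rating_and_review_statistics,question_answer_statistics' % product_id)
data = requests.get(api).json()
print(data.get('product', {}).get('price'))  # response shape assumed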