I'm trying to scrape the title of an item on a foreign version of a site.
After I run the Python script, the CLI launches but returns nothing at all.
In IPython, title = soup.find('a', {'class': 'vip'}).text gets the title just fine on its own, but it doesn't work in PyCharm within the full code, even though I went to my settings and downloaded the BeautifulSoup package for my current interpreter.
Any idea why? Thanks.
#!/usr/bin/python3
import csv
import time

import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.find('a', {'class': 'vip'}).text
    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.find_all('span', {'class': 'bold'}).text
    except:
        currency = ''

    # items sold
    try:
        i_s = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = i_s.strip().split(' ')[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data


def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)


if __name__ == '__main__':
    main()
It seems the .fr site uses different markup, so you need to change the class names/attributes accordingly.
For example:
import re
import csv
import time

import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.select_one('h1[itemprop="name"]')
        for span in title.select('span'):
            span.extract()
        title = title.get_text(strip=True)
    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except:
        currency = ''

    # items sold
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data


def get_index_data(soup):
    links = soup.select('.sresult h3 a')
    urls = [item.get('href') for item in links]
    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # write_csv(data, link)  # <-- I commented it out, to just print to screen


if __name__ == '__main__':
    main()
Prints:
Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}
...and so on.
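One more thing worth noting: write_csv (in both versions) appends the header row before every data row, so output.csv ends up with a header per product. A minimal sketch of a variant that writes the header only when the file is missing or empty (same data dict and filename as above, only the header check is new):

import csv
import os


def write_csv(data, url):
    # Write the header only if the file does not exist yet or is empty.
    write_header = not os.path.exists('output.csv') or os.path.getsize('output.csv') == 0
    with open('output.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow([data['title'], data['price'], data['currency'], data['total sold'], url])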
I am able to scrape the tables from this website, but I am unable to split the records the way I want. Here is my code:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get('https://www.moneycontrol.com/stocks/marketinfo/meetings.php?opttopic=brdmeeting')
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
# print(soup)

Calendar = soup.find('table', class_='b_12 dvdtbl tbldata14').text
print(Calendar.strip())

for Company_Name in Calendar.find_all('tr'):
    rows = Company_Name.find_all('td', class_='dvd_brdb')
    print(rows)
    for row in rows:
        pl_calender = row.find_all('b')
        print(pl_calender)
Result
Company Name
Date
Agenda
Aplab
Add to Watchlist
Add to Portfolio
14-Sep-2020
Quarterly Results
I am looking for output in the below format:
Date,Company Name,event
2020-09-14,Divi's Laboratories Ltd,AGM 14/09/2020
2020-09-14,Grasim Industries Ltd.,AGM 14/09/2020
Thanks in advance.
Here is what I tried:
r = requests.get('https://www.moneycontrol.com/stocks/marketinfo/meetings.php?opttopic=brdmeeting')
soup = BeautifulSoup(r.text, 'lxml')

mytable = soup.find('table', class_='b_12 dvdtbl tbldata14')
companyname = mytable.find_all('b')
date = mytable.find_all('td', attrs={'class': 'dvd_brdb', 'align': 'center'})
agenda = mytable.find_all('td', attrs={'class': 'dvd_brdb', 'style': 'text-align:left;'})

companyname_list = []
date_list = []
agenda_list = []

for cn in companyname:
    companyname_list.append(cn.get_text())
for dt in date:
    date_list.append(dt.get_text())
for ag in agenda:
    agenda_list.append(ag.get_text())

del companyname_list[0:2]
del date_list[0:2]

fmt = '{:<8}{:<20}{:<20}{:<20}'
print(fmt.format('', 'Date', 'Company Name', 'Agenda'))
for i, (datess, companynamess, agendass) in enumerate(zip(date_list, companyname_list, agenda_list)):
    print(fmt.format(i, datess, companynamess, agendass))
Result :
Date Company Name Agenda
0 15-Sep-2020 7NR Retail Quarterly Results
1 15-Sep-2020 Scan Projects Quarterly Results
2 15-Sep-2020 Avonmore Cap Quarterly Results
3 15-Sep-2020 Elixir Cap Quarterly Results
4 15-Sep-2020 Aarvee Denim Quarterly Results
5 15-Sep-2020 Vipul Quarterly Results
...
....
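If you specifically want the comma-separated layout from the question (Date, Company Name, Agenda), you could write the same three lists out with csv.writer instead of printing; a minimal sketch reusing the lists built above (the output filename is just an example):

import csv

with open('board_meetings.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Date', 'Company Name', 'Agenda'])
    # date_list, companyname_list and agenda_list come from the code above.
    for dt, name, ag in zip(date_list, companyname_list, agenda_list):
        writer.writerow([dt.strip(), name.strip(), ag.strip()])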
Python novice back again! I got a lot of great help on this, but am now stumped. The code below scrapes soccer match data and scores from the Lehigh University soccer website. I am trying to split the scores format ['T', '0-0(2 OT)'] into 3 columns: 'T', '0-0', '2 OT', but I am running into problems. The issue lies in this part of the code:
=> for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
=>     result = result.get_text(strip=True).split(',')
I tried .split(',') but that did not work, as it just created ['T', '0-0(2 OT)']. Is there a way to split that into 3 columns: 1) T, 2) 0-0 and 3) 2 OT?
All help much appreciated.
Thanks
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest

d = []
n = []
res = []
op = []
yr = []

with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        r = req.get(f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
                d.append(date.get_text(strip=True, separator=" "))
            for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
                n.append(name.get_text(strip=True))
            for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
                result = result.get_text(strip=True)
                # result = result.get_text(strip=True).split(',')
                res.append(result)
            if len(d) != len(res):
                res.append("None")
            for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
                op.append(opp.get_text(strip=True, separator=' '))
                yr.append(year)

data = []
for items in zip_longest(yr, d, n, op, res):
    data.append(items)

df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'opponent', 'Result']).to_excel('lehigh.xlsx', index=False)
I'm going to focus here only on splitting the res list into three columns, and you can incorporate it into your code as you see fit. So let's say you have this:
res1='T, 0-0(2 OT)'
res2='W,2-1OT'
res3='T,2-2Game called '
res4='W,2-0'
scores = [res1,res2,res3,res4]
We split them like this:
print("result","score","extra")
for score in scores:
n_str = score.split(',')
target = n_str[1].strip()
print(n_str[0].strip(),' ',target[:3],' ',target[3:])
Output:
result score extra
T 0-0 (2 OT)
W 2-1 OT
T 2-2 Game called
W 2-0
Note that this assumes no game ends with a double-digit score (say, 11-2, or whatever); so this should work for your typical soccer game, but will fail with basketball :D
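If you do need to handle double-digit scores (or just want to avoid slicing on fixed positions), a regex-based split is one option; this is only a sketch using the same sample strings as above:

import re

print("result", "score", "extra")
for score in scores:
    result, rest = [part.strip() for part in score.split(',', 1)]
    # Capture the numeric score, then keep whatever trails it (OT, "Game called", ...).
    match = re.match(r'(\d+-\d+)\s*(.*)', rest)
    if match:
        print(result, ' ', match.group(1), ' ', match.group(2))
    else:
        print(result, ' ', rest, ' ', '')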
I'm using BeautifulSoup to try to extract data from a web page, but for some reason it fails to iterate over items found in seasons greater than 1. There is seemingly no reason for this behavior, as the nodes look exactly the same to me.
def scrape_show(show):
    source = requests.get(show.url).text
    soup = BeautifulSoup(source, 'lxml')

    # All seasons and episodes
    area = soup.find('div', class_='play_video-area-aside play_video-area-aside--related-videos play_video-area-aside--related-videos--titlepage')

    for article in area:
        if "season" in article.get('id'):
            season = article.h2.a.find('span', class_='play_accordion__section-title-inner').text
            print(season + " -- " + article.get('id'))

            # All content for the given season
            ul = article.find('ul')
            if ul is None:
                print("null!")  # This should not happen
Example Output:
Season 1 -- section-season1-xxxx
Season 2 -- section-season2-xxxx
null!
https://www.svtplay.se/andra-aket (url from example)
The data is not available in HTML form for all seasons, only for season 1, but the information is embedded in the page in JSON form. You can parse this data with the re and json modules:
import re
import json
import requests

url = 'https://www.svtplay.se/andra-aket?tab=season-1-18927182'
data = json.loads(re.findall(r"root\['__svtplay_apollo'\] = (\{.*?\});", requests.get(url).text)[0])

from pprint import pprint
# pprint(data)  # <-- uncomment this to see all the data

for k in data:
    if k.startswith('Episode:') or (k.startswith('$Episode:') and k.endswith('urls')):
        print(k)
        pprint(data[k])
        print('-' * 80)
Prints (data about episodes 1 and 2 and their URLs):
Episode:1383301-001
{'__typename': 'Episode',
'accessibilities': {'json': ['AudioDescribed', 'SignInterpreted'],
'type': 'json'},
'duration': 1700,
'id': '1383301-001',
'image': {'generated': False,
'id': 'Image:18926434',
'type': 'id',
'typename': 'Image'},
'live': None,
'longDescription': 'Madde och Petter flyttar tillsammans med sin 13-åriga '
'dotter Ida till Björkfjället, en liten skidort i svenska '
'fjällen. Madde är uppvuxen där men för '
'Stockholms-hipstern Petter är det ett chockartat '
'miljöombyte. Maddes mamma Ingegerd har gått i pension och '
'lämnat över ansvaret för familjens lilla hotell till '
'Madde. Hon och Petter ska nu driva "Gammelgården" med '
'Maddes bror Tommy, vilket visar sig vara en inte helt '
'lätt uppgift. I rollerna: Sanna Sundqvist, Jakob '
'Setterberg, William Spetz, Bert-Åke Varg, Mattias '
'Fransson och Lena T Hansson. Del 1 av 8.',
'name': 'Avsnitt 1',
'nameRaw': '',
'positionInSeason': 'Säsong 1 — Avsnitt 1',
'restrictions': {'generated': True,
'id': '$Episode:1383301-001.restrictions',
'type': 'id',
'typename': 'Restrictions'},
'slug': 'avsnitt-1',
'svtId': 'jBD1gw8',
'urls': {'generated': True,
'id': '$Episode:1383301-001.urls',
'type': 'id',
'typename': 'Urls'},
'validFrom': '2019-07-25T02:00:00+02:00',
'validFromFormatted': 'Tor 25 jul 02:00',
'validTo': '2020-01-21T23:59:00+01:00',
'variants': [{'generated': False,
'id': 'Variant:1383301-001A',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001S',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001T',
'type': 'id',
'typename': 'Variant'}],
'videoSvtId': '8PbQdAj'}
--------------------------------------------------------------------------------
$Episode:1383301-001.urls
{'__typename': 'Urls',
'svtplay': '/video/19970142/andra-aket/andra-aket-sasong-1-avsnitt-1'}
--------------------------------------------------------------------------------
... and so on.
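The svtplay values are relative paths, so if you need absolute links you could join them with the site root. A small sketch reusing the data dict from above (the base URL is an assumption on my part):

from urllib.parse import urljoin

base = 'https://www.svtplay.se'
for k, v in data.items():
    if k.startswith('$Episode:') and k.endswith('urls'):
        # v['svtplay'] holds the relative path, e.g. '/video/19970142/...'
        print(urljoin(base, v['svtplay']))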
I'm working on web scraping with Beautiful Soup to retrieve jobs from Indeed. My code is working, but when it loops to the next page it overwrites the existing CSV file. I see from other posts that I would need to use pandas concat, but I can't seem to get it to work or figure out where to implement it in my source code. Any suggestions to improve my code would also be greatly appreciated.
The code below scrapes pages 1-2 on Indeed.
from bs4 import BeautifulSoup
import requests, pandas as pd
from urllib.parse import urljoin

print('Getting new jobs...')

main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='

for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    home = 'https://www.indeed.com/viewjob?'
    jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []
    target = soup.find_all('div', class_=' row result')

    for div in target:
        if div:
            title = div.find('a', class_='turnstileLink').text.strip()
            jobsTitle.append(title)
            company = div.find('span', class_='company').text.strip()
            companiesName.append(company)
            city = div.find('span', class_='location').text.strip()
            citiesName.append(city)
            summary = div.find('span', class_='summary').text.strip()
            jobsSummary.append(summary)
            job_link = urljoin(home, div.find('a').get('href'))
            jobsLink.append(job_link)

    target2 = soup.find_all('div', class_='lastRow row result')
    for i in target2:
        title2 = i.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title2)
        company2 = i.find('span', class_='company').text.strip()
        companiesName.append(company2)
        city2 = i.find('span', class_='location').text.strip()
        citiesName.append(city2)
        summary2 = i.find('span', class_='summary').text.strip()
        jobsSummary.append(summary2)
        jobLink2 = urljoin(home, i.find('a').get('href'))
        jobsLink.append(jobLink2)

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])

df
You can create the data_record list outside the loop and call the DataFrame constructor once:
data_record = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')
    ...
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
Possible solution with concat:
dfs = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')
    ...
    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})
    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
    dfs.append(df)

df_fin = pd.concat(dfs, ignore_index=True)
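Either way, write the combined result to disk a single time after the loop rather than inside it, so each page no longer overwrites the file (the output filename is just an example):

# One write at the end, instead of one write per page.
df_fin.to_csv('indeed_jobs.csv', index=False)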
The following code does what I need it to do, except when it comes across a product that is missing a class name, let's say product-price. What I need help with is how to skip over that particular item and move on to the next one. Currently I get the following error:
"selenium.common.exceptions.NoSuchElementException: Message: {"errorMessage":"Unable to find element with class name 'product-display-price'","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"138","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:64186","User-Agent":"Python-urllib/3.5"},"httpVersion":"1.1","method":"POST","post":"{\"id\": \":wdc:1473855489054\", \"value\": \"product-display-price\", \"using\": \"class name\", \"sessionId\": \"48018400-7a75-11e6-b0ab-5f6a864b5c88\"}","url":"/element","urlParsed":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/48018400-7a75-11e6-b0ab-5f6a864b5c88/element/:wdc:1473855489054/element"}}"
import csv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

b = open('csv/homedepotfridges.csv', 'w', newline='')
a = csv.writer(b, delimiter=',')

driver = webdriver.PhantomJS()
driver.get('https://www.homedepot.ca/en/home/categories/appliances/dishwashers.html#!p=0&q=*%3Aprice-asc%3AcategoryPathHierarchy%3A2%2Fhd-classes%2Fl1-appliances%2Fl2-dishwashers')

items = []
for item in driver.find_elements_by_class_name('item'):
    try:
        model = item.find_element_by_class_name('product-model')
        price = item.find_element_by_class_name('product-display-price')
        title = item.find_element_by_class_name('product-title')
        url = item.find_element_by_class_name('js-detail-link')

        items.append({'model': model, 'price': price, 'title': title, 'url': url})
        print(model.text, price.text, title.text, url.get_attribute("href"))
        c = (model.text, price.text, title.text, url.get_attribute("href"))
        a.writerow(c)
    except NoSuchElementException:
        model = 'n/a'
        price = 'N/A'
        title = 'N/A'
        url = 'N/A'
        items.append({'model': model, 'price': price, 'title': title, 'url': url})
        print(model.text, price.text, title.text, url.get_attribute("href").text)
        c = (model.text, price.text, title.text, url.get_attribute("href"))
        a.writerow(c)

b.close()
Traceback (most recent call last):
File "/Users/User/PycharmProjects/Test/HomeDepotDishwashers.py", line 31, in
print(model.text, price.text, title.text, url.get_attribute.text("href"))
AttributeError: 'str' object has no attribute 'text'
Any help is appreciated
There are several ways to do that. Have you tried any?
1) Surround the block with a try/except (https://docs.python.org/3/tutorial/errors.html):
from selenium.common.exceptions import NoSuchElementException

for item in driver.find_elements_by_class_name('item'):
    try:
        model = item.find_element_by_class_name('product-model')
        price = item.find_element_by_class_name('product-display-price')
        title = item.find_element_by_class_name('product-title')
        url = item.find_element_by_class_name('js-detail-link')

        items.append({'model': model, 'price': price, 'title': title, 'url': url})
        print(model.text, price.text, title.text, url.get_attribute("href"))
        c = (model.text, price.text, title.text, url.get_attribute("href"))
        a.writerow(c)
    except NoSuchElementException:
        # Here you can do whatever you want when an element is not found,
        # then carry on with the next item.
        continue

b.close()
2) Use an if statement to check whether the element is found. You do this by asking for a list of those elements (find_elements_by_..., plural) and checking whether the list is non-empty, e.g. the sketch below.
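A minimal sketch of that approach, assuming the same item loop and class name as in your code:

for item in driver.find_elements_by_class_name('item'):
    # find_elements_* (plural) returns an empty list instead of raising NoSuchElementException.
    prices = item.find_elements_by_class_name('product-display-price')
    if len(prices) > 0:
        price = prices[0]
        print(price.text)
    else:
        # Missing price: skip this product and move on to the next one.
        continue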