I'm using BeautifulSoup to try to extract data from a web page. But for some reason it fails to iterate over items found in seasons greater than 1. There is seemingly no reason for this behavior, as the nodes look exactly the same to me.
def scrape_show(show):
    source = requests.get(show.url).text
    soup = BeautifulSoup(source, 'lxml')

    # All seasons and episodes
    area = soup.find('div', class_='play_video-area-aside play_video-area-aside--related-videos play_video-area-aside--related-videos--titlepage')

    for article in area:
        if "season" in article.get('id'):
            season = article.h2.a.find('span', class_='play_accordion__section-title-inner').text
            print(season + " -- " + article.get('id'))

            # All content for the given season
            ul = article.find('ul')
            if ul is None:
                print("null!")  # This should not happen
Example Output:
Season 1 -- section-season1-xxxx
Season 2 -- section-season2-xxxx
null!
https://www.svtplay.se/andra-aket (URL from the example)
The data is only available in HTML form for season 1, not for the other seasons. However, the full information is embedded in the page as JSON, which you can extract with the re and json modules:
import re
import json
import requests
from pprint import pprint

url = 'https://www.svtplay.se/andra-aket?tab=season-1-18927182'

# Pull the embedded Apollo state object out of the page source and parse it as JSON.
data = json.loads(re.findall(r"root\['__svtplay_apollo'\] = (\{.*?\});", requests.get(url).text)[0])

# pprint(data)  # <-- uncomment this to see all the data

for k in data:
    if k.startswith('Episode:') or (k.startswith('$Episode:') and k.endswith('urls')):
        print(k)
        pprint(data[k])
        print('-' * 80)
Prints (data about episodes 1 and 2 and their URLs):
Episode:1383301-001
{'__typename': 'Episode',
'accessibilities': {'json': ['AudioDescribed', 'SignInterpreted'],
'type': 'json'},
'duration': 1700,
'id': '1383301-001',
'image': {'generated': False,
'id': 'Image:18926434',
'type': 'id',
'typename': 'Image'},
'live': None,
'longDescription': 'Madde och Petter flyttar tillsammans med sin 13-åriga '
'dotter Ida till Björkfjället, en liten skidort i svenska '
'fjällen. Madde är uppvuxen där men för '
'Stockholms-hipstern Petter är det ett chockartat '
'miljöombyte. Maddes mamma Ingegerd har gått i pension och '
'lämnat över ansvaret för familjens lilla hotell till '
'Madde. Hon och Petter ska nu driva "Gammelgården" med '
'Maddes bror Tommy, vilket visar sig vara en inte helt '
'lätt uppgift. I rollerna: Sanna Sundqvist, Jakob '
'Setterberg, William Spetz, Bert-Åke Varg, Mattias '
'Fransson och Lena T Hansson. Del 1 av 8.',
'name': 'Avsnitt 1',
'nameRaw': '',
'positionInSeason': 'Säsong 1 — Avsnitt 1',
'restrictions': {'generated': True,
'id': '$Episode:1383301-001.restrictions',
'type': 'id',
'typename': 'Restrictions'},
'slug': 'avsnitt-1',
'svtId': 'jBD1gw8',
'urls': {'generated': True,
'id': '$Episode:1383301-001.urls',
'type': 'id',
'typename': 'Urls'},
'validFrom': '2019-07-25T02:00:00+02:00',
'validFromFormatted': 'Tor 25 jul 02:00',
'validTo': '2020-01-21T23:59:00+01:00',
'variants': [{'generated': False,
'id': 'Variant:1383301-001A',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001S',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001T',
'type': 'id',
'typename': 'Variant'}],
'videoSvtId': '8PbQdAj'}
--------------------------------------------------------------------------------
$Episode:1383301-001.urls
{'__typename': 'Urls',
'svtplay': '/video/19970142/andra-aket/andra-aket-sasong-1-avsnitt-1'}
--------------------------------------------------------------------------------
... and so on.
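If you only need the playback links, a small follow-up sketch (assuming every '$Episode:...urls' entry in the parsed data looks like the one printed above, with a 'svtplay' path) could collect them from the same data dict:

# Sketch: gather the relative svtplay paths from the Apollo blob parsed above.
episode_paths = [
    v['svtplay']
    for k, v in data.items()
    if k.startswith('$Episode:') and k.endswith('.urls') and 'svtplay' in v
]
# The paths shown above are relative, so prefix the site root to get full URLs.
full_urls = ['https://www.svtplay.se' + path for path in episode_paths]
print(full_urls)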
I need to monitor the news daily for a list of keywords, and I use GNews for this. After a certain number of requests, Google blocks my access to the news.
I want to try using the Tor network for the scraping, but I can't get GNews to use it. Maybe there are some ready-made solutions? Or can you tell me what changes need to be made to the GNews code? Or maybe there are other options?
I am not a programmer; I am just trying to automate some of my actions.
My code:
import datetime
import psycopg2
from gnews import GNews

c = 0
format = '%a, %d %b %Y %H:%M:%S GMT'
countries_gnews = ['US', 'AU', 'BW', 'CA', 'ET', 'GH', 'IN', 'ID', 'IE', 'IL', 'KE', 'LV', 'MY', 'NA', 'NZ', 'NG', 'PK', 'PH', 'SG', 'ZA', 'TZ', 'UG', 'GB', 'ZW', 'CZ', 'DE', 'AT', 'CH', 'AR', 'CL', 'CO', 'CU', 'MX', 'PE', 'VE', 'BE', 'FR', 'MA', 'SN', 'IT', 'LT', 'HU', 'NL', 'NO', 'PL', 'BR', 'PT', 'RO', 'SK', 'SI', 'SE', 'VN', 'TR', 'GR', 'BG', 'RU', 'UA', 'RS', 'AE', 'SA', 'LB', 'EG', 'BD', 'TH', 'CN', 'TW', 'HK', 'JP', 'KR']
keywords_gnews = [
    'keyword1',
    'keyword2',
    'keyword200'
]
date_start = datetime.date(2022, 10, 1)
date_end = datetime.date(2022, 10, 2)

for country_gnews in countries_gnews:
    print(country_gnews)
    google_news = GNews(language='en', country=country_gnews, start_date=date_start, end_date=date_end)
    print(google_news)
    for keyword_gnews in keywords_gnews:
        json_resps = google_news.get_news(keyword_gnews)
        i = 0
        try:
            connection = psycopg2.connect(user="******",
                                          password="********",
                                          host="127.0.0.1",
                                          port="*****",
                                          database="*******")
            cursor = connection.cursor()
            # Print PostgreSQL connection properties
            print(connection.get_dsn_parameters(), "\n")
            # Print PostgreSQL version
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            print("You are connected to - ", record, "\n")
        except (Exception, psycopg2.Error) as error:
            print("Error while connecting to PostgreSQL", error)
        # print(keyword_gnews, country_gnews)
        for json_resp in json_resps:
            i = i + 1
            title = json_resp['title']
            description = json_resp['description']
            url = json_resp['url']
            published_date = json_resp['published date']
            date_publ = datetime.datetime.strptime(published_date, format)
            publisher = str(json_resp['publisher'])
            source = 'gnews'
            sql = "insert into ***** (title, description, url, date_publ, publisher, country_gnews, keyword_gnews, position, source) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, (title, description, url, date_publ, publisher, country_gnews, keyword_gnews, i, source))
            print('********************')
        connection.commit()
I installed Tor and tried to make changes to gnews.py, but my knowledge was not enough to get the program to send requests through the Tor network. I'm on Windows.
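For context, the general pattern for sending Python requests through Tor is to point the requests library at Tor's local SOCKS proxy. The sketch below shows only that generic pattern; it is not a tested gnews patch, and gnews itself would still need to be modified (or configured, if your version exposes a proxy option) so that its internal requests calls use these proxies. It assumes the Tor service is running with its default SOCKS port 9050 and that the SOCKS extra is installed (pip install requests[socks]).

# Hedged sketch: route a plain requests call through Tor's local SOCKS5 proxy.
import requests

TOR_PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',   # socks5h also resolves DNS through Tor
    'https': 'socks5h://127.0.0.1:9050',
}

resp = requests.get('https://check.torproject.org/api/ip', proxies=TOR_PROXIES, timeout=30)
print(resp.json())  # should report "IsTor": true when the request went through Tor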
This is a challenging one for me. I have tried for hours as I am still learning, but I am not sure if my logic is correct at all.
Define a function called stars that takes in two dictionaries:

movies: a dictionary where the keys are movie titles and the values are lists of major performers in the movie. For example:
movies["The Dark Knight"] = ["Christian Bale", "Heath Ledger", "Maggie Gyllenhall", "Aaron Eckhart"]

tvshows: a dictionary where the keys are TV show titles and the values are lists of major performers in the show. For example:
tvshows["Community"] = ["Joel McHale", "Alison Brie", "Danny Pudi", "Donald Glover", "Yvette Brown"]

The function stars should return a new dictionary. The keys of the new dictionary should be the performers' names, and the values for each key should be the list of shows and movies in which that performer has appeared. Sort the shows and movies alphabetically.
If your function works correctly, this will originally print (although the order of the keys may vary):
{'Portia de Rossi': ['Arrested Development'], 'Will Ferrell': ['The Lego Movie'], 'Yvette Brown': ['Community'], 'Rebel Wilson': ['How to Be Single'], 'Danny Pudi': ['Community'], 'Elizabeth Banks': ['30 Rock', 'The Lego Movie'], 'Alec Baldwin': ['30 Rock'], 'Alison Brie': ['Community', 'How to Be Single', 'The Lego Movie'], 'Tina Fey': ['30 Rock'], 'Dakota Johnson': ['How to Be Single'], 'Joel McHale': ['Community'], 'Jack McBrayer': ['30 Rock'], 'Tracy Morgan': ['30 Rock'], 'Donald Glover': ['Community'], 'Will Arnett': ['Arrested Development', 'The Lego Movie'], 'Jason Bateman': ['Arrested Development']}
movies = {"How to Be Single": ["Alison Brie", "Dakota Johnson",
"Rebel Wilson"],
"The Lego Movie": ["Will Arnett", "Elizabeth Banks",
"Alison Brie", "Will Ferrell"]}
tvshows = {"Community": ["Alison Brie", "Joel McHale",
"Danny Pudi", "Yvette Brown",
"Donald Glover"],
"30 Rock": ["Tina Fey", "Tracy Morgan", "Jack McBrayer",
"Alec Baldwin", "Elizabeth Banks"],
"Arrested Development": ["Jason Bateman", "Will Arnett",
"Portia de Rossi"]}
print(stars(movies, tvshows))
def stars(movies, tv_shows):
    # print(movies)
    # print(tv_shows)
    dictionary_to_return = {}
    both_dict = {**movies, **tv_shows}
    # print(both_dict)
    celebrity_list = []
    # print(celebrity_list)
    for (key, value) in both_dict.items():
        celebrity_list.extend(value)
    celebrity_list_filtered = list(set(celebrity_list))
    celebrity_list_filtered.sort()
    # print(celebrity_list_filtered)
    for every_celebrity in celebrity_list_filtered:
        # print(every_celebrity, ": THis is artist")
        for every_title in both_dict.keys():
            # print(every_title, ": THis is the tile")
            artist_in_title = both_dict[every_title]
            # print(artist_in_title, ":these are artist in the", every_title)
            if every_celebrity in artist_in_title:
                if every_title not in dictionary_to_return.keys():
                    celebrity_list.append(every_title)
                else:
                    valuess = dictionary_to_return[every_celebrity].value
                    print(valuess)
    print(celebrity_list)
    print(dictionary_to_return)
    # print(type(artist_in_title))
    # (every_celebrity,)

    # ret_list = []
    # individual_tilte_list = []
    # ret_dict = {}
    # for every_celebrity in celebrity_list_filtered:
    #     # print(every_celebrity)
    #     for (each_title, value) in both_dict.items():
    #         if every_celebrity in both_dict[each_title]:
    #             ret_list.append(every_celebrity)
    #             ret_list.append(key)
    # print(ret_list)
    # for i in range(0, len(ret_list)):
    #     if ret_list[i] in celebrity_list_filtered:
    #         if ret_list[i] in individual_tilte_list:
    #             individual_tilte_list.append(ret_list[0+i])
    #     elif ret_list[i] not in celebrity_list_filtered:
    #         individual_tilte_list.append(ret_list[i])
    # print(individual_tilte_list)
Above is the code I worked on, which I feel is total trash. Some feedback on how to approach this problem would be appreciated.
I believe you need:
def stars(movies, tv_shows):
    dictionary_to_return = {}
    for k, v in {**movies, **tv_shows}.items():
        for actor in v:
            dictionary_to_return.setdefault(actor, []).append(k)
    # dictionary_to_return = {k: sorted(v) for k, v in dictionary_to_return.items()}  # Sort by show/movie
    return dictionary_to_return

print(stars(movies, tvshows))
Output:
{'Alec Baldwin': ['30 Rock'],
'Alison Brie': ['How to Be Single', 'The Lego Movie', 'Community'],
'Dakota Johnson': ['How to Be Single'],
'Danny Pudi': ['Community'],
'Donald Glover': ['Community'],
'Elizabeth Banks': ['The Lego Movie', '30 Rock'],
'Jack McBrayer': ['30 Rock'],
'Jason Bateman': ['Arrested Development'],
'Joel McHale': ['Community'],
'Portia de Rossi': ['Arrested Development'],
'Rebel Wilson': ['How to Be Single'],
'Tina Fey': ['30 Rock'],
'Tracy Morgan': ['30 Rock'],
'Will Arnett': ['The Lego Movie', 'Arrested Development'],
'Will Ferrell': ['The Lego Movie'],
'Yvette Brown': ['Community']}
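An equivalent sketch using collections.defaultdict, with the alphabetical sort the assignment asks for applied at the end:

from collections import defaultdict

def stars(movies, tv_shows):
    # Map each performer to the titles they appear in.
    credits = defaultdict(list)
    for title, cast in {**movies, **tv_shows}.items():
        for performer in cast:
            credits[performer].append(title)
    # The assignment asks for each performer's titles sorted alphabetically.
    return {performer: sorted(titles) for performer, titles in credits.items()}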
I'm trying to scrape the title of an item on a foreign version of a site.
After I run the Python script, the CLI launches but returns nothing at all.
In IPython, title = soup.find('a', {'class': 'vip'}).text works great on its own to get the title, but within the full code in PyCharm it doesn't, even though I went into my settings to install the BeautifulSoup package for my current interpreter.
Any idea why? Thanks.
#!/usr/bin/python3
import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_detail_data(soup):
    # title
    try:
        title = soup.find('a', {'class': 'vip'}).text
    except:
        title = ''
    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''
    # currency
    try:
        currency = soup.find_all('span', {'class': 'bold'}).text
    except:
        currency = ''
    # items sold
    try:
        i_s = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = i_s.strip().split(' ')[0]
    except:
        items_sold = ''
    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)

def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)

if __name__ == '__main__':
    main()
It seems that the .fr site uses different markup, so you need to change the class names/attributes accordingly.
For example:
import re
import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_detail_data(soup):
    # title
    try:
        title = soup.select_one('h1[itemprop="name"]')
        for span in title.select('span'):
            span.extract()
        title = title.get_text(strip=True)
    except:
        title = ''
    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''
    # currency
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except:
        currency = ''
    # items sold
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except:
        items_sold = ''
    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data

def get_index_data(soup):
    links = soup.select('.sresult h3 a')
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)

def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # write_csv(data, link)  # <-- I commented it out, to just print to screen

if __name__ == '__main__':
    main()
Prints:
Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}
...and so on.
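One more thing to check in both versions: write_csv writes the header row before every data row, because it runs once per link. A small sketch of a variant that writes the header only once (when output.csv is new or still empty) might look like this:

import csv
import os

def write_csv(data, url, path='output.csv'):
    # Only write the header when the file does not exist yet or is empty.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow([data['title'], data['price'], data['currency'], data['total sold'], url])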
I'm working on web scraping with Beautiful Soup to retrieve jobs from Indeed. My code works, but when it loops to the next page it overwrites the existing CSV file. I see from other posts that I would need to use pandas concat, but I can't seem to get it to work or figure out where to implement it in my source code. Any suggestions to improve my code would also be greatly appreciated.
The code below scrapes pages 1-2 on Indeed.
from bs4 import BeautifulSoup
import requests, pandas as pd
from urllib.parse import urljoin

print('Getting new jobs...')
main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='

for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    home = 'https://www.indeed.com/viewjob?'
    jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []

    target = soup.find_all('div', class_=' row result')
    for div in target:
        if div:
            title = div.find('a', class_='turnstileLink').text.strip()
            jobsTitle.append(title)
            company = div.find('span', class_='company').text.strip()
            companiesName.append(company)
            city = div.find('span', class_='location').text.strip()
            citiesName.append(city)
            summary = div.find('span', class_='summary').text.strip()
            jobsSummary.append(summary)
            job_link = urljoin(home, div.find('a').get('href'))
            jobsLink.append(job_link)

    target2 = soup.find_all('div', class_='lastRow row result')
    for i in target2:
        title2 = i.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title2)
        company2 = i.find('span', class_='company').text.strip()
        companiesName.append(company2)
        city2 = i.find('span', class_='location').text.strip()
        citiesName.append(city2)
        summary2 = i.find('span', class_='summary').text.strip()
        jobsSummary.append(summary2)
        jobLink2 = urljoin(home, i.find('a').get('href'))
        jobsLink.append(jobLink2)

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])

df
You can create the list data_record outside the loop and call the DataFrame constructor once at the end:
data_record = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    ...

    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
Possible solution with concat:
dfs = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    ...

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
    dfs.append(df)

df_fin = pd.concat(dfs, ignore_index=True)
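Either way, you only need to write the result once, after the loop. If you also want to append to the same CSV across separate runs, to_csv can append without repeating the header; a short sketch (the jobs.csv name here is just an example):

import os

out_path = 'jobs.csv'  # example output file name
# Append to the CSV, writing the header only if the file does not exist yet.
df_fin.to_csv(out_path, mode='a', index=False, header=not os.path.exists(out_path))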
I want to learn how to scrape a page using BeautifulSoup and write the results to a CSV file. When I start appending values to the keys in a dictionary, all the values get appended to every key, not just a single one.
I get the information I want:
[<td class="column-2">655</td>]
[<td class="column-2">660</td>]
[<td class="column-2">54</td>]
[<td class="column-2">241</td>]
Afterwards, when I try to assign each value to its key, I get:
{'date': ['14th November 2016'], 'total complaints': ['655', '660', '54', '241'], 'complaints': ['655', '660', '54', '241'], 'departures': ['655', '660', '54', '241'], 'arrivals': ['655', '660', '54', '241']}
Full code (csv writer is just for testing now):
import requests
from bs4 import BeautifulSoup as BS
import csv

operational_data_url = "http://heathrowoperationaldata.com/daily-operational-data/"
operational_data_page = requests.get(operational_data_url).text
print(operational_data_page)
soup = BS(operational_data_page, "html.parser")

data_div = soup.find_all("ul", class_="sub-menu")
list_items = data_div[0].find_all("li")
data_links = []
for menu in data_div:
    list_items = menu.find_all("li")
    for links in list_items:
        data_link = links.find("a")
        data_links.append(data_link.get("href"))

for page in data_links[:1]:
    data_page = requests.get(page).text
    soup = BS(data_page, "html.parser")
    date = soup.find("title")
    table = soup.find("tbody")

    data = {
        "date": [],
        "arrivals": [],
        "departures": [],
        "complaints": [],
        "total complaints": [],
    }

    for day in date:
        data["date"].append(day)

    rows = table.find_all("tr", class_=["row-3", "row-4", "row-36", "row-37"])
    for row in rows:
        cols = row.find_all("td", class_="column-2")
        data["arrivals"].append(cols[0].get_text())
        data["departures"].append(cols[0].get_text())
        data["complaints"].append(cols[0].get_text())
        data["total complaints"].append(cols[0].get_text())

# test
with open('test.csv', 'w') as test_file:
    fields = ['date', 'arrivals', 'departures', 'complaints', 'total complaints']
    writer = csv.DictWriter(test_file, fields)
    writer.writeheader()
    row = {'date': day, 'arrivals': 655, 'departures': 660, 'complaints': 54, 'total complaints': 241}
    writer.writerow(row)
Thanks for any help!
"When I start appending columns to the key in a dictionary all the values are appended to each key not just a single one."
Currently, your for row in rows: loop does this explicitly.
It appears to me that you want to be doing something like this instead:
rows = table.find_all("tr", class_=["row-3", "row-4", "row-36", "row-37"])
cols = [row.find_all("td", class_="column-2")[0] for row in rows]
data["arrivals"].append(cols[0].get_text())
data["departures"].append(cols[1].get_text())
data["complaints"].append(cols[2].get_text())
data["total complaints"].append(cols[3].get_text())
This will give you the following result for data:
{'date': [u'14th November 2016'], 'complaints': [u'54'], 'total complaints': [u'241'], 'departures': [u'660'], 'arrivals': [u'655']}
Note that this will only work if your rows are in the right order.
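If you then want to write the collected data dict to the CSV instead of the hard-coded test row, a sketch along these lines should work, assuming each key in data holds a list of equal length, as in the result above:

import csv

fields = ['date', 'arrivals', 'departures', 'complaints', 'total complaints']
with open('test.csv', 'w', newline='') as test_file:
    writer = csv.DictWriter(test_file, fields)
    writer.writeheader()
    # Zip the per-column lists together so each iteration yields one CSV row.
    for values in zip(*(data[field] for field in fields)):
        writer.writerow(dict(zip(fields, values)))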