How do I combine URLs for my BeautifulSoup project? - python-3.x

This is the code that I have so far:
from grp import struct_group
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse

name = []
price = []
mileage = []
dealer_name = []
source = []

for i in range(1, 13):
    # Allow to crawl multiple pages:
    website = 'https://www.cars.com/shopping/results/?page=' + str(i) + '&page_size=20&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=all&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=95355'
    # Requesting using requests lib
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Finding results
    results = soup.find_all('div', {'class': 'vehicle-card'})
    url_combine = []
    root_url = 'http://www.cars.com'
    url_combine = root_url, source
    for result in results:
        # name
        try:
            name.append(result.find('h2').get_text())
        except:
            name.append('n/a')
        # price
        try:
            price.append(result.find('span', {'class': 'primary-price'}).get_text())
        except:
            price.append('n/a')
        # mileage
        try:
            mileage.append(result.find('div', {'class': 'mileage'}).get_text())
        except:
            mileage.append('n/a')
        # dealer_name
        try:
            dealer_name.append(result.find('div', {'class': 'dealer-name'}).get_text().strip())
        except:
            dealer_name.append('n/a')
        # link
        try:
            source.append(result.find('a', {'class': 'vehicle-card-visited-tracking-link'}).get('href'))
        except:
            source.append('n/a')
    for link in source:
        url_combine.append(urllib.parse.urljoin(root_url, link))

# Using Pandas to create a dictionary and import to Excel
car_listings = pd.DataFrame({'Name': name, 'Mileage': mileage, 'Price': price, 'Dealer Name': dealer_name, 'Link': source})
car_listings.to_excel('car_listings_page4.xlsx')
However, I keep running into a problem where it says AttributeError: 'tuple' object has no attribute 'append'.
I know I need to make everything a list instead of a tuple, but I can't seem to find where my mistake is. I believe this is one way to get the full URL from the href. If so, is there any other way I can implement it in my code?

Avoid all these lists and use dicts instead; simplify and store the info in a more structured way. You could also use an if-statement instead of the bare try/except blocks.
There are various ways to perform string concatenation:
+ operator
join() method
% operator
format() function
f-string literal string interpolation (see the assignment to website in the example)
However, the simplest one is using the + operator:
root_url + link
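Since the scraped href values here start with a slash, plain + concatenation is enough; if you would rather keep urllib.parse.urljoin from your original code, both give the same result. A minimal sketch (the example href is made up):
import urllib.parse

root_url = 'http://www.cars.com'
link = '/vehicledetail/example-id/'  # hypothetical href taken from a vehicle card

# simple concatenation works because the href starts with '/'
print(root_url + link)

# urljoin handles the same case and also hrefs that are already absolute URLs
print(urllib.parse.urljoin(root_url, link))
Both print http://www.cars.com/vehicledetail/example-id/.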
Concerning my example using dicts, it would look like:
'link': root_url+result.find('a', {'class':'vehicle-card-visited-tracking-link'}).get('href') if result.find('a', {'class':'vehicle-card-visited-tracking-link'}) else None
or a bit shorter with the walrus operator (Python 3.8 and later):
'link': root_url+a.get('href') if (a:=result.find('a', {'class':'vehicle-card-visited-tracking-link'})) else None
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

root_url = 'http://www.cars.com'
data = []

for i in range(1, 2):
    website = f'https://www.cars.com/shopping/results/?page={i}&page_size=20&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=all&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=95355'
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 'vehicle-card'})
    for result in results:
        data.append({
            'name': result.find('h2').get_text() if result.find('h2') else None,
            'price': result.find('span', {'class': 'primary-price'}).get_text() if result.find('span', {'class': 'primary-price'}) else None,
            'link': root_url + a.get('href') if (a := result.find('a', {'class': 'vehicle-card-visited-tracking-link'})) else None
            ### all the other info
        })

pd.DataFrame(data)
Output
  name                           price     link
0 2017 Lexus IS 200t Base        $28,900   http://www.cars.com/vehicledetail/6942c51b-c26c-4614-97f1-acb0b7517b82/
1 2021 Lincoln Corsair Reserve   $43,797   http://www.cars.com/vehicledetail/e575219a-90fa-4a95-ade5-d2740e746cd0/
2 2021 Hyundai IONIQ Hybrid SE   $26,997   http://www.cars.com/vehicledetail/716b65ec-3abd-42e4-b19b-9024d2ad58f1/
3 2021 GMC Yukon XL Denali       $74,888   http://www.cars.com/vehicledetail/475045f6-142a-440f-80e7-2c3ae289fee2/
4 2007 Chevrolet Silverado 1500  $12,688   http://www.cars.com/vehicledetail/56080319-0bb9-49e0-8758-24f58d0d5d76/
...
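If you also want the Excel export from your original script, the resulting DataFrame can be written out the same way. A small sketch, reusing the file name from your code:
import pandas as pd

# `data` is the list of dicts built in the example above
df = pd.DataFrame(data)
df.to_excel('car_listings_page4.xlsx', index=False)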

Related

How would I loop through each page and scrape the specific parameters?

#Import Needed Libraries
import requests
from bs4 import BeautifulSoup
import pprint

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titlelink')
subtext = soup.select('.subtext')

def sort_stories_by_votes(hnlist):  #Sorting your create_custom_hn dict by votes
    return sorted(hnlist, key=lambda k: k['votes'], reverse=True)

def create_custom_hn(links, subtext):  #Creates a list of links and subtext
    hn = []
    for idx, item in enumerate(links):  #Need to use this because not every link has a lot of votes
        title = links[idx].getText()
        href = links[idx].get('href', None)
        vote = subtext[idx].select('.score')
        if len(vote):
            points = int(vote[0].getText().replace(' points', ''))
            if points > 99:  #Only appends stories that are over 100 points
                hn.append({'title': title, 'link': href, 'votes': points})
    return sort_stories_by_votes(hn)

pprint.pprint(create_custom_hn(links, subtext))
My question is that this only covers the first page, which has just 30 stories. How would I apply my web scraping method to each subsequent page, say the next 10 pages, while keeping the code formatted as above?
The URL for each page looks like this:
https://news.ycombinator.com/news?p=<page_number>
Use a for-loop to scrape the content from each page; see the code below. It prints the contents of the first two pages. You can change the page_no range depending on your need.
import requests
from bs4 import BeautifulSoup
import pprint

def sort_stories_by_votes(hnlist):  #Sorting your create_custom_hn dict by votes
    return sorted(hnlist, key=lambda k: k['votes'], reverse=True)

def create_custom_hn(links, subtext, page_no):  #Creates a list of links and subtext
    hn = []
    for idx, item in enumerate(links):  #Need to use this because not every link has a lot of votes
        title = links[idx].getText()
        href = links[idx].get('href', None)
        vote = subtext[idx].select('.score')
        if len(vote):
            points = int(vote[0].getText().replace(' points', ''))
            if points > 99:  #Only appends stories that are over 100 points
                hn.append({'title': title, 'link': href, 'votes': points})
    return sort_stories_by_votes(hn)

for page_no in range(1, 3):
    print(f'Page: {page_no}')
    url = f'https://news.ycombinator.com/news?p={page_no}'
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    links = soup.select('.titlelink')
    subtext = soup.select('.subtext')
    pprint.pprint(create_custom_hn(links, subtext, page_no))
Page: 1
[{'link': 'https://www.thisworddoesnotexist.com/',
'title': 'This word does not exist',
'votes': 904},
{'link': 'https://www.sparkfun.com/news/3970',
'title': 'A patent troll backs off',
'votes': 662},
.
.
Page: 2
[{'link': 'https://www.vice.com/en/article/m7vqkv/how-fbi-gets-phone-data-att-tmobile-verizon',
'title': "The FBI's internal guide for getting data from AT&T, T-Mobile, "
'Verizon',
'votes': 802},
{'link': 'https://www.dailymail.co.uk/news/article-10063665/Government-orders-Google-track-searching-certain-names-addresses-phone-numbers.html',
'title': 'Feds order Google to track people searching certain names or '
'details',
'votes': 733},
.
.

Word search with BeautifulSoup

I'm trying to scrape the news website "https://inshorts.com/en/read/national", fetching articles with the heads Headline and news. I need all the articles on the pages which contain a specific word (e.g., "health"), and I want to add "date" to the heads.
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

# code for scraping the first page
d = {'headlines': [], 'news': [], 'date': []}
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script", {"type": "text/javascript"})[2].text
min_news_id = min_news_id[25:35]
soup = soup.findAll("div", {"class": "news-card z-depth-1"})

# to search specific word in the content
soup = soup.find_all(text=re.compile("Health"))

for data in soup:
    d['headlines'].append(data.find(itemprop="headline").getText())
    d['news'].append(data.find(itemprop="articleBody").getText())
    d['date'].append(data.find(itemprop="date").getText())

# code for scraping more pages
for i in tqdm(range(10)):
    # It uses JavaScript to load more data from
    # https://inshorts.com/en/ajax/more_news using POST requests
    # with parameter 'news_offset' which informs the server what page
    # it has to send to the client.
    # We can make POST requests with this parameter to get new
    # data in JSON format.
    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news", data=params)
        # In the JSON you have HTML in json_data['html'] and
        # json_data['min_news_id'] for the next page
        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        soup = soup.findAll("div", {"class": "news-card z-depth-1"})
        for data in soup:
            d['headlines'].append(data.find(itemprop="headline").getText())
            d['news'].append(data.find(itemprop="articleBody").getText())
            d['date'].append(data.find(itemprop="date").getText())
    except:
        pass

# storing the data into .csv file
df = pd.DataFrame(d)
df.to_csv("inshorts_news.csv", index=False)
And here's the error:
AttributeError Traceback (most recent call last)
<ipython-input-2-2d109f9dfc91> in <module>()
12
13 #to search specific word in the content
---> 14 soup = soup.find_all(text=re.compile("Health"))
15
16 for data in soup:
/usr/local/lib/python3.7/dist-packages/bs4/element.py in __getattr__(self, key)
1882 def __getattr__(self, key):
1883 raise AttributeError(
-> 1884 "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
1885 )
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
What happens?
As the error tells you, you are trying to call find_all() on a ResultSet object; that won't work.
How to fix?
Iterate over the elements of the ResultSet and check each of them for your keyword:
for data in soup.select('div.news-card.z-depth-1'):
    if data.find(text=re.compile("farmer")):
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re

# code for scraping the first page
d = []
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script", {"type": "text/javascript"})[2].text
min_news_id = min_news_id[25:35]

# code for scraping more pages
for i in tqdm(range(2)):
    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news", data=params)
        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        for data in soup.select('div.news-card.z-depth-1'):
            if data.find(text=re.compile("farmer")):
                d.append({
                    'headline': data.find(itemprop="headline").getText(),
                    'article': data.find(itemprop="articleBody").getText()
                })
    except Exception as e:
        print(e)

pd.DataFrame(d)
Output
headline article
0 Heavy traffic seen on DND Flyway at Noida toll... Heavy traffic was witnessed on Delhi Noida Dir...
1 Farmers take out protest march in Haryana over... Farmers have taken out a protest march in Hary...
2 Akhilesh Yadav detained in Lucknow after sit-i... Samajwadi Party President Akhilesh Yadav was d...
3 Priyanka detained on way to UP's Lakhimpur Khe... Congress leader Priyanka Gandhi Vadra was deta...
4 Rakesh Tikait reaches UP's Lakhimpur Kheri aft... BKU leader Rakesh Tikait reached UP's Lakhimpu...
5 Opposition to start with 'Photo Ops' in Lakhim... Uttar Pradesh Cabinet Minister Sidharth Nath S...

Appending extracted links to a list, but the list gives the whole tag instead of the link when printing

This is my code
from bs4 import BeautifulSoup
import requests, lxml
import re
from urllib.parse import urljoin
from googlesearch import search
import pandas as pd

query = 'A M C College of Engineering, Bangalore'
link = []
for i in search(query, tld='co.in', start=0, stop=1):
    print(i)
    soup = BeautifulSoup(requests.get(i).text, 'lxml')
    for link in soup.select("a[href$='.pdf']"):
        if re.search(r'nirf', str(link), flags=re.IGNORECASE):
            fUrl = urljoin(i, link['href'])
            print(fUrl)
            link.append(fUrl)

print(link)
df = pd.DataFrame(link, columns=['PDF LINKS'])
print(df)
Here is my output after running the code:
https://www.amcgroup.edu.in/AMCEC/index.php
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
# Printing list with links but getting tags
For Invitation Click here...
# Dataframe where I want to store list
PDF LINKS
0 For Invitation Click here...
I should get the list of links shown in the output, but when printing the list it gives me the whole tag instead of the links. Also, I want to push all the links that I got into a single row of a dataframe, like this:
PDF LINKS
0 link1 link2 link3 #for query1
1 link1 link2 #for another query
How can I achieve this? And what is the problem with my code; why am I getting a tag instead of a list?
Thanks in advance.
Use a different variable name for the list and for the tag in the for-loop:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            all_data.append(fUrl)

df = pd.DataFrame(all_data, columns=["PDF LINKS"])
print(df)
Prints:
PDF LINKS
0 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
1 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
2 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
3 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
EDIT: To have results in one row:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    row = []
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            row.append(fUrl)
    if row:
        all_data.append(row)

df = pd.DataFrame({"PDF LINKS": all_data})
print(df)
Prints:
PDF LINKS
0 [https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf]
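If you want each cell to look exactly like the space-separated link1 link2 link3 format from the question rather than a Python list, you could join each row before building the DataFrame. A small sketch on top of the code above:
# join each query's links into one space-separated string per row
all_data_joined = [" ".join(row) for row in all_data]
df = pd.DataFrame({"PDF LINKS": all_data_joined})
print(df)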

Issue webscraping a linked header with Beautiful Soup

I am running into an issue pulling in the human-readable header names from the table in this HTML document. I can pull in the id attributes, but my trouble comes when trying to pull in the correct header text inside the tags. I am not sure what I need to do in this instance. Below is my code; it all runs except for the last for loop.
# Import libraries
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import numpy as np

# Pull the HTML link into a local file or buffer
# and then parse with the BeautifulSoup library
# ------------------------------------------------
url = 'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html'
r = requests.get(url)
#print('Status: ' + str(r.status_code))
#print(requests.status_codes._codes[200])
soup = BeautifulSoup(r.content, "html")
table = soup.find(id='data')
#print(table)

# Convert the data into a list of dictionaries
# or some other structure you can convert into
# pandas Data Frame
# ------------------------------------------------
trs = table.find_all('tr')
#print(trs)
header_row = trs[0]
#print(header_row)

names = []
for column in header_row.find_all('th'):
    names.append(column.attrs['id'])
#print(names)

db_names = []
for column in header_row.find_all('a'):
    db_names.append(column.attrs['data-vo-id'])  # ISSUE ARISES HERE!!!
print(db_names)
Let pandas read_html do the work for you, and simply specify the table id to find:
from pandas import read_html as rh
table = rh('https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html', attrs = {'id': 'data'})[0]
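read_html returns a list of DataFrames, and the human-readable headers you are after come back as that DataFrame's column labels. A quick sketch of how you might inspect them, assuming the mirror page is still reachable (for multi-row headers pandas may return a MultiIndex):
from pandas import read_html as rh

url = 'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html'
table = rh(url, attrs={'id': 'data'})[0]

# the human-readable header names are the DataFrame's column labels
print(list(table.columns))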
Hey, you can try something like this:
soup = BeautifulSoup(r.content, "html")
table = soup.findAll('table', {'id': 'data'})
trs = table[0].find_all('tr')
#print(trs)

names = []
for row in trs[:1]:  # header row only
    # grab the header cells (they may be <th> or <td>) and take their visible text
    cells = row.find_all(['th', 'td'])
    data_row_txt_list = [cell.text.strip() for cell in cells]
    header_row = data_row_txt_list
    for column in header_row:
        names.append(column)

Loading scraped data into list

I was able to successfully scrape some text from a website, and I'm now trying to load the text into a list so I can later convert it to a pandas DataFrame.
The site supplies the data in an SCSV (semicolon-separated) format, so it was quick to grab.
The following is my code:
import requests
from bs4 import BeautifulSoup

#Specify the url: url
url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"

# Packages the request, send the request and catch the response: r
r = requests.get(url)

#Extract the response: html_doc
html_doc = r.text
soup = BeautifulSoup(html_doc, "html.parser")

#Find the tags associated with the data you need, in this case
# it's the "pre" tags
for data in soup.find_all("pre"):
    print(data.text)
Sample Output
Week;Year;GID;Name;Pos;Team;h/a;Oppt;DK points;DK salary
1;2017;1254;Smith, Alex;QB;kan;a;nwe;34.02;5400
1;2017;1344;Bradford, Sam;QB;min;h;nor;28.54;5900
Use the open function to write a CSV file:
import requests
from bs4 import BeautifulSoup

url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"
r = requests.get(url)
html_doc = r.content
soup = BeautifulSoup(html_doc, "html.parser")

file = open("data.csv", "w")
for data in soup.find("pre").text.split('\n'):
    file.write(data.replace(';', ',') + '\n')  # write each row as comma-separated values
file.close()
Here's one thing you can do, although it's possible that someone who knows pandas better than I can suggest something better.
You have r.text. Put that into a convenient text file, let me call it temp.csv. Now you can use pandas read_csv method to get these data into a dataframe.
>>> df = pandas.read_csv('temp.csv', sep=';')
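A minimal sketch of that round trip, assuming (as in your own code) that the semicolon-separated block sits inside the page's <pre> tag:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# write the semicolon-separated text out, then let pandas parse it
with open('temp.csv', 'w') as f:
    f.write(soup.find("pre").text)

df = pd.read_csv('temp.csv', sep=';')
print(df.head())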
Addendum:
Suppose results were like this.
>>> results = [['a', 'b', 'c'], [1,2,3], [4,5,6]]
Then you could put them in a dataframe in this way.
>>> df = pandas.DataFrame(results[1:], columns=results[0])
>>> df
a b c
0 1 2 3
1 4 5 6
If you want to convert the output of your existing code into a list, the split method might do the job; then you can use pandas to convert it into a DataFrame.
import requests
from bs4 import BeautifulSoup

#Specify the url: url
url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"

# Packages the request, send the request and catch the response: r
r = requests.get(url)

#Extract the response: html_doc
html_doc = r.text
soup = BeautifulSoup(html_doc, "html.parser")

#Find the tags associated with the data you need, in this case
# it's the "pre" tags
for data in soup.find_all("pre"):
    print(data.text.split(";"))
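That only prints the split pieces. To finish the list-to-DataFrame conversion this answer describes, one rough sketch is to split on newlines first, then on semicolons, and reuse the header-row trick from the addendum above:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# split the <pre> block into rows, then each row into fields
rows = [line.split(";") for line in soup.find("pre").text.splitlines() if line.strip()]

# first row is the header, the rest are data
df = pd.DataFrame(rows[1:], columns=rows[0])
print(df.head())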
