How would I loop through each page and scrape the specific parameters? - python-3.x

# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pprint

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titlelink')
subtext = soup.select('.subtext')

def sort_stories_by_votes(hnlist):  # Sorts the create_custom_hn list by votes
    return sorted(hnlist, key=lambda k: k['votes'], reverse=True)

def create_custom_hn(links, subtext):  # Creates a list of links and subtext
    hn = []
    for idx, item in enumerate(links):  # Need to use this because not every link has votes
        title = links[idx].getText()
        href = links[idx].get('href', None)
        vote = subtext[idx].select('.score')
        if len(vote):
            points = int(vote[0].getText().replace(' points', ''))
            if points > 99:  # Only append stories with at least 100 points
                hn.append({'title': title, 'link': href, 'votes': points})
    return sort_stories_by_votes(hn)

pprint.pprint(create_custom_hn(links, subtext))
My question is that this only covers the first page, which has just 30 stories.
How would I apply my scraping method to subsequent pages, say the next 10 pages, while keeping the formatting of the code above?

The URL for each page looks like this:
https://news.ycombinator.com/news?p=<page_number>
Use a for loop to scrape the content from each page; see the code below.
It prints the contents of the first two pages. You can change the page range (page_no) depending on your needs.
import requests
from bs4 import BeautifulSoup
import pprint

def sort_stories_by_votes(hnlist):  # Sorts the create_custom_hn list by votes
    return sorted(hnlist, key=lambda k: k['votes'], reverse=True)

def create_custom_hn(links, subtext, page_no):  # Creates a list of links and subtext
    hn = []
    for idx, item in enumerate(links):  # Need to use this because not every link has votes
        title = links[idx].getText()
        href = links[idx].get('href', None)
        vote = subtext[idx].select('.score')
        if len(vote):
            points = int(vote[0].getText().replace(' points', ''))
            if points > 99:  # Only append stories with at least 100 points
                hn.append({'title': title, 'link': href, 'votes': points})
    return sort_stories_by_votes(hn)

for page_no in range(1, 3):
    print(f'Page: {page_no}')
    url = f'https://news.ycombinator.com/news?p={page_no}'
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    links = soup.select('.titlelink')
    subtext = soup.select('.subtext')
    pprint.pprint(create_custom_hn(links, subtext, page_no))
Page: 1
[{'link': 'https://www.thisworddoesnotexist.com/',
  'title': 'This word does not exist',
  'votes': 904},
 {'link': 'https://www.sparkfun.com/news/3970',
  'title': 'A patent troll backs off',
  'votes': 662},
 ...
Page: 2
[{'link': 'https://www.vice.com/en/article/m7vqkv/how-fbi-gets-phone-data-att-tmobile-verizon',
  'title': "The FBI's internal guide for getting data from AT&T, T-Mobile, "
           'Verizon',
  'votes': 802},
 {'link': 'https://www.dailymail.co.uk/news/article-10063665/Government-orders-Google-track-searching-certain-names-addresses-phone-numbers.html',
  'title': 'Feds order Google to track people searching certain names or '
           'details',
  'votes': 733},
 ...
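To cover more pages in one go, say the first 10, a minimal variation (reusing the same helper functions above) collects everything first and sorts the combined list once at the end:

all_stories = []
for page_no in range(1, 11):  # pages 1 through 10
    res = requests.get(f'https://news.ycombinator.com/news?p={page_no}')
    soup = BeautifulSoup(res.text, 'html.parser')
    links = soup.select('.titlelink')
    subtext = soup.select('.subtext')
    all_stories.extend(create_custom_hn(links, subtext, page_no))  # already filtered to 100+ points

pprint.pprint(sort_stories_by_votes(all_stories))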

Related

How do I combine URLs for my BeautifulSoup project?

This is the code that I have so far:
from grp import struct_group
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse

name = []
price = []
mileage = []
dealer_name = []
source = []

for i in range(1, 13):
    # Allow to crawl multiple pages:
    website = 'https://www.cars.com/shopping/results/?page=' + str(i) + '&page_size=20&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=all&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=95355'

    # Requesting using requests lib
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding results
    results = soup.find_all('div', {'class': 'vehicle-card'})

    url_combine = []
    root_url = 'http://www.cars.com'
    url_combine = root_url, source

    for result in results:
        # name
        try:
            name.append(result.find('h2').get_text())
        except:
            name.append('n/a')

        # price
        try:
            price.append(result.find('span', {'class': 'primary-price'}).get_text())
        except:
            price.append('n/a')

        # mileage
        try:
            mileage.append(result.find('div', {'class': 'mileage'}).get_text())
        except:
            mileage.append('n/a')

        # dealer_name
        try:
            dealer_name.append(result.find('div', {'class': 'dealer-name'}).get_text().strip())
        except:
            dealer_name.append('n/a')

        # link
        try:
            source.append(result.find('a', {'class': 'vehicle-card-visited-tracking-link'}).get('href'))
        except:
            source.append('n/a')

    for link in source:
        url_combine.append(urllib.parse.urljoin(root_url, link))

# Using pandas to create a DataFrame and export to Excel
car_listings = pd.DataFrame({'Name': name, 'Mileage': mileage, 'Price': price, 'Dealer Name': dealer_name, 'Link': source})
car_listings.to_excel('car_listings_page4.xlsx')
However, I keep running into a problem where it says AttributeError: 'tuple' object has no attribute 'append'.
I know I need to make everything a list instead of a tuple, but I can't seem to find where my mistake is. I believe this is one way to get the full URL from the href. If so, is there any other way I can implement it in my code?
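The AttributeError almost certainly comes from the line url_combine = root_url, source: the comma on the right-hand side packs the two values into a tuple, so url_combine is rebound from a list to a tuple and the later .append() fails. A minimal sketch of the difference (variable names as in the question, the sample href is made up):

root_url = 'http://www.cars.com'
source = []

url_combine = root_url, source   # comma packing: url_combine is now a tuple
# url_combine.append(...)        # -> AttributeError: 'tuple' object has no attribute 'append'

url_combine = []                 # keep it a plain list instead
url_combine.append(root_url + '/vehicledetail/example/')   # hypothetical href, works as expected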
Avoid all these separate lists and use dicts instead; it simplifies the code and stores the info in a more structured way. You could also use an if-statement to handle missing elements.
There are various ways to perform string concatenation:
+ operator
join() method
% operator
format() function
f-string literal string interpolation (see the assignment to website in the example)
However, the simplest one is the + operator:
root_url+link
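A quick sketch of the other listed options with the same variables (the sample link value is just for illustration):

root_url = 'http://www.cars.com'
link = '/vehicledetail/example/'            # hypothetical href

full_url = ''.join([root_url, link])        # join() method
full_url = '%s%s' % (root_url, link)        # % operator
full_url = '{}{}'.format(root_url, link)    # format() function
full_url = f'{root_url}{link}'              # f-string interpolation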
Concerning my example using dicts, it would look like:
'link': root_url+result.find('a', {'class':'vehicle-card-visited-tracking-link'}).get('href') if result.find('a', {'class':'vehicle-card-visited-tracking-link'}) else None
or a bit shorter with the walrus operator (Python 3.8 and later):
'link': root_url+a.get('href') if (a:=result.find('a', {'class':'vehicle-card-visited-tracking-link'})) else None
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

root_url = 'http://www.cars.com'
data = []

for i in range(1, 2):
    website = f'https://www.cars.com/shopping/results/?page={i}&page_size=20&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=all&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=95355'
    response = requests.get(website)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 'vehicle-card'})

    for result in results:
        data.append({
            'name': result.find('h2').get_text() if result.find('h2') else None,
            'price': result.find('span', {'class': 'primary-price'}).get_text() if result.find('span', {'class': 'primary-price'}) else None,
            'link': root_url + a.get('href') if (a := result.find('a', {'class': 'vehicle-card-visited-tracking-link'})) else None
            ### all the other info
        })

pd.DataFrame(data)
Output
   name                           price     link
0  2017 Lexus IS 200t Base        $28,900   http://www.cars.com/vehicledetail/6942c51b-c26c-4614-97f1-acb0b7517b82/
1  2021 Lincoln Corsair Reserve   $43,797   http://www.cars.com/vehicledetail/e575219a-90fa-4a95-ade5-d2740e746cd0/
2  2021 Hyundai IONIQ Hybrid SE   $26,997   http://www.cars.com/vehicledetail/716b65ec-3abd-42e4-b19b-9024d2ad58f1/
3  2021 GMC Yukon XL Denali       $74,888   http://www.cars.com/vehicledetail/475045f6-142a-440f-80e7-2c3ae289fee2/
4  2007 Chevrolet Silverado 1500  $12,688   http://www.cars.com/vehicledetail/56080319-0bb9-49e0-8758-24f58d0d5d76/
...
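If you still want the Excel export from the original code, the same DataFrame can be written out afterwards (the file name here is just an example):

df = pd.DataFrame(data)
df.to_excel('car_listings.xlsx', index=False)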

How can I scrape product data from flexbox elements with Python's HTMLSession method?

I am facing an issue parsing product details because of the flexbox design. I use the same method for other websites and it works, but for some websites that use a flexbox layout I can't parse any data.
from requests_html import HTMLSession
import csv

url = 'https://gpltheme.com/product-category/themeforest/'
s = HTMLSession()

def get_links(url):
    r = s.get(url)
    items = r.html.find('div.product-element-top.wd-quick-shop')
    links = []
    for item in items:
        links.append(item.find('a', first=True).attrs['href'])
    return links

def get_productdata(link):
    r = s.get(link)
    title = r.html.find('h1', first=True).full_text
    category = r.html.find('a.breadcrumb-link.breadcrumb-link-last', first=True).full_text
    product = {
        'title': title.strip(),
        'category': category.strip(),
    }
    print(product)
    return product

links = get_links(url)
results = []
for link in links:
    results.append(get_productdata(link))

with open('gplproduct_py2.csv', 'w', encoding='utf8', newline='') as f:
    wr = csv.DictWriter(f, fieldnames=results[0].keys())
    wr.writeheader()
    wr.writerows(results)

print('Fin.')

Modifying a scraped url and changing its extension

I am new to programming and trying to download images and PDFs from a website. In the source code, the items I need are in option tags with partial urls. The site lists these items in a drop-down menu and they display in an iframe, but each item can be opened on its own page using its full url.
So far, my code finds the options, appends the partial url to the page's base address to create the full url for each option, and removes the trailing "/" from the .tif and .TIF urls before adding ".pdf".
However, for the .tif and .TIF urls, I need to change "convert" to "pdf" to open them in a new page. Is there a way to do this to only the .tif.pdf and .TIF.pdf urls while leaving the others unchanged?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os

my_url = 'http://example.com'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

options = page_soup.findAll("select", {"id": "images"})[0].findAll("option")
values = [o.get("value") for o in options]
split_values = [i.split("|", 1)[0] for i in values]
# The option value is split to separate the url from its label
# <option value="/convert/ASRIMG/new/hop.TIF/|New Form"></option>

new_val = []
for val in split_values:
    ext = os.path.splitext(val.rstrip('/'))[-1]
    new_ext = ext
    if ext.lower() == '.tif':
        new_ext += '.pdf'
    new_val.append(val.rstrip('/').replace(ext, new_ext))

for i in range(len(new_val)):
    image_urls = ('http://example.com' + new_val[i])
My current results:
print (new_val)
/ASRIMG/good.jpg
/ASRIMG/foo/bar1.jpg
/ASRIMG/foo/bar2.jpg
/ASRIMG/foo/bar3.jpg
/convert/ASRIMG/new/hop.TIF.pdf
/convert/REG/green1.tif.pdf
/convert/REG//green2.tif.pdf
/convert/SHIP/green3.tif.pdf
/convert/SHIP/green4.tif.pdf
/convert/SHIP/green5.tif.pdf
/SKETCHIMG/001.png
/SKETCH/002.JPG
print (image_urls)
http://example.com/ASRIMG/good.jpg
http://example.com/ASRIMG/foo/bar1.jpg
http://example.com/ASRIMG/foo/bar2.jpg
http://example.com/ASRIMG/foo/bar3.jpg
http://example.com/convert/ASRIMG/new/hop.TIF.pdf
http://example.com/convert/REG/green1.tif.pdf
http://example.com/convert/REG//green2.tif.pdf
http://example.com/convert/SHIP/green3.tif.pdf
http://example.com/convert/SHIP/green4.tif.pdf
http://example.com/convert/SHIP/green5.tif.pdf
http://example.com/SKETCHIMG/001.png
http://example.com/SKETCH/002.JPG
What I need:
http://example.com/ASRIMG/good.jpg
http://example.com/ASRIMG/foo/bar1.jpg
http://example.com/ASRIMG/foo/bar2.jpg
http://example.com/ASRIMG/foo/bar3.jpg
http://example.com/pdf/ASRIMG/new/hop.TIF.pdf
http://example.com/pdf/REG/green1.tif.pdf
http://example.com/pdf/REG//green2.tif.pdf
http://example.com/pdf/SHIP/green3.tif.pdf
http://example.com/pdf/SHIP/green4.tif.pdf
http://example.com/pdf/SHIP/green5.tif.pdf
http://example.com/SKETCHIMG/001.png
http://example.com/SKETCH/002.JPG
After this step:
split_values = [i.split("|", 1)[0] for i in values]
This code handles both upper- and lowercase tif:
import os

split_values = ['/ASRIMG/good.jpg', '/convert/ASRIMG/new/hop.TIF/', 'SKETCHIMG/001.png']

new_val = []
for val in split_values:
    ext = os.path.splitext(val.rstrip('/'))[-1]
    new_ext = ext
    if ext.lower() == '.tif':
        new_ext += '.pdf'
    new_val.append(val.rstrip('/').replace(ext, new_ext))

This strips the trailing "/" from each value in the split_values list and then replaces the .tif/.TIF extension with .tif.pdf/.TIF.pdf, leaving the other values unchanged.
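The question also asks how to change "convert" to "pdf" for just the .tif/.TIF urls. A minimal extension of the loop above, assuming those urls always begin with "/convert/" as in the sample values, is to do the swap inside the same if-branch:

new_val = []
for val in split_values:
    val = val.rstrip('/')
    ext = os.path.splitext(val)[-1]
    if ext.lower() == '.tif':
        # only tif/TIF urls get the /convert/ -> /pdf/ swap and the .pdf suffix
        val = val.replace('/convert/', '/pdf/', 1) + '.pdf'
    new_val.append(val)

With the sample values above this yields '/pdf/ASRIMG/new/hop.TIF.pdf', while '/ASRIMG/good.jpg' and 'SKETCHIMG/001.png' are left unchanged.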

Scraping a list with Python and BeautifulSoup

I am new to Python and trying to write some code that scrapes information from a website. I currently have:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'product-block__info')
    for item in items:
        for val in item.find_all('a', 'product-block'):
            stock = item.find_all('class', 'count_product_stock hidden')[0].text
            brand = item.find_all('div', 'brand')[0].text
            price = item.find_all('span', 'selling_price')[0].text
            print(items)
This returns the error IndexError: list index out of range. If I put 'product-block__info' in place of 'product-block', then I am able to print the full content of the 'product-block__info' tag on the page, but I'd like to select just a handful of elements and return those.
Can anyone explain what's happening here and how I can select just the elements I want from inside 'product-block__info'?
When selecting by attributes with find_all, you should either use the attrs dictionary or keyword arguments; otherwise bs4 looks for tags.
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        print(stock, brand, price)
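For reference, a minimal self-contained sketch of the two equivalent lookup styles mentioned above, plus the tag-name pitfall from the question (the HTML string is made up for illustration):

from bs4 import BeautifulSoup

html = '<div class="product-block__info"><span class="selling_price">10</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# keyword-argument form (class_ avoids clashing with Python's class keyword)
print(soup.find_all('div', class_='product-block__info'))

# attrs-dictionary form, equivalent to the above
print(soup.find_all('div', attrs={'class': 'product-block__info'}))

# pitfall from the question: the first argument is a tag name,
# so this searches for <class> tags and returns an empty list
print(soup.find_all('class', 'count_product_stock hidden'))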

How to get certain text from URL links

So I'm trying to get all the statistics in the statistics box on the page for each team. An example of what the page looks like is at the hyperlink below. I'm trying to have it print out:
month : win %
month : win %
All time: win %
But I am not too sure how to write that code, since the last piece of code I wrote in main was giving me an error.
http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners
import time
import requests
from bs4 import BeautifulSoup

def get_all(url, base):  # When called it will yield all the team links
    r = requests.get(url)
    page = r.text
    soup = BeautifulSoup(page, 'html.parser')
    for team_links in soup.select('div.details h3 a'):
        members = int(team_links.find_next('th', text='Members:').find_next_sibling('td').text.strip().split()[0])
        if members < 5:
            continue
        yield base + team_links['href']
    next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')
    while next_page:
        # Gives the server a break
        time.sleep(0.2)
        r = requests.get(BASE_URL + next_page.find_previous('a')['href'])
        page = r.text
        soup = BeautifulSoup(page)
        for team_links in soup.select('div.details h3 a'):
            yield BASE_URL + team_links['href']
        next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')

if __name__ == '__main__':
    BASE_URL = 'http://www.gosugamers.net'
    URL = 'http://www.gosugamers.net/counterstrike/teams'
    for links in get_all(URL, BASE_URL):  # When run it will generate all the links for all the teams
        r = requests.get(links)
        page = r.content
        soup = BeautifulSoup(page)
        for statistics in soup.select('div.statistics tr'):
            win_rate = int(statistics.find('th', text='Winrate:').find_next_sibling('td'))
            print(win_rate)
Not sure exactly what you want, but this will get all the team stats:
from bs4 import BeautifulSoup, Tag
import requests
soup = BeautifulSoup(requests.get("http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners").content)
table = soup.select_one("table.stats-table")
head1 = [th.text.strip() for th in table.select("tr.header th") if th.text]
head2 = [th.text.strip() for th in table.select_one("tr + tr") if isinstance(th, Tag)]
scores = [th.text.strip() for th in table.select_one("tr + tr + tr") if isinstance(th, Tag)]
print(head1, head2, scores)
Output:
([u'Jun', u'May', u'All time'], [u'Winrate:', u'0%', u'0%', u'0%'], [u'Matches played:', u'0 / 0 / 0', u'0 / 0 / 0', u'0 / 0 / 0'])
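To get the "month : win %" lines the question asks for, a small follow-up sketch, assuming head1 and head2 keep the shapes shown in the output above (head2 starting with the 'Winrate:' label):

# head1 == ['Jun', 'May', 'All time'], head2 == ['Winrate:', '0%', '0%', '0%']
for month, win_rate in zip(head1, head2[1:]):
    print('{} : {}'.format(month, win_rate))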
