Unable to Scrape Product Price from Amazon - python-3.x

import os
import ssl
import random
import requests

def scrapedPage(URL, user_agent):
    # Fall back to an unverified SSL context if certificate verification is not configured
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    header = {"User-Agent": user_agent}
    proxy = {'https': 'using-proxy'}
    page = requests.get(URL, headers=header, proxies=proxy)
    return page
I am not able to scrape the price of the product, which is Rs 12,300:
user_agent = random.choice(user_agent_list)
pricelink='https://www.amazon.in/gp/offer-listing/B07NXTH1BD/ref=dp_olp_afts?ie=UTF8&condition=all'
new_page = scrapedPage(pricelink, user_agent)
new_soup = BeautifulSoup(new_page.content, 'html.parser')
print(new_soup.prettify())
find_price = new_soup.find_all('span',attrs={'style':'text-decoration: inherit; white-space: nowrap'})
I tried many methods like findAll(), select(), find(), etc., but I am not able to scrape the price. It returns an empty list. Please help me out.

Make the following changes in your code:
new_soup = BeautifulSoup(new_page.content, 'lxml')  # switch the parser from html.parser to lxml
find_price = new_soup.select('span.olpOfferPrice > span')  # select the offer-price span instead of matching on the style attribute
print(find_price[0].text)
Output
Rs. 12,300.00
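If you want every offer price on the listing page rather than just the first, here is a minimal sketch built on the answer's selector. It assumes the span.olpOfferPrice markup used above is present on each offer row and that Amazon returned the real page rather than a captcha:
offer_prices = [span.get_text(strip=True) for span in new_soup.select('span.olpOfferPrice > span')]
if offer_prices:
    for price in offer_prices:
        print(price)
else:
    # Empty result usually means the layout changed or the request was blocked
    print("No offer prices found")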

Related

Why is my company_url variable not being defined?

I am trying to scrape the rank, name and url of a company from a website. This involves two pages and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at differing points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Handle 403 - Forbidden Error
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
html = urlopen(req).read()
html_page = html.decode('utf-8')
soup = BeautifulSoup(html_page, 'html.parser')  # create soup object

'''Main Function'''
def company_details():
    # find rank
    rank = soup.find('div', class_="company-score-redesign").text
    # find company name
    company_name = soup.find('div', class_="company-name-redesign").text
    # find company website
    ''' Find Button Url...Parse HTML from new Url...Find Company Website '''
    def company_button_url():
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn = comp.find('a', href=True)
        comp_btn_url = comp_btn['href']
        new_url = comp_btn_url
        # Handle 403 - Forbidden Error
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nhtml = urlopen(new_req).read()  # Getting new page
        nhtml_page = nhtml.decode('utf-8')
        nsoup = BeautifulSoup(nhtml_page, 'html.parser')  # create new soup object
        div_company_url = nsoup.find('div', class_="profile-info")
        href_company_url = div_company_url.find('a', href=True)
        company_url = href_company_url['href']
        return company_url
    company_button_url()
    print(rank, company_name, company_url)
    return()

company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.
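The error comes from scope: company_button_url() returns company_url, but company_details() never captures the return value, so the name company_url does not exist when print() runs. A minimal sketch of the fix, reusing the imports and the soup object from the question's code and keeping the question's names:
def company_details():
    rank = soup.find('div', class_="company-score-redesign").text
    company_name = soup.find('div', class_="company-name-redesign").text

    def company_button_url():
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn_url = comp.find('a', href=True)['href']
        new_req = Request(comp_btn_url, headers={'User-Agent': 'Mozilla'})
        nsoup = BeautifulSoup(urlopen(new_req).read().decode('utf-8'), 'html.parser')
        return nsoup.find('div', class_="profile-info").find('a', href=True)['href']

    company_url = company_button_url()  # capture the returned value instead of discarding it
    print(rank, company_name, company_url)

company_details()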

Loop pagination using a dynamic integer

So far I can scrape the initial page and save. What I'm trying to do is use the page count on the site to determine the number of loops.
The page count is found in the code with the 'count =', which in this case is 18. How can I loop my code to scrape and save each page?
Secondly, my code scrapes each url 3 times.
Is there a way to not have the duplicates?
Lastly, I'm using 'strip' to get the dynamic integer for the loop. The element returns the text: Viewing page 1 of 18. Using 'strip' returns the correct number if the last number is a single digit, but in this case, since there are two digits (18), it only returns the 8. Can't figure that one out for the life of me. (See the note after the answer below.)
Appreciate the help.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import csv
chrome_driver = "C:/chromedriver.exe"
Chrome_options = Options()
Chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9015")
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(chrome_driver, options=Chrome_options)
source = driver.page_source
soup = BeautifulSoup(source, "html.parser")
### set zipcode and search length ###
zipcode = "84105"
search = "1yr" #search option: 1mo 3mo 6mo 1yr 2yr 3yr All
url = 'https://www.redfin.com/zipcode/' + zipcode + '/filter/include=sold-' + search
https = "https://www.redfin.com"
driver.get(url)
#####################################
### get page count ###
count = soup.find('span', class_='pageText').get_text() #grabs total pages to grab
pages = count.strip('Viewing page 1 of') #gives a number of pages to paginate
print("This search has " + pages + " pages" + ": " + zipcode)
print(url)
########################
data = []
for url in soup.find_all('a', attrs={'href': re.compile("^/UT/")}):
    print(https + url['href'])
    data.append(https + url['href'])

with open("links.csv", 'a') as csvfile:
    write = csv.writer(csvfile, delimiter=' ')
    write.writerows(data)
Just noticed that you want to loop without duplicates:
import requests
from bs4 import BeautifulSoup
import csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

def main(url):
    with requests.Session() as req:
        print("Extracting Page# 1")
        r = req.get(url.format("1"), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        total = int(soup.select_one("span.pageText").text.split(" ")[-1]) + 1
        urls = [f'{url[:22]}{a.get("href")}' for a in soup.select("a.slider-item")]
        for page in range(2, total):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            links = [f'{url[:22]}{a.get("href")}' for a in soup.select("a.slider-item")]
            urls.extend(links)
        mylist = list(dict.fromkeys(urls))  # de-duplicate while preserving order
        with open("links.csv", 'w', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Links"])
            writer.writerows(zip(mylist))
main("https://www.redfin.com/zipcode/84105/filter/include=sold-1yr/page-{}")

Scraping all href links using Pagination

I have to select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall, then click on team rankings, and after that grab the href links of each ranked team.
I've completed the team-rankings part; now I want to get the links of each ranked team from all the pages in the pagination bar. Right now I'm getting links of the teams available on the first page only, and I don't know how to navigate to the next page. (Below is the code.)
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
    all_states = each_state.find_all('a', href=True)
    for a in all_states:
        domain = site + a['href']  # domain consists of the links of the states
        for r in domain:
            page_link = domain
            page_response = requests.get(page_link)
            soup = BeautifulSoup(page_response.content, "html.parser")
            for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
                rankings_link = site + link.get('href')
                # print(rankings_link)
                for ert in rankings_link:
                    team_link = rankings_link
                    page_response1 = requests.get(team_link)
                    soup = BeautifulSoup(page_response1.content, "html.parser")
                    My_table = soup.find('table', {'class': 'mx-grid sortable rankings-grid'})
                    links = My_table.findAll('a')
                    print(links)
Output:
Everett, Methuen,
You could just iterate through pages within the query parameters.
import requests
from bs4 import BeautifulSoup

site = "https://www.maxpreps.com"
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")

all_states = soup.find('div', {'class': 'states'})
states_list = []
for each in all_states.find_all('a'):
    states_list.append(each['href'].split('=')[-1])
states_list = states_list[:-1]

team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
    break_loop = False
    page = 1
    while break_loop == False:
        print('%s: Page %s' % (state, page))
        payload = {
            'page': str(page),
            'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
            'state': state
        }
        response = requests.get(url, params=payload)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table == None:
            break_loop = True
        else:
            page += 1
            links = table.find_all('a')
            for link in links:
                team_links.append('https://www.maxpreps.com' + link['href'])
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']

Scraping a list with Python and BeautifulSoup

I am new to Python and trying to write some code that scrapes information from a website. I currently have:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'product-block__info')
    for item in items:
        for val in item.find_all('a', 'product-block'):
            stock = item.find_all('class', 'count_product_stock hidden')[0].text
            brand = item.find_all('div', 'brand')[0].text
            price = item.find_all('span', 'selling_price')[0].text
            print(items)
Which returns the error IndexError: list index out of range. If I put 'product-block__info' in place of 'product-block', then I am able to print the full list of the content within the 'product-block__info' tag on the page, but I'd like to select just a handful of elements and return these.
Can anyone explain to me what's happening here and how I can select just the elements I want from inside 'product-block__info'?
When selecting by attributes with find_all, you should either use the attrs dictionary or the keyword arguments; otherwise bs4 is looking for tags.
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        print(stock, brand, price)
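A minimal illustration of the difference, using a made-up HTML snippet (not from the site in the question):
from bs4 import BeautifulSoup

# Hypothetical markup, just to show how find_all interprets its arguments.
html = '<div class="product-block__info"><span class="selling_price">20.00</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# A first positional string is treated as a tag *name*, so this looks for
# <class> tags and returns an empty list (hence the IndexError on [0]).
print(soup.find_all('class', 'selling_price'))                   # []

# Filter on the class attribute instead, via class_= or attrs=.
print(soup.find_all('span', class_='selling_price'))             # [<span class="selling_price">20.00</span>]
print(soup.find_all('span', attrs={'class': 'selling_price'}))   # same result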

Why am I getting duplicate links ? And how do I fetch links on the next pages?

I am getting duplicate links for the links I am trying to obtain, and I am not sure why. I am also trying to fetch links like these from all of the pages, but I am not sure how to write the code to click through to the next page. Could someone please help me understand how I would go about this?
import requests
from bs4 import BeautifulSoup

url = 'http://www.gosugamers.net/counterstrike/teams'
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page)

#all_teams = []
for team_links in soup.find_all('a', href=True):
    if team_links['href'] == '' or team_links['href'].startswith('/counterstrike/teams'):
        print(team_links.get('href').replace('/counterstrike/teams', url))
The team links are in anchor tags inside the h3 tags which are inside the div with the details class:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # Python 2; on Python 3 use: from urllib.parse import urljoin

base = "http://www.gosugamers.net"
url = 'http://www.gosugamers.net/counterstrike/teams'
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page)

for team_links in soup.select("div.details h3 a"):
    print(urljoin(base, team_links["href"]))
Which gives you:
http://www.gosugamers.net/counterstrike/teams/16338-motv
http://www.gosugamers.net/counterstrike/teams/16337-absolute-monster
http://www.gosugamers.net/counterstrike/teams/16258-immortals-cs
http://www.gosugamers.net/counterstrike/teams/16251-ireal-star-gaming
http://www.gosugamers.net/counterstrike/teams/16176-team-genesis
http://www.gosugamers.net/counterstrike/teams/16175-potadies
http://www.gosugamers.net/counterstrike/teams/16174-crowns-gg
http://www.gosugamers.net/counterstrike/teams/16173-visomvet
http://www.gosugamers.net/counterstrike/teams/16172-team-phenomenon
http://www.gosugamers.net/counterstrike/teams/16152-kriklekrakle
http://www.gosugamers.net/counterstrike/teams/16148-begenius
http://www.gosugamers.net/counterstrike/teams/16144-blubblub
http://www.gosugamers.net/counterstrike/teams/16142-team-1231
http://www.gosugamers.net/counterstrike/teams/16141-vsv
http://www.gosugamers.net/counterstrike/teams/16140-tbi
http://www.gosugamers.net/counterstrike/teams/16136-deadweight
http://www.gosugamers.net/counterstrike/teams/16135-me-myself-and-i
http://www.gosugamers.net/counterstrike/teams/16085-pur-esports
http://www.gosugamers.net/counterstrike/teams/15850-falken
http://www.gosugamers.net/counterstrike/teams/15815-team-abyssal
You are literally parsing all the links on the page; that is why you see the dupes.
To get all the teams, we can follow the next-page link until the span with the "Next" text is no longer there, which only happens on the last page:
def get_all(url, base):
    r = requests.get(url)
    page = r.text
    soup = BeautifulSoup(page)
    for team_links in soup.select("div.details h3 a"):
        yield urljoin(base, team_links["href"])

    nxt = soup.find("div", {"class": "pages"}).find("span", text="Next")
    while nxt:
        r = requests.get(urljoin(base, nxt.find_previous("a")["href"]))
        page = r.text
        soup = BeautifulSoup(page)
        for team_links in soup.select("div.details h3 a"):
            yield urljoin(base, team_links["href"])
        nxt = soup.find("div", {"class": "pages"}).find("span", text="Next")
If we run it for a couple of seconds, you can see we get the next pages:
In [26]: for link in (get_all(url, base)):
....: print(link)
....:
http://www.gosugamers.net/counterstrike/teams/16386-cantonese-cs
http://www.gosugamers.net/counterstrike/teams/16338-motv
http://www.gosugamers.net/counterstrike/teams/16337-absolute-monster
http://www.gosugamers.net/counterstrike/teams/16258-immortals-cs
http://www.gosugamers.net/counterstrike/teams/16251-ireal-star-gaming
http://www.gosugamers.net/counterstrike/teams/16176-team-genesis
http://www.gosugamers.net/counterstrike/teams/16175-potadies
http://www.gosugamers.net/counterstrike/teams/16174-crowns-gg
http://www.gosugamers.net/counterstrike/teams/16173-visomvet
http://www.gosugamers.net/counterstrike/teams/16172-team-phenomenon
http://www.gosugamers.net/counterstrike/teams/16152-kriklekrakle
http://www.gosugamers.net/counterstrike/teams/16148-begenius
http://www.gosugamers.net/counterstrike/teams/16144-blubblub
http://www.gosugamers.net/counterstrike/teams/16142-team-1231
http://www.gosugamers.net/counterstrike/teams/16141-vsv
http://www.gosugamers.net/counterstrike/teams/16140-tbi
http://www.gosugamers.net/counterstrike/teams/16136-deadweight
http://www.gosugamers.net/counterstrike/teams/16135-me-myself-and-i
http://www.gosugamers.net/counterstrike/teams/16085-pur-esports
http://www.gosugamers.net/counterstrike/teams/15850-falken
http://www.gosugamers.net/counterstrike/teams/15815-team-abyssal
http://www.gosugamers.net/counterstrike/teams/15810-ex-deathtrap
http://www.gosugamers.net/counterstrike/teams/15808-mix123
http://www.gosugamers.net/counterstrike/teams/15651-undertake-esports
http://www.gosugamers.net/counterstrike/teams/15644-five
http://www.gosugamers.net/counterstrike/teams/15630-five
http://www.gosugamers.net/counterstrike/teams/15627-inetkoxtv
http://www.gosugamers.net/counterstrike/teams/15626-tetr-s
http://www.gosugamers.net/counterstrike/teams/15625-rozenoir-esports-white
http://www.gosugamers.net/counterstrike/teams/15619-fragment-gg
http://www.gosugamers.net/counterstrike/teams/15615-monarchs-gg
http://www.gosugamers.net/counterstrike/teams/15602-ottoman-fire
http://www.gosugamers.net/counterstrike/teams/15591-respect
http://www.gosugamers.net/counterstrike/teams/15569-moonbeam-gaming
http://www.gosugamers.net/counterstrike/teams/15563-team-tilt
http://www.gosugamers.net/counterstrike/teams/15534-dynasty-uk
http://www.gosugamers.net/counterstrike/teams/15507-urbantech
http://www.gosugamers.net/counterstrike/teams/15374-innova
http://www.gosugamers.net/counterstrike/teams/15373-g3x
http://www.gosugamers.net/counterstrike/teams/15372-cnb
http://www.gosugamers.net/counterstrike/teams/15370-intz
http://www.gosugamers.net/counterstrike/teams/15369-2kill
http://www.gosugamers.net/counterstrike/teams/15368-supernova
http://www.gosugamers.net/counterstrike/teams/15367-biggods
http://www.gosugamers.net/counterstrike/teams/15366-playzone
http://www.gosugamers.net/counterstrike/teams/15365-pride
http://www.gosugamers.net/counterstrike/teams/15359-rising-orkam
http://www.gosugamers.net/counterstrike/teams/15342-team-foxez
http://www.gosugamers.net/counterstrike/teams/15336-angels
http://www.gosugamers.net/counterstrike/teams/15331-atlando-esports
http://www.gosugamers.net/counterstrike/teams/15329-xfinity-esports
http://www.gosugamers.net/counterstrike/teams/15326-nano-reapers
http://www.gosugamers.net/counterstrike/teams/15322-erase-team
http://www.gosugamers.net/counterstrike/teams/15318-heyguys
http://www.gosugamers.net/counterstrike/teams/15317-illusory
http://www.gosugamers.net/counterstrike/teams/15285-dismay
http://www.gosugamers.net/counterstrike/teams/15284-kingdom-esports
http://www.gosugamers.net/counterstrike/teams/15283-team-rival
http://www.gosugamers.net/counterstrike/teams/15282-ze-pug-godz
http://www.gosugamers.net/counterstrike/teams/15281-unlimited-potential1
You can see in the page source that every page except the last has a span with the text Next; on the last page there are only spans with Previous and First.
