How To Refactor Web Scraping Code In Python - web

I am scraping data from the URL below and it works correctly, but I am looking for a more reliable and elegant way to do it.
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Scrape the ISIC4 activity table from every listing page of stats.gov.sa
# and collect the rows into a DataFrame.

# One listing URL per page number, 1 through 547.
pages = list(range(1, 548))
list_of_url = []
for page in pages:
    URL = "https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5" + "&page=" + str(page)
    list_of_url.append(URL)
print(list_of_url)

# Each entry is [activity_section, activity_name, activity_code].
list_activities = []
for url in list_of_url:
    URL = url
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser")
    results = soup.find('div', class_='view-content')
    # find() returns None when the container is absent; test for that
    # explicitly instead of hiding it behind a bare ``except``.
    if results is None:
        print("in the activities line thisis a pad url", URL)
        continue
    activities = results.find_all("tr", class_=["views-row-first odd","even","odd","even","views-row-last odd"])
    try:
        for activity in activities:
            activity_section = activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip()
            activity_name = activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip()
            activity_code = activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
            list_activities.append([activity_section,activity_name,activity_code])
    except AttributeError:
        # A row lacked one of the expected cells (find() returned None).
        # The remaining rows of this page are skipped.
        print("url not founf")
        continue
    # The former ``page_number += 1`` was dropped: the counter's
    # initialisation was commented out, so the statement raised NameError,
    # and its value was never read.

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
I am scraping data from the URL below and it works correctly, but I am looking for a more reliable and elegant way to do it.

Here is a shorter version for your code:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Collect [section, name, code] rows from listing pages 1 and 2 and load
# them into a DataFrame.
list_activities = []
URLS = [f'https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5&page={page}' for page in range(1,3)]
for URL in URLS:
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "html.parser")
    results = soup.find('div', class_='view-content')
    activities = results.find_all("tr", class_=["views-row-first odd","even","odd","even","views-row-last odd"])
    # Build the page's rows completely before extending the result list,
    # so a failing row never leaves a half-added page behind.
    page_rows = []
    for activity in activities:
        section_cell = activity.find('td', class_='views-field views-field-field-chapter-desc-en-et')
        name_cell = activity.find("td", class_="views-field views-field-field-activity-description-en-et")
        code_cell = activity.find("td", class_="views-field views-field-field-activity-code active")
        page_rows.append([
            section_cell.text.strip(),
            name_cell.text.strip(),
            code_cell.text.strip(),
        ])
    list_activities.extend(page_rows)
df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
However, as an engineer at WebScrapingAPI, I would recommend implementing a stealthier scraper if you want to scrape this website in the long run. In my testing, it does not currently use any known bot-detection provider, but, being a government website, it might use a private detection system.

Related

Web-Scraping Amazon products

I want to scrape multiple Amazon product pages. If I print the title, for instance, it does not print the title for both links or ASINs, but only for the latter one. How can I print the title of both ASINs?
# Snippet from the question: visit one Amazon product page per ASIN and read
# the product title. `driver` and `soup` are defined outside this snippet.
# NOTE(review): indentation was lost in this listing, so the extent of the
# loop body cannot be determined from here. The question reports that only
# the last title is printed, which suggests the parsing lines run after the
# loop has finished; confirm against the original post.
ASIN = ['B09C1Q9P1N','B096W87PPJ']
for a in ASIN:
# Product page address built from the ASIN.
url = 'https://www.amazon.de/dp/' + a + '/'
driver.get(url)
# Builds a 2-tuple of URLs; nothing in this snippet reads it.
urls = 'https://www.amazon.de/dp/B09C1Q9P1N/','https://www.amazon.de/dp/B096W87PPJ/'
# Parse whatever page the driver currently has loaded.
soupa = soup(driver.page_source, 'html.parser')
# Text of the element with id "productTitle", with newlines and spaces removed.
title = soupa.find(id='productTitle').text.replace('\n', '').replace(' ', '')
I am not sure I fully understand your question but I gather you are using selenium? Have you tried putting it into a for loop for each URL such as:
import bs4
from selenium import webdriver
# Your webdriver code here
# (create the Selenium ``driver`` instance before this point)

# Product pages whose titles should be printed.
urls = ["https://www.amazon.de/dp/B09C1Q9P1N",
        "https://www.amazon.de/dp/B096W87PPJ"]
for url in urls:
    # Navigate first: without this call every iteration would parse
    # whatever page the driver happened to have loaded already.
    driver.get(url)
    # page_source is already a str, so it is passed to the parser directly
    # (a str has no ``.text`` attribute).
    html = driver.page_source
    soup = bs4.BeautifulSoup(html, "html.parser")
    for title in soup.find_all("span", {"id": "productTitle"}):
        print(title)

I build a list of URLs for the different pages I want to scrape. Can anyone tell me whether there is a way to automate this process?

from bs4 import BeautifulSoup
import requests
# Hand-written list of the eight Trustpilot listing pages to scrape.
# Every URL is a single string literal; a quoted string cannot be broken
# across source lines.
urls = [
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=2&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=3&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=4&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=5&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=6&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=7&status=all&timeperiod=0',
    'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=8&status=all&timeperiod=0',
]

# Iterate the list under the name it was defined with (``urls``); the
# previous ``URLs`` was an undefined name.
for url in urls:
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'lxml')
    restaurants = soup.find_all('div', class_='categoryBusinessListWrapper___14CgD')
    for restaurant in restaurants:
        tags = restaurant.find_all('a', class_='internal___1jK0Z wrapper___26yB4')
        for tag in tags:
            # Business title text up to the first comma.
            restaurant_name = tag.find('div', class_='businessTitle___152-c').text.split(',')[0]
            # These two are the matched elements themselves (or None), not text.
            ratings = tag.find('div', class_='textRating___3F1NO')
            location = tag.find('span', class_='locationZipcodeAndCity___33EfU')
            # Link target of the business card.
            more_info = tag['href']
As you can see, I create a URLs list to store the URLs of the different pages of this website. Is there a way to automate this? I use BeautifulSoup and the requests module for scraping, and I want to know whether the URLs for the different pages can be generated automatically.
You can look at the pagination at the bottom of the page and use list comprehension to create those links:
import requests
from bs4 import BeautifulSoup
import re
# Fetch the first listing page, count the anchors whose class matches
# "pagination", and build one listing URL per counted anchor.
url = 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&status=all&timeperiod=0'
regex = re.compile('pagination')
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
pagination_anchors = soup.find_all("a", {"class": regex})
pages = len(pagination_anchors)
# Page numbers run from 1 to ``pages`` inclusive.
links = []
for page_number in range(1, pages + 1):
    links.append('https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page={page}&status=all&timeperiod=0'.format(page=page_number))
Output:
print (links)
['https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=1&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=2&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=3&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=4&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=5&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=6&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=7&status=all&timeperiod=0', 'https://www.trustpilot.com/categories/restaurants_bars?numberofreviews=0&page=8&status=all&timeperiod=0']

Not able to use BeautifulSoup to get span content of Nasdaq100 future

# Snippet from the question: fetch the NQU18 quote page and print the text
# of the "last-change" span.
# The import statement must sit on one line.
from bs4 import BeautifulSoup
import re
import requests

url = 'www.barchart.com/futures/quotes/NQU18'
r = requests.get("https://" + url)
data = r.text
# No parser is named here, so bs4 chooses one itself.
soup = BeautifulSoup(data)
# The attribute value uses plain ASCII quotes; the typographic quotes in the
# earlier listing were a syntax error.
price = soup.find('span', {'class': 'last-change',
                           'data-ng-class': "highlightValue('priceChange')"}).text
print(price)
Result:
[[ item.priceChange ]]
It is not the span content. The result should be price. Where am I going wrong?
The following is the span tag of the page:
2nd screenshot: How can I get the time?
Use price = soup.find('span', {'class': 'up'}).text instead to get the +X.XX value:
from bs4 import BeautifulSoup
import requests
# Download the NQU18 quote page and print the text of the first
# <span class="up"> element.
url = 'www.barchart.com/futures/quotes/NQU18'
response = requests.get("https://" + url)
soup = BeautifulSoup(response.text, "lxml")
up_span = soup.find('span', {'class': 'up'})
price = up_span.text
print(price)
Output currently is:
+74.75
The tradeTime you seek seems to not be present in the page_source, since it's dynamically generated through JavaScript. You can, however, find it elsewhere if you're a little clever, and use the json library to parse the JSON data from a certain script element:
import json
# The <script id="barchart-www-inline-data"> element carries JSON text;
# decode it and print the quote's trade time. ``soup`` comes from the
# previous snippet.
inline_script = soup.find('script', {"id": 'barchart-www-inline-data'})
json_data = json.loads(inline_script.text)
quote = json_data["NQU18"]["quote"]
print(quote["tradeTime"])
This outputs:
2018-06-14T18:14:05
If these don't solve your problem then you will have to resort to something like Selenium that can run JavaScript to get what you're looking for:
from selenium import webdriver
# Load the quote page in a real browser so its JavaScript runs, then print
# the text of the element selected by XPath.
driver = webdriver.Chrome()
url = ("https://www.barchart.com/futures/quotes/NQU18")
driver.get(url)
# XPath attribute tests are written ``@id``; ``#id`` is not valid XPath.
# find_element(by, value) replaces find_element_by_xpath, which Selenium 4
# removed; the locator strategy "xpath" is the value of By.XPATH.
result = driver.find_element("xpath", '//*[@id="main-content-column"]/div/div[1]/div[2]/span[2]/span[1]')
print(result.text)
Currently the output is:
-13.00

Not able to parse webpage contents using beautiful soup

I have been using Beautiful Soup to parse webpages for data extraction, and so far it has worked perfectly well for other webpages. However, I'm trying to count the number of <a> tags on this page:
from bs4 import BeautifulSoup
import requests
# Build the category page URL, fetch it without any extra request headers,
# and print how many <a> tags the parsed document contains.
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
# This is the page being parsed; the printed URL is the expected one.
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Number of anchor tags found in the document.
j = len(soup.find_all('a'))
print(j)
I'm getting 0 as the output. This makes me think that the two lines after r = requests.get(url) are probably not working (there is obviously no chance that the page contains zero <a> tags), and I'm not sure what alternative solution I can use here. Does anybody have a solution, or has anyone faced a similar problem before?
Thanks, in advance.
You need to pass some of the information along with the request to the server.
Following code should work...You can play along with other parameter as well
from bs4 import BeautifulSoup
import requests
# Same request as in the question, but sent with a User-agent header; prints
# the URL and then the number of <a> tags in the response.
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
print(url)
# Header sent along with the request.
headers = {'User-agent': 'Mozilla/5.0'}
r = requests.get(url, headers=headers)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
# Count the anchors one by one.
j = sum(1 for _ in soup.find_all('a'))
print(j)
Put any url in the parser and check the number of "a" tags available on that page:
from bs4 import BeautifulSoup
import requests
# Request the page with a User-agent header and print the number of <a>
# elements selected from the parsed document.
url_base = "http://www.dnaindia.com/cricket?page=1"
request_headers = {'User-agent': 'Existed'}
res = requests.get(url_base, headers=request_headers)
soup = BeautifulSoup(res.text, 'html.parser')
a_tag = soup.select('a')
anchor_count = len(a_tag)
print(anchor_count)

is it possible to scrape data from website which give data while scrolling it using Beautiful Soup of Python

I want to extract data from a website that loads more content as you scroll. The number of watches I get is only 52, because only the first 52 watches are present before any scrolling happens. How can I get all the data?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
# Url from which i will scrap my data
# Listing page the products are scraped from.
myUrl = "https://www.titan.co.in/shop-online/watches/titan"

# --- Opening the connection and grabbing the page ---
# The context manager closes the connection even when read() raises.
with uReq(myUrl) as openUrl:
    page_html = openUrl.read()

# --- HTML parsing ---
page_soup = soup(page_html, 'html.parser')
# Grab all products.
containers = page_soup.find_all("div", {"class": "product"})
print("Number of watches found:: ", len(containers))

# Detail-page URL of every product found on the listing page.
newUrl = []
for contains in containers:
    product_page_link_container = contains.find('a', {'class': 'product_page_link'})
    product_detail_url = product_page_link_container['href']
    # NOTE(review): plain concatenation assumes the href is a fragment meant
    # to be appended to the listing URL; confirm against the page markup.
    detail_page_url = myUrl + product_detail_url
    newUrl.append(detail_page_url)
    print(detail_page_url)
    # Fetch and parse the product's detail page.
    with uReq(detail_page_url) as openurl:
        pageHtml = openurl.read()
    pageSoup = soup(pageHtml, 'html.parser')
Scrolling is handled by javascript.
If you want to be able to scrape data that appears while scrolling, you should use a browser automation tool like Selenium.

Resources