Optimise scraping and requesting web pages - python-3.x

How should I optimise my time in making requests
link = ['http://youtube.com/watch?v=JfLt7ia_mLg',
        'http://youtube.com/watch?v=RiYRxPWQnbE',
        'http://youtube.com/watch?v=tC7pBOPgqic',
        'http://youtube.com/watch?v=3EXl9xl8yOk',
        'http://youtube.com/watch?v=3vb1yIBXjlM',
        'http://youtube.com/watch?v=8UBY0N9fWtk',
        'http://youtube.com/watch?v=uRPf9uDplD8',
        'http://youtube.com/watch?v=Coattwt5iyg',
        'http://youtube.com/watch?v=WaprDDYFpjE',
        'http://youtube.com/watch?v=Pm5B-iRlZfI',
        'http://youtube.com/watch?v=op3hW7tSYCE',
        'http://youtube.com/watch?v=ogYN9bbU8bs',
        'http://youtube.com/watch?v=ObF8Wz4X4Jg',
        'http://youtube.com/watch?v=x1el0wiePt4',
        'http://youtube.com/watch?v=kkeMYeAIcXg',
        'http://youtube.com/watch?v=zUdfNvqmTOY',
        'http://youtube.com/watch?v=0ONtIsEaTGE',
        'http://youtube.com/watch?v=7QedW6FcHgQ',
        'http://youtube.com/watch?v=Sb33c9e1XbY']
I have a list of 15-20 links from the first page of a YouTube search result. The task is to get the likes, dislikes and view count from each video URL, and this is what I have done:
def parse(url, i, arr):
    req = requests.get(url)
    soup = bs4.BeautifulSoup(req.text, "lxml")  # or 'html5lib'
    try:
        likes = int(soup.find("button", attrs={"title": "I like this"}).getText().replace(",", ""))
    except:
        likes = 0
    try:
        dislikes = int(soup.find("button", attrs={"title": "I dislike this"}).getText().replace(",", ""))
    except:
        dislikes = 0
    try:
        view = int(soup.find("div", attrs={"class": "watch-view-count"}).getText().split()[0].replace(",", ""))
    except:
        view = 0
    arr[i] = (likes, dislikes, view, url)
    time.sleep(0.3)

def parse_list(link):
    arr = len(link) * [0]
    threadarr = len(link) * [0]
    import threading
    a = time.clock()
    for i in range(len(link)):
        threadarr[i] = threading.Thread(target=parse, args=(link[i], i, arr))
        threadarr[i].start()
    for i in range(len(link)):
        threadarr[i].join()
    print(time.clock() - a)
    return arr

arr = parse_list(link)
Now I am getting the populated result array in about 6 seconds. Is there any faster way to get my array (arr), so that it takes significantly less than 6 seconds?
The first 4 elements of my array look like this, so you get a rough idea:
[(105, 11, 2836, 'http://youtube.com/watch?v=JfLt7ia_mLg'),
(32, 18, 5420, 'http://youtube.com/watch?v=RiYRxPWQnbE'),
(45, 3, 7988, 'http://youtube.com/watch?v=tC7pBOPgqic'),
(106, 38, 4968, 'http://youtube.com/watch?v=3EXl9xl8yOk')]
Thanks in advance :)

I would use a multiprocessing Pool object for that particular case.
import requests
import bs4
from multiprocessing import Pool, cpu_count
links = [
'http://youtube.com/watch?v=JfLt7ia_mLg',
'http://youtube.com/watch?v=RiYRxPWQnbE',
'http://youtube.com/watch?v=tC7pBOPgqic',
'http://youtube.com/watch?v=3EXl9xl8yOk'
]
def parse_url(url):
    req = requests.get(url)
    soup = bs4.BeautifulSoup(req.text, "lxml")  # or 'html5lib'
    try:
        likes = int(soup.find("button", attrs={"title": "I like this"}).getText().replace(",", ""))
    except:
        likes = 0
    try:
        dislikes = int(soup.find("button", attrs={"title": "I dislike this"}).getText().replace(",", ""))
    except:
        dislikes = 0
    try:
        view = int(soup.find("div", attrs={"class": "watch-view-count"}).getText().split()[0].replace(",", ""))
    except:
        view = 0
    return (likes, dislikes, view, url)

pool = Pool(cpu_count())  # number of worker processes
data = pool.map(parse_url, links)  # this is where your results are
This is cleaner as you only have one function to write and you end up with exactly the same results.
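Since the work here is I/O-bound (the workers mostly wait on HTTP responses), a thread pool is usually just as fast as separate processes and cheaper to spin up. A minimal sketch, assuming the same parse_url function and links list from the snippet above:

from multiprocessing.pool import ThreadPool

# parse_url and links are reused from the snippet above.
# Threads suffice because each worker spends most of its time waiting on the network.
with ThreadPool(8) as pool:  # 8 worker threads; tune to your number of links
    data = pool.map(parse_url, links)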

This is not a workaround, but it spares your script the try/except blocks, which add clutter and some overhead whenever they trigger.
for url in links:
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html.parser")
    for item in soup.select("div#watch-header"):
        view = item.select("div.watch-view-count")[0].text
        likes = item.select("button[title~='like'] span.yt-uix-button-content")[0].text
        dislikes = item.select("button[title~='dislike'] span.yt-uix-button-content")[0].text
        print(view, likes, dislikes)

Related

How do I loop from one page to another in my web scraping project? It should scrape data from all 250 pages, but it stops on the first page

# This is part of the scraping code, but it does not loop past the first page. Please help me loop through all 250 pages of the Etsy e-commerce website.
URL = f'https://www.etsy.com/in-en/c/jewelry/earrings/ear-jackets-and-climbers?ref=pagination&page={page}'
try:
    # Count for every page of the website
    URL = URL.format(page)
    browser.get(URL)
    print("Scraping Page:", page)
    # XPath of the product table
    PATH_1 = '//*[@id="content"]/div/div[1]/div/div[3]/div[2]/div[2]/div[9]/div/div/div'
    # getting total items
    items = browser.find_element(By.XPATH, PATH_1)
    items = items.find_elements(By.TAG_NAME, 'li')
    # available items on the page
    end_product = len(items)
    # Count for every product of the page
    for product in range(0, end_product):
        print("Scraping reviews for product", product + 1)
        # clicking on the product
        try:
            items[product].find_element(By.TAG_NAME, 'a').click()
        except:
            print('Product link not found')
        # switch the focus of the driver to the new tab
        windows = browser.window_handles
        browser.switch_to.window(windows[1])
        try:
            PATH_2 = '//*[@id="reviews"]/div[2]/div[2]'
            count = browser.find_element(By.XPATH, PATH_2)
            # Number of reviews on any page
            count = count.find_elements(By.CLASS_NAME, 'wt-grid wt-grid--block wt-mb-xs-0')
            for r1 in range(1, len(count) + 1):
                dat1 = browser.find_element(By.XPATH,
                    '//*[@id="reviews"]/div[2]/div[2]/div[1]/div[1]/p'.format(r1)).text
                if dat1[:dat1.find(',') - 6] not in person:
                    try:
                        person.append(dat1[:dat1.find(',') - 6])
                        date.append(dat1[dat1.find(',') - 6:])
                    except Exception:
                        person.append("Not Found")
                        date.append("Not Found")
                    try:
                        stars.append(browser.find_element(By.XPATH,
                            '//*[@id="reviews"]/div[2]/div[2]/div[1]/div[2]/div[1]/div/div/span/span[2]'.format(r1)).text[0])
                    except Exception:
                        stars.append("No stars")
        except Exception:
            browser.close()
        # switching focus to the main tab
        browser.switch_to.window(windows[0])
        # export data after every product
        # export_data()
except Exception as e_1:
    print(e_1)
    print("Program stopped:")
    export_data()
    browser.quit()

# defining the main function
def main():
    logging.basicConfig(filename='solution_etsy.log', level=logging.INFO)
    logging.info('Started')
    if 'page.txt' in os.listdir(os.getcwd()):
        with open('page.txt', 'r') as file1:
            page = int(file1.read())
        for i in range(1, 250):
            run_scraper(i, browser)
    else:
        for i in range(1, 250):
            with open('page.txt', 'w') as file:
                file.write(str(i))
            run_scraper(i, browser)
    export_data()
    print("--- %s seconds ---" % (time.time() - start_time))
    logging.info('Finished')

# Calling the main function
if __name__ == '__main__':
    main()
So in this code, please help me loop from one page to another; where do I apply the loop?
stud = 'https://www.etsy.com/in-en/c/jewelry/earrings/ear-jackets-and-climbers?ref=pagination&page={}'
from time import sleep
from tqdm.notebook import tqdm

for i in tqdm(range(1, 250)):
    url_pages = stud.format(i)
    browser.get(url_pages)
    sleep(4)  # sleep(4) pauses the code for 4 seconds so the entire page is loaded; adjust it according to your internet speed
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # function or anything else you want to apply goes here
Then follow your steps as you wish; this will load all the pages.
If it still does not work, have a look at how I scraped data from multiple pages on a similar site. GitHub link to my solution: https://github.com/PullarwarOm/devtomanager.com-Web-Scraping/blob/main/devtomanager%20Final.ipynb
The scraped page is similar to the page you are trying to scrape, so go through this ipynb file. Thank you.
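If the fixed sleep(4) turns out to be flaky, an explicit wait is a common alternative: it blocks only until a known element appears rather than for a fixed number of seconds. A minimal sketch, assuming the same browser and stud variables as above; the CSS selector is a hypothetical placeholder you would replace after inspecting the page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

for i in range(1, 250):
    browser.get(stud.format(i))
    # Wait up to 10 seconds for the listing container instead of sleeping a fixed time.
    # 'div.search-listings-group' is a hypothetical selector; substitute the real one.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-listings-group'))
    )
    html = browser.page_source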

Selenium can't find a CSS selector

Selenium catches a NoSuchElementException after retrieving exactly 9 entries from the website. I think the problem might be that the page contents don't have enough time to load, but I'm not sure.
I've written the code following this YouTube tutorial (nineteenth minute).
import requests
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
URL = 'https://www.alibaba.com//trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=white+hoodie'
time.sleep(1)
driver.get(URL)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
items = driver.find_elements_by_css_selector('.J-offer-wrapper')
num = 1
for i in items:
    print(num)
    product_name = i.find_element_by_css_selector('h4').text
    price = i.find_element_by_css_selector('.elements-offer-price-normal').text
    time.sleep(0.5)
    num += 1
    print(price, product_name)
#driver.close()
If you have a clue why Selenium stops at the 10th entry and how to overcome this issue, please share.
You are getting that because the 10th item is not like the rest. It's an ad thingy and not a hoodie as you've searched for. I suspect you'd want to exclude this so you are left only with the results you are actually interested in.
All you need to do is change the way you identify items (this just one of the options):
items = driver.find_elements_by_css_selector('.img-switcher-parent')
You also need to update the error handling, as below:
for i in items:
    print(num)
    try:
        product_name = i.find_element_by_css_selector('h4').text
    except:
        product_name = ''
    try:
        price = i.find_element_by_css_selector('.elements-offer-price-normal').text
    except:
        price = ''
    time.sleep(0.5)
    num += 1
    print(price, product_name)
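As a side note, Selenium 4.3 and newer removed the find_element_by_* helpers, so the same loop would use the By API instead. A sketch under that assumption, reusing the selectors from the answer above:

from selenium.webdriver.common.by import By

for i in items:
    try:
        product_name = i.find_element(By.CSS_SELECTOR, 'h4').text
    except Exception:
        product_name = ''
    try:
        price = i.find_element(By.CSS_SELECTOR, '.elements-offer-price-normal').text
    except Exception:
        price = ''
    print(price, product_name)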

BS4 script not consistently scraping target value, not generating an error

A while back I created a BS4 script to scrape individual stock ticker market values from Yahoo Finance, the purpose being to update a personal finance program (individual use, not commercial).
The program worked flawlessly for months, but recently it stopped working consistently; it now appears to have a 25-50% success rate. The errors the script does generate come from the fact that a value was not obtained. I cannot figure out why a value is scraped on one execution but not on another execution of the same script.
In other words, each time I run the script it sometimes works and sometimes doesn't. I have adjusted the script to execute on a single user-input ticker instead of pulling a list from a database. Any thoughts as to where I am going wrong?
One attempt at debugging was adding print(soup), the idea being to ensure something was being obtained, which it appears to be. However, the soup.find_All() call seems to be the point of random success.
[As an aside, I may switch to an API in the future, but for educational purposes and as a proof of concept I want to get this to work.]
from bs4 import BeautifulSoup
import ssl
import os
import time
from urllib.request import Request, urlopen

def scrape_value(ticker):
    ticker_price = ""
    # For ignoring SSL certificate errors
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    print(f"ticker: {ticker} - before scrape")
    url = f'https://finance.yahoo.com/quote/{ticker.upper()}?p={ticker.upper()}&.tsrc=fin-srch'
    req = Request(url, headers={'User-Agent': 'Chrome/79.0.3945.130'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    print(soup)
    for span in soup.find_All('span',
                              attrs={'class': "Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)"}):
        ticker_price = span.text.strip()
    print(ticker_price)
    return ticker_price

if __name__ == '__main__':
    scrape_value('F')
Use a different approach to select the element. Observing the HTML source, the quote is in a <span> with data-reactid="32":
import requests
from bs4 import BeautifulSoup

def scrape_value(ticker):
    url = f'https://finance.yahoo.com/quote/{ticker.upper()}?p={ticker.upper()}&.tsrc=fin-srch'
    webpage = requests.get(url, headers={'User-Agent': 'Chrome/79.0.3945.130'}).text
    soup = BeautifulSoup(webpage, 'html.parser')
    ticker_price = soup.find('span', {'data-reactid': "32"}).text
    return ticker_price

if __name__ == '__main__':
    for ticker in ['F', 'AAPL', 'MSFT']:
        print('{} - {}'.format(ticker, scrape_value(ticker)))
Prints:
F - 6.90
AAPL - 369.44
MSFT - 201.57

Sending GET requests to amazon.in but the webserver responded with response code 503, what to do?

Here is my code. The whole script worked fine the first 2-3 times, but now it constantly gets 503 responses. I checked my internet connection multiple times and there was no problem with it.
from bs4 import BeautifulSoup
import requests, sys, os, json

def get_amazon_search_page(search):
    # spaces are replaced with "+" so the string can be used in the site's search URL
    search = search.strip().replace(" ", "+")
    for i in range(3):  # tries to connect and GET-request Amazon 3 times
        try:
            print("Searching...")
            response = requests.get("https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
            print(response.status_code)
            if response.status_code == 200:
                return response.content, search
        except Exception:
            pass
    print("Is the search valid for the site: https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
    sys.exit(1)

def get_items_from_page(page_content):
    print(page_content)
    soup = BeautifulSoup(page_content, "html.parser")  # soup for extracting information
    items = soup.find_all("span", class_="a-size-medium a-color-base a-text-normal")
    prices = soup.find_all("span", class_="a-price-whole")
    item_list = []
    total_price_of_all = 0
    for item, price in zip(items, prices):
        dict = {}
        dict["Name"] = item.text
        dict["Price"] = int(price.text)
        total_price_of_all += int(price.text.replace(",", ""))
        item_list.append(dict)
    average_price = total_price_of_all / len(item_list)
    file = open("items.json", "w")
    json.dump(item_list, file, indent=4)
    print("Your search results are available in the items.json file")
    print("Average prices for the search: {}".format(average_price))
    file.close()

def main():
    os.system("clear")
    print("Note: Sometimes amazon site misbehaves by sending 503 responses, this can be due to heavy traffic on that site, please cooperate\n\n")
    search = input("Enter product name: ").strip()
    page_content = get_amazon_search_page(search)
    get_items_from_page(page_content)

if __name__ == "__main__":
    while True:
        main()
Please help!
The server blocks you from scraping it.
If you check the robots.txt, you can see that the link you are trying to request is disallowed:
Disallow: */s?k=*&rh=n*p_*p_*p_
However, a simple way to bypass this blocking would be to change your User-Agent (see here). By default, requests sends something like this "python-requests/2.22.0". Changing it to something more browser-like would temporarily work.
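For example, a minimal sketch of the same request sent with a browser-like User-Agent (the exact header string is just an example; any current browser's UA works):

import requests

headers = {
    # Example browser User-Agent string; substitute the UA of any recent browser.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
}
response = requests.get(
    "https://www.amazon.in/s?k={}&ref=nb_sb_noss".format("white+hoodie"),
    headers=headers,
)
print(response.status_code)  # should be 200 rather than 503 if the block was UA-based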

Trying to scrape multiple pages sequentially using loop to change url

First off, sorry... I am sure this is a common problem, but I did not find a solution anywhere even though I searched for a while.
I am trying to create a list by scraping data from classicdb. The two problems I have are:
1. The scraping as written in the try block does not work inside the for loop, but on its own it works. Currently it just returns 0 even though there should be values to return.
2. The output I get from the try block generates new lists, but I want to just get the value and append it later.
I have tried the try block outside the for loop and there it worked.
I also saw some solutions where a while True was used, but that did not work for me.
from lxml.html import fromstring
import requests
import traceback
import time
from bs4 import BeautifulSoup as bs

Item_name = []
Sell_Copper = []
items = [47, 48]
url = 'https://classic.wowhead.com/item='
fails = []

for i in items:
    time.sleep(5)
    url1 = url + str(i)
    session = requests.session()
    response = session.get(url1)
    soup = bs(response.content, 'lxml')
    name = soup.select_one('h1').text
    print(name)
    # get the buy prices
    try:
        copper = soup.select_one('li:contains("Sells for") .moneycopper').text
    except Exception as e:
        copper = str(0)
The expected result would be that I get one value in gold and a list in P_Gold. In this case:
copper='1'
Sell_copper=['1','1']
You don't need a sleep. It needs to be div:contains, and the search text needs changing:
import requests
from bs4 import BeautifulSoup as bs

Item_name = []
Sell_Copper = []
items = [47, 48]
url = 'https://classic.wowhead.com/item='
fails = []

with requests.Session() as s:
    for i in items:
        response = s.get(url + str(i))
        soup = bs(response.content, 'lxml')
        name = soup.select_one('h1').text
        print(name)
        try:
            copper = soup.select_one('div:contains("Sell Price") .moneycopper').text
        except Exception as e:
            copper = str(0)
        print(copper)
