Got a basic Google webscraper that returns urls of the first google search page - I want it to include URLS on further pages. What's the best way to paginate this code so as it grabs URLS from pages 2,3,4,5,6,7 etc.
Don't want to go off into space with how many pages I scrap but definitely want more than the first page !
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
def get_source(url):
try:
session = HTMLSession()
response = session.get(url)
return response
except requests.exceptions.RequestException as e:
print(e)
def scrape_google(query):
query = urllib.parse.quote_plus(query)
response = get_source("https://www.google.co.uk/search?q=" + query)
links = list(response.html.absolute_links)
google_domains = ('https://www.google.',
'https://google.',
'https://webcache.googleusercontent.',
'http://webcache.googleusercontent.',
'https://policies.google.',
'https://support.google.',
'https://maps.google.')
for url in links[:]:
if url.startswith(google_domains):
links.remove(url)
return links
print(scrape_google('https://www.google.com/search?q=letting agent'))
You can iterate over a specific range() and set the start parameter by multiply the number of iteration by 10 - Save your results to a list and use set() to remove duplicates:
data = []
for i in range(3):
data.extend(scrape_google('letting agent', i*10))
set(data)
Example
import requests
def scrape_google(query,start):
response = get_source(f"https://www.google.co.uk/search?q={query}&start={start}")
links = list(response.html.absolute_links)
google_domains = ('https://www.google.',
'https://google.',
'https://webcache.googleusercontent.',
'http://webcache.googleusercontent.',
'https://policies.google.',
'https://support.google.',
'https://maps.google.')
for url in links[:]:
if url.startswith(google_domains):
links.remove(url)
return links
data = []
for i in range(3):
data.extend(scrape_google('letting agent', i*10))
print(set(data))
Output
{'https://www.lettingagenttoday.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://howsy.com/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.propertymark.co.uk/professional-standards/consumer-guides/landlords/what-does-a-letting-agent-do.html&prev=search&pto=aue', 'https://www.citizensadvice.org.uk/housing/renting-privately/during-your-tenancy/complaining-about-your-letting-agent/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.allagents.co.uk/find-agent/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.theonlinelettingagents.co.uk/&prev=search&pto=aue', 'https://www.which.co.uk/money/mortgages-and-property/buy-to-let/using-a-letting-agent-a16lu1w364rv', 'https://www.gov.uk/government/publications/non-resident-landord-guidance-notes-for-letting-agents-and-tenants-non-resident-landlords-scheme-guidance-notes', 'https://lettingagentregistration.gov.scot/renew', 'https://en.wikipedia.org/wiki/Letting_agent#Services_and_fees', 'https://patriciashepherd.co.uk/', 'https://dict.leo.org/englisch-deutsch/letting%20agent', 'https://www.diamonds-salesandlettings.co.uk/', 'https://www.lettingagentproperties.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.ukala.org.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://register.lettingagentregistration.gov.scot/search&prev=search&pto=aue', 'https://context.reverso.net/%C3%BCbersetzung/englisch-deutsch/letting+agent', 'https://www.cubittandwest.co.uk/landlord-guides/what-is-a-letting-agent/', 'https://en.wikipedia.org/wiki/Letting_agent', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://safeagents.co.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://charlesroseproperties.co.uk/news/letting-agent-vs-estate-agent-the-differences/&prev=search&pto=aue', 'https://www.tenantshop.co.uk/letting-agents/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://lettingagentregistration.gov.scot/renew&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.winkworth.co.uk/&prev=search&pto=aue', 'https://objego.de/lp-immobilienverwaltung/', 'https://www.facebook.com/agestateagents/videos/looking-to-instruct-a-letting-agent-not-sure-what-you-should-be-looking-for-or-w/688390845096579/', 'https://www.ukala.org.uk/', 'https://en.wikipedia.org/wiki/Letting_agent#Regulation', 'https://www.foxtons.co.uk/', 'https://ibizaprestige.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.which.co.uk/money/mortgages-and-property/buy-to-let/using-a-letting-agent-a16lu1w364rv&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.tenantshop.co.uk/letting-agents/&prev=search&pto=aue', 'https://www.dict.cc/?s=letting+agent', 'https://www.landlordaccreditationscotland.com/letting-agent-training/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.gov.uk/government/publications/non-resident-landord-guidance-notes-for-letting-agents-and-tenants-non-resident-landlords-scheme-guidance-notes&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.propertyinvestmentsuk.co.uk/what-is-a-letting-agent/&prev=search&pto=aue', 'https://www.propertyinvestmentsuk.co.uk/what-is-a-letting-agent/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.leaders.co.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://en.wikipedia.org/wiki/Letting_agent&prev=search&pto=aue', 'https://www.allagents.co.uk/find-agent/', 'https://www.leaders.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.foxtons.co.uk/&prev=search&pto=aue', 'https://howsy.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://patriciashepherd.co.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.lettingagenttoday.co.uk/&prev=search&pto=aue', 'https://register.lettingagentregistration.gov.scot/search', 'https://www.linguee.de/englisch-deutsch/uebersetzung/letting+agent.html', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.diamonds-salesandlettings.co.uk/&prev=search&pto=aue', 'https://www.theonlinelettingagents.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.lettingagentproperties.com/&prev=search&pto=aue', 'http://www.paul-partner.com/', 'https://www.homeday.de/de/homeday-makler/rhein-main-gebiet-sued/?utm_medium=seo&utm_source=gmb&utm_campaign=rhein_main_gebiet_sued', 'https://www.propertymark.co.uk/professional-standards/consumer-guides/landlords/what-does-a-letting-agent-do.html', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.citizensadvice.org.uk/housing/renting-privately/during-your-tenancy/complaining-about-your-letting-agent/&prev=search&pto=aue', 'https://safeagents.co.uk/', 'https://charlesroseproperties.co.uk/news/letting-agent-vs-estate-agent-the-differences/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.landlordaccreditationscotland.com/letting-agent-training/&prev=search&pto=aue', 'https://move.uk.net/', 'https://www.winkworth.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.cubittandwest.co.uk/landlord-guides/what-is-a-letting-agent/&prev=search&pto=aue'}
You can scrape Google Search Results using BeautifulSoup web scraping library without the need to use requests-html.
To extract all the results from all possible pages dynamically, we need to use while loop with a specific condition to exit the loop. It will go through all of them no matter how many pages there're. Basically, we don't hardcode page numbers to go from N to N pages.
In this case, pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector .d6cvqb a[id=pnnext], you need to increase the value of ["start"] by 10 to access the next page (non-token pagination), if present, otherwise, we need to exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
Google, like other sites, may block your request thinking you are a bot if you use requests, since the default user-agent library in requests is python-requests.
To avoid it, one of the steps could be to rotate user-agent, for example, to switch between PC, mobile, and tablet, as well as between browsers e.g. Chrome, Firefox, Safari, Edge and so on. The most reliable way is to use rotating proxies, user-agents, and a captcha solver.
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
"q": "letting agent", # query
"hl": "en", # language
"gl": "uk", # country of the search, UK -> United Kingdom
"start": 0, # number page by default up to 0
#"num": 100 # parameter defines the maximum number of results to return.
}
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
page_num = 0
website_data = []
while True:
page_num += 1
print(f"page: {page_num}")
html = requests.get("https://www.google.co.uk/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, 'lxml')
for result in soup.select(".tF2Cxc"):
title = result.select_one(".DKV0Md").text
website_link = result.select_one(".yuRUbf a")["href"]
try:
snippet = result.select_one(".lEBKkf span").text
except:
None
website_data.append({
"title": title,
"snippet": snippet,
"website_link": website_link
})
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
print(json.dumps(website_data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Letting agents in York Anderton McClements. Luxury Lets in ...",
"snippet": "Anderton McClements are the Letting Agents in York. We offer the best possible service in property letting in York. Contact us today.",
"website_link": "https://andertonmcclements.co.uk/"
},
{
"title": "Letting Agents near Swansea | Reviews - Yell",
"snippet": "Search for Letting Agents near you, or submit your own review. ... an experienced letting agent can help you discover your next property to let.",
"website_link": "https://www.yell.com/s/letting+agents-swansea.html"
},
other results...
]
As an alternative, you can use Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os
params = {
"api_key": os.getenv("API_KEY"), # serpapi key from https://serpapi.com/manage-api-key
"engine": "google", # serpapi parser engine
"q": "letting agent", # search query
"gl": "uk", # country of the search, UK -> United Kingdom
"num": "100" # number of results per page (100 per page in this case)
# other search parameters: https://serpapi.com/search-api#api-parameters
}
search = GoogleSearch(params) # where data extraction happens
organic_results_data = []
page_num = 0
while True:
results = search.get_dict() # JSON -> Python dictionary
page_num += 1
for result in results["organic_results"]:
organic_results_data.append({
"title": result.get("title"),
"snippet": result.get("snippet"),
"link": result.get("link")
})
if "next_link" in results.get("serpapi_pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
else:
break
print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
{
"title": "Appeal to private landlords to offer tenancy to those in need",
"snippet": "“If you are unsure if your property will be suitable, please call us to discuss and if you are a landlord who uses a letting agent and would ...",
"link": "https://newsroom.shropshire.gov.uk/2022/12/appeal-to-private-landlords-to-offer-tenancy-to-those-in-need/"
},
other results...
]
I am trying to scrape the data from the MouthShut.com user review. If I am looking at the Reviews Devtools the required text of the review is inside the following tag.- more review data
<div class="more reviewdata"> Ipohone 11 Pro X : Looks alike a minion having Three Eyes. yes its Seems as An Alien, But Technically Iphone is Copying features and Function of Androids and Having Custom Os Phones.Triple Camera is Great! for Wide Angle Photography.But The looks of Iphone 11 pro X isn't Good.If ...<a style="cursor:pointer" onclick="bindreviewcontent('2958778',this,false,'I found this review of Apple iPhone 11 Pro Max 512GB pretty useful',925993570,'.png','I found this review of Apple iPhone 11 Pro Max 512GB pretty useful %23WriteShareWin','https://www.mouthshut.com/review/Apple-iPhone-11-Pro-Max-512GB-review-omnstsstqun','Apple iPhone 11 Pro Max 512GB',' 1/5','omnstsstqun');">Read More</a></div>
I wanted to extract only the text content of the review, Can anybody help on how to extract as there is no unique separator for it do so.
I have done the following code :
from requests import get
bse_url = 'https://www.mouthshut.com/mobile-phones/Apple-iPhone-11-Pro-Max-reviews-925993567'
response = get(url)
print(response.text[:100])
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
reviews = html_soup.find_all('div', class_ = 'more reviewdata')
print(type(reviews))
print(len(reviews))
first_review = reviews[2]
first_review.div
To scrape all reviews from the page, you can use this example. Some larger reviews are scraped separately as POST request:
import re
import requests
from textwrap import wrap
from bs4 import BeautifulSoup
base_url = 'https://www.mouthshut.com/mobile-phones/Apple-iPhone-11-Pro-Max-reviews-925993567'
data = {
'type': 'review',
'reviewid': -1,
'corp': 'false',
'catname': ''
}
more_url = 'https://www.mouthshut.com/review/CorporateResponse.ashx'
output = []
with requests.session() as s:
soup = BeautifulSoup(s.get(base_url).text, 'html.parser')
for review in soup.select('.reviewdata'):
a = review.select_one('a[onclick^="bindreviewcontent"]')
if a:
data['reviewid'] = re.findall(r"bindreviewcontent\('(\d+)", a['onclick'])[0]
comment = BeautifulSoup( s.post(more_url, data=data).text, 'html.parser' )
comment.div.extract()
comment.ul.extract()
output.append( comment.get_text(separator=' ', strip=True) )
else:
review.div.extract()
output.append( review.get_text(separator=' ', strip=True) )
for i, review in enumerate(output, 1):
print('--- Review no.{} ---'.format(i))
print(*wrap(review), sep='\n')
print()
Prints:
--- Review no.1 ---
As you all know Apple products are too expensive this one is damn one
but who needs to sell his kidney to buy its look is not that much ease
than expected. For me it's 2 star phone
--- Review no.2 ---
Very disappointing product.nothing has changed in operating system,
only camera look has changed which is very odd looking.Device weight
is not light and dont fit in one hand.
--- Review no.3 ---
Ipohone 11 Pro X : Looks alike a minion having Three Eyes. yes its
Seems as An Alien, But Technically Iphone is Copying features and
Function of Androids and Having Custom Os Phones. Triple Camera is
Great! for Wide Angle Photography. But The looks of Iphone 11 pro X
isn't Good. If You Have 3 Kidneys, Then You Can Waste one of them to
... and so on.
I am trying to output the job's salary but it says need login to view. I can successfully output the other jobs' descriptions like the job title, company, location, etc. I have tried logged in with my account and logged out but it still says login to view salary. My question is, how do I show the salary which requires login to view? Need someone to help me.
import requests
from bs4 import BeautifulSoup
from mechanize import Browser
import http.cookiejar as cookielib
#creates browser
br = Browser()
#browser options
br.set_handle_robots(False) #ignore robots
br.set_handle_refresh(False) #can sometimes hang without this
br.addheaders = [('User-Agent', 'Firefox')]
login_url = "https://myjobstreet.jobstreet.com.my/home/login.php"
cj = cookielib.CookieJar()
br.set_cookiejar(cj)
response = br.open('https://myjobstreet.jobstreet.com.my/home/login.php')
#view available forms
for f in br.forms():
print(f)
br.select_form('login')
br.set_all_readonly(False) #allows everything to be written to
br.form['login_id'] = 'my_id'
br.form['password'] = 'my_password'
#submit current form
br.submit()
r = requests.get(url, headers=headers, auth=('user', 'pass'))
soup = BeautifulSoup(r.text, 'lxml')
jobs = soup.find_all("div", {"class": "rRow"})
for job in jobs:
try:
salary = job.find_all("div", {"class": "rRowLoc"})
job_salary = salary[0].text.strip()
except IndexError:
pass
print("Salary: ", job_salary)
This is the output:
Job: Sales Executive
Company: Company
Location: Earth
Salary: Login to view salary
Expected output:
Job: Sales Executive
Company: Company
Location: Earth
Salary: 1000
Your code is not working, but your goal is to scrape Company Name, Position, Location and Salary from page.
You can do your login process using requests.
Salary detail is not available into HTML because it is coming through Ajax request, So every time you find Salary into HTML it will be blank.
import requests
import bs4 as bs
headers = {
'Host': 'myjobstreet.jobstreet.com.my',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31',
}
login_url = 'https://myjobstreet.jobstreet.com.my/home/login.php?site=&language_code=3'
post_data_for_login = {
"referer_url":"",
"mobile_referer":"",
"login_id":"**YOUR EMAIL ID**",
"password":"**YOUR PASSWORD**",
"remember":"on",
"btn_login":"",
"login":"1"
}
# Create Session.
session = requests.session()
# Login request to get cookies.
response = session.post(login_url, data=post_data_for_login, headers=headers)
print('login_response:', response.status_code)
job_page_url = 'https://www.jobstreet.com.my/en/job/fb-service-team-4126557'
job_page_json_url = job_page_url + '/panels'
# Update Host in headers.
headers['Host'] = 'www.jobstreet.com.my'
# Get Job details.
response = session.get(job_page_url, headers=headers)
# Fetch Company Name, Position and Location details from HTML.
soup = bs.BeautifulSoup(response.text, 'lxml')
company_name = soup.find("div", {"id": "company_name"}).text.strip()
position_title = soup.find("h1", {"id": "position_title"}).text.strip()
work_location = soup.find("span", {"id": "single_work_location"}).text.strip()
print('Company:', company_name);print('Position:', position_title);print('Location:', work_location)
# Get Salary data From JSON.
response = session.get(job_page_json_url, headers=headers)
# Fetch Salary details from JSON.
if response.status_code == 200:
json_data = response.json()
salary_tag = json_data['job_salary']
soup = bs.BeautifulSoup(salary_tag, 'lxml')
salary_range = soup.find("span", {"id": "salary_range"}).text
print('Salary:', salary_range)
Output:
login_response: 200
Company: Copper Bar and Restaurant (88 Armenian Sdn Bhd)
Position: F&B Service Team
Location: Malaysia - Penang
Salary: MYR 2,000 - MYR 2,500
That code is not runnable. There are multiple issues I can see. You don't use login_url, the variables url and headers are not defined. You're instantiating a browser br, use it to login using br.open but then you stop using the browser. You should keep using the browser instead of requests.get. Your goal should be to get the cookies after login and keep using the cookies for the next page. I'm not familiar with mechanize, though this would be how you would get the html from an open.
response = br.open(url)
print(response.read()) # the text of the page
A better option might be to open developer tools, look at the network request, right-click it and click "copy as cURL". which will show you how to repeat the request at the commandline with cookies and all. See a better explanation plus gif at https://developers.google.com/web/updates/2015/05/replay-a-network-request-in-curl
I have a scrapy bot that runs from a script,My problem is:After the spyder has finished crawling,the program does not end,so basically the program runs for ever until I manually shut it down,now this spyder is part of a bigger program so i cannot afford to shut it down like that as other processes havent happened.So how do i shut it down safely.
i have already surfed stackoverflow and other forums for this and i got this and this,the first one is totally not usable,trust me,i have tried,the second one looked promising but for some reason,close spider doesnt seem to close my spider when i get the signal spider closed
Here is the bot:
def pricebot(prod_name):
class PriceBot(scrapy.Spider):
name = 'pricebot'
query = prod_name
if query.find(' ') is not -1:
query = query.replace(' ', '-')
start_urls = ['http://www.shopping.com/'+query+'/products?CLT=SCH']
def parse(self, response):
prices_container = response.css('div:nth-child(2) > span:nth-child(1) > a:nth-child(1)')
t_cont = response.css('div:nth-child(2)>h2:nth-child(1)>a:nth-child(1)>span:nth-child(1)')
title = t_cont.xpath('#title').extract()
price = prices_container.xpath('text()').extract()
#Sanitise prices results
prices = []
for p in price:
prices.append(p.strip('\n'))
#Grouping Prices To Their Actual Products
product_info = dict(zip(title, prices))
with open('product_info.json','w') as f:
f.write(json.dumps(product_info))
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(PriceBot)
process.start()
After it is done,i need to do other things,call 3 other functions to be exact
I've been trying to scrape a site that uses AJAX on link elements with onclick events to control page navigation. The scraper works for the first page, but never processes pages from there; so it seems not to be firing the POST request I build up.
I'm completely new to all of this (Python, scrapy, xPath, DOM), but my intuition is that I've mixed different structural patterns from different examples that are subtly incompatible?
I would also really appreciate some hints also on how better to debug this problem beyond (newbie) using the scrapy shell and outputting log messages.
My code:
import scrapy
from scrapy import FormRequest
class FansSpider(scrapy.Spider):
name = "fans"
allowed_domains = ['za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/']
start_urls = ['http://za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/']
def parse(self, response):
self.logger.info('Parse function called on %s', response.url)
for component in response.xpath('//tr[#class="resultRow"]'):
yield {
'id': component.xpath('.//a[#class="primarySearchLink"]/text()').extract_first().strip()
}
next_id = response.xpath('//a[#class="rightLink nextLink approverMessageTitle"]/#id').extract_first()
self.logger.info('Identified code of next URL as %s', next_id)
if next_id is not None:
first_id = response.xpath('//a[#class="rightLink nextLink approverMessageTitle"]/#onclick').\
extract_first().split(',')[1].strip('\'')
# POST the URL that is generated when clicking the next button
return [FormRequest.from_response(response,
url='http://za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/',
formdata={'AJAXREQUEST': '_viewRoot',
first_id: first_id,
'ajax-dimensions': '',
'ajax-request': 'true',
'ajax-sort-by': '',
'ajax-sort-order': '',
'ajax-attrSort': 'false',
'javax.faces.viewState': 'j_id1',
next_id: next_id},
callback=self.parse,
dont_filter=True,
dont_click=True,
method='POST'
)]
Additional information just in case it's relevant: I made these changes to the scrapy settings.py to avoid getting blocked by the webserver or getting banned:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
Thanks!