Google Webscraper (URLs) - including more than the first page in results - python-3.x
I've got a basic Google web scraper that returns the URLs from the first Google search results page. I want it to include URLs from further pages as well. What's the best way to paginate this code so that it grabs URLs from pages 2, 3, 4, 5, 6, 7 and so on?
I don't want to go off into space with how many pages I scrape, but I definitely want more than the first page!
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

def get_source(url):
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print(e)

def scrape_google(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)
    links = list(response.html.absolute_links)
    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')
    for url in links[:]:
        if url.startswith(google_domains):
            links.remove(url)
    return links

print(scrape_google('https://www.google.com/search?q=letting agent'))
You can iterate over a specific range() and set the start parameter by multiplying the iteration number by 10. Save your results to a list and use set() to remove duplicates:
data = []
for i in range(3):
    data.extend(scrape_google('letting agent', i*10))
set(data)
Example
import requests

def scrape_google(query, start):
    response = get_source(f"https://www.google.co.uk/search?q={query}&start={start}")
    links = list(response.html.absolute_links)
    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')
    for url in links[:]:
        if url.startswith(google_domains):
            links.remove(url)
    return links

data = []
for i in range(3):
    data.extend(scrape_google('letting agent', i*10))

print(set(data))
Output
{'https://www.lettingagenttoday.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://howsy.com/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.propertymark.co.uk/professional-standards/consumer-guides/landlords/what-does-a-letting-agent-do.html&prev=search&pto=aue', 'https://www.citizensadvice.org.uk/housing/renting-privately/during-your-tenancy/complaining-about-your-letting-agent/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.allagents.co.uk/find-agent/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.theonlinelettingagents.co.uk/&prev=search&pto=aue', 'https://www.which.co.uk/money/mortgages-and-property/buy-to-let/using-a-letting-agent-a16lu1w364rv', 'https://www.gov.uk/government/publications/non-resident-landord-guidance-notes-for-letting-agents-and-tenants-non-resident-landlords-scheme-guidance-notes', 'https://lettingagentregistration.gov.scot/renew', 'https://en.wikipedia.org/wiki/Letting_agent#Services_and_fees', 'https://patriciashepherd.co.uk/', 'https://dict.leo.org/englisch-deutsch/letting%20agent', 'https://www.diamonds-salesandlettings.co.uk/', 'https://www.lettingagentproperties.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.ukala.org.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://register.lettingagentregistration.gov.scot/search&prev=search&pto=aue', 'https://context.reverso.net/%C3%BCbersetzung/englisch-deutsch/letting+agent', 'https://www.cubittandwest.co.uk/landlord-guides/what-is-a-letting-agent/', 'https://en.wikipedia.org/wiki/Letting_agent', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://safeagents.co.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://charlesroseproperties.co.uk/news/letting-agent-vs-estate-agent-the-differences/&prev=search&pto=aue', 'https://www.tenantshop.co.uk/letting-agents/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://lettingagentregistration.gov.scot/renew&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.winkworth.co.uk/&prev=search&pto=aue', 'https://objego.de/lp-immobilienverwaltung/', 'https://www.facebook.com/agestateagents/videos/looking-to-instruct-a-letting-agent-not-sure-what-you-should-be-looking-for-or-w/688390845096579/', 'https://www.ukala.org.uk/', 'https://en.wikipedia.org/wiki/Letting_agent#Regulation', 'https://www.foxtons.co.uk/', 'https://ibizaprestige.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.which.co.uk/money/mortgages-and-property/buy-to-let/using-a-letting-agent-a16lu1w364rv&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.tenantshop.co.uk/letting-agents/&prev=search&pto=aue', 'https://www.dict.cc/?s=letting+agent', 'https://www.landlordaccreditationscotland.com/letting-agent-training/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.gov.uk/government/publications/non-resident-landord-guidance-notes-for-letting-agents-and-tenants-non-resident-landlords-scheme-guidance-notes&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.propertyinvestmentsuk.co.uk/what-is-a-letting-agent/&prev=search&pto=aue', 'https://www.propertyinvestmentsuk.co.uk/what-is-a-letting-agent/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.leaders.co.uk/&prev=search&pto=aue', 
'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://en.wikipedia.org/wiki/Letting_agent&prev=search&pto=aue', 'https://www.allagents.co.uk/find-agent/', 'https://www.leaders.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.foxtons.co.uk/&prev=search&pto=aue', 'https://howsy.com/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://patriciashepherd.co.uk/&prev=search&pto=aue', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.lettingagenttoday.co.uk/&prev=search&pto=aue', 'https://register.lettingagentregistration.gov.scot/search', 'https://www.linguee.de/englisch-deutsch/uebersetzung/letting+agent.html', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.diamonds-salesandlettings.co.uk/&prev=search&pto=aue', 'https://www.theonlinelettingagents.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.lettingagentproperties.com/&prev=search&pto=aue', 'http://www.paul-partner.com/', 'https://www.homeday.de/de/homeday-makler/rhein-main-gebiet-sued/?utm_medium=seo&utm_source=gmb&utm_campaign=rhein_main_gebiet_sued', 'https://www.propertymark.co.uk/professional-standards/consumer-guides/landlords/what-does-a-letting-agent-do.html', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.citizensadvice.org.uk/housing/renting-privately/during-your-tenancy/complaining-about-your-letting-agent/&prev=search&pto=aue', 'https://safeagents.co.uk/', 'https://charlesroseproperties.co.uk/news/letting-agent-vs-estate-agent-the-differences/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.landlordaccreditationscotland.com/letting-agent-training/&prev=search&pto=aue', 'https://move.uk.net/', 'https://www.winkworth.co.uk/', 'https://translate.google.co.uk/translate?hl=de&sl=en&u=https://www.cubittandwest.co.uk/landlord-guides/what-is-a-letting-agent/&prev=search&pto=aue'}
You can scrape Google Search results with the BeautifulSoup web scraping library, without needing requests-html.
To extract the results from all available pages dynamically, we can use a while loop with a specific condition to exit the loop. It will go through all of the pages no matter how many there are; we don't hardcode page numbers to go from page N to page N.
In this case, pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector .d6cvqb a[id=pnnext]). If it is present, we increase the value of params["start"] by 10 to access the next page (non-token pagination); otherwise, we exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
    params["start"] += 10
else:
    break
Google, like other sites, may block your request if it thinks you are a bot when you use requests, since the default User-Agent in the requests library is python-requests.
To avoid this, one step is to rotate the User-Agent, for example switching between PC, mobile, and tablet strings, as well as between browsers such as Chrome, Firefox, Safari, Edge and so on. The most reliable approach is to combine rotating proxies, rotating user-agents, and a CAPTCHA solver.
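For illustration (this sketch is not part of the original answer), a minimal way to rotate the User-Agent per request; the UA strings and the fetch() helper are only examples, not anything from the code below:

import random
import requests

# Illustrative pool of desktop/mobile user agents (assumption: any realistic,
# up-to-date strings would work here).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36",
]

def fetch(url, **kwargs):
    # Pick a different User-Agent on every call.
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=30, **kwargs)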
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "letting agent",  # query
    "hl": "en",            # language
    "gl": "uk",            # country of the search, UK -> United Kingdom
    "start": 0,            # page offset, incremented by 10 per page
    # "num": 100           # parameter defines the maximum number of results to return
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

page_num = 0
website_data = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.co.uk/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        title = result.select_one(".DKV0Md").text
        website_link = result.select_one(".yuRUbf a")["href"]

        try:
            snippet = result.select_one(".lEBKkf span").text
        except AttributeError:
            snippet = None

        website_data.append({
            "title": title,
            "snippet": snippet,
            "website_link": website_link
        })

    if soup.select_one('.d6cvqb a[id=pnnext]'):
        params["start"] += 10
    else:
        break

print(json.dumps(website_data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Letting agents in York Anderton McClements. Luxury Lets in ...",
    "snippet": "Anderton McClements are the Letting Agents in York. We offer the best possible service in property letting in York. Contact us today.",
    "website_link": "https://andertonmcclements.co.uk/"
  },
  {
    "title": "Letting Agents near Swansea | Reviews - Yell",
    "snippet": "Search for Letting Agents near you, or submit your own review. ... an experienced letting agent can help you discover your next property to let.",
    "website_link": "https://www.yell.com/s/letting+agents-swansea.html"
  },
  other results...
]
As an alternative, you can use the Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it bypasses blocks (including CAPTCHA) from Google, so there is no need to create a parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi key from https://serpapi.com/manage-api-key
    "engine": "google",               # serpapi parser engine
    "q": "letting agent",             # search query
    "gl": "uk",                       # country of the search, UK -> United Kingdom
    "num": "100"                      # number of results per page (100 per page in this case)
    # other search parameters: https://serpapi.com/search-api#api-parameters
}

search = GoogleSearch(params)  # where data extraction happens

organic_results_data = []
page_num = 0

while True:
    results = search.get_dict()  # JSON -> Python dictionary

    page_num += 1

    for result in results["organic_results"]:
        organic_results_data.append({
            "title": result.get("title"),
            "snippet": result.get("snippet"),
            "link": result.get("link")
        })

    if "next_link" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
    else:
        break

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
  {
    "title": "Appeal to private landlords to offer tenancy to those in need",
    "snippet": "“If you are unsure if your property will be suitable, please call us to discuss and if you are a landlord who uses a letting agent and would ...",
    "link": "https://newsroom.shropshire.gov.uk/2022/12/appeal-to-private-landlords-to-offer-tenancy-to-those-in-need/"
  },
  other results...
]
Related
Scraping website with xpath returns nothing
I'm trying to scrape the job positions from the following website: https://supersolid.com/careers. The data in question is: [Server Developer, Game Designer/Senior Game Designer, Marketing Artist (2D), Game Designer (New Concepts), Senior Server Developer]. I've tried the usual process of going into developer tools and seeing if there is an XHR file in the network tab that I could use with all the roles in it. I then tried to scrape it using XPath:

data = []

url = "https://supersolid.com/careers"
page = requests.get(url)
tree = html.fromstring(page.content)

xpath = '/html/body/main/section[2]/div/div/div[5]/div/h4'

jobs = tree.xpath(xpath)
print(len(jobs))

I use print(len(jobs)) and it returns 0. Not too sure what else I could do.
Specify a User-Agent in the HTTP request:

import requests
from lxml import html

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0"
}

url = "https://supersolid.com/careers"
page = requests.get(url, headers=headers)
tree = html.fromstring(page.content)

xpath = ".//h4"

jobs = tree.xpath(xpath)
print([j.text for j in jobs])

Prints:

['Server Developer', 'Game Designer/Senior Game Designer', 'Marketing Artist (2D)', 'Game Designer (New Concepts)', 'Senior Server Developer']
Try BeautifulSoup:

from bs4 import BeautifulSoup
import requests

data = []

url = "https://supersolid.com/careers"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

jobs = soup.find_all('h4')
print(len(jobs))
Web scraping with Python that requires login to view output
I am trying to output a job's salary, but it says I need to log in to view it. I can successfully output the other job details like the job title, company, location, etc. I have tried logging in with my account and logging out, but it still says "Login to view salary". My question is: how do I show the salary that requires a login to view? Need someone to help me.

import requests
from bs4 import BeautifulSoup
from mechanize import Browser
import http.cookiejar as cookielib

# creates browser
br = Browser()

# browser options
br.set_handle_robots(False)    # ignore robots
br.set_handle_refresh(False)   # can sometimes hang without this
br.addheaders = [('User-Agent', 'Firefox')]

login_url = "https://myjobstreet.jobstreet.com.my/home/login.php"
cj = cookielib.CookieJar()
br.set_cookiejar(cj)

response = br.open('https://myjobstreet.jobstreet.com.my/home/login.php')

# view available forms
for f in br.forms():
    print(f)

br.select_form('login')
br.set_all_readonly(False)  # allows everything to be written to
br.form['login_id'] = 'my_id'
br.form['password'] = 'my_password'

# submit current form
br.submit()

r = requests.get(url, headers=headers, auth=('user', 'pass'))
soup = BeautifulSoup(r.text, 'lxml')

jobs = soup.find_all("div", {"class": "rRow"})
for job in jobs:
    try:
        salary = job.find_all("div", {"class": "rRowLoc"})
        job_salary = salary[0].text.strip()
    except IndexError:
        pass
    print("Salary: ", job_salary)

This is the output:

Job: Sales Executive
Company: Company
Location: Earth
Salary: Login to view salary

Expected output:

Job: Sales Executive
Company: Company
Location: Earth
Salary: 1000
Your code is not working, but your goal is to scrape the Company Name, Position, Location and Salary from the page. You can do the login process using requests. The salary detail is not in the HTML because it is loaded through an Ajax request, so whenever you look for the salary in the HTML it will be blank.

import requests
import bs4 as bs

headers = {
    'Host': 'myjobstreet.jobstreet.com.my',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31',
}

login_url = 'https://myjobstreet.jobstreet.com.my/home/login.php?site=&language_code=3'
post_data_for_login = {
    "referer_url": "",
    "mobile_referer": "",
    "login_id": "**YOUR EMAIL ID**",
    "password": "**YOUR PASSWORD**",
    "remember": "on",
    "btn_login": "",
    "login": "1"
}

# Create session.
session = requests.session()

# Login request to get cookies.
response = session.post(login_url, data=post_data_for_login, headers=headers)
print('login_response:', response.status_code)

job_page_url = 'https://www.jobstreet.com.my/en/job/fb-service-team-4126557'
job_page_json_url = job_page_url + '/panels'

# Update Host in headers.
headers['Host'] = 'www.jobstreet.com.my'

# Get job details.
response = session.get(job_page_url, headers=headers)

# Fetch Company Name, Position and Location details from HTML.
soup = bs.BeautifulSoup(response.text, 'lxml')
company_name = soup.find("div", {"id": "company_name"}).text.strip()
position_title = soup.find("h1", {"id": "position_title"}).text.strip()
work_location = soup.find("span", {"id": "single_work_location"}).text.strip()
print('Company:', company_name)
print('Position:', position_title)
print('Location:', work_location)

# Get salary data from JSON.
response = session.get(job_page_json_url, headers=headers)

# Fetch salary details from JSON.
if response.status_code == 200:
    json_data = response.json()
    salary_tag = json_data['job_salary']
    soup = bs.BeautifulSoup(salary_tag, 'lxml')
    salary_range = soup.find("span", {"id": "salary_range"}).text
    print('Salary:', salary_range)

Output:

login_response: 200
Company: Copper Bar and Restaurant (88 Armenian Sdn Bhd)
Position: F&B Service Team
Location: Malaysia - Penang
Salary: MYR 2,000 - MYR 2,500
That code is not runnable. There are multiple issues I can see: you don't use login_url, and the variables url and headers are not defined. You're instantiating a browser br and using it to log in with br.open, but then you stop using the browser. You should keep using the browser instead of requests.get. Your goal should be to get the cookies after login and keep using those cookies for the next page. I'm not familiar with mechanize, but this is how you would get the HTML from an open:

response = br.open(url)
print(response.read())  # the text of the page

A better option might be to open developer tools, look at the network request, right-click it and click "Copy as cURL", which will show you how to repeat the request at the command line with cookies and all. See a better explanation plus a gif at https://developers.google.com/web/updates/2015/05/replay-a-network-request-in-curl
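As a rough, untested sketch of "keep using the browser" (the rRow/rRowLoc selectors come from the question; the listings URL is a hypothetical placeholder, not something from the original answer):

from bs4 import BeautifulSoup

# After br.submit(), the mechanize browser still holds the session cookies,
# so fetch the listings page with the same browser object instead of requests.get.
listings_url = "https://myjobstreet.jobstreet.com.my/..."  # hypothetical listings URL

response = br.open(listings_url)
html = response.read()

soup = BeautifulSoup(html, "lxml")
for job in soup.find_all("div", {"class": "rRow"}):
    salary = job.find_all("div", {"class": "rRowLoc"})
    if salary:
        print("Salary:", salary[0].text.strip())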
How to scrape images from DuckDuckGo's image search results in Python
I'm creating an application with python that's going to show images scraped from DuckDuckGo's image search results. So I need to get a list of links to the images based on the search. The problem is that the HTML that constitutes DuckDuckGo's image search results does not contain any image tags but instead, the images seem to be stored in division tags. How can I with the help of python scrape those damn links to the images and store them in a variable on my program? What I want my variable to look like: image_links = ["https://duckduckgo.com/?q=duckduckgo&atb=v166-4_p&iax=images&ia=images&iai=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2F8%2F88%2FDuckDuckGo_logo.svg%2F1200px-DuckDuckGo_logo.svg.png","https://duckduckgo.com/?q=duckduckgo&atb=v166-4_p&iax=images&ia=images&iai=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2F8%2F88%2FDuckDuckGo_logo.svg%2F1200px-DuckDuckGo_logo.svg.png"] A visualization of DuckDuckGo's HTML structure in its image search results Edit: When I scrape the HTML from the URL by doing this: source = urllib.request.urlopen("https://duckduckgo.com/?q=duckduckgo&atb=v166-4_p&iax=images&ia=images").read() it doesn't return any image tags at all. I am checking that by doing this: source_tree = BeautifulSoup(source, 'html.parser') links = [img.get('src') for img in source_tree.find_all('img', _class='tile--img__img')] print(f"links: {links}") print(f"img in source_tree: {'img' in str(source_tree)}") print(f"source_tree: {source_tree}") Output: links: [] img in source_tree: False source_tree: <!DOCTYPE html> <html class="no-js has-zcm" lang="en_US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><title>duckduckgo at DuckDuckGo</title><link href="/s1775.css" rel="stylesheet" type="text/css"/><link href="/r1775.css" rel="stylesheet" type="text/css"/><meta content="noindex,nofollow" name="robots"/><meta content="origin" name="referrer"/><meta content="duckduckgo" name="apple-mobile-web-app-title"/><link href="/favicon.ico" rel="shortcut icon" sizes="16x16 24x24 32x32 64x64" type="image/x-icon"><link href="/assets/icons/meta/DDG-iOS-icon_60x60.png?v=2" id="icon60" rel="apple-touch-icon"><link href="/assets/icons/meta/DDG-iOS-icon_76x76.png?v=2" id="icon76" rel="apple-touch-icon" sizes="76x76"/><link href="/assets/icons/meta/DDG-iOS-icon_120x120.png?v=2" id="icon120" rel="apple-touch-icon" sizes="120x120"/><link href="/assets/icons/meta/DDG-iOS-icon_152x152.png?v=2" id="icon152" rel="apple-touch-icon" sizes="152x152"/><link href="/assets/icons/meta/DDG-icon_256x256.png" rel="image_src"/><script type="text/javascript">var ct,fd,fq,it,iqa,iqm,iqs,iqp,iqq,qw,dl,ra,rv,rad,r1hc,r1c,r2c,r3c,rfq,rq,rds,rs,rt,rl,y,y1,ti,tig,iqd,locale,settings_js_version='s2472.js',is_twitter='',rpl=0;fq=0;fd=1;it=0;iqa=0;iqbi=0;iqm=0;iqs=0;iqp=0;iqq=0;qw=1;dl='';ct='DK';iqd=0;r1hc=0;r1c=0;r3c=0;rq='duckduckgo';rqd="duckduckgo";rfq=0;rt='A';ra='';rv='';rad='';rds=30;rs=0;spice_version='1396';spice_paths='{}';locale='en_US';settings_url_params={};rl='wt-wt';rlo=0;df='';ds='';sfq='';iar='';vqd='3-146459744347044482638673072010848595657-89706121844226791728716680155105882500';safe_ddg=0;;</script><meta content="width=device-width, initial-scale=1" name="viewport"><meta content="true" name="HandheldFriendly"><meta content="no" name="apple-mobile-web-app-capable"/></meta></meta></link></link></head><body class="body--serp"><input id="state_hidden" name="state_hidden" size="1" type="text"/><span class="hide">Ignore this box please.</span><div 
id="spacing_hidden_wrapper"><div id="spacing_hidden"></div></div><script src="/lib/l113.js" type="text/javascript"></script><script src="/locale/en_US/duckduckgo10.js" type="text/javascript"></script><script src="/util/u345.js" type="text/javascript"></script><script src="/d2615.js" type="text/javascript"></script><div class="site-wrapper js-site-wrapper"><div class="header-wrap js-header-wrap" id="header_wrapper"><div class="welcome-wrap js-welcome-wrap"></div><div class="header cw" id="header"><div class="header__search-wrap"><a class="header__logo-wrap js-header-logo" href="/" tabindex="-1"><span class="header__logo js-logo-ddg">DuckDuckGo</span></a><div class="header__content header__search"><form action="/" class="search--adv search--header js-search-form" id="search_form" name="x"><input autocomplete="off" class="search__input search__input--adv js-search-input" id="search_form_input" name="q" tabindex="1" type="text" value="duckduckgo"/><input class="search__clear js-search-clear" id="search_form_input_clear" tabindex="3" type="button" value="X"><input class="search__button js-search-button" id="search_button" tabindex="2" type="submit" value="S"><a class="search__dropdown" href="javascript:;" id="search_dropdown" tabindex="4"></a><div class="search__hidden js-search-hidden" id="search_elements_hidden"></div></input></input></form></div></div><div class="zcm-wrap zcm-wrap--header is-noscript-hidden" id="duckbar"></div></div><div class="header--aside js-header-aside"></div></div><div class="zci-wrap" id="zero_click_wrapper"></div><div class="verticals" id="vertical_wrapper"></div><div class="content-wrap" id="web_content_wrapper"><div class="serp__top-right js-serp-top-right"></div><div class="serp__bottom-right js-serp-bottom-right"><div class="js-feedback-btn-wrap"></div></div><div class="cw"><div class="serp__results js-serp-results" id="links_wrapper"><div class="results--main"><div class="search-filters-wrap"><div class="js-search-filters search-filters"></div></div><noscript><meta content="0;URL=/html?q=duckduckgo" http-equiv="refresh"/><link href="/css/noscript.css" rel="stylesheet" type="text/css"/><div class="msg msg--noscript"><p class="msg-title--noscript">You are being redirected to the non-JavaScript site.</p>Click here if it doesn't happen automatically.</div></noscript><div class="results--message" id="message"></div><div class="ia-modules js-ia-modules"></div><div class="results--ads results--ads--main is-hidden js-results-ads" id="ads"></div><div class="results is-hidden js-results" id="links"></div></div><div class="results--sidebar js-results-sidebar"><div class="sidebar-modules js-sidebar-modules"></div><div class="is-hidden js-sidebar-ads"></div></div></div></div></div><div id="bottom_spacing2"> </div></div><script type="text/javascript"></script><script type="text/JavaScript">function nrji() 
{nrj('/t.js?q=duckduckgo&t=A&l=wt-wt&s=0&ct=DK&ss_mkt=us&p_ent=website&ex=-1');nrj('/d.js?q=duckduckgo&t=A&l=wt-wt&s=0&ct=DK&ss_mkt=us&vqd=3-146459744347044482638673072010848595657-89706121844226791728716680155105882500&atb=v166-4_p&p_ent=website&ex=-1&sp=0');DDH.wikipedia_fathead=DDH.wikipedia_fathead||{};DDH.wikipedia_fathead.meta={"name":"Wikipedia","src_name":"Wikipedia","is_stackexchange":null,"perl_module":"DDG::Fathead::Wikipedia","unsafe":0,"live_date":null,"src_options":{"language":"en","min_abstract_length":"20","source_skip":"","skip_image_name":0,"is_wikipedia":1,"skip_abstract_paren":0,"skip_abstract":0,"skip_qr":"","is_mediawiki":1,"skip_icon":0,"is_fanon":0,"skip_end":"0","directory":"","src_info":""},"blockgroup":null,"description":"Wikipedia","signal_from":"wikipedia_fathead","tab":"About","producer":null,"production_state":"online","maintainer":{"github":"duckduckgo"},"src_id":1,"dev_milestone":"live","src_url":null,"attribution":null,"dev_date":null,"topic":["productivity"],"status":"live","id":"wikipedia_fathead","example_query":"nikola tesla","created_date":null,"src_domain":"en.wikipedia.org","repo":"fathead","js_callback_name":"wikipedia","designer":null,"developer":[{"name":"DDG Team","url":"http://www.duckduckhack.com","type":"ddg"}]};;};DDG.ready(nrji, 1);</script><script src="/g2124.js"></script><script type="text/javascript">DDG.ready(function () {DDG.duckbar.add({"meta":{"name":"Wikipedia","src_name":"Wikipedia","is_stackexchange":null,"perl_module":"DDG::Fathead::Wikipedia","unsafe":0,"live_date":null,"src_options":{"language":"en","min_abstract_length":"20","source_skip":"","skip_image_name":0,"is_wikipedia":1,"skip_abstract_paren":0,"skip_abstract":0,"skip_qr":"","is_mediawiki":1,"skip_icon":0,"is_fanon":0,"skip_end":"0","directory":"","src_info":""},"blockgroup":null,"description":"Wikipedia","signal_from":"wikipedia_fathead","tab":"About","producer":null,"production_state":"online","maintainer":{"github":"duckduckgo"},"src_id":1,"dev_milestone":"live","src_url":null,"attribution":null,"dev_date":null,"topic":["productivity"],"status":"live","id":"wikipedia_fathead","example_query":"nikola tesla","created_date":null,"src_domain":"en.wikipedia.org","repo":"fathead","js_callback_name":"wikipedia","designer":null,"developer":[{"name":"DDG Team","url":"http://www.duckduckhack.com","type":"ddg"}]},"signal":"medium","data":{"Results":[{"FirstURL":"https://duckduckgo.com","Text":"Official site - DuckDuckGo","Result":"<b>Official site</b> - DuckDuckGo","Icon":{"URL":"https://duckduckgo.com/i/duckduckgo.com.ico","Width":16,"Height":16}}],"AbstractSource":"Wikipedia","Abstract":"DuckDuckGo is an Internet search engine that emphasizes protecting searchers' privacy and avoiding the filter bubble of personalized search results. 
DuckDuckGo distinguishes itself from other search engines by not profiling its users and by deliberately showing all users the same search results for a given search term, and emphasizes returning the best results, rather than the most results, generating those results from over 400 individual sources, including crowdsourced sites such as Wikipedia, and other search engines like Bing, Yahoo!, and Yandex.","Answer":"","Redirect":"","Heading":"DuckDuckGo","ImageWidth":340,"Definition":"","Entity":"website","meta":{"name":"Wikipedia","src_name":"Wikipedia","is_stackexchange":null,"perl_module":"DDG::Fathead::Wikipedia","unsafe":0,"live_date":null,"src_options":{"language":"en","min_abstract_length":"20","source_skip":"","skip_image_name":0,"is_wikipedia":1,"skip_abstract_paren":0,"skip_abstract":0,"skip_qr":"","is_mediawiki":1,"skip_icon":0,"is_fanon":0,"skip_end":"0","directory":"","src_info":""},"blockgroup":null,"description":"Wikipedia","signal_from":"wikipedia_fathead","tab":"About","producer":null,"production_state":"online","maintainer":{"github":"duckduckgo"},"src_id":1,"dev_milestone":"live","src_url":null,"attribution":null,"dev_date":null,"topic":["productivity"],"status":"live","id":"wikipedia_fathead","example_query":"nikola tesla","created_date":null,"src_domain":"en.wikipedia.org","repo":"fathead","js_callback_name":"wikipedia","designer":null,"developer":[{"name":"DDG Team","url":"http://www.duckduckhack.com","type":"ddg"}]},"AnswerType":"","Image":"https://duckduckgo.com/i/adad4e5c.png","RelatedTopics":[{"Result":"Names Database - The Names Database is a defunct social network, owned and operated by Classmates.com, a wholly owned subsidiary of United Online. The site does not appear to be significantly updated since 2008, and has many broken links and display issues.","Text":"Names Database - The Names Database is a defunct social network, owned and operated by Classmates.com, a wholly owned subsidiary of United Online. The site does not appear to be significantly updated since 2008, and has many broken links and display issues.","FirstURL":"/Names_Database","Icon":{"URL":"","Height":"","Width":""}},{"Text":"Companies based in Chester County, Pennsylvania","FirstURL":"/c/Companies_based_in_Chester_County%2C_Pennsylvania","Result":"Companies based in Chester County, Pennsylvania","Icon":{"URL":"","Width":"","Height":""}},{"Text":"Tor hidden services","FirstURL":"/c/Tor_hidden_services","Result":"Tor hidden services","Icon":{"Width":"","Height":"","URL":""}},{"Result":"Perl software","FirstURL":"/c/Perl_software","Text":"Perl software","Icon":{"Height":"","Width":"","URL":""}},{"Result":"Internet privacy software","FirstURL":"/c/Internet_privacy_software","Text":"Internet privacy software","Icon":{"Height":"","Width":"","URL":""}},{"Icon":{"URL":"","Width":"","Height":""},"FirstURL":"/c/Proprietary_cross-platform_software","Text":"Proprietary cross-platform software","Result":"Proprietary cross-platform software"},{"Icon":{"Height":"","Width":"","URL":""},"Text":"Internet search engines","FirstURL":"/c/Internet_search_engines","Result":"Internet search engines"},{"Text":"Android (operating system) software","FirstURL":"/c/Android_(operating_system)_software","Result":"Android (operating system) software","Icon":{"Height":"","Width":"","URL":""}}],"AbstractURL":"https://en.wikipedia.org/wiki/DuckDuckGo","AbstractText":"DuckDuckGo is an Internet search engine that emphasizes protecting searchers' privacy and avoiding the filter bubble of personalized search results. 
DuckDuckGo distinguishes itself from other search engines by not profiling its users and by deliberately showing all users the same search results for a given search term, and emphasizes returning the best results, rather than the most results, generating those results from over 400 individual sources, including crowdsourced sites such as Wikipedia, and other search engines like Bing, Yahoo!, and Yandex.","ImageIsLogo":1,"DefinitionSource":"","DefinitionURL":"","Type":"A","Infobox":{"meta":[{"value":"DuckDuckGo","label":"article_title","data_type":"string"},{"label":"template_name","data_type":"string","value":"infobox website"},{"label":"formatting_rules","data_type":"string","value":"website"}],"content":[{"data_type":"string","wiki_order":0,"label":"Type of site","sort_order":"1","value":"Web search engine"},{"sort_order":"1000","value":"Multilingual","wiki_order":1,"data_type":"string","label":"Available in"},{"sort_order":"1001","value":"Worldwide","wiki_order":2,"data_type":"string","label":"Area served"},{"sort_order":"2","value":"Duck Duck Go, Inc.","wiki_order":3,"data_type":"string","label":"Owner"},{"sort_order":"3","value":"Gabriel Weinberg","data_type":"string","wiki_order":4,"label":"Created by"},{"value":"284 (30, 2018)","sort_order":"4","label":"Alexa rank","wiki_order":5,"data_type":"string"},{"label":"Commercial","wiki_order":6,"data_type":"string","value":"Yes","sort_order":"1002"},{"sort_order":"1003","value":"None","wiki_order":7,"data_type":"string","label":"Registration"},{"value":"Sept 25, 2008","sort_order":"3","label":"Launched","data_type":"string","wiki_order":8},{"value":"Active","sort_order":"1004","label":"Current status","data_type":"string","wiki_order":9},{"wiki_order":10,"data_type":"string","label":"Written in","sort_order":"1005","value":"Perl, JavaScript, Python"},{"data_type":"github_profile","wiki_order":"101","label":"GitHub profile","value":"duckduckgo"},{"value":"duckduckgo","label":"Twitter profile","wiki_order":"102","data_type":"twitter_profile"},{"value":"duckduckgo","data_type":"facebook_profile","wiki_order":"104","label":"Facebook profile"},{"value":{"id":"Q114106","entity-type":"item","numeric-id":114106},"data_type":"instance","wiki_order":"207","label":"Instance of"}]},"ImageHeight":270},"model":"FatheadArticle","duckbar_topic":"About","templates":{"detail":"info_detail"}});});</script><script type="text/javascript">DDG.page = new DDG.Pages.SERP({ showSafeSearch: 0, instantAnswerAds: false });</script><div id="z2"> </div><div id="z"></div></body></html> [Finished in 0.6s] What's the reason for this and how can I fix it?
DuckDuckGo doesn't provide an API for image search, but it does fetch the images with a request call, so here is another solution that may work. This solution will only work for the keyword "book", because the image search request uses a parameter vqd which is dynamic, based either on the searched keyword or on the user's machine. If that can be solved, this code will work for any keyword to download any image. If anyone can decrypt this vqd, just replace 'q': 'book', with 'q': keyword, and the rest is fine.

from bs4 import *
import requests as rq
import os

# api-endpoint
URL = "https://duckduckgo.com/i.js"

keyword = input('Enter the search keyword : ')

# defining a params dict for the parameters to be sent to the API
PARAMS = {
    'l': 'us-en',
    'o': 'json',
    'q': 'book',
    'vqd': '3-160127109499719016074744569811997028386-179481262599639828155814625357171050706&f=,,,',
}

# sending get request and saving the response as response object
r = rq.get(url=URL, params=PARAMS)

# extracting data in json format
data = r.json()

img_link = data["results"][0]['image']
img_data = rq.get(img_link).content

# os.mkdir('downloads')
filename = "downloads/" + keyword + ".png"
with open(filename, 'wb+') as f:
    f.write(img_data)

print("File " + keyword + ".png successfully downloaded.")

========================= Updated Answer =========================

There is a GitHub package available for this as well. The main problem with the above script was the value of that parameter (vqd). With the help of that package I have created the following script.

import requests
import re
import json
import os

def search(keywords):
    url = 'https://duckduckgo.com/'
    params = {
        'q': keywords
    }

    print("Hitting DuckDuckGo for Token")

    # First make a request to the above URL, and parse out the 'vqd'.
    # This is a special token, which should be used in the subsequent request.
    res = requests.post(url, data=params)
    searchObj = re.search(r'vqd=([\d-]+)\&', res.text, re.M | re.I)

    if not searchObj:
        print("Token Parsing Failed !")
        return -1

    print("Obtained Token")

    headers = {
        'dnt': '1',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'x-requested-with': 'XMLHttpRequest',
        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6,ms;q=0.4',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://duckduckgo.com/',
        'authority': 'duckduckgo.com',
    }

    params = (
        ('l', 'wt-wt'),
        ('o', 'json'),
        ('q', keywords),
        ('vqd', searchObj.group(1)),
        ('f', ',,,'),
        ('p', '2')
    )

    requestUrl = url + "i.js"

    try:
        res = requests.get(requestUrl, headers=headers, params=params)
        data = json.loads(res.text)
        saveImage(data["results"], keywords)
    except ValueError as e:
        print('Please try later.')

    # logger.debug("Hitting Url Success : %s", requestUrl)

def saveImage(objs, keyword):
    for obj in objs:
        img_link = obj['image']
        img_data = requests.get(img_link).content

        # os.mkdir('downloads')
        filename = "downloads/" + keyword + ".png"
        with open(filename, 'wb+') as f:
            f.write(img_data)

        print("File " + keyword + ".png successfully downloaded.")
        break

while True:
    keyword = input('Enter the search keyword : ')
    # print(keyword)
    search(keyword)
The images are actually stored in img tags; they are just nested inside some div elements. You should be able to find all of the images on the page with the CSS selector img.tile--img__img. You could use a library like Beautiful Soup to query for all of those links, like so:

from bs4 import BeautifulSoup

# Considering your HTML is in the variable `source`
source_tree = BeautifulSoup(source, 'html.parser')
links = [img.get('src') for img in source_tree.find_all('img', class_='tile--img__img')]

Edit

It seems the issue here is that DuckDuckGo serves the images page with some JavaScript that displays all of the img tiles. Since the Python request can only fetch the resources and not execute any JavaScript, you may need to implement a different solution. See this other SO answer for some options.
My solution was using selenium, as proposed in the answer marked as right.

from urllib.parse import unquote
from selenium import webdriver

def search(query):
    driver = webdriver.Firefox()
    # For one-word queries this is fine; complex ones should be URL-encoded first
    driver.get(f'https://duckduckgo.com/?q={query}&t=h_&iax=images&ia=images')

    # For now it's working with this class, not sure if it will ever change
    img_tags = driver.find_elements_by_class_name('tile--img__img')

    for tag in img_tags:
        src = tag.get_attribute('data-src')
        src = unquote(src)
        src = src.split('=', maxsplit=1)
        src = src[1]
        yield src

    driver.close()

if __name__ == '__main__':
    from pprint import pprint

    imgs_urls = list(search('sun'))
    pprint(imgs_urls)

You'll need the selenium Python library, which you can install using pip, and geckodriver, which may be found in your distribution's package manager.
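A side note not in the original answer: in Selenium 4 and later the find_elements_by_class_name helper has been removed, so a roughly equivalent call (assuming the tile--img__img class is unchanged) would be:

from selenium.webdriver.common.by import By

# Locate the image tiles by class name using the Selenium 4 locator API.
img_tags = driver.find_elements(By.CLASS_NAME, 'tile--img__img')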
Scrapy spider finding one "Next" button but not the other
I am writing a spider to scrape a popular reviews website :-) This is my first attempt at writing a Scrapy spider.

The top level is a list of restaurants (I call this "top level"), which appear 30 at a time. My spider accesses each link and then "clicks next" to get the next 30, and so on. This part is working, as my output does contain thousands of restaurants, not just the first 30.

I then want it to "click" on the link to each restaurant page ("restaurant level"), but this contains only truncated versions of the reviews, so I want it to then "click" down a further level (to "review level") and scrape the reviews from there, which appear 5 at a time with another "next" button. This is the only "level" from which I am extracting anything - the other levels just have links to access to get to the reviews and other info I want.

Most of this is working, as I am getting all the information I want, but only for the first 5 reviews per restaurant. It is not "finding" the "next" button on the bottom "review level".

I have tried changing the order of commands within the parse method, but other than that I am coming up short of ideas! My XPaths are fine, so it must be something to do with the structure of the spider. My spider looks like this:

import scrapy
from scrapy.http import Request

class TripSpider(scrapy.Spider):
    name = 'tripadvisor'
    allowed_domains = ['tripadvisor.co.uk']
    start_urls = ['https://www.tripadvisor.co.uk/Restaurants-g187069-Manchester_Greater_Manchester_England.html']
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        # 'DEPTH_LIMIT': 3,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 0.5,
        'USER_AGENT': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        # 'DEPTH_PRIORITY': 1,
        # 'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        # 'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue'
    }

    def scrape_review(self, response):
        restaurant_name_review = response.xpath('//div[@class="wrap"]//span[@class="taLnk "]//text()').extract()
        reviewer_name = response.xpath('//div[@class="username mo"]//text()').extract()
        review_rating = response.xpath('//div[@class="wrap"]/div[@class="rating reviewItemInline"]/span[starts-with(@class,"ui_bubble_rating")]').extract()
        review_title = response.xpath('//div[@class="wrap"]//span[@class="noQuotes"]//text()').extract()
        full_reviews = response.xpath('//div[@class="wrap"]/div[@class="prw_rup prw_reviews_text_summary_hsx"]/div[@class="entry"]/p').extract()
        review_date = response.xpath('//div[@class="prw_rup prw_reviews_stay_date_hsx"]/text()[not(parent::script)]').extract()
        restaurant_name = response.xpath('//div[@id="listing_main_sur"]//a[@class="HEADING"]//text()').extract() * len(full_reviews)
        restaurant_rating = response.xpath('//div[@class="userRating"]//@alt').extract() * len(full_reviews)
        restaurant_review_count = response.xpath('//div[@class="userRating"]//a//text()').extract() * len(full_reviews)

        for rvn, rvr, rvt, fr, rd, rn, rr, rvc in zip(reviewer_name, review_rating, review_title, full_reviews, review_date, restaurant_name, restaurant_rating, restaurant_review_count):
            reviews_dict = dict(zip(['reviewer_name', 'review_rating', 'review_title', 'full_reviews', 'review_date', 'restaurant_name', 'restaurant_rating', 'restaurant_review_count'], (rvn, rvr, rvt, fr, rd, rn, rr, rvc)))
            yield reviews_dict
            # print(reviews_dict)

    def parse(self, response):
        ### The parse method is what is actually being repeated / iterated
        for review in self.scrape_review(response):
            yield review
            # print(review)

        # access next page of restaurants
        next_page_restaurants = response.xpath('//a[@class="nav next rndBtn ui_button primary taLnk"]/@href').extract_first()
        next_page_restaurants_url = response.urljoin(next_page_restaurants)
        yield Request(next_page_restaurants_url)
        print(next_page_restaurants_url)

        # access next page of reviews
        next_page_reviews = response.xpath('//a[@class="nav next taLnk "]/@href').extract_first()
        next_page_reviews_url = response.urljoin(next_page_reviews)
        yield Request(next_page_reviews_url)
        print(next_page_reviews_url)

        # access each restaurant page:
        url = response.xpath('//div[@id="EATERY_SEARCH_RESULTS"]/div/div/div/div/a[@target="_blank"]/@href').extract()
        for url_next in url:
            url_full = response.urljoin(url_next)
            yield Request(url_full)

        # accesses the first review to get to the full reviews (not the truncated versions)
        # extract_first used as I only want to access one of the links on this page to get down to "review level"
        first_review = response.xpath('//a[@class="title "]/@href').extract_first()
        first_review_full = response.urljoin(first_review)
        yield Request(first_review_full)
        # print(first_review_full)
You are missing a space at the end of the class value. Try this:

next_page_reviews = response.xpath('//a[@class="nav next taLnk "]/@href').extract_first()

Here are some tips on matching classes partially: https://docs.scrapy.org/en/latest/topics/selectors.html#when-querying-by-class-consider-using-css

On a side note, you can define separate parse functions to make it clearer what each one is responsible for: https://docs.scrapy.org/en/latest/intro/tutorial.html?highlight=callback#more-examples-and-patterns
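To illustrate the partial-matching tip (this sketch is not from the original answer; the selector names come from the question), one way to avoid depending on the exact, trailing-space class string:

# CSS route suggested by the Scrapy docs: match individual class tokens, then take the href.
next_page_reviews = response.css('a.nav.next.taLnk::attr(href)').get()

# Equivalent XPath that matches the "taLnk" token regardless of surrounding spaces.
next_page_reviews = response.xpath(
    '//a[contains(concat(" ", normalize-space(@class), " "), " taLnk ")'
    ' and contains(@class, "next")]/@href'
).get()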
Ajax-based navigation with scrapy by generating appropriate POST request
I've been trying to scrape a site that uses AJAX on link elements with onclick events to control page navigation. The scraper works for the first page, but never processes any pages after that, so it seems not to be firing the POST request I build up. I'm completely new to all of this (Python, Scrapy, XPath, the DOM), but my intuition is that I've mixed different structural patterns from different examples that are subtly incompatible?

I would also really appreciate some hints on how better to debug this problem beyond (as a newbie) using the scrapy shell and outputting log messages.

My code:

import scrapy
from scrapy import FormRequest

class FansSpider(scrapy.Spider):
    name = "fans"
    allowed_domains = ['za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/']
    start_urls = ['http://za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/']

    def parse(self, response):
        self.logger.info('Parse function called on %s', response.url)
        for component in response.xpath('//tr[@class="resultRow"]'):
            yield {
                'id': component.xpath('.//a[@class="primarySearchLink"]/text()').extract_first().strip()
            }

        next_id = response.xpath('//a[@class="rightLink nextLink approverMessageTitle"]/@id').extract_first()
        self.logger.info('Identified code of next URL as %s', next_id)

        if next_id is not None:
            first_id = response.xpath('//a[@class="rightLink nextLink approverMessageTitle"]/@onclick').\
                extract_first().split(',')[1].strip('\'')

            # POST the URL that is generated when clicking the next button
            return [FormRequest.from_response(response,
                                              url='http://za.rs-online.com/web/c/hvac-fans-thermal-management/fans/axial-fans/',
                                              formdata={'AJAXREQUEST': '_viewRoot',
                                                        first_id: first_id,
                                                        'ajax-dimensions': '',
                                                        'ajax-request': 'true',
                                                        'ajax-sort-by': '',
                                                        'ajax-sort-order': '',
                                                        'ajax-attrSort': 'false',
                                                        'javax.faces.viewState': 'j_id1',
                                                        next_id: next_id},
                                              callback=self.parse,
                                              dont_filter=True,
                                              dont_click=True,
                                              method='POST')]

Additional information, just in case it's relevant: I made these changes to the Scrapy settings.py to avoid getting blocked by the webserver or getting banned:

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

Thanks!