How is my scraper being detected immediately by a search engine - python-3.x

I am using Scrapy with Selenium in order to scrape urls from a particular search engine (ekoru). Here is a screenshot of the response I get back from the search engine with just ONE request:
Since I am using selenium, I'd assume that my user-agent should be fine so what else could the issue be that makes the search engine detect the bot immediately?
Here is my code:
class CompanyUrlSpider(scrapy.Spider):
name = 'company_url'
def start_requests(self):
yield SeleniumRequest(
url='https://ekoru.org',
wait_time=3,
screenshot=True,
callback=self.parseEkoru
)
def parseEkoru(self, response):
driver = response.meta['driver']
search_input = driver.find_element_by_xpath("//input[#id='fld_q']")
search_input.send_keys('Hello World')
search_input.send_keys(Keys.ENTER)
html = driver.page_source
response_obj = Selector(text=html)
links = response_obj.xpath("//div[#class='serp-result-web-title']/a")
for link in links:
yield {
'ekoru_URL': link.xpath(".//#href").get()
}

Sometimes you need to pass other parameters in order to avoid being detected by any webpage.
Let me share a code you can use:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#This code helps to simulate a "human being" visiting the website
chrome_options = Options()
chrome_options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=chrome_options, executable_path=r"chromedriver")
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source":
"""Object.defineProperty(navigator,
'webdriver', {get: () => undefined})"""})
url = 'https://ekoru.org'
driver.get(url)
Yields (Check out below the bar address "Chrome is being controlled..."):

Related

How to scrape a website with multiple pages with the same url adress using scrapy-playwright

I am trying to scrape a website with multiple pages with the same url using scrapy-playwright.
the following script returned only the data of the second page and did not continue to the rest of the pages.
can anyone suggest how I can fix it?
import scrapy
from scrapy_playwright.page import PageMethod
from scrapy.crawler import CrawlerProcess
class AwesomeSpideree(scrapy.Spider):
name = "awesome"
def start_requests(self):
# GET request
yield scrapy.Request(
url=f"https://www.cia.gov/the-world-factbook/countries/" ,
callback = self.parse,
meta=dict(
playwright = True,
playwright_include_page = True,
playwright_page_methods = {
"click" : PageMethod('click',selector = 'xpath=//div[#class="pagination-controls col-lg-6"]//span[#class="pagination__arrow-right"]'),
"screenshot": PageMethod("screenshot", path=f"step1.png", full_page=True)
},
)
)
async def parse(self, response):
page = response.meta["playwright_page"]
await page.close()
print("-"*80)
CountryLst = response.xpath("//div[#class='col-lg-9']")
for Country in CountryLst:
yield {
"country_link": Country.xpath(".//a/#href").get()
}
I see you are trying to fetch URLs of countries from above mentioned URL.
if you inspect the Network tab you can see there is one request to one JSON data API. You can fetch all countries URL's from this url
after that if you still want scrap more data from scraped URL's then you can easily scrap because that data is static so there will be no need to use playwright.
Have a good day :)

Attempting login with Scrapy-Splash

Since i am not able to login to https://www.duif.nl/login, i tried many different methods like selenium, which i successfully logged in, but didnt manage to start crawling.
Now i tried my luck with scrapy-splash, but i cant login :(
If i render the loginpage with splash, i see following picture:
Well, there should be a loginform, like username and password, but scrapy cant see it?
Im sitting here like a week in front of that loginform and losing my will to live..
My last question didnt even get one answer, now i try it again.
here is the html code of the login-form:
When i login manual, i get redirected to "/login?returnUrl=", where i only have these form_data:
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.spiders import CrawlSpider, Rule
from ..items import ScrapysplashItem
from scrapy.http import FormRequest, Request
import csv
class DuifSplash(CrawlSpider):
name = "duifsplash"
allowed_domains = ['duif.nl']
login_page = 'https://www.duif.nl/login'
with open('duifonlylinks.csv', 'r') as f:
reader = csv.DictReader(f)
start_urls = [items['Link'] for items in reader]
def start_requests(self):
yield SplashRequest(
url=self.login_page,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
return FormRequest.from_response(
response,
formdata={
'username' : 'not real',
'password' : 'login data',
}, callback=self.after_login)
def after_login(self, response):
accview = response.xpath('//div[#class="c-accountbox clearfix js-match-height"]/h3')
if accview:
print('success')
else:
print(':(')
for url in self.start_urls:
yield response.follow(url=url, callback=self.parse_page)
def parse_page(self, response):
productpage = response.xpath('//div[#class="product-details col-md-12"]')
if not productpage:
print('No productlink', response.url)
for a in productpage:
items = ScrapysplashItem()
items['SKU'] = response.xpath('//p[#class="desc"]/text()').get()
items['Title'] = response.xpath('//h1[#class="product-title"]/text()').get()
items['Link'] = response.url
items['Images'] = response.xpath('//div[#class="inner"]/img/#src').getall()
items['Stock'] = response.xpath('//div[#class="desc"]/ul/li/em/text()').getall()
items['Desc'] = response.xpath('//div[#class="item"]/p/text()').getall()
items['Title_small'] = response.xpath('//div[#class="left"]/p/text()').get()
items['Price'] = response.xpath('//div[#class="price"]/span/text()').get()
yield items
In my "prework", i crawled every internal link and saved it to a .csv-File, where i analyse which of the links are product links and which are not.
Now i wonder, if i open a link of my csv, it opens an authenticated session or not?
I cant find no cookies, this is also strange to me
UPDATE
I managed to login successfully :-) now i only need to know where the cookies are stored
Lua Script
LUA_SCRIPT = """
function main(splash, args)
splash:init_cookies(splash.args.cookies),
splash:go("https://www.duif.nl/login"),
splash:wait(0.5),
local title = splash.evaljs("document.title"),
return {
title=title,
cookies = splash:get_cookies(),
},
end
"""
I don't think using Splash here is the way to go, as even with a normal Request the form is there: response.xpath('//form[#id="login-form"]')
There are multiple forms available on the page, so you have to specify which form you want to base yourself on to make a FormRequest.from_response. Best specify the clickdata as well (so it goes to 'Login', not to 'forgot password'). In summary it would look something like this:
req = FormRequest.from_response(
response,
formid='login-form',
formdata={
'username' : 'not real',
'password' : 'login data'},
clickdata={'type': 'submit'}
)
If you don't use Splash, you don't have to worry about passing cookies - this is taken care of by Scrapy. Just make sure you don't put COOKIES_ENABLED=False in your settings.py

How to crawl dynamically generated data on google's webstore search results

I want to crawl a web page which shows the results of a search in google's webstore and the link is static for that particular keyword.
I want to find the ranking of an extension periodically.
Here is the URL
Problem is that I can't render the dynamic data generated by Javascript code in response from server.
I tried using Scrapy and Scrapy-Splash to render the desired page but I was still getting the same response. I used Docker to run an instance of scrapinghub/splash container on port 8050. I even visited the webpage http://localhost:8050 and entered my URL manually but it couldn't render the data although the message showed success.
Here's the code I wrote for the crawler. It actually does nothing and its only job is to fetch the HTML contents of the desired page.
import scrapy
from scrapy_splash import SplashRequest
class WebstoreSpider(scrapy.Spider):
name = 'webstore'
def start_requests(self):
yield SplashRequest(
url='https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions',
callback=self.parse,
args={
"wait": 3,
},
)
def parse(self, response):
print(response.text)
and the contents of the settings.py of my Scrapy project:
BOT_NAME = 'webstore_cralwer'
SPIDER_MODULES = ['webstore_cralwer.spiders']
NEWSPIDER_MODULE = 'webstore_cralwer.spiders'
ROBOTSTXT_OBEY = False
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
And for the result I always get nothing.
Any help is appreciated.
Works for me with a small custom lua script:
lua_source = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(5.0))
return {
html = splash:html(),
}
end
"""
You can then change your start_requests as follows:
def start_requests(self):
yield SplashRequest(
url='https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions',
callback=self.parse,
args={'lua_source': self.lua_source},
)

Get requests body using selenium and proxies

I want to be able to get a body of the specific subrequest using a selenium behind the proxy.
Now I'm using python + selenium + chromedriver. With logging I'm able to get each subrequest's headers but not body. My logging settings:
caps['loggingPrefs'] =
{'performance': 'ALL',
'browser': 'ALL'}
caps['perfLoggingPrefs'] = {"enableNetwork": True,
"enablePage": True,
"enableTimeline": True}
I know there are several options to form a HAR with selenium:
Use geckodriver and har-export-trigger. I tried to make it work with the following code:
window.foo = HAR.triggerExport().then(harLog => { return(harLog); });
return window.foo;
Unfortunately, I don't see the body of the response in the returning data.
Use browsermob proxy. The solution seems totally fine but I didn't find the way to make browsermob proxy work behind the proxy.
So the question is: how can I get the body of the specific network response on the request made during the downloading of the webpage with selenium AND use proxies.
UPD: Actually, with har-export-trigger I get the response bodies, but not all of them: the response body I need is in json, it's MIME type is 'text/html; charset=utf-8' and it is missing from the HAR file I generate, so the solution is still missing.
UPD2: After further investigation, I realized that a response body is missing even on my desktop firefox when the har-export-trigger add-on is turned on, so this solution may be a dead-end (issue on Github)
UPD3: This bug can be seen only with the latest version of har-export-trigger. With version 0.6.0. everything works just fine.
So, for future googlers: you may use har-export-trigger v. 0.6.0. or the approach from the accepted answer.
I have actually just finished to implemented a selenium HAR script with tools you are mentioned in the question. Both HAR getting from har-export-trigger and BrowserMob are verified with Google HAR Analyser.
A class using selenium, gecko driver and har-export-trigger:
class MyWebDriver(object):
# a inner class to implement custom wait
class PageIsLoaded(object):
def __call__(self, driver):
state = driver.execute_script('return document.readyState;')
MyWebDriver._LOGGER.debug("checking document state: " + state)
return state == "complete"
_FIREFOX_DRIVER = "geckodriver"
# load HAR_EXPORT_TRIGGER extension
_HAR_TRIGGER_EXT_PATH = os.path.abspath(
"har_export_trigger-0.6.1-an+fx_orig.xpi")
_PROFILE = webdriver.FirefoxProfile()
_PROFILE.set_preference("devtools.toolbox.selectedTool", "netmonitor")
_CAP = DesiredCapabilities().FIREFOX
_OPTIONS = FirefoxOptions()
# add runtime argument to run with devtools opened
_OPTIONS.add_argument("-devtools")
_LOGGER = my_logger.get_custom_logger(os.path.basename(__file__))
def __init__(self, log_body=False):
self.browser = None
self.log_body = log_body
# return the webdriver instance
def get_instance(self):
if self.browser is None:
self.browser = webdriver.Firefox(capabilities=
MyWebDriver._CAP,
executable_path=
MyWebDriver._FIREFOX_DRIVER,
firefox_options=
MyWebDriver._OPTIONS,
firefox_profile=
MyWebDriver._PROFILE)
self.browser.install_addon(MyWebDriver._HAR_TRIGGER_EXT_PATH,
temporary=True)
MyWebDriver._LOGGER.info("Web Driver initialized.")
return self.browser
def get_har(self):
# JSON.stringify has to be called to return as a string
har_harvest = "myString = HAR.triggerExport().then(" \
"harLog => {return JSON.stringify(harLog);});" \
"return myString;"
har_dict = dict()
har_dict['log'] = json.loads(self.browser.execute_script(har_harvest))
# remove content body
if self.log_body is False:
for entry in har_dict['log']['entries']:
temp_dict = entry['response']['content']
try:
temp_dict.pop("text")
except KeyError:
pass
return har_dict
def quit(self):
self.browser.quit()
MyWebDriver._LOGGER.warning("Web Driver closed.")
A subclass adding BrowserMob proxy for your reference as well:
class MyWebDriverWithProxy(MyWebDriver):
_PROXY_EXECUTABLE = os.path.join(os.getcwd(), "venv", "lib",
"browsermob-proxy-2.1.4", "bin",
"browsermob-proxy")
def __init__(self, url, log_body=False):
super().__init__(log_body=log_body)
self.server = Server(MyWebDriverWithProxy._PROXY_EXECUTABLE)
self.server.start()
self.proxy = self.server.create_proxy()
self.proxy.new_har(url,
options={'captureHeaders': True,
'captureContent': self.log_body})
super()._LOGGER.info("BrowserMob server started")
super()._PROFILE.set_proxy(self.proxy.selenium_proxy())
def get_har(self):
return self.proxy.har
def quit(self):
self.browser.quit()
self.proxy.close()
MyWebDriver._LOGGER.info("BroswerMob server and Web Driver closed.")

Scraping an ajax website using Python requests

I'm trying to scrape a webpage that load is content after 5 seconds.
I want to use the lib requests.
Is there something to make the request wait?
import requests
from bs4 import BeautifulSoup as soup
from time import sleep
link = 'https://www.off---white.com'
while True:
try:
r = requests.get(link, stream=False, timeout=8)
break
except:
if r.status_code == 404:
print("Client error")
r.raise_for_status()
sleep(1)
page = soup(r.text, "html.parser")
products = page.findAll('article', class_='product')
titles = page.findAll('span', class_='prod-title')[0].text.strip()
images= page.findAll('img', class_="js-scroll-gallery-snap-target")
for product in products:
print(product)
I ever answer such question but the asker give a better answer cfscrape , cfscrape is better than selenium in this website. btw the question seem to be closed i dont know why.
import cfscrape
import requests
from bs4 import BeautifulSoup as soup
url = "https://www.off---white.com"
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20180101 Firefox/47.0",
"Referer" : url
}
session = requests.session()
scraper = cfscrape.create_scraper(sess=session)
link = 'https://www.off---white.com'
r = scraper.get(link, headers=headers)
page = soup(r.text, "html.parser")
update at 15/4/2020
Since off-white updated his protection, cfscrape is not a good idea for now. plz try to use selenium.
To kind of this questions, i can not give a answer that work forever. They keep updating their protection!
No, the content that is received will be always be the same, you have to prerender it by yourself to fetch final version of the webpage.
You have to use a headless browser to execute the javascript on the webpage.
Prerender.IO offers pretty much what you need, you can check it out, the setup is pretty simple.
const prerender = require('prerender');
const server = prerender();
server.start();

Resources