I am trying to get the content of an iframe, so I changed my Splash request endpoint from execute to render.json. However, splash:wait doesn't seem to work at all. Here's the spider code.
import scrapy
from scrapy_splash import SplashRequest
from scrapy.http import HtmlResponse
# Lua script passed to Splash as the ``lua_source`` argument: it loads
# args.url, waits 10 seconds and returns a table whose "html" entry is the
# result of splash:html().
src="""
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(10))
return {
html = splash:html()
}
end
"""
class Lafarge(scrapy.Spider):
    """Spider that fetches the Lafarge iCIMS job-search page through Splash.

    The page is requested from Splash's ``render.json`` endpoint with
    ``iframes=1`` so that the HTML of the child frames is included in the
    JSON response; the first child frame is then parsed.
    """

    name = "lafargespider"

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider perform its own initialisation before adding
        # spider-specific state.
        super().__init__(*args, **kwargs)
        self.root_url = "https://cacareers-lafarge-na.icims.com/jobs/search?pr=0&searchRelation=keyword_all&schemaId=&o="

    def start_requests(self):
        """Yield the single Splash request for the job-search page."""
        yield SplashRequest(
            self.root_url,
            self.parse_detail,
            endpoint='render.json',
            args={
                'iframes': 1,
                'html': 1,
                # NOTE(review): lua_source is only honoured by the "execute"
                # endpoint, so with render.json the splash:wait(10) inside
                # ``src`` presumably never runs -- confirm against the
                # Splash HTTP API docs.
                'lua_source': src,
                'timeout': 90,
            },
        )

    def parse_detail(self, response):
        """Print the "next page" link found in the first child frame.

        Logs a warning and returns without printing when the Splash
        response contains no child frames.
        """
        child_frames = response.data.get('childFrames') or []
        if not child_frames:
            self.logger.warning("No child frames in Splash response for %s", response.url)
            return
        frame_html = child_frames[0]['html']
        # Wrap the frame's HTML in a response object so selectors can be used.
        frame = HtmlResponse(url="my HTML string", body=frame_html, encoding='utf-8')
        # XPath attribute tests use "@"; the original "#class"/"#href" is not
        # valid XPath.
        print("next page ===>", frame.xpath('//a[@class="glyph "]/@href').extract_first())
Passing the wait time in the SplashRequest arguments solved the issue for me.
def start_requests(self):
    """Request the root URL from render.json with a ``wait`` of 5 in the args."""
    splash_args = {
        'wait': 5,
        'iframes': 1,
        'html': 1,
        'lua_source': src,
    }
    yield SplashRequest(
        self.root_url,
        self.parse_detail,
        endpoint='render.json',
        args=splash_args,
    )
def parse_detail(self, response):
    """Read the HTML of the first child frame from the JSON response."""
    first_frame = response.data['childFrames'][0]
    frame_html = first_frame['html']
Pass the wait param in args. It should be -
# Splash arguments for the request; 'wait' is the entry this answer
# recommends adding.
args ={
    'wait': 5,
    'iframes': 1,
    'html' : 1,
    'lua_source': src,
    'timeout': 90
}
lua_source isn't supported by the "render.json" endpoint; it is only supported by the "execute" endpoint, so there is no need for lua_source in your code.
What solved the problem is using the wait argument; see the explanation of its usage on page 11 here:
https://media.readthedocs.org/pdf/splash/latest/splash.pdf
Related
I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code in a python3-venv virtual environment. Could you help me understand and solve the problem?
Here is my code:
import scrapy
import json
class BasisMembersSpider(scrapy.Spider):
    """Walk the BASIS member-list API and request each member's company profile."""

    name = 'basis'
    # Every URL requested by this spider is on the bare host "basis.org.bd".
    # Listing only "www.basis.org.bd" would make Scrapy's offsite filtering
    # drop the follow-up requests; the bare domain also covers subdomains.
    allowed_domains = ['basis.org.bd']

    def start_requests(self):
        """Yield the request for the first page of the member list."""
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        """Yield one profile request per member, then the next list page.

        The response body is decoded as JSON and is expected to contain a
        ``data`` list of objects with a ``membership_no`` key and a
        ``links`` object with a ``next`` key.
        """
        # Decode the body once instead of once per field.
        payload = json.loads(response.body)
        members = payload['data']
        next_page = payload['links']['next']
        for member in members:
            profile_url = 'https://basis.org.bd/get-company-profile/{0}'.format(member['membership_no'])
            yield scrapy.Request(url=profile_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        """Placeholder callback for a company-profile response."""
        print("Printing informations....................................................")
Here is my settings.py file:
# Scrapy project settings for the 'web_scraping' project.
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# robots.txt rules are NOT obeyed: the setting is explicitly False.
ROBOTSTXT_OBEY = False
# Switch read by scrapy-proxy-pool.
PROXY_POOL_ENABLED = True
# The built-in user-agent middleware is mapped to None and replaced by the
# random user-agent middleware; the two scrapy_proxy_pool middlewares are
# registered at orders 610 and 620.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
When I use Splash by going to localhost:8050 in my browser it loads different html than the one in my Scrapy script. Below is the Lua script (the one used in localhost:8050) that loads the HTML I want.
-- Splash script entry point: load args.url, wait 20 seconds, then return a
-- table holding the results of splash:html(), splash:png() and splash:har().
function main(splash, args)
  assert(splash:go(args.url))
  assert(splash:wait(20))
  return {
    html = splash:html(),
    png = splash:png(),
    har = splash:har(),
  }
end
Here is the code from my Scrapy script that is not loading the HTML I want:
def start_requests(self):
    """Send every start URL to Splash's render.html endpoint with wait=20."""
    for start_url in self.start_urls:
        # A fresh args dict per request, exactly as the inline literal did.
        splash_args = {'wait': 20}
        yield SplashRequest(
            start_url,
            self.parse,
            endpoint='render.html',
            args=splash_args,
        )
I am passing the same URL for both. Any suggestions?
Note I am getting this error but the script appears to still run:
ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.
Idk why I have been getting so frustrated with Splash but it seems like when I do things with Selenium it just works but I am always having problems with Splash. I clearly have some knowledge gaps that I need to fill.
EDIT:
Changes so it uses Lua script:
# Lua script sent to Splash as ``lua_source``: load args.url, wait 20 seconds
# and return the results of splash:html(), splash:png() and splash:har().
LUA_SCRIPT = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(20))
return {
html = splash:html(),
png = splash:png(),
har = splash:har(),
}
end
"""

SCRAPY_CRAWLER_NAME = 'oddschecker'
# NOTE(review): the request in this snippet passes endpoint='execute'
# literally, so this constant does not appear to be what it uses -- confirm.
SCRAPY_SPLASH_ENDPOINT = 'render.html'
# Arguments forwarded to Splash with every request. The original snippet
# never closed this dict literal, which is a syntax error.
SCRAPY_ARGS = {
    'lua_source': LUA_SCRIPT,
}
def start_requests(self):
    """Print each start URL and schedule it on Splash's execute endpoint."""
    for start_url in self.start_urls:
        print(start_url)
        splash_request = SplashRequest(
            start_url,
            self.parse,
            endpoint='execute',
            args=SCRAPY_ARGS,
        )
        yield splash_request
settings.py file:
# Spider middlewares: scrapy-splash's argument deduplication plus the
# project's own spider middleware.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    'surefirebetting.middlewares.SurefirebettingSpiderMiddleware': 543,
}
# Downloader middlewares: the scrapy-splash cookie and request middlewares,
# HTTP compression at order 810, and the project's own downloader middleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'surefirebetting.middlewares.SurefirebettingDownloaderMiddleware': 543,
}
# Address of the Splash instance the requests are sent to.
SPLASH_URL = 'http://localhost:8050'
# Splash-aware replacements for the duplicate filter and the HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
I want to crawl a web page which shows the results of a search in google's webstore and the link is static for that particular keyword.
I want to find the ranking of an extension periodically.
Here is the URL
Problem is that I can't render the dynamic data generated by Javascript code in response from server.
I tried using Scrapy and Scrapy-Splash to render the desired page but I was still getting the same response. I used Docker to run an instance of scrapinghub/splash container on port 8050. I even visited the webpage http://localhost:8050 and entered my URL manually but it couldn't render the data although the message showed success.
Here's the code I wrote for the crawler. It actually does nothing and its only job is to fetch the HTML contents of the desired page.
import scrapy
from scrapy_splash import SplashRequest
class WebstoreSpider(scrapy.Spider):
    """Fetch one Chrome Web Store search page through Splash and dump its HTML."""

    name = 'webstore'

    def start_requests(self):
        """Yield the Splash request for the search URL with a ``wait`` of 3."""
        search_url = 'https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions'
        splash_args = {"wait": 3}
        yield SplashRequest(url=search_url, callback=self.parse, args=splash_args)

    def parse(self, response):
        """Print the text of the response."""
        page_text = response.text
        print(page_text)
and the contents of the settings.py of my Scrapy project:
# Scrapy project settings for the 'webstore_cralwer' project.
BOT_NAME = 'webstore_cralwer'
SPIDER_MODULES = ['webstore_cralwer.spiders']
NEWSPIDER_MODULE = 'webstore_cralwer.spiders'
# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
# Address of the Splash instance the requests are sent to.
SPLASH_URL = 'http://localhost:8050'
# scrapy-splash cookie and request middlewares, plus HTTP compression at 810.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# scrapy-splash argument deduplication.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Splash-aware replacement for the duplicate filter.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
And for the result I always get nothing.
Any help is appreciated.
Works for me with a small custom lua script:
# Lua script for Splash: load args.url, wait 5 seconds and return a table
# whose "html" entry is the result of splash:html().
lua_source = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(5.0))
return {
html = splash:html(),
}
end
"""
You can then change your start_requests as follows:
def start_requests(self):
    """Yield the Splash request that runs ``self.lua_source`` on the search page.

    The request targets the ``execute`` endpoint explicitly: that is the
    endpoint that runs a ``lua_source`` script, whereas leaving the endpoint
    at its default would not execute the script.
    """
    search_url = 'https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions'
    yield SplashRequest(
        url=search_url,
        callback=self.parse,
        endpoint='execute',
        args={'lua_source': self.lua_source},
    )
I am trying to get the status code of a request with scrapy and scrapy-splash. Below is the spider code.
class Exp10itSpider(scrapy.Spider):
    """Request one URL through the raw ``splash`` meta API and show its status."""

    name = "exp10it"

    def start_requests(self):
        """Yield a scrapy.Request carrying Splash options in its meta."""
        target_urls = ['http://192.168.8.240:8000/xxxx']
        for url in target_urls:
            # The two alternative request styles compared in the question:
            #yield SplashRequest(url, self.parse, args={'wait': 0.5, 'dont_redirect': True},meta={'handle_httpstatus_all': True})
            #yield scrapy.Request(url, self.parse, meta={'handle_httpstatus_all': True})
            # Build fresh dicts for every request, as the inline literal did.
            splash_options = {'args': {'html': 1, 'png': 1}}
            request_meta = {
                'handle_httpstatus_all': True,
                'splash': splash_options,
            }
            yield scrapy.Request(url, self.parse, meta=request_meta)

    def parse(self, response):
        """Pause on input() prompts and show the response status code."""
        input("start .........")
        print("status code is:\n")
        status_code = response.status
        input(status_code)
My start URL http://192.168.8.240:8000/xxxx returns a 404 status code. There are three ways of making the request shown above:
the first is:
yield SplashRequest(url, self.parse, args={'wait': 0.5, 'dont_redirect': True},meta={'handle_httpstatus_all': True})
the second is:
yield scrapy.Request(url, self.parse, meta={'handle_httpstatus_all': True})
the third is:
yield scrapy.Request(url, self.parse, meta={'handle_httpstatus_all': True,'splash': {
'args': {
'html': 1,
'png': 1,
}
}
}
)
Only the second way, yield scrapy.Request(url, self.parse, meta={'handle_httpstatus_all': True}), gets the correct status code 404; the first and the third both get status code 200. In other words, once I use scrapy-splash I can no longer get the correct 404 status code. Can you help me?
As the documentation to scrapy-splash suggests, you have to pass magic_response=True to SplashRequest to achieve this:
meta['splash']['http_status_from_error_code'] - set response.status to HTTP error code when assert(splash:go(..)) fails; it requires meta['splash']['magic_response']=True. http_status_from_error_code option is False by default if you use raw meta API; SplashRequest sets it to True by default.
EDIT:
I was able to get it to work only with execute endpoint, though. Here is sample spider that tests HTTP status code using httpbin.org:
# -*- coding: utf-8 -*-
import scrapy
import scrapy_splash
class HttpStatusSpider(scrapy.Spider):
    """Check which HTTP status Scrapy sees for a page fetched via Splash ``execute``."""

    name = 'httpstatus'
    # Lua script run by Splash: load the URL, wait 0.5 s, return html and png.
    lua_script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(0.5))
return {
html = splash:html(),
png = splash:png(),
}
end
"""

    def start_requests(self):
        """Yield one SplashRequest for an httpbin URL that answers with 402."""
        splash_options = dict(
            endpoint='execute',
            magic_response=True,
            meta={'handle_httpstatus_all': True},
            args={'lua_source': self.lua_script},
        )
        yield scrapy_splash.SplashRequest(
            'https://httpbin.org/status/402',
            self.parse,
            **splash_options
        )

    def parse(self, response):
        """Do nothing with the response."""
It passes the HTTP 402 status code to Scrapy, as can be seen from the output:
...
2017-10-23 08:41:31 [scrapy.core.engine] DEBUG: Crawled (402) <GET https://httpbin.org/status/402 via http://localhost:8050/execute> (referer: None)
...
You can experiment with other HTTP status codes as well.
I have a POST request like this:
def start_requests(self):
    """Yield one form request whose ``id`` field is the placeholder value."""
    form_fields = {'id': "parameter from redis"}
    yield FormRequest(url, formdata=form_fields)
Can I use redis-cli lpush to store the POST parameter so that my crawler picks it up and runs the request?
By default, the scrapy-redis queue works only with URLs as messages.
One message = one URL. But you can modify this behavior.
For example, you can use an object for your messages/requests:
class ScheduledRequest:
    """Plain value object describing one request to put on the Redis queue.

    The instance ``__dict__`` holds exactly ``url``, ``method`` and ``body``,
    so it can be JSON-encoded and later rebuilt with
    ``ScheduledRequest(**decoded)``.

    Args:
        url: Target URL of the request.
        method: HTTP method name; defaults to ``'GET'``.
        body: Request body, or ``None`` for no body.
    """

    # The original signature line lacked its trailing colon (a syntax error).
    def __init__(self, url, method='GET', body=None):
        self.url = url
        self.method = method
        self.body = body
Pass it to the queue as a JSON-encoded dict:
# Push one JSON-encoded ScheduledRequest onto the Redis list at queue_key.
redis.lpush(
    queue_key,
    json.dumps(vars(ScheduledRequest(
        url='http://google.com',
        method='POST',
        body='some body data ...',
    ))),
)
And rewrite the make_request_from_data and schedule_next_requests methods:
class MySpiderBase(RedisCrawlSpider, scrapy.Spider):
    """Redis-fed spider whose queue messages are JSON-encoded ScheduledRequest dicts."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def make_request_from_data(self, data):
        """Decode one queue message and turn it into a scrapy.Request."""
        decoded_text = bytes_to_str(data, self.redis_encoding)
        fields = json.loads(decoded_text)
        scheduled = ScheduledRequest(**fields)
        # here you can use and FormRequest
        return scrapy.Request(
            url=scheduled.url,
            method=scheduled.method,
            body=scheduled.body,
        )

    def schedule_next_requests(self):
        """Hand every request from next_requests() to the crawler engine."""
        for pending_request in self.next_requests():
            self.crawler.engine.crawl(pending_request, spider=self)

    def parse(self, response):
        """Do nothing with the response."""