I have a broad Scrapy crawler that takes a CSV file of about 20,000 rows. The file has a name, start_url and allowed_domain column. See below:
Name      start_url             allowed_domain
place1    https://place1.co.uk  place1.co.uk
place2    https://place2.co.uk  place2.co.uk
place3    https://place3.co.uk  place3.co.uk
A sample of my crawler code is below:
import pandas as pd
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    df = pd.read_csv("places.csv")
    start_urls = df["start_url"].values.tolist()
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True,
    }
    rules = (Rule(LinkExtractor(allow_domains=(df["allowed_domain"].values.tolist())), callback='parse_item', follow=True),)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        # do stuff
        pass
The problem is that my crawler is allowed to follow links that are in any of the allowed domains, not just the one associated with the start_url.
You cannot assign a separate allowed domain to each URL in start_urls. You will have to filter the requests inside the process_request method of a downloader middleware instead: pass the allowed domain along in the request's meta, and drop any request whose domain does not match.
Here is the spider code:
import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True,
    }
    rules = (Rule(LinkExtractor(), follow=True),)

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for key, row in df.iterrows():
            yield Request(url=row['start_url'],
                          callback=self.parse_item,
                          meta={'allowed_domain': row['allowed_domain']})
Here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest


class MySpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        site = tldextract.extract(request.url)
        site = "{}.{}".format(site.domain, site.suffix)
        if request.meta['allowed_domain'] not in site:
            raise IgnoreRequest("Filtered offsite request")
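One caveat: requests extracted by the CrawlSpider rules do not automatically inherit the originating response's meta, so the allowed_domain key may be missing on followed requests. Below is a minimal sketch of one way to carry it over, assuming Scrapy 2.0 or later (where a Rule's process_request callable receives both the extracted request and the response it came from); propagate_meta is a hypothetical helper name, and the custom_settings and middleware are the same as above.

import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    rules = (
        # In Scrapy 2.0+ process_request is called as (request, response)
        Rule(LinkExtractor(), follow=True, process_request='propagate_meta'),
    )

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for _, row in df.iterrows():
            yield Request(url=row['start_url'],
                          meta={'allowed_domain': row['allowed_domain']})

    def propagate_meta(self, request, response):
        # Copy the allowed domain of the page the link was found on onto
        # the extracted request, so the downloader middleware can check it.
        request.meta['allowed_domain'] = response.meta.get('allowed_domain')
        return request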
Is it correct in Scrapy to have multiple parse methods in one spider?
Something like this:
import scrapy


class FooSpider(scrapy.Spider):
    name = 'foo'
    start_urls = ['https://example.com']

    def parse(self, response):
        ...
        yield {'foo': foo}

    def parse(self, response):
        ...
        yield {'bar': bar}
No. Two methods with the same name cannot coexist on one class; the second definition simply overrides the first. But you can create differently named callback methods and schedule them from start_requests, for example. Note the dont_filter=True below: the same URL is requested twice, and the second request would otherwise be dropped by the duplicate filter.
import scrapy


class FooSpider(scrapy.Spider):
    name = 'POC'
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.get_title, dont_filter=True)
            yield scrapy.Request(url=url, callback=self.get_price, dont_filter=True)

    def get_title(self, response):
        yield {'title': response.xpath('//h3/text()').get()}

    def get_price(self, response):
        yield {'price': response.xpath('//div[@class="card-body"]/h4/text()').get()}
I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, inside a Python 3 venv. I am getting "AttributeError: Response content isn't text". Could you help me explain and solve the problem?
Here is my code:
import scrapy
import json


class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing information....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
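"Response content isn't text" is the AttributeError that Scrapy's base Response class raises when .text is accessed on a response whose body is not text, which with a rotating proxy pool often means a proxy returned an empty or non-text payload. Since the traceback is not included above, the following is only a hedged sketch of a defensive check in the callback before parsing the JSON; it would not help if the error is raised inside a third-party middleware such as the ban-detection middleware.

import json

import scrapy
from scrapy.http import TextResponse


class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']
    start_urls = ['https://basis.org.bd/get-member-list?page=1&team=']

    def parse(self, response):
        # Only TextResponse subclasses expose .text; a bare Response
        # (e.g. from a misbehaving proxy) raises
        # AttributeError("Response content isn't text") when .text is used.
        if not isinstance(response, TextResponse):
            self.logger.warning("Non-text response from %s, retrying", response.url)
            yield response.request.replace(dont_filter=True)
            return
        payload = json.loads(response.text)
        for data in payload['data']:
            yield {'membership_no': data['membership_no']}
        next_page = payload['links']['next']
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)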
I have a function for extracting articles from one page, but I am not able to navigate to the next page to scrape all the pages. Below is what I am trying:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    # allowed_domains = ['https://blogs.webmd.com/diabetes/default.htm']
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }

        # url_apnd = "https://blogs.webmd.com/diabetes"
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Please help me get the navigation to the next page right.
You need to move the next-page logic into your parse function, since the next-page button lives on the listing page defined in your start_urls, not on the article pages.
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
I'm trying to scrape the headings on the About page, but I have tried a lot and failed because I don't properly understand what to do. I'm a beginner, so I need help.
import scrapy
from ..items import DmoztutorialItem


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = [
        'http://dmoz-odp.org/',
    ]

    def parse(self, response):
        items = DmoztutorialItem()
        Navbar = response.css('#main-nav a::text').extract()
        Category_names = response.css('.top-cat a::text').extract()
        Subcategories = response.css('.sub-cat a::text').extract()
        items['Navbar'] = Navbar
        items['Category_names'] = Category_names
        items['Subcategories'] = Subcategories
        yield items

        # Nav_page = response.css('#main-nav a::attr(href)').extract()
        Nav_page = 'http://dmoz-odp.org/docs/en/about.html'
        # About_heading = response.css('h1+ p , #mainContent h1::text').extract()
        items['Nav_page'] = Nav_page
        # items['About_heading'] = About_heading
        yield response.follow(Nav_page)
Can you tell us what kind of output you need? It is very unclear from your post.
Check this example, where you can:
get some data;
call a request to another page, carrying the data collected so far;
yield the final data.
Hope it will help you.
import scrapy


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = ['http://dmoz-odp.org/']
    nav_page = 'http://dmoz-odp.org/docs/en/about.html'

    def parse(self, response):
        # collect data on the first page
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
            'Nav_page': self.nav_page,
        }
        # save it and call a request to another page
        yield response.follow(self.nav_page, self.parse_nav, meta={'items': items})

    def parse_nav(self, response):
        # do your stuff on the second page
        items = response.meta['items']
        items['something'] = 'something'  # add your logic
        yield items
Or use separate logic for separate pages:
import scrapy


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'

    def start_requests(self):
        reqs = (
            ('http://dmoz-odp.org/', self.parse_main),
            ('http://dmoz-odp.org/docs/en/about.html', self.parse_nav),
        )
        for link, callback in reqs:
            yield scrapy.Request(link, callback)

    def parse_main(self, response):
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
        }
        yield items

    def parse_nav(self, response):
        items = {
            'something': 'something',  # add your logic
        }
        yield items
To parse a different HTML page, yield a Request object with the target URL as its first constructor argument, and do the parsing in the spider method that you pass to that Request's constructor as the callback parameter.
I strongly encourage you to complete the Scrapy tutorial. What you are trying to achieve is covered in the Following links section.
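For instance, here is a minimal sketch of that pattern using the dmoz-odp.org pages from the question; the spider name is made up, and the About-page selector is taken from the commented-out attempt above, so treat both as assumptions.

import scrapy


class DmozAboutSpider(scrapy.Spider):
    name = 'dmoz_about'
    start_urls = ['http://dmoz-odp.org/']

    def parse(self, response):
        # Yield a Request for the other page and name the method
        # that should parse it as the callback.
        yield scrapy.Request(
            'http://dmoz-odp.org/docs/en/about.html',
            callback=self.parse_about,
        )

    def parse_about(self, response):
        # Hypothetical selector; adjust to the actual page structure.
        yield {'About_heading': response.css('#mainContent h1::text').get()}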
I'm using Elasticsearch to save Scrapy data, but when I run my code I get this error:
raise SerializationError(data, e)
elasticsearch.exceptions.SerializationError: ({'real_estate_ID': [],
It works with the other items; I only get a problem with the item real_estate_ID.
from __future__ import absolute_import
import scrapy
from adds.items import AddsItem
import stomp
from elasticsearch import Elasticsearch, helpers


class addsSpider(scrapy.Spider):
    name = "adds"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }
    allowed_domains = ["www.seloger.com"]
    start_urls = ['https://www.seloger.com/list.htm?tri=initial&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=750115']
    es = Elasticsearch('localhost:9200', use_ssl=False, verify_certs=True)

    def parse(self, response):
        es = Elasticsearch()
        es.indices.create(index="first_index", ignore=400)
        conn = stomp.Connection()
        conn.start()
        conn.connect('admin', 'password', wait=True)

        items = AddsItem()
        items['real_estate_ID'] = response.xpath('//div[@class="c-pa-list c-pa-sl c-pa-gold cartouche "]//@id').extract()
        items['real_estate_URL'] = response.xpath('//a[@class="c-pa-link link_AB"]//@href').extract()
        items['real_estate_sale_price'] = response.xpath('//div[@class="h-fi-pulse annonce__detail__sauvegarde"]//@data-prix').extract()
        items['real_estate_category'] = response.xpath('//a[@class="c-pa-link link_AB"]//@title').extract()

        for item in items['real_estate_URL']:
            conn.send(body=item, destination='/queue/scrapy.seloger.ads.queue', persistent='false')
        yield items

        nextpageurl = response.xpath('//a[@class="pagination-next"]/@href')
        if nextpageurl:
            # If we've found a pattern which matches
            path = nextpageurl.extract_first()
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))  # Write a debug statement
            yield scrapy.Request(nextpage, callback=self.parse)
        es.index(index="urls", doc_type="Ads_url", id=1, body=items)
        res = es.get(index="urls", doc_type="Ads_url", id=1)
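One hedged observation, not a confirmed answer from the thread: elasticsearch-py serializes the request body with json.dumps, and a Scrapy Item object is not JSON-serializable, which produces a SerializationError showing the item's dict-like repr, much like the one above. A minimal sketch of a possible fix is to convert the item to a plain dict before indexing (index name, doc_type and id are copied from the question):

from elasticsearch import Elasticsearch


def index_item(item, es=None):
    # Sketch only: dict(item) turns the Scrapy Item into a plain dict
    # that elasticsearch-py's JSON serializer can handle.
    es = es or Elasticsearch('localhost:9200')
    es.index(index="urls", doc_type="Ads_url", id=1, body=dict(item))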