I'm trying to execute the code below using the latest version of Scrapy, but I don't know what's happening:
import scrapy
from scrapy.Spider import Basespider

class crawler(Basespider):
    name = "crawler"
    allowed_domains = ['google.com']
    start_urls = ["https://www.google.com"]

    def parse(self, response):
        hxs = Selector(response)
BaseSpider is from Scrapy 0.16.5. If you have the newest version, use another spider class; this one is obsolete.
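For recent Scrapy releases, a minimal sketch of the same spider built on scrapy.Spider (the XPath and yielded field here are just placeholders for illustration):

import scrapy

class CrawlerSpider(scrapy.Spider):
    name = "crawler"
    allowed_domains = ["google.com"]
    start_urls = ["https://www.google.com"]

    def parse(self, response):
        # the response object already supports .css() and .xpath(),
        # so there is no need to build a Selector by hand
        title = response.xpath("//title/text()").get()
        yield {"title": title}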
I'm currently developing an application using Scrapy.
I want to extract some values with a CSS selector outside of def parse, so I create an HtmlResponse object first and try to get the values using css(), but I can't get anything back.
Within def parse, I can get the values the same way.
What should I do to make this work outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    my_response = HtmlResponse(url=start_urls[0])
    print('HtmlResponse')
    print(my_response)
    h3s = my_response.css('h3')
    print(str(len(h3s)))
    print('----------')

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)
        h3s = response.css('h3')
        print(str(len(h3s)))
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
Update
The program I ultimately want to create is the following code.
[Note: the code below does not work; it is shown only for reference]
import scrapy
from scrapy.http import HtmlResponse

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = []

    response_url = 'https://sample.com/search'
    my_response = HtmlResponse(url=response_url)
    categories = my_response.css('.categories a::attr(href)').getall()
    for category in categories:
        start_urls.append(category)

    def parse(self, response, **kwargs):
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0
I see what you mean: your start URL is the base search page, but you also want to fetch all the category pages and extract the h3 elements from each. The HtmlResponse you build in the class body is never actually downloaded (its body is empty), which is why its css() call finds nothing.
In Scrapy you can extract data and follow new links in the same parse method. Here is an example:
import scrapy

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)
        # extract data here
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
            yield {'title': page.css('::text').get()}  # yield a dict, not a bare string
        # follow new links here
        categories = response.css('.categories a::attr(href)').getall()
        for category in categories:
            # response.follow resolves relative hrefs against the current URL
            yield response.follow(category, callback=self.parse)
You can read the Scrapy documentation for more information.
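If you would rather keep the category discovery and the h3 extraction in separate callbacks, a rough sketch along the lines below should also work; the spider name and the selectors are carried over from the question and may need adjusting for the real site:

import scrapy

class SampleCategorySpider(scrapy.Spider):
    name = 'sample_categories'
    allowed_domains = ['sample.com']

    def start_requests(self):
        # request the search page first; its category links seed the rest of the crawl
        yield scrapy.Request('https://sample.com/search', callback=self.parse_search)

    def parse_search(self, response):
        for href in response.css('.categories a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_category)

    def parse_category(self, response):
        for page in response.css('h3'):
            yield {'title': page.css('::text').get()}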
I am trying to implement a spider in Scrapy, and I get an error when I run it. I have tried several things but couldn't resolve it. The error is as follows:
runspider: error: Unable to load 'articleSpider.py': No module named 'wikiSpider.wikiSpider'
I'm still learning Python as well as the Scrapy package, but I think this has to do with importing a module from a different directory, so I have included the directory tree of the virtual environment I created in PyCharm in the image below.
Also note that I am using Python 3.9 as the interpreter for my virtual environment.
The code I am using for the spider is as follows:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent'
                  '_dictator_for_life']
    rules = [Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),
                  callback='parse_items', follow=True)]

    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id='
                                         '"mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '')
        return article
and this is the code in the items file that the failing import refers to:
import scrapy

class Article(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    lastUpdated = scrapy.Field()
The problem is the first wikiSpider in from wikiSpider.wikiSpider.items import Article: that is the outer project folder. Change that folder's name, and then edit the import to
from wikiSpider.items import Article
Solved.
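For context, a sketch of the import after the fix, assuming the standard layout that scrapy startproject generates (the directory containing scrapy.cfg is the project root, and items.py lives in the inner wikiSpider package):

# Assumed layout (standard `scrapy startproject` output):
#   <project root>/scrapy.cfg
#   <project root>/wikiSpider/items.py
#   <project root>/wikiSpider/spiders/articleSpider.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.items import Article  # single-level import, resolved from the project root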
I'm trying to develop a web-scraping project in which I am scraping a website called Startup India, which you can use for connecting with startups. I selected some filters and got a list of startups, and I need to click on each startup and go inside its page to scrape it. But I can't scrape the data, because I'm not able to capture the response for the startup profiles on Startup India.
import scrapy
from selenium import webdriver
import os
import logging

class ProductSpider(scrapy.Spider):
    name = "product_spider"
    allowed_domains = ['https://www.startupindia.gov.in/']
    start_urls = ['https://www.startupindia.gov.in/content/sih/en/search.html?industries=sih:industry/advertising&states=sih:location/india/andhra-pradesh&stages=Prototype&roles=Startup&page=0']

    def __init__(self):
        cwd = os.getcwd()
        self.driver = webdriver.Chrome("C:/Users/RAJ/PycharmProjects/WebCrawler/WebCrawler/WebCrawler/spiders/chromedriver.exe")

    def parse(self, response):
        self.driver.get(response.url)
        next = self.driver.find_elements_by_xpath("//*[@id='persona-results']//a[@class='img-wrap']")
        logging.info(next)
        for i in next:
            try:
                logging.info(i.click())
                logging.info(response.url)
                # get the data and write it to scrapy items
            except:
                print("Yolo")
Any code would be appreciated.
I have set up a Scrapy project and run scrapy crawl product_spider, and it gives the URL of the new tab that opens after clicking on an element.
import scrapy
from selenium import webdriver
import os
import logging
from selenium.webdriver.chrome.options import Options as ChromeOptions

CHROME_DRIVER_UBUNTU_PATH = "your chrome driver path"

class ProductSpider(scrapy.Spider):
    name = "product_spider"
    allowed_domains = ['https://www.startupindia.gov.in/']
    start_urls = [
        'https://www.startupindia.gov.in/content/sih/en/search.html?industries=sih:industry/advertising&states=sih:location/india/andhra-pradesh&stages=Prototype&roles=Startup&page=0']

    def __init__(self):
        cwd = os.getcwd()
        opts = ChromeOptions()
        opts.add_argument("--headless")  # headless mode; not strictly necessary
        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_UBUNTU_PATH, chrome_options=opts)

    def parse(self, response):
        self.driver.get(response.url)
        next = self.driver.find_elements_by_xpath("//*[@id='persona-results']//a[@class='img-wrap']")
        for i in next:
            try:
                i.click()  # click on the image in the page
                # switch to the newly opened tab
                self.driver.switch_to.window(self.driver.window_handles[next.index(i) + 1])
                logging.info(self.driver.current_url)
                self.driver.switch_to.window(self.driver.window_handles[0])
                # get the data and write it to scrapy items
            except Exception as e:
                print(e)
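One small addition worth considering (it is not part of the original answer): Scrapy calls a spider's closed() method when the crawl finishes, which is a convenient place to shut the Selenium driver down. Adding this method inside ProductSpider:

    def closed(self, reason):
        # quit the browser once the spider has finished
        self.driver.quit()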
Hi, I am new to Scrapy and I am trying to extract text from the links on a given webpage. Here is the code I wrote for that; after running scrapy crawl article, it gives "no module named article". Can you help me find where I went wrong? Thanks in advance.
import scrapy
from urllib.parse import urljoin

class ArticleSpider(scrapy.Spider):
    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com/business']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        links = response.css('span.w_tle a::attr(href)').extract()
        for link in links:
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        for info in response.css('div.article_content clearfix'):
            yield {'Article': info.css('div.Normal::text').extract()}
If you take a look at your log you'll see 'offsite/filtered': 211, and that is the cause of not getting anything: your allowed_domains entry contains a path ('/business'), so the hostnames of the article links never match it and the requests are filtered as offsite. To get around this you can do one of two things:
Remove the allowed_domains field, or
Add dont_filter=True to your request, like:
yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)
I tested your code and it does not seem to work properly if you want to get the text body, so I rewrote it with XPath, which I am more comfortable with.
import scrapy
from urllib.parse import urljoin

class ArticleSpider(scrapy.Spider):
    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        links = response.xpath('//*[@id="c_listing_wdt_1"]//span[1]/a/@href').getall()
        for link in links:
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        print(response.xpath('//*[@id="content"]//arttextxml//div//text()').getall())
        for info in response.xpath('//*[@id="content"]//arttextxml//div//text()').getall():
            yield {'Article': info}
getall() can be used instead of extract(); on a SelectorList they return the same list of strings, the newer name is just more explicit about always returning a list.
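For illustration, the two spellings side by side inside a spider callback where response is available ('h3::text' is just an example selector):

# old and new names return the same list of strings on a SelectorList
titles_new = response.css('h3::text').getall()
titles_old = response.css('h3::text').extract()

# for a single result, .get() is the newer spelling of .extract_first()
first_title = response.css('h3::text').get()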
I am very happy to have discovered the Scrapy CrawlSpider class with its Rule objects. However, when I try to skip URLs that contain the word "login" with process_links, it doesn't work. The solution I implemented comes from here: Example code for Scrapy process_links and process_request, but it doesn't exclude the pages I want.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from accenture.items import AccentureItem

class AccentureSpiderSpider(CrawlSpider):
    name = 'accenture_spider'
    start_urls = ['https://www.accenture.com/us-en/internet-of-things-index']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@href, "insight")]'),
             callback='parse_item', process_links='process_links', follow=True),
    )

    def process_links(self, links):
        for link in links:
            if 'login' in link.text:
                continue  # skip all links that have "login" in their text
            yield link

    def parse_item(self, response):
        loader = ItemLoader(item=AccentureItem(), response=response)
        url = response.url
        loader.add_value('url', url)
        yield loader.load_item()
My mistake was using link.text.
When using link.url instead, it works fine :)
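For completeness, a sketch of the corrected process_links inside the same spider, filtering on link.url rather than link.text:

    def process_links(self, links):
        for link in links:
            if 'login' in link.url:
                continue  # skip any link whose URL contains "login"
            yield link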