Scrapy script not scrapping items - python-3.x

Im not sure why my script isnt scrapping any items, is the same script that im using for another website, maybe the classes im using are wrong.
`
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime
date = datetime.now().strftime("%d_%m_%Y")
class stiendaSpider(scrapy.Spider):
name = 'stienda'
start_urls = ['https://stienda.uy/tv']
def parse(self, response):
for products in response.css('.grp778'):
price = products.css('.precioSantander::text').get()
name = products.css('#catalogoProductos .tit::text').get()
if price and name:
yield {'name': name.strip(),
'price': price.strip()}
os.chdir('C:\\Users\\cabre\\Desktop\\scraping\\stienda\\data\\raw')
process = CrawlerProcess(
# settings={"FEEDS": {"items.csv": {"format": "csv"}}}
settings={"FEEDS": {f"stienda_{date}.csv": {"format": "csv"}}}
)
process.crawl(stiendaSpider)
process.start()
`
I tried several but I dont usnderstand why is not working..

I was able to get the name field, but the price attribute is rendered empty and filled in later from an ajax call. That is why it's not being extracted.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime
date = datetime.now().strftime("%d_%m_%Y")
class stiendaSpider(scrapy.Spider):
name = 'stienda'
start_urls = ['https://stienda.uy/tv']
def parse(self, response):
for products in response.xpath('//div[#data-disp="1"]'):
name = products.css('.tit::text').get()
if name:
yield {'name': name.strip()}
You can see it if you look at the page source... all of the elements with the class 'precioSantander' are empty.

Related

How to use css selector in object from HtmlResponse

I'm currently developing an application using Scrapy.
I want to get some value using CSS selector out of def parse, So I create a HtmlResponse object first and tried to get some value using css(), But I can't get any value...
Within def parse, I can get the value in the same way.
What should I do if it is outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
name = 'sample'
allowed_domains = ['sample.com']
start_urls = ['https://sample.com/search']
my_response = HtmlResponse(url=start_urls[0])
print('HtmlResponse')
print(my_response)
h3s = my_response.css('h3')
print(str(len(h3s)))
print('----------')
def parse(self, response, **kwargs):
print('def parse')
print(response)
h3s = response.css('h3')
print(str(len(h3s)))
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
update
The program I want to finally create is the following code:
[ (Note) The code below does not work for reference ]
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
name = 'sample'
allowed_domains = ['sample.com']
start_urls = []
response_url = 'https://sample.com/search'
my_response = HtmlResponse(url=response_url)
categories = my_response.css('.categories a::attr(href)').getall()
for category in categories:
start_urls.append(category)
def parse(self, response, **kwargs):
pages = response.css('h3')
for page in pages:
print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0
I know what do you mean,your start url is the basic domain,but you also want to fetch all category page to extract h3.
in scrapy you can extract data and follow new links in the same parse method,here is a example.
import scrapy
class SampleSpider(scrapy.Spider):
name = 'sample'
allowed_domains = ['sample.com']
start_urls = ['https://sample.com/search']
def parse(self, response, **kwargs):
print('def parse')
print(response)
pages = response.css('h3')
#extract data at here
for page in pages:
print(page.css('::text').get())
yield page.css('::text').get()
#follow new links here
categories = response.css('.categories a::attr(href)').getall()
for category in categories:
yield scrapy.Request(category,callback=self.parse)
you can read scrapy document for more information

No Module name 'scrapy.Spider' found

Try to excute the code below and using the latest version of scrapy. Don't know what happen
import scrapy
from scrapy.Spider import Basespider
class crawler (Basespider):
name = "crawler"
allowed_domains = ['google.com']
start_urls = ["https://www.google.com"]
def parse(self, response):
hxs = Selector(response)
BaseSpide is from scrapy 0.16.5. If you have the newest version, then use another spider. This one is obsolete.

Extract articles from its corresponding links from a webpage using scrapy

Hi I am new to scrapy and I am Trying to extract text from links in a given webpage. Here is the code I wrote for the same and after running scrapy crawl article, it gives no module named article. Can you help me find where I am wrong? Thanks in advance.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
name = 'article'
allowed_domains = ['www.timesofindia.indiatimes.com/business']
start_urls = ['https://timesofindia.indiatimes.com/business']
def parse(self, response):
links = response.css('span.w_tle a::attr(href)').extract()
for link in links:
url = urljoin(response.url, link)
yield scrapy.Request(url,callback=self.parse_article)
def parse_article(self,response):
for info in response.css('div.article_content clearfix'):
yield {'Article':info.css('div.Normal::text').extract()}
If you take a look at your log you'll see 'offsite/filtered': 211, and that the cause of not getting anything. In order to dodge this you can do two things:
Remove allowed_domains field
Add dont_filter=True in your request like:
yield scrapy.Request(url,callback=self.parse_article, dont_filter=True)
I tested your code it does not seems to work properly if you want to get text body so i rewrote it with XPath which I am more comfortable with.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
name = 'article'
allowed_domains = ['www.timesofindia.indiatimes.com']
start_urls = ['https://timesofindia.indiatimes.com/business']
def parse(self, response):
links = response.xpath('//*[#id="c_listing_wdt_1"]//span[1]/a/#href').getall()
for link in links:
url = urljoin(response.url, link)
yield scrapy.Request(url,callback=self.parse_article, dont_filter=True)
def parse_article(self, response):
print(response.xpath('//*[#id="content"]//arttextxml//div//text()').getall())
for info in response.xpath('//*[#id="content"]//arttextxml//div//text()').getall():
yield {'Article':info}
getall() can be used instead of extract(), they are almost equal.

Downloading files with ItemLoaders() in Scrapy

I created a crawl spider to download files. However the spider downloaded only the urls of the files and not the files themselves. I uploaded a question here Scrapy crawl spider does not download files? . While the the basic yield spider kindly suggested in the answers works perfectly, when I attempt to download files with items or item loaders the spider does not work! The original question does not include the items.py. So there it is:
ITEMS
import scrapy
from scrapy.item import Item, Field
class DepositsusaItem(Item):
# main fields
name = Field()
file_urls = Field()
files = Field()
# Housekeeping Fields
url = Field()
project = Field()
spider = Field()
server = Field()
date = Field()
pass
EDIT: added original code
EDIT: further corrections
SPIDER
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime
import socket
from us_deposits.items import DepositsusaItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
from urllib.parse import urljoin
class DepositsSpider(CrawlSpider):
name = 'deposits'
allowed_domains = ['doi.org']
start_urls = ['https://minerals.usgs.gov/science/mineral-deposit-database/#products', ]
rules = (
Rule(LinkExtractor(restrict_xpaths='//*[#id="products"][1]/p/a'),
callback='parse_x'),
)
def parse_x(self, response):
i = ItemLoader(item=DepositsusaItem(), response=response)
i.add_xpath('name', '//*[#class="container"][1]/header/h1/text()')
i.add_xpath('file_urls', '//span[starts-with(#data-url, "/catalog/file/get/")]/#data-url',
MapCompose(lambda i: urljoin(response.url, i))
)
i.add_value('url', response.url)
i.add_value('project', self.settings.get('BOT_NAME'))
i.add_value('spider', self.name)
i.add_value('server', socket.gethostname())
i.add_value('date', datetime.datetime.now())
return i.load_item()
SETTINGS
BOT_NAME = 'us_deposits'
SPIDER_MODULES = ['us_deposits.spiders']
NEWSPIDER_MODULE = 'us_deposits.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'us_deposits.pipelines.UsDepositsPipeline': 1,
'us_deposits.pipelines.FilesPipeline': 2
}
FILES_STORE = 'C:/Users/User/Documents/Python WebCrawling Learning Projects'
PIPELINES
class UsDepositsPipeline(object):
def process_item(self, item, spider):
return item
class FilesPipeline(object):
def process_item(self, item, spider):
return item
It seems to me that using items and/or item loaders has nothing to do with your problem.
The only problems I see are in your settings file:
FilesPipeline is not activated (only us_deposits.pipelines.UsDepositsPipeline is)
FILES_STORE should be a string, not a set (an exception is raised when you activate the files pipeline)
ROBOTSTXT_OBEY = True will prevent the downloading of files
If I correct all of those issues, the file download works as expected.

Can't get rid of blank rows in csv output

I've written a very tiny script in python scrapy to parse name, street and phone number displayed across multiple pages from yellowpage website. When I run my script i find it working smoothly. However, the only problem i encounter is the way data are getting scraped in csv output. It is always a line (row) gap between two rows. What I meant is: data are getting printed in every other row. Seeing the picture below you will get to know what I meant. If it were not for scrapy, I could have used [newline='']. But, unfortunately I am totally helpless here. How can i get rid of blank lines coming along in the csv output? Thanks in advance to take a look into it.
items.py includes:
import scrapy
class YellowpageItem(scrapy.Item):
name = scrapy.Field()
street = scrapy.Field()
phone = scrapy.Field()
Here is the spider:
import scrapy
class YellowpageSpider(scrapy.Spider):
name = "YellowpageSp"
start_urls = ["https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=Los%20Angeles%2C%20CA&page={0}".format(page) for page in range(2,6)]
def parse(self, response):
for titles in response.css('div.info'):
name = titles.css('a.business-name span[itemprop=name]::text').extract_first()
street = titles.css('span.street-address::text').extract_first()
phone = titles.css('div[itemprop=telephone]::text').extract_first()
yield {'name': name, 'street': street, 'phone':phone}
Here is how the csv output looks like:
Btw, the command I'm using to get csv output is:
scrapy crawl YellowpageSp -o items.csv -t csv
You can fix it by creating a new FeedExporter. Change your settings.py as below
FEED_EXPORTERS = {
'csv': 'project.exporters.FixLineCsvItemExporter',
}
create a exporters.py in your project
exporters.py
import io
import os
import six
import csv
from scrapy.contrib.exporter import CsvItemExporter
from scrapy.extensions.feedexport import IFeedStorage
from w3lib.url import file_uri_to_path
from zope.interface import implementer
#implementer(IFeedStorage)
class FixedFileFeedStorage(object):
def __init__(self, uri):
self.path = file_uri_to_path(uri)
def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
return open(self.path, 'ab')
def store(self, file):
file.close()
class FixLineCsvItemExporter(CsvItemExporter):
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
super(FixLineCsvItemExporter, self).__init__(file, include_headers_line, join_multivalued, **kwargs)
self._configure(kwargs, dont_fail=True)
self.stream.close()
storage = FixedFileFeedStorage(file.name)
file = storage.open(file.name)
self.stream = io.TextIOWrapper(
file,
line_buffering=False,
write_through=True,
encoding=self.encoding,
newline="",
) if six.PY3 else file
self.csv_writer = csv.writer(self.stream, **kwargs)
I am on Mac, so can't test its windows behavior. But if above doesn't work then change below part of code and set newline="\n"
self.stream = io.TextIOWrapper(
file,
line_buffering=False,
write_through=True,
encoding=self.encoding,
newline="\n",
) if six.PY3 else file

Resources