I have this Scrapy project with its default structure and I added a test package with a test module.
So this is my structure:
scraping/
    scrapy.cfg
    crawler/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            submissions_spider.py
    test/
        __init__.py
        test_request.py
First, before starting to test with the py.test framework, I just wrote a main block inside this module that runs a new CrawlerProcess invoking my spider.
Something like this:
from scraping.crawler.spiders.submissions_spider import SubmissionsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
if __name__ == '__main__':
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(SubmissionsSpider, n=2)
    crawler.start()
This executes perfectly when debugging in PyCharm: it runs my SubmissionsSpider and then processes the items with my defined pipeline.
But when I write a simple test that just executes the same routine and run py.test, the items are never processed by my pipeline. The process scrapes the data and nothing else.
This is my new test file:
import pytest
from scraping.crawler.spiders.submissions_spider import SubmissionsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.python.failure import Failure
@pytest.fixture
def crawler():
    return CrawlerProcess(get_project_settings())


def test_crawler_execution(crawler):
    crawler.crawl(SubmissionsSpider, n=2)
    crawler.start()
I suspect it could be something in the configuration.
scrapy.cfg file:
[settings]
default = crawler.settings
[deploy]
#url = http://localhost:6800/
project = crawler
settings.py:
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'crawler.pipelines.MongoWriterPipeline': 1,
}
Any idea, folks?
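One thing I have not ruled out: whether get_project_settings() actually finds my scrapy.cfg (and therefore ITEM_PIPELINES) when py.test runs from a different working directory. Here is a minimal sketch of a check I could add, assuming the crawler package is importable from where py.test runs and that SCRAPY_SETTINGS_MODULE is the fallback Scrapy reads when scrapy.cfg is not located:

import os

from scrapy.utils.project import get_project_settings

# point Scrapy at the settings module explicitly, in case scrapy.cfg
# is not found from py.test's working directory (assumption)
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'crawler.settings')


def test_pipeline_settings_are_loaded():
    settings = get_project_settings()
    # an empty dict here would mean the project settings were never picked up
    assert settings.getdict('ITEM_PIPELINES')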
Related
I have created a simple unit test for my Flask app. The tests are saved in a different folder and then called from the manage.py file. However, when I run the tests, none of the test functions is executed.
Here is the test.py file:
from flask import current_app
import unittest
from app import create_app, db
class BasicTestCase(unittest.TestCase):
    def setUp(self):
        self.app = create_app('testing')
        self.app_context = self.app.app_context()
        self.app_context.push()

    def tearDown(self):
        db.session.remove()
        db.drop_all()
        self.app_context.pop()

    def test_app_exists(self):
        self.assertFalse(current_app is None)

    def test_app_is_testing(self):
        self.assertTrue(current_app.config['TESTING'])
And here is the manage.py file:
from app import create_app
app = create_app('default')


def test():
    import unittest
    tests = unittest.TestLoader().discover('tests')
    unittest.TextTestRunner(verbosity=2).run(tests)


if __name__ == '__main__':
    app.run(debug=True)
When I run the test() function from the command line, I get the following response.
----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK
Is there something I am doing wrong?
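For reference, a minimal sketch of how manage.py could be wired so that test() actually runs from the command line; the sys.argv dispatch is my own assumption, and discover() also needs the tests to live in a tests/ package (with an __init__.py) whose files match the default test*.py pattern:

import sys
import unittest

from app import create_app

app = create_app('default')


def test():
    # discover() only collects modules matching test*.py inside the
    # 'tests' directory, which must be importable as a package
    tests = unittest.TestLoader().discover('tests')
    unittest.TextTestRunner(verbosity=2).run(tests)


if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == 'test':
        test()
    else:
        app.run(debug=True)

With that in place, python manage.py test should at least report the discovered test cases instead of running the app.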
I have two Python crawlers that can run independently.
crawler1.py
crawler2.py
They are part of an analysis that I want to run, and I would like to import both into a common script:
from crawler1 import *
from crawler2 import *
A bit lower in my script I have something like this:
if <condition1>:
    # running crawler1
    runCrawler('crawlerName', '/dir1/dir2/')
if <condition2>:
    # running crawler2
    runCrawler('crawlerName', '/dir1/dir2/')
runCrawler is:
def runCrawler(crawlerName, crawlerFileName):
    print('Running crawler for ' + crawlerName)
    process = CP(
        settings={
            'FEED_URI': crawlerFileName,
            'FEED_FORMAT': 'csv'
        }
    )
    process.crawl(globals()[crawlerName])
    process.start()
I get the following error:
Exception has occurred: ReactorAlreadyInstalledError
reactor already installed
The first crawler runs fine; the second one fails.
Any ideas?
I run the above through the Visual Studio debugger.
The best way to do this is to use a single CrawlerRunner and one reactor: CrawlerProcess installs and starts its own reactor, so creating a second one in the same Python process fails, while CrawlerRunner leaves the reactor to you, letting both crawls share it.
Your code should be:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

# your code
settings = {
    'FEED_FORMAT': 'csv'
}
process = CrawlerRunner(settings)

if condition1:
    process.crawl(spider1, crawlerFileName=crawlerFileName)
if condition2:
    process.crawl(spider2, crawlerFileName=crawlerFileName)

d = process.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()  # it will run both crawlers and code inside the function
Your spiders should look like this:
class spider1(scrapy.Spider):
    name = "spider1"
    # note: custom_settings is read at class-definition time, so it cannot
    # reference the per-run crawlerFileName; use a fixed placeholder file here
    custom_settings = {'FEED_URI': 'spider1.csv'}

    def start_requests(self):
        yield scrapy.Request('https://scrapy.org/')

    def parse(self, response):
        pass
I have a PyQt application that uses argparse to pass some arguments.
I managed to write a simple test to see if the app starts, but I cannot set/mock the argparse arguments.
I know this because inside the code I have some try/except blocks like this:
try:
    if args.erase_data:
        pass
except NameError:
    logger.error("Error in parsing erase_data input argument \n")
which fail during the tests, while they do not fail if I run the app.
I tried this to mock args:
import os
import pathlib
# import pdb
import sys
from unittest import mock

import pytest
from PyQt5 import QtTest
from PyQt5.QtWidgets import *
from pytestqt.exceptions import capture_exceptions
from pytestqt.plugin import QtBot

sys.path.append(os.getcwd())
src_dir = pathlib.Path(__file__).parents[1].absolute()
print(src_dir)
sys.path.append(src_dir.as_posix())

GUI = __import__("GUI")


@pytest.fixture(scope="module")
def qtbot_session(qapp, request):
    result = QtBot(qapp)
    with capture_exceptions() as e:
        print(getattr(e, "message", repr(e)))
        yield result
    print(" TEARDOWN qtbot")


@pytest.fixture(scope="module")
def Viewer(request):
    with mock.patch.object(sys, "argv", ["", '-d', '2']):
        print("mocking sys argv")
        print(sys.argv)
        # pdb.set_trace()
        app, app_window = GUI.main()
        qtbotbis = QtBot(app)
        QtTest.QTest.qWait(0.5 * 1000)
        assert app_window.isVisible()
        return app, app_window, qtbotbis
But args is still not set.
Any idea how to solve it?
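For what it's worth, here is a sketch of another way I could force the arguments during the test, assuming GUI parses sys.argv with argparse somewhere inside GUI.main(); the attribute names in the Namespace (erase_data, d) are my own guesses at what the parser defines:

import argparse
from unittest import mock

GUI = __import__("GUI")  # same import as in the test module above

# a fake parse_args() result; the attribute names are assumptions
fake_args = argparse.Namespace(erase_data=False, d=2)

# every ArgumentParser.parse_args() call made inside GUI.main() now returns fake_args
with mock.patch("argparse.ArgumentParser.parse_args", return_value=fake_args):
    app, app_window = GUI.main()

If GUI only calls parse_args() under if __name__ == '__main__':, then args never exists when the module is imported by the test, which would also explain why the NameError branch is hit only during the tests.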
I am trying to run a script following these requirements:
After running the demo10.py script, the AmazonfeedSpider will crawl the product information using the generated URLs saved in Purl and save the output into the dataset2.json file.
After successfully crawling and saving data into the dataset2.json file, the ProductfeedSpider will run and grab the 5 URLs returned by the Final_Product() method of the CompareString class.
Finally, after grabbing the final product_url list from the Comparestring4 class, the ProductfeedSpider will scrape data from the returned URL list and save the result into the Fproduct.json file.
Here is the demo10.py file:
import scrapy
from scrapy.crawler import CrawlerProcess
from AmazonScrap.spiders.Amazonfeed2 import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import time
# from multiprocessing import Process

# def CrawlAmazon():


def main():
    process1 = CrawlerProcess(settings=get_project_settings())
    process1.crawl(AmazonfeedSpider)
    process1.start()
    process1.join()

    # time.sleep(20)
    process2 = CrawlerProcess(settings=get_project_settings())
    process2.crawl(ProductfeedSpider)
    process2.start()
    process2.join()


if __name__ == "__main__":
    main()
After running the file, it raises an exception saying that the dataset.json file doesn't exist. Do I need to use multiprocessing in order to create a delay between the spiders? If so, how can I implement it?
I am looking forward to hearing from experts.
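A minimal sketch of how the two runs could be isolated with multiprocessing, assuming each spider can run in its own child process and the second spider only needs the first one's output file to exist on disk before it is imported and started (module and class names taken from the demo10.py above):

import multiprocessing
from importlib import import_module

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_spider(module_path, class_name):
    # importing inside the child process avoids loading ProductfeedSpider
    # (and anything it reads at import time) before AmazonfeedSpider has run
    spider_cls = getattr(import_module(module_path), class_name)
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(spider_cls)
    process.start()


def main():
    for module_path, class_name in [
        ("AmazonScrap.spiders.Amazonfeed2", "AmazonfeedSpider"),
        ("AmazonScrap.spiders.Productfeed", "ProductfeedSpider"),
    ]:
        p = multiprocessing.Process(target=run_spider, args=(module_path, class_name))
        p.start()
        p.join()  # block until this spider's crawl (and its output file) is done


if __name__ == "__main__":
    main()

Because each CrawlerProcess lives in its own child process, each one gets its own Twisted reactor, so the ReactorNotRestartable problem never comes up.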
I am following code from these previous Stack Overflow posts:
How to schedule Scrapy crawl execution programmatically
Running Scrapy multiple times in the same process
The following script works well while using one spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
    PList = yaml.load(file, Loader=yaml.FullLoader)

Purl = []
for k, v in PList.items():
    arg = v['M_title']
    args = {"k": arg}
    amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
    Purl.append(amazon_url)

print(Purl)

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())


@inlineCallbacks
def loop_urls(urls):
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()


loop_urls(Purl)
reactor.run()
But this script doesn't even scrape successfully using the first spider, and can't access the 2nd spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

# def crawl_job():
#     """
#     Job to start spiders.
#     Return Deferred, which will execute after crawl has completed.
#     """
#     settings = get_project_settings()
#     runner = CrawlerRunner(settings)
#     return runner.crawl(AmazonfeedSpider)


def CrawlProduct():
    settings = get_project_settings()
    runner2 = CrawlerRunner(settings)
    yield runner2.crawl(ProductfeedSpider)
    reactor.stop()


def schedule_next_crawl(null, sleep_time):
    """
    Schedule the next crawl
    """
    reactor.callLater(sleep_time, CrawlProduct)


@inlineCallbacks
def loop_urls(urls):
    """
    Job to start spiders.
    Return Deferred, which will execute after crawl has completed.
    """
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()


def crawl(Purl):
    """
    A function that schedules a crawl 30 seconds after
    each successful crawl.
    """
    # loop_urls() returns a Deferred
    d = loop_urls(Purl)
    # call schedule_next_crawl(<scrapy response>, n) after crawl job is complete
    d.addCallback(schedule_next_crawl, 30)
    d.addErrback(catch_error)


def catch_error(failure):
    print(failure.value)


if __name__ == "__main__":
    with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
        PList = yaml.load(file, Loader=yaml.FullLoader)

    Purl = []
    for k, v in PList.items():
        arg = v['M_title']
        args = {"k": arg}
        amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
        Purl.append(amazon_url)

    print(Purl)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawl(Purl)
    reactor.run()
Is it because the inlineCallbacks function is not executing properly? I am looking forward to suggestions and solutions from experts; please look at the aforementioned Stack Overflow questions and solutions first, before answering my question.
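For what it's worth, here is a minimal sketch of how I imagine both crawls could be chained on one reactor with inlineCallbacks, following the multiple-spiders-in-one-process pattern from the Scrapy docs; decorating the second stage and yielding it from the same generator (instead of scheduling CrawlProduct with callLater) are my own assumptions:

from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from AmazonScrap.spiders.Productfeed import ProductfeedSpider


@inlineCallbacks
def run_all(urls):
    runner = CrawlerRunner(get_project_settings())
    # first stage: crawl every generated Amazon URL sequentially
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # second stage only starts after every AmazonfeedSpider crawl has
    # finished, i.e. after its JSON output has been written
    yield runner.crawl(ProductfeedSpider)
    reactor.stop()


if __name__ == "__main__":
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    Purl = []  # built from Product_List.yaml exactly as in the question
    run_all(Purl)
    reactor.run()

Note that in my original script CrawlProduct contains a yield but is not decorated with inlineCallbacks, so calling it from reactor.callLater only creates a generator that never runs; the sketch above avoids that by keeping both stages in one decorated generator.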