How to customize ItemExporter to not override output file on Scrapy - python-3.x

I'm writing a simple web crawler with Scrapy that captures data from two different websites. You can find all my files here.
Basically I have a main.py file:
#!/usr/bin/env python
import scrapy
from app.spiders.spider_maquinas import VultrSpider, DigitalOceanSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class Main:
    """Schedule both spiders on one CrawlerProcess and run them."""

    def __init__(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        # crawl() expects the spider class (or a Crawler); Scrapy creates
        # the spider instance itself, and recent releases reject instances.
        process.crawl(VultrSpider)
        process.crawl(DigitalOceanSpider)
        process.start()
if __name__ == '__main__':
    # Constructing Main runs the whole crawl from its __init__.
    main = Main()
And these are the pipelines for my two spiders:
import json
from scrapy.exporters import JsonItemExporter, CsvItemExporter
class CustomJsonExporter(JsonItemExporter):
    """JSON exporter whose newline hook always writes a line break."""

    def _beautify_newline(self):
        # The exporter's file is opened in binary mode, hence the bytes literal.
        self.file.write(b'\n')
class PrintItem:
    """Pipeline that prints every item and passes it on unchanged."""

    def process_item(self, item, spider):
        """Print *item* as a plain dict and return it for the next pipeline."""
        print(dict(item))
        return item
class JsonPipeline:
    """Export the items of a spider to static/maquinas.json."""

    def open_spider(self, spider):
        # NOTE: 'wb' truncates the file every time a spider is opened, so
        # spiders sharing this pipeline write over each other's output.
        self.file = open('static/maquinas.json', 'wb')
        self.exporter = CustomJsonExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CsvPipeline:
    """Export the items of a spider to static/maquinas.csv."""

    def open_spider(self, spider):
        # NOTE: 'wb' truncates the file every time a spider is opened, so
        # spiders sharing this pipeline write over each other's output.
        self.file = open('static/maquinas.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        # Fixes both the set and the order of the exported columns.
        self.exporter.fields_to_export = ['storage', 'cpu', 'memory', 'bandwidth', 'price']
        # Mirror JsonPipeline so start/finish calls are always paired.
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
But the second spider overwrites the values written by the first. Do I need to write new pipelines and change this manually so the data is not overwritten, or is there an easier way to do this? Thank you!

Related

Is there a fast and robust way to unittest operations on files?

I’m writing a small pywin32 program that converts text files to docx/pdf, and that’s all chill and working. BUT… I haven’t figured out a way to unit-test the converter that would not require creating temp files. The process is really slow.
Is there a commonly accepted good practice on unittesting operations on files?
The converter + unittests below.
# file converter.py
import win32com.client as win32
class Converter:
    """Convert documents to docx/pdf by driving a hidden Word instance."""

    def __init__(self):
        self.word = win32.Dispatch('Word.Application')
        self.word.Visible = False
        # Numeric values passed as FileFormat to SaveAs.
        self._word_format = {
            'docx': 12,
            'pdf': 17
        }

    @property
    def word_format(self):
        """Mapping of target extension to the FileFormat value for SaveAs."""
        return self._word_format

    def open_and_close(func):
        """Decorator: open *fpath* in Word, run *func*, then close the document."""
        def wrapper(self, fpath):
            doc = self.word.Documents.Open(str(fpath))
            try:
                doc.Activate()
                return func(self, fpath)
            finally:
                # Close even when the conversion raises, so no document stays open.
                doc.Close()
        return wrapper

    @open_and_close
    def to_docx(self, fpath):
        """Save the document at *fpath* next to it with a .docx extension."""
        # NOTE: replaces every '.doc' occurrence in the path, not only the suffix.
        new_fpath = str(fpath).replace('.doc', '.docx')
        self.word.ActiveDocument.SaveAs(new_fpath, FileFormat=self.word_format['docx'])

    @open_and_close
    def to_pdf(self, fpath):
        """Save the document at *fpath* next to it with a .pdf extension."""
        # NOTE: replaces every '.doc' occurrence in the path, not only the suffix.
        new_fpath = str(fpath).replace('.doc', '.pdf')
        self.word.ActiveDocument.SaveAs(new_fpath, FileFormat=self.word_format['pdf'])

    def __del__(self):
        # __init__ may have failed before `word` was assigned.
        word = getattr(self, 'word', None)
        if word is not None:
            word.Quit()
# file test_converter.py
import unittest
import tempfile
import os
from pathlib import Path
from converter import Converter
class TestConverterToDocx(unittest.TestCase):
    """Convert five temporary .doc files and count the files produced."""

    @classmethod
    def setUpClass(cls):
        # One Converter is shared by every test in the class.
        cls.converter = Converter()

    def setUp(self):
        self.tempdir = tempfile.TemporaryDirectory()
        for _ in range(5):
            # delete=False keeps the file on disk; the `with` closes the
            # handle, which the original left open.
            with tempfile.NamedTemporaryFile(dir=self.tempdir.name,
                                             suffix='.doc', delete=False):
                pass

    def tearDown(self):
        self.tempdir.cleanup()

    def test_to_docx(self):
        for root, dirs, files in os.walk(self.tempdir.name):
            for file in files:
                self.converter.to_docx(str(Path(root) / file))
        files = [file for file in os.listdir(self.tempdir.name)
                 if file.endswith('.docx')]
        self.assertEqual(len(files), 5)

    def test_to_pdf(self):
        for root, dirs, files in os.walk(self.tempdir.name):
            for file in files:
                self.converter.to_pdf(str(Path(root) / file))
        files = [file for file in os.listdir(self.tempdir.name)
                 if file.endswith('.pdf')]
        self.assertEqual(len(files), 5)

i want to implement progress bar for downloading git repo in pyqt5

How can I show a progress bar while cloning a Git repository in PyQt5?
git.Repo.clone_from('https://github.com/addddd123/Osdag', '_new_update')
You have to execute the task in another thread, connect to the callback and send the progress information through signals:
import threading
import sys
from dataclasses import dataclass, field
from typing import List, Optional, Any, Mapping, Dict
from PyQt5 import QtCore, QtWidgets
import git
class GitReply(QtCore.QObject):
    """Base QObject type for the reply objects tracked by RepoManager."""
@dataclass
class GitCloneReply(GitReply):
    """Clone a repository on a worker thread and report progress via signals."""

    progress_changed = QtCore.pyqtSignal(int)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()
    url: str
    path: str
    env: Optional[Mapping[str, Any]] = None
    multi_options: Optional[List[str]] = None
    kwargs: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # The generated dataclass __init__ does not call QObject.__init__.
        super().__init__()

    def start(self):
        """Start the clone on a daemon thread."""
        threading.Thread(target=self._execute, daemon=True).start()

    def _execute(self):
        self.started.emit()
        try:
            git.Repo.clone_from(
                self.url,
                self.path,
                self.callback,
                self.env,
                self.multi_options,
                **self.kwargs
            )
        finally:
            # Emit even when the clone raises, so listeners are always told
            # the operation is over.
            self.finished.emit()

    def callback(self, op_code, cur_count, max_count=None, message=""):
        """Progress handler passed to clone_from; emits a 0-100 percentage."""
        # max_count defaults to None (and could be 0); no percentage can be
        # computed then, so skip the update instead of raising.
        if not max_count:
            return
        self.progress_changed.emit(int((cur_count / max_count) * 100))
@dataclass
class RepoManager(QtCore.QObject):
    """Start clone operations and hold their replies until they finish."""

    _replies: List[GitReply] = field(init=False, default_factory=list)

    def __post_init__(self):
        # The generated dataclass __init__ does not call QObject.__init__.
        super().__init__()

    def clone_from(self, url, path, env=None, multi_options=None, **kwargs):
        """Start cloning *url* into *path* and return the reply object."""
        reply = GitCloneReply(url, path, env, multi_options, kwargs)
        reply.finished.connect(self.handle_finished)
        reply.start()
        # Keep a reference to the reply until handle_finished removes it.
        self._replies.append(reply)
        return reply

    def handle_finished(self):
        """Forget the reply whose `finished` signal triggered this slot."""
        reply = self.sender()
        if reply in self._replies:
            self._replies.remove(reply)
def main():
    """Show a progress bar driven by the clone's progress_changed signal."""
    app = QtWidgets.QApplication(sys.argv)
    progressbar = QtWidgets.QProgressBar()
    progressbar.show()
    manager = RepoManager()
    reply = manager.clone_from("https://github.com/addddd123/Osdag", "_new_update")
    reply.progress_changed.connect(progressbar.setValue)
    ret = app.exec_()
    sys.exit(ret)
if __name__ == "__main__":
    main()
```

Access to an instanced class from an other script run by first

I'm trying to access, from secondscript.py, a class instance created by firstscript.py.
I run secondscript.py from firstscript.py with os.system('sudo secondscript.py') and I want to access the namespace of firstscript.py. For example, I have the following code:
# firstscript.py
import wx
import abcd
class MyFrame(abcd):
    """Frame with a logging text control and a helper to launch secondscript.py."""

    def __init__(self, *args, **kwds):
        abcd.__init__(self, *args, **kwds)

    def log(self, str_log):
        """Append *str_log* plus a newline to the logging text control."""
        self.text_ctrl_logging.SetForegroundColour(wx.BLACK)
        self.text_ctrl_logging.AppendText(str_log+'\n')

    def runsudo(self):
        """Run secondscript.py through sudo as a separate process."""
        # firstscript.py never imports os at module level.
        import os
        os.system('sudo ./secondscript.py')
class fghi(wx.App):
    """Application object that creates and shows the MyFrame window."""

    def OnInit(self):
        self.frame = MyFrame(None, wx.ID_ANY, "")
        self.SetTopWindow(self.frame)
        self.frame.Show()
        return True
if __name__ == "__main__":
    app = fghi(0)
    app.MainLoop()
and the following is the secondscript.py
from firstscript import MyFrame
# log() is called on the class, not on an instance, so "HELLO" is taken as
# `self` and `str_log` is left unfilled.
MyFrame.log("HELLO")
From secondscript.py I want to print a string to text_ctrl_logging using the log method.
But the following is the error:
TypeError: log() missing 1 required positional argument: 'str_log'
I'm sorry, I'm a newbie, so please be patient with this basic question.

cloud pickle current class with all its variable values

I have a sample program in a file named myAutocomplete.py and want to get back an instance of the class:
import cloudpickle
class Autocomplete(object):
    """Sample class holding one string, with cloudpickle save/load helpers."""

    def __init__(self):
        self.test = " Test String "

    def assign_value(self):
        """Set the stored string to "Hello" and return it."""
        self.test = "Hello"
        return self.test

    def get_value(self):
        """Return the stored string."""
        return self.test

    @staticmethod
    def save(path):
        """Pickle to *path*."""
        with open(path, "wb") as f:
            # NOTE: this dumps the Autocomplete class itself, not an instance,
            # so values assigned on an instance are not written.
            cloudpickle.dump(Autocomplete, f)

    @staticmethod
    def load(path):
        """Return whatever object was pickled at *path*."""
        with open(path, "rb") as f:
            test = cloudpickle.load(f)
        return test
and another file with name main.py
from myAutocomplete import Autocomplete
if __name__ == '__main__':
    testobj = Autocomplete()
    testobj.assign_value()
    testobj.save("test.pkl")
    test = testobj.load("test.pkl")
    # load() returns what save() pickled; it is called here to build an instance.
    print(test().get_value())
I am expecting the output to be
"Hello"
but I am getting
"Test String"
Please help.

Drag and drop in QTreeView fails with items that hold a QImage

I have a list of items in a QTreeView. Each item holds a QImage object. If I try to drag and drop the item, the program freezes. But when I comment out the line objMod._Image = QImage(flags = Qt.AutoColor), the program runs fine.
How can I drag and drop the items with the QImage object? The QImage holds an image which is rendered. The rendering process takes a while, so it would be nice to keep the QImage object.
import sys
import os
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtUiTools import *
from PIL import Image, ImageCms, ImageQt
class ObjModel:
    """Plain Python holder for one image object."""

    def __init__(self):
        self._Image = None
class DragMoveTest(QMainWindow):
    """Main window with a tree view whose items carry an ObjModel as data."""

    def __init__(self):
        super(DragMoveTest,self).__init__()
        self.initGUI()
        self.show()

    def initGUI(self):
        """Build the tree view, fill it with four items and enable internal moves."""
        self.treeView = QTreeView()
        modelTreeView = QStandardItemModel()
        self.treeView.setModel(modelTreeView)
        for i in range(0, 4):
            objMod = ObjModel()
            objMod._Image = None
            # This is the line that makes drag and drop freeze.
            objMod._Image = QImage(flags = Qt.AutoColor)
            item = QStandardItem('Test: %s' % str(i))
            item.setData(objMod, Qt.UserRole + 1)
            modelTreeView.invisibleRootItem().appendRow(item)
        self.treeView.setDragDropMode(QAbstractItemView.InternalMove)
        self.setCentralWidget(self.treeView)
def main(args):
    """Create the application and the test window, then run the event loop."""
    app = QApplication(sys.argv)
    # Bound to a name so the window object stays referenced while the loop runs.
    qt_main_wnd = DragMoveTest()
    ret = app.exec_()
    sys.exit(ret)
if __name__ == "__main__":
    main(sys.argv)
This is caused by a bug in PySide. During a drag and drop operation, the data in the dragged item must be serialized. This will be handled by Qt for most data-types, but for types that are specific to Python, special handling is required. This special handling seems to be broken in PySide. If your example is converted to PyQt, a TypeError is raised when trying to drag items, but the program does not freeze.
The source of the problem is that you are storing data using a custom Python class. PyQt uses pickle to serialize custom data-types, but it is not possible to also pickle the QImage that is stored in its __dict__, so the operation fails. I assume PySide must attempt something similar, but for some reason it does not raise an error when it fails. Qt grabs the mouse whilst dragging, so if the operation fails abnormally, it won't be released again, and the program will appear to freeze.
The simplest way to fix this is to avoid using a custom class to hold the QImage, and instead store the image directly in the item:
# Store the QImage itself as the item data instead of wrapping it in a custom class.
image = QImage()
item = QStandardItem('Test: %s' % i)
item.setData(image, Qt.UserRole + 1)
To store more data items, you can either use a different data-role for each one, or use a dict to hold them all:
# Several values held in one dict; every key is a string.
data = {'image': QImage(), 'title': 'foo', 'timestamp': 1756790}
item.setData(data, Qt.UserRole + 1)
However, if you do this, you must always use string keys in the dict, otherwise you will face the same problems as before. (Using string keys means the dict can be converted into a QMap, which Qt knows how to serialize).
(NB: if you want to know whether a Qt class can be serialized, check the docs to see whether it defines the datastream operators).
I came up with a different solution. It is easier to have an object that holds an io.BytesIO. You store the image data in the BytesIO variable. Depending on your image library, you can then open the image from the BytesIO variable.
In the demo, the class ObjModel can handle both QImage and Image from Pillow/PIL. If you use the set methods, the image object will be converted into a BytesIO.
In short, here is a working example:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import io
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtUiTools import *
from PIL import Image, ImageCms, ImageQt
########################################################################
class ObjModel:
    """Hold encoded image bytes in an io.BytesIO and convert to/from image objects."""

    def __init__(self):
        """Start without any image data."""
        self._ImageByteIO = None

    def getObjByte(self):
        """Return the underlying io.BytesIO, or None when nothing is stored."""
        return self._ImageByteIO

    def getQImage(self):
        """Return the stored bytes decoded as a QImage, or None on failure."""
        if self._ImageByteIO is None:
            return None
        try:
            self._ImageByteIO.seek(0)
            return QImage.fromData(self._ImageByteIO.getvalue())
        except Exception:
            # Best-effort getter: any decoding problem yields None.
            return None

    def getPILImage(self):
        """Return the stored bytes opened as a PIL image, or None on failure."""
        if self._ImageByteIO is None:
            return None
        try:
            self._ImageByteIO.seek(0)
            # The original passed an undefined name here, so the bare except
            # always swallowed a NameError and this method returned None.
            return Image.open(self._ImageByteIO)
        except Exception:
            # Best-effort getter: any decoding problem yields None.
            return None

    def setObjByte(self, fileName):
        """Load the raw contents of *fileName*; store None if it cannot be read."""
        try:
            tBytesIO = io.BytesIO()
            # `with` closes the file even when read() fails.
            with open(fileName, 'rb') as f:
                tBytesIO.write(f.read())
            self._ImageByteIO = tBytesIO
        except Exception:
            self._ImageByteIO = None

    def setQImage(self, qImg):
        """Store *qImg* encoded as PNG; store None if the conversion raises."""
        try:
            qByteArray = QByteArray()
            qBuf = QBuffer(qByteArray)
            qBuf.open(QIODevice.ReadWrite)
            qImg.save(qBuf, 'PNG')
            tBytesIO = io.BytesIO()
            tBytesIO.write(qByteArray.data())
            self._ImageByteIO = tBytesIO
        except Exception:
            self._ImageByteIO = None

    def setPILImage(self, pImg):
        """Store the PIL image *pImg* encoded as PNG."""
        tBytesIO = io.BytesIO()
        pImg.save(tBytesIO, 'png')
        self._ImageByteIO = tBytesIO
#----------------------------------------------------------------------
class DragMoveTest(QMainWindow):
    """Main window with a tree view whose items carry a bytes-backed ObjModel."""

    def __init__(self):
        """Build the GUI and show the window."""
        super(DragMoveTest,self).__init__()
        self.initGUI()
        self.show()

    def initGUI(self):
        """Build the tree view, fill it with four items and enable internal moves."""
        self.treeView = QTreeView()
        modelTreeView = QStandardItemModel()
        self.treeView.setModel(modelTreeView)
        for i in range(0, 4):
            objMod = ObjModel()
            # The setter converts the QImage to PNG bytes held in a BytesIO.
            objMod.setQImage(QImage(flags = Qt.AutoColor))
            item = QStandardItem('Test: %s' % str(i))
            item.setData(objMod, Qt.UserRole + 1)
            modelTreeView.invisibleRootItem().appendRow(item)
        self.treeView.setDragDropMode(QAbstractItemView.InternalMove)
        self.setCentralWidget(self.treeView)
#----------------------------------------------------------------------
def main(args):
    """Create the application and the test window, then run the event loop."""
    app = QApplication(sys.argv)
    # Bound to a name so the window object stays referenced while the loop runs.
    qt_main_wnd = DragMoveTest()
    ret = app.exec_()
    sys.exit(ret)
#----------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv)

Resources