How to access the local path of a downloaded image in Scrapy - python-3.x

Here is how I am downloading images. Now I have created one more pipeline to insert my scraped data:
class CmindexPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_url']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        print("From Images Items", item)
        return item


class MysqlPipline(object):
    def process_item(self, item, spider):
        print("From Process Items", item['image_path'])
Here is my settings.py:
ITEM_PIPELINES = {
    'cmindex.pipelines.CmindexPipeline': 1,
    'cmindex.pipelines.MysqlPipline': 2,
}
IMAGES_STORE = 'E:\WorkPlace\python\cmindex\cmindex\img'
IMAGES_THUMBS = {
    '16X16': (16, 16),
}
But unfortunately I am still not able to access item['image_paths'] in process_item; it raises this error:
KeyError: 'image_paths'
If anyone knows what I am doing wrong, please suggest a fix.

The process_item method is called before item_completed, so it does not have the image_paths yet.
If you want to access the image_paths, you will have to do it inside item_completed, or write another pipeline that is placed after the image pipeline.
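As a rough sketch (not a verified fix; it just reuses the names from the question), a follow-up pipeline placed after the image pipeline could read the paths as shown below. Note, too, that the question's MysqlPipline reads item['image_path'] while item_completed stores item['image_paths']:

class MysqlPipline(object):
    def process_item(self, item, spider):
        # assumes CmindexPipeline ran first (order 1) and stored the
        # downloaded file paths under 'image_paths'
        image_paths = item.get('image_paths', [])
        print("From Process Items", image_paths)
        return item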

Related

PySide2 QListView.setRootIndex with customModel not working as expected

I am pretty new to Qt and I am using PySide2 (latest version) with Python 3.9.6.
I want to use a CustomModel via QAbstractItemModel on a QTreeView and at the same time with a QListView.
I have a CustomModel with two levels of hierarchical data.
I want to see the full data in the tree view (working).
At the beginning the QListView shows the same model; it shows only the top-level items.
So far so good.
Now I connected the setRootIndex function of the QListView to the clicked signal of the QTreeView.
I want to be able to click on a root-level item and see only its children in the QListView.
I thought .setRootIndex should do the trick, but it weirdly offsets the shown children.
It shows only ONE of the children, offset by the index count of the first-level item.
Please see the gif:
First both views show the same model.
Then I click the first root element in the left tree view.
It updates the right list view, but only the first child is shown.
Clicking the second item shows its child, but in the second row, with a gap above it in the list view.
Here is an (almost) working example.
I really hope someone can spot the mistake or my misconception of things.
The .setRootIndex on the QListView is confusing me.
I tried approaching it differently in the .index, .parent and .rowCount functions of the CustomModel, but like this it at least somehow works. I have the feeling I am doing something wrong somewhere, or that the QListView wants things differently than the QTreeView.
Is it even possible, and a good idea, to use the same model in two views?
I really thought so, and that is the whole point of a model/view(-controller) approach, isn't it?
# -*- coding: utf-8 -*-
from typing import *
from PySide2 import QtWidgets
from PySide2.QtCore import QAbstractItemModel, QModelIndex
from PySide2.QtGui import Qt
from PySide2.QtWidgets import QListView, QTreeView


class FirstLevelItem:
    def __init__(self, name) -> None:
        self.name = name
        self.children = []


class SecondLevelItem:
    def __init__(self, name, parent) -> None:
        self.name = name
        self.parent = parent


class CustomModel(QAbstractItemModel):
    def __init__(self, root_items, parent=None):
        super().__init__(parent)
        self.root_items = root_items

    def rowCount(self, itemIndex):
        """Has to return the number of children of the itemIndex.
        If it's not a valid index, it's a root item, and we return the count of all root_items.
        If it's a valid one and can have children, return the number of children.
        This makes the model ask for more indexes for each item.
        Only works if parent is set properly."""
        if itemIndex.isValid():
            item = itemIndex.internalPointer()
            if isinstance(item, FirstLevelItem):
                return len(item.children)
            else:
                return 0
        else:
            return len(self.root_items)

    def columnCount(self, parent=None):
        return 1

    def parent(self, child_index):
        """Has to return an index pointing to the parent of the current index."""
        if child_index.isValid():
            # get the item of this index
            item = child_index.internalPointer()
            # check if it's one with a parent
            if isinstance(item, SecondLevelItem):
                # get the parent obj from the item
                parent_item = item.parent
                # now we have to find the parent's row index to be able to create the index pointing to it
                parent_row = parent_item.children.index(item)
                # create an index with the parent row and column and the parent item itself
                return self.createIndex(parent_row, 0, parent_item)
            else:
                return QModelIndex()
        else:
            return QModelIndex()

    def data(self, index, role):
        if not index.isValid():
            return None
        item = index.internalPointer()
        if role == Qt.DisplayRole:
            return item.name
        return None

    def index(self, row, column, parentIndex):
        if parentIndex.isValid():
            parent_item = parentIndex.internalPointer()
            return self.createIndex(row, column, parent_item.children[row])
        else:
            return self.createIndex(row, column, self.root_items[row])


class ModelTestDialog(QtWidgets.QDialog):
    window_instance = None

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setWindowFlags(self.windowFlags() ^ Qt.WindowContextHelpButtonHint)
        # self.setMinimumSize(1024, 1024)
        self.setWindowTitle("ModelTestDialog")
        rootItems = []
        for i in range(0, 3):
            name = ["FirstLevel_A", "FirstLevel_B", "FirstLevel_C"][i]
            rootItem = FirstLevelItem(name)
            rootItems.append(rootItem)
            for j in range(0, 3):
                name = ["SecondLevel_A", "SecondLevel_B", "SecondLevel_C"][j]
                childItem = SecondLevelItem(name, rootItem)
                rootItem.children.append(childItem)
        self.model = CustomModel(rootItems)
        self.treeView = QTreeView()
        self.treeView.setModel(self.model)
        self.listView = QListView()
        self.listView.setModel(self.model)
        self.main_layout = QtWidgets.QVBoxLayout(self)
        self.listViews_layout = QtWidgets.QHBoxLayout()
        self.main_layout.addLayout(self.listViews_layout)
        self.listViews_layout.addWidget(self.treeView)
        self.listViews_layout.addWidget(self.listView)
        self.treeView.clicked[QModelIndex].connect(self.listView.setRootIndex)


if __name__ == "__main__":
    app = QtWidgets.QApplication()
    form = ModelTestDialog()
    form.show()
    app.exec_()
There is absolutely nothing wrong about using the same model in multiple views.
That is the whole concept behind the model/view paradigm (which relies on the principle of separation of concerns): the same model can be shared among multiple views, even if they show the content of that model in different ways.
That is completely respected by Qt (as long as the model is properly implemented, obviously); this also happens for similar concepts in Qt, like the QTextDocument interface used in QTextEdit (the same document can be shown on different QTextEdit instances), or the QGraphicsScene shown in a QGraphicsView (each view can show a different portion of the same scene).
The actual issue
You're using the wrong row for the parent:
parent_row = parent_item.children.index(item)
The above returns the index (row) of the child item, but the index built with createIndex() here must describe the parent: parent() has to return the row/column of the parent, not that of the child.
In this simple case, just return the index within the root_items:
parent_row = self.root_items.index(parent_item)
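Put together, a minimal sketch of the corrected parent() (the question's structure with only the row lookup changed) looks like this:

    def parent(self, child_index):
        """Return an index pointing to the parent of child_index."""
        if child_index.isValid():
            item = child_index.internalPointer()
            if isinstance(item, SecondLevelItem):
                parent_item = item.parent
                # use the parent's own position among the root items,
                # not the child's position among its siblings
                parent_row = self.root_items.index(parent_item)
                return self.createIndex(parent_row, 0, parent_item)
        return QModelIndex()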
A better approach
I would suggest a more flexible structure, where a single base class is used for all items, and it always has a parent attribute. To do this, you need to also create a "root item" which contains all top level items.
You can still create subclasses for items if you need more flexibility or specialization, but the default behavior remains unchanged, making the implementation simpler especially in the case you need further levels within the structure.
The major benefit of this approach is that you never need to care about the item type to know its level: you know that you need to access the root item when the given index is invalid, and for any other case (like index creation, parent access, etc.) the implementation is much easier and more readable. This automatically makes it easier to add support for other features, like moving items and drag & drop.
class TreeItem:
    parent = None

    def __init__(self, name='', parent=None):
        self.name = name
        self.children = []
        if parent:
            parent.appendChild(self)

    def appendChild(self, item):
        self.insertChild(len(self.children), item)

    def insertChild(self, index, item):
        self.children.insert(index, item)
        item.parent = self

    def row(self):
        if self.parent:
            return self.parent.children.index(self)
        return -1


class CustomModel(QAbstractItemModel):
    def __init__(self, root_items=None, parent=None):
        super().__init__(parent)
        self.root_item = TreeItem()
        if root_items:
            for item in root_items:
                self.root_item.appendChild(item)

    def rowCount(self, itemIndex):
        if itemIndex.isValid():
            return len(itemIndex.internalPointer().children)
        else:
            return len(self.root_item.children)

    def columnCount(self, parent=None):
        return 1

    def parent(self, child_index):
        if child_index.isValid():
            item = child_index.internalPointer()
            if item.parent:
                return self.createIndex(item.parent.row(), 0, item.parent)
        return QModelIndex()

    def data(self, index, role):
        if not index.isValid():
            return None
        item = index.internalPointer()
        if role == Qt.DisplayRole:
            return item.name

    def index(self, row, column, parentIndex=QModelIndex()):
        if parentIndex.isValid():
            parent_item = parentIndex.internalPointer()
            return self.createIndex(row, column, parent_item.children[row])
        else:
            return self.createIndex(row, column, self.root_item.children[row])


class ModelTestDialog(QDialog):
    def __init__(self, parent=None):
        super().__init__(parent)
        self.setWindowFlags(self.windowFlags() ^ Qt.WindowContextHelpButtonHint)
        self.setWindowTitle('ModelTestDialog')
        rootItems = []
        for i in range(0, 3):
            name = 'FirstLevel {}'.format('ABC'[i])
            rootItem = TreeItem(name)
            rootItems.append(rootItem)
            for j in range(0, 3):
                name = 'SecondLevel {} (child of {})'.format('ABC'[j], 'ABC'[i])
                TreeItem(name, rootItem)
                # or, alternatively:
                # rootItem.appendChild(TreeItem(name))
        self.model = CustomModel(rootItems)
        self.treeView = QTreeView()
        self.treeView.setModel(self.model)
        self.listView = QListView()
        self.listView.setModel(self.model)
        self.main_layout = QVBoxLayout(self)
        self.listViews_layout = QHBoxLayout()
        self.main_layout.addLayout(self.listViews_layout)
        self.listViews_layout.addWidget(self.treeView)
        self.listViews_layout.addWidget(self.listView)
        self.treeView.clicked.connect(self.listView.setRootIndex)
As you can see, the whole model code is much simpler and cleaner: there is no need to check for item level/type, as the concept of the structure makes that automatically immediate.
Further notes:
the Qt API suggests that the parent argument of index() should be optional; while it's common to use None for that, a default (and invalid) QModelIndex() is preferable, as I did above;
python implicitly returns None if no other return value is given;
in the last few years, Qt has been in the process of removing all overloaded signals, replacing them with more verbose and unique ones; in general, it's unnecessary to specify them, especially where no overload actually exists (self.treeView.clicked);

__getitem__ for custom dataloader is not working

I am new to deep learning and PyTorch. I have a data set of 6000 images, with all four classes in a single folder. I have a .csv file that has an image name and its corresponding one-hot encoded representation (ground-truth label). I want to bind the image name to its label so I can feed it into the neural network. I found that a custom data loader can be used for this. I tried the following code snippet in PyCharm:
# imports assumed from the code below (not shown in the question);
# `io` is presumably skimage.io, as in the standard PyTorch data-loading tutorial
import os

import pandas as pd
import torch
from skimage import io
from torch.utils.data import Dataset


class DFUDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.DFU = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.DFU)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir, self.DFU.iloc[idx, 0])  # image names
        image = io.imread(img_name)
        img_label = os.path.join(self.root_dir, self.DFU.iloc[0, idx[1:]])  # image labels
        sample = {'image': image, 'img_label': img_label}
        return sample


DFU_dataset = DFUDataset(
    csv_file='C:/Users/aleems2/Desktop/dfu/DFUC2021_trainset_210427/DFUC2021_train/Labelled_data_ground_truth.csv',
    root_dir="C:/Users/aleems2/Desktop/dfu/DFUC2021_trainset_210427/DFUC2021_train/Labelled_test_images")
However, when I try to debug the code, a blue circle appears right beside __getitem__, and the debugger never steps into this function. It just runs __init__ and then leaves the DFUDataset class. I do not know what to override __getitem__ with.
I have spent hours on this but could not figure it out.
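For context (this is general Python/PyTorch behaviour, not something stated in the question): __getitem__ is not executed when the dataset object is constructed; it only runs when the dataset is indexed, either directly or by a DataLoader iterating over it, so a breakpoint inside it is reached only once something indexes the dataset. As written, the label lookup self.DFU.iloc[0, idx[1:]] would then raise a TypeError (an int cannot be sliced), but that is a separate issue. A minimal, hypothetical trigger:

# hypothetical usage; DFU_dataset is the instance created in the question.
# Indexing the dataset (directly, or via a DataLoader loop) is what calls __getitem__:
sample = DFU_dataset[0]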

Scrapy pipeline mysql connection module error

I am unable to run Scrapy through my pipeline to my local database. I have already installed mysql-connector-python 8.0.19 and am able to write data to the database within the same project, but outside of a Scrapy pipeline. Can someone please help? I can't figure out why it isn't working.
When I try to send data via the Scrapy pipeline I get the following error:
[twisted] CRITICAL: Unhandled error in Deferred:
File "C:\Users\Viking\PycharmProjects\Indigo_Scrp\IndgoScrp\IndgoScrp\pipelines.py", line 7, in <module>
from mysql.connector import (connection)
ModuleNotFoundError: No module named 'mysql'
Here is my code for the pipeline:
from mysql.connector import (connection)
from mysql.connector import errorcode


class IndgoscrpPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = connection.MySQLConnection(
            host='127.0.0.1',
            user='root',
            passwd='',
            database='Python'
        )
        self.curr = self.conn.cursor()

    def open_spider(self, spider):
        print("spider open")

    def process_item(self, item, spider):
        print("Saving item into db ...")
        self.save(dict(item))
        return item

    def close_spider(self, spider):
        self.mysql_close()

    ##########################################################################
    def mysql_connect(self):
        try:
            return self.curr.connect(**self.conf)
        except self.curr.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("Database does not exist")
            else:
                print(err)

    #########################################
    def create_table(self):
        self.curr.execute(""" DROP TABLE IF EXISTS indigo""")
        self.curr.execute(""" Create table indigo(
            Product_Name text,
            Product_Author text,
            Product_Price text,
            Product_Image text
            )""")

    def process_item(self, item, spider):
        self.store_db(item)

    def store_db(self, item):
        self.curr.execute("""Insert Into indigo values (%s,%s,%s,%s)""",
                          (item['Product_Name'][0],
                           item['Product_Author'][0],
                           item['Product_Price'][0],
                           item['Product_Image'][0],
                           )
                          )
        self.conn.commit()
        return item
        self.conn.close()
Here is the code from my spider:
import scrapy
from ..items import IndScrItem


class IndgoSpider(scrapy.Spider):
    name = 'Indgo'
    start_urls = ['https://www.chapters.indigo.ca/en-ca/books/?link-usage=Header%3A%20books&mc=Book&lu=Main']

    def parse(self, response):
        items = IndScrItem()
        Product_Name = response.css('.product-list__product-title-link--grid::text').getall(),
        Product_Author = response.css('.product-list__contributor::text').getall(),
        Product_Price = response.css('.product-list__price--orange::text').getall(),
        Product_Image = response.css('.product-image--lazy::attr(src)').getall()
        items['Product_Name'] = Product_Name
        items['Product_Author'] = Product_Author
        items['Product_Price'] = Product_Price
        items['Product_Image'] = Product_Image
        yield items
This is the part of the settings file where I enable the pipeline:
ITEM_PIPELINES = {
    'IndgoScrp.pipelines.IndgoscrpPipeline': 100,
}
I actually found that the issue was tied to having previously pip-installed the wrong version of mysql-connector: even though I had installed the correct one through my IDE (PyCharm), Python was confused. After uninstalling both and reinstalling mysql-connector-python, it was able to run.
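As a quick sanity check (my suggestion, not part of the original answer), you can confirm which connector the interpreter actually picks up:

# run in the same interpreter/virtualenv that Scrapy uses
import mysql.connector
print(mysql.connector.__version__)  # mysql-connector-python 8.x should report its version here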

How to get the spider name in Scrapy pipeline outside of the process_item function?

I've written a few spiders that pull similar data from different sources. I've also written a pipeline that allows this data to be put in a database. I want to be able to use the same code for multiple spiders to output to different tables, named dynamically from the spider name.
Here is the pipeline.py code:
class DbPipeline(object):
    def __init__(self):
        """
        Initialises database connection and sessionmaker.
        Creates table if it doesn't exist.
        """
        engine = db_connect()
        create_output_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """
        Saves scraped products in database
        """
        exists = self.check_item_exists(item)
        if not exists:
            session = self.Session()
            product = Products(**item)
            try:
                session.add(product)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        return item

    def check_item_exists(self, item):
        session = self.Session()
        product = Products(**item)
        result = session.query(Products).filter(Products.title == item['title']).first()
        return result is not None
And here is the model.py file:
DeclarativeBase = declarative_base()


def create_output_table(engine):
    DeclarativeBase.metadata.create_all(engine)


def db_connect():
    """
    Connects to database from settings defined in settings.py
    Returns an sqlalchemy engine instance
    """
    return create_engine(URL(**settings.DATABASE))


class Products(DeclarativeBase):
    """Sqlalchemy table model"""
    __tablename__ = "name"
    id = Column(Integer, primary_key=True)
    title = Column('title', String(200))
    price = Column('price', String(10), nullable=True)
    url = Column('url', String(200), nullable=True)
What I'm trying to do is make the __tablename__ variable the same as the spider name. I could easily do that in the process_item function, since it is passed a spider object and I could use spider.name and assign it to a class variable; however, that function only runs after the table is created/defined. How can I go about getting the spider name outside of the process_item function in the pipelines.py file?
Edit: I've tried the solutions listed in "How to access scrapy settings from item Pipeline", however access to the settings doesn't give me access to the attributes of the spider that is currently running. I need to dynamically get the name of the spider based on which spider is running the pipelines. Thanks
It's pretty easy to get current spider name in your create_output_table:
class DbPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.spider.name)

    def __init__(self, spider_name):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_output_table(engine, spider_name)
        ......
and (in models.py):
def create_output_table(engine, spider_name):
    # now you have your spider_name
    DeclarativeBase.metadata.create_all(engine)
The problem here is that Scrapy processes your models.py file before your pipelines.py, so you need to find a way to generate your SQLAlchemy model later. You can use this thread as a starting point: Dynamically setting __tablename__ for sharding in SQLAlchemy?
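For illustration only (a sketch of the "generate the model later" idea; the factory name is made up, not taken from the linked thread), models.py could build the table class once the spider name is known:

def create_products_model(spider_name):
    # hypothetical factory: defer class creation until the spider name exists,
    # so __tablename__ can be set dynamically
    class Products(DeclarativeBase):
        __tablename__ = spider_name
        id = Column(Integer, primary_key=True)
        title = Column('title', String(200))
        price = Column('price', String(10), nullable=True)
        url = Column('url', String(200), nullable=True)
    return Products

The pipeline would then call this factory (and create_all) from __init__/from_crawler instead of relying on a module-level Products class.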

How to print vertices using the BFS algorithm in Python?

The problem is in this line: graph.vertList[currentVert].getConnections(). I am unable to get the list of all nodes currently connected to currentVert. When I print it, e.g. print(g.vertList[1].getConnections()), I get the connected objects:
dict_keys([<__main__.Vertex object at 0x7f7a7db9a3c8>, <__main__.Vertex object at 0x7f7a7db9a2e8>])
I am unable to find the node (vertex) id for these objects, so I cannot easily traverse from one node to another and print the result at the same time.
I am unable to find the bug. Here is the complete code:
class Queue:
    def __init__(self):
        self.queue = []

    def enqueue(self, item):
        self.queue.insert(0, item)

    def isEmpty(self):
        return self.queue == []

    def dequeue(self):
        return self.queue.pop()

    def size(self):
        return len(self.queue)
Then I create another class, Vertex:
class Vertex:
    def __init__(self, key):
        self.id = key
        self.connectedTo = {}

    def addNeighbor(self, nbr, weight=0):
        self.connectedTo[nbr] = weight

    def __str__(self):
        return str(self.id) + ' Connected To : ' + str([x.id for x in self.connectedTo])

    def getConnections(self):
        return self.connectedTo.keys()

    def getId(self):
        return self.id

    def getWeight(self, nbr):
        return self.connectedTo[nbr]
Another class, Graph:
class Graph:
    def __init__(self):
        self.vertList = {}
        self.numVertices = 0

    def addVertex(self, key):
        self.numVertices = self.numVertices + 1
        newVertex = Vertex(key)
        self.vertList[key] = newVertex
        return newVertex

    def addEdges(self, f, t, cost=0):
        if f in self.vertList:
            if t in self.vertList:
                self.vertList[f].addNeighbor(self.vertList[t], cost)
            else:
                return "Not present in Graph"
        else:
            return "Not present in Graph"

    def getVertex(self, n):
        if n in self.vertList:
            return self.vertList[n]
        else:
            return None

    def getVertices(self):
        return self.vertList.keys()
After that I created a bfs (Breadth-First Search) function:
def bfs(graph, start):
    # Keep track of all visited nodes
    visited = []
    # Keep track of nodes to be checked using a queue
    vertQueue = Queue()
    vertQueue.enqueue(start)
    # Keep looking until there are nodes still to be checked
    while vertQueue:
        # pop shallowest node (first node) from queue
        currentVert = vertQueue.dequeue()
        print(currentVert, end="")
        for nbr in (graph.vertList[currentVert].getConnections()):
            if nbr not in visited:
                # add node to list of checked nodes
                vertQueue.enqueue(nbr)
                visited.append(currentVert)
How can I fix this problem ?
The problem is here:
self.vertList[f].addNeighbor(self.vertList[t],cost)
Change this to
self.vertList[f].addNeighbor(t,cost)
and it should work.
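For completeness, a minimal sketch of how the traversal could then look; note that the isEmpty() termination check and the visited handling below are additions of mine (the original `while vertQueue:` never ends, since a Queue object is always truthy), not part of the answer above:

# sketch assuming the answer's change (addNeighbor(t, cost)) is applied,
# so getConnections() yields vertex keys rather than Vertex objects
def bfs(graph, start):
    visited = [start]
    vertQueue = Queue()
    vertQueue.enqueue(start)
    while not vertQueue.isEmpty():      # added termination check (assumption)
        currentVert = vertQueue.dequeue()
        print(currentVert, end=" ")
        for nbr in graph.vertList[currentVert].getConnections():
            if nbr not in visited:
                visited.append(nbr)     # mark when enqueued to avoid duplicates
                vertQueue.enqueue(nbr)

g = Graph()
for key in [1, 2, 3, 4]:
    g.addVertex(key)
g.addEdges(1, 2)
g.addEdges(1, 3)
g.addEdges(2, 4)
bfs(g, 1)   # expected output: 1 2 3 4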
