I'm using Scrapy to grab domains and their creation date using the Whois module. I am then adding them to a MySQL database using SqlAlchemy but I get the below error when adding the creation date to the database because the data type is <class 'datetime.datetime'>
sqlalchemy.orm.exc.UnmappedInstanceError: Class 'datetime.datetime' is not mapped
I tried to convert the date into a string but then I get another error.
pipelines.py:
# Scrapy item pipeline, as posted in the question: builds a Domains row from
# the scraped item, looks up the domain's WHOIS record and tries to persist both.
class SaveDomainsPipeline(object):
def __init__(self):
# Create the engine, make sure the tables exist, and keep a session factory.
engine = db_connect()
create_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
session = self.Session()
domain = Domains(**item)
domain_item = item['domain']
domain_whois = whois.query(domain_item)
creation_date = domain_whois.creation_date
try:
# NOTE(review): creation_date is passed to add_all() as if it were a mapped
# object; the question reports it is a datetime.datetime, which matches the
# UnmappedInstanceError quoted above. It is never assigned to `domain`.
session.add_all([domain, creation_date])
session.commit()
models.py
class Domains(Base):
__tablename__ = "domains"
id = Column(Integer, primary_key=True)
date_added = Column(DateTime(timezone=True), server_default=func.now())
domain = Column('domain', Text())
creation_date = Column('creation_date', DateTime(timezone=True))
#creation_date = Column('creation_date', Text()) -- I also tried this
I made a rookie mistake in my original code.
As I instantiated the class "Domains", I had to refer to that instance when populating the columns, which I had originally missed. The working code can be found below.
class SaveDomainsPipeline(object):
    """Scrapy item pipeline that stores each scraped domain with its WHOIS creation date."""

    def __init__(self):
        """Connect to the database, create the tables and keep a session factory."""
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Persist one scraped domain and hand the item on to the next pipeline.

        :param item: scraped item; its ``'domain'`` key holds the domain name
        :param spider: the spider that produced the item (unused)
        :returns: the unchanged ``item``
        :raises Exception: any error raised while committing, after rollback
        """
        session = self.Session()
        domains = Domains()  # instance of the mapped Domains class
        domains.domain = item['domain']  # the "domain" field from the item
        domain_whois = whois.query(domains.domain)
        # NOTE(review): the lookup presumably yields None when no WHOIS record
        # is found; getattr() keeps that case from raising AttributeError.
        domains.creation_date = getattr(domain_whois, 'creation_date', None)
        try:
            # Saving the instance saves both the domain and its creation date.
            session.add(domains)
            session.commit()
        except Exception:
            # Leave the session clean before propagating the failure.
            session.rollback()
            raise
        finally:
            session.close()
        return item
Related
I am working on a project that works with these libraries (among others)
postgresql 10.5
pandas 1.1.1
psycopg2 2.7.5
pytest 5.0.1
python 3.7.1
I am trying to run pytest on a library we have created that uses sqlalchemy to access a postgresql database.
I want to re-create a database to test the methods that we have been using for a few months.
I have tried different things but without success.
I know that postgresql cannot create an in-memory database, which is why I am trying to use sqlite3 for that purpose.
The method I am trying to test is:
DatabaseHelper.py
class DatabaseHelper(object):
    """
    Helps accessing database.
    """

    def __init__(self):
        pass

    # ...

    def create_engine(self, host_dns, dbname, port, user, password):
        """
        Build the SQLAlchemy engine for a PostgreSQL database and open a session.

        Stores the connection parameters, the connection string, the engine
        and the session on the instance.

        :param host_dns: dns route to the database
        :param dbname: name of the database to access
        :param port: port number of the database
        :param user: name of the user to access the database
        :param password: password to connect to the database
        """
        self.host = host_dns
        self.dbname = dbname
        self.port = port
        self.user = user
        self.password = password
        # A database URL separates the credentials from the host with '@':
        # dialect://user:password@host:port/dbname
        self.connection_str = f'postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.dbname}'
        # Inside this method the bare name resolves to the module-level
        # create_engine function, not to this method.
        self.engine = create_engine(self.connection_str)
        Session = sessionmaker(bind=self.engine)
        self.session = Session()

    # ...

    def read_db_to_df(self, **kwargs):
        """
        Reads a database and transforms into a pandas.DataFrame

        :param kwargs: arguments forwarded to ``pd.read_sql`` (e.g. ``sql``),
            merged with a default ``con`` taken from the current session
        :returns: the resulting DataFrame, or ``-20`` when SQLAlchemy raises
        """
        try:
            default_reading_sql_args = {'con': self.session.connection()}
            reading_sql_args = utils.merge_two_dicts(default_reading_sql_args, kwargs)
            df = pd.read_sql(**reading_sql_args)
            return df
        except SQLAlchemyError as e:
            # The error is printed and signalled to the caller with -20.
            print(e)
            return -20
MY_test_before_test.py
from sqlalchemy import MetaData, Column, Table, ForeignKey, select, PrimaryKeyConstraint, Index
from sqlalchemy import Integer, String
from sqlalchemy import create_engine
from sqlalchemy.schema import CreateTable, DropTable
from sqlalchemy.ext.declarative.api import DeclarativeMeta
from agora_db.agora_helper import AgoraHelper
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.functions import current_timestamp, current_user
engine = create_engine('sqlite:///:memory:')
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()
#This is how the database models look like:
class Schema(object):
""" A simple schema to provide a common argument for all tables """
__table_args__ = {"schema": "ifsmtd"}
class AbstractTable(object):
    """
    Mixin that defines how mapped objects are rendered as strings.
    """

    def __repr__(self):
        """Return ``<ClassName(column='value', ...)>`` for this mapped object.

        Every column attribute of the object's mapper is listed with its
        current value.
        """
        from sqlalchemy.inspection import inspect

        state = inspect(self)
        class_name = type(self).__name__
        rendered = []
        for attr in state.mapper.column_attrs:
            rendered.append("%s='%s'" % (attr.key, getattr(self, attr.key)))
        return "<%s(%s)>" % (class_name, ", ".join(rendered))
class MyBasicTable(Base, Schema, AbstractTable):
__tablename__ = "mybasic_table"
timest_mov = Column(TIMESTAMP, primary_key=True, nullable=False, server_default=current_timestamp())
id_info = Column(String, primary_key=True, nullable=False)
id_wf = Column(String, primary_key=True, nullable=False)
process_name = Column(String, primary_key=True, nullable=False)
error_type = Column(String, primary_key=True, nullable=False)
resolution_status = Column(Boolean)
aud_timest_umo = Column(TIMESTAMP, server_default=current_timestamp())
aud_id_user = Column(String, server_default=current_user())
__table_args__ = (
PrimaryKeyConstraint('timest_mov', 'id_info', 'id_wf', 'process_name', 'error_type', name='pk_idx_mybasic_table'),
Index('pk_idx_mybasic_table', 'timest_mov', 'id_info', 'id_wf', 'process_name', 'error_type', unique=True),
{"schema": "ifsmtd"}
)
# Point the helper at the in-memory SQLite engine/session instead of calling
# DatabaseHelper.create_engine().
# NOTE(review): nothing before this point creates the tables on `engine`
# (no Base.metadata.create_all(engine) call), so the query below runs
# against an empty database.
dbhelper = DatabaseHelper()
dbhelper.engine = engine
dbhelper.session = session
# NOTE(review): the label "id_info " carries a trailing space, and
# MyBasicTable as defined above has no id_jira column -- confirm both.
query = session.query(MyBasicTable.timest_mov.label("timest_mov"),
MyBasicTable.id_info .label("id_info "),
MyBasicTable.id_wf.label("id_wf"),
MyBasicTable.process_name.label("process_name"),
MyBasicTable.error_type.label("error_type"),
MyBasicTable.resolution_status.label("resolution_status")
)\
.distinct(MyBasicTable.id_jira.label("id_jira"))
# Run the compiled statement through the helper and show the DataFrame.
df = dbhelper.read_db_to_df(sql=query.statement)
print(df)
The Error I get is:
(sqlite3.OperationalError) no such table: ifsmtd.mybasic_table
How can I run this test?
Looking at the code, a call to Base.metadata.create_all(engine) seems to be missing. This call creates the initial database schema. After it, the tables exist but are empty, so you need to populate them.
Add the above statement just before using the database, but after defining the tables.
Base.metadata.create_all(engine) # This will create the schema!
dbhelper = DatabaseHelper()
On the use of sqlite: I have also gone that route and bumped into the fact that by default sqlite does not check foreign key constraints (it can be enabled!). There may be more differences!
I am new to APIs and I am creating a Flask-RESTful API. I was wondering: do I need to create new model and resource classes for every row manipulation I want to do in my DB? For example, I have created a student in my DB. On creation he does not have any grades, so I created StudentModel and StudentResource and used the table Student. When I need to update grades using a PUT request, do I need to create a StudentGradeModel and a StudentGradeResource that also access the Student table?
Every Model class includes helper functions that the Resource class uses by importing the Model class. The Resource classes only have GET, POST, PUT and DELETE methods.
class StudentModel(db.Model):
__tablename__ = 'Student'
__table_args__ = {'extend_existing': True}
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(30))
class_sec = db.Column(db.String(4))
def __init__(self, id, name, class_sec):
self.id = id
self.name= name
self.class_sec = class_sec
from flask_restful import Resource, reqparse
from models.student_model import StudenteModel
# noinspection PyMethodMayBeStatic
class StudentResource(Resource):
parser = reqparse.RequestParser()
parser.add_argument('id', type=int, required=True, help='Every Student must have an ID')
parser.add_argument('name', type=str, required=True, help='Every Student must have a name')
parser.add_argument('class', type=str, required=True, help='Every Student must be assigned a class and section')
def get(self, id):
pass
def post(self, id):
pass
class StudentGradeModel(db.Model):
__tablename__ = 'Student'
__table_args__ = {'extend_existing': True}
id = db.Column(db.Integer, primary_key=True)
grade = db.Column(db.String(2), primary_key=True)
def __init__(self, id, grade):
self.id = id
self.grade = grade
# noinspection PyMethodMayBeStatic
class StudentGradeResource(Resource):
parser = reqparse.RequestParser()
parser.add_argument('id', type=int, required=True, help='Student must have an ID to access table')
parser.add_argument('grade', type=str, required=True, help='Student must have a grade to be assigned')
def get(self, id):
pass
def post(self, id):
pass
Similarly, if I wanted to update only the section, would I have to create a similar class with a PUT request?
Thank You
From the question, I'm assuming that one student can only have one grade or no grade at all, because if they can have more than one, a grade must be in a separate table.
The table creation SQL looks like this:
CREATE TABLE student (
id INT PRIMARY KEY,
name VARCHAR(30) NOT NULL,
class_sec CHAR(4) NOT NULL,
grade INTEGER
);
(I changed the datatype for grade since numeric data shouldn't be stored as string)
No, you can't, and should not have two models for the same table. The model should represent the table itself.
class StudentModel(db.Model):
    """Single model for the ``Student`` table, including the optional grade."""

    __tablename__ = 'Student'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(30), nullable=False)
    class_sec = db.Column(db.String(4), nullable=False)
    # Nullable: a student has no grade when first created.
    grade = db.Column(db.Integer)

    def __init__(self, id, name, class_sec, grade=None):
        """Create a student row.

        :param id: primary key of the student
        :param name: student name
        :param class_sec: class and section
        :param grade: optional grade; defaults to ``None`` so existing
            three-argument callers keep working
        """
        self.id = id
        self.name = name
        self.class_sec = class_sec
        self.grade = grade
On the other hand, you can have more than one resource to interface a table. However, each resource is associated with one route, and you shouldn't have a separate resource for grade, unless you need another route for that, which I think you don't.
class Student(Resource):
...
def put(self, id):
request_body = request.get_json()
# do your things here
I've written a few spiders that pull similar data from different sources. I've also written a pipeline that allows this data to be put in a database. I want to be able to use the same code for multiple spiders to output to different tables, named dynamically from the spider name.
Here is the pipeline.py code:
class DbPipeline(object):
    """Scrapy item pipeline that stores scraped products in the database."""

    def __init__(self):
        """
        Initialises database connection and sessionmaker.
        Creates table if it doesn't exist.
        """
        engine = db_connect()
        create_output_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """
        Saves scraped products in database

        Items whose title is already stored are passed through untouched.

        :param item: scraped item; its keys are used as ``Products`` fields
        :param spider: the spider that produced the item (unused)
        :returns: the unchanged ``item``
        :raises Exception: any error raised while saving, after rollback
        """
        if not self.check_item_exists(item):
            session = self.Session()
            try:
                # Built inside the try so a constructor failure still
                # reaches the cleanup below.
                session.add(Products(**item))
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
        return item

    def check_item_exists(self, item):
        """Return True when a product with the same title is already stored.

        :param item: scraped item; only ``item['title']`` is read
        """
        session = self.Session()
        try:
            match = session.query(Products).filter(Products.title == item['title']).first()
            return match is not None
        finally:
            # The lookup session was previously never closed.
            session.close()
And here is the model.py file:
DeclarativeBase = declarative_base()
def create_output_table(engine):
DeclarativeBase.metadata.create_all(engine)
def db_connect():
"""
Connects to database from settings defined in settings.py
Returns an sqlalchemy engine instance
"""
return create_engine(URL(**settings.DATABASE))
class Products(DeclarativeBase):
"""Sqlalchemy table model"""
__tablename__ = "name"
id = Column(Integer, primary_key=True)
title = Column('title', String(200))
price = Column('price', String(10), nullable=True)
url = Column('url', String(200), nullable=True)
What I'm trying to do is get the __tablename__ variable to be the same as the spider name. I can easily do that in the process_item function, as it is passed a spider object and I can use spider.name and assign it to a class variable; however, that function runs after the table is created/defined. How can I go about getting the spider name outside of the process_item function in the pipelines.py file?
Edit: I've tried the solutions listed in How to access scrapy settings from item Pipeline however access to the 'settings' doesn't give me access to the attributes assigned to the current spider running. I need to dynamically get the name of the spider based on what spider is running the pipelines. Thanks
It's pretty easy to get current spider name in your create_output_table:
class DbPipeline(object):
    """Pipeline variant that learns the running spider's name at construction."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler, passing along the spider name."""
        return cls(crawler.spider.name)

    def __init__(self, spider_name):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.

        :param spider_name: name of the spider this pipeline runs for
        """
        engine = db_connect()
        create_output_table(engine, spider_name)
        # ... (the rest of the initialisation is elided in this excerpt)
and (in models.py):
def create_output_table(engine, spider_name):
# now you have your spider_name
DeclarativeBase.metadata.create_all(engine)
The problem here is that Scrapy processes your models.py file before your pipelines.py. So you need to find a way to generate your SQLAlchemy model later. You can use this thread as a starting point: Dynamically setting __tablename__ for sharding in SQLAlchemy?
I'm following the Flask-SQLAlchemy tutorial. I have Flask 0.9, sqlalchemy 0.7.8 and flask-sqlalchemy 0.16 on python 2.6. (and I work with eclipse)
(The tutorial is here: http://packages.python.org/Flask-SQLAlchemy/models.html)
I have 2 classes : a man and a wallet. There is a 1-1 relationship. (Each man has his own wallet)
class Man(db.Model):
sid = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(100), unique=False)
wallet = db.relationship('Wallet', backref='man', lazy='dynamic', uselist=False)
def __init__(self, wallet):
self.wallet = wallet
class Wallet(db.Model):
sid = db.Column(db.Integer, primary_key=True)
account = db.Column(db.Integer)
manId = db.Column(db.Integer, db.ForeignKey('man.sid'))
def __init__(self, account):
self.account = account
In my "main" module, I create my database :
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:PATH'
db = SQLAlchemy(app)
In this very same module, I try to attach a Wallet to a Man :
if __name__ == "__main__":
db.create_all()
w1 = Wallet(132)
w2 = Wallet(18)
db.session.add(w1)
db.session.add(w2)
db.session.commit()
man1 = Man(w1)
db.session.add(man1)
db.session.commit()
But I get this error :
TypeError: 'Wallet' object is not iterable
I fail to understand why such an error appears. What is the right way of adding a mapped object ?
PS : I've been on the SQLAlchemy tutorial and I believe that they would declare things differently :
class Man(db.Model):
sid = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(100), unique=False)
wallet = db.relationship('Wallet', backref='man', lazy='dynamic', uselist=False)
manId = db.Column(db.Integer, db.ForeignKey('man.sid'))
def __init__(self, wallet):
self.wallet = wallet
class Wallet(db.Model):
sid = db.Column(db.Integer, primary_key=True)
account = db.Column(db.Integer)
def __init__(self, account):
self.account = account
Which tutorial should I trust ?
Thank you very much !
I fail to understand why such an error appears. What is the right way of adding a mapped object ?
Notice that when you configure your wallet relationship you use the lazy="dynamic" option. This way you are setting up a dynamic relationship. As it is designed to be used with large collections, it doesn't really make much sense to use it with a one-to-one relationship.
At the same time it alters the way you can assign to your scalar relationship, i.e. you cannot assign your single object directly:
self.wallet = wallet
but you must use an iterable
self.wallet = [wallet]
So you have two solutions here: either assign collection of one element as shown above or better yet stop using dynamic collections for this relationship.
Starting SQLalchemy user here. I plan to use UUID's as the primary keys for my tables.
In the tutorial I saw some code for using the native Python UUID type in ORM classes. Eureka! I can use Postgresql's native UUID type for my system database and this TypeDecorator will stringify the UUID's for SQLite on my mobile clients.
http://docs.sqlalchemy.org/en/latest/core/types.html#backend-agnostic-guid-type
Sadness. When using this with an existing SQLite database that has stringified UUID's as the primary key I get stale data errors when I try to commit any changes.
This class crashes with stale data on commit.
class CommodityTypes(Base):
__tablename__ = 'CommodityTypes'
uuidKey = Column(GUID, primary_key=True)
myName = Column(String, unique = True)
sortKey = Column(Integer, unique = True)
, but this class works:
class NewTypes(Base):
__tablename__ = 'CommodityTypes'
uuidKey = Column(String, primary_key=True)
myName = Column(String, unique = True)
sortKey = Column(Integer, unique = True)
Queried objects from the CommodityTypes class show the python UUID type for uuidKey. The script queries the object correctly. I can change settings, but I can't commit. The decorated uuidKey doesn't seem to work.
I can go forward just using Strings for the uuidKey columns, but it frustrates me that the code from http://docs.sqlalchemy.org/en/latest/core/types.html#backend-agnostic-guid-type almost works.
Here's sample code with the problem. The string workaround not using the GUID type decorator is commented out.
#system modules
import uuid
#other modules
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.types import TypeDecorator, CHAR
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
engine = create_engine('sqlite:////home/XXXX/XobfuscatedXXXX/XXXXXXXX.sqlite')
Base = declarative_base()
Session = sessionmaker(bind=engine)
class GUID(TypeDecorator):
    """Platform-independent GUID type.

    Uses Postgresql's UUID type, otherwise uses
    CHAR(32), storing as stringified hex values.

    NOTE(review): on non-PostgreSQL backends values are bound as 32
    hexadecimal characters without hyphens. Rows that already hold a
    different textual form (e.g. the 36-character hyphenated one) will
    presumably not match in WHERE clauses -- confirm against the stored data.
    """

    impl = CHAR
    # The type has no per-instance state, so its compiled form is cacheable;
    # SQLAlchemy versions that do not know this flag ignore it.
    cache_ok = True

    def load_dialect_impl(self, dialect):
        """Pick the native UUID type on PostgreSQL and CHAR(32) elsewhere."""
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        """Convert a Python value into what is sent to the database.

        :param value: ``uuid.UUID``, a string accepted by ``uuid.UUID()``, or None
        :returns: None, ``str(value)`` on PostgreSQL, else 32 hex characters
        """
        if value is None:
            return value
        if dialect.name == 'postgresql':
            return str(value)
        if not isinstance(value, uuid.UUID):
            value = uuid.UUID(value)
        # UUID.hex is the zero-padded 32-digit hex string. Formatting the
        # UUID object itself with %x fails because it is not an integer.
        return value.hex

    def process_result_value(self, value, dialect):
        """Convert a database value back into ``uuid.UUID`` (None passes through)."""
        if value is None:
            return value
        # A driver may already hand back UUID objects; only parse strings.
        if not isinstance(value, uuid.UUID):
            value = uuid.UUID(value)
        return value
from sqlalchemy import Column, Boolean, DateTime, Date, Float, ForeignKey, Integer, Numeric, String
class CommodityTypes(Base):
__tablename__ = 'CommodityTypes'
uuidKey = Column(GUID, primary_key=True)
myName = Column(String, unique = True)
sortKey = Column(Integer, unique = True)
#class NewTypes(Base):
# __tablename__ = 'CommodityTypes'
# uuidKey = Column(String, primary_key=True)
# myName = Column(String, unique = True)
# sortKey = Column(Integer, unique = True)
if __name__=="__main__":
session = Session()
# newList = session.query(NewTypes).order_by(NewTypes.sortKey)
# for instance in newList:
# print(instance.myName)
#
# nt = newList[1]
# print(nt.myName)
# print(nt.sortKey)
# nt.sortKey = 11
# print(nt.sortKey)
# session.commit()
# print(nt.sortKey)
ctList = session.query(CommodityTypes).order_by(CommodityTypes.sortKey)
for instance in ctList:
print(instance.myName)
ct = ctList[1]
print(ct.myName)
print(ct.sortKey)
ct.sortKey = 22
print(ct.sortKey)
session.commit()
print(ct.sortKey)
Oh, forgot to mention software versions:
Python 3.1.3 (r313:86834, Dec 1 2010, 06:15:12)
[GCC 4.1.2 20080704 (Red Hat 4.1.2-48)] on linux2