Replace a large amount of data at once with SQLAlchemy? - python-3.x

I need to update a large amount of data daily (large meaning >3MB). I thought about storing it as JSON, but as far as I could find, SQLAlchemy doesn't support loading from JSON directly. So now I'm trying to do it with pickle. At the moment I'm storing every product I have in a huge pickle file, to load back in later and commit. However, I keep getting errors saying my product class is not mapped, and I'm not sure what that means or how to fix it. Everything I came across while Googling didn't resemble my code in the slightest.
Here is my product class:
class Product:
    id = ""
    name = ""
    store_name = ""
    brand = ""
    price = ""
    amount = ""
    info = ""
    image = ""
And here is my Pickle / Database code:
def loadall():
    with open('products.txt', mode='rb') as products_txt:
        while True:
            try:
                yield pickle.load(products_txt)
            except EOFError:
                break

Session = sessionmaker(bind=db)
session = Session()

products = loadall()
with db.connect() as conn:
    session.add_all(products)
    session.commit()
(made after reading Saving and loading multiple objects in pickle file?)

The "not mapped" error means SQLAlchemy has no mapping registered for your plain Product class; it has to be declared as a mapped (declarative) model before Session.add_all() will accept its instances. The example below should give you an idea (I limited the test data to just 2 columns):
test.py :
#!/usr/bin/env python3
import json

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, as_declarative, registry

## configuration
engine = create_engine("sqlite+pysqlite:///:memory:", echo=True, future=True)
mapper_registry = registry()

@as_declarative()
class Base(object):
    pass

class Product(Base):
    __tablename__ = "product"
    id = Column("id", Integer, primary_key=True)
    name = Column(String)
    info = Column(String)

def _main():
    with Session(engine) as session:
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)

        ## test data
        objects = [
            Product(name="no-info"),
            Product(name="red-color", info="large RED"),
        ]
        session.add_all(objects)
        session.commit()
        session.expunge_all()

        ## test data: from JSON
        # product_list = load_all_test()  # load from test code
        product_list = load_all_file()    # load from the file
        print(product_list)

        # create Product() instances from JSON
        products = [Product(**kw) for kw in product_list]
        session.add_all(products)
        session.commit()

def load_all_test():
    test_json_content = """
    [
        { "name": "json-product-1", "info": "some info from json-1" },
        { "name": "json-product-2", "info": "some info from json-2" }
    ]
    """
    product_list = json.loads(test_json_content)
    return product_list

def load_all_file():
    with open("myjsonfile.json") as fh:
        product_list = json.load(fh)
    return product_list

_main()
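For completeness, load_all_file() expects myjsonfile.json to contain a JSON array of objects whose keys match the mapped columns. A small sketch that writes such a file (the file name comes from the code above; the values are placeholders mirroring load_all_test()):

import json

# placeholder rows; keys must match the Product columns (name, info)
sample = [
    {"name": "file-product-1", "info": "some info from file-1"},
    {"name": "file-product-2", "info": "some info from file-2"},
]
with open("myjsonfile.json", "w") as fh:
    json.dump(sample, fh, indent=2)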

Related

How can I speed up this SQLAlchemy Query? In FastAPI

I have a Volunteers table with 60 fields. I use FastAPI and SQLAlchemy ORM:
class Volunteers(Base):
    __tablename__ = "volunteers"
    id = Column(Integer, primary_key=True, index=True, nullable=False)
    candidate_id = Column(Integer)
    full_name = Column(String)
    ......
I also have a function that imports an Excel file and writes its data to the database. I read all users from the Excel file into the all_users_in_excel list, which is a list of dictionaries, each one holding a single user's information. Then I take one user at a time from all_users_in_excel, build a new_user dict from its values, and append it to a new list.
def import_data(file: UploadFile = File(...), db: Session = Depends(get_db)):
    all_users_in_excel = []
    with open(f'{file.filename}', "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)
    volunteer_data = pd.read_excel("assignment-data.xlsx", index_col=None)
    for name in volunteer_data.iterrows():
        new_user_from_excel = name[1].to_dict()
        all_users_in_excel.append(new_user_from_excel)
    saved_users = []
    for key in all_users_in_excel:
        new_user = {
            "candidate_id": key["Candidate - ID"],
            "full_name": key["Candidate - Full Name"],
            "checkpoint": key["Candidate - Checkpoint"],
            .....
            "created_at": datetime.now()
        }
        saved_users.append(new_user)
    db.bulk_insert_mappings(models.Volunteers, saved_users)
    db.commit()
db.bulk_insert_mappings(models.Volunteers, saved_users) takes 20 seconds to save 110 users. How can I speed it up? I use PostgreSQL as the database, and it is deployed on a Heroku free plan.
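One thing worth trying, sketched below under the assumption that saved_users stays a list of plain dicts whose keys match the column names (the helper name bulk_save_users is made up): send all rows through a single Core-level INSERT, which the driver executes as one executemany batch, instead of the ORM bulk path. On a remote Heroku Postgres, per-statement round trips and the work done before the insert often matter more than the insert itself, so timing the Excel-parsing loop separately from the database call is a good first step.

from sqlalchemy import insert

def bulk_save_users(db, saved_users):
    # One multi-row INSERT against the mapped table; the whole
    # parameter list goes to the driver as a single executemany call.
    db.execute(insert(models.Volunteers.__table__), saved_users)
    db.commit()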

Re-create postgresql database to do pytest

I am working on a project that uses these libraries (among others):
postgresql 10.5
pandas 1.1.1
psycopg2 2.7.5
pytest 5.0.1
python 3.7.1
I am trying to run pytest on a library we have created that uses SQLAlchemy to access a PostgreSQL database.
I want to re-create a database to test the methods we have been using for a few months.
I have tried different things, but without success.
I know that PostgreSQL cannot create an in-memory database, which is why I am trying to use sqlite3 for that purpose.
The method I am trying to test is:
DatabaseHelper.py
class DatabaseHelper(object):
    """
    Helps accessing the database.
    """
    def __init__(self):
        pass

    # ...
    def create_engine(self, host_dns, dbname, port, user, password):
        """
        :param host_dns: dns route to the database
        :param dbname: name of the database to access
        :param port: port number of the database
        :param user: name of the user used to access the database
        :param password: password to connect to the database
        """
        self.host = host_dns
        self.dbname = dbname
        self.port = port
        self.user = user
        self.password = password
        self.connection_str = f'postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.dbname}'
        self.engine = create_engine(self.connection_str)
        # session_factory = sessionmaker(bind=self.engine)
        Session = sessionmaker(bind=self.engine)
        self.session = Session()
        # print("Agora object created ok")

    # ...
    def read_db_to_df(self, **kwargs):
        """ Reads a database table and transforms it into a pandas.DataFrame """
        try:
            default_reading_sql_args = {'con': self.session.connection()}
            reading_sql_args = utils.merge_two_dicts(default_reading_sql_args, kwargs)
            df = pd.read_sql(**reading_sql_args)
            return df
        except SQLAlchemyError as e:
            # self.logger.error("Error reading db to df")
            # self.logger.error(str(e).replace("\n", ""))
            print(e)
            return -20
MY_test_before_test.py
from sqlalchemy import MetaData, Column, Table, ForeignKey, select, PrimaryKeyConstraint, Index
from sqlalchemy import Integer, String, Boolean, TIMESTAMP
from sqlalchemy import create_engine
from sqlalchemy.schema import CreateTable, DropTable
from sqlalchemy.ext.declarative.api import DeclarativeMeta
from agora_db.agora_helper import AgoraHelper
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.functions import current_timestamp, current_user

engine = create_engine('sqlite:///:memory:')
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()

# This is how the database models look:
class Schema(object):
    """ A simple schema to provide a common argument for all tables """
    __table_args__ = {"schema": "ifsmtd"}

class AbstractTable(object):
    """
    A class that defines how its objects are represented as strings.
    """
    def __repr__(self):
        """ Returns the object representation in string format in a way that can be used to reconstruct the object.
        This returns an 'official' string representation of the object.
        :Example:
        """
        from sqlalchemy.inspection import inspect
        table_inst = inspect(self)
        table_name = type(self).__name__
        column_key_value = [(c_attr.key, getattr(self, c_attr.key)) for c_attr in table_inst.mapper.column_attrs]
        fields_str = ", ".join(["%s='%s'" % (c[0], c[1]) for c in column_key_value])
        return "<" + table_name + "(" + fields_str + ")>"

class MyBasicTable(Base, Schema, AbstractTable):
    __tablename__ = "mybasic_table"
    timest_mov = Column(TIMESTAMP, primary_key=True, nullable=False, server_default=current_timestamp())
    id_info = Column(String, primary_key=True, nullable=False)
    id_wf = Column(String, primary_key=True, nullable=False)
    process_name = Column(String, primary_key=True, nullable=False)
    error_type = Column(String, primary_key=True, nullable=False)
    resolution_status = Column(Boolean)
    aud_timest_umo = Column(TIMESTAMP, server_default=current_timestamp())
    aud_id_user = Column(String, server_default=current_user())
    __table_args__ = (
        PrimaryKeyConstraint('timest_mov', 'id_info', 'id_wf', 'process_name', 'error_type', name='pk_idx_mybasic_table'),
        Index('pk_idx_mybasic_table', 'timest_mov', 'id_info', 'id_wf', 'process_name', 'error_type', unique=True),
        {"schema": "ifsmtd"}
    )

dbhelper = DatabaseHelper()
dbhelper.engine = engine
dbhelper.session = session

query = session.query(
        MyBasicTable.timest_mov.label("timest_mov"),
        MyBasicTable.id_info.label("id_info"),
        MyBasicTable.id_wf.label("id_wf"),
        MyBasicTable.process_name.label("process_name"),
        MyBasicTable.error_type.label("error_type"),
        MyBasicTable.resolution_status.label("resolution_status")
    ).distinct(MyBasicTable.id_jira.label("id_jira"))

df = dbhelper.read_db_to_df(sql=query.statement)
print(df)
The Error I get is:
(sqlite3.OperationalError) no such table: ifsmtd.mybasic_table
How could I get this test to work?
Looking at the code, a call to Base.metadata.create_all(engine) seems to be missing. That call creates the initial database schema; after it runs, the tables exist but are empty, so you still need to populate them.
Add the above statement just before using the database, but after defining the tables.
Base.metadata.create_all(engine) # This will create the schema!
dbhelper = DatabaseHelper()
On the use of SQLite: I have also gone down that route and bumped into the fact that, by default, SQLite does not check foreign key constraints (it can be enabled, as sketched below). There may be more differences!
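Enabling it is a standard SQLAlchemy recipe: attach a connect event that issues the pragma on every new SQLite connection. A small sketch, assuming the in-memory engine from the test file above:

from sqlalchemy import event

@event.listens_for(engine, "connect")
def _set_sqlite_pragma(dbapi_connection, connection_record):
    # SQLite leaves foreign key enforcement off by default;
    # switch it on for every connection this engine hands out.
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA foreign_keys=ON")
    cursor.close()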

SqlAlchemy 'datetime.datetime' is not mapped

I'm using Scrapy to grab domains and their creation dates using the whois module. I then add them to a MySQL database using SQLAlchemy, but I get the error below when adding the creation date, because its data type is <class 'datetime.datetime'>:
sqlalchemy.orm.exc.UnmappedInstanceError: Class 'datetime.datetime' is not mapped
I tried to convert the date into a string but then I get another error.
pipelines.py:
class SaveDomainsPipeline(object):
    def __init__(self):
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        domain = Domains(**item)
        domain_item = item['domain']
        domain_whois = whois.query(domain_item)
        creation_date = domain_whois.creation_date
        try:
            session.add_all([domain, creation_date])
            session.commit()
models.py
class Domains(Base):
    __tablename__ = "domains"
    id = Column(Integer, primary_key=True)
    date_added = Column(DateTime(timezone=True), server_default=func.now())
    domain = Column('domain', Text())
    creation_date = Column('creation_date', DateTime(timezone=True))
    # creation_date = Column('creation_date', Text())  -- I also tried this
I made a rookie mistake in my original code. The error was raised because session.add_all([domain, creation_date]) hands the Session a plain datetime.datetime value, and only instances of mapped classes can be added. Since I had already created an instance of the Domains class, I needed to set the columns on that instance rather than adding the date separately, which I had originally missed. The working code can be found below.
class SaveDomainsPipeline(object):
    def __init__(self):
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        domains = Domains()  # initiate an instance of the Domains class.
        domains.domain = item['domain']  # Add the item "domain" from Items to the DB
        domain_whois = whois.query(domains.domain)
        domains.creation_date = domain_whois.creation_date  # Add the creation date to the DB
        try:
            # save the instance, which saves both the domain item and the creation date.
            session.add(domains)
            session.commit()

Is there any way I can speed up my python program?

I am working on a PubMed project where I need to extract the IDs of free full text and free PMC articles. This is my code:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez

Entrez.email = "abc@gmail.com"  # Always tell NCBI who you are
handle = Entrez.esearch(db="pubmed", term="cough")
record = Entrez.read(handle)
count = record['Count']
handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
record = Entrez.read(handle)

free_article_ids = []
for id_ in record['IdList']:
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is None:
        continue
    elif status.text in ["Free full text", "Free PMC Article"]:
        free_article_ids.append(id_)
print(free_article_ids)
The problem with my code is that it takes far too long to produce the result, and I want to speed the process up. How do I do that?
Use multithreading to download concurrently. I recommend a simple framework:
from Bio import Entrez
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class MySpider(Spider):
    name = 'ncbi.nlm.nih.gov'
    start_urls = []

    def __init__(self):
        Entrez.email = "abc@gmail.com"  # Always tell NCBI who you are
        handle = Entrez.esearch(db="pubmed", term="cough")
        record = Entrez.read(handle)
        count = record['Count']
        handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
        record = Entrez.read(handle)
        for id_ in record['IdList']:
            self.start_urls.append(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
        Spider.__init__(self, self.name)  # necessary

    free_article_ids = []

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        status = doc.select('span.status_icon')
        if status and status.text in ["Free full text", "Free PMC Article"]:
            id = url.split('/')[-1]
            self.free_article_ids.append(id)
            return {"Urls": [], "Data": {"id": id}}
        return True

SimplifiedMain.startThread(MySpider())
Here are more examples. https://github.com/yiyedata/simplified-scrapy-demo
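If you would rather stay with requests and BeautifulSoup, the same idea, downloading the pages concurrently, can be sketched with the standard library's thread pool. This assumes record['IdList'] from the Entrez code in the question:

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def check_id(id_):
    # Fetch one article page and return the id if it is marked as free.
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is not None and status.text in ["Free full text", "Free PMC Article"]:
        return id_
    return None

with ThreadPoolExecutor(max_workers=20) as pool:
    results = pool.map(check_id, record['IdList'])

free_article_ids = [id_ for id_ in results if id_ is not None]
print(free_article_ids)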

Trying to use a UUID for a primary key. Getting stale data on SQLalchemy commit()

I'm a beginning SQLAlchemy user. I plan to use UUIDs as the primary keys for my tables.
In the tutorial I saw some code for using the native Python UUID type in ORM classes. Eureka! I can use PostgreSQL's native UUID type for my system database, and this TypeDecorator will stringify the UUIDs for SQLite on my mobile clients.
http://docs.sqlalchemy.org/en/latest/core/types.html#backend-agnostic-guid-type
Sadness. When using this with an existing SQLite database that has stringified UUIDs as the primary key, I get stale data errors when I try to commit any changes.
This class crashes with stale data on commit.
class CommodityTypes(Base):
    __tablename__ = 'CommodityTypes'
    uuidKey = Column(GUID, primary_key=True)
    myName = Column(String, unique=True)
    sortKey = Column(Integer, unique=True)
...but this class works:
class NewTypes(Base):
    __tablename__ = 'CommodityTypes'
    uuidKey = Column(String, primary_key=True)
    myName = Column(String, unique=True)
    sortKey = Column(Integer, unique=True)
Objects queried via the CommodityTypes class show the Python UUID type for uuidKey, and the script reads them correctly. I can change attribute values, but I can't commit; the decorated uuidKey doesn't seem to work.
I can go forward just using strings for the uuidKey columns, but it frustrates me that the code from http://docs.sqlalchemy.org/en/latest/core/types.html#backend-agnostic-guid-type almost works.
Here's sample code with the problem. The string workaround not using the GUID type decorator is commented out.
# system modules
import uuid

# other modules
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.types import TypeDecorator, CHAR
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

engine = create_engine('sqlite:////home/XXXX/XobfuscatedXXXX/XXXXXXXX.sqlite')
Base = declarative_base()
Session = sessionmaker(bind=engine)

class GUID(TypeDecorator):
    """Platform-independent GUID type.

    Uses Postgresql's UUID type, otherwise uses
    CHAR(32), storing as stringified hex values.
    """
    impl = CHAR

    def load_dialect_impl(self, dialect):
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))
    def process_bind_param(self, value, dialect):
        if value is None:
            return value
        elif dialect.name == 'postgresql':
            return str(value)
        else:
            if not isinstance(value, uuid.UUID):
                return "%.32x" % uuid.UUID(value).int
            else:
                # hexstring
                return "%.32x" % value.int

    def process_result_value(self, value, dialect):
        if value is None:
            return value
        else:
            return uuid.UUID(value)
from sqlalchemy import Column, Boolean, DateTime, Date, Float, ForeignKey, Integer, Numeric, String

class CommodityTypes(Base):
    __tablename__ = 'CommodityTypes'
    uuidKey = Column(GUID, primary_key=True)
    myName = Column(String, unique=True)
    sortKey = Column(Integer, unique=True)

#class NewTypes(Base):
#    __tablename__ = 'CommodityTypes'
#    uuidKey = Column(String, primary_key=True)
#    myName = Column(String, unique=True)
#    sortKey = Column(Integer, unique=True)

if __name__ == "__main__":
    session = Session()

#    newList = session.query(NewTypes).order_by(NewTypes.sortKey)
#    for instance in newList:
#        print(instance.myName)
#
#    nt = newList[1]
#    print(nt.myName)
#    print(nt.sortKey)
#    nt.sortKey = 11
#    print(nt.sortKey)
#    session.commit()
#    print(nt.sortKey)

    ctList = session.query(CommodityTypes).order_by(CommodityTypes.sortKey)
    for instance in ctList:
        print(instance.myName)

    ct = ctList[1]
    print(ct.myName)
    print(ct.sortKey)
    ct.sortKey = 22
    print(ct.sortKey)
    session.commit()
    print(ct.sortKey)
Oh, forgot to mention software versions:
Python 3.1.3 (r313:86834, Dec 1 2010, 06:15:12)
[GCC 4.1.2 20080704 (Red Hat 4.1.2-48)] on linux2
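One likely culprit, offered only as a hedged guess: the existing rows were written as str(uuid), which is the dashed 36-character form, while the recipe's process_bind_param emits 32 hex characters without dashes. The UPDATE issued on commit then filters on a primary key value that matches no row, and SQLAlchemy reports that as stale data. If the stored strings really are dashed, a variant of the decorator that binds and returns the dashed form should line up with the existing data (DashedGUID is a made-up name, not from the linked recipe):

class DashedGUID(TypeDecorator):
    """Like the GUID recipe, but keeps the dashed 36-character string
    on non-PostgreSQL backends to match rows written as str(uuid4())."""
    impl = CHAR

    def load_dialect_impl(self, dialect):
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(CHAR(36))

    def process_bind_param(self, value, dialect):
        if value is None:
            return value
        # str() of a uuid.UUID is the dashed representation
        return str(value if isinstance(value, uuid.UUID) else uuid.UUID(value))

    def process_result_value(self, value, dialect):
        return value if value is None else uuid.UUID(value)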
