How to load data from a connection string with vaex package? - python-3.x

If I have a table on my server and I am producing a connection string to it, how can I, using Vaex, load it to a dataframe?
Here is what I am doing but with Pandas:
from sqlalchemy import types, create_engine, text
import pandas as pd
import pymysql

def connect_to_data(driver='mysql+pymysql://', conn_string=''):
    try:
        conn = create_engine(driver + conn_string)
        print("MySQL Connection Successful!")
    except Exception as err:
        print("MySQL Connection Failed!")
        print(err)
    return conn
# Connect to the db:
conn_string = 'xxxxxxxx'
conn = connect_to_data(conn_string=conn_string)
# Get all requests from the db:
query = '''SELECT * FROM table_name'''
result = conn.execute(text(query))
# Desired dataframe:
df = pd.read_sql_query(query, conn)
How can I do the same with Vaex (because of its high performance)?

For now, at least, you can't do it directly. But vaex can easily read a pandas dataframe, so you can do:
import vaex

# Following your example..
pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
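
If the table is too large to hold in memory in one go, a workaround (just a sketch, reusing the query and conn objects from the example above) is to read it in chunks with pandas and concatenate the pieces in vaex:

import pandas as pd
import vaex

# Read the result set in chunks to limit peak memory, then combine the pieces in vaex.
chunks = []
for pandas_chunk in pd.read_sql_query(query, conn, chunksize=100_000):
    chunks.append(vaex.from_pandas(pandas_chunk))
df = vaex.concat(chunks)

Keep in mind that the conversion still materialises everything in RAM; exporting the result once with df.export_hdf5('data.hdf5') and re-opening that file with vaex.open is what typically unlocks vaex's memory-mapped performance.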

Related

Issues with multiprocessing and the getpass module

Trying to write multiprocessing code using the getpass module:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np

pw = getpass.getpass(prompt="Password", stream=False)

# establishing connection to the ODS database
ODS = jaydebeapi.connect(
    'com.ibm.db2.jcc.DB2Driver',
    'jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY',
    ['f408195', pw],
    'C:/JDBC/db2jcc.jar')

# Allows SQL statements against the ODS database
ODS = ODS.cursor()

# creating the password needed to establish the PML database connection
pw_2 = getpass.getpass(prompt="Password", stream=False)

# establishing connection to the PML database
PML = jaydebeapi.connect(
    'com.ibm.db2.jcc.DB2Driver',
    'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2',
    ['f408195', pw_2],
    'C:/JDBC/db2jcc.jar')

# Allows SQL statements against the PML database
PML = PML.cursor()

def test(first_evnt, last_evnt):
    PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
    PML.execute(PML_loan_Query, (first_evnt, last_evnt))
    loan_records = PML.fetchall()
    df = pd.DataFrame()
    for x in loan_records:
        # Populating the ODS table
        #borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
        #borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        ODS.execute(borr_query, x)
        #ODS.execute(ODS_list)
        ODS_records = ODS.fetchall()
        ODS_records = df.append(pd.DataFrame(ODS_records, columns=['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
    return ODS_records

if __name__ == '__main__':
    freeze_support()
    first_evnt = 155643917
    last_evnt = 155684481
    p = Pool()
    result = p.map(test, [first_evnt, last_evnt])
    print(result)
    p.close()
    p.join()
I saved this script into a .py file and tried to run it from the command prompt. It asked for the password for my ODS database, then for my PML database, and then it seems to keep running the getpass prompt over and over again.
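A likely cause (an assumption based on the symptom; it is not stated in the question) is that on Windows multiprocessing starts workers with the spawn method, so each worker re-imports the script and re-executes everything at module level, including the getpass prompts and the database connections. A minimal sketch of the usual fix is to move the prompts under the __main__ guard so they run only in the parent process:

import getpass
from multiprocessing import Pool, freeze_support

def test(evnt_range):
    # hypothetical worker: receives plain data and opens its own DB connections if it needs them
    first_evnt, last_evnt = evnt_range
    ...

if __name__ == '__main__':
    freeze_support()
    # the prompts now run exactly once, in the parent process only
    pw = getpass.getpass(prompt="Password", stream=False)
    pw_2 = getpass.getpass(prompt="Password", stream=False)
    with Pool() as p:
        result = p.map(test, [(155643917, 155684481)])
    print(result)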

Size in spark dataframe

I created a DataFrame from a table in my Postgres database. When I run df.count() to see the number of rows, I get this warning:
WARN TaskSetManager: Stage 9 contains a task of very large size (22439 KiB). The maximum recommended task size is 1000 KiB.
What does that mean? What is the maximum size of a DataFrame in Spark?
Here is how I connected to the Postgres database:
import configparser
import psycopg2
import pandas as pd
from queries import COUNTRY_TABLE, ACTORS_TABLE, COL_ACTOR, COL_COUNTRY
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf

spark = SparkSession.builder.appName('ETL dvdrental pyspark').getOrCreate()

def connection_db():
    conn = psycopg2.connect("host=localhost dbname=demo user=postgres password=admin port=5432")
    cur = conn.cursor()
    return [cur, conn]

def extract_data(query):
    conn_param = connection_db()
    cur = conn_param[0]
    conn = conn_param[1]
    try:
        cur.execute(query)
        data = cur.fetchall()
        return data
    except Exception as e:
        print(e)

tickets_col = ["ticket_no", "book_ref", "passenger_id", "passenger_name", "contact_data"]
tickets = spark.createDataFrame(extract_data("SELECT * FROM tickets")).toDF(*tickets_col)
tickets.count()
I get the warning when I execute tickets.count().
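
The warning is not about a hard limit on DataFrame size. Because the rows are fetched on the driver with psycopg2 and then passed to spark.createDataFrame, the data is shipped to the executors inside the task itself, which is what makes each task so large. A sketch of the usual alternative (assuming a local Postgres and a PostgreSQL JDBC driver jar, neither of which is stated in the question) is to let Spark read the table through its JDBC source so the rows never pass through the driver:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName('ETL dvdrental pyspark')
         # hypothetical path to the PostgreSQL JDBC driver jar
         .config('spark.jars', '/path/to/postgresql-42.x.x.jar')
         .getOrCreate())

# Spark reads the table directly from Postgres and partitions the work itself.
tickets = (spark.read.format('jdbc')
           .option('url', 'jdbc:postgresql://localhost:5432/demo')
           .option('dbtable', 'tickets')
           .option('user', 'postgres')
           .option('password', 'admin')
           .load())

tickets.count()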

How to Encode (utf-8) in Pandas (Excel as source)

I am trying to read from Excel and load into MongoDB using PyMongo.
The error I get is "cannot encode object: , of type: <class 'pandas._libs.missing.NAType'>". When I researched it, I was told to use the utf-8-sig format when inserting into MongoDB, but in a pandas DataFrame there is no option to use utf-8.
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import Parameters
import pandasql as pf
import json
import pymongo
import xlrd
from pathlib import Path
import os
import constants

try:
    class conn:
        def __init__(self):
            client = pymongo.MongoClient("mongodb://" + constants.USER_NAME + ":" + constants.PWD + constants.server + constants.CA_CERTIFICATES_PATH)
            db = client[Parameters.STG_QC_Hub_Files]
            week = "08-02-2021"
            out_col = db[Parameters.col]
            filename = "1.xlsx"
            path1 = Path('//test3' + '/' + filename)
            data_load_date = datetime.today().strftime('%m-%d-%Y')
            df1 = pd.read_excel(path1, sheet_name="AU-ARCM Details", keep_default_na=False)
            # df1 = pd.read_excel(xls+filename, keep_default_na=False, encoding='utf-8-sig')
            # df1 = pd.read_csv(xls, keep_default_na=False, encoding='utf-8-sig').iloc[:, : 86]
            df1["Week"] = week
            df1["Data Load Date"] = data_load_date
            df1 = df1.astype('string')
            # df1.index = df1.index.str.encode('utf-8')
            df1 = df1.drop(['Source.Name'], axis=1)
            records = json.loads(df1.T.to_json()).values()
            out_col.insert_many(df1.to_dict('records'))
            print("Imported File " + str(filename) + " with " + str(len(records)) + " records")

    c = conn()
except Exception as e:
    print(e)
Traceback:
  File "C:\Users\PycharmProjects\ReMs\venv\lib\site-packages\pymongo\message.py", line 1323, in _do_batched_op_msg
    operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>
You have some blank cells in your spreadsheet for which pandas has its own missing-value type (NAType, shown as <NA>); pymongo doesn't know what to do with this type, hence the error. You will need to remove these in order to load the values into MongoDB using the method you are using.
Consider something like this just before you attempt the insert:
import numpy as np
df1 = df1.replace(np.nan, None)
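
If replace(np.nan, None) does not catch pandas' own <NA> values in your version of pandas (the df1.astype('string') call above is what produces NAType rather than NaN), a sketch of an equivalent clean-up is:

# Convert every missing value (NaN or pandas' <NA>) to plain None,
# which pymongo can encode as a BSON null.
df1 = df1.astype(object).where(df1.notna(), None)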

Python pandas into azure SQL, bulk insert

How can I arrange a bulk insert of a Python DataFrame into the corresponding Azure SQL table?
I see that INSERT works with individual records:
INSERT INTO XX ([Field1]) VALUES (value1);
How can I insert the entire content of the DataFrame into the Azure table?
Thanks
According to my test, we can also use to_sql to insert data into Azure SQL. For example:
from urllib.parse import quote_plus
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, event
import pyodbc
# Azure SQL connection string
conn = 'Driver={ODBC Driver 17 for SQL Server};Server=tcp:<server name>.database.windows.net,1433;Database=<db name>;Uid=<user name>;Pwd=<password>;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
quoted = quote_plus(conn)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted))

@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    print("FUNC call")
    if executemany:
        cursor.fast_executemany = True

# insert
table_name = 'Sales'
# For the test, I use a csv file to create the dataframe
df = pd.read_csv(r'D:\data.csv')
df.to_sql(table_name, engine, index=False, if_exists='replace', schema='dbo')

# test after inserting
query = 'SELECT * FROM {table}'.format(table=table_name)
dfsql = pd.read_sql(query, engine)
print(dfsql)
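
As a side note, on SQLAlchemy 1.3 or later the same batching can be switched on directly when creating the engine, which makes the event listener unnecessary (a sketch reusing quoted and df from the code above):

# fast_executemany batches the INSERT statements that to_sql issues through pyodbc
engine = create_engine(
    'mssql+pyodbc:///?odbc_connect={}'.format(quoted),
    fast_executemany=True,
)
df.to_sql(table_name, engine, index=False, if_exists='replace', schema='dbo')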

Autocommit is causing pandas to_sql to fail

I've got a problem with my engine parameters:
import pyodbc
import pandas as pd
from sqlalchemy import create_engine
import urllib
conn_str = (
    r'Driver=ODBC Driver 11 for SQL Server;'
    r'Server=Saturn;'
    r'Database=ExperienceRating2019;'
    r'Trusted_Connection=yes;'
)
quoted_conn_str = urllib.parse.quote_plus(conn_str)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted_conn_str)).execution_options(autocommit=True)
cnxn = engine.connect()
splitpoint = 17000
excel_file = "#2 DRATIO RUN.xlsx"
d_ratio_sheet = "D RATIO & ELR"
d_ratio = pd.read_excel(open(excel_file,'rb'),sheet_name = d_ratio_sheet)
d_ratio.to_sql("d_ratio", cnxn, if_exists = 'replace')
I will get the following error:
DBAPIError: (pyodbc.Error) ('HY010', '[HY010] [Microsoft][ODBC Driver 11 for SQL Server]Function sequence error (0) (SQLFetch)') (Background on this error at: http://sqlalche.me/e/dbapi)
If I change my engine to drop the autocommit option:
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted_conn_str))
The error goes away (yay!), but later in my code, where I execute a stored procedure, it no longer commits:
engine.execute("sp_refresh_inputs")
Question: How can I change my connection so that both pandas and SQLAlchemy work?
I ended up using two engines, one for pandas and one for SQLAlchemy:
cnxn = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted_conn_str))
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted_conn_str)).execution_options(autocommit=True)
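
For clarity, the two objects are then used side by side, following the code in the question (nothing new beyond it):

# pandas writes through the plain engine and commits via its own transaction handling
d_ratio.to_sql("d_ratio", cnxn, if_exists='replace')

# the autocommit engine is used for statements that must commit immediately,
# such as the stored procedure call
engine.execute("sp_refresh_inputs")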
