How to create a temporary table by passing in data to psycopg2? - psycopg2

I have a pandas dataframe that I want to pass into a psycopg2 execute statement as a temporary table. This should be very simple:
pseudo-code...
string = """
with temporary_table (id, value) as (values %s)
select * from temporary_table
"""
cur.execute(string, df)
Where df is just a dataframe with an id and value column.
What would be the syntax to use such that I'd be able to pass this data in as a temporary table and use it in my query?

I would create a temporary table in the Postgres database with df.to_sql (or with an INSERT query built from the values), query it, and drop it at the end of the process.
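A minimal sketch of the insert-and-query variant with plain psycopg2 (df.to_sql would need an SQLAlchemy engine); it assumes a dataframe df with integer id and value columns, and the temp table name is illustrative:
import psycopg2
from psycopg2.extras import execute_values

con = psycopg2.connect("dbname=test user=postgres host=localhost port=5432")
cur = con.cursor()
# TEMP tables live only for the current session and are dropped automatically
cur.execute("CREATE TEMP TABLE temporary_table (id integer, value integer)")
# Turn the dataframe rows into plain tuples and bulk-insert them
rows = list(df[["id", "value"]].itertuples(index=False, name=None))
execute_values(cur, "INSERT INTO temporary_table (id, value) VALUES %s", rows)
cur.execute("SELECT * FROM temporary_table")
print(cur.fetchall())
con.close()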

A test case that I think does what you want:
import psycopg2
from psycopg2.extras import execute_values
con = psycopg2.connect("dbname=test user=postgres host=localhost port=5432")
sql_str = """WITH temporary_table (
id,
value
) AS (
VALUES %s
)
SELECT
*
FROM
temporary_table
"""
cur = con.cursor()
execute_values(cur, sql_str, ((1, 2), (2,3)))
cur.fetchall()
[(1, 2), (2, 3)]
Using execute_values from Fast Execution Helpers.
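Note that execute_values splits the argument list into batches of page_size rows (100 by default) and runs the statement once per batch. For a query like this that returns rows, either make page_size at least as large as the data, or pass fetch=True (psycopg2 2.8+) so the rows from every batch are collected and returned. A sketch, reusing sql_str from above:
# fetch=True makes execute_values return the combined result rows itself
result = execute_values(cur, sql_str, ((1, 2), (2, 3), (5, 6)), page_size=1000, fetch=True)
print(result)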
UPDATE
Sticking to just execute:
import psycopg2
from psycopg2 import sql
con = psycopg2.connect("dbname=test user=postgres host=localhost port=5432")
cur = con.cursor()
input_data = ((1,2), (3,4), (5,6))
sql_str = sql.SQL("""WITH temporary_table (
id,
value
) AS (
VALUES {}
)
SELECT
*
FROM
temporary_table
""").format(sql.SQL(', ').join(sql.Placeholder() * len(input_data)))
cur.execute(sql_str, input_data)
cur.fetchall()
[(1, 2), (3, 4), (5, 6)]
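If the data starts out as a pandas dataframe, as in the question, it can be converted into the tuple-of-tuples shape both variants above expect. A sketch, assuming df has id and value columns:
# Plain row tuples that psycopg2 can adapt; feeds execute_values or the
# sql.Placeholder version above (rebuild the placeholder list from len(input_data))
input_data = list(df[["id", "value"]].itertuples(index=False, name=None))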

Related

How to fetch data from postgres including column name using psycopg2 in python3

I am trying to fetch data from a postgres table using psycopg2.
Here is what I have done.
import psycopg2
con = psycopg2.connect("host=localhost dbname=crm_whatsapp user=odoo password=password")
db_cursor = con.cursor()
sql = """SELECT * from tbl_hospital;"""
db_cursor.execute(sql)
hospital_data = db_cursor.fetchall()
print('hospital_data',hospital_data)
And the output is:
hospital_data [(1, 'hospital1', 1), (2, 'hospital2', 2), (3, 'hospital3', 3), (4, 'hospital4', 1)]
The output does not contain the column headers. I need those too. How can I get them?
The cursor has the metadata in it.
From "Programming Python" by M. Lutz:
...
db_cursor.execute(sql)
colnames = [desc[0] for desc in db_cursor.description]
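A short sketch putting that together with the snippet from the question (db_cursor and tbl_hospital as above):
db_cursor.execute("SELECT * from tbl_hospital;")
colnames = [desc[0] for desc in db_cursor.description]
hospital_data = db_cursor.fetchall()
print(colnames)
print(hospital_data)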

Querying MSSQL using pyodbc putting results in one column

I have a table in MSSQL with columns of data. I can successfully connect and pull data, but the results end up in a single column of the dataframe. What am I missing?
I've tried the code below, which puts all the results into one column of the dataframe.
import pyodbc
import pandas as pd
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=server;'
                      'Database=server;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()
cursor.execute("""SELECT [leadid], [CreateDate], [Status Change Count],
                  [logdate], [statustitle], [groupedstatus], [leadbucket] FROM vel_actions""")
df = pd.DataFrame(cursor.fetchall())
df.head() returns:
0 [2065004, 2018-03-12 03:06:10.0000000, 1, 2018...
1 [2065004, 2018-03-12 03:06:10.0000000, 2, 2018...
2 [2065004, 2018-03-12 03:06:10.0000000, 3, 2018...
3 [2065004, 2018-03-12 03:06:10.0000000, 4, 2018...
4 [2065004, 2018-03-12 03:06:10.0000000, 5, 2018...
How do I get the data into a dataframe and keep the columns that my SQL table has?
fetchall() returns a list of tuples; the column names are not part of that data.
sqlite3 is used in place of pyodbc below, because a self-contained pyodbc example cannot be demonstrated without a SQL Server instance.
import pandas as pd
import sqlite3
conn = sqlite3.connect('example.db')
c = conn.cursor()
# Create table
c.execute('''CREATE TABLE stocks
             (date text, trans text, symbol text, qty_real, price_real)''')
# Insert a row of data
c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
# Larger example that inserts many records at a time
purchases = [('2006-03-28', 'BUY', 'IBM', 1000, 45.00),
             ('2006-04-05', 'BUY', 'MSFT', 1000, 72.00),
             ('2006-04-06', 'SELL', 'IBM', 500, 53.00),
             ]
c.executemany('INSERT INTO stocks VALUES (?,?,?,?,?)', purchases)
# Save (commit) the changes
conn.commit()
c = conn.cursor()
c.execute("select date, trans, symbol, qty_real, price_real from stocks")
list_o_tuples = c.fetchall()
df = pd.DataFrame(list_o_tuples)
df.columns = ["date", "trans", "symbol", "qty_real", "price_real"]
print(df)
For sqlite3, you can have it return the column names, which may be passed into your DataFrame constructor.
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute("select date, trans, symbol, qty_real, price_real from stocks")
list_o_dict_like_rows = c.fetchall()
l = [dict(i) for i in list_o_dict_like_rows]
df = pd.DataFrame(l)
print(df)
conn.close()
And for pyodbc the answer (cannot be reproduced here) would be something like:
cursor = conn.cursor()
cursor.execute("""SELECT [leadid], [CreateDate], [Status Change Count],
[logdate], [statustitle], [groupedstatus], [leadbucket] FROM vel_actions""")
columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)
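If the rows still land in a single column even with explicit column names (pyodbc returns Row objects rather than plain tuples, and some pandas versions treat each Row as a single value), converting the rows explicitly is a common workaround. A sketch, not tested against SQL Server here:
columns = [column[0] for column in cursor.description]
data = [tuple(row) for row in cursor.fetchall()]  # force plain tuples
df = pd.DataFrame(data, columns=columns)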

Python dictionary to sqlite database

I'm trying to write a dictionary into an existing SQL database, but without success; it gives me:
sqlite3.InterfaceError: Error binding parameter 0 - probably unsupported type.
Based on my minimal example, does anybody have some useful hints? (Python 3)
Command to create the empty db3 anywhere on your machine:
CREATE TABLE "testTable" (
sID INTEGER NOT NULL UNIQUE PRIMARY KEY,
colA REAL,
colB TEXT,
colC INTEGER);
And the code for putting my dictionary into the database looks like:
import sqlite3

def main():
    path = '***anywhere***/test.db3'
    data = {'sID': [1, 2, 3],
            'colA': [0.3, 0.4, 0.5],
            'colB': ['A', 'B', 'C'],
            'colC': [4, 5, 6]}
    db = sqlite3.connect(path)
    c = db.cursor()
    writeDict2Table(c, 'testTable', data)
    db.commit()
    db.close()
    return

def writeDict2Table(cursor, tablename, dictionary):
    qmarks = ', '.join('?' * len(dictionary))
    cols = ', '.join(dictionary.keys())
    values = tuple(dictionary.values())
    query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, cols, qmarks)
    cursor.execute(query, values)
    return

if __name__ == "__main__":
    main()
I have already had a look at
Python : How to insert a dictionary to a sqlite database?
but unfortunately I did not succeed.
You must not use a dictionary with question marks as parameter markers, because there is no guarantee about the order of the values.
To handle multiple rows, you must use executemany().
And executemany() expects each item to contain the values for one row, so you have to rearrange the data:
>>> print(*zip(data['sID'], data['colA'], data['colB'], data['colC']), sep='\n')
(1, 0.3, 'A', 4)
(2, 0.4, 'B', 5)
(3, 0.5, 'C', 6)
cursor.executemany(query, zip(data['sID'], data['colA'], data['colB'], data['colC']))
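Putting both points together, a corrected version of the question's writeDict2Table might look like this (a sketch, assuming the same column-name-to-list dictionary):
def writeDict2Table(cursor, tablename, dictionary):
    qmarks = ', '.join('?' * len(dictionary))
    cols = ', '.join(dictionary.keys())
    # keys() and values() iterate in the same order, and zip(*...) turns the
    # column-wise lists into row tuples: (1, 0.3, 'A', 4), (2, 0.4, 'B', 5), ...
    rows = zip(*dictionary.values())
    query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, cols, qmarks)
    cursor.executemany(query, rows)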

Extracting pandas dataframes from dictionary

I have exported several tables from SQLite into a dictionary. Is there a way to extract all of the dataframes that are now in the dictionary and create one dataframe for each ? (ex. df1, df2, df3, etc...)
import sqlite3 as sq3
import pandas as pd

conn = sq3.connect('C:/Users/database.db')
cur = conn.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
tablenames = cur.fetchall()
df_dict = {}
for tablename in tablenames:
    tablename = tablename[0]
    df_dict[tablename] = pd.read_sql_query('SELECT * FROM %s' % tablename, conn).set_index('date')
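The dictionary is usually the most convenient container; each table's dataframe can then be pulled out by its name. A sketch (the table names here are hypothetical):
# One DataFrame per table, keyed by table name
for name, frame in df_dict.items():
    print(name, frame.shape)

# Individual tables can be bound to their own variables if needed
df_sales = df_dict['sales']          # hypothetical table name
df_customers = df_dict['customers']  # hypothetical table name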

Filtering rows in Spark Dataframe based on multiple values in a list [duplicate]

I want to filter a Pyspark DataFrame with a SQL-like IN clause, as in
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlc = SQLContext(sc)
df = sqlc.sql('SELECT * from my_df WHERE field1 IN a')
where a is the tuple (1, 2, 3). I am getting this error:
java.lang.RuntimeException: [1.67] failure: ``('' expected but identifier a found
which is basically saying it was expecting something like '(1, 2, 3)' instead of a.
The problem is I can't manually write the values in a as it's extracted from another job.
How would I filter in this case?
The string you pass to SQLContext is evaluated in the scope of the SQL environment. It doesn't capture the closure. If you want to pass a variable, you'll have to do it explicitly using string formatting:
df = sc.parallelize([(1, "foo"), (2, "x"), (3, "bar")]).toDF(("k", "v"))
df.registerTempTable("df")
sqlContext.sql("SELECT * FROM df WHERE v IN {0}".format(("foo", "bar"))).count()
## 2
Obviously this is not something you would use in a "real" SQL environment due to security considerations but it shouldn't matter here.
In practice DataFrame DSL is a much better choice when you want to create dynamic queries:
from pyspark.sql.functions import col
df.where(col("v").isin({"foo", "bar"})).count()
## 2
It is easy to build and compose and handles all details of HiveQL / Spark SQL for you.
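Applied to the variable a from the question, which is produced by another job, that looks like this (a sketch):
from pyspark.sql.functions import col

a = (1, 2, 3)  # comes from another job in the question
filtered = my_df.where(col("field1").isin(list(a)))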
Reiterating what zero323 mentioned above: we can do the same thing using a list as well (not only a set), like below
from pyspark.sql.functions import col
df.where(col("v").isin(["foo", "bar"])).count()
Just a little addition/update:
choice_list = ["foo", "bar", "jack", "joan"]
If you want to filter your dataframe df so that it keeps only the rows whose column v takes values from choice_list, then
from pyspark.sql.functions import col
df_filtered = df.where(col("v").isin(choice_list))
You can also do this for integer columns:
df_filtered = df.filter("field1 in (1,2,3)")
or this for string columns:
df_filtered = df.filter("field1 in ('a','b','c')")
A slightly different approach that worked for me is to filter with a custom filter function.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType

def filter_func(a):
    """Wrapper function to pass a into a udf."""
    def filter_func_(col):
        """Filtering function."""
        if col in a.value:
            return True
        return False
    return udf(filter_func_, BooleanType())

# Broadcasting allows passing large variables efficiently
a = sc.broadcast((1, 2, 3))
df = my_df.filter(filter_func(a)(col('field1')))
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.appName('Practise').getOrCreate()
df_pyspark = spark.read.csv('datasets/myData.csv', header=True, inferSchema=True)
df_pyspark.createOrReplaceTempView("df")  # we need to create a temp view first
spark.sql("SELECT * FROM df where Departments in ('IOT','Big Data') order by Departments").show()
