Check null values in data fetched from SQL database - python-3.x

I want to store data fetched from the database in numpy arrays. I want to make sure that no null values (None) end up in the numpy array (which throws an error anyway). I have tried to do it the following way, but it does not work: for some reason, NullValueCheck() always returns True. How can I detect null values and do something about them?
import numpy as np
import pyodbc as odbc
cnxn = odbc.connect(conn_string)
cursor = cnxn.cursor()
cursor.execute("""SELECT ID, BuildingID, Title FROM Something""")
rows = cursor.fetchall()
cnxn.close()
ID = [i[0] for i in rows]
buildingID = [i[1] for i in rows]
title = [i[2] for i in rows]
def NullValueCheck(rows):
    if any(elem is None for elem in rows[0]):
        return True
    else:
        return False

if NullValueCheck(rows):
    ID_array = np.fromiter(ID, dtype=np.int32)
Edit:
It turns out that I don't have to write all that code. I can achieve with a pandas DataFrame the same thing I wanted to achieve with a numpy array.
import pandas as pd
import pyodbc as odbc
cnxn = odbc.connect(conn_string)
df = pd.io.sql.read_sql("""SELECT ID, BuildingID, Title FROM Something""", cnxn)

I've found this is easiest to address in your source SQL. COALESCE is helpful here:
df = pd.io.sql.read_sql("""SELECT ID, COALESCE(BuildingID, 0) AS BuildingID, Title FROM Something""", cnxn)
This will return 0 if the value of BuildingID is NULL. Different SQL databases have their own NULL-handling functions (ISNULL in SQL Server, IFNULL in MySQL, for example), but COALESCE is the most cross-database compatible.
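If you would rather keep the NULLs in the query and deal with them on the Python side, pandas makes the check easy once the data is in a DataFrame. A minimal sketch, assuming the conn_string and table from the question, and using 0 purely as an example fill value:

import pandas as pd
import pyodbc as odbc

cnxn = odbc.connect(conn_string)
df = pd.read_sql("SELECT ID, BuildingID, Title FROM Something", cnxn)
cnxn.close()

# Boolean Series: True for every column that contains at least one NULL/NaN
print(df.isnull().any())

# Drop the affected rows, or fill them before converting to numpy
buildingID = df["BuildingID"].fillna(0).to_numpy(dtype="int32")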

Related

Column names not displayed with st.dataframe after PostgreSQL `SELECT *` into Pandas DataFrame

Python 3.8.10
streamlit==1.9.0
pandas==1.4.2
psycopg2-binary==2.9.3
I am loading a Postgres table directly into a Pandas DataFrame with the following code.
df = pd.DataFrame(run_query("SELECT * FROM schema.tablename;"))
Displaying it with either streamlit.dataframe(df) or streamlit.write(df) loses the column names.
In order to capture the column names, I use this kluge.
# Initialize connection.
@st.experimental_singleton
def init_connection():
    return psycopg2.connect(**st.secrets["postgresservername"])

conn = init_connection()

# Perform query.
@st.experimental_memo(ttl=600)
def run_query(query):
    with conn.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()
def load_table_as_dataframe(table):
    # This is super klugy.
    data = run_query("SELECT * FROM schema.{};".format(str(table)))
    columns = run_query("SELECT * FROM information_schema.columns WHERE table_schema = 'schema' AND table_name = '{}';".format(str(table)))
    # Fish out the actual column names
    columns = [c[3] for c in columns]
    df = pd.DataFrame(data, columns=columns)
    return df
df = load_table_as_dataframe("tablename")
Which works...
Is there a better way to collect the needed data (and columns names) into a Pandas DataFrame within Postgres and Streamlit?
Using...
df = pd.read_sql("SELECT * FROM schema.{};".format(str(table)), conn)
...solved the issue. (Thanks @parfait)
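For completeness, pd.read_sql can also be combined with the cached pattern from the question so the column names are preserved and the query is memoized. A minimal sketch, assuming the st, pd and conn objects defined above; "tablename" is just a placeholder:

@st.experimental_memo(ttl=600)
def load_table_as_dataframe(table):
    # pd.read_sql takes the column names from the cursor description,
    # so no separate information_schema query is needed
    return pd.read_sql("SELECT * FROM schema.{};".format(str(table)), conn)

df = load_table_as_dataframe("tablename")
st.dataframe(df)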

Psycopg2 can't write numpy nans to postgresql table: invalid input syntax for type double precision: ""

I have a small Python script that builds a DataFrame with one (or more) NaNs and then writes it to a Postgres database with the psycopg2 module, using the copy_from function. Here it is:
import logging
import traceback
from io import StringIO

import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import sql

table_name = "test"
df = pd.DataFrame([[1.0, 2.0], [3.0, np.nan]], columns=["VALUE0", "VALUE1"], index=pd.date_range("2000-01-01", "2000-01-02"))
database = "xxxx"
user = "xxxxxxx"
password = "xxxxxx"
host = "127.0.0.1"
port = "xxxxx"

def nan_to_null(f,
                _NULL=psycopg2.extensions.AsIs('NULL'),
                _NaN=np.NaN,
                _Float=psycopg2.extensions.Float):
    if f != f:
        return _NULL
    else:
        return _Float(f)

psycopg2.extensions.register_adapter(float, nan_to_null)
psycopg2.extensions.register_adapter(np.float, nan_to_null)
psycopg2.extensions.register_adapter(np.float64, nan_to_null)

with psycopg2.connect(database=database,
                      user=user,
                      password=password,
                      host=host,
                      port=port) as conn:
    try:
        with conn.cursor() as cur:
            cmd = "CREATE TABLE {} (TIMESTAMP timestamp PRIMARY KEY NOT NULL, VALUE0 FLOAT, VALUE1 FLOAT)"
            cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
            buffer = StringIO()
            df.to_csv(buffer, index_label='TIMESTAMP', header=False)
            buffer.seek(0)
            cur.copy_from(buffer, table_name, sep=",")
            conn.commit()
    except Exception as e:
        conn.rollback()
        logging.error(traceback.format_exc())
        raise e
The problem is that psycopg2 fails to convert NaN into a Postgres NULL, even though I have used this trick:
How do I convert numpy NaN objects to SQL nulls?
(the nan_to_null function).
I cannot make it work; it throws the following exception:
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type double precision: ""
CONTEXT: COPY test, line 2, column value1: ""
I am using Python 3.8 on Windows 10 with Anaconda 3, psycopg2 v2.8.5 and Postgres v12.3.
Thanks!
Here is the same code, updated with Adrian Klaver's solution.
The line that changed is:
df.to_csv(buffer, index_label='TIMESTAMP', header=False, na_rep='NaN')
We added na_rep='NaN' to the to_csv call. There is no need to replace the NaNs with another line of code; replacing them with 'NULL' does not work.
import psycopg2, logging, numpy as np, pandas as pd
from psycopg2 import sql
import traceback
from io import StringIO

if __name__ == '__main__':
    table_name = "test"
    df = pd.DataFrame([[1.0, 2.0], [3.0, np.nan]], columns=["VALUE0", "VALUE1"], index=pd.date_range("2000-01-01", "2000-01-02"))
    database = "xxxxxx"
    user = "xxxxx"
    password = "xxxxxx"
    host = "127.0.0.1"
    port = "xxxxxx"
    with psycopg2.connect(database=database,
                          user=user,
                          password=password,
                          host=host,
                          port=port) as conn:
        try:
            with conn.cursor() as cur:
                # Creating a new table test
                cmd = "CREATE TABLE {} (TIMESTAMP timestamp PRIMARY KEY NOT NULL, VALUE0 FLOAT, VALUE1 FLOAT);"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                # Writing the content
                buffer = StringIO()
                df.to_csv(buffer, index_label='TIMESTAMP', header=False, na_rep='NaN')
                buffer.seek(0)
                cur.copy_from(buffer, table_name, sep=",")
                # Reading the table content
                cmd = "SELECT * FROM {};"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                test_data = pd.DataFrame(cur.fetchall())
                print(test_data)
                print(type(test_data.loc[1, 2]))
                # Deleting the test table
                cmd = "DROP TABLE {};"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                conn.commit()
        except Exception as e:
            conn.rollback()
            logging.error(traceback.format_exc())
            raise e
The prints show that the NaN is correctly interpreted and stored in the DB.
The issue is the use of copy_from. From the docs:
Currently no adaptation is provided between Python and PostgreSQL types on COPY: ...
So your adapter does not come into play.
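If you stay on the regular INSERT path instead of COPY, the registered adapter does apply. A minimal sketch using psycopg2.extras.execute_values, which is not part of the original answer and assumes the df, cur and test table from the question:

from psycopg2.extras import execute_values

# itertuples with name=None yields plain (timestamp, VALUE0, VALUE1) tuples;
# the NaN values are np.float64, so the nan_to_null adapter turns them into NULL
rows = list(df.itertuples(index=True, name=None))
execute_values(cur, "INSERT INTO test (TIMESTAMP, VALUE0, VALUE1) VALUES %s", rows)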
UPDATE: a possible solution:
Pandas Changing the format of NaN values when saving to CSV
See @cs95's answer.
It seems you are inserting an empty string instead of a NULL value; you can easily reproduce your error with the following SQL code:
CREATE TABLE test(
x FLOAT
);
INSERT INTO test(x) VALUES ('');
-- ERROR: invalid input syntax for type double precision: "" Position: 29
On the other hand, NaN can be safely inserted into PostgreSQL:
INSERT INTO test(x) VALUES ('NaN');
Note that PostgreSQL's float support differs slightly from the IEEE 754 standard, because PostgreSQL needs all values to be orderable in order to build indexes consistently. Therefore, in PostgreSQL, NaN is greater than or equal to any other number, including itself.
Thanks to Adrian Klaver's and jlandercy's answers, the solution is simple: replace np.nan with 'NaN' manually with the following line, which replaces the nan_to_null function:
df.replace(np.nan, "NaN", inplace=True)
And it works fine. Thank you guys!
Add na_rep='NaN' when you write your CSV file.
If you are using this in conjunction with psycopg2's copy_expert method, you may also need to add the null "NaN" option to your Postgres COPY syntax so that the null representations match up.
Here's an example:
df.to_csv(csv_filename, index=False, na_rep='NaN')

string = sql.SQL("""
    copy {}
    from stdin (
        format csv,
        null "NaN",
        delimiter ',',
        header
    )
""").format(sql.Identifier(table_name))

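For completeness, a minimal sketch of how the composed COPY statement above might be executed; cur and conn are assumed to be an open psycopg2 cursor and connection, as in the earlier snippets:

with open(csv_filename) as f:
    cur.copy_expert(string, f)
conn.commit()
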
Is it possible to use writeStream directly to an API via Spark?

I built some code on Databricks to read a Delta table as a stream (readStream), and then I need to post this streaming data to an API.
In everything I have read, writeStream is used only to create files (.csv, .avro, .parquet, etc.) or to send data to an Event Hub. Is it possible to use writeStream to post to an API?
My code:
from pyspark.sql.functions import unix_timestamp, round, col, lit, to_json, struct
import json
import pandas as pd
import requests

# tried with foreachBatch but it doesn't work
def foreach_batch_function(df, epochId):
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=str(df), verify=False)
    r2.json()
    pass

rs = spark.readStream.format("delta").option('path', '/mnt/gen2/raw/mytable').load()
df = rs.select(round('id_cliente_fat').alias('id_cliente_fat'), 'fone_fat', 'nome_fat', unix_timestamp('dt_nasc_fat', 'YYYY-MM-DD').cast('timestamp').cast('date').alias('birth_date'), 'email_fat')
df2 = df.selectExpr('id_cliente_fat as identifier_code', 'fone_fat as phone_number', 'nome_fat as name', 'birth_date', 'email_fat as email')
data = {'authentication': {'username': 'user', 'password': 'pass'}}
r = requests.post('https://demo.api.com/index.php/api/v5/login/', data=json.dumps(data), verify=False).json()
df3 = df2.withColumn("steps", lit("[1,2,4,7]")).withColumn("place_id", lit(164)).withColumn("token", lit(r["authentication"]["token"]))
df4 = df3.select(to_json(struct(struct("token").alias("authentication"), struct("identifier_code", "phone_number", "name", "birth_date", "email", "steps", "place_id").alias("smsrequest"))).alias(""))
df4.writeStream.foreachBatch(foreach_batch_function).start()
You need to bring the data to the driver with the .collect() method (this is not recommended for large amounts of data).
Try something like this:
def foreach_batch_function(df, epochId):
    # Create a list of dicts: keys are the column names, values are the row values
    json_data = [row.asDict() for row in df.collect()]
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=json_data, verify=False)
    r2.json()
    pass
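If you prefer one request per record rather than one per micro-batch, the foreach sink is an alternative; a minimal sketch, assuming the df4 from the question, whose single column already holds the JSON payload:

def process_row(row):
    # df4 has a single JSON string column, so post its value directly
    requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=row[0], verify=False)

df4.writeStream.foreach(process_row).start()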

Is there a loop function for automating data pull request from Google Trends?

I want to create a loop that pulls data from Google Trends via PyTrends. I need to iterate through a lot of keywords, but Google Trends only allows comparing five keywords at a time, so I iterate through the keywords one by one and build a pandas DataFrame. However, something seems to be off.
I get data, but the resulting DataFrame has values shifted into different rows and duplicate NaN values: instead of 62 rows I get 372 rows (with duplicate NaN).
from pytrends.request import TrendReq
import pandas as pd

pytrend = TrendReq()
kw_list = ['cool', 'fun', 'big', 'house', 'phone', 'garden']
df1 = pd.DataFrame()
for i in kw_list:
    kw_list = i
    pytrend.build_payload([kw_list], timeframe='2015-10-14 2015-12-14', geo='FR')
    df1 = df1.append(pytrend.interest_over_time())
print(df1.head)
I want one coherent DataFrame with the columns 'cool', 'fun', 'big', 'house', 'phone', 'garden' and their respective values in each column on the same row, e.g. a DataFrame with 62 rows and 6 columns.
I'm probably going to lose some rep points because this is an old question (oldish, anyway), but I was struggling with the same problem and I solved it like this:
import pandas as pd
from pytrends.request import TrendReq

pytrend = TrendReq()
kw_list = ['cool', 'fun', 'big', 'house', 'phone', 'garden']
df_gtrends_kw = {}
df_gtrends = pd.DataFrame()
for kw in kw_list:
    pytrend.build_payload(kw_list=[kw], timeframe='today 12-m')
    df_gtrends_kw[kw] = pytrend.interest_by_region(resolution='COUNTRY')
df_gtrends = pd.concat([df_gtrends_kw[key] for key in kw_list], join='inner', axis=1)
According to the official docs, one has to specify the axis along which the dataframes are glued together; in this case axis=1 (the columns), since the index is the same for each dataframe.
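The same concat pattern should also cover the original interest_over_time case and give the 62-row, 6-column frame the question asks for. A minimal sketch, assuming pytrends' usual output of one column per keyword plus an isPartial flag, which is dropped so the columns do not collide:

frames = []
for kw in kw_list:
    pytrend.build_payload([kw], timeframe='2015-10-14 2015-12-14', geo='FR')
    frames.append(pytrend.interest_over_time().drop(columns='isPartial'))
df1 = pd.concat(frames, axis=1)  # one row per date, one column per keyword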

How to get Salesforce data into Python pandas DataFrames

Currently we export Salesforce data to a CSV file and read that CSV file into pandas using the read_csv and to_csv methods. Is there any other way to get data from Salesforce into a pandas DataFrame?
With Python, you can install a package called Simple Salesforce and write SOQL queries to return data:
https://github.com/simple-salesforce/simple-salesforce
Here's an example of how to do this:
import pandas as pd
from simple_salesforce import Salesforce

sf = Salesforce(username='<enter username>', password='<enter password>',
                security_token='<enter your access token from your profile>')
a_query = pd.DataFrame(sf.query(
    "SELECT Name, CreatedDate FROM User")['records'])
In my case, to display the information as a dataframe I had to use the following code:
# Import libraries
import simple_salesforce as ssf, pandas
# Create the connection
session_id, instance = ssf.SalesforceLogin(username='<username>', password='<password>', security_token='<token>', sandbox=False)
sf_ = ssf.Salesforce(instance=instance, session_id=session_id)
# Query to execute
sql_code = "SELECT id, name FROM main_table"
# Store query result as dataframe
information = sf_.query(query= sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')
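As a side note, sf_.query returns only the first batch of results (typically up to 2,000 records); simple-salesforce also provides query_all, which follows the pagination for you. A minimal sketch reusing the sf_ connection and sql_code from above:

# query_all transparently follows the pagination until all records are fetched
information = sf_.query_all(query=sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')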
Adding to the original answer, the function below also works for simple joins.
def sf_results_to_dataframe(results, drop_index=True) -> pd.DataFrame:
    df = pd.DataFrame(results['records'])
    df.drop('attributes', axis=1, inplace=True)  # clean up from technical info
    df.set_index('Id', drop=drop_index, inplace=True)
    for table in ['Account', 'Contact', 'Lead', 'Opportunity']:
        if table in results['records'][0].keys():  # detect JOIN
            local_keys = list(results['records'][0][table].keys())  # keys from the joined table
            if 'attributes' in local_keys:
                local_keys.remove('attributes')
            global_keys = [table + key for key in local_keys]  # names for the fields in the output table
            # fields of the joined table and the record index
            table_records = [{'Id': record['Id'],
                              **{global_key: record[table][local_key] for global_key, local_key in zip(global_keys, local_keys)}}
                             for record in results['records']]
            df_extra = pd.DataFrame(table_records)
            df_extra.set_index('Id', drop=True, inplace=True)  # match index
            df.drop(table, axis=1, inplace=True)  # drop duplicated info
            df = df.merge(df_extra, left_index=True, right_index=True)  # merge on index
    return df
Example:
import pandas as pd
from simple_salesforce import Salesforce
SALESFORCE_EMAIL = '...'
SALESFORCE_TOKEN = '...'
SALESFORCE_PASSWORD = '...'
sf = Salesforce(username=SALESFORCE_EMAIL, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN)
query = """SELECT Id, Name, Account.Name
FROM Contact
LIMIT 1
"""
results = sf.query(query)
df = sf_results_to_dataframe(results)
