How to get Salesforce data into Python Pandas dataframes - python-3.x

Currently we export Salesforce data to a CSV file and read that CSV file into Pandas with the read_csv/to_csv methods. Is there another way to get data from Salesforce into a pandas dataframe?

With Python you can install a package called Simple Salesforce and write SOQL queries to return data:
https://github.com/simple-salesforce/simple-salesforce
Here's an example of how to do this:
import pandas as pd
from simple_salesforce import Salesforce

sf = Salesforce(username='<enter username>', password='<enter password>',
                security_token='<enter your access token from your profile>')
a_query = pd.DataFrame(sf.query(
    "SELECT Name, CreatedDate FROM User")['records'])

In my case, to display the information as a dataframe I had to use the following code:
# Import libraries
import simple_salesforce as ssf, pandas

# Create the connection
session_id, instance = ssf.SalesforceLogin(username='<username>', password='<password>',
                                           security_token='<token>', sandbox=False)
sf_ = ssf.Salesforce(instance=instance, session_id=session_id)

# Query to execute
sql_code = "SELECT id, name FROM main_table"

# Store the query result as a dataframe
information = sf_.query(query=sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')
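For larger extracts, simple_salesforce also exposes a Bulk API wrapper. The following is only a rough sketch, assuming the queried object (Contact here) supports bulk queries, and reusing the sf_ connection above:
# Bulk queries return a plain list of dicts, so there is no ['records'] key to unpack
bulk_records = sf_.bulk.Contact.query("SELECT Id, Name FROM Contact")
bulk_df = pandas.DataFrame(bulk_records).drop(columns='attributes', errors='ignore')  # drop metadata if present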

Adding to the original answer,
the function below also handles simple joins.
def sf_results_to_dataframe(results, drop_index=True) -> pd.DataFrame:
    df = pd.DataFrame(results['records'])
    df.drop('attributes', axis=1, inplace=True)  # clean up from technical info
    df.set_index('Id', drop=drop_index, inplace=True)
    for table in ['Account', 'Contact', 'Lead', 'Opportunity']:
        if table in results['records'][0].keys():  # detect JOIN
            local_keys = list(results['records'][0][table].keys())  # keys from the joined table
            if 'attributes' in local_keys:
                local_keys.remove('attributes')
            global_keys = [table + key for key in local_keys]  # names for the fields in the output table
            # fields of the joined table and the record index
            table_records = [{'Id': record['Id'],
                              **{global_key: record[table][local_key]
                                 for global_key, local_key in zip(global_keys, local_keys)}}
                             for record in results['records']]
            df_extra = pd.DataFrame(table_records)
            df_extra.set_index('Id', drop=True, inplace=True)  # match index
            df.drop(table, axis=1, inplace=True)  # drop duplicated info
            df = df.merge(df_extra, left_index=True, right_index=True)  # merge on index
    return df
Example:
import pandas as pd
from simple_salesforce import Salesforce
SALESFORCE_EMAIL = '...'
SALESFORCE_TOKEN = '...'
SALESFORCE_PASSWORD = '...'
sf = Salesforce(username=SALESFORCE_EMAIL, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN)
query = """SELECT Id, Name, Account.Name
FROM Contact
LIMIT 1
"""
results = sf.query(query)
df = sf_results_to_dataframe(results)
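As an aside, pandas.json_normalize can flatten the nested relationship fields without a custom loop. A sketch using the same results dict; it produces dotted column names such as Account.Name, and the Salesforce metadata columns still have to be filtered out:
flat = pd.json_normalize(results['records'])
flat = flat[[c for c in flat.columns if 'attributes' not in c]]  # drop the 'attributes.*' metadata columns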

Related

Pivot issue in Databricks

I have a dataframe with the following values:
id  Country  Interest
00  Russian  Digestion;Destillation
I want to pivot the Interest column into a new column in Azure Databricks (Python), like this:
id    Country  Int           Interest
00Q7  Russ     Digestion     Digestion;Destillation
00Q7  Russ     Destillation  Digestion;Destillation
Please advise how this can be done.
I have created a sample dataframe similar to yours using the following code:
data = [['00Q7','Russian Federation','Digestion;Destillation'],['00Q6','United States','Oils;Automobiles']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
display(df)
To get the desired output, I first split the data in the interests column using pyspark.sql.functions.split.
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
display(df1)
Then I exploded the new interest column using pyspark.sql.functions.explode to get the required output.
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
display(op)
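The two steps can also be chained into a single expression. A small sketch using the same sample dataframe df:
from pyspark.sql.functions import split, col, explode
combined = df.withColumn("interest", explode(split(col("interests"), ";")))
display(combined)  # one row per value of the original semicolon-separated list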
UPDATE:
data = [['00Q7','Russian Federation','01_Digestion;02_Destillation']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
#display(df)
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
#UPDATE
from pyspark.sql.functions import concat,lit
op.withColumn("set",concat(lit('Set'),split(col('interest'),'_').getItem(0))).show(truncate=False)
UPDATE-2:
import numpy as np

pdf['set'] = pdf['interest'].str.split('_').str[0]
pdf['set'] = np.where(pdf['set'].astype(int) < 10, 'Set' + pdf['set'].str[1], 'Set' + pdf['set'])

Column names not displayed with st.dataframe after PostgreSQL `SELECT *` into Pandas DataFrame

Python 3.8.10
streamlit==1.9.0
pandas==1.4.2
psycopg2-binary==2.9.3
Loading a Postgres table directly into a Pandas DataFrame with the following code.
df = pd.DataFrame(run_query("SELECT * FROM schema.tablename;"))
Displaying it with either streamlit.dataframe(df) or streamlit.write(df) loses the column names.
In order to capture the column names, I use this kluge.
# Initialize connection.
@st.experimental_singleton
def init_connection():
    return psycopg2.connect(**st.secrets["postgresservername"])

conn = init_connection()

# Perform query.
@st.experimental_memo(ttl=600)
def run_query(query):
    with conn.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()

def load_table_as_dataframe(table):
    # This is super klugy.
    data = run_query("SELECT * FROM schema.{};".format(str(table)))
    columns = run_query("SELECT * FROM information_schema.columns WHERE table_schema = 'schema' AND table_name = '{}';".format(str(table)))
    # Fish out the actual column names
    columns = [c[3] for c in columns]
    df = pd.DataFrame(data, columns=columns)
    return df

df = load_table_as_dataframe("tablename")
Which works...
Is there a better way to collect the needed data (and column names) into a Pandas DataFrame from Postgres for use in Streamlit?
Using...
df = pd.read_sql("SELECT * FROM schema.{};".format(str(table)), conn)
...solved the issue. (Thx @parfait)
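Another lightweight option, if you want to stay on plain psycopg2, is to read the column names from cursor.description instead of querying information_schema. A sketch reusing the conn from the snippet above (run_query_df is just an illustrative helper name):
def run_query_df(query):
    with conn.cursor() as cur:
        cur.execute(query)
        columns = [desc[0] for desc in cur.description]  # DB-API: the first item of each entry is the column name
        return pd.DataFrame(cur.fetchall(), columns=columns)

df = run_query_df("SELECT * FROM schema.tablename;")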

Pandas - Add items to dataframe

I am trying to add rows to the dataframe, but I am not able to update it.
What I tried so far is commented out because it doesn't do what I need.
I simply want to download the JSON file and store it in a dataframe with the given columns. It seems I am not able to extract the child components from the JSON file and store them in a brand new dataframe.
Please find my code below:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)

for item in data['vulnerabilities'].items():
    print(item[1])
    # new_df['product'] = item[1]['product']
    # new_df['vendorProject'] = item[1]['vendorProject']
    # new_df['dueDate'] = item[1]['dueDate']
    # new_df['shortDescription'] = item[1]['shortDescription']
    # new_df['dateAdded'] = item[1]['dateAdded']
    # new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
    # new_df['cveID'] = item[1]['cveID']
    # new_df.append(item[1], ignore_index = True)

new_df
In the end, my df is still blank.
The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
new_df = pd.DataFrame(pd.json_normalize(data['vulnerabilities']))
UPDATE: Unnested the vulnerabilities column specifically.
It worked with this:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)

for item in data['vulnerabilities'].items():
    new_df.loc[len(new_df.index)] = item[1]   # <=== THIS
new_df.head()
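Note that appending with new_df.loc[len(new_df.index)] grows the dataframe one row at a time, which gets slow on a feed this size. A sketch that builds the frame in one go from the same data:
# collect the per-vulnerability dicts and let pandas derive the columns from their keys
rows = [item for _, item in data['vulnerabilities'].items()]
new_df = pd.DataFrame(rows)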

Is it possible to use writeStream directly to an API via Spark

I built a job on Databricks to read a Delta table as a stream (readStream), and now I need to post this streaming data to an API.
In everything I have read, writeStream is only used to create files (.csv, .avro, .parquet, etc.) or to send data to an Event Hub. Is it possible to use writeStream to post to an API?
My code:
from pyspark.sql.functions import unix_timestamp, round, col
from pyspark.sql.functions import lit, to_json, struct  # to_json and struct are needed further down
import json
import pandas as pd
import requests

# Tried with foreachBatch but it doesn't work
def foreach_batch_function(df, epochId):
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=str(df), verify=False)
    r2.json()
    pass

rs = spark.readStream.format("delta").option('path', '/mnt/gen2/raw/mytable').load()
df = rs.select(round('id_cliente_fat').alias('id_cliente_fat'), 'fone_fat', 'nome_fat',
               unix_timestamp('dt_nasc_fat', 'YYYY-MM-DD').cast('timestamp').cast('date').alias('birth_date'),
               'email_fat')
df2 = df.selectExpr('id_cliente_fat as identifier_code', 'fone_fat as phone_number', 'nome_fat as name',
                    'birth_date', 'email_fat as email')
data = {'authentication': {'username': 'user', 'password': 'pass'}}
r = requests.post('https://demo.api.com/index.php/api/v5/login/', data=json.dumps(data), verify=False).json()
df3 = df2.withColumn("steps", lit("[1,2,4,7]")).withColumn("place_id", lit(164)).withColumn("token", lit(r["authentication"]["token"]))
df4 = df3.select(to_json(struct(struct("token").alias("authentication"),
                                struct("identifier_code", "phone_number", "name", "birth_date",
                                       "email", "steps", "place_id").alias("smsrequest"))).alias(""))
df4.writeStream.foreachBatch(foreach_batch_function).start()
You need to bring the data to the driver with the .collect() method (not recommended for large amounts of data).
Try something like this:
def foreach_batch_function(df, epochId):
    # Build a JSON payload: the column names become the keys and the row values the values
    json_data = [row.asDict() for row in df.collect()]
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/',
                       data=json.dumps(json_data, default=str), verify=False)  # default=str handles date values
    r2.json()
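If collecting to the driver becomes a bottleneck, one alternative sketch is to post from the executors with foreachPartition on the batch dataframe (same example endpoint as above; this assumes your API accepts a JSON list per call):
import json
import requests

def post_partition(rows):
    # one HTTP call per partition, built from the rows of that partition
    payload = [row.asDict() for row in rows]
    if payload:
        requests.post('https://demo.api.com/index.php/api/v5/smsrequest/',
                      data=json.dumps(payload, default=str), verify=False)

def foreach_batch_function(df, epochId):
    df.foreachPartition(post_partition)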

Check null values in data fetched from SQL database

I want to store data fetched from the database in numpy arrays, and I want to make sure that no null values (None) end up in the numpy array (that throws an error anyway). I have tried to do it the following way, but it does not work: for some reason, NullValueCheck() always returns True. How can I detect null values and do something about them?
import numpy as np
import pyodbc as odbc

cnxn = odbc.connect(conn_string)
cursor = cnxn.cursor()
cursor.execute("""SELECT ID, BuildingID, Title FROM Something""")
rows = cursor.fetchall()
cnxn.close()

ID = [i[0] for i in rows]
buildingID = [i[1] for i in rows]
title = [i[2] for i in rows]

def NullValueCheck(rows):
    if any(elem is None for elem in rows[0]):
        return True
    else:
        return False

if NullValueCheck(rows):
    ID_array = np.fromiter(ID, dtype=np.int32)
Edit:
It turns out I don't have to write all that code. I can achieve the same thing with a pandas dataframe instead of a numpy array.
import pandas as pd
import pyodbc as odbc
cnxn = odbc.connect(conn_string)
df = pd.io.sql.read_sql("""SELECT ID, BuildingID, Title FROM Something""", cnxn)
I've found this is easiest to address in your source SQL. COALESCE is helpful here:
df = pd.io.sql.read_sql("""SELECT ID, COALESCE(BuildingID, 0) AS BuildingID, Title FROM Something""", cnxn)
This will return 0 if the value of BuildingID is NULL. Different SQL databases have functions for specific NULL checking (ISNULL in SQL Server, IFNULL in MySQL, for example), but COALESCE is the most cross-database compatible.
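If you would rather keep the NULLs in SQL and handle them on the Python side instead, a pandas sketch with fillna (reusing the cnxn from the edit above) looks like this:
df = pd.read_sql("SELECT ID, BuildingID, Title FROM Something", cnxn)
# replace missing BuildingID values before converting to an integer numpy array
df['BuildingID'] = df['BuildingID'].fillna(0).astype('int32')
building_array = df['BuildingID'].to_numpy()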
