Is it possible to use Writestream directly to an API via spark - apache-spark

I wrote some code on Databricks that reads a Delta table in real time (readStream), and I need to post this streaming data to an API.
In everything I have read, writeStream is only used to create files (.csv, .avro, .parquet, etc.) or to send data to an Event Hub. Is it possible to use writeStream to post to an API?
My code:
from pyspark.sql.functions import unix_timestamp, round, col
import json
import pandas as pd
from pyspark.sql.functions import lit
import requests
#tried with foreachBatch but it doesn't work
# Callback passed to writeStream.foreachBatch further down; it receives the
# micro-batch DataFrame and its epoch id.
def foreach_batch_function(df,epochId):
# NOTE(review): str(df) is the DataFrame object's string form, not its rows,
# so no record data reaches the API from this call.
r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=str(df), verify=False)
# The decoded response body is not stored or inspected.
r2.json()
pass
# Streaming read of the Delta table.
rs = spark.readStream.format("delta").option('path','/mnt/gen2/raw/mytable').load()
# Select the customer fields and derive birth_date from dt_nasc_fat.
# NOTE(review): the 'YYYY-MM-DD' pattern presumably should be 'yyyy-MM-dd' — confirm against Spark's datetime pattern rules.
df = rs.select(round('id_cliente_fat').alias('id_cliente_fat'),'fone_fat','nome_fat',unix_timestamp('dt_nasc_fat','YYYY-MM-DD').cast('timestamp').cast('date').alias('birth_date'),'email_fat')
# Rename the columns to the field names used in the request payload.
df2 = df.selectExpr('id_cliente_fat as identifier_code','fone_fat as phone_number','nome_fat as name','birth_date','email_fat as email')
# Log in once on the driver; the returned token is attached to every row below.
data = {'authentication':{'username':'user','password':'pass'}}
r = requests.post('https://demo.api.com/index.php/api/v5/login/', data=json.dumps(data), verify=False).json()
df3 = df2.withColumn("steps", lit("[1,2,4,7]")).withColumn("place_id", lit(164)).withColumn("token", lit(r["authentication"]["token"]))
# Build one JSON string per row with "authentication" and "smsrequest" objects.
# NOTE(review): to_json and struct are not among the imports shown above, and
# .alias("smsrequest") is applied twice.
df4 = df3.select(to_json(struct(struct("token").alias("authentication"), struct("identifier_code", "phone_number", "name", "birth_date", "email","steps","place_id").alias("smsrequest").alias("smsrequest"))).alias(""))
# Start the stream; each micro-batch is handed to foreach_batch_function.
df4.writeStream.foreachBatch(foreach_batch_function).start()

You need to bring the data to the driver with the .collect() method (this is not recommended for large amounts of data).
Try something like this:
def foreach_batch_function(df, epochId):
    """Post one streaming micro-batch to the SMS-request API as a JSON array.

    Args:
        df: The micro-batch DataFrame supplied by ``foreachBatch``.
        epochId: The micro-batch id supplied by ``foreachBatch`` (unused).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    import json  # local import keeps this snippet self-contained

    # collect() brings every row of the micro-batch to the driver, so this
    # only suits small batches. A list (not a lazy ``map`` object) is needed
    # so the rows can actually be serialised.
    rows = [row.asDict() for row in df.collect()]
    if not rows:
        # Nothing arrived in this micro-batch; skip the HTTP call.
        return

    r2 = requests.post(
        'https://demo.api.com/index.php/api/v5/smsrequest/',
        # default=str is deliberate: date/timestamp column values are not
        # JSON-serialisable and are sent in their string form.
        data=json.dumps(rows, default=str),
        headers={'Content-Type': 'application/json'},
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept from the original snippet, but it should not reach production.
        verify=False,
    )
    # Fail the batch loudly instead of silently ignoring an error response.
    r2.raise_for_status()

Related

Pivot issue in databricks

I have dataframe table having values :
id Country Interest
00 Russian Digestion;Destillation
I want to pivot the Interest column and name new column in azure databricks in python like this :
id Country Int Interest
00Q7 Russ Digestion Digestion;Destillation
00Q7 Russ Destillation Digestion;Destillation
Please advise how it can be done
Regards
RK
I have created a sample dataframe similar to yours using the following code:
# Sample rows: id, country, and a ';'-separated interests string.
data = [
    ['00Q7', 'Russian Federation', 'Digestion;Destillation'],
    ['00Q6', 'United States', 'Oils;Automobiles'],
]
column_names = ['id', 'country', 'interests']
df = spark.createDataFrame(data=data, schema=column_names)
display(df)
To get the desired output (like yours), first I have split the data in interests column using pyspark.sql.functions.split.
from pyspark.sql.functions import col, split

# Turn the ';'-delimited string into an array column named "interest".
interest_array = split(col("interests"), ";")
df1 = df.withColumn("interest", interest_array)
display(df1)
Now I have exploded the new column interest using pyspark.sql.functions.explode to get the required output.
from pyspark.sql.functions import explode

# Replace the array column with one row per array element.
exploded_interest = explode(col('interest'))
op = df1.withColumn('interest', exploded_interest)
display(op)
UPDATE:
from pyspark.sql.functions import col, concat, explode, lit, split

data = [['00Q7', 'Russian Federation', '01_Digestion;02_Destillation']]
df = spark.createDataFrame(data=data, schema=['id', 'country', 'interests'])

# Same split-then-explode steps as before: one row per interest.
df1 = df.withColumn("interest", split(col("interests"), ";"))
op = df1.withColumn('interest', explode(col('interest')))

# "set" is the literal 'Set' followed by the part of interest before the '_'.
set_label = concat(lit('Set'), split(col('interest'), '_').getItem(0))
op.withColumn("set", set_label).show(truncate=False)
UPDATE-2:
import numpy as np

# Take the part of each interest before the underscore.
pdf['set'] = pdf['interest'].str.split('_').str[0]
# For values below 10 only the second character is appended to 'Set';
# otherwise the whole prefix is appended.
below_ten = pdf['set'].astype(int) < 10
pdf['set'] = np.where(below_ten, 'Set' + pdf['set'].str[1], 'Set' + pdf['set'])

show the data collected in pyspark

The code below runs without any error; it reads data from a JSON file in my storage account. How can I see the output (the contents) here in a Databricks notebook? The file is quite long, so I just need to verify that the output is what I am looking for, and I would like to see the first 10 items. How do we do that?
import re
import json
%pip install azure
import azure
from azure.storage.blob import AppendBlobService
# Blob-storage client, created with the account name and key given inline.
# NOTE(review): the names `abs` and `filter` shadow Python builtins.
abs = AppendBlobService(account_name="azurestorage", account_key="mykey")
base_path = "resourceId=/SUBSCRIPTIONS/5315MyId/RESOURCEGROUPS/AZURE-DEV/PROVIDERS/MICROSOFT.CONTAINERSERVICE/MANAGEDCLUSTERS/AZURE-DEV/y=2022/m=05/d=23/h=13/m=00/PT1H.json"
pattern = base_path + "/*/*/*/*/m=00/*.json"
# NOTE(review): glob2re is not defined or imported in the code shown;
# presumably it converts the glob pattern to a regular expression — confirm.
filter = glob2re(pattern)
# Pipeline: list blobs under base_path, keep the names matching `filter`,
# download each blob and split it into lines, parse every line as JSON, and
# collect() all parsed records back to the driver.
df1 = (
spark.sparkContext.parallelize(
[
blob.name
for blob in abs.list_blobs("insights-logs-kube-audit", prefix=base_path)
if re.match(filter, blob.name)
]
)
.map(
lambda blob_name: abs.get_blob_to_bytes("insights-logs-kube-audit", blob_name)
.content.decode("utf-8")
.splitlines()
)
.flatMap(lambda lines: [json.loads(l) for l in lines])
.collect()
)
collect() :-
PySpark RDD/DataFrame collect() is an action that retrieves all the elements of the dataset (from all nodes) to the driver node. We should use collect() on smaller datasets, usually after filter(), group(), etc.
take(num): returns the first num rows as a list of Row.
DataFrame.take(num)
import re
import json
%pip install azure
import azure
from azure.storage.blob import AppendBlobService
# Blob-storage client, created with the account name and key given inline.
# NOTE(review): the names `abs` and `filter` shadow Python builtins; they are
# kept so the snippet still matches the question's code.
abs = AppendBlobService(account_name="azurestorage", account_key="mykey")
base_path = "resourceId=/SUBSCRIPTIONS/5315MyId/RESOURCEGROUPS/AZURE-DEV/PROVIDERS/MICROSOFT.CONTAINERSERVICE/MANAGEDCLUSTERS/AZURE-DEV/y=2022/m=05/d=23/h=13/m=00/PT1H.json"
pattern = base_path + "/*/*/*/*/m=00/*.json"
filter = glob2re(pattern)

# Same pipeline as in the question, but finished with take(10) instead of
# collect(): take(10) is called directly on the RDD and returns only the
# first 10 parsed records as a Python list.
df1 = (
    spark.sparkContext.parallelize(
        [
            blob.name
            for blob in abs.list_blobs("insights-logs-kube-audit", prefix=base_path)
            if re.match(filter, blob.name)
        ]
    )
    .map(
        lambda blob_name: abs.get_blob_to_bytes("insights-logs-kube-audit", blob_name)
        .content.decode("utf-8")
        .splitlines()
    )
    .flatMap(lambda lines: [json.loads(l) for l in lines])
    .take(10)
)
Refer - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.take.html

Pandas - Add items to dataframe

I am trying to add rows to a dataframe, but I am not able to update it.
What I have tried so far is commented out, as it doesn't do what I need.
I simply want to download the JSON file and store it in a dataframe with the given columns. It seems I am not able to extract the child components from the JSON file and store them in a brand-new dataframe.
Please find my code below:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
# Gather every key that appears in the vulnerability entries.
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
for k in item.keys():
headers.append(k)
# De-duplicate the keys and create an empty frame with those columns.
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
# This loop only prints each entry; every statement that would have written
# to new_df is commented out, so new_df keeps its columns but gets no rows.
for item in data['vulnerabilities'].items():
print(item[1])
# new_df['product'] = item[1]['product']
# new_df['vendorProject'] = item[1]['vendorProject']
# new_df['dueDate'] = item[1]['dueDate']
# new_df['shortDescription'] = item[1]['shortDescription']
# new_df['dateAdded'] = item[1]['dateAdded']
# new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
# new_df['cveID'] = item[1]['cveID']
# new_df.append(item[1], ignore_index = True)
new_df
At the end my df is still blank.
The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
# json_normalize already returns a DataFrame (one column per key of the
# nested records), so no extra pd.DataFrame(...) wrapper is needed.
new_df = pd.json_normalize(data['vulnerabilities'])
UPDATE: Unnested the vulnerabilities column specifically.
Output:
It worked with this:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

# Gather every key that appears in the vulnerability entries.
headers = []
df = pd.DataFrame()
for item in data['vulnerabilities']:
    headers.extend(item.keys())

# De-duplicate the keys and create an empty frame with those columns.
col = list(set(headers))
new_df = pd.DataFrame(columns=col)

for item in data['vulnerabilities']:
    # <=== THIS is the fix: append each entry as a new row at the next
    # integer label instead of assigning whole columns.
    new_df.loc[len(new_df.index)] = item
new_df.head()

AttributeError: 'datetime.datetime' object has no attribute 'striftime'

I am currently writing a machine learning program for school to predict the weather. I have been using this article https://stackabuse.com/using-machine-learning-to-predict-the-weather-part-1/ as my main resource (I have had to adjust it, as Wunderground is no longer free, so I have been using OpenWeatherMap instead). While writing the data collection and organization part of my code, I received the following error: "AttributeError: 'datetime.datetime' object has no attribute 'striftime'". Sorry in advance for the massive block of code; I figured it would be the best way to troubleshoot the problem. Thank you for any help. The parts marked with '** code **' are what I am struggling with.
from datetime import datetime
from datetime import timedelta
import time
from collections import namedtuple
import pandas as pd
import requests
import matplotlib.pyplot as plt
#Data collection and Organization
# URL passed to extra_weather_data below.
url = 'http://history.openweathermap.org//storage/d12a3df743e650ba4035d2c6d42fb68f.json'
#res = requests.get(url)
#data = res.json()
# First date passed to extra_weather_data.
target_date = datetime(2018, 4, 22)
# Field names of one daily record; reused as the DataFrame columns further down.
features = ["date", "temperature", "pressure", "humidity", "maxtemperature", "mintemperature"]
DailySummary = namedtuple("DailySummary", features)
# Requests one record per day for `days` consecutive days starting at
# target_date, pausing 6 seconds per iteration.
# NOTE(review): `records` is never initialised in this function and there is
# no return statement, so the caller's assignment would receive None.
def extra_weather_data(url, target_date, days):
for _ in range(days):
# The ** markers are the asker's emphasis, not Python syntax.
# NOTE(review): `striftime` is the source of the reported AttributeError; the
# datetime method is `strftime`. The url above has no {} placeholder, so
# format() would return it unchanged.
**request = url.format(target_date.striftime('%Y%m%d'))**
# NOTE(review): assigned to `respone` but read as `response` on the next line.
respone = requests.get(request)
if response.status_code == 200:
data = response.json()
records.append(DailySummary(
date = target_date,
temperature = data['main']['temp'],
pressure = data['main']['pressure'],
humidity = data['main']['humidity'],
maxtemperature = data['main']['temp_max'],
mintemperature = data['main']['temp_min']))
time.sleep(6)
target_date += timedelta(days=1)
# Request 365 days starting at target_date (the ** markers are the asker's
# emphasis, not Python syntax).
**records = extra_weather_data(url, target_date, 365)**
#Finished data collection now begin to clean and process data using Pandas
df = pd.DataFrame(records, columns=features).set_index('date')
# NOTE(review): 'humidty' does not match the 'humidity' name in `features`.
tmp = df[['temperature','pressure','humidty', 'maxtemperature', 'mintemperature']].head(10)
def derive_nth_day_feature(df, feature, N):
    """Add a column holding ``feature`` as measured N rows earlier.

    The new column is named ``"<feature>_<N>"`` and is written into ``df`` in
    place; nothing is returned. The first N rows have no earlier measurement
    and are left missing.

    Args:
        df: DataFrame containing the ``feature`` column.
        feature: Name of the column to lag.
        N: Number of rows to look back.
    """
    # shift() lags by position, so it works for any index type (the frame is
    # indexed by date) and avoids integer lookups on a non-integer index.
    df["{}_{}".format(feature, N)] = df[feature].shift(N)
# For every feature except 'date', add the 1-, 2- and 3-row lagged columns.
for feature in features:
if feature != 'date':
for N in range(1, 4):
derive_nth_day_feature(df, feature, N)
# Show the resulting column names.
df.columns

How to get SalesForce data to Python Panda dataframes

Currently we export Salesforce data to a CSV file and read that file into pandas using the read_csv and to_csv methods. Is there any other way to get data from Salesforce into a pandas dataframe?
With Python, you can install a package called Simple Salesforce and write SOQL queries to return data:
https://github.com/simple-salesforce/simple-salesforce
Here's an example of how to do this:
import pandas as pd
from simple_salesforce import Salesforce

sf = Salesforce(username='<enter username>', password='<enter password>',
                security_token='<enter your access token from your profile>')

# The 'records' entry of the query result is loaded directly into a DataFrame.
a_query = pd.DataFrame(sf.query(
    "SELECT Name, CreatedDate FROM User")['records'])
In my case, to display the information as a dataframe I had to use the following code:
# Libraries
import simple_salesforce as ssf
import pandas

# Log in first, then build the client from the returned session.
session_id, instance = ssf.SalesforceLogin(
    username='<username>',
    password='<password>',
    security_token='<token>',
    sandbox=False,
)
sf_ = ssf.Salesforce(instance=instance, session_id=session_id)

# Query text
sql_code = "SELECT id, name FROM main_table"

# Run the query, load its records, then drop the 'attributes' column.
information = sf_.query(query=sql_code)
records_frame = pandas.DataFrame(information['records'])
table = records_frame.drop(columns='attributes')
Adding to the original answer,
the function below also handles simple joins.
def sf_results_to_dataframe(results, drop_index=True) -> pd.DataFrame:
    """Flatten a Salesforce query result into a DataFrame indexed by Id.

    Args:
        results: Mapping with a ``'records'`` list holding one dict per row.
            Every row must carry an ``'Id'`` key.
        drop_index: Passed to ``set_index``; when False the ``'Id'`` column is
            kept in addition to the index.

    Returns:
        A DataFrame indexed by ``'Id'``. For each of Account, Contact, Lead
        and Opportunity that appears as a nested record, the nested column is
        replaced by one column per nested field, named ``<table><field>``.
        Rows whose nested record is null get missing values in those columns.
        An empty ``'records'`` list yields an empty DataFrame.
    """
    records = results['records']
    if not records:
        # Nothing to flatten; indexing records[0] below would fail.
        return pd.DataFrame()

    df = pd.DataFrame(records)
    # Clean up the technical metadata column, if the rows have one.
    df = df.drop(columns='attributes', errors='ignore')
    df.set_index('Id', drop=drop_index, inplace=True)

    for table in ('Account', 'Contact', 'Lead', 'Opportunity'):
        # Detect a JOIN from the first row with a non-null nested record; the
        # relationship can be null on individual rows, including the first.
        nested = next(
            (record[table] for record in records if record.get(table) is not None),
            None,
        )
        if nested is None:
            continue

        # Fields of the joined table, without its own metadata entry.
        local_keys = [key for key in nested if key != 'attributes']
        table_records = [
            {
                'Id': record['Id'],
                **{table + key: (record.get(table) or {}).get(key)
                   for key in local_keys},
            }
            for record in records
        ]
        df_extra = pd.DataFrame(table_records).set_index('Id')  # match index
        # Swap the nested column for the flattened ones, merging on the index.
        df = df.drop(columns=table).merge(df_extra, left_index=True, right_index=True)
    return df
Example:
import pandas as pd
from simple_salesforce import Salesforce
# Placeholder credentials.
SALESFORCE_EMAIL = '...'
SALESFORCE_TOKEN = '...'
SALESFORCE_PASSWORD = '...'
sf = Salesforce(username=SALESFORCE_EMAIL, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN)
# Query on Contact that also selects Account.Name, i.e. a simple join.
query = """SELECT Id, Name, Account.Name
FROM Contact
LIMIT 1
"""
results = sf.query(query)
# Convert the raw result into a DataFrame with the helper defined above.
df = sf_results_to_dataframe(results)

Resources