show the data collected in pyspark - apache-spark

The code below runs without any error; it reads data from JSON files in my storage account. How can I see the output (the contents) in a Databricks notebook? The file is quite long and I just need to verify that the output is what I am looking for, so I would like to see the first 10 items. How do we do that?
import re
import json
%pip install azure
import azure
from azure.storage.blob import AppendBlobService

abs = AppendBlobService(account_name="azurestorage", account_key="mykey")
base_path = "resourceId=/SUBSCRIPTIONS/5315MyId/RESOURCEGROUPS/AZURE-DEV/PROVIDERS/MICROSOFT.CONTAINERSERVICE/MANAGEDCLUSTERS/AZURE-DEV/y=2022/m=05/d=23/h=13/m=00/PT1H.json"
pattern = base_path + "/*/*/*/*/m=00/*.json"
filter = glob2re(pattern)

df1 = (
    spark.sparkContext.parallelize(
        [
            blob.name
            for blob in abs.list_blobs("insights-logs-kube-audit", prefix=base_path)
            if re.match(filter, blob.name)
        ]
    )
    .map(
        lambda blob_name: abs.get_blob_to_bytes("insights-logs-kube-audit", blob_name)
        .content.decode("utf-8")
        .splitlines()
    )
    .flatMap(lambda lines: [json.loads(l) for l in lines])
    .collect()
)

collect():
PySpark's RDD/DataFrame collect() is an action that retrieves all elements of the dataset (from all nodes) to the driver node. Use collect() only on smaller datasets, usually after filter(), group(), etc.
take(num):
Returns the first num rows as a list of Row objects.
DataFrame.take(num)
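For example, on a small RDD the difference is easy to see:
rdd = spark.sparkContext.parallelize(range(100))
all_rows = rdd.collect()    # brings all 100 elements back to the driver
first_ten = rdd.take(10)    # brings back only the first 10 elements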
import re
import json
%pip install azure
import azure
from azure.storage.blob import AppendBlobService

abs = AppendBlobService(account_name="azurestorage", account_key="mykey")
base_path = "resourceId=/SUBSCRIPTIONS/5315MyId/RESOURCEGROUPS/AZURE-DEV/PROVIDERS/MICROSOFT.CONTAINERSERVICE/MANAGEDCLUSTERS/AZURE-DEV/y=2022/m=05/d=23/h=13/m=00/PT1H.json"
pattern = base_path + "/*/*/*/*/m=00/*.json"
filter = glob2re(pattern)

df1 = (
    spark.sparkContext.parallelize(
        [
            blob.name
            for blob in abs.list_blobs("insights-logs-kube-audit", prefix=base_path)
            if re.match(filter, blob.name)
        ]
    )
    .map(
        lambda blob_name: abs.get_blob_to_bytes("insights-logs-kube-audit", blob_name)
        .content.decode("utf-8")
        .splitlines()
    )
    .flatMap(lambda lines: [json.loads(l) for l in lines])
    .take(10)  # take(10) instead of collect() returns only the first 10 records
)
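Since take(10) returns a plain Python list of the first 10 parsed JSON records, it can be inspected directly in the notebook, for example:
for record in df1:
    print(json.dumps(record, indent=2))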
Refer - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.take.html

Related

Left join with CoGroupByKey sink to BigQuery using Dataflow

I would like to join files (expeditions- 2010s.csv and peaks.csv) using join key "peakid" with CoGroupByKey. However, there is an error when I sink it to BigQuery:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_88_215864ba592a2e01f0c4e2157cc60c47_86e3562707f348c29b2a030cb6ed7ded failed. Error Result: <ErrorProto
location: 'gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs'].
Please review the code below:
# module-level imports needed by the helpers and run() below
import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def read_csv_pd_input1(readable_file):
    import json
    import pandas as pd
    import csv
    import io
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'bcdate', 'smtdate']]
    a = df.set_index('peakid')[['bcdate', 'smtdate']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
    # result: only column name
    # a = df.agg(lambda x: (x.values)).apply(tuple)
    # result: only value but not as expected
    # a = [tuple(x) for x in df.values]
    # a = tuple(a)
    return a

def read_csv_pd_input3(readable_file):
    import json
    import pandas as pd
    import csv
    import io
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'pkname', 'heightm']]
    a = df.set_index('peakid')[['pkname', 'heightm']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
    return a
def run(argv=None):
    import apache_beam as beam
    import io
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/expeditions- 2010s.csv')
    parser.add_argument(
        '--input3',
        dest='input3',
        required=False,
        help='Input_p3 file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/peaks.csv')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    input_p1 = (
        p
        | 'Read From GCS input1' >> beam.Create([known_args.input])
        | 'Pair each employee with key p1' >> beam.FlatMap(read_csv_pd_input1)
        # | beam.Map(print)
    )
    input_p3 = (
        p
        | 'Read From GCS input3' >> beam.Create([known_args.input3])
        | 'Pair each employee with key p3' >> beam.FlatMap(read_csv_pd_input3)
    )
    # CoGroupByKey: relational join of 2 or more key/value PCollections. It also accepts a dictionary of keyed PCollections.
    output = (
        {'input_p1': input_p1, 'input_p3': input_p3}
        | 'Join' >> beam.CoGroupByKey()
        | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='project_name:dataset.expeditions',
            schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
            method='FILE_LOADS',
            custom_gcs_temp_location='gs://bucket-name/input/temp',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )
    p.run().wait_until_finish()
    # runner = DataflowRunner()
    # runner.run_pipeline(p, options=options)

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
This part of the pipeline is wrong:
| 'Join' >> beam.CoGroupByKey()
| 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(...
The output of CoGroupByKey will have the format key, {'input_p1': [list_of_p1_elems_with_key], 'input_p3': [list_of_p3_elems_with_key]}. You need to process that output to map it to the schema expected by the BigQuery sink.
Because the schema of the data does not match the schema specified in the BigQuery sink, the ingestion of data fails.
The Beam programming guide has an example of how to process the output of CoGroupByKey, and the transform catalog has an example too.
I am not sure exactly how the columns of p1 and p3 are meant to populate the BigQuery table. But other than that, after the beam.CoGroupByKey you could apply a beam.FlatMap (FlatMap rather than Map, because the function yields multiple rows per key) with a function similar to this one:
def process_group(kv):
    key, values = kv
    input_p1_list = values['input_p1']
    input_p3_list = values['input_p3']
    for p1 in input_p1_list:
        for p3 in input_p3_list:
            row_for_bq = {'peak_id': key, 'bcdate': p1['something'], 'heightm': p3['something']}
            yield row_for_bq
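Since process_group yields one dictionary per matched pair, it would sit between the join and the sink. A minimal sketch of how the tail of the pipeline might then look, reusing the sink settings from the question (note that the keys yielded by process_group must match the column names in the schema, e.g. peakid rather than peak_id):
output = (
    {'input_p1': input_p1, 'input_p3': input_p3}
    | 'Join' >> beam.CoGroupByKey()
    | 'Map to BQ rows' >> beam.FlatMap(process_group)
    | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='project_name:dataset.expeditions',
        schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
        method='FILE_LOADS',
        custom_gcs_temp_location='gs://bucket-name/input/temp',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
)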

Problems to extract the pair verb-noun

I'm interested in extracting the verb-noun pairs from my "task" column, so I first load pandas:
import pandas as pd
and then read the file:
DF = pd.read_excel(r'/content/contentdrive/MyDrive/extrac.xlsx')
Then I import nltk and some of its packages:
import nltk
I create a function to process each text:
def processa(Text_tasks):
    text = nltk.word_tokenize(Text_tasks)
    pos_tagged = nltk.pos_tag(text)
    NV = list(filter(lambda x: x[1] == "NN" or x[1] == "VB", pos_tagged))
    return NV
In the end, I try to generate a list with the results:
results = DF['task'].map(processa)
and this is what happens (attached as a screenshot in the original post, not reproduced here).
here is the data: https://docs.google.com/spreadsheets/d/1bRuTqpATsBglWMYIe-AmO5A2kq_i-0kg/edit?usp=sharing&ouid=115543599430411372875&rtpof=true&sd=true
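The screenshot of the failure is not visible here, but a frequent cause with this kind of code is that the required NLTK resources have not been downloaded, or that the column contains non-string cells (e.g. NaN). A minimal, self-contained sketch along those lines; the file path and column name follow the question, the missing-value handling is an assumption, and resource names can vary slightly across NLTK versions:
import pandas as pd
import nltk

# Tokenizer and POS-tagger models must be downloaded once
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

DF = pd.read_excel(r'/content/contentdrive/MyDrive/extrac.xlsx')

def processa(text_task):
    # Guard against NaN / non-string cells (assumption: treat them as empty)
    if not isinstance(text_task, str):
        return []
    tokens = nltk.word_tokenize(text_task)
    pos_tagged = nltk.pos_tag(tokens)
    # Keep only singular nouns (NN) and base-form verbs (VB)
    return [pair for pair in pos_tagged if pair[1] in ("NN", "VB")]

results = DF['task'].map(processa)
print(results.head(10))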

Reading multiple excel files into a pandas dataframe, but also storing the file name

I would like to read multiple excel files and store them into a single pandas dataframe, but I would like one of the columns in the dataframe to be the file name. This is because the file name contains the date (this is monthly data) and I need that information. I can't seem to get the filename, but I'm able to get the excel files into a dataframe. Please help.
import os
import pandas as pd
import fsspec

files = os.listdir("C://Users//6J2754897//Downloads//monthlydata")
paths = "C://Users//6J2754897//Downloads//monthlydata"
a = pd.DataFrame([2], index=None)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(pd.read_excel(paths + "//" + files[file], sheet_name="information", skiprows=7), ignore_index=True)
        df['Month'] = str(files[file])
The order of operations here is incorrect. The line:
df['Month'] = str(files[file])
is going to overwrite the entire column with the most recent value.
Instead we should only add the value to the current DataFrame:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)

df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        # Read in File
        file_df = pd.read_excel(paths + "//" + files[file],
                                sheet_name="information",
                                skiprows=7)
        # Add to just this DataFrame
        file_df['Month'] = str(files[file])
        # Update `df`
        df = df.append(file_df, ignore_index=True)
Alternatively we can use DataFrame.assign to chain the column assignment:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)

df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(
            # Read in File
            pd.read_excel(paths + "//" + files[file],
                          sheet_name="information",
                          skiprows=7)
            .assign(Month=str(files[file])),  # Add to just this DataFrame
            ignore_index=True
        )
For a more general improvement we can use pd.concat with a list comprehension over the files. This avoids repeatedly growing the DataFrame (which can be extremely slow). pathlib's Path.glob also helps select the appropriate files:
from pathlib import Path
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
df = pd.concat([
    pd.read_excel(file,
                  sheet_name="information",
                  skiprows=7)
    .assign(Month=file.stem)  # We may also want file.name here
    for file in Path(paths).glob('*.xlsx')
])
There are two options for the Month column:
file.stem will give "[t]he final path component, without its suffix".
'folder/folder/sample.xlsx' -> 'sample'
file.name will give "the final path component, excluding the drive and root".
'folder/folder/sample.xlsx' -> 'sample.xlsx'
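Since the whole point of keeping the file name is the date it encodes, the name can then be parsed into a real datetime column. A small sketch, assuming (hypothetically) that file names look like 'data_2022-05.xlsx'; the regex and date format would need to match the actual naming scheme:
# Pull the 'YYYY-MM' part out of the stored file name and parse it as a date
# (pattern and format are assumptions about the naming scheme)
df['MonthDate'] = pd.to_datetime(
    df['Month'].str.extract(r'(\d{4}-\d{2})', expand=False),
    format='%Y-%m'
)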

Is it possible to use Writestream directly to an API via spark

I built code on Databricks to read a Delta table in real time (readStream), and now I need to post this streaming data to an API.
In everything I have read, writeStream is used only to create files (.csv, .avro, .parquet, etc.) or to send to an Event Hub. Is it possible to use writeStream to post to an API?
My code:
from pyspark.sql.functions import unix_timestamp, round, col, lit, to_json, struct
import json
import pandas as pd
import requests

# tried with foreachBatch but it doesn't work
def foreach_batch_function(df, epochId):
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=str(df), verify=False)
    r2.json()
    pass

rs = spark.readStream.format("delta").option('path', '/mnt/gen2/raw/mytable').load()
# note: Spark date patterns are case-sensitive, hence 'yyyy-MM-dd'
df = rs.select(round('id_cliente_fat').alias('id_cliente_fat'), 'fone_fat', 'nome_fat', unix_timestamp('dt_nasc_fat', 'yyyy-MM-dd').cast('timestamp').cast('date').alias('birth_date'), 'email_fat')
df2 = df.selectExpr('id_cliente_fat as identifier_code', 'fone_fat as phone_number', 'nome_fat as name', 'birth_date', 'email_fat as email')
data = {'authentication': {'username': 'user', 'password': 'pass'}}
r = requests.post('https://demo.api.com/index.php/api/v5/login/', data=json.dumps(data), verify=False).json()
df3 = df2.withColumn("steps", lit("[1,2,4,7]")).withColumn("place_id", lit(164)).withColumn("token", lit(r["authentication"]["token"]))
df4 = df3.select(to_json(struct(struct("token").alias("authentication"), struct("identifier_code", "phone_number", "name", "birth_date", "email", "steps", "place_id").alias("smsrequest"))).alias(""))
df4.writeStream.foreachBatch(foreach_batch_function).start()
You need to bring the data to the driver with the .collect() method (not recommended for large amounts of data).
Try something like this:
def foreach_batch_function(df, epochId):
    # Build a JSON array whose keys are the column names and whose values come from each row of the micro-batch
    # (default=str so date columns serialize cleanly)
    json_data = json.dumps([row.asDict() for row in df.collect()], default=str)
    r2 = requests.post('https://demo.api.com/index.php/api/v5/smsrequest/', data=json_data, verify=False)
    r2.json()
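As in the question, the function is then registered on the stream with foreachBatch; adding a checkpoint location (a standard Structured Streaming option; the path below is only a placeholder) lets the stream restart without reprocessing micro-batches that were already posted:
(df4.writeStream
    .foreachBatch(foreach_batch_function)
    .option("checkpointLocation", "/mnt/gen2/checkpoints/smsrequest")  # placeholder path
    .start())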

How to get SalesForce data to Python Panda dataframes

Currently we export Salesforce data into a CSV file and read that CSV file into pandas using the read_csv and to_csv methods. Is there any other way to get data from Salesforce into a pandas dataframe?
With Python you can install a package called Simple Salesforce and write SOQL queries to return data:
https://github.com/simple-salesforce/simple-salesforce
Here's an example of how to do this:
import pandas as pd
from simple_salesforce import Salesforce

sf = Salesforce(username='<enter username>', password='<enter password>',
                security_token='<enter your access token from your profile>')
a_query = pd.DataFrame(sf.query(
    "SELECT Name, CreatedDate FROM User")['records'])
In my case, to display the information as a dataframe I had to use the following code:
# Import libraries
import simple_salesforce as ssf, pandas
# Create the connection
session_id, instance = ssf.SalesforceLogin(username='<username>', password='<password>', security_token='<token>', sandbox=False)
sf_ = ssf.Salesforce(instance=instance, session_id=session_id)
# Query to execute
sql_code = "SELECT id, name FROM main_table"
# Store query result as dataframe
information = sf_.query(query= sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')
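If the query can return more rows than a single API batch, simple_salesforce also provides query_all, which follows the pagination for you and returns the same 'records' structure, so the conversion to a dataframe is unchanged:
# query_all keeps fetching until all records are retrieved
information = sf_.query_all(sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')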
Adding on to the original answer, the function below is also suitable for simple joins.
def sf_results_to_dataframe(results, drop_index=True) -> pd.DataFrame:
    df = pd.DataFrame(results['records'])
    df.drop('attributes', axis=1, inplace=True)  # clean up from technical info
    df.set_index('Id', drop=drop_index, inplace=True)
    for table in ['Account', 'Contact', 'Lead', 'Opportunity']:
        if table in results['records'][0].keys():  # detect JOIN
            local_keys = list(results['records'][0][table].keys())  # keys from the joined table
            if 'attributes' in local_keys:
                local_keys.remove('attributes')
            global_keys = [table + key for key in local_keys]  # names for the fields in the output table
            # fields of the joined table and the record index
            table_records = [{'Id': record['Id'],
                              **{global_key: record[table][local_key] for global_key, local_key in zip(global_keys, local_keys)}}
                             for record in results['records']]
            df_extra = pd.DataFrame(table_records)
            df_extra.set_index('Id', drop=True, inplace=True)  # match index
            df.drop(table, axis=1, inplace=True)  # drop duplicated info
            df = df.merge(df_extra, left_index=True, right_index=True)  # merge on index
    return df
Example:
import pandas as pd
from simple_salesforce import Salesforce

SALESFORCE_EMAIL = '...'
SALESFORCE_TOKEN = '...'
SALESFORCE_PASSWORD = '...'

sf = Salesforce(username=SALESFORCE_EMAIL, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN)
query = """SELECT Id, Name, Account.Name
           FROM Contact
           LIMIT 1
        """
results = sf.query(query)
df = sf_results_to_dataframe(results)
