Bigquery CSV file load fail - python-3.x

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run a Python script that writes query results to a CSV file and then loads that file into BigQuery, but I am getting this error. Can anyone explain this error to me?
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery(query, filename, idNeeded):
    """Run ``query`` in BigQuery and export the result rows to ``filename``.

    Every row is echoed to stdout and written to the CSV file with the
    fifteen event columns in a fixed order. When ``idNeeded`` equals 1 a
    running counter (starting at 1) is written as an extra leading column.

    An ``AttributeError`` raised while reading a row is reported on stdout
    and ends the export; rows written before it stay in the file.
    """
    columns = (
        'EventId',
        'ScheduleId',
        'Date',
        'TimeFrom',
        'Description',
        'TimeTo',
        'ResourceId',
        'EmployeeId',
        'MovementTypeId',
        'Capacity',
        'CanBook',
        'NonMemberFlag',
        'MemberAmount',
        'NonMemberAmount',
        'Attendance',
    )
    echo_format = '{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '
    numbered = idNeeded == 1

    rows = bigquery_client.query(query).result()
    with open(filename, 'w', newline='') as csv_file:  # Create CSV file
        writer = csv.writer(csv_file, dialect='excel', lineterminator='\n')
        try:
            for counter, record in enumerate(rows, start=1):
                values = [getattr(record, column) for column in columns]
                print(echo_format.format(*values))
                # Write the row to the CSV, prefixed by the counter if requested
                writer.writerow(([counter] + values) if numbered else values)
        except AttributeError as error:
            print('An error occured: {0}'.format(error))
    return
#Creates a dataset in BigQuery
def createDataset(datasetname):
    """Create the dataset ``datasetname`` in BigQuery with location 'US'."""
    new_dataset = Dataset(bigquery_client.dataset(datasetname))
    new_dataset.location = 'US'
    bigquery_client.create_dataset(new_dataset)
def getDataset(datasetname):
    """Return the object ``bigquery_client.dataset`` gives for ``datasetname``."""
    return bigquery_client.dataset(datasetname)
def createTable(tablename, global_dataset_ref):
    """Create table ``tablename`` under ``global_dataset_ref``.

    The schema is an empty placeholder list. The call asserts that the
    created table reports the requested table id.
    """
    # Enter the schema here, for example:
    #   SchemaField('url', 'STRING', mode='required'),
    #   SchemaField('views', 'INTEGER', mode='required'),
    columns = []
    created = bigquery_client.create_table(
        Table(global_dataset_ref.table(tablename), schema=columns))
    assert created.table_id == tablename
def getTable(tablename, global_dataset_ref):
    """Fetch table ``tablename``, print its schema, and return the table."""
    fetched = bigquery_client.get_table(global_dataset_ref.table(tablename))
    print(fetched.schema)
    return fetched
def getTableSchema(tablename, global_dataset_ref):
    """Return the schema of table ``tablename`` under ``global_dataset_ref``."""
    return bigquery_client.get_table(global_dataset_ref.table(tablename)).schema
def loadDataFromCSV(tablename, global_dataset_ref, filename):
    """Load the CSV file ``filename`` into table ``tablename``.

    Starts a load job from the local file, waits for it to finish, and
    prints the number of rows loaded.
    """
    target = global_dataset_ref.table(tablename)

    config = LoadJobConfig()
    config.source_format = bigquery.SourceFormat.CSV
    # NOTE(review): an explicit schema and autodetect are both set here;
    # confirm which of the two is actually intended.
    config.schema = getTableSchema(tablename, global_dataset_ref)
    config.autodetect = True
    config.allow_quoted_newlines = True

    with open(filename, 'rb') as csv_file:
        load_job = bigquery_client.load_table_from_file(
            csv_file, target, location='US', job_config=config)
    load_job.result()
    print('Loaded {} rows into {}:{}.'.format(
        load_job.output_rows, global_dataset_ref, target.table_id))
# Testing
if __name__ == '__main__':
datasetname = 'Data_Layer'
tablename = 'Events'
sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
#createDataset(datasetname) #Successfully tested this code 2018-09-24
global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24
#createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
runBigQueryQuery(sqlquery,filename,idNeeded) #Successfully tested this code 2018-09-24
loadDataFromCSV(tablename, global_dataset_ref,filename) #Successfully tested this code 2018-09-24
sample data
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A

The error message indicates that there is only one row in your CSV; you might be missing newlines when creating it.

Related

Left join with CoGroupByKey sink to BigQuery using Dataflow

I would like to join files (expeditions- 2010s.csv and peaks.csv) using join key "peakid" with CoGroupByKey. However, there is an error when I sink it to BigQuery:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_88_215864ba592a2e01f0c4e2157cc60c47_86e3562707f348c29b2a030cb6ed7ded failed. Error Result: <ErrorProto
location: 'gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs'].
Please review code as below:
def read_csv_pd_input1(readable_file):
    """Read the CSV at ``readable_file`` into ``(peakid, (bcdate, smtdate))`` pairs.

    Returns a tuple of 2-tuples. The pairs pass through a dict keyed by
    ``peakid``, so each ``peakid`` appears at most once in the result.
    """
    import json
    import pandas as pd
    import csv
    import io

    handle = beam.io.filesystems.FileSystems.open(readable_file)
    records = csv.DictReader(io.TextIOWrapper(handle))
    frame = pd.DataFrame(records)[['peakid', 'bcdate', 'smtdate']]
    by_peak = frame.set_index('peakid')[['bcdate', 'smtdate']]
    pairs = by_peak.apply(tuple, axis=1).to_dict()
    return tuple(pairs.items())
def read_csv_pd_input3(readable_file):
    """Read the CSV at ``readable_file`` into ``(peakid, (pkname, heightm))`` pairs.

    Returns a tuple of 2-tuples. The pairs pass through a dict keyed by
    ``peakid``, so each ``peakid`` appears at most once in the result.
    """
    import json
    import pandas as pd
    import csv
    import io

    handle = beam.io.filesystems.FileSystems.open(readable_file)
    records = csv.DictReader(io.TextIOWrapper(handle))
    frame = pd.DataFrame(records)[['peakid', 'pkname', 'heightm']]
    by_peak = frame.set_index('peakid')[['pkname', 'heightm']]
    pairs = by_peak.apply(tuple, axis=1).to_dict()
    return tuple(pairs.items())
def run(argv=None):
    """Build and run the pipeline that joins both CSV inputs into BigQuery.

    ``argv`` is parsed for ``--input`` and ``--input3``; every argument the
    parser does not recognise is passed on as a pipeline option. The call
    blocks until the pipeline has finished.
    """
    import apache_beam as beam
    import io

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/expeditions- 2010s.csv')
    arg_parser.add_argument(
        '--input3',
        dest='input3',
        required=False,
        help='Input_p3 file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/peaks.csv')
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))

    # Each branch starts from the file path and expands it into keyed pairs.
    expedition_paths = (
        pipeline | 'Read From GCS input1' >> beam.Create([known_args.input]))
    expeditions = (
        expedition_paths
        | 'Pair each employee with key p1' >> beam.FlatMap(read_csv_pd_input1))

    peak_paths = (
        pipeline | 'Read From GCS input3' >> beam.Create([known_args.input3]))
    peaks = (
        peak_paths
        | 'Pair each employee with key p3' >> beam.FlatMap(read_csv_pd_input3))

    # CoGroupByKey: relational join of two or more key/value PCollections.
    # It also accepts a dictionary of keyed PCollections, as used here.
    grouped = (
        {'input_p1': expeditions, 'input_p3': peaks}
        | 'Join' >> beam.CoGroupByKey())

    # The grouped elements are handed to the sink unchanged.
    written = grouped | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='project_name:dataset.expeditions',
        schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
        method='FILE_LOADS',
        custom_gcs_temp_location='gs://bucket-name/input/temp',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    pipeline.run().wait_until_finish()
# runner = DataflowRunner()
# runner.run_pipeline(p, options=options)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
This part of the pipeline is wrong:
| 'Join' >> beam.CoGroupByKey()
| 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(...
The output of CoGroupByKey will have the format key, {'input_p1': [list_of_p1_elems_with_key], 'input_p3': [list_of_p3_elems_with_key]}. You need to process that output to map it to the schema expected by the BigQuery sink.
Because the schema of the data does not match the schema specified in the BigQuery sink, the ingestion of data fails.
The Beam programming guide has an example of how to process the output of CoGroupByKey, and the transform catalog has an example too.
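I am not sure exactly how the columns of p1 and p3 are used to populate the BigQuery table. But other than that, after the beam.CoGroupByKey you could apply a beam.FlatMap (the function below is a generator, so beam.Map would emit the generator object itself instead of the individual rows) with a function similar to this one: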
def process_group(kv):
    """Turn one CoGroupByKey result into BigQuery row dictionaries.

    ``kv`` is ``(peakid, {'input_p1': [...], 'input_p3': [...]})``. Each
    ``input_p1`` element is a ``(bcdate, smtdate)`` tuple and each
    ``input_p3`` element is a ``(pkname, heightm)`` tuple. One row is
    yielded per (p1, p3) combination, so a key that is missing from either
    input yields nothing (inner-join behaviour).

    The row keys match the sink schema
    ``peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER``.

    This is a generator: apply it with ``beam.FlatMap`` so that every
    yielded dict becomes its own element.
    """
    key, values = kv
    # Materialise the inner side once; it is walked again for every p1 element.
    peaks = list(values['input_p3'])
    for bcdate, _smtdate in values['input_p1']:
        for pkname, heightm in peaks:
            yield {
                'peakid': key,
                'bcdate': bcdate,
                'pkname': pkname,
                'heightm': heightm,
            }

Save to csv file with dataframe data obtained from for loop

I am trying to save data from a dataframe built during a while loop. However, on the exit of the loop, the LPCCfeatextr comes out empty. How can I parse the LPCCfeatextrdf dataframe to be saved to a csv file correctly?
Here is my code:
LPCCfeatextr = []
# Remove csv file if it exists
with contextlib.suppress(FileNotFoundError):
os.remove(filename2)
def extract_features(file_name):
    """Extract zero-padded LPC features from the WAV file ``file_name``.

    Appends ``[file_name, shape, pad_width]`` to the module-level
    ``LPCCfeatextr`` list and returns ``(lpcs_feat, LPCCfeatextrdf)``, where
    the data frame is built from every entry collected so far.

    Returns ``None`` when anything fails; the file name and the exception
    are printed so the cause is not lost.
    """
    try:
        fs, sig = wavfile.read(file_name)
        lpcs_feat = lpc(sig=sig, fs=fs, num_ceps=num_ceps)
        shape = lpcs_feat.shape
        # Number of columns still missing to reach max_pad_len.
        pad_width = max_pad_len - shape[1]
        lpcs_feat = np.pad(lpcs_feat, pad_width=((0, 0), (0, pad_width)), mode='constant')
        LPCCfeatextr.append([file_name, shape, pad_width])
        LPCCfeatextrdf = pd.DataFrame(LPCCfeatextr, columns=['File_name', 'Shape', 'Pad_width'])
    except Exception as e:
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None
    return lpcs_feat, LPCCfeatextrdf
# Convert into a Panda dataframe
LPCCfeatextrdf = pd.DataFrame(LPCCfeatextr, columns=['File_name', 'Shape', 'Pad_width'])
# dataframe from dictionary
df = pd.DataFrame(LPCCfeatextrdf)
df.to_csv(filename2, mode='a', index=False, header=True)

pyspark modify class attributes using spark.sql.rdd.foreach()

The main task is to connect Hive and read data using spark rdd.
I have tried the code below. The connection and the read both succeed, but when I try to modify the value of self.jobUserProfile, it fails. I print this value at three positions (marked #1, #2 and #3). At the first position the value is valid, but at the second and third positions the dict is empty. It seems that the modification is not being assigned to the class attribute.
I have tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterate this dataframe, but when the data volume is too large, the performance may decline.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value is empty at all three positions.
I'm a newbie in spark. Any advice could be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    """Reads login rows from Hive and records them in ``jobUserProfile``."""

    def __init__(self):
        # Maps a userid to the record dict built for it.
        self.jobUserProfile = collections.defaultdict(dict)

    def readLoginFunction(self, e):
        """Store row ``e`` (userid, logtime) under its userid and print the mapping."""
        # A plain dict: the original defaultdict had no default factory.
        dic = {'userid': e[0], 'logtime': e[1]}
        self.jobUserProfile[e[0]] = dic
        print(self.jobUserProfile) #1

    def readLogin(self, spark):
        """Run the login query and feed every row to ``readLoginFunction``."""
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.foreach(lambda x: self.readLoginFunction(x))
        print(self.jobUserProfile) #2
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
operateHive = OperateHive()
operateHive.readLogin(spark)
print(operateHive.jobUserProfile) #3
Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    """Reads login rows from Hive into caller-supplied profile dicts."""

    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        """Record row ``e`` (userid, logtime) in both dicts and return them as a pair."""
        # A plain dict: the original defaultdict had no default factory.
        dic = {'userid': e[0], 'logtime': e[1]}
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile, devAppProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        """Map every queried row through ``readLoginFunction``.

        Returns the first component of the single element ``top(1)`` yields.
        """
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.top(1)[0][0]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
But when I remove devAppProfile, the code looks like this:
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    """Reads login rows from Hive into caller-supplied profile dicts."""

    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        """Record row ``e`` (userid, logtime) in both dicts and return ``jobUserProfile``."""
        # A plain dict: the original defaultdict had no default factory.
        dic = {'userid': e[0], 'logtime': e[1]}
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        """Build the mapped RDD for the login query.

        The mapped RDD is discarded and no action is called on it here.
        """
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
operateHive.readLogin(spark, jobUserProfile, devAppProfile)
The rdd.map() does not seem to run, as nothing is printed by print(jobUserProfile).
Then I change the code like below, which works again.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    """Reads login rows from Hive into caller-supplied profile dicts."""

    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        """Record row ``e`` (userid, logtime) in both dicts and return ``jobUserProfile``."""
        # A plain dict: the original defaultdict had no default factory.
        dic = {'userid': e[0], 'logtime': e[1]}
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        """Map every queried row through ``readLoginFunction``.

        Returns the last element of the collected results.
        """
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.collect()[-1]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
The problem in this post is about closures, but I cannot work out why the three versions in the answer behave differently.

Unable to retrieve data from frame

I am trying to retrieve specific rows from a data frame using a particular condition, but it shows an empty data frame. I am new to data science and trying to learn it. Here is my code.
file = open('/home/jeet/files1/files/ch03/adult.data', 'r')
def chr_int(a):
    """Return ``a`` as an int when it is a string of decimal digits, else 0.

    Surrounding whitespace is ignored, so a field produced by splitting a
    ", "-separated line on "," (for example ' 77516') is still converted
    instead of silently becoming 0.
    """
    text = a.strip()
    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, on which int() raises.
    return int(text) if text.isdecimal() else 0
# Positions of the columns that are converted to integers.
int_columns = {0, 2, 4, 10, 11, 12}

data = []
for line in file:
    data1 = line.split(',')
    # Only lines with exactly 15 fields are kept.
    if len(data1) == 15:
        data.append([chr_int(value) if position in int_columns else value
                     for position, value in enumerate(data1)])

import pandas as pd

df = pd.DataFrame(data)
df.columns = [
    'age', 'type-employer', 'fnlwgt', 'education', 'education_num',
    'marital', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hr_per_week', 'country', 'income',
]

is_male = df.sex == 'Male'
is_female = df.sex == 'Female'
high_income = df.income == '>50K\n'

ml = df[is_male]  # rows where sex is male
ml1 = df[is_male & high_income]
print(ml1.head())  # print the selected rows
fm = df[is_female]
fm1 = df[is_female & high_income]
output:
Empty DataFrame
Columns: [age, type-employer, fnlwgt, education, education_num, marital, occupation, relationship, race, sex, capital_gain, capital_loss, hr_per_week, country, income]
Index: []
What is wrong with the code? Why is the data frame empty?
If you check the values carefully, you may see the problem:
print(df.income.unique())
>>> [' <=50K\n' ' >50K\n']
There is a space in front of each value. So the values should either be processed to get rid of these spaces, or the code should be modified like this:
# Every field after the first keeps the space that follows the comma, so the
# 'sex' values need the leading space as well as the 'income' values.
ml1 = df[(df.sex == ' Male') & (df.income == ' >50K\n')]
fm1 = df[(df.sex == ' Female') & (df.income == ' >50K\n')]

Why my spark streaming app does not show any out put

This is a follow-up to my earlier Stack Overflow question, for which I did not get a response.
I have tried writing this; it does not throw any error, but it does not show any output either.
My goal is to evaluate the DStream objects against a historical-data RDD. I could not
find any PySpark example like this (checking a streaming RDD against a static RDD
created beforehand). I would appreciate your response. Thanks.
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
    """Return the SQLContext cached in the module globals.

    The instance is created from ``sparkContext`` on the first call and
    reused on every later call.
    """
    module_globals = globals()
    if 'sqlContextSingletonInstance' not in module_globals:
        module_globals['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return module_globals['sqlContextSingletonInstance']
def preparse(logline):
    """Build a Row from a comma-separated log line.

    Fields are taken relative to the end of the line: the last one becomes
    ``Customer_id``, the fourth from last ``trantype`` and the fifth from
    last ``amount`` (converted to float).
    """
    fields = logline.split(",")
    return Row(
        Customer_id=fields[-1],
        trantype=fields[-4],
        amount=float(fields[-5]),
    )
def parse():
    """Return a DStream of Rows parsed from text files appearing under ``tranfiles``."""
    return ssc.textFileStream(tranfiles).map(preparse)
def check_historic(rdd):
    """Join one batch of parsed rows against ``base_data`` and show the result.

    Intended as the ``foreachRDD`` callback. Empty batches are skipped. Any
    failure is printed with its traceback instead of being silently
    discarded, so the stream keeps running but problems stay visible.
    """
    import traceback

    try:
        if rdd.isEmpty():
            # Nothing arrived in this batch; there is no data to build a frame from.
            return
        # The helper expects the SparkContext, not the RDD itself.
        streamSqlcontext = getSqlContextInstance(rdd.context)
        stream_df = streamSqlcontext.createDataFrame(rdd)
        stream_df.registerTempTable("stream_df")
        # NOTE(review): "base_data" is registered through the module-level
        # sqlContext; verify it is visible to the context returned above.
        result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id" )
        result_data_frame.show()
    except Exception:
        traceback.print_exc()
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()

Resources