Dataflow transform sending same input to output - python-3.x

The objective of this pipeline is to see how a PTransform works in a Pub/Sub-to-Pub/Sub Python pipeline. I am giving the following input, but the pipeline gives me the same input back on the output Pub/Sub topic. The idea is to extract just one field from the incoming Pub/Sub stream and send only that field to the output topic.
{"field_1": "14726485", "field_2": "3947183"}
class ExtractStoreStock(beam.PTransform):
    """A transform to extract a field."""

    def __init__(self, field):
        super(ExtractStoreStock, self).__init__()
        self.field = field

    def expand(self, pcoll):
        return (pcoll
                | beam.Map(lambda elem: elem[self.field]))
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    global cloud_options
    global custom_options

    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True
    )
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    custom_options = pipeline_options.view_as(CustomPipelineOptions)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        messages = (
            pipeline
            | "Read from PubSub"
            >> beam.io.ReadFromPubSub(subscription=custom_options.inputSubscription)
        )
        get_stores = messages | "get_store" >> ExtractStoreStock('field_1')
        get_stores | "Write to PubSub" >> beam.io.WriteToPubSub(topic=custom_options.outputTopic)
        pipeline.run()


if __name__ == "__main__":  # noqa
    logging.getLogger().setLevel(logging.INFO)
    run()
I am new to Beam and Google Dataflow, and I am confused about what to change in the transform to get the desired result.

You probably need to add a json.loads to parse the byte string you read from Pub/Sub.
messages | beam.Map(json.loads) | "get_store" >> ExtractStoreStock('field_1')
Also you can simplify it to:
get_stores = (messages
              | beam.Map(json.loads)
              | beam.Map(lambda x: x['field_1']))
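Note that WriteToPubSub expects bytes, so after extracting the field you will usually want to serialize and encode it again before writing it out. A minimal sketch of the middle of the pipeline (the exact output format is an assumption; adjust it to whatever you want on the output topic):

get_stores = (
    messages
    | 'Parse JSON' >> beam.Map(json.loads)                            # Pub/Sub gives you bytes; turn them into a dict
    | 'Extract field' >> beam.Map(lambda elem: elem['field_1'])
    | 'Encode for Pub/Sub' >> beam.Map(lambda s: s.encode('utf-8'))   # WriteToPubSub wants bytes back
)
get_stores | "Write to PubSub" >> beam.io.WriteToPubSub(topic=custom_options.outputTopic)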

Related

Left join with CoGroupByKey sink to BigQuery using Dataflow

I would like to join two files (expeditions- 2010s.csv and peaks.csv) on the join key "peakid" with CoGroupByKey. However, there is an error when I sink the result to BigQuery:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_88_215864ba592a2e01f0c4e2157cc60c47_86e3562707f348c29b2a030cb6ed7ded failed. Error Result: <ErrorProto
location: 'gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs'].
Please review my code below:
def read_csv_pd_input1(readable_file):
    import json
    import pandas as pd
    import csv
    import io

    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'bcdate', 'smtdate']]

    a = df.set_index('peakid')[['bcdate', 'smtdate']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
    # result: only column name
    # a = df.agg(lambda x: (x.values)).apply(tuple)
    # result: only value but not as expected
    # a = [tuple(x) for x in df.values]
    # a = tuple(a)
    return a


def read_csv_pd_input3(readable_file):
    import json
    import pandas as pd
    import csv
    import io

    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'pkname', 'heightm']]

    a = df.set_index('peakid')[['pkname', 'heightm']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
    return a
def run(argv=None):
    import apache_beam as beam
    import io

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/expeditions- 2010s.csv')
    parser.add_argument(
        '--input3',
        dest='input3',
        required=False,
        help='Input_p3 file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/peaks.csv')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    input_p1 = (
        p
        | 'Read From GCS input1' >> beam.Create([known_args.input])
        | 'Pair each employee with key p1' >> beam.FlatMap(read_csv_pd_input1)
        # | beam.Map(print)
    )
    input_p3 = (
        p
        | 'Read From GCS input3' >> beam.Create([known_args.input3])
        | 'Pair each employee with key p3' >> beam.FlatMap(read_csv_pd_input3)
    )

    # CoGroupByKey: relational join of two or more key/value PCollections. It also accepts a dictionary of PCollections.
    output = (
        {'input_p1': input_p1, 'input_p3': input_p3}
        | 'Join' >> beam.CoGroupByKey()
        | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='project_name:dataset.expeditions',
            schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
            method='FILE_LOADS',
            custom_gcs_temp_location='gs://bucket-name/input/temp',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    p.run().wait_until_finish()
    # runner = DataflowRunner()
    # runner.run_pipeline(p, options=options)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
This part of the pipeline is wrong:
| 'Join' >> beam.CoGroupByKey()
| 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(...
The output of CoGroupByKey has the format (key, {'input_p1': [list_of_p1_elems_with_key], 'input_p3': [list_of_p3_elems_with_key]}). You need to process that output to map it to the schema expected by the BigQuery sink.
Because the schema of the data does not match the schema specified in the BigQuery sink, the ingestion of data fails.
The Beam programming guide has an example of how to process the output of CoGroupByKey, and the transform catalog has an example too.
I am not sure exactly how the columns of p1 and p3 are used to populate the BigQuery table. But other than that, after the beam.CoGroupByKey you could apply a beam.FlatMap (rather than beam.Map, since the function yields multiple rows) with a function similar to this one:
def process_group(kv):
    key, values = kv
    input_p1_list = values['input_p1']
    input_p3_list = values['input_p3']
    for p1 in input_p1_list:
        for p3 in input_p3_list:
            # The dict keys must match the column names in the BigQuery schema.
            row_for_bq = {'peakid': key, 'bcdate': p1['something'], 'heightm': p3['something']}
            yield row_for_bq
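For completeness, a sketch of how such a function could be wired into the existing pipeline (the step name 'Build BQ rows' is just illustrative):

output = (
    {'input_p1': input_p1, 'input_p3': input_p3}
    | 'Join' >> beam.CoGroupByKey()
    | 'Build BQ rows' >> beam.FlatMap(process_group)
    | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='project_name:dataset.expeditions',
        schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
        method='FILE_LOADS',
        custom_gcs_temp_location='gs://bucket-name/input/temp',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
)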

Scheduling multiple pipelines as a different dataflow job in a single template file

I need to execute multiple pipelines in parallel (each pipeline runs the same logic but with different inputs) and want to get the metric counts after executing each pipeline. The problem is with creating the template: since only one template file is created per pipeline, each new pipeline overrides the old template and I end up with the template file for the last pipeline alone. Basically we are doing this to check whether the row count fetched from BigQuery and the rows written to Bigtable are the same for the different data sources.
Note: I am using multithreading to schedule the pipelines in parallel.
def getrow_count(self, pipeline, metric_name):
    if not hasattr(pipeline.result, 'has_job'):
        read_filter = MetricsFilter().with_name(metric_name)
        query_result = pipeline.result.metrics().query(read_filter)
        if query_result['counters']:
            read_counter = query_result['counters'][0]
            print(f"Row count for metric {metric_name} is {read_counter.committed}")
            return read_counter.committed

def run_pipeline(self, query_text, schema, table_type):
    job_name = table_type.replace('_', '') + datetime.datetime.now().strftime("%Y%m%d")
    self.options.view_as(GoogleCloudOptions).job_name = str(job_name)
    self.options.view_as(GoogleCloudOptions).temp_location = 'gs://dataflow_storage_bq_bt/dataflow_test/tmp/' + table_type
    self.options.view_as(GoogleCloudOptions).staging_location = 'gs://dataflow_storage_bq_bt/dataflow_test/tmp/' + table_type
    print(self.options.view_as(GoogleCloudOptions).staging_location)

    with beam.Pipeline(options=self.options) as pipeline:
        data_collection = pipeline | f"Get {table_type} from BigQuery" >> beam.io.Read(
            ReadFromBigQuery(query=query_text.get(), use_standard_sql=True))
        data_collection \
            | f"Get {table_type} list of direct_row's " >> beam.ParDo(CreateRowFn(schema)) \
            | f"Get {table_type} single direct row" >> beam.ParDo(GetRowFn()) \
            | f"Write {table_type} To BT" >> WriteToBigTable(
                project_id=self.config_data["gcp_config"]["bt_project"],
                instance_id=self.config_data["gcp_config"]["bt_instance_id"],
                table_id=self.config_data["gcp_config"]["bt_table_id"])

    bigquery_count = self.getrow_count(pipeline, 'bigquery_row')
    bigtable_count = self.getrow_count(pipeline, 'Written Row')
    if bigquery_count is None or bigtable_count is None:
        print(f"No daily upload data for {table_type}")
    elif bigquery_count == bigtable_count:
        print(f"All daily upload data for {table_type} moved to bigtable from bigquery")
    else:
        raise ValueError(f"Row count mismatch; check the pipeline for {table_type}")

def get_query_text(self, file_path):
    query_text_read_output = self.get_blob_data(file_path)
    query_text = query_text_read_output.decode('utf-8')
    return query_text

def get_blob_data(self, file_path):
    blob = self.bucket.get_blob(file_path)
    data = blob.download_as_string()
    return data

def run(self):
    self.set_options()
    sql_config = self.config_data["sql_config"]
    querytext1 = self.options.view_as(DailyUploadOptions).test1
    querytext2 = self.options.view_as(DailyUploadOptions).test2
    querytext3 = self.options.view_as(DailyUploadOptions).test3
    querytext4 = self.options.view_as(DailyUploadOptions).test4
    querytext5 = self.options.view_as(DailyUploadOptions).test5
    querytext6 = self.options.view_as(DailyUploadOptions).test6
    querytext7 = self.options.view_as(DailyUploadOptions).test7

    Thread(target=self.run_pipeline,
           args=(querytext1,
                 sql_config['test1_config']['schema'],
                 'feature1')).start()
    # sleep is needed, otherwise the Dataflow job fails because the same job name is picked up by the subsequent job
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext2,
                 sql_config['test2_config']['schema'],
                 'feature2')).start()
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext3,
                 sql_config['test3_config']['schema'],
                 'feature3')).start()
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext4,
                 sql_config['test4_config']['schema'],
                 'feature4')).start()
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext5,
                 sql_config['test5_config']['schema'],
                 'feature5')).start()
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext6,
                 sql_config['test6_config']['schema'],
                 'feature6')).start()
    time.sleep(100)
    Thread(target=self.run_pipeline,
           args=(querytext7,
                 sql_config['test7_config']['schema'],
                 'feature7')).start()


class CreateRowFn(beam.DoFn):
    def __init__(self, schema):
        self.schema = schema
        self.bg_count = Metrics.counter('Bigquery', 'bigquery_row')

    def process(self, key):
        self.bg_count.inc()
        direct_rows = []
        data = json.loads(key['data'], strict=False)
        direct_row = row.DirectRow(row_key=data["row_key"])
        for table_type in self.schema:
            for column_family in self.schema[table_type]['columns']:
                for column in self.schema[table_type]['columns'][column_family]:
                    direct_row.set_cell(
                        column_family,
                        column,
                        json.dumps(data.get(column, {})),
                        datetime.datetime.fromtimestamp(0.0))
        direct_rows.append(direct_row)
        return [direct_rows]


class GetRowFn(beam.DoFn):
    def process(self, row_list):
        for row in row_list:
            return [row]
Can you try this? Define the pipeline once up front, and then reuse it for every branch. Also, don't worry about parallelism; let Dataflow manage it. You will disturb its own thread manager if you manage the threads yourself.
def run_pipeline(self, pipeline, query_text, schema, table_type):
    data_collection = pipeline | f"Get {table_type} from BigQuery" >> beam.io.Read(
        ReadFromBigQuery(query=query_text.get(), use_standard_sql=True))
    .......

def run(self):
    self.set_options()
    sql_config = self.config_data["sql_config"]
    querytext1 = self.options.view_as(DailyUploadOptions).test1
    querytext2 = self.options.view_as(DailyUploadOptions).test2
    querytext3 = self.options.view_as(DailyUploadOptions).test3
    querytext4 = self.options.view_as(DailyUploadOptions).test4
    querytext5 = self.options.view_as(DailyUploadOptions).test5
    querytext6 = self.options.view_as(DailyUploadOptions).test6
    querytext7 = self.options.view_as(DailyUploadOptions).test7

    job_name = table_type.replace('_', '') + datetime.datetime.now().strftime("%Y%m%d")
    self.options.view_as(GoogleCloudOptions).job_name = str(job_name)
    self.options.view_as(GoogleCloudOptions).temp_location = 'gs://dataflow_storage_bq_bt/dataflow_test/tmp/' + table_type
    self.options.view_as(GoogleCloudOptions).staging_location = 'gs://dataflow_storage_bq_bt/dataflow_test/tmp/' + table_type
    print(self.options.view_as(GoogleCloudOptions).staging_location)

    pipeline = beam.Pipeline(options=self.options)

    run_pipeline(pipeline, querytext1,
                 sql_config['test1_config']['schema'],
                 'feature1')
    run_pipeline(pipeline, querytext2,
                 sql_config['test2_config']['schema'],
                 'feature2')
    run_pipeline(pipeline, querytext3,
                 sql_config['test3_config']['schema'],
                 'feature3')
    run_pipeline(pipeline, querytext4,
                 sql_config['test4_config']['schema'],
                 'feature4')
    run_pipeline(pipeline, querytext5,
                 sql_config['test5_config']['schema'],
                 'feature5')
    run_pipeline(pipeline, querytext6,
                 sql_config['test6_config']['schema'],
                 'feature6')
    run_pipeline(pipeline, querytext7,
                 sql_config['test7_config']['schema'],
                 'feature7')

    pipeline.run()
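If you still need the BigQuery/Bigtable row-count check from the original code, the final pipeline.run() can be extended to wait for the result and query the counters once on the single PipelineResult. A rough sketch (assuming MetricsFilter is imported as in the question; note that with all branches in one pipeline the 'bigquery_row' and 'Written Row' counters aggregate across tables unless you also filter by step or use per-table counter names):

result = pipeline.run()
result.wait_until_finish()

def counter_value(result, metric_name):
    # Return the committed value of a single counter, or None if it never fired.
    counters = result.metrics().query(MetricsFilter().with_name(metric_name))['counters']
    return counters[0].committed if counters else None

bigquery_count = counter_value(result, 'bigquery_row')
bigtable_count = counter_value(result, 'Written Row')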

How to dynamically create kafka producers

First, I am taking baby steps in Python and Kafka. Let's say I have listA = [item1, item2, item3], and every item of listA is a producer on a topic. Now what I want is to dynamically add/remove items to listA and have them immediately become producers; every item should also run on its own thread, since they should be independent.
So basically I am trying to scale the application.
So far I have hard-coded every producer item and run each in its own terminal.
Each item looks like this:
from pykafka import KafkaClient
import json
from datetime import datetime
import uuid
import time

input_file = open('./data/item1.json')
json_array = json.load(input_file)
coordinates = json_array['features'][0]['geometry']['coordinates']

# Generate uuid
def generate_uuid():
    return uuid.uuid4()

# Kafka producer
client = KafkaClient(hosts="localhost:9092")
topic = client.topics['test_kafka2']
producer = topic.get_sync_producer()

# Generate all coordinates
def generate_coordinates(coordinates):
    # new_coordinates = []
    i = 0
    while i < len(coordinates):
        data = {}
        data['class'] = 201
        data['key'] = str(data['class']) + '_' + str(generate_uuid())
        data['time_stamp'] = str(datetime.utcnow())
        data['longitude'] = coordinates[i][0]
        data['latitude'] = coordinates[i][1]
        message = json.dumps(data)
        producer.produce(message.encode('ascii'))
        time.sleep(1)
        # If item reaches the last coordinate, reverse direction
        if i == len(coordinates) - 1:
            coordinates = coordinates[::-1]
            i = 0
        else:
            i += 1
    # return new_coordinates

generate_coordinates(coordinates)
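One possible approach (a minimal sketch only, sticking with pykafka and the per-item JSON files from the question) is to keep a registry of producer threads, one per item, each with its own stop event so items can be added or removed at runtime:

import json
import threading
import time
from pykafka import KafkaClient

client = KafkaClient(hosts="localhost:9092")
topic = client.topics['test_kafka2']

producers = {}  # item name -> (thread, stop_event)

def produce_item(name, stop_event):
    """Keep producing this item's coordinates until asked to stop."""
    producer = topic.get_sync_producer()
    with open('./data/%s.json' % name) as f:
        coordinates = json.load(f)['features'][0]['geometry']['coordinates']
    i = 0
    while not stop_event.is_set():
        data = {'longitude': coordinates[i][0], 'latitude': coordinates[i][1]}
        producer.produce(json.dumps(data).encode('ascii'))
        time.sleep(1)
        i = (i + 1) % len(coordinates)

def add_item(name):
    # Start a dedicated producer thread for this item.
    stop_event = threading.Event()
    t = threading.Thread(target=produce_item, args=(name, stop_event), daemon=True)
    producers[name] = (t, stop_event)
    t.start()

def remove_item(name):
    # Signal the item's thread to stop and wait for it to exit.
    t, stop_event = producers.pop(name)
    stop_event.set()
    t.join()

for name in ['item1', 'item2', 'item3']:
    add_item(name)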

Dataflow pipeline triggering NotImplementedError [while running 'Filter Status 1']

My pipeline has the following simple JSON input
{"mac": "KC:FC:48:AE:F6:94", "status": 8, "datetime": "2015-07-13T21:15:02Z"}
The output should basically go to a BigQuery table with 3 columns (mac, status and datetime) with their corresponding values
My Pipeline looks as follows:
# -*- coding: utf-8 -*-
import os, json, logging, argparse, datetime, apache_beam as beam
from google.cloud import error_reporting
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions

GOOGLE_PUBSUB_CHANNEL = 'projects/project-name/topics/topic-name'
GOOGLE_BIGQUERY_TABLE = 'bq-table'
GOOGLE_DATASET_ID = 'bq-dataset'
GOOGLE_PROJECT_ID = 'project-name'

class GoogleBigQuery():
    client_error = error_reporting.Client()

    @staticmethod
    def get_schema_table(schema):
        bigquery_schema = []
        for key in range(len(schema)):
            bigquery_schema.append('{}:{}'.format(schema[key].get('bigquery_field_name'), schema[key].get('bigquery_field_type')))
        return ','.join(bigquery_schema)

fields_contract = (
    {'bigquery_field_name': 'datetime', 'bigquery_field_type': 'STRING'},
    {'bigquery_field_name': 'mac', 'bigquery_field_type': 'STRING'},
    {'bigquery_field_name': 'status', 'bigquery_field_type': 'INTEGER'}
)

def parse_pubsub(line):
    record = json.loads(line)
    logging.info(record)
    return record

class FilterStatus1(beam.DoFn):
    def status_filter_1(self, data):
        for r in data:
            print(r)
            logging.info(r)
            if r["status"] == 1:
                print(r)
                logging.info(r)
                yield r

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_parameters = [
        '--runner', 'DirectRunner'
        , '--staging_location', 'gs://bucket/staging'
        , '--temp_location', 'gs://bucket/temp'
        , '--autoscaling_algorithm', 'THROUGHPUT_BASED'  # 'NONE' to disable autoscaling
        , '--num_workers', '1'
        , '--max_num_workers', '2'
        , '--disk_size_gb', '30'
        , '--worker_machine_type', 'n1-standard-1'
    ]

    pipeline_options = PipelineOptions(pipeline_parameters)
    pipeline_options.view_as(StandardOptions).streaming = True
    pipeline_options.view_as(GoogleCloudOptions).job_name = os.path.basename(__file__).split('.')[0].replace('_', '-')
    pipeline_options.view_as(GoogleCloudOptions).project = GOOGLE_PROJECT_ID

    with beam.Pipeline(options=pipeline_options, argv=pipeline_parameters) as p:
        # Read the pubsub topic into a PCollection.
        lines = (
            p
            | 'ReadPubSubMessage' >> beam.io.ReadFromPubSub(GOOGLE_PUBSUB_CHANNEL).with_output_types(bytes)
            | 'Decode UTF-8' >> beam.Map(lambda x: x.decode('utf-8'))
            | 'ParsePubSub' >> beam.Map(parse_pubsub)
        )
        (
            lines
            | 'Filter Status 1' >> beam.ParDo(FilterStatus1())
            | 'WriteToBigQueryStatus1' >> beam.io.WriteToBigQuery(
                GOOGLE_BIGQUERY_TABLE
                , project=GOOGLE_PROJECT_ID
                , dataset=GOOGLE_DATASET_ID
                , schema=GoogleBigQuery.get_schema_table(fields_contract)
                , create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
                , write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
                # , write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
            )
        )
        logging.info('Pipeline finished')
        result = p.run()
        result.wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
I'm getting the following error:
RuntimeError: NotImplementedError [while running 'Filter Status 1']
My goal here is to filter on the status column and, when the value is 1, stream the record into BigQuery.
Thanks in advance for helping me out.
You can try a filtering approach using FlatMap to do such things.
First, define a filtering method:
def FilterStatus1(row):
    if row["status"] == 1:
        yield row
Then you can apply like:
lines = lines | beam.FlatMap(FilterStatus1) | 'WriteToBigQueryStatus1' ...
Also, try breaking up your code into chunks, or explicitly assigned steps. A giant chain of transformations, mappings and filterings in a single statement usually turns your code into a black box.
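For example, giving each stage its own name and its own variable makes the pipeline graph easier to read and lets you inspect each step on its own (this is just the question's pipeline restated, with illustrative variable names):

decoded = p | 'ReadPubSubMessage' >> beam.io.ReadFromPubSub(GOOGLE_PUBSUB_CHANNEL).with_output_types(bytes)
texts = decoded | 'Decode UTF-8' >> beam.Map(lambda x: x.decode('utf-8'))
records = texts | 'ParsePubSub' >> beam.Map(parse_pubsub)
status_1 = records | 'Filter Status 1' >> beam.FlatMap(FilterStatus1)
status_1 | 'WriteToBigQueryStatus1' >> beam.io.WriteToBigQuery(
    GOOGLE_BIGQUERY_TABLE,
    project=GOOGLE_PROJECT_ID,
    dataset=GOOGLE_DATASET_ID,
    schema=GoogleBigQuery.get_schema_table(fields_contract),
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)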
Hope it helps. Thanks.
I fixed my code this way (the DoFn needs to override process; mine only defined status_filter_1, which is why Beam raised NotImplementedError):
class FilterStatus1(beam.DoFn):
    def process(self, data):
        if data["status"] == 1:
            result = [{"datetime": data["datetime"], "mac": data["mac"], "status": data["status"]}]
            logging.info(result)
            return result

Never resets list

I am trying to create a calorie counter. The standard input goes like this:
python3 calories.txt < test.txt
Inside the calories file, the food is in the following format: apples 500
The problem I am having is that whenever I calculate the values for a person, the list never seems to return to empty.
import sys

food = {}
eaten = {}
finished = {}
total = 0

# mappings
def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1]
            food[key] = value

def calculate(x):
    a = []
    for keys, values in x.items():
        for c in values:
            try:
                a.append(int(food[c]))
            except:
                a.append(100)
        print("before", a)
        a = []
        total = sum(a)  # Problem here
        print("after", a)
        print(total)

def main():
    calories(sys.argv[1])
    for line in sys.stdin:
        lines = line.strip().split(',')
        for c in lines:
            values = lines[0]
            keys = lines[1:]
            eaten[values] = keys
        calculate(eaten)

if __name__ == '__main__':
    main()
Edit - forgot to include what test.txt would look like:
joe,almonds,almonds,blue cheese,cabbage,mayonnaise,cherry pie,cola
mary,apple pie,avocado,broccoli,butter,danish pastry,lettuce,apple
sandy,zuchini,yogurt,veal,tuna,taco,pumpkin pie,macadamia nuts,brazil nuts
trudy,waffles,waffles,waffles,chicken noodle soup,chocolate chip cookie
How to make it easier on yourself:
When reading the calorie data, convert the calories to int() as soon as possible; that way you don't need to do it every time you want to sum something up.
A dictionary has a .get(key, defaultvalue) accessor, so "if food not found, use 100 as default" is a one-liner without try: ... except:.
This works for me; instead of using sys.stdin, it takes the second file as a file argument as well, rather than piping it into the program using <.
I modified some of the parsing to strip whitespace and to return a [(name, cal), ...] tuple list from calculate().
May it help you fix the code to your liking:
def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1].strip()  # ensure no whitespace
            food[key] = int(value)

def getCal(foodlist, defValueUnknown=100):
    """Get sum / total calories of a list of ingredients, unknown cost 100."""
    return sum(food.get(x, defValueUnknown) for x in foodlist)  # calculate it, if unknown assume 100

def calculate(x):
    a = []
    for name, foods in x.items():
        a.append((name, getCal(foods)))  # append as tuple to list for all names/foods eaten
    return a

def main():
    calories(sys.argv[1])
    with open(sys.argv[2]) as f:  # parse as file, not piped in via sys.stdin
        for line in f:
            lines = line.strip().split(',')
            for c in lines:
                values = lines[0].strip()
                keys = [x.strip() for x in lines[1:]]  # ensure no whitespace
                eaten[values] = keys
    calced = calculate(eaten)  # calculate after all are read into the dict
    print(calced)
Output:
[('joe', 1400), ('mary', 1400), ('sandy', 1600), ('trudy', 1000)]
Using sys.stdin and piping just led to my console blinking and waiting for manual input - maybe VS related...
