Left join with CoGroupByKey sink to BigQuery using Dataflow

Left join with CoGroupByKey sink to BigQuery using Dataflow - python-3.x

I would like to join files (expeditions- 2010s.csv and peaks.csv) using join key "peakid" with CoGroupByKey. However, there is an error when I sink it to BigQuery:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_88_215864ba592a2e01f0c4e2157cc60c47_86e3562707f348c29b2a030cb6ed7ded failed. Error Result: <ErrorProto
location: 'gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs'].
Please review code as below:
def read_csv_pd_input1(readable_file):
import json
import pandas as pd
import csv
import io
gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
df = pd.DataFrame(csv_dict)
df = df[['peakid', 'bcdate', 'smtdate']]
a = df.set_index('peakid')[['bcdate', 'smtdate']].apply(tuple,1).to_dict()
a = tuple(a.items())
# result: only column name
# a = df.agg(lambda x: (x.values)).apply(tuple)
# result: only value but not as expected
# a = [tuple(x) for x in df.values]
# a = tuple(a)
return a
def read_csv_pd_input3(readable_file):
import json
import pandas as pd
import csv
import io
gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
df = pd.DataFrame(csv_dict)
df = df[['peakid', 'pkname', 'heightm']]
a = df.set_index('peakid')[['pkname', 'heightm']].apply(tuple,1).to_dict()
a = tuple(a.items())
return a
def run(argv=None):
import apache_beam as beam
import io
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
required=False,
help='Input file to read. This can be a local file or '
'a file in a Google Storage Bucket.',
default='gs://bucket-name/input/expeditions- 2010s.csv')
parser.add_argument(
'--input3',
dest='input3',
required=False,
help='Input_p3 file to read. This can be a local file or '
'a file in a Google Storage Bucket.',
default='gs://bucket-name/input/peaks.csv')
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
p = beam.Pipeline(options=PipelineOptions(pipeline_args))
input_p1 = (
p
| 'Read From GCS input1' >> beam.Create([known_args.input])
| 'Pair each employee with key p1' >> beam.FlatMap(read_csv_pd_input1)
# | beam.Map(print)
)
input_p3 = (
p
| 'Read From GCS input3' >> beam.Create([known_args.input3])
| 'Pair each employee with key p3' >> beam.FlatMap(read_csv_pd_input3)
)
# CoGroupByKey: relational join of 2 or more key/values PCollection. It also accept dictionary of key value
output = (
{'input_p1': input_p1, 'input_p3': input_p3}
| 'Join' >> beam.CoGroupByKey()
| 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
table='project_name:dataset.expeditions',
schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
method='FILE_LOADS',
custom_gcs_temp_location='gs://bucket-name/input/temp',
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
)
p.run().wait_until_finish()
# runner = DataflowRunner()
# runner.run_pipeline(p, options=options)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()

This part of the pipeline is wrong:
| 'Join' >> beam.CoGroupByKey()
| 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(...
The output of CoGroupByKey will have the format key, {'input_p1': [list_of_p1_elems_with_key], 'input_p3': [list_of_p3_elems_with_key]}. You need to process that output to map it to the schema expected by the BigQuery sink.
Because the schema of the data does not match the schema specified in the BigQuery sink, the ingestion of data fails.
The Beam programming guide has an example of how to process the output of CoGroupByKey, and the transform catalog has an example too.

I am not sure exactly how the columns of p1 and p3 are used to populate the BigQuery table. But other than that, after the beam.CoGroupByKey you could apply a beam.Map with a function similar to this one:
def process_group(kv):
key, values = kv
input_p1_list = values['input_p1']
input_p3_list = values['input_p3']
for p1 in input_p1_list:
for p3 in input_p3_list:
row_for_bq = {'peak_id': key, 'bcdate': p1['something'], 'heightm': p3['something'] }
yield row_for_bq

Related

Dataflow transform sending same input to output

The objective of this pipeline is to see how Ptransform works in Pubsub to Pubsub Python pipeline. I am giving the following input but it is giving me the same input back in the output pubsub. The idea is to just get one field out of the incoming stream from pubsub and send only that field to the output topic.
{"field_1": "14726485", "field_2": "3947183"}
class ExtractStoreStock(beam.PTransform):
"""A transform to extract a field
"""
def __init__(self, field):
super(ExtractStoreStock, self).__init__()
self.field = field
def expand(self, pcoll):
return (pcoll
| beam.Map(lambda elem: (elem[self.field])))
def run(argv=None):
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(argv)
global cloud_options
global custom_options
pipeline_options = PipelineOptions(
pipeline_args, streaming=True
)
cloud_options = pipeline_options.view_as(GoogleCloudOptions)
custom_options = pipeline_options.view_as(CustomPipelineOptions)
pipeline_options.view_as(SetupOptions).save_main_session = True
with beam.Pipeline(options=pipeline_options) as pipeline:
messages = (
pipeline
| "Read from PubSub"
>> beam.io.ReadFromPubSub(subscription=custom_options.inputSubscription)
)
get_stores = messages | "get_store" >> ExtractStoreStock('field_1')
get_stores | "Write to PubSub" >> beam.io.WriteToPubSub(topic=custom_options.outputTopic)
pipeline.run()
if __name__ == "__main__": # noqa
logging.getLogger().setLevel(logging.INFO)
run()
I am new to beam or google dataflow and I am confused about what to change in the transform to give me the desired result.

You probably need to add a json.loads to parse the byte string you read from Pub/Sub.
messages | beam.Map(json.loads) | "get_store" >> ExtractStoreStock('field_1')
Also you can simplify it to:
get_stores = (messages
| beam.Map(json.loads)
| beam.Map(lambda x: x['field_1']))

Dataflow pipeline triggering NotImplementedError [while running 'Filter Status 1']

My pipeline has the following simple JSON input
{"mac": "KC:FC:48:AE:F6:94", "status": 8, "datetime": "2015-07-13T21:15:02Z"}
The output should basically go to a BigQuery table with 3 columns (mac, status and datetime) with their corresponding values
My Pipeline looks as follows:
# -*- coding: utf-8 -*-
import os, json, logging, argparse, datetime, apache_beam as beam
from google.cloud import error_reporting
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
GOOGLE_PUBSUB_CHANNEL = 'projects/project-name/topics/topic-name'
GOOGLE_BIGQUERY_TABLE = 'bq-table'
GOOGLE_DATASET_ID = 'bq-dataset'
GOOGLE_PROJECT_ID = 'project-name'
class GoogleBigQuery():
client_error = error_reporting.Client()
#staticmethod
def get_schema_table(schema):
bigquery_schema = []
for key in range(len(schema)):
bigquery_schema.append('{}:{}'.format(schema[key].get('bigquery_field_name'), schema[key].get('bigquery_field_type')))
return ','.join(bigquery_schema)
fields_contract = (
{ 'bigquery_field_name': 'datetime', 'bigquery_field_type': 'STRING' },
{ 'bigquery_field_name': 'mac', 'bigquery_field_type': 'STRING' },
{ 'bigquery_field_name': 'status', 'bigquery_field_type': 'INTEGER' }
)
def parse_pubsub(line):
record = json.loads(line)
logging.info(record)
return record
class FilterStatus1(beam.DoFn):
def status_filter_1(self, data):
for r in data:
print(r)
logging.info(r)
if r["status"] == 1:
print(r)
logging.info(r)
yield r
def run(argv=None):
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_parameters = [
'--runner', 'DirectRunner'
, '--staging_location', 'gs://bucket/staging'
, '--temp_location', 'gs://bucket/temp'
, '--autoscaling_algorithm', 'THROUGHPUT_BASED' #'NONE' to disable autoscaling
, '--num_workers', '1'
, '--max_num_workers', '2'
, '--disk_size_gb', '30'
, '--worker_machine_type', 'n1-standard-1'
]
pipeline_options = PipelineOptions(pipeline_parameters)
pipeline_options.view_as(StandardOptions).streaming = True
pipeline_options.view_as(GoogleCloudOptions).job_name = os.path.basename(__file__).split('.')[0].replace('_', '-')
pipeline_options.view_as(GoogleCloudOptions).project = GOOGLE_PROJECT_ID
with beam.Pipeline(options=pipeline_options, argv=pipeline_parameters) as p:
# Read the pubsub topic into a PCollection.
lines = (
p
| 'ReadPubSubMessage' >> beam.io.ReadFromPubSub(GOOGLE_PUBSUB_CHANNEL).with_output_types(bytes)
| 'Decode UTF-8' >> beam.Map(lambda x: x.decode('utf-8'))
| 'ParsePubSub' >> beam.Map(parse_pubsub)
)
(
lines | 'Filter Status 1' >> beam.ParDo(FilterStatus1())
| 'WriteToBigQueryStatus1' >> beam.io.WriteToBigQuery(
GOOGLE_BIGQUERY_TABLE
, project=GOOGLE_PROJECT_ID
, dataset=GOOGLE_DATASET_ID
, schema=GoogleBigQuery.get_schema_table(fields_contract)
, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
#, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
)
)
logging.info('Pipeline finished')
result = p.run()
result.wait_until_finish()
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
I'm getting the following error:
RuntimeError: NotImplementedError [while running 'Filter Status 1']
My goal here is to filter the status column and when the value is 1 to stream it into BQ.
Thanks in advance for helping me out.

You can try a filtering approach using FlatMap to do such things.
First, define a filtering method:
def FilterStatus1(row):
if row["status"] == 1:
yield row
Then you can apply like:
lines = lines | beam.FlatMap(FilterStatus1) | 'WriteToBigQueryStatus1' ...
Also, try breaking up your code into chunks or explicitly assigned steps. This giant transformation, mappings and filterings happening in a single row usually turn your code into a black-box.
Hope it helps. Thanks.

I fixed my code this way
class FilterStatus1(beam.DoFn):
def process(self, data):
if data["status"] == 1:
result = [{"datetime":data["datetime"], "mac":data["mac"], "status":data["status"]}]
logging.info(result)
return result

Bigquery CSV file load fail

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run Python script that loads the data into csv but getting this error. can anyone explain me this error
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery( query, filename, idNeeded ):
if idNeeded == 1:
i = 1
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([i,row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
i = i+1
except AttributeError as error:
print('An error occured: {0}'.format(error))
else:
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format( row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
except AttributeError as error:
print('An error occured: {0}'.format(error))
return
#Creates a dataset in BigQuery
def createDataset(datasetname):
dataset_ref = bigquery_client.dataset(datasetname)
dataset = Dataset(dataset_ref)
dataset.location = 'US'
dataset = bigquery_client.create_dataset(dataset)
return
def getDataset(datasetname):
dataset = bigquery_client.dataset(datasetname)
return dataset
def createTable(tablename, global_dataset_ref):
schema = [
#Enter Schema here.
# SchemaField('url', 'STRING', mode='required'),
# SchemaField('views', 'INTEGER', mode='required')
]
table_ref = global_dataset_ref.table(tablename)
table = Table(table_ref, schema=schema)
table = bigquery_client.create_table(table)
assert table.table_id == tablename
return
def getTable(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
# print(table.table_id)
print(table.schema)
# print(table.description)
# print(table.num_rows)
return table
def getTableSchema(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
schema = table.schema
return schema
def loadDataFromCSV(tablename, global_dataset_ref, filename):
schema = getTableSchema(tablename, global_dataset_ref)
table_ref = global_dataset_ref.table(tablename)
load_config = LoadJobConfig()
load_config.source_format = bigquery.SourceFormat.CSV
load_config.schema = schema
load_config.autodetect = True
load_config.allow_quoted_newlines = True
with open (filename, 'rb') as readable:
job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
job.result()
print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
return
# Testing
if __name__ == '__main__':
datasetname = 'Data_Layer'
tablename = 'Events'
sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
#createDataset(datasetname) #Successfully tested this code 2018-09-24
global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24
#createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
runBigQueryQuery(sqlquery,filename,idNeeded) #Successfully tested this code 2018-09-24
loadDataFromCSV(tablename, global_dataset_ref,filename) #Successfully tested this code 2018-09-24
sample data
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A

The error message indicates that there is only 1 row in your CSV, you might be missing new lines while making it.

How to get SalesForce data to Python Panda dataframes

Currently we are taking SalesForce data in to CSV file and reading this CSV file in Pandas using read_csv, to_csv methods. Do we have any other way to get data from SalesForce to pandas dataframe.

With Python - you can download a package called Simple Salesforce and write SOQL queries to return data
https://github.com/simple-salesforce/simple-salesforce
Here's an example of how to do this:
from simple_salesforce import Salesforce
sf = Salesforce(username='<enter username>', password='<enter password>',
security_token = '<enter your access token from your profile>')
a_query= pd.DataFrame(sf.query(
"SELECT Name, CreatedDate FROM User")['records'])

In my case, to display the information as a dataframe I had to use the following code:
# Import libraries
import simple_salesforce as ssf, pandas
# Create the connection
session_id, instance = ssf.SalesforceLogin(username='<username>', password='<password>', security_token='<token>', sandbox=False)
sf_ = ssf.Salesforce(instance=instance, session_id=session_id)
# Query to execute
sql_code = "SELECT id, name FROM main_table"
# Store query result as dataframe
information = sf_.query(query= sql_code)
table = pandas.DataFrame(information['records']).drop(columns='attributes')

Adding up to the original answer,
the function below is also suitable for simple joins.
def sf_results_to_dataframe(results, drop_index=True) -> pd.DataFrame:
df = pd.DataFrame(results['records'])
df.drop('attributes', axis=1, inplace=True) # clean up from technical info
df.set_index('Id', drop=drop_index, inplace=True)
for table in ['Account', 'Contact', 'Lead', 'Opportunity']:
if table in results['records'][0].keys(): # detect JOIN
local_keys = list(results['records'][0][table].keys()) # keys from the joined table
if 'attributes' in local_keys:
local_keys.remove('attributes')
global_keys = [table + key for key in local_keys] # name for the fields in the output table
# fields of the joined table and the record index
table_records = [{'Id': record['Id'],
**{global_key:record[table][local_key] for global_key, local_key in zip(global_keys, local_keys)}}
for record in results['records']]
df_extra = pd.DataFrame(table_records)
df_extra.set_index('Id', drop=True, inplace=True) # match index
df.drop(table, axis=1, inplace=True) # drop duplicated info
df = df.merge(df_extra, left_index=True, right_index=True) # merge on index
return df
Example:
import pandas as pd
from simple_salesforce import Salesforce
SALESFORCE_EMAIL = '...'
SALESFORCE_TOKEN = '...'
SALESFORCE_PASSWORD = '...'
sf = Salesforce(username=SALESFORCE_EMAIL, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN)
query = """SELECT Id, Name, Account.Name
FROM Contact
LIMIT 1
"""
results = sf.query(query)
df = sf_results_to_dataframe(results)

How do i sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
raw_json =
requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0,197):
price = (stuff[i]['Last'])
name1 = (stuff[i]['MarketName'])
name = name1.replace("BTC-", "")
prev = (stuff[i]['PrevDay'])
diff = price - prev
change = round(((price - prev) / price) * 100, 2)
final = ('{0},{1}'.format(name,change))
new.append(final)
butFirst = new[0:]
this1 = ("\n".join(butFirst))
text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
Im having problems sorting this output in second column..
I get base 10 errors.. integer errors etc.. i think the problem
is how the number is stored but i cant figure it out.
output looks like this>
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0

You should avoid changing your data to text earlier than you need to. If you operate with a list of dictionaries it's very easy to sort the list.
import json
import csv
import requests
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0,197):
price = float(stuff[i]['Last'])
prev = float(stuff[i]['PrevDay'])
# Use dictionary to hold the data
d = {
'name' : stuff[i]['MarketName'].replace("BTC-", ""),
'change' : round(((price - prev) / price) * 100, 2)
}
new.append(d)
# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])
# Writing the dictionaries to file
with open("Sort.txt", "w") as text_file:
dict_writer = csv.DictWriter(text_file, sorted_list[0].keys())
# include the line below, if you want headers
# dict_writer.writeheader()
dict_writer.writerows(sorted_list)

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Left join with CoGroupByKey sink to BigQuery using Dataflow - python-3.x

Related

Dataflow transform sending same input to output

Dataflow pipeline triggering NotImplementedError [while running 'Filter Status 1']

Bigquery CSV file load fail

How to get SalesForce data to Python Panda dataframes

How do i sort a text file by column numerically?

Categories

Resources