How to dynamically create kafka producers - python-3.x

First, I am taking baby steps in Python and Kafka. Let's say I have a listA=[item1, item2, item3] and every item of listA is a producer on a topic. What I want is to dynamically add/remove items in listA and have them immediately become producers. Also, every item should run on its own thread, as they should be independent.
So basically I am trying to scale the application.
So far I have tried hard-coding every producer item and running each one in its own terminal.
Each item:
from pykafka import KafkaClient
import json
from datetime import datetime
import uuid
import time
# Load the route description for this item. The context manager closes the
# file as soon as it has been parsed, even if json.load raises.
with open('./data/item1.json') as input_file:
    json_array = json.load(input_file)
# Coordinate list of the first feature's geometry; each entry is indexed as
# [0] -> longitude and [1] -> latitude by generate_coordinates.
coordinates = json_array['features'][0]['geometry']['coordinates']
# Generate uuid
def generate_uuid():
    """Return a newly generated random UUID (``uuid.uuid4``)."""
    fresh_id = uuid.uuid4()
    return fresh_id
# Kafka producer: connect to the broker at localhost:9092, look up the
# 'test_kafka2' topic and obtain the producer (via get_sync_producer) that
# generate_coordinates uses to publish its messages.
client = KafkaClient(hosts="localhost:9092")
topic = client.topics['test_kafka2']
producer = topic.get_sync_producer()
# Generate all coordinates
def generate_coordinates(coordinates):
    """Publish every coordinate as a JSON message, one per second, forever.

    The route is walked from its first to its last point; it is then reversed
    and walked again from the start, so the item keeps travelling back and
    forth. The function only returns when ``coordinates`` is empty.

    Each message carries the class id (201), a key made of the class id and a
    fresh UUID, the current UTC time as a string, and the point's longitude
    (index 0) and latitude (index 1). Messages are sent through the
    module-level ``producer``.
    """
    route = coordinates
    while len(route) > 0:
        for point in route:
            payload = {'class': 201}
            payload['key'] = str(payload['class']) + '_' + str(generate_uuid())
            payload['time_stamp'] = str(datetime.utcnow())
            payload['longitude'] = point[0]
            payload['latitude'] = point[1]
            producer.produce(json.dumps(payload).encode('ascii'))
            time.sleep(1)
        # The item reached the last coordinate: turn around and start over.
        route = route[::-1]
generate_coordinates(coordinates)

Related

Write file name based on return

I'm creating a boto3 script that scrapes and uploads our entire account's public IPs and NAT gateway IPs to our S3 bucket. I'm stuck on writing files for both return values. Ideally, I would like to write two separate files while still using the same filename variable you see in main(). Right now I can get this to work with only one return value (either nat_ips or public_ips).
import boto3
from datetime import datetime
from csv import writer
def get_ips():
    """Collect NAT gateway IPs and tagged Elastic IPs from every region.

    Assumes the configured role through STS, lists the regions with
    ``describe_regions`` and then queries EC2 in each of them.

    Returns:
        tuple: ``(nat_ips, public_ips)``. ``nat_ips`` is a list of
        ``"<public ip>/32"`` strings, one per NAT gateway address that has a
        public IP. ``public_ips`` is a list of
        ``"<value of the first tag> : <public ip>"`` strings, one per address
        whose ``Name`` tag matches ``*sgw-eip``.
    """
    # Uses STS to assume the role needed.
    sts_response = boto3.client('sts').assume_role(
        RoleArn='arn:aws:iam::1234:role/foo',
        RoleSessionName='Foo'
    )
    # Keep the temporary credentials as keyword arguments so that every
    # regional client is created the same way.
    sts_credentials = sts_response["Credentials"]
    credentials = {
        'aws_access_key_id': sts_credentials["AccessKeyId"],
        'aws_secret_access_key': sts_credentials["SecretAccessKey"],
        'aws_session_token': sts_credentials["SessionToken"],
    }
    # List and store all the regions
    region_client = boto3.client('ec2', region_name='us-west-1', **credentials)
    all_regions = [region['RegionName'] for region in region_client.describe_regions()['Regions']]
    eip_filters = [{'Name': 'tag:Name', 'Values': ['*sgw-eip']}]
    nat_ips = []
    public_ips = []
    for region in all_regions:
        ec2_client = boto3.client('ec2', region_name=region, **credentials)
        # The paginator follows NextToken itself, so no empty token is ever
        # sent and no manual loop condition is needed.
        paginator = ec2_client.get_paginator('describe_nat_gateways')
        for page in paginator.paginate(PaginationConfig={'PageSize': 1000}):
            for gateway in page["NatGateways"]:
                for address in gateway["NatGatewayAddresses"]:
                    # Skip addresses that carry no public IP.
                    if "PublicIp" in address:
                        nat_ips.append(address["PublicIp"] + '/32')
        # Query the Elastic IPs once per region. Doing this inside the NAT
        # gateway pagination would append the same addresses once per page.
        # The result gets its own name so it does not shadow this function.
        addresses = ec2_client.describe_addresses(Filters=eip_filters)
        for eip_dict in addresses['Addresses']:
            # NOTE(review): the label is the first tag's value, which is not
            # necessarily the Name tag - confirm this is intended.
            public_ip_string = eip_dict['Tags'][0]['Value'] + ' : ' + eip_dict['PublicIp']
            public_ips.append(public_ip_string)
    return nat_ips, public_ips
def _s3_upload(filename):
    """Upload the local file *filename* to the 'foo-bar' bucket.

    The object key is the file name prefixed with 'foo/'. A message naming
    the file and the bucket is printed once the upload call has returned.
    """
    bucket = 'foo-bar'
    key = 'foo/' + filename
    s3_client = boto3.resource('s3').meta.client
    s3_client.upload_file(Filename=filename, Bucket=bucket, Key=key)
    print(f'Uploading {filename} to {bucket}')
def write_list_to_file(filename, data):
    """Write the items of *data* to *filename*, one per line.

    Every item is converted with ``str``. Items are separated by a newline
    and no newline follows the last one. An existing file is overwritten.
    """
    text = '\n'.join(map(str, data))
    with open(filename, 'w') as output:
        output.write(text)
    print(f'Writing file to {filename}')
# Script entry point as posted in the question: build the dated file names,
# fetch the IP lists, then write and upload a file.
if __name__ == "__main__":
date = datetime.now().strftime('%Y%m%d')
# Stuck here since I want to make it one variable
filename_nat_ips = f'natgateway_ips{date}.csv'
filename_sga_ips = f'sga_ips{date}.csv'
# NOTE(review): get_ips() returns the pair (nat_ips, public_ips), so each of
# the two names below receives the whole tuple, and the lookup runs twice.
public_ips = get_ips()
nat_ips = get_ips()
# NOTE(review): `filename` is never assigned in this snippet; only
# filename_nat_ips and filename_sga_ips exist.
print(filename)
write_list_to_file(filename, nat_ips)
_s3_upload(filename)
I see that you are already returning a tuple of nat_ips and public_ips (in that order) from your get_ips() function. So in your main, you can unpack both of them from a single call.
You might try something like this:
if __name__ == "__main__":
    date = datetime.now().strftime('%Y%m%d')
    # One dated output file per list returned by get_ips().
    filename_nat_ips = f'natgateway_ips{date}.csv'
    filename_sga_ips = f'sga_ips{date}.csv'
    # get_ips() returns (nat_ips, public_ips); unpack both from one call.
    nat_ips, public_ips = get_ips()
    write_list_to_file(filename_nat_ips, nat_ips)
    # The public IPs go to the sga file defined above.
    write_list_to_file(filename_sga_ips, public_ips)
    _s3_upload(filename_nat_ips)
    _s3_upload(filename_sga_ips)
I was doing it right the first time. And was trying to make it more complicated.
if __name__ == "__main__":
    date = datetime.now().strftime('%Y%m%d')
    filename_nat_ips = f'natgateway_ips{date}.csv'
    filename_sga_ips = f'sga_ips{date}.csv'
    nat_ips, public_ips = get_ips()
    # Pair every output file with its data so each step runs over both
    # files in the same order: announce, write, upload.
    outputs = [(filename_nat_ips, nat_ips), (filename_sga_ips, public_ips)]
    for name, _ in outputs:
        print(name)
    for name, ips in outputs:
        write_list_to_file(name, ips)
    for name, _ in outputs:
        _s3_upload(name)

Python 3 Multiprocessing and openCV problem with dictionary sharing between processor

I would like to use multiprocessing to compute the SIFT extraction and SIFT matching for object detection.
For now, my problem is that the function does not insert its results into the dictionary.
I'm using the Manager class, and the images are opened inside the function, but it does not work.
In the end, my idea is:
Compute the keypoints for every reference image, then use these keypoints as parameters of a second function that compares and matches them with the keypoints and descriptors of the test image.
My code is:
# %% Import Section
import cv2
import numpy as np
from matplotlib import pyplot as plt
import os
from datetime import datetime
from multiprocessing import Process, cpu_count, Manager, Lock
import argparse
# %% path section
tests_path = 'TestImages/'
references_path = 'ReferenceImages2/'
result_path = 'ResultParametrizer/'
#%% Number of processor
cpus = cpu_count()
# %% parameter section
# Small constant added to each descriptor's row sum in rootSIFT so that an
# all-zero row does not cause a division by zero.
eps = 1e-7
# NOTE(review): useTwo is not read by any function visible in this section.
useTwo = False # using the m and n keypoint better with False
# good point parameters
# Ratio-test threshold used in featureMatching: a match m is kept only when
# m.distance < distanca_coefficient * n.distance.
distanca_coefficient = 0.75
# gms parameter (forwarded to cv2.xfeatures2d.matchGMS in featureMatching)
gms_thresholdFactor = 3
gms_withRotation = True
gms_withScale = True
# flann parameter (defaults of featureMatching: `trees` goes into the FLANN
# index parameters, `checks` into the search parameters)
flann_trees = 5
flann_checks = 50
#%% Locker
lock = Lock()
# %% function definition
def keypointToDictionaries(keypoint):
    """Convert a keypoint object into a dictionary of plain Python values.

    The result has the keys 'point' (a tuple of two floats taken from
    ``keypoint.pt``), 'angle', 'size' and 'response' (floats), and 'class_id'
    and 'octave' (ints). An attribute that is ``None`` stays ``None``.
    """
    def _convert(cast, value):
        # Keep a missing value as None instead of trying to cast it.
        return None if value is None else cast(value)

    x_coord, y_coord = keypoint.pt
    as_dict = {'point': (float(x_coord), float(y_coord))}
    for field in ('angle', 'size', 'response'):
        as_dict[field] = _convert(float, getattr(keypoint, field))
    for field in ('class_id', 'octave'):
        as_dict[field] = _convert(int, getattr(keypoint, field))
    return as_dict
def dictionariesToKeypoint(dictionary):
    """Rebuild a ``cv2.KeyPoint`` from a dictionary of keypoint fields.

    This is the inverse of ``keypointToDictionaries``. The coordinates are
    read from the 'point' key that function writes; a dictionary that uses
    the older 'pt' key is still accepted. Fields stored as ``None`` are
    skipped, so the new keypoint keeps its default value for them.

    Raises:
        KeyError: if neither 'point' nor 'pt' is present, or if one of
            'angle', 'size', 'response', 'octave', 'class_id' is missing.
    """
    kp = cv2.KeyPoint()
    kp.pt = dictionary['point'] if 'point' in dictionary else dictionary['pt']
    for field in ('angle', 'size', 'response', 'octave', 'class_id'):
        value = dictionary[field]
        if value is not None:
            setattr(kp, field, value)
    return kp
def rootSIFT(dictionary, image_name, image_path,eps=eps):
    """Compute RootSIFT features for one image and store them in *dictionary*.

    Args:
        dictionary: mapping shared between processes (e.g. a ``Manager().dict()``)
            that already holds an entry for *image_name*.
        image_name: key of the entry to update.
        image_path: path of the image, read as grayscale.
        eps: constant added to each descriptor's sum before dividing.

    The entry's 'keypoints' value becomes a list of dictionaries produced by
    ``keypointToDictionaries`` (``cv2.KeyPoint`` objects generally cannot be
    pickled, and everything stored in a managed dict is pickled), and its
    'descriptors' value becomes the normalised descriptor array, or ``None``
    when no descriptors were computed.
    """
    # SIFT init
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    sift = cv2.xfeatures2d.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    if descriptors is not None:
        # RootSIFT: divide each descriptor by its sum, then take the
        # element-wise square root.
        descriptors /= (descriptors.sum(axis=1, keepdims=True) + eps)
        descriptors = np.sqrt(descriptors)
    print('Finito di calcolare, PID: ', os.getpid())
    # `with` releases the lock even if the update below raises.
    with lock:
        # A managed dict hands out a copy of the nested dict, so changing it
        # in place never reaches the manager. Modify the copy and assign it
        # back to publish the change.
        entry = dictionary[image_name]
        entry['keypoints'] = [keypointToDictionaries(kp) for kp in keypoints]
        entry['descriptors'] = descriptors
        dictionary[image_name] = entry
def featureMatching(reference_image, reference_descriptors, reference_keypoints, test_image, test_descriptors,
                    test_keypoints, flann_trees=flann_trees, flann_checks=flann_checks):
    """Match reference features against test features and filter with GMS.

    The descriptors are matched with a FLANN k-nearest-neighbour search
    (k=2). A match is kept when its distance is below
    ``distanca_coefficient`` times the distance of the second-best match.
    The kept matches are then passed to ``cv2.xfeatures2d.matchGMS``.

    Args:
        reference_image, test_image: images; only their ``shape`` is used.
        reference_descriptors, test_descriptors: descriptor arrays.
        reference_keypoints, test_keypoints: keypoints of the two images.
        flann_trees: ``trees`` value of the FLANN index parameters.
        flann_checks: ``checks`` value of the FLANN search parameters.

    Returns:
        The matches returned by ``cv2.xfeatures2d.matchGMS``.
    """
    # FLANN parameter
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=flann_trees)
    search_params = dict(checks=flann_checks)  # or pass empty dictionary
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    flann_matches = flann.knnMatch(reference_descriptors, test_descriptors, k=2)
    # Ratio test. Pairs with fewer than two neighbours cannot be tested and
    # are dropped instead of failing on unpacking.
    good_matches = [
        pair[0] for pair in flann_matches
        if len(pair) == 2 and pair[0].distance < distanca_coefficient * pair[1].distance
    ]
    gms_matches = cv2.xfeatures2d.matchGMS(reference_image.shape, test_image.shape, keypoints1=reference_keypoints,
                                           keypoints2=test_keypoints, matches1to2=good_matches,
                                           withRotation=gms_withRotation, withScale=gms_withScale,
                                           thresholdFactor=gms_thresholdFactor)
    # Hand the result back; previously it was computed and then discarded.
    return gms_matches
#%% Starting reference list file creation
# Guarded so that child processes started with the "spawn" method can import
# this module without running the process creation below again.
if __name__ == '__main__':
    reference_init = datetime.now()
    print('Start reference file list creation')
    reference_image_process_list = []
    manager = Manager()
    reference_image_dictionary = manager.dict()
    reference_image_list = manager.list()
    for root, directories, files in os.walk(references_path):
        for file in files:
            if file.endswith('.DS_Store'):
                continue
            reference_image_path = os.path.join(root, file)
            reference_name = file.split('.')[0]
            image = cv2.imread(reference_image_path, cv2.IMREAD_GRAYSCALE)
            # Create the entry up front; rootSIFT fills in the two None
            # fields from its own process.
            reference_image_dictionary[reference_name] = {
                'image': image,
                'keypoints': None,
                'descriptors': None
            }
            # rootSIFT indexes its first argument by image name, so it must
            # receive the shared dictionary, not the shared list.
            proc = Process(target=rootSIFT,
                           args=(reference_image_dictionary, reference_name, reference_image_path))
            reference_image_process_list.append(proc)
            proc.start()
    # Wait for every worker before measuring the elapsed time.
    for proc in reference_image_process_list:
        proc.join()
    reference_end = datetime.now()
    reference_time = reference_end - reference_init
    print('End reference file list creation, time required: ', reference_time)
I faced pretty much the same error. In my case, the code seems to hang at detectAndCompute, not when creating the dictionary. For some reason, SIFT feature extraction is not multiprocessing-safe (to my understanding this is the case on Macs, but I am not totally sure).
I found this in a GitHub thread. Many people say it works, but I couldn't get it to work. (Edit: I tried this later and it works fine.)
Instead I used multithreading, which is pretty much the same code and works perfectly. Of course, you need to take the differences between multithreading and multiprocessing into account.

Taking info from file and creating a dictionary

My goal is to create a dictionary called 'sum_by_department' that contains the department as the key and the combined annual salary of all of its employees as the value. So far this is what I have, but I'm a bit lost on how to add all the department names along with the sum of all of the employees' salaries to that dictionary. The dictionary I have now only shows each salary amount and how many times it appears in the file. This is where I need help.
import requests
# endpoint
endpoint = "https://data.cityofchicago.org/resource/xzkq-xp2w.json"
# optional parameters
parameters = {"$limit":20,}
# make request
response = requests.get(endpoint, params=parameters)
# Get the response data as a python object.
data = response.json()
# Tally the records: count_by_department maps a department to its number of
# records; sum_by_department (as written here) maps a salary value to the
# number of times it occurs.
count_by_department = {}
sum_by_department = {}
#loop through the data
for i in data:
# NOTE(review): `'a' and 'b' and 'c' in i` evaluates to `'annual_salary' in i`;
# only the last key is actually tested.
if ('department' and 'salary_or_hourly' and 'annual_salary' in i):
department = i['department']
pay_type = i['salary_or_hourly']
anual_salary = i['annual_salary']
# print(i['annual_salary'])
else:
# handle case where there is no department property in that record
department = 'undefined'
pay_type = 'n/a'
anual_salary = 'n/a'
# print(department,"," ,pay_type)
# exclude the cases where the pay type is Hourly
# NOTE(review): pay_type is not read again after this assignment.
if(pay_type != 'Salary' ):
pay_type = 0
# print(department,"," ,pay_type)
# update the sum_by_department and count_by_department dictionaries
if (department in count_by_department):
count_by_department[department] += 1
else:
count_by_department[department] = 1
# NOTE(review): this keys the dictionary by the salary value and counts its
# occurrences; it does not add salaries up per department.
if (anual_salary in sum_by_department):
sum_by_department[anual_salary] +=1
else:
sum_by_department[anual_salary] = 1
# print(count_by_department)
# print(sum_by_department)
You should add each person's annual_salary to the sum_by_department dictionary while looping. Also, do not forget to convert the annual_salary value to float, because adding the values together as strings won't work.
Example script:
import requests
# endpoint
endpoint = "https://data.cityofchicago.org/resource/xzkq-xp2w.json"
# optional parameters
parameters = {"$limit": 20}
# make request; stop on an HTTP error instead of parsing an error body
response = requests.get(endpoint, params=parameters)
response.raise_for_status()
# Get the response data as a python object.
data = response.json()
# count_by_department: department -> number of records
# sum_by_department:   department -> total annual salary of its salaried records
count_by_department = {}
sum_by_department = {}
# loop through the data
for record in data:
    # Look every field up on its own. Chaining the key names with `and`
    # only tests the last one, so a record missing 'annual_salary' used to
    # lose its department as well.
    department = record.get('department', 'undefined')
    pay_type = record.get('salary_or_hourly', 'n/a')
    # exclude the cases where the pay type is Hourly: they are still counted
    # for their department but add nothing to the salary total
    if pay_type == 'Salary':
        annual_salary = float(record.get('annual_salary', 0))
    else:
        annual_salary = 0.0
    # update the sum_by_department and count_by_department dictionaries
    count_by_department[department] = count_by_department.get(department, 0) + 1
    sum_by_department[department] = sum_by_department.get(department, 0.0) + annual_salary
#import pdb; pdb.set_trace();
print('count_by_department = ', count_by_department)
print('sum_by_department = ', sum_by_department)
Tip:
Uncomment the pdb line to debug interactively. The Python Debugger (pdb for short) halts the program while it's still running (i.e. in memory), so you can interact with it and inspect all variables.

Use dictionary instead of list of dictionary to reduce program complexity

I am trying to validate the consistency between DynamoDB tables. I used a list of dictionaries to store the DynamoDB table items, which takes a long time to execute.
I am new to Python; any help converting the list of dictionaries to a dictionary, to reduce my program's complexity, would be appreciated.
#!/usr/bin/python
import sys
import boto3
import argparse
import argparse
def table_consistency_check(table, column_name):
    """Scan *table* and return one small record per item.

    Every record is a dictionary with the item's 'account_name' string and
    the string value of *column_name*, stored under that same column name.
    The scan is paginated through the module-level ``dynamoClient``.
    """
    pages = dynamoClient.get_paginator('scan').paginate(TableName=table)
    return [
        {
            'account_name': item['account_name']['S'],
            column_name: item[column_name]['S'],
        }
        for page in pages
        for item in page['Items']
    ]
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Find all accounts with different license key and account key hash')
    parser.add_argument('-r', '--region', nargs='?', type=str, default='us-west-2')
    # argparse prints the error and exits by itself on bad arguments.
    # Catching that exit here would only leave `args` undefined below.
    args = parser.parse_args()
    accounts_table = 'accounts_table'
    Credentail_table = 'credential_table'
    dynamoClient = boto3.client('dynamodb', region_name=args.region)
    account1 = table_consistency_check(accounts_table, 'license_key')
    account2 = table_consistency_check(Credentail_table, 'access_key_hash')
    key_prefix = 'ORIGINAL_KEY_'
    # Index the credential records by account name once, so each account is
    # matched with a dictionary lookup instead of a scan of the whole
    # credential list. An account may own several credential records.
    creds_by_account = {}
    for creds_item in account2:
        creds_by_account.setdefault(creds_item['account_name'], []).append(creds_item)
    output = []
    for acct_item in account1:
        for creds_item in creds_by_account.get(acct_item['account_name'], []):
            access_key_hash = creds_item['access_key_hash']
            if not access_key_hash.startswith(key_prefix):
                continue
            # The text after the prefix is expected to equal the license key.
            original_key = access_key_hash[len(key_prefix):]
            if acct_item['license_key'] != original_key:
                output.append(creds_item['account_name'])
                print('Duplicate record found')
                print('Account Name : ' + acct_item['account_name'] + ', License Key : ' + acct_item[
                    'license_key'] + ', Access Key Hash : ' + creds_item['access_key_hash'])
    if not output:
        print('the tables are consistent, No duplicate item found')

How do i sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
# Download the market summaries and decode the JSON payload.
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
# Walk over whatever the API returned instead of assuming exactly 197 markets.
for market in stuff:
    price = market['Last']
    prev = market['PrevDay']
    # A market without a last price (missing or zero) has no percentage
    # change to report and would otherwise divide by zero.
    if not price or prev is None:
        continue
    name = market['MarketName'].replace("BTC-", "")
    change = round(((price - prev) / price) * 100, 2)
    new.append('{0},{1}'.format(name, change))
# One "name,change" line per market; the file is closed even on error.
with open("Sort.txt", "w") as text_file:
    text_file.write("\n".join(new))
I'm having problems sorting this output by the second column.
I get base-10 errors, integer errors, etc. I think the problem
is how the number is stored, but I can't figure it out.
The output looks like this:
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid changing your data to text earlier than you need to. If you operate with a list of dictionaries it's very easy to sort the list.
import json
import csv
import requests
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
# Walk over whatever the API returned instead of assuming exactly 197 markets.
for market in stuff:
    # A market without a last price (missing or zero) has no percentage
    # change to report and would otherwise divide by zero.
    if not market['Last'] or market['PrevDay'] is None:
        continue
    price = float(market['Last'])
    prev = float(market['PrevDay'])
    # Use dictionary to hold the data
    d = {
        'name': market['MarketName'].replace("BTC-", ""),
        'change': round(((price - prev) / price) * 100, 2)
    }
    new.append(d)
# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])
# Writing the dictionaries to file. newline="" lets the csv module control
# the line endings. The field names are given explicitly, so an empty
# result list no longer fails on sorted_list[0].
with open("Sort.txt", "w", newline="") as text_file:
    dict_writer = csv.DictWriter(text_file, fieldnames=['name', 'change'])
    # include the line below, if you want headers
    # dict_writer.writeheader()
    dict_writer.writerows(sorted_list)

Resources