Scenario:
I have a graph edge list stored as a Kusto table. I need to divide the edges into groups (connected components).
I am using the Kusto Python plugin to run DFS and assign a group number to each edge in the same table (extending it with another column).
Sometimes there can be a large number of rows, millions of them. Once the groups have been formed, we can return the table to the C# API in a paginated manner, but how do I run the Python script over such a large number of rows?
Sometimes it takes 2 or more minutes to run. Can I decrease this time, given that these scripts run in the sandbox on the Kusto cluster?
Please suggest a strategy.
Problem Statement: Given a table containing a list of people who lent and borrowed money, we have to form groups of people who are connected to each other. E.g. if A gives money to B and C, and C gives money to D, then A, B, C, D are connected.
Pseudo Code:
Payment // the Payment table has columns LenderName, LenderPhone, BorrowerName, BorrowerPhone
| evaluate python(
typeof(*,GroupNo:int),
```
class Node:
    def __init__(self, name, phone):
        self.name = name
        self.phone = phone
    def __eq__(self, other):
        return self.name == other.name and self.phone == other.phone
    def __ne__(self, other):
        return not self.__eq__(other)
    def __hash__(self):
        return hash((self.name, self.phone))

def get_adjacency_list(connections):
    # connections is a dataframe with columns LenderName, LenderPhone, BorrowerName, BorrowerPhone
    adjacency_list = dict()  # dictionary of Node --> set of Nodes
    for index, row in connections.iterrows():
        node1 = Node(row["LenderName"], row["LenderPhone"])
        node2 = Node(row["BorrowerName"], row["BorrowerPhone"])
        if node1 not in adjacency_list:
            adjacency_list[node1] = set()
        if node2 not in adjacency_list:
            adjacency_list[node2] = set()
        adjacency_list[node1].add(node2)
        adjacency_list[node2].add(node1)
    return adjacency_list

def run_dfs(node, group_no, visited, graph_list, groups):
    visited.add(node)
    groups[node] = group_no
    for next_node in graph_list[node]:
        if next_node not in visited:
            run_dfs(next_node, group_no, visited, graph_list, groups)

def get_groups(graph_list):
    groups = dict()
    visited = set()
    group_no = 0
    for node in graph_list.keys():
        if node not in visited:
            group_no += 1
            run_dfs(node, group_no, visited, graph_list, groups)
    return groups

# df is the input dataframe provided by the python() plugin
graph_list = get_adjacency_list(df)
groups = get_groups(graph_list)
result = df
result["GroupNo"] = df.apply(lambda row: groups.get(Node(row["LenderName"], row["LenderPhone"]), -1), axis=1)
```
)
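One possible way to cut both memory and runtime, sketched here as a suggestion rather than a drop-in replacement: skip the per-row Python DFS entirely and let scipy label the components in compiled code. The column names follow the Payment table above; the sketch assumes numpy, pandas, and scipy are available in the Kusto Python sandbox image (verify for your cluster).

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

# df is the input dataframe handed to the python() plugin
lenders = df["LenderName"].astype(str) + "|" + df["LenderPhone"].astype(str)
borrowers = df["BorrowerName"].astype(str) + "|" + df["BorrowerPhone"].astype(str)

# Map every distinct person (name|phone) to an integer node id
codes, uniques = pd.factorize(pd.concat([lenders, borrowers], ignore_index=True))
n = len(uniques)
src = codes[:len(df)]
dst = codes[len(df):]

# Sparse undirected adjacency matrix; connected_components labels it in compiled code
adj = coo_matrix((np.ones(len(df)), (src, dst)), shape=(n, n))
_, labels = connected_components(adj, directed=False)

result = df
result["GroupNo"] = labels[src] + 1  # 1-based group numbers, matching the DFS version

This removes the Python-level recursion (and its depth limit on long chains) and the row-by-row iterrows loop, which are usually the dominant costs at millions of rows.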
I want to calculate a moving average and take the difference between the present average and the previous average; if the difference between two consecutive moving averages is greater than 4, I add one to a counter.
My data is streaming data.
With the code below I can compute the moving average for offline data, but I am not sure how to do this for streaming data.
df['moving_avg'] = df['S1'].rolling(window=5).mean()
diff = df['moving_avg'].diff()
This is easy to achieve with the standard Python collections.
The idea is to keep a bounded queue of your values and compute a simple average over that queue. deque is implemented as a doubly linked list of blocks, so appends are O(1), and with maxlen set the oldest value is discarded automatically.
Here is a simple example:
from collections import deque

class Stats:
    def __init__(self, window=5):
        self._queue = deque(maxlen=window)
        self._prev_ma = None

    def push(self, value):
        self._queue.append(value)
        ma = sum(self._queue) / len(self._queue)
        diff = ma - self._prev_ma if self._prev_ma is not None else None
        self._prev_ma = ma
        print(f"value: {value}, MA: {ma}, diff: {diff}")

# usage
s = Stats()
for i in range(10):
    s.push(i)
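The answer above does not include the counter the question asks about. A minimal extension of the same idea, assuming the threshold of 4 from the question:

from collections import deque

class MovingAverageCounter:
    """Rolling mean over a fixed window plus a count of jumps above a threshold."""
    def __init__(self, window=5, threshold=4):
        self._queue = deque(maxlen=window)
        self._prev_ma = None
        self._threshold = threshold
        self.count = 0

    def push(self, value):
        self._queue.append(value)
        ma = sum(self._queue) / len(self._queue)
        # use abs(ma - self._prev_ma) instead if downward jumps should count too
        if self._prev_ma is not None and (ma - self._prev_ma) > self._threshold:
            self.count += 1
        self._prev_ma = ma
        return ma

# usage: feed streaming values one at a time and read the counter at any point
c = MovingAverageCounter()
for v in [1, 2, 10, 30, 31, 90]:
    c.push(v)
print(c.count)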
I am at a very beginner level in multiprocessing in Python. I can't reformulate my code into multiprocessing.
# External dependencies assumed from the rest of the script:
# graphdriver (a Neo4j driver) and list_of_unique_merchants (a dict of merchant lists).
import itertools
from timeit import default_timer as timer

def Customer_Merchant_value_pass(minicustomer_list):
    begin = timer()
    sum_val = 0
    list_avg_score = []
    list_category_val = []
    dict_list = []
    with graphdriver.session() as session:
        for i in itertools.islice(minicustomer_list, len(minicustomer_list)):
            for key in list_of_unique_merchants:
                print("Here at list_of_unique_merchants customer value is ", i)
                print("BMCC_Code", key)
                valuelist = list_of_unique_merchants[key]
                for j in valuelist:
                    pref_attach_score = Prefer_Attachment_query2([i, j])
                    result = session.run(pref_attach_score)
                    for line in result:
                        sum_val = sum_val + line["score"]
                Totalsumval = sum_val
                print("Totalsum", Totalsumval)
                Avg_score = sum_val / len(valuelist)
                print("Avg_score", Avg_score)
                sum_val = 0
                list_avg_score.append(Avg_score)
                list_category_val.append(key)
            avg_score_list = list_avg_score
            category_list = list_category_val
            max_dictionary = MaxValue_calc(i, category_list, avg_score_list)
            print("max_dictionary", max_dictionary)
            dict_list.append(max_dictionary)
            rowlist = dict_list
            print(rowlist)
            list_avg_score = []
            list_category_val = []
        session.close()
    end = timer()
    print("Total time :", (end - begin))
    return rowlist

datalist = Customer_Merchant_value_pass(minicustomer_list)
def Prefer_Attachment_query2(listval):
    customer_wallet = listval[0]
    merchant_wallet = listval[1]
    prefquery = """MATCH (p1:CUSTOMER {WALLETID: '%s'})
MATCH (p2:MERCHANT {WALLETID: '%s'})
RETURN gds.alpha.linkprediction.preferentialAttachment(p1, p2, {relationshipQuery: "PAYMENT"}) as score""" % (customer_wallet, merchant_wallet)
    return prefquery
from collections import Counter

def MaxValue_calc(customer_wallet, category_code, avg_score):
    print("In MaxValue_calc")
    print(" For Customer", customer_wallet)
    wallet_dictionary = dict(zip(category_code, avg_score))
    print("Wallet dictionary", wallet_dictionary)
    content = Counter(wallet_dictionary)
    topscore = content.most_common(3)  # largest 3 average scores
    top2value_dict = dict(topscore)
    print("Top values", top2value_dict)
    top2value_dict.update({'WALLET_ID': customer_wallet})
    return top2value_dict
This code generates a list but takes a huge amount of time for larger inputs. I tried this code for parallelism:
import concurrent.futures

with concurrent.futures.ProcessPoolExecutor() as executor:
    f1 = executor.submit(Customer_Merchant_value_pass, minicustomer_list)
    datalist = f1.result()
This worked, but it still could not use all the cores of my CPU. How can I use proper parallelism?
Here, i iterates over a list, key iterates over a dictionary of lists and extracts a value list, and j iterates over the elements of the extracted lists.
So the workflow is: Customer_Merchant_value_pass processes the mini customer list together with a dictionary of lists and the elements extracted from that dictionary. For each element it calls Prefer_Attachment_query2([i, j]), where i is an element of the customer list and j is an element of the merchant list. That function dynamically builds a query, pushes it to Neo4j, and returns a score. After the scores are collected, the average value is calculated. Then MaxValue_calc is called to fetch the largest 3 average scores; it takes the customer, the category list (the keys of the dictionary), and the list of average scores, and returns the key-wise largest average scores. Finally the function returns a data list.
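One possible direction, sketched under the assumption that each customer can be processed independently: a single executor.submit creates exactly one task, so only one worker process ever runs. Splitting minicustomer_list into chunks and submitting one task per chunk lets the pool spread work across cores. process_chunk and parallel_value_pass are hypothetical names; note that each worker process re-imports the module, so the Neo4j driver must be created at import time in the worker (driver and session objects cannot be passed between processes).

import concurrent.futures

def process_chunk(chunk):
    # Hypothetical wrapper: reuses the existing function on one slice of customers.
    return Customer_Merchant_value_pass(chunk)

def parallel_value_pass(minicustomer_list, n_workers=4):
    chunk_size = max(1, len(minicustomer_list) // n_workers)
    chunks = [minicustomer_list[i:i + chunk_size]
              for i in range(0, len(minicustomer_list), chunk_size)]
    results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        # One task per chunk; map preserves chunk order
        for partial in executor.map(process_chunk, chunks):
            results.extend(partial)
    return results

datalist = parallel_value_pass(minicustomer_list)

Whether this saturates the CPU depends on how much time is spent waiting on Neo4j; if the work is mostly I/O, a thread pool or batching the queries server-side may help more.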
I have a list of suppliers, e.g.
suppliers = [], which contains n elements of the following form, e.g. [{"supplierId": "1", "aCode": 2}, {"supplierId": "1", "aCode": 3}]
Now I need to check, based on the value of a property, let's say areaCode = 2, whether that area code appears in the suppliers list under the key aCode. How can I determine whether the area code exists with minimum time and code complexity, and without using for loops, since the suppliers array will contain a lot of data?
In your case, because it is a list of dicts, it is hard to avoid a loop. If you only want to see whether the value exists, you can one-line it like this:
print(any(areaCode == x['aCode'] for x in suppliers))
or, if you want the matching entries, you can one-line it like this:
suppliers_in_area = [x for x in suppliers if x['aCode'] == areaCode]
Both versions loop internally, but the first one stops at the first match and uses minimal memory.
- Edit -
If you just want the first occurrence (or if only one element exists), then short-circuit your for loop.
def get_supplier_by_area(area_code):
    for supplier in suppliers:
        if supplier['aCode'] == area_code:
            return supplier
    # It will return None if nothing is found
or you can use a generator if you want it to return the next matching supplier every time you call next() on it.
def get_supplier_by_area(area_code):
    for supplier in suppliers:
        if supplier['aCode'] == area_code:
            yield supplier

try:
    gen = get_supplier_by_area('A01')
    print(next(gen))  # will print the first result or raise a StopIteration error
    print(next(gen))  # will print the second result or raise a StopIteration
except StopIteration:
    print("No more results")
The inner items are dictionaries, so they can be referenced by their keys.
def main():
    list1 = [{"supplierId": "1", "aCode": 2}, {"supplierId": "1", "aCode": 3}]
    searchKey = 2
    for item in list1:
        if item['aCode'] == searchKey:
            print(item)

if __name__ == "__main__":
    main()
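If the same suppliers list is queried many times, another option (not covered in the answers above) is to pay one loop up front to build an index and then get O(1) lookups afterwards. The names by_area and areaCode are illustrative; the data follows the question's example:

from collections import defaultdict

suppliers = [{"supplierId": "1", "aCode": 2}, {"supplierId": "1", "aCode": 3}]

# Build the index once: aCode -> list of suppliers with that code
by_area = defaultdict(list)
for s in suppliers:
    by_area[s["aCode"]].append(s)

areaCode = 2
print(areaCode in by_area)        # existence check, O(1)
print(by_area.get(areaCode, []))  # all matching suppliers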
I have a loop that generates data and writes it to a database:
myDatabase = Database('myDatabase')
for i in range(10):
    # some code here that generates dictionaries that can be saved as activities
    myDatabase.write({('myDatabase', 'valid code'): activityDict})
Single activities thus created can be saved to the database. However, when creating more than one, the length of the database is always 1 and only the last activity makes its way to the database.
Because I have lots of very big datasets, it is not convenient to store all of them in a single dictionary and write to the database all at once.
Is there a way to incrementally add activities to an existing database?
Normal activity writing
Database.write() will replace the entire database. The best approach is to build the data in Python and then write the entire thing:
data = {}
for i in range(10):
    # some code here that generates data
    data['foo'] = 'bar'
Database('myDatabase').write(data)
Dynamically generating datasets
However, if you are dynamically creating aggregated datasets from an existing database, you can create the individual datasets in a custom generator. This generator will need to support the following:
__iter__: Returns the database keys. Used to check that each dataset belongs to the database being written. Therefore we only need to return the first element.
__len__: Number of datasets to write.
keys: Used to add keys to mapping.
values: Used to add activity locations to geomapping. As the locations will be the same in our source database and aggregated system database, we can just give the original datasets here.
items: The new keys and datasets.
Here is the code:
import copy
from brightway2 import Database, LCA  # imports assumed from the Brightway2 framework

class IterativeSystemGenerator(object):
    def __init__(self, from_db_name, to_db_name):
        self.source = Database(from_db_name)
        self.new_name = to_db_name
        self.lca = LCA({self.source.random(): 1})
        self.lca.lci(factorize=True)

    def __len__(self):
        return len(self.source)

    def __iter__(self):
        yield (self.new_name,)

    def get_exchanges(self):
        vector = self.lca.inventory.sum(axis=1)
        assert vector.shape == (len(self.lca.biosphere_dict), 1)
        return [{
            'input': flow,
            'amount': float(vector[index]),
            'type': 'biosphere',
        } for flow, index in self.lca.biosphere_dict.items()
          if abs(float(vector[index])) > 1e-17]

    def keys(self):
        for act in self.source:
            yield (self.new_name, act['code'])

    def values(self):
        for act in self.source:
            yield act

    def items(self):
        for act in self.source:
            self.lca.redo_lci({act: 1})
            obj = copy.deepcopy(act._data)
            obj['database'] = self.new_name
            obj['exchanges'] = self.get_exchanges()
            yield ((self.new_name, obj['code']), obj)
And usage:
new_name = "ecoinvent 3.2 cutoff aggregated"
new_data = IterativeSystemGenerator("ecoinvent 3.2 cutoff", new_name)
Database(new_name).write(new_data)
Limitations of this approach
If you are writing so many datasets or exchanges within datasets that you are running into memory problems, then you are also probably using the wrong tool. The current system of database tables and matrix builders uses sparse matrices; in this case, dense matrices would make much more sense. For example, the IO table backend skips the database entirely and just writes processed arrays. It will take a long time to load and create the biosphere matrix if it has 13,000 * 1,500 ≈ 20,000,000 entries. In this specific case, my first instinct is to try one of the following:
Don't write the biosphere flows into the database, but save them separately per aggregated process, and then add them after the inventory calculation (a sketch of this follows below).
Create a separate database for each aggregated system process.
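A minimal sketch of the first option, assuming the per-process biosphere vector is the dense NumPy vector computed in get_exchanges() above; the file layout and function names are purely illustrative:

import os
import numpy as np

def save_aggregated_flows(code, vector, directory="aggregated_flows"):
    # Store the dense biosphere vector of one aggregated process as a plain .npy file
    # instead of writing millions of exchange dicts into the database.
    os.makedirs(directory, exist_ok=True)
    np.save(os.path.join(directory, f"{code}.npy"), np.asarray(vector).ravel())

def load_aggregated_flows(code, directory="aggregated_flows"):
    return np.load(os.path.join(directory, f"{code}.npy"))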
I am retrieving information from a sqlite DB that gives me back around 20 million rows that I need to process. This information is then transformed into a dict of lists which I need to use. I am trying to use generators wherever possible.
Can someone please take a look at this code and suggest optimizations? I am either getting a "Killed" message or it takes a really long time to run. The SQL result set part is working fine. I tested the generator code in the Python interpreter and it doesn't have any problems. I am guessing the problem is with the dict generation.
EDIT/UPDATE FOR CLARITY:
I have 20 million rows in my result set from my sqlite DB. Each row is of the form:
(2786972, 486255.0, 4125992.0, 'AACAGA', '2005')
I now need to create a dict keyed by the fourth element of the row, 'AACAGA'. The value the dict holds is the third element, but it has to hold the values for all occurrences in the result set. So, in our case here, 'AACAGA' will hold a list containing multiple values from the SQL result set. The problem is to find tandem repeats in a genome sequence. A tandem repeat is a genome read ('AACAGA') that is repeated at least three times in succession. To calculate this, I need all the values at the third index as a list keyed by the genome read, in our case 'AACAGA'. Once I have the list, I can subtract successive values in the list to see if there are three consecutive matches to the length of the read. This is what I aim to accomplish with the dictionary of lists.
#!/usr/bin/python3.3
import sqlite3 as sql

sequence_dict = {}
tandem_repeat = {}

def dict_generator(large_dict):
    dkeys = large_dict.keys()
    for k in dkeys:
        yield (k, large_dict[k])

def create_result_generator():
    conn = sql.connect('sequences_mt_test.sqlite', timeout=20)
    c = conn.cursor()
    try:
        conn.row_factory = sql.Row
        sql_string = "select * from sequence_info where kmer_length > 2"
        c.execute(sql_string)
    except sql.Error as error:
        print("Error retrieving information from the database : ", error.args[0])
    result_set = c.fetchall()
    if result_set:
        conn.close()
    return (row for row in result_set)

def find_longest_tandem_repeat():
    sortList = []
    for entry in create_result_generator():
        sequence_dict.setdefault(entry[3], []).append(entry[2])
    for key, value in dict_generator(sequence_dict):
        sortList = sorted(value)
        # stop at len - 3 so the i+3 lookup stays in range
        for i in range(0, len(sortList) - 3):
            if ((sortList[i+1] - sortList[i]) == (sortList[i+2] - sortList[i+1])
                    == (sortList[i+3] - sortList[i+2]) == len(key)):
                tandem_repeat[key] = True
                break
    # longest repeating read, not the alphabetically largest one
    print(max((k for k, v in tandem_repeat.items() if v), key=len))

if __name__ == "__main__":
    find_longest_tandem_repeat()
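One memory-related note on the code above, offered as a sketch rather than a fix for the whole problem: c.fetchall() materializes all 20 million rows at once, which by itself can trigger the "Killed" (out-of-memory) message. Iterating over the cursor streams rows lazily; the database file and query are taken from the code above.

import sqlite3 as sql

def stream_results(db_path='sequences_mt_test.sqlite'):
    conn = sql.connect(db_path, timeout=20)
    try:
        c = conn.cursor()
        c.execute("select * from sequence_info where kmer_length > 2")
        for row in c:  # the cursor yields rows lazily instead of loading them all
            yield row
    finally:
        conn.close()

find_longest_tandem_repeat() could consume stream_results() in place of create_result_generator(); the dict of lists itself may still be large, which is where the SQL approach below helps.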
I got some help with this on Code Review, as #hivert suggested. Thanks. This is much better solved in SQL than in code. I was new to SQL and hence could not write complex queries; someone helped me out with that.
SELECT *
FROM sequence_info AS middle
JOIN sequence_info AS preceding
ON preceding.sequence_info = middle.sequence_info
AND preceding.sequence_offset = middle.sequence_offset -
length(middle.sequence_info)
JOIN sequence_info AS following
ON following.sequence_info = middle.sequence_info
AND following.sequence_offset = middle.sequence_offset +
length(middle.sequence_info)
WHERE middle.kmer_length > 2
ORDER BY length(middle.sequence_info) DESC, middle.sequence_info,
middle.sequence_offset;
Hope this helps someone working on roughly the same idea. Here is a link to the thread on codereview.stackexchange.com.