I am trying to follow the example in Use Python & Pandas to Create a D3 Force Directed Network Diagram.
But on the line below I am getting the error KeyError: ('count', 'occurred at index 0'):
temp_links_list = list(grouped_src_dst.apply(lambda row: {"source": row['source'], "target": row['target'], "value": row['count']}, axis=1))
I am new to Python. What is the issue here?
Edited code
import pandas as pd
import json
import re
pcap_data = pd.read_csv(r'C:\packet_metadata.csv', index_col='No.')
dataframe = pcap_data
src_dst = dataframe[["Source","Destination"]]
src_dst.rename(columns={"Source":"source","Destination":"target"}, inplace=True)
grouped_src_dst = src_dst.groupby(["source","target"]).size().reset_index()
grouped_src_dst.rename(columns={'count':'value'}).to_dict(orient='records')
unique_ips = pd.Index(grouped_src_dst['source']
.append(grouped_src_dst['target'])
.reset_index(drop=True).unique())
But the actual column names are:
print(grouped_src_dst.columns.tolist())
['source', 'target', 0]
Final code
import pandas as pd
import json
import re
pcap_data = pd.read_csv(r'C:\packet_metadata.csv', index_col='No.')
dataframe = pcap_data
src_dst = dataframe[["Source","Destination"]]
src_dst.sample(10)
grouped_src_dst = src_dst.groupby(["Source","Destination"]).size().reset_index()
d={0:'value',"Source":"source","Destination":"target"}
L = grouped_src_dst.rename(columns=d)
unique_ips = pd.Index(L['source']
.append(L['target'])
.reset_index(drop=True).unique())
group_dict = {}
counter = 0
for ip in unique_ips:
    breakout_ip = re.match(r"^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$", ip)
    if breakout_ip:
        net_id = '.'.join(breakout_ip.group(1, 2, 3))
        if net_id not in group_dict:
            counter += 1
            group_dict[net_id] = counter
        else:
            pass
temp_links_list = list(L.apply(lambda row: {"source": row['source'], "target": row['target'], "value": row['value']}, axis=1))
I think there is a problem with the column name count: it is either missing or contains some whitespace, like ' count'.
# check column names
print (grouped_src_dst.columns.tolist())
['count', 'source', 'target']
Sample:
grouped_src_dst = pd.DataFrame({'source':['a','s','f'],
'target':['b','n','m'],
'count':[0,8,4]})
print (grouped_src_dst)
count source target
0 0 a b
1 8 s n
2 4 f m
f = lambda row: {"source": row['source'], "target": row['target'], "value": row['count']}
temp_links_list = list(grouped_src_dst.apply(f, axis=1))
print (temp_links_list)
[{'value': 0, 'source': 'a', 'target': 'b'},
{'value': 8, 'source': 's', 'target': 'n'},
{'value': 4, 'source': 'f', 'target': 'm'}]
A simpler solution is to rename the column count and use DataFrame.to_dict:
print (grouped_src_dst.rename(columns={'count':'value'}).to_dict(orient='records'))
[{'value': 0, 'source': 'a', 'target': 'b'},
{'value': 8, 'source': 's', 'target': 'n'},
{'value': 4, 'source': 'f', 'target': 'm'}]
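Alternatively (a small sketch, not from the linked blog post), you can name the size column directly when resetting the index, so there is no 0 column to rename at all. This assumes src_dst has already been renamed to source/target as in the question:
# reset_index(name=...) labels the aggregated size column instead of leaving it as 0
grouped_src_dst = (src_dst.groupby(["source", "target"])
                          .size()
                          .reset_index(name="value"))
temp_links_list = grouped_src_dst.to_dict(orient='records')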
EDIT1:
pcap_data = pd.read_csv(r'C:\packet_metadata.csv', index_col='No.')
grouped_src_dst = pcap_data.groupby(["Source","Destination"]).size().reset_index()
d = {0:'value', "Source":"source","Destination":"target"}
L = grouped_src_dst.rename(columns=d).to_dict(orient='records')
Sample:
pcap_data = pd.DataFrame({'Source':list('aabbccdd'),
'Destination':list('eertffff')})
print (pcap_data)
Destination Source
0 e a
1 e a
2 r b
3 t b
4 f c
5 f c
6 f d
7 f d
grouped_src_dst = pcap_data.groupby(["Source","Destination"]).size().reset_index()
print (grouped_src_dst)
Source Destination 0
0 a e 2
1 b r 1
2 b t 1
3 c f 2
4 d f 2
d = {0:'value', "Source":"source","Destination":"target"}
L = grouped_src_dst.rename(columns=d).to_dict(orient='records')
print (L)
[{'value': 2, 'source': 'a', 'target': 'e'},
{'value': 1, 'source': 'b', 'target': 'r'},
{'value': 1, 'source': 'b', 'target': 't'},
{'value': 2, 'source': 'c', 'target': 'f'},
{'value': 2, 'source': 'd', 'target': 'f'}]
unique_ips = pd.Index(grouped_src_dst['Source']
.append(grouped_src_dst['Destination'])
.reset_index(drop=True).unique())
print (unique_ips)
Index(['a', 'b', 'c', 'd', 'e', 'r', 't', 'f'], dtype='object')
import numpy as np
unique_ips = np.unique(grouped_src_dst[['Source','Destination']].values.ravel()).tolist()
print (unique_ips)
['a', 'b', 'c', 'd', 'e', 'f', 'r', 't']
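As a final, hypothetical step (not part of the answer above), the pieces built in the question's final code (unique_ips, group_dict and temp_links_list) could be combined into the nodes/links JSON that D3 force layouts commonly expect; the field names "name" and "group" are an assumption about the target format:
import json
# Build one node per unique IP, grouped by its /24 network id when available
nodes = [{"name": ip,
          "group": group_dict.get('.'.join(ip.split('.')[:3]), 0)}
         for ip in unique_ips]
# The links were already shaped as {"source", "target", "value"} dicts above
with open('network.json', 'w') as f:
    json.dump({"nodes": nodes, "links": temp_links_list}, f, indent=2)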
Related
I have a dataframe below:
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['+', '2', '2+', '3'],
'Upper': ['2','3+','4+','5']}
df= pd.DataFrame(data)
The expected output should be:
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['.5', '2', '2.5', '3'],
'Upper': ['2','3.5','4.5','5']}
I have tried using the code below, but it only replaces cells that are exactly + and not 2+, 3+, 4+:
df.replace('+','.5', regex=False)
I also tried using str.replace but the rest of the values become NaN:
df['Lower'].str.replace('+', '.5')
You can overwrite the values by looping, but it's not the fastest solution:
import pandas as pd
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['+', '2', '2+', '3'],
'Upper': ['2','3+','4+','5']}
lower = []
upper = []
newdata = {'Name': ['A', 'B', 'C', 'D'],
'Lower': lower,
'Upper': upper}
for i in data['Lower']:
    if "+" in i:
        lower.append(i.replace("+", ".5"))
    else:
        lower.append(i)

for j in data['Upper']:
    if "+" in j:
        upper.append(j.replace("+", ".5"))
    else:
        upper.append(j)
df= pd.DataFrame(newdata)
print(df)
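For what it's worth, a vectorized alternative is a sketch with DataFrame.replace and regex=True; the + has to be escaped because it is a regex metacharacter:
import pandas as pd
data = {'Name': ['A', 'B', 'C', 'D'],
        'Lower': ['+', '2', '2+', '3'],
        'Upper': ['2', '3+', '4+', '5']}
df = pd.DataFrame(data)
# With regex=True, replace() substitutes matching substrings inside each cell,
# so '+' becomes '.5' and '2+' becomes '2.5'
df[['Lower', 'Upper']] = df[['Lower', 'Upper']].replace(r'\+', '.5', regex=True)
print(df)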
I have a nested list of music artists built from user inputs, let's say:
artists_list = [['A', 'B', 'C'],
['A', 'C', 'B'],
['B', 'A', 'D']]
I've also managed to create a separate list, based on order of input (not alphabetically), that assigns a genre to each unique artist in the above list:
artist_genre_list = [['A', 'Rock'],
['B', 'Rap'],
['C', 'Rock'],
['D', 'Blues']]
How do I combine these two to make either a master list or dictionary including the frequency count similar to:
master_list = [['A', 'Rock', 3],
['B', 'Rap', 3],
['C', 'Rock', 2],
['D', 'Blues', 1]]
master_dict = {'A': {
'Genre': 'Rock',
'Frequency': 3},
'B': {
'Genre': 'Rap',
'Frequency': 3},
'C': {
'Genre': 'Rock',
'Frequency': 2},
'D': {
'Genre': 'Blues',
'Frequency': 1}
}
The order doesn't necessarily have to be alphabetical. Here is a sample of what I'm doing to create the first two lists:
# Counters
count = 1
new_artist_counter = 0
# Generate Lists
artists_input_list = []
aux_artists_list = []
aux_genre_list = []
aux_artists_genre_list = []
def merge(aux_artists_list, aux_genre_list):
    merged_list = [[aux_artists_list[i], aux_genre_list[i]]
                   for i in range(0, len(aux_artists_list))]
    return merged_list
while count < 4:
    # Inputs
    a1_in = str(input("Artist 1: "))
    a2_in = str(input("Artist 2: "))
    a3_in = str(input("Artist 3: "))
    artists_input_list.append([a1_in, a2_in, a3_in])
    # Determines if a new unique artist has been added and asks for its genre
    while new_artist_counter < len(artists_input_list):
        for entry in artists_input_list:
            for artist in entry:
                if artist not in aux_artists_list:
                    aux_artists_list.append(artist)
                    genre_input = input("What is "+artist+"'s genre? ")
                    aux_genre_list.append(genre_input)
                else: continue
        new_artist_counter += 1
    aux_artists_genre_list = merge(aux_artists_list, aux_genre_list)
    # Counter updates
    count += 1

print(artists_input_list)
print(aux_artists_genre_list)
This is what I came up with. It first flattens your artist list, gets the frequency of each item in the flattened list, then combines the frequencies with your genre list:
from itertools import groupby, chain
import pprint
artists_list = [
['A', 'B', 'C'],
['A', 'C', 'B'],
['B', 'A', 'D']
]
artist_genre_list = [
['A', 'Rock'],
['B', 'Rap'],
['C', 'Rock'],
['D', 'Blues']
]
frequencies = {
    key: len(list(value))
    for key, value in groupby(sorted(chain.from_iterable(artists_list)))
}
frequency = [
    {
        letter: {
            'Genre': genre,
            'Frequency': next((freq for key, freq in frequencies.items()
                               if key == letter), 0)
        }
    }
    for letter, genre in artist_genre_list
]
pprint.pprint(frequency)
I used pprint just to make the output tidier, which shows as
[{'A': {'Frequency': 3, 'Genre': 'Rock'}},
{'B': {'Frequency': 3, 'Genre': 'Rap'}},
{'C': {'Frequency': 2, 'Genre': 'Rock'}},
{'D': {'Frequency': 1, 'Genre': 'Blues'}}]
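For completeness, here is a shorter sketch that reuses the same artists_list and artist_genre_list and builds the master_dict shape from the question with collections.Counter:
from collections import Counter
from itertools import chain
# Flatten the nested artist list and count how often each artist appears
counts = Counter(chain.from_iterable(artists_list))
master_dict = {artist: {'Genre': genre, 'Frequency': counts[artist]}
               for artist, genre in artist_genre_list}
print(master_dict)
# {'A': {'Genre': 'Rock', 'Frequency': 3}, 'B': {'Genre': 'Rap', 'Frequency': 3}, ...}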
I have a dataframe as given below
data = {
'Code': ['P', 'J', 'M', 'Y', 'P', 'Z', 'P', 'P', 'J', 'P', 'J', 'M', 'P', 'Z', 'Y', 'M', 'Z', 'J', 'J'],
'Value': [10, 10, 20, 30, 10, 40, 50, 10, 10, 20, 10, 50, 60, 40, 30, 20, 40, 20, 10]
}
example = pd.DataFrame(data)
Using Python 3, I want to create another dataframe from the dataframe example such that, for each Value, the Code most frequently associated with it is obtained.
The new dataframe should look like the solution below:
output = {'Code': ['J', 'M', 'Y', 'Z', 'P', 'M'],'Value': [10, 20, 30, 40, 50, 50]}
solution = pd.DataFrame(output)
As can be seen, J is associated with Value 10 more often than any other Code, so J is selected, and so on.
You could define a function that returns the most occurring items and apply it to the grouped elements. Finally, explode the lists to rows.
>>> from collections import Counter
>>> def most_occurring(grp):
... res = Counter(grp)
... highest = max(res.values())
... return [k for k, v in res.items() if v == highest]
...
>>> example.groupby('Value')['Code'].apply(lambda x: most_occurring(x)).explode().reset_index()
Value Code
0 10 J
1 20 M
2 30 Y
3 40 Z
4 50 P
5 50 M
6 60 P
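A more compact variant of the same idea (just a sketch): Series.mode keeps all tied codes per value, and explode turns the resulting lists into rows.
solution = (example.groupby('Value')['Code']
                   .apply(lambda s: s.mode().tolist())
                   .explode()
                   .reset_index())
print(solution)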
If I understood correctly, you need something like this:
grouped = example.groupby(['Code', 'Value']).indices
arr_tmp = [[code, value, len(idx)] for (code, value), idx in grouped.items()]
output = pd.DataFrame(data=arr_tmp, columns=['Code', 'Value', 'index_count'])
output = output.sort_values(by=['index_count'], ascending=False)
output.reset_index(inplace=True)
output
I am trying to write the output of a Cartesian product to Excel using xlsxwriter and arrange the tuples column-wise, but I have not been successful.
Intended Result:
If the first output is (A,B,C,D,E) the output should be displayed in excel in the following manner:
Row 0, Col 0 = A
Row 0, Col 1 = B
Row 0, Col 2 = C
Row 0, Col 3 = D
Row 0, Col 4 = E
Then row += 1 and the next set of results is displayed in the same manner.
Code:
list1 = ['A', 'B', 'C', 'D', 'E', 'F']
list2 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H','I']
list3 = ['A', 'B', 'C', 'D', 'E']
list4 = ['A', 'B', 'C', 'D']
list5 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H','I','J','K','L','M']
list = [(p,q,r,s,t) for p in list1 for q in list2 for r in list3 for s in
list4 for t in list5]
x = print(list)
import xlsxwriter
workbook = xlsxwriter.workbook('Stat.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0
for Apple, Boy, Cat, Dog, Eagle in (x):
    worksheet.write(row, col, Apple)
    worksheet.write(row, col + 1, Boy)
    worksheet.write(row, col + 1, Cat)
    worksheet.write(row, col + 1, Dog)
    worksheet.write(row, col + 1, Eagle)
    row += 1
workbook.close()
Does this do what you want?
list1 = ['A', 'B', 'C', 'D', 'E', 'F']
list2 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H','I']
list3 = ['A', 'B', 'C', 'D', 'E']
list4 = ['A', 'B', 'C', 'D']
list5 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H','I','J','K','L','M']
# You probably could use product from itertools for this: https://docs.python.org/2/library/itertools.html#itertools.product
list_combined = [(p,q,r,s,t) for p in list1
for q in list2
for r in list3
for s in list4
for t in list5]
import xlsxwriter
workbook = xlsxwriter.Workbook('Stat.xlsx')
worksheet = workbook.add_worksheet()
for row, group in enumerate(list_combined):
    for col in range(5):
        worksheet.write(row, col, group[col])
workbook.close()
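Following the comment about itertools, the same workbook can also be written with itertools.product; this is a sketch reusing list1 through list5 from above:
from itertools import product
import xlsxwriter
workbook = xlsxwriter.Workbook('Stat.xlsx')
worksheet = workbook.add_worksheet()
# product() yields the same tuples as the nested comprehension, but lazily
for row, combo in enumerate(product(list1, list2, list3, list4, list5)):
    for col, value in enumerate(combo):
        worksheet.write(row, col, value)
workbook.close()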
For example:
s = 'abc'
number = 1
I want to write a function that returns a dict like {'a': {'a', 'b'}, 'b': {'a', 'b', 'c'}, 'c': {'b', 'c'}}.
number determines how many adjacent letters are included on each side of the current key.
def test(s: str, num: int) -> {str: {str}}:
    dict = {}
    for word in s:
        dict[word] = word
    return dict
I can only write one that returns the same key and value. Any suggestions?
Try something like:
>>> s='abc'
>>> n=1
>>> {c:{e for e in[s[i-n:i],c,s[i+1:i+1+n]] if e} for i, c in enumerate(s)}
{'a': {'a', 'b'}, 'b': {'a', 'b', 'c'}, 'c': {'b', 'c'}}
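If a named function is preferred, here is a sketch of an equivalent; it also splits wider windows into individual letters, which is an assumption about what num > 1 should mean:
def adjacent(s: str, num: int) -> dict:
    # Map each character to the set of characters within num positions,
    # including the character itself
    return {c: set(s[max(i - num, 0): i + num + 1]) for i, c in enumerate(s)}

print(adjacent('abc', 1))
# {'a': {'a', 'b'}, 'b': {'a', 'b', 'c'}, 'c': {'b', 'c'}}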