Is there a simple way to conduct this transformation on a multi-indexed DataFrame? - pivot

Please see the attached image showing a DataFrame (left_table in the picture; reproduced as code below). I want to transform it into the right_table in a simple way (using pivot, melt, transposing, etc.).
import numpy as np
import pandas as pd

arrays = [
["T1", "T1", "T2", "T2"],
["C2", "C3", "C2", "C3"],
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["C1", "second"])
index
Left_table = pd.DataFrame(np.random.randn(2, 4), index=["N1", "N2"], columns=index)
Left_table

Try:
Left_table = (
    Left_table.stack(0)   # move the first column level ("C1") into the row index
    .reset_index()
    .rename(columns={"level_0": "C1", "C1": "T"})
)
Left_table.columns.name = None
Left_table = Left_table.sort_values(by=["T", "C1"])
print(Left_table.to_markdown(index=False))
Prints:
| C1   | T    |       C2 |         C3 |
|:-----|:-----|---------:|-----------:|
| N1   | T1   |  1.10134 |  0.52524   |
| N2   | T1   |  1.45332 |  0.226281  |
| N1   | T2   | 0.816961 | -0.677362  |
| N2   | T2   |  1.00841 |  0.0634249 |
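
Starting again from the original Left_table, an equivalent sketch (the axis relabelling is my own choice, not part of the answer above) renames the axes first so no column rename is needed afterwards:
out = (
    Left_table
    .rename_axis(index="C1", columns=["T", None])  # row index -> "C1", first column level -> "T"
    .stack(0)                                      # move the "T" level into the rows
    .reset_index()                                 # columns become C1, T, C2, C3
    .sort_values(["T", "C1"])
)
print(out.to_markdown(index=False))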

Python logic to calculate time difference between two rows

I have input data like this
Input:
ID, eventName, time
1,pause,2022-10-27T18:15:47Z
1,resume,2022-10-28T08:01:16Z
2,pause,2022-10-26T22:00:01Z
2,resume,2022-10-27T08:00:15Z
3,pause,2022-10-27T04:00:26Z
3,resume,2022-10-27T10:00:20Z
4,pause,2022-10-28T09:09:07Z
4,resume,2022-10-28T07:05:13Z
5,pause,2022-10-27T09:42:14Z
5,resume,2022-10-27T23:01:00Z
I am expecting the below format as output
Expected output:
ID, time_pause, time_resume, time_diff
1,2022-10-27T18:15:47Z,2022-10-28T08:01:16Z,14hr
2,2022-10-26T22:00:01Z,2022-10-27T08:00:15Z,10hr
3,2022-10-27T04:00:26Z,2022-10-27T10:00:20Z,6hr
4,2022-10-28T07:05:13Z,2022-10-28T09:09:07Z,2hr
5,2022-10-27T09:42:14Z,2022-10-27T23:01:00Z,14hr
Assuming that:
there are always 2 rows with the same id, one for resume and one for pause
the datetime format is always the same
here is a simple attempt:
import datetime
FORMAT = '%Y-%m-%dT%H:%M:%SZ'
DATA = (
(1,'pause','2022-10-27T18:15:47Z'),
(1,'resume','2022-10-28T08:01:16Z'),
(2,'pause','2022-10-26T22:00:01Z'),
(2,'resume','2022-10-27T08:00:15Z'),
(3,'pause','2022-10-27T04:00:26Z'),
(3,'resume','2022-10-27T10:00:20Z'),
(4,'pause','2022-10-28T09:09:07Z'),
(4,'resume','2022-10-28T07:05:13Z'),
(5,'pause','2022-10-27T09:42:14Z'),
(5,'resume','2022-10-27T23:01:00Z'),
)
def _f(d):
    # sort by id so that each (pause, resume) pair is adjacent
    d = sorted(d, key=lambda x: x[0])
    for p1, p2 in zip(d[::2], d[1::2]):
        id1, pr1, d1 = p1
        id2, pr2, d2 = p2
        assert id1 == id2
        d1 = datetime.datetime.strptime(d1, FORMAT)
        d2 = datetime.datetime.strptime(d2, FORMAT)
        min_datetime, max_datetime = min(d1, d2), max(d1, d2)
        delta = max_datetime - min_datetime
        # total_seconds() stays correct even if the gap exceeds 24 hours
        yield id1, min_datetime.strftime(FORMAT), max_datetime.strftime(FORMAT), f'{round(delta.total_seconds() / 3600)}h'

if __name__ == '__main__':
    for x in _f(DATA):
        print(x)
The output is the following:
(1, '2022-10-27T18:15:47Z', '2022-10-28T08:01:16Z', '14h')
(2, '2022-10-26T22:00:01Z', '2022-10-27T08:00:15Z', '10h')
(3, '2022-10-27T04:00:26Z', '2022-10-27T10:00:20Z', '6h')
(4, '2022-10-28T07:05:13Z', '2022-10-28T09:09:07Z', '2h')
(5, '2022-10-27T09:42:14Z', '2022-10-27T23:01:00Z', '13h')
The expected value for the last row is unclear: the delta is 13.31 hours (in decimal), so rounded it is 13 hours, not 14.
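
If pandas is acceptable, here is a hedged sketch of the same idea using pivot (reusing the DATA tuples above; the column names simply mirror the expected output, and this is one possible variant, not the answer's method):
import pandas as pd

pdf = pd.DataFrame(DATA, columns=["ID", "eventName", "time"])
pdf["time"] = pd.to_datetime(pdf["time"])
wide = pdf.pivot(index="ID", columns="eventName", values="time")
# take min/max per ID so swapped pause/resume timestamps (e.g. ID 4) still work
out = pd.DataFrame({"time_pause": wide.min(axis=1), "time_resume": wide.max(axis=1)})
hours = (out["time_resume"] - out["time_pause"]).dt.total_seconds() / 3600
out["time_diff"] = hours.round().astype(int).astype(str) + "hr"
print(out.reset_index())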

PySpark : Different Aggregate Alias for each group

Is it possible in spark to do a groupby and aggregate where the alias for the aggregate function is different for each group? For example, if I was doing a groupby and AVG, I want each group to have a different column name such as "group_1_avg" for group 1 and "group_2_avg" for group 2, etc. With the idea that the final result will be a list of columns group_1_avg, group_2_avg, etc.
I realize I can probably not do this and just have everything aggregated under one name and pivot it, but I am trying to avoid pivot due to how expensive it is for my data.
Things I've tried:
frame = frame.groupBy("Item", "Group", "Level").agg(F.avg("val").alias("AVG"))
frame = frame.withColumn("Columns", F.concat(F.col("Group"), F.lit("_"), F.col("Level"), F.lit("_"), F.lit("AVG")))
frame = frame.groupBy("Item").pivot("Columns").agg(F.first("AVG"))
This works and does what I need but the problem I have is that the pivot becomes too expensive given the scale of my data so I am looking for an alternate solution.
Thank you for your time.
Input Format:
Item  Group  Level  val
W1    A      S1     40
W1    A      S1     40
W1    A      S2     25
W2    A      S1     50
W2    A      S1     50
Expected Output:
Item  A_S1_AVG  A_S2_AVG
W1    40        25.0
W2    50        null
For a large dataset, can you envision your data in this format instead? (Rather than constructing 20K columns, you would have 20K rows.)
+----+-----------+----+
|Item|Group_Level|Mean|
+----+-----------+----+
| W1| A_S1|40.0|
| W1| A_S2|25.0|
| W2| A_S1|50.0|
+----+-----------+----+
If so,
from pyspark.sql import functions as F
from pyspark.sql import types as T
df = spark.createDataFrame([('W1', 'A', 'S1', 40),
                            ('W1', 'A', 'S1', 40),
                            ('W1', 'A', 'S2', 25),
                            ('W2', 'A', 'S1', 50),
                            ('W2', 'A', 'S1', 50)], ["Item", "Group", "Level", "Val"])

@F.udf(T.MapType(T.StringType(), T.FloatType()))
def create_group_scores(data):
    # build "<Group>_<Level>" -> list of values, then average each list
    data_map = {}
    mean_map = {}
    for datum in data:
        key = f"{datum.Group}_{datum.Level}"
        if key in data_map:
            data_map[key].append(datum.Val)
        else:
            data_map[key] = [datum.Val]
    for key in data_map:
        mean_map[key] = sum(data_map[key]) / len(data_map[key])
    return mean_map

item_groups = df.groupBy("Item").agg(
    F.collect_list(F.struct("Group", "Level", "Val")).alias("group_level_val")
).withColumn("group_scores", create_group_scores("group_level_val"))
item_groups = item_groups.select("Item", F.explode_outer("group_scores").alias("Group_Level", "Mean"))
item_groups.show()
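
A UDF-free sketch that yields the same long format with only built-in aggregations (the Group_Level and Mean names simply mirror the table above; this is an alternative I am suggesting, not the answer's method):
means = (
    df.groupBy("Item", "Group", "Level")
      .agg(F.avg("Val").alias("Mean"))
      .withColumn("Group_Level", F.concat_ws("_", "Group", "Level"))   # e.g. "A_S1"
      .select("Item", "Group_Level", "Mean")
)
means.show()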

Relationship Parsing

I have a df with two columns, col_1 and col_2. The entries in col_1 are related to the entries in col_2. It is a kind of relationship where A belongs to B, and B belongs to C & D, therefore A belongs to B, C and D.
import pandas as pd
col_1 = ["A", "A", "B", "B", "I", "J", "C", "A"]
col_2 = ["B", "H", "C", "D", "J", "L", "E", "Z"]
df = pd.DataFrame({"col_1":col_1, "col_2":col_2})
df.sort_values("col_1", inplace=True)
df
I want to extract the relationship, keeping the first occurring key as "my_key" and all the other keys in a "Connected" column.
How can I fetch all keys which are connected to each other, keeping these conditions in mind:
the keys that are in col_1 should not appear in the col_2 lists, and
only the related keys should appear in front of my_key.
Use networkx with connected_components to build a node-to-component dictionary:
import networkx as nx

# Create the graph from the dataframe
g = nx.Graph()
g.add_edges_from(df[['col_1', 'col_2']].itertuples(index=False))
connected_components = nx.connected_components(g)

# Find the component id of the nodes
node2id = {}
for cid, component in enumerate(connected_components):
    for node in component:
        node2id[node] = cid + 1
Then take the first value of each group for column col_1 and map all the other values to lists:
g1 = df['col_1'].map(node2id)
df1 = df.loc[~g1.duplicated(), ['col_1']]      # first row of each component
s = pd.Series(list(node2id.keys()), index=list(node2id.values()))
s = s[~s.isin(df1['col_1'])]                   # drop the "my_key" nodes themselves
d = s.groupby(level=0).agg(list)
df1['Connected'] = g1.map(d)
print(df1)
col_1 Connected
0 A [C, B, E, H, D, Z]
4 I [J, L]
For plotting use:
pos = nx.spring_layout(g, scale=20)
nx.draw(g, pos, node_color='lightblue', node_size=500, with_labels=True)
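
If pulling in networkx is not an option, a small union-find sketch (my own variable names, reusing the same df) produces the same grouping:
# union-find over the edge list; find() returns a canonical representative per component
parent = {}

def find(x):
    parent.setdefault(x, x)
    while parent[x] != x:
        parent[x] = parent[parent[x]]   # path halving
        x = parent[x]
    return x

def union(a, b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra

for a, b in zip(df["col_1"], df["col_2"]):
    union(a, b)

roots = df["col_1"].map(find)
df2 = df.loc[~roots.duplicated(), ["col_1"]]               # first row of each component
groups = pd.Series(list(parent), index=[find(n) for n in parent])
groups = groups[~groups.isin(df2["col_1"])]                # drop the "my_key" nodes themselves
df2["Connected"] = roots.map(groups.groupby(level=0).agg(list))
print(df2)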

How to make a column with lists from columns of list elements in a pandas dataframe?

I have a pandas dataframe like
import pandas as pd

test = pd.DataFrame([[['P', 'N'], ['Z', 'P']], [['N', 'N'], ['Z', 'P']]],
                    columns=['c1', 'c2'])
I want to add another column c3 to test whose elements are
['PZ', 'NP']
['NZ', 'NP']
How can I do this?
Use assign:
df = test.assign(c3 = [[x[0]+y[0], x[1]+y[1]] for x,y in test.values.tolist()])
Or:
df = test.assign(c3 = list(map(list,zip(test.c1.str[0]+test.c2.str[0],test.c1.str[1]+test.c2.str[1]))))
print(df)
c1 c2 c3
0 [P, N] [Z, P] [PZ, NP]
1 [N, N] [Z, P] [NZ, NP]
print([[x[0]+y[0], x[1]+y[1]] for x,y in test.values.tolist()])
[['PZ', 'NP'], ['NZ', 'NP']]
print(list(map(list,zip(test.c1.str[0]+test.c2.str[0],test.c1.str[1]+test.c2.str[1]))))
[['PZ', 'NP'], ['NZ', 'NP']]
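
A hedged generalization (my suggestion, not part of the answer above) pairs elements positionally with zip, so it works for lists of any length:
df = test.assign(
    c3=test.apply(lambda r: [a + b for a, b in zip(r["c1"], r["c2"])], axis=1)
)
print(df["c3"].tolist())   # [['PZ', 'NP'], ['NZ', 'NP']]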

pyspark select particular rows which have more matching column fields

I have a sample table like the one below (I have 1 million such rows). Out of it I need to select rows into a new dataframe based on the conditions below:
I have to select the top 1000 students who attended the most classes
Top 1000 students who attended classes 1, 2, 3, 4 more times compared to the others
So in my example case I need to store all the rows of students 123 and 678 in another dataframe.
I could not work out a proper logic.
Below is a solution to your problem; please let me know whether it helps.
import pyspark.sql.functions as F
from pyspark.sql import Window
attended_more_classes = df.filter(
    F.col("check_in") == "y"
).groupby(
    "id"
).agg(
    F.countDistinct(F.col("class")).alias("class_count")
)

win = Window.partitionBy("id").orderBy(F.col("class_count").desc())

attended_more_classes = attended_more_classes.withColumn(
    "rank",
    F.rank().over(win)
).withColumn(
    "attended_more_class",
    F.when(
        F.col("rank") <= 1000,
        F.lit("Y")
    )
)

# result of first part
attended_more_classes.show()

# answer start for second question
win2 = Window.partitionBy("id", "class").orderBy(F.col("class_count").desc())
filtered_students = df.filter(F.col("class").isin(1, 2, 3, 4)).select("id").distinct()

aggregated_data2 = df.filter(
    F.col("check_in") == "y"
).groupby(
    "id",
    "class"
).agg(
    F.count(F.col("check_in")).alias("class_count")
).withColumn(
    "max_class",
    F.first(F.col("class")).over(win)
)

attend_more_class2 = aggregated_data2.join(
    filtered_students,
    on="id",
    how="inner"
)

attend_more_class23 = aggregated_data2.filter(
    F.col("max_class").isin(1, 2, 3, 4)
).withColumn(
    "rank",
    F.rank().over(win2)
).withColumn(
    "attended_more_class",
    F.when(
        F.col("rank") <= 1000,
        F.lit("Y")
    )
)

# answer of second part
attend_more_class23.show()
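
For the first requirement on its own, a simpler hedged sketch (assuming the same df with id, class and check_in columns as above) keeps every row belonging to the top 1000 attendees:
top_students = (
    df.filter(F.col("check_in") == "y")
      .groupBy("id")
      .agg(F.countDistinct("class").alias("class_count"))
      .orderBy(F.col("class_count").desc())
      .limit(1000)
)
# keep all original rows for those students
top_rows = df.join(top_students.select("id"), on="id", how="inner")
top_rows.show()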
