Using groupby to generate an Excel file with two sheets from two datasets - python-3.x

I have two datasets, df1 and df2.
My goal is to create an Excel file named after each fruit, containing two sheets: one with the customer details and a second with the vendor details.
df1 = pd.DataFrame({
    "Fruit": ["apple", "orange", "banana", "apple", "orange"],
    "customerName": ["John", "Sam", "David", "Rebeca", "Sydney"],
    "customerID": [877, 546, 767, 887, 890],
    "PurchasePrice": [1, 2, 5, 6, 4]})
df2 = pd.DataFrame({
    "Fruit": ["apple", "orange", "banana", "apple", "orange"],
    "VenderName": ["share", "cami", "sniff", "tom", "Adam"],
    "VenderID": ["0091", "0092", "0094", "0097", "0076"]})  # quoted: integer literals with leading zeros are a SyntaxError in Python 3
I know how to do a groupby with one dataset and generate a file per group:
grouped = df.groupby("Fruit")
# run this to generate separate Excel files
for fruit, group in grouped:
    group.to_excel(excel_writer=f"{fruit}.xlsx", sheet_name="customer", index=False)
Could you please help me solve this?

Use ExcelWriter:
from pandas import ExcelWriter

fruits = set(df1["Fruit"].unique().tolist() + df2["Fruit"].unique().tolist())
for fruit in fruits:
    sheets = {
        "Customer": df1.loc[df1["Fruit"].eq(fruit)],
        "Vendor": df2.loc[df2["Fruit"].eq(fruit)]
    }
    with ExcelWriter(f"{fruit}_.xlsx") as writer:
        for sh_name, table in sheets.items():
            table.to_excel(writer, sheet_name=sh_name, index=False)
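A variant that splits each frame only once with groupby, instead of filtering inside the loop (just a sketch, assuming the same columns as above):
import pandas as pd

g1 = dict(tuple(df1.groupby("Fruit")))  # fruit -> customer rows
g2 = dict(tuple(df2.groupby("Fruit")))  # fruit -> vendor rows

for fruit in set(g1) | set(g2):
    with pd.ExcelWriter(f"{fruit}.xlsx") as writer:
        # fall back to an empty frame if a fruit only appears in one dataset
        g1.get(fruit, df1.iloc[0:0]).to_excel(writer, sheet_name="Customer", index=False)
        g2.get(fruit, df2.iloc[0:0]).to_excel(writer, sheet_name="Vendor", index=False)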


Fill df with empty rows based on index of other df

I am trying to use df.update(), but my DataFrames have different sizes. I want to fill up the smaller df with dummy rows to match the shape of the bigger df. Here's a minimal example:
import pandas as pd
import numpy as np
data = {
    "Feat_A": ["INVALID", "INVALID", "INVALID"],
    "Feat_B": ["INVALID", "INVALID", "INVALID"],
    "Key": [12, 25, 99],
}
df = pd.DataFrame(data=data)

data = {"Feat_A": [1, np.nan], "Feat_B": [np.nan, 2], "Key": [12, 99]}
result = pd.DataFrame(data=data)

# df.update(result) not working because of different sizes/shapes
# result should be
#      Feat_A  Feat_B  Key
# 0       1.0     NaN   12
# NaN     NaN     NaN  NaN
# 2       NaN     2.0   99
# df.update(result) should work now
This did it:
df.update(result.set_index('Key').reindex(df.set_index('Key').index).reset_index())
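For clarity, the same one-liner spelled out step by step (just a restatement of what it does):
aligned = (
    result.set_index('Key')                    # use Key as the index
          .reindex(df.set_index('Key').index)  # add all-NaN rows for keys only in df (here: 25)
          .reset_index()                       # restore Key as a regular column
)
df.update(aligned)  # shapes and indexes now line up, so update works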
Does this meet your needs? I modified your example to include unique DataFrame values to confirm proper alignment:
# Modified example
data = {
    "Feat_A": ["INVALID_A12", "INVALID_A25", "INVALID_A99"],
    "Feat_B": ["INVALID_B12", "INVALID_B25", "INVALID_B99"],
    "Key": [12, 25, 99],
}
df = pd.DataFrame(data=data)

data = {"Feat_A": [1, np.nan], "Feat_B": [np.nan, 2], "Key": [12, 99]}
result = pd.DataFrame(data=data)

# Use the Key column as the DataFrame index
df = df.set_index('Key')
result = result.set_index('Key')

# Add all-NaN rows with keys that exist in df but not in result
result = result.reindex_like(df)

# Update
result.update(df)
print(result)
          Feat_A       Feat_B
Key
12   INVALID_A12  INVALID_B12
25   INVALID_A25  INVALID_B25
99   INVALID_A99  INVALID_B99

plotly: descending order in grouped bar chart

I'm trying to:
1. create a nice plot which is sorted by LABEL and then by Value inside each LABEL;
2. if possible, remove the labels at the bottom of the chart, because I have the explanation in the legend.
libraries:
from plotly import graph_objs as go
import plotly.express as px
import pandas as pd
My data looks like this:
df = pd.DataFrame({'LABEL': ['1', '1', '2', '2', '3', '3', '3', '3'],
                   'Cat2': ['a', 'b', 'a', 'b', 'c', 'a', 'e', 'f'],
                   'Value': [3, 2, 1, 4, 1, 3, 4, 1]})
df.sort_values(by=['LABEL', 'Value'], ascending=[True, False], inplace=True)
Here is my try:
COLOR_MAP = {str(i): c for i, c in enumerate(px.colors.qualitative.Light24)}
fig = go.Figure()
for i in df['LABEL'].unique():
    df_ = df[df['LABEL'] == i]
    fig.add_trace(go.Bar(
        x=[df_['LABEL'], df_['Cat2']],
        y=df_['Value'],
        marker=dict(color=COLOR_MAP[i]),
        name=f'{i}'))
fig.update_layout(legend_title='Cat1')
fig.update_layout(xaxis=dict(tickangle=45))
fig.update_layout(xaxis={'categoryorder': 'trace'})  # I tried: 'total descending', 'category descending', 'array'
Result: (screenshot omitted)
My expectation: (screenshot omitted)
Thanks in advance!!
It's much simpler in Plotly Express:
define a new column in the dataframe that defines x.
from plotly import graph_objs as go
import plotly.express as px
import pandas as pd

df = pd.DataFrame(
    {
        "LABEL": ["1", "1", "2", "2", "3", "3", "3", "3"],
        "Cat2": ["a", "b", "a", "b", "c", "a", "e", "f"],
        "Value": [3, 2, 1, 4, 1, 3, 4, 1],
    }
)
df.sort_values(by=["LABEL", "Value"], ascending=[True, False], inplace=True)

# define a concatenated column for x
df = df.assign(labx=df["LABEL"] + df["Cat2"])

px.bar(
    df,
    x="labx",
    y="Value",
    hover_data=["Cat2"],
    color="LABEL",
    color_discrete_sequence=px.colors.qualitative.Light24,
).update_layout(
    xaxis={"tickmode": "array", "tickvals": df["labx"], "ticktext": df["Cat2"]}
)
Without Plotly Express:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=df["labx"],
        y=df["Value"],
        marker_color=df["LABEL"]
        .map(
            {v: c for v, c in zip(df["LABEL"].unique(), px.colors.qualitative.Light24)}
        )
        .values,
    )
).update_layout(
    xaxis={"tickmode": "array", "tickvals": df["labx"], "ticktext": df["Cat2"]}
)
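If you would rather drop the labels at the bottom entirely (the second part of the question) instead of relabelling them, hiding the tick labels should do it:
fig.update_xaxes(showticklabels=False)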

How could I count all the genres in my DataFrame?

I have a DataFrame called df_imdb:
Each row contains the information about a movie. This DataFrame has a column named 'genres' that shows the genres of that movie; a movie can have more than one genre, e.g. [{'id': 53, 'name': 'Thriller'}, {'id': 28, 'name': 'Action'}, {'id': 9648, 'name': 'Mystery'}]
I want to find out which genres are used most in these movies (find the top 3 most used genres in this DataFrame).
The data is a list of dictionaries, so there are multiple options here.
Option 1: pure pandas. Convert the values associated with the key 'name' to a Series and use value_counts:
df = pd.DataFrame({'genres': [[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'name': 'Action'}, {'id': 9648, 'name': 'Mystery'}],
                              [{'id': 53, 'name': 'Thriller'}, {'id': 30, 'name': 'Blah'}, {'id': 9648, 'name': 'Mystery'}]]})
df['genres'].apply(lambda x: pd.Series([i['name'] for i in x]))\
            .stack().value_counts()
You get:
Thriller    2
Mystery     2
Action      1
Blah        1
Option 2: convert the values to a list and use Counter:
from collections import Counter
l_genres = df['genres'].apply(lambda x: [i['name'] for i in x]).sum()
Counter(l_genres)
You get:
Counter({'Thriller': 2, 'Action': 1, 'Mystery': 2, 'Blah': 1})
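Since the goal is the top 3, Counter.most_common gives that directly:
Counter(l_genres).most_common(3)
# e.g. [('Thriller', 2), ('Mystery', 2), ('Action', 1)]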
Edit: if the data type is str and not list, first use literal_eval:
from ast import literal_eval
df['genres'] = df['genres'].apply(literal_eval)
Give this a try:
c = df_imdb['genres'].apply(lambda x: [n['name'] for n in x]).sum()
pd.Series(c).value_counts()
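For just the top 3, take the head of the counts:
pd.Series(c).value_counts().head(3)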

Python merge two dictionaries and output them to csv file

I have 2 dictionaries that look like this:
subjects = {'aaa': 1,
            'bbb': 1,
            'ccc': 1}
objects = {'aaa': 1,
           'bbb': 1,
           'ccc': 1}
I want to output them to a csv file that, for each string, has the string, how many times it appears as a subject, and how many times it appears as an object.
For these 2 dictionaries I want the csv file to look like this:
aaa,1,1
bbb,1,1
ccc,1,1
You can try using pandas, it's really useful for these kinds of tasks.
>>> import pandas as pd
>>> subjects = {'aaa' : 1, 'bbb' : 1, 'ccc': 1}
>>> objects = {'aaa' : 1, 'bbb' : 1, 'ccc': 1}
>>> df1 = pd.DataFrame([subjects]).T
>>> df2 = pd.DataFrame([objects]).T
>>> pd.concat([df1,df2],axis=1).to_csv('./out.csv', header=False)
aaa,1,1
bbb,1,1
ccc,1,1
Or you can do the same without pandas (this assumes both dicts have exactly the same keys):
subjects = {'aaa': 1, 'bbb': 1, 'ccc': 1}
objects = {'aaa': 1, 'bbb': 1, 'ccc': 1}
with open('./out.csv', 'w') as f:
    for k in subjects:
        f.write(f'{k},{subjects[k]},{objects[k]}\n')
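If the two dicts might not share exactly the same keys, a small variant with dict.get and a default count of 0 (the default value is an assumption) avoids KeyErrors:
keys = sorted(set(subjects) | set(objects))
with open('./out.csv', 'w') as f:
    for k in keys:
        f.write(f'{k},{subjects.get(k, 0)},{objects.get(k, 0)}\n')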

How to convert some pyspark dataframe's column into a dict with its column name and combine them to be a json column?

I have data in the following format, and I want to reshape it with pyspark into two columns ('tag' and 'data').
The 'tag' column values are unique, and the 'data' column values are a JSON string built from the original columns 'date', 'stock', 'price' and 'num',
where 'stock' and 'price' are combined into the 'A' value and 'date' and 'num' are combined into the 'B' value.
I didn't find or manage to write good functions to achieve this.
My Spark version is 2.1.0.
original DataFrame
date, stock, price, tag, num
1388534400, GOOG, 50, a, 1
1388534400, FB, 60, b, 2
1388534400, MSFT, 55, c, 3
1388620800, GOOG, 52, d, 4
I expect the output:
new DataFrame
tag| data
'a'| "{'A':{'stock':'GOOG', 'price': 50}, 'B':{'date':1388534400, 'num':1}}"
'b'| "{'A':{'stock':'FB', 'price': 60}, 'B':{'date':1388534400, 'num':2}}"
'c'| "{'A':{'stock':'MSFT', 'price': 55}, 'B':{'date':1388534400, 'num':3}}"
'd'| "{'A':{'stock':'GOOG', 'price': 52}, 'B':{'date':1388620800, 'num':4}}"
from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map

spark = SparkSession.builder.appName("example").getOrCreate()
df = spark.createDataFrame([
    (1388534400, "GOOG", 50, 'a', 1),
    (1388534400, "FB", 60, 'b', 2),
    (1388534400, "MSFT", 55, 'c', 3),
    (1388620800, "GOOG", 52, 'd', 4)]
).toDF("date", "stock", "price", 'tag', 'num')
df.show()

tag_cols = {'A': ['stock', 'price'], 'B': ['date', 'num']}
# todo: change the DataFrame columns format
IIUC, just use pyspark.sql.functions.struct and pyspark.sql.functions.to_json (both should be available in Spark 2.1):
from pyspark.sql import functions as F

# skip df initialization
df_new = df.withColumn('A', F.struct('stock', 'price')) \
           .withColumn('B', F.struct('date', 'num')) \
           .select('tag', F.to_json(F.struct('A', 'B')).alias('data'))
>>> df_new.show(5,0)
+---+-----------------------------------------------------------------+
|tag|data |
+---+-----------------------------------------------------------------+
|a |{"A":{"stock":"GOOG","price":50},"B":{"date":1388534400,"num":1}}|
|b |{"A":{"stock":"FB","price":60},"B":{"date":1388534400,"num":2}} |
|c |{"A":{"stock":"MSFT","price":55},"B":{"date":1388534400,"num":3}}|
|d |{"A":{"stock":"GOOG","price":52},"B":{"date":1388620800,"num":4}}|
+---+-----------------------------------------------------------------+
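The same result can also be written as a single select, using aliases to name the struct fields (a sketch equivalent to the version above):
df_new = df.select(
    'tag',
    F.to_json(F.struct(
        F.struct('stock', 'price').alias('A'),
        F.struct('date', 'num').alias('B'),
    )).alias('data'),
)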
