How to convert a pandas DataFrame to YAML in python - python-3.x

import yaml
import pandas as pd
data = ['apple','car','smash','market','washing']
bata = ['natural','artificail','intelligence','outstanding','brain']
df = pd.DataFrame(zip(data,bata),columns=['Data','Bata'])
# NOTE(review): the loop-body indentation was lost in the paste, and the
# loop variable 'list' shadows the builtin list type.
for columns in df:
for list in df[columns]:
# BUG: yaml.dump_all() expects an iterable of YAML documents; passing a
# plain string iterates it character by character, which is why each
# letter is emitted as its own document.
text = yaml.dump_all(list)
print(text)
I used the code above, but each letter is printed as its own document. How can I get properly formatted YAML output? Thank you.

You can use yaml.dump to get text in yaml format
>>> import yaml
>>> import pandas as pd
>>> data = ['apple','car','smash','market','washing']
>>> bata = ['natural','artificail','intelligence','outstanding','brain']
>>> df = pd.DataFrame(zip(data,bata),columns=['Data','Bata'])
>>> text = yaml.dump(
df.reset_index().to_dict(orient='records'),
sort_keys=False, width=72, indent=4,
default_flow_style=None)
>>> text
'- {index: 0, Data: apple, Bata: natural}\n- {index: 1, Data: car, Bata: artificail}\n- {index: 2, Data: smash, Bata: intelligence}\n- {index: 3, Data: market, Bata: outstanding}\n- {index: 4, Data: washing, Bata: brain}\n'

import yaml
import pandas as pd

data = ['apple','car','smash','market','washing']
bata = ['natural','artificail','intelligence','outstanding','brain']
df = pd.DataFrame(zip(data,bata),columns=['Data','Bata'])
# Dump the whole frame as a list of records (one mapping per row).
# default_flow_style=None keeps short mappings on a single line.
# BUG FIX: removed the stray trailing backtick that made this line a
# SyntaxError.
text = yaml.dump(df.to_dict(orient='records'), default_flow_style=None)
If you want save to file your df:
# Save the row records to a YAML file under a top-level 'result' key.
# (Indentation of the with-body was lost in the original paste.)
with open('test_df_to_yaml.yaml', 'w') as file:
    documents = yaml.dump({'result': df.to_dict(orient='records')}, file, default_flow_style=False)
To read the saved YAML file back into a DataFrame:
# Read the YAML back and rebuild the DataFrame from the 'result' records.
# (Indentation of the with-body was lost in the original paste.)
with open('test_df_to_yaml.yaml', mode="rt", encoding="utf-8") as test_df_to_yaml:
    df_merged = pd.DataFrame(yaml.full_load(test_df_to_yaml)['result'])

Related

'NA' handling in python pandas

I have a dataframe with Name and Age columns. The Name column contains missing values and the literal string 'NA'. When I read the file using pd.read_excel, both the missing values and 'NA' become NaN. How can I avoid this?
this is my code
import pandas as pd
# Both '' and the literal string 'NA' appear in the Name column; both are in
# pandas' default NA-value list, so read_excel converts them to NaN.
data = {'Name':['Tom', '', 'NA','', 'Ricky',"NA",''],'Age':[28,34,29,42,35,33,40]}
df = pd.DataFrame(data)
df.to_excel("test1.xlsx",sheet_name="test")
import pandas as pd
# Reading the file back without keep_default_na=False reproduces the issue.
data=pd.read_excel("./test1.xlsx")
To avoid this, just set the keep_default_na to False:
# keep_default_na=False stops read_excel from mapping '' / 'NA' to NaN.
df = pd.read_excel('test1.xlsx', keep_default_na=False)

Write second Header and Merge Cells using Pandas

All,
I have written a script to write a header and data into an Excel sheet. But my actual requirement is to also write a sub-header, and to merge the cells in the 2nd row.
import xlsxwriter
import pandas as pd
import numpy as np
import openpyxl
import time
# Creating a dataframe
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
# Iterating a DataFrame yields its column labels, so this acts as ['A','B','C'].
column_list = df
# Create a Pandas Excel writer using XlsxWriter engine.
writer = pd.ExcelWriter("test.xlsx", engine='xlsxwriter')
# startrow=2 leaves rows 0-1 free for the header and the planned sub-header.
df.to_excel(writer, sheet_name='Sheet1', startrow=2, header=False, index=False)
# Get workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Sheet1']
header_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 12, 'color' : 'white', 'fg_color': '#00007f','bold': True, 'border' : 1})
# NOTE(review): loop-body indentation was lost in the paste.
for idx, val in enumerate(column_list):
worksheet.write(0, idx, val, header_fmt)
# Writes 'Sample' into a single cell only; merging across row 2 is the
# open question here.
worksheet.write(1, 1, 'Sample', header_fmt)
font_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 9 })
worksheet.set_column('A:C', None, font_fmt)
worksheet.set_row(0, None, header_fmt)
writer.save()
EDIT:
Expected Output:
There are 4 sections in the expected output, all of them are from different Dataframes. I need to merge all those Dataframes' output into a single sheet as shown in the image.
How about this? Is this what you want?
import xlsxwriter
import pandas as pd
import numpy as np
import openpyxl
import time

# Creating a dataframe
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
# Iterating a DataFrame yields its column labels, i.e. ['A', 'B', 'C'].
column_list = df

# Create a Pandas Excel writer using XlsxWriter engine.
writer = pd.ExcelWriter("test.xlsx", engine='xlsxwriter')
# startrow=2 leaves rows 0-1 free for the header and the merged sub-header.
df.to_excel(writer, sheet_name='Sheet1', startrow=2, header=False, index=False)

# Get workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Sheet1']

header_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 12, 'color' : 'white', 'fg_color': '#00007f','bold': True, 'border' : 1})
merge_format = workbook.add_format({'align': 'center'})
# Merge A2:C2 into a single centred 'Sample' sub-header cell.
worksheet.merge_range('A2:C2', 'Sample', merge_format)

# Write the column names into row 0 (indentation restored here).
for idx, val in enumerate(column_list):
    worksheet.write(0, idx, val, header_fmt)

font_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 9 })
worksheet.set_column('A:C', None, font_fmt)
worksheet.set_row(0, None, header_fmt)
# close() saves the file and works on all pandas versions;
# ExcelWriter.save() was removed in pandas 2.0.
writer.close()

Read a df, split each cell and append to a list

I am working on a csv file that has multiple columns.
The file looks something like this...
A,B,C
1,'x;y;z','e;f;g'
2,'w;x;y','r;s;t'
3,'','p;q;r'
Each cell in the file has a string that is separated by ";".
I want to create a single list by reading each cell and splitting each cell based on the separator.
I have been able to do this but there are performance issues.
The csv file is huge and so I am looking for an optimized version.
The columns names are known upfront. My code is given below
My current solution is
Make a list reading all the rows from each column
Flatten the list
split the items in list if the item is string,append to a new list
remove duplicates from the list
import pandas as pd
from io import StringIO
# NOTE(review): collections.Iterable was removed in Python 3.10+; use
# collections.abc.Iterable instead.
from collections import Iterable
import operator
csv_path ='my_dir'
# load the data with pd.read_csv
dataDF = pd.read_csv(csv_path)
# NOTE(review): fillna without inplace=True returns a new frame; this call
# discards its result, so NaN values survive.
dataDF.fillna(" ")
result=[]
cols=['A','B','C']
# NOTE(review): loop indentation was lost in the paste, and reduce() must be
# imported on Python 3 ('from functools import reduce') or this raises
# NameError.
for i in cols:
result.append(dataDF[i].tolist())
result=reduce(operator.concat, result)
print(result)
my_list=[]
for token in result:
if isinstance(token, str):
my_list.append(token.split(";"))
my_list=reduce(operator.concat, my_list)
my_list=list(set(my_list))
If you have many repeated values, this will probably go faster.
from itertools import chain

# load the data with pd.read_csv
dataDF = pd.DataFrame({'A': [1, 2, 3], 'B': ['x;y;z', 'w;x;y', ''], 'C': ['e;f;g', 'r;s;t', 'p;q;r']})
dataDF.fillna(" ", inplace=True)

# Collect every ';'-separated token exactly once: updating a set as we go
# avoids materialising the full flattened list first.
results_set = set()
for i in dataDF.columns:
    try:
        results_set.update(chain(*dataDF[i].str.split(';').values))
    except AttributeError:
        # Non-string columns (e.g. the numeric 'A') have no .str accessor.
        pass
print(results_set)
Try this one:
from itertools import chain

# load the data with pd.read_csv
dataDF = pd.DataFrame({'A': [1, 2, 3], 'B': ['x;y;z', 'w;x;y', ''], 'C': ['e;f;g', 'r;s;t', 'p;q;r']})
dataDF.fillna(" ", inplace=True)

# Gather the per-cell token lists column by column, then flatten and
# de-duplicate them in a single pass at the end.
list_of_lists = []
for i in dataDF.columns:
    try:
        list_of_lists.extend(dataDF[i].str.split(';').values)
    except AttributeError:
        # Numeric columns have no .str accessor; skip them.
        pass
print(set(chain(*list_of_lists)))

How to create Excel **Table** with pandas.to_excel()?

Need the achieve this programmatically from a dataframe:
https://learn.microsoft.com/en-us/power-bi/service-admin-troubleshoot-excel-workbook-data
Here is one way to do it using XlsxWriter:
import pandas as pd

# Create a Pandas dataframe from some data.
data = [10, 20, 30, 40, 50, 60, 70, 80]
df = pd.DataFrame({'Rank': data,
                   'Country': data,
                   'Population': data,
                   'Data1': data,
                   'Data2': data})

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter("pandas_table.xlsx", engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object. Turn off the default
# header and index and skip one row to allow us to insert a user defined
# header.
df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)

# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']

# Get the dimensions of the dataframe.
(max_row, max_col) = df.shape

# Create a list of column headers, to use in add_table().
column_settings = [{'header': header} for header in df.columns]

# Add the table.
worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})

# Make the columns wider for clarity.
worksheet.set_column(0, max_col - 1, 12)

# Close the Pandas Excel writer and output the Excel file.
# close() also saves; ExcelWriter.save() was removed in pandas 2.0.
writer.close()
Output:
Update: I've added a similar example to the XlsxWriter docs: Example: Pandas Excel output with a worksheet table
You can't do it with to_excel. A workaround is to open the generated xlsx file and add the table there with openpyxl:
import pandas as pd

df = pd.DataFrame({'Col1': [1,2,3], 'Col2': list('abc')})
filename = 'so58326392.xlsx'
sheetname = 'mySheet'

# Indentation of the with-body was lost in the original paste.
with pd.ExcelWriter(filename) as writer:
    # An un-named index would leave cell A1 empty, which corrupts the
    # table; give it a name so every header cell is a string.
    if not df.index.name:
        df.index.name = 'Index'
    df.to_excel(writer, sheet_name=sheetname)

import openpyxl
wb = openpyxl.load_workbook(filename = filename)
# ref covers the header row plus one row per dataframe row.
# NOTE(review): with the index exported the sheet actually has
# df.shape[1] + 1 columns; verify the ref also spans the index column.
tab = openpyxl.worksheet.table.Table(displayName="df", ref=f'A1:{openpyxl.utils.get_column_letter(df.shape[1])}{len(df)+1}')
wb[sheetname].add_table(tab)
wb.save(filename)
Please note that all table headers must be strings. If you have an un-named index (which is the rule), the first cell (A1) will be empty, which leads to file corruption. To avoid this, give your index a name (as shown above) or export the dataframe without the index using:
df.to_excel(writer, sheet_name=sheetname, index=False)
Another workaround, if you don't want to save, re-open, and re-save, is to use xlsxwriter. It can write ListObject tables directly, but does not do so directly from a dataframe, so you need to break out the parts:
import pandas as pd
import xlsxwriter as xl

df = pd.DataFrame({'Col1': [1,2,3], 'Col2': list('abc')})

filename = 'output.xlsx'
sheetname = 'Table'
tablename = 'TEST'

(rows, cols) = df.shape
# to_dict('split') returns {'index': ..., 'columns': ..., 'data': ...};
# only the cell values (a list of row-lists) are needed here.
data = df.to_dict('split')['data']
# add_table() wants one {'header': name} dict per column.
headers = [{'header': col} for col in df.columns]

wb = xl.Workbook(filename)
ws = wb.add_worksheet()
# Write the ListObject table directly; row 0 holds the headers, so the
# table spans rows 0..rows inclusive.
ws.add_table(0, 0, rows, cols - 1,
             {'name': tablename,
              'data': data,
              'columns': headers})
wb.close()
The add_table() function expects 'data' as a list of lists, where each sublist represents a row of the dataframe, and 'columns' as a list of dicts for the header where each column is specified by a dictionary of the form {'header': 'ColumnName'}.
I created a package to write properly formatted excel tables from pandas: pandas-xlsx-tables
# pandas-xlsx-tables is a third-party helper (pip install pandas-xlsx-tables).
from pandas_xlsx_tables import df_to_xlsx_table
import pandas as pd
data = [10, 20, 30, 40, 50, 60, 70, 80]
df = pd.DataFrame({'Rank': data,
'Country': data,
'Population': data,
'Strings': [f"n{n}" for n in data],
'Datetimes': [pd.Timestamp.now() for _ in range(len(data))]})
# Writes my_table.xlsx containing the frame as a named Excel table.
df_to_xlsx_table(df, "my_table", index=False, header_orientation="diagonal")
You can also do the reverse with xlsx_table_to_df
Based on the answer of #jmcnamara, but as a convenient function and using "with" statement:
import pandas as pd

def to_excel(df:pd.DataFrame, excel_name: str, sheet_name: str, startrow=1, startcol=0):
    """Export a pandas DataFrame as a formatted Excel table.

    The frame is written starting at (startrow, startcol) and an XlsxWriter
    table (ListObject) is laid over the same range so Excel shows it with
    banded rows and filter buttons.
    """
    with pd.ExcelWriter(excel_name, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow, startcol=startcol, header=True, index=False)
        workbook = writer.book
        worksheet = writer.sheets[sheet_name]
        max_row, max_col = df.shape
        # BUG FIX: the original assigned to 'olumn_settings' (typo) and then
        # referenced 'column_settings' below, raising NameError at runtime.
        column_settings = [{'header': header} for header in df.columns]
        worksheet.add_table(startrow, startcol, max_row+startrow, max_col+startcol-1, {'columns': column_settings})
        # style columns
        worksheet.set_column(startcol, max_col + startcol, 21)

Groupby and transpose or unstack in Pandas

I have the following Python pandas dataframe:
There are more EventName's than shown on this date.
Each will have Race_Number = 'Race 1', 'Race 2', etc.
After a while the date increments.
.
I'm trying to create a dataframe that looks like this:
Each race has different numbers of runners.
Is there a way to do this in pandas ?
Thanks
I assumed output would be another DataFrame.
import pandas as pd
import numpy as np
import copy

df = pd.DataFrame({'EventName': ['sydney', 'sydney', 'sydney', 'sydney', 'sydney', 'sydney'],
                   'Date': ['2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01'],
                   'Race_Number': ['Race1', 'Race1', 'Race1', 'Race2', 'Race2', 'Race3'],
                   'Number': [4, 7, 2, 9, 5, 10]
                   })
print(df)

# Group the runner numbers by race. The first number seen for a race is
# stored as a scalar; later numbers extend it into a flat list. This
# reproduces the behaviour of nltk's flatten() without the third-party
# dependency.
dic = {}
for rows in df.itertuples():
    if rows.Race_Number in dic:
        current = dic[rows.Race_Number]
        if not isinstance(current, list):
            current = [current]
        dic[rows.Race_Number] = current + [rows.Number]
    else:
        dic[rows.Race_Number] = rows.Number

# Re-key the dict with 0..n-1 so the output columns are positional.
copy_dic = copy.deepcopy(dic)
seq = np.arange(0, len(dic.keys()))
for key, n_key in zip(copy_dic, seq):
    dic[n_key] = dic.pop(key)

df = pd.DataFrame([dic])
print(df)

Resources