Write second Header and Merge Cells using Pandas - python-3.x

All,
I have written a script that writes a header and data into an Excel sheet. However, my actual requirement is to also write a sub header and to merge the cells of the 2nd row.
import xlsxwriter
import pandas as pd
import numpy as np
import openpyxl
import time

# Build a 10x3 frame of random values with columns A, B and C.
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
column_list = list(df.columns)

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter("test.xlsx", engine='xlsxwriter') as writer:
    # Data starts at row 2 so rows 0 and 1 stay free for the two header rows.
    df.to_excel(writer, sheet_name='Sheet1', startrow=2, header=False, index=False)

    # Get workbook and worksheet objects
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    header_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 12, 'color' : 'white', 'fg_color': '#00007f','bold': True, 'border' : 1})
    for idx, val in enumerate(column_list):
        worksheet.write(0, idx, val, header_fmt)

    # Sub header: written into the single cell B2 only (no merge happens here).
    worksheet.write(1, 1, 'Sample', header_fmt)

    font_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 9 })
    worksheet.set_column('A:C', None, font_fmt)
    worksheet.set_row(0, None, header_fmt)
EDIT:
Expected Output:
There are 4 sections in the expected output, all of them are from different Dataframes. I need to merge all those Dataframes' output into a single sheet as shown in the image.

How about this? Is this what you want?
import xlsxwriter
import pandas as pd
import numpy as np
import openpyxl
import time

# Build a 10x3 frame of random values with columns A, B and C.
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
column_list = list(df.columns)
last_col = len(column_list) - 1  # zero-based index of the right-most data column

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter("test.xlsx", engine='xlsxwriter') as writer:
    # Data starts at row 2 so rows 0 and 1 stay free for the two header rows.
    df.to_excel(writer, sheet_name='Sheet1', startrow=2, header=False, index=False)

    # Get workbook and worksheet objects
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    header_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 12, 'color' : 'white', 'fg_color': '#00007f','bold': True, 'border' : 1})
    merge_format = workbook.add_format({'align': 'center'})

    # Sub header: one merged cell on row 1 spanning every data column
    # (A2:C2 for the three-column frame above).
    worksheet.merge_range(1, 0, 1, last_col, 'Sample', merge_format)

    for idx, val in enumerate(column_list):
        worksheet.write(0, idx, val, header_fmt)

    font_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 9 })
    worksheet.set_column(0, last_col, None, font_fmt)
    worksheet.set_row(0, None, header_fmt)

Related

Highlighting excel cells based on the cell values using pandas dataframe and xlsxwriter

I have a csv file. After doing certain process, it has to be saved as an excel file.
I am opening it as a pandas dataframe and, after doing some cleaning (renaming and rearranging columns, dropping a few columns), I have to replace null values, or cells whose value is "N/A", with "DN". Currently I am using the following lines of code for this.
# Turn both placeholders ('' and 'N/A') into NaN in place, then label every
# missing cell with "DN".
for placeholder in ('', 'N/A'):
    df.replace(placeholder, np.nan, inplace=True)
df = df.fillna("DN")
Then, i have to highlight cells which has the value "DN" with yellow color
I am trying the code mentioned in the post "How Do I Highlight Rows Of Data? Python Pandas issue", but nothing gets highlighted in the output Excel file. Below is the code I am currently working with.
# Turn both placeholders ('' and 'N/A') into NaN in place, then label every
# missing cell with "NA".
for placeholder in ('', 'N/A'):
    df.replace(placeholder, np.nan, inplace=True)
df = df.fillna("NA")

# Number the rows from 1 so the exported index can act as a serial number.
df.index = np.arange(1, len(df) + 1)


def high_color(val):
    """Return the CSS ``color`` declaration for one cell value."""
    if val == 'NA':
        return 'color: yellow'
    return 'color: '


result = df.style.applymap(high_color)

# NOTE: `result` is not used below; the plain `df` is what gets written.
writer_orig = pd.ExcelWriter(out_name, engine='xlsxwriter')
df.to_excel(writer_orig, sheet_name='report', index=True, index_label="S_No", freeze_panes=(1,1))
workbook = writer_orig.book
worksheet = writer_orig.sheets['report']

# Add a header format.
header_format = workbook.add_format({
    'bold': True,
    'fg_color': '#ffcccc',
    'border': 1})

# Column 0 holds the index label, so the data headers start at column 1.
for col_num, value in enumerate(df.columns.values, start=1):
    worksheet.write(0, col_num, value, header_format)

writer_orig.close()
Any kind of suggestions will be greatly helpful.
You can't save a Styler Object to an Excel spreadsheet by using pandas.ExcelWriter.
class pandas.ExcelWriter(path, engine=None, date_format=None,
datetime_format=None, mode='w', storage_options=None,
if_sheet_exists=None, engine_kwargs=None, **kwargs)
Class for writing DataFrame objects into excel sheets.
You need to use worksheet.conditional_format from xlsxwriter to highlight a value in every cell. Also, you can pass na_values as a kwarg to pandas.read_csv to automatically consider a list of values as NaN.
from xlsxwriter.utility import xl_col_to_name

# Treat empty strings and "N/A" as missing while reading, then mark every
# missing value with the "DN" placeholder.
df = pd.read_csv('/tmp/inputfile.csv', na_values=['', 'N/A']).fillna('DN')
max_row, max_col = df.shape

# The index is written to column A, so the data occupies zero-based columns
# 1..max_col and, below the header row, Excel rows 2..max_row + 1.
first_col = xl_col_to_name(1)
last_col = xl_col_to_name(max_col)
data_range = f'{first_col}2:{last_col}{max_row + 1}'

# add_format() and conditional_format() are XlsxWriter APIs, so request that
# engine explicitly instead of relying on the installed default.
with pd.ExcelWriter("/tmp/outputfile.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='report', index=True,
                index_label='S_No', freeze_panes=(1,1))
    wb = writer.book
    ws = writer.sheets['report']

    # Re-write the header row with the custom header format.
    format_header = wb.add_format({'bold': True, 'fg_color': '#ffcccc', 'border': 1})
    for idx, col in enumerate(['S_No'] + list(df.columns)):
        ws.write(0, idx, col, format_header)

    # Highlight every data cell whose value equals "DN".
    format_dn = wb.add_format({'bg_color':'yellow', 'font_color': 'black'})
    ws.conditional_format(data_range,
                          {'type': 'cell', 'criteria': '==',
                           'value': '"DN"', 'format': format_dn})
Output :
You have to export to excel with result Styler:
# Demo
def high_color(val):
    """Return a yellow background declaration for 'NA' cells, else None."""
    if val == 'NA':
        return 'background-color: yellow'
    return None


result = df.style.applymap(high_color)

# Write the styled frame and the plain frame to separate files for comparison.
result.to_excel('styler1.xlsx')
df.to_excel('styler2.xlsx')
Export from result
Export from df

Need to delete a couple of characters from one column in Pandas Dataframe

I have tried various things to remove just the Call, [, ], and ' from column D. What am I missing?
I've tried:
.str.replace
df.Required_no_Email.replace("\(", 'xxx', regex=True)
df.Required_no_Email.replace('\(|\)', '', regex=True)
df.Required_no_Email.str.strip('()')
and quite a few others, but I have lost track of what else I've tried.
Here's the script
from bs4 import BeautifulSoup # BeautifulSoup is in bs4 package
import requests
import re
import pandas as pd

URL = 'https://reallyfrustrated.com'
content = requests.get(URL)
soup = BeautifulSoup(content.text, 'html.parser')

# Company name comes from the page title; phone entries are every text node
# matching "Call ...".
business = soup.find('title')
companys = business.get_text()
phones = soup.find_all(text=re.compile("Call (.*)"))

# One row; `phones` (the whole find_all result) is stored in a single cell.
data = {'Required':[companys], 'Required_no_Email':[phones]}
df = pd.DataFrame(data, columns = ['Required','First', 'Last', 'Required_no_Email', 'Business Fax'])

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter("ProspectUploadSheetRob.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False, startrow=4, header=3)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    header_format = workbook.add_format({
        'bold': False,
        'text_wrap': False,
        'valign': 'top',
        'fg_color': False,
        'border': False})

    # Overwrite the header cells on row 4 with the plain header format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(4, col_num, value, header_format)

    # NOTE: the result of this call is not assigned, and it runs after
    # to_excel() has already written the frame.
    df.Required_no_Email.str.strip('()')
@RobK, the code below works as a one-liner for whatever characters you want to replace. You were pretty close in one of your attempts above: passing regex=True, escaping the special characters with \, and separating the alternatives with |. You just need to remember to assign the result back with `df.Required_no_Email = ...`.
import pandas as pd

df = pd.DataFrame({'Required' : ['CTC Landscaping'],
                   'Required_no_Email' : ['''['Call (123) 456-7890']''']})

# Remove every [, ], ', ( and ) from the column. A raw string keeps the
# backslashes for the regex engine, and the result has to be assigned back
# because replace() returns a new Series.
df.Required_no_Email = df.Required_no_Email.replace(r"[\[\]'()]", '', regex=True)
df
#RobK you said your code was not changing anything. My guess is that you placed the replace part of your script after creating the writer object. It is working for me below. I created my own dataframe in place of the beautiful soup portion of your code, and it worked perfectly. I will also attach a screenshot:
import pandas as pd

df = pd.DataFrame({'Required' : ['CTC Landscaping'],
                   'Required_no_Email' : ['''['Call (123) 456-7890']''']})

# Clean the column BEFORE exporting: remove every [, ], ', ( and ). A raw
# string keeps the backslashes for the regex engine, and the result has to be
# assigned back because replace() returns a new Series.
df.Required_no_Email = df.Required_no_Email.replace(r"[\[\]'()]", '', regex=True)

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter("ProspectUploadSheetRob.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False, startrow=4, header=3)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    header_format = workbook.add_format({
        'bold': False,
        'text_wrap': False,
        'valign': 'top',
        'fg_color': False,
        'border': False})

    # Overwrite the header cells on row 4 with the plain header format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(4, col_num, value, header_format)

How to create Excel **Table** with pandas.to_excel()?

I need to achieve this programmatically from a dataframe:
https://learn.microsoft.com/en-us/power-bi/service-admin-troubleshoot-excel-workbook-data
Here is one way to do it using XlsxWriter:
import pandas as pd

# Create a Pandas dataframe from some data.
data = [10, 20, 30, 40, 50, 60, 70, 80]
df = pd.DataFrame({'Rank': data,
                   'Country': data,
                   'Population': data,
                   'Data1': data,
                   'Data2': data})

# Create a Pandas Excel writer using XlsxWriter as the engine. The context
# manager closes (and therefore saves) the workbook on exit; ExcelWriter.save()
# no longer exists in pandas 2.0+.
with pd.ExcelWriter("pandas_table.xlsx", engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object. Turn off the default
    # header and index and skip one row to allow us to insert a user defined
    # header.
    df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Get the dimensions of the dataframe.
    (max_row, max_col) = df.shape

    # Create a list of column headers, to use in add_table().
    column_settings = [{'header': header} for header in df.columns]

    # Add the table: row 0 is the table header, rows 1..max_row hold the data.
    worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})

    # Make the columns wider for clarity.
    worksheet.set_column(0, max_col - 1, 12)
Output:
Update: I've added a similar example to the XlsxWriter docs: Example: Pandas Excel output with a worksheet table
You can't do it with to_excel. A workaround is to open the generated xlsx file and add the table there with openpyxl:
import openpyxl
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.table import Table

df = pd.DataFrame({'Col1': [1,2,3], 'Col2': list('abc')})
filename = 'so58326392.xlsx'
sheetname = 'mySheet'

# The index is exported as the first column and its header cell must not be
# empty, so give an unnamed index a name before writing.
if not df.index.name:
    df.index.name = 'Index'
with pd.ExcelWriter(filename) as writer:
    df.to_excel(writer, sheet_name=sheetname)

# The sheet holds the index column plus df.shape[1] data columns, and one
# header row plus len(df) data rows; the table reference must cover all of it.
last_column = get_column_letter(df.shape[1] + 1)
table_ref = f'A1:{last_column}{len(df)+1}'

# Re-open the written file and register the range as a table.
wb = openpyxl.load_workbook(filename = filename)
tab = Table(displayName="df", ref=table_ref)
wb[sheetname].add_table(tab)
wb.save(filename)
Please note that all table headers must be strings. If you have an unnamed index (which is the default), the first cell (A1) will be empty, which leads to file corruption. To avoid this, give your index a name (as shown above) or export the dataframe without the index using:
# index=False leaves the index out of the sheet, so A1 holds the first column's header.
df.to_excel(writer, sheet_name=sheetname, index=False)
Another workaround, if you don't want to save, re-open, and re-save, is to use xlsxwriter. It can write ListObject tables directly, but does not do so directly from a dataframe, so you need to break out the parts:
import pandas as pd
import xlsxwriter as xl

df = pd.DataFrame({'Col1': [1,2,3], 'Col2': list('abc')})
filename = 'output.xlsx'
sheetname = 'Table'  # NOTE: defined but not passed to add_worksheet() below
tablename = 'TEST'

rows, cols = df.shape

# add_table() takes the body as a list of row lists and the header as a list
# of {'header': name} dicts.
data = df.to_dict('split')['data']
headers = [{'header': col} for col in df.columns]
table_options = {'name': tablename, 'data': data, 'columns': headers}

wb = xl.Workbook(filename)
ws = wb.add_worksheet()
# Row 0 is the header row; rows 1..rows hold the data.
ws.add_table(0, 0, rows, cols-1, table_options)
wb.close()
The add_table() function expects 'data' as a list of lists, where each sublist represents a row of the dataframe, and 'columns' as a list of dicts for the header where each column is specified by a dictionary of the form {'header': 'ColumnName'}.
I created a package to write properly formatted excel tables from pandas: pandas-xlsx-tables
from pandas_xlsx_tables import df_to_xlsx_table
import pandas as pd

data = [10, 20, 30, 40, 50, 60, 70, 80]

# Three numeric columns sharing the same values, one string column and one
# datetime column (a fresh timestamp per row).
columns = {
    'Rank': data,
    'Country': data,
    'Population': data,
    'Strings': [f"n{value}" for value in data],
    'Datetimes': [pd.Timestamp.now() for _ in data],
}
df = pd.DataFrame(columns)

df_to_xlsx_table(df, "my_table", index=False, header_orientation="diagonal")
You can also do the reverse with xlsx_table_to_df
Based on the answer of @jmcnamara, but as a convenient function and using a "with" statement:
import pandas as pd


def to_excel(df: pd.DataFrame, excel_name: str, sheet_name: str, startrow=1, startcol=0):
    """Export a pandas DataFrame as a formatted Excel table.

    Args:
        df: Frame to export; the index is not written.
        excel_name: Path of the workbook to create.
        sheet_name: Name of the worksheet that receives the table.
        startrow: Zero-based row of the table's header row.
        startcol: Zero-based column of the table's first column.
    """
    with pd.ExcelWriter(excel_name, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow, startcol=startcol, header=True, index=False)
        worksheet = writer.sheets[sheet_name]

        max_row, max_col = df.shape
        if max_col == 0:
            # Nothing to turn into a table; leave the (empty) sheet as written.
            return

        # Table headers have to be strings, whatever the column labels are.
        column_settings = [{'header': str(header)} for header in df.columns]

        # The header sits on `startrow`; a table needs at least one data row
        # below it, even when the frame itself has no rows.
        last_row = startrow + max(max_row, 1)
        last_col = startcol + max_col - 1
        worksheet.add_table(startrow, startcol, last_row, last_col, {'columns': column_settings})

        # style columns (only the ones that belong to the table)
        worksheet.set_column(startcol, last_col, 21)

Groupby and transpose or unstack in Pandas

I have the following Python pandas dataframe:
There are more EventName's than shown on this date.
Each will have Race_Number = 'Race 1', 'Race 2', etc.
After a while the date increments.
.
I'm trying to create a dataframe that looks like this:
Each race has different numbers of runners.
Is there a way to do this in pandas ?
Thanks
I assumed output would be another DataFrame.
import pandas as pd
import numpy as np
import copy

df = pd.DataFrame({'EventName': ['sydney', 'sydney', 'sydney', 'sydney', 'sydney', 'sydney'],
                   'Date': ['2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01', '2019-01.01'],
                   'Race_Number': ['Race1', 'Race1', 'Race1', 'Race2', 'Race2', 'Race3'],
                   'Number': [4, 7, 2, 9, 5, 10]
                   })
print(df)

# Collect the runner numbers of each race into a list. sort=False keeps the
# races in order of first appearance, and every race (including one with a
# single runner) ends up as a list.
runners_by_race = df.groupby('Race_Number', sort=False)['Number'].agg(list)

# One wide row: column i holds the runner list of the i-th race.
df = pd.DataFrame([dict(enumerate(runners_by_race))])
print(df)

Not able to unlock cell with custom value using pd.xlsxwriter

I have a dataframe as shown in the below code. I just want to lock the header(top row) and let the user change rest of the cells. Based on the code below, it does lock the header and enable me to change the value of all the columns except for the "Date" column. I cannot change the value of date column. It should allow me to change the value of the date column too
import pandas as pd

df = pd.DataFrame({'Data1': [10, 20, 30],
                   'Data2': [11, 21, 31],
                   'Date': ["",
                            "",
                            pd.to_datetime('today')]})

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter('pandas_filter.xlsx', engine='xlsxwriter', ) as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    unlocked = workbook.add_format({'locked': False})
    locked = workbook.add_format({'locked': True})

    # Turn on sheet protection, then apply the unlocked format as the row
    # format of rows 1..149 (everything below the header row).
    worksheet.protect()
    for row in range(1, 150):
        worksheet.set_row(row, None, unlocked)
In XlsxWriter a cell format overrides a row format which overrides a column format.
The reason that the datetime cells aren't unlocked is due to the fact that Pandas applies a cell format to those cells (for the date format) and thus the row format is ignored/overridden.
The only way to avoid this would be to write (or overwrite) the date cell data separately from the other data frame data and apply an unlocked date format. Something like this:
import pandas as pd

df = pd.DataFrame({'Data1': [10, 20, 30],
                   'Data2': [11, 21, 31],
                   'Date': [pd.to_datetime('today'),
                            pd.to_datetime('today'),
                            pd.to_datetime('today')]})

# The context manager closes (and therefore saves) the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter('pandas_filter.xlsx', engine='xlsxwriter', ) as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    unlocked = workbook.add_format({'locked': False})
    locked = workbook.add_format({'locked': True})

    # Write the 'Date' column data again, this time with a cell format that
    # is both a date format and unlocked, so it replaces the cells written
    # by to_excel().
    worksheet.set_column('C:C', 20)
    unlocked_date_format = workbook.add_format({'num_format': 'yyyy-mm-dd',
                                                'locked': False})
    worksheet.write_column('C2', df['Date'], unlocked_date_format)

    # Turn on sheet protection, then apply the unlocked format as the row
    # format of rows 1..149 (everything below the header row).
    worksheet.protect()
    for row in range(1, 150):
        worksheet.set_row(row, None, unlocked)
Output, after modification:

Resources