Wrap text for column of dataframe in pandas - python-3.x

I am trying to wrap text in pandas DataFrame columns when exporting to Excel, but this code only wraps the values in the columns and not the column headers.
I am using the code below (taken from Stack Overflow). Please suggest how to wrap the header of the DataFrame as well.
long_text = 'aa aa ss df fff ggh ttr tre ww rr tt ww errr t ttyyy eewww rr55t e'
data = {'a':[long_text, long_text, 'a'],'c': [long_text,long_text,long_text],
'b':[1,2,3]}
df = pd.DataFrame(data)
#choose columns of df for wrapping
cols_for_wrap = ['a','c']
writer = pd.ExcelWriter('aaa.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False)
#modifyng output by style - wrap
workbook = writer.book
worksheet = writer.sheets['Sheet1']
wrap_format = workbook.add_format({'text_wrap': True})
#get positions of columns
for col in df.columns.get_indexer(cols_for_wrap):
#map by dict to format like "A:A"
excel_header = d[col] + ':' + d[col]
#None means not set with
worksheet.set_column(excel_header, None, wrap_format)
#for with = 20
worksheet.set_column(excel_header, 10, wrap_format)
writer.save()

In the header_format piece that jmcnamara linked, you can add or remove any formats you want or do not want.
# Format applied to every header cell; add or drop keys to taste.
header_format = workbook.add_format({
    'bold': True,
    'text_wrap': True,
    'valign': 'top',
    'fg_color': '#D7E4BC',
    'border': 1})
# Write the column headers with the defined format.
# NOTE(review): the "+ 1" offset skips a leading index column; drop it if the
# frame was written with index=False.
for col_num, value in enumerate(df.columns.values):
    worksheet.write(0, col_num + 1, value, header_format)
My code looks like this:
# Header cells only need wrapping here; add more keys for extra styling.
h_format = workbook.add_format({'text_wrap': True})
...
...
...
# No "+ 1" offset: this assumes the frame was written with index=False, so
# header i sits in sheet column i.
for col_num, value in enumerate(df_new.columns.values):
    # Pass the format object defined above, not the builtin ``format``.
    worksheet.write(0, col_num, value, h_format)
# ExcelWriter.save() was removed in pandas 2.0; close() writes the file.
writer.close()

This is covered almost exactly in the Formatting of the Dataframe headers section of the XlsxWriter docs.

Related

Highlighting excel cells based on the cell values using pandas dataframe and xlsxwriter

I have a csv file. After doing certain process, it has to be saved as an excel file.
I open it as a pandas DataFrame and, after some cleaning (renaming and rearranging columns, dropping a few columns), I have to replace null values and cells whose value is "N/A" with "DN". Currently I am using the following lines of code for this.
df.replace('', np.nan, inplace = True)
df.replace('N/A', np.nan, inplace = True)
df = df.fillna("DN")
Then I have to highlight the cells which have the value "DN" in yellow.
I am trying the code mentioned in the post "How Do I Highlight Rows Of Data? Python Pandas issue", but nothing is highlighted in the output Excel file. Below is the code I am currently working with:
df.replace('', np.nan, inplace = True)
df.replace('N/A', np.nan, inplace = True)
df = df.fillna("NA")
df.index = np.arange(1, len(df) + 1)
def high_color(val):
color = 'yellow' if val == 'NA' else ''
return 'color: {}'.format(color)
result = df.style.applymap(high_color)
writer_orig = pd.ExcelWriter(out_name, engine='xlsxwriter')
df.to_excel(writer_orig, sheet_name='report', index=True, index_label="S_No", freeze_panes=(1,1))
workbook = writer_orig.book
worksheet = writer_orig.sheets['report']
# Add a header format.
header_format = workbook.add_format({
'bold': True,
'fg_color': '#ffcccc',
'border': 1})
for col_num, value in enumerate(df.columns.values):
worksheet.write(0, col_num + 1, value, header_format)
writer_orig.close()
Any kind of suggestions will be greatly helpful.
You can't save a Styler Object to an Excel spreadsheet by using pandas.ExcelWriter.
class pandas.ExcelWriter(path, engine=None, date_format=None,
datetime_format=None, mode='w', storage_options=None,
if_sheet_exists=None, engine_kwargs=None, **kwargs)
Class for writing DataFrame objects into excel sheets.
You need to use worksheet.conditional_format from xlsxwriter to highlight a value in every cell. Also, you can pass na_values as a kwarg to pandas.read_csv to automatically consider a list of values as NaN.
# xl_col_to_name converts a zero-based column index into an Excel column letter.
from xlsxwriter.utility import xl_col_to_name

# Treat blank and "N/A" cells as missing on load, then mark them all as "DN".
df = pd.read_csv('/tmp/inputfile.csv', na_values=['', 'N/A']).fillna('DN')
max_row, max_col = df.shape
# Sheet column 0 holds the S_No index, so the data spans columns 1..max_col.
first_col = xl_col_to_name(1)
last_col = xl_col_to_name(max_col)

# add_format() and conditional_format() are XlsxWriter APIs, so pin the engine
# instead of relying on whichever xlsx engine pandas picks by default.
with pd.ExcelWriter("/tmp/outputfile.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='report', index=True,
                index_label='S_No', freeze_panes=(1, 1))
    wb = writer.book
    ws = writer.sheets['report']
    # Re-write the header row (index label included) with the custom format.
    format_header = wb.add_format({'bold': True, 'fg_color': '#ffcccc', 'border': 1})
    for idx, col in enumerate(['S_No'] + list(df.columns)):
        ws.write(0, idx, col, format_header)
    format_dn = wb.add_format({'bg_color': 'yellow', 'font_color': 'black'})
    # Data starts on Excel row 2 (row 1 is the header) and ends on max_row + 1.
    ws.conditional_format(f'{first_col}2:{last_col}{max_row + 1}',
                          {'type': 'cell', 'criteria': '==',
                           'value': '"DN"', 'format': format_dn})
Output :
You have to export to excel with result Styler:
# Demo
def high_color(val):
    """Return the CSS for one cell: yellow background for 'NA', else no style."""
    return 'background-color: yellow' if val == 'NA' else None
result = df.style.applymap(high_color)
result.to_excel('styler1.xlsx')
df.to_excel('styler2.xlsx')
Export from result
Export from df

Need to delete a couple of characters from one column in Pandas Dataframe

I have tried various things to remove just the Call, [, ] and ' characters from column D. What am I missing?
I've tried:
.str.replace
df.Required_no_Email.replace("\(", 'xxx', regex=True)
df.Required_no_Email.replace('\(|\)', '', regex=True)
df.Required_no_Email.str.strip('()')
and quite a few others, but I have lost track of what else I've tried.
Here's the script
from bs4 import BeautifulSoup # BeautifulSoup is in bs4 package
import requests
import re
import pandas as pd
URL = 'https://reallyfrustrated.com'
content = requests.get(URL)
soup = BeautifulSoup(content.text, 'html.parser')
business = soup.find('title')
companys = business.get_text()
phones = soup.find_all(text=re.compile("Call (.*)"))
data = {'Required':[companys], 'Required_no_Email':[phones]}
df = pd.DataFrame(data, columns = ['Required','First', 'Last', 'Required_no_Email', 'Business Fax'])
writer = pd.ExcelWriter("ProspectUploadSheetRob.xlsx", engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False, startrow=4, header=3)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
header_format = workbook.add_format({
'bold': False,
'text_wrap': False,
'valign': 'top',
'fg_color': False,
'border': False})
for col_num, value in enumerate(df.columns.values):
worksheet.write(4, col_num, value, header_format)
df.Required_no_Email.str.strip('()')
writer.save()
@RobK the code below works as a one-liner for whatever characters you want to replace. You were pretty close in one of your attempts above: passing regex=True, escaping the special characters with \ and separating the alternatives with |. You also need to assign the result back with df.Required_no_Email = ..., because replace returns a new Series instead of changing the column in place.
import pandas as pd

df = pd.DataFrame({'Required': ['CTC Landscaping'],
                   'Required_no_Email': ['''['Call (123) 456-7890']''']})
# Raw string so the regex escapes (\[ \] \( \)) reach the regex engine as-is
# instead of being treated as invalid Python string escapes.
# The result is assigned back because replace() returns a new Series.
df.Required_no_Email = df.Required_no_Email.replace(r"\[|\]|'|\(|\)", '',
                                                    regex=True)
df
@RobK you said your code was not changing anything. My guess is that you placed the replace part of your script after the data had already been written through the writer object. It works for me in the script below: I created my own DataFrame in place of the Beautiful Soup portion of your code, and it worked perfectly. I will also attach a screenshot:
import pandas as pd

df = pd.DataFrame({'Required': ['CTC Landscaping'],
                   'Required_no_Email': ['''['Call (123) 456-7890']''']})
# Clean the column BEFORE writing: changes made after to_excel() never reach
# the file. Raw string keeps the regex escapes intact; the result is assigned
# back because replace() returns a new Series.
df.Required_no_Email = df.Required_no_Email.replace(r"\[|\]|'|\(|\)", '',
                                                    regex=True)

writer = pd.ExcelWriter("ProspectUploadSheetRob.xlsx", engine='xlsxwriter')
# header=True writes the column names on row 4 (zero-based); the original
# header=3 was only treated as a truthy flag.
df.to_excel(writer, sheet_name='Sheet1', index=False, startrow=4, header=True)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
# Plain header style that overrides the bold/bordered pandas default.
header_format = workbook.add_format({
    'bold': False,
    'text_wrap': False,
    'valign': 'top',
    'fg_color': False,
    'border': False})
for col_num, value in enumerate(df.columns.values):
    worksheet.write(4, col_num, value, header_format)
# The trailing ``.str.strip('()')`` call was dropped: its result was discarded
# and the parentheses are already removed by the replace above.
# ExcelWriter.save() was removed in pandas 2.0; close() writes the file.
writer.close()

How to create Excel **Table** with pandas.to_excel()?

I need to achieve this programmatically from a DataFrame:
https://learn.microsoft.com/en-us/power-bi/service-admin-troubleshoot-excel-workbook-data
Here is one way to do it using XlsxWriter:
import pandas as pd

# Create a Pandas dataframe from some data.
data = [10, 20, 30, 40, 50, 60, 70, 80]
df = pd.DataFrame({'Rank': data,
                   'Country': data,
                   'Population': data,
                   'Data1': data,
                   'Data2': data})

# Create a Pandas Excel writer using XlsxWriter as the engine. The context
# manager closes the writer and outputs the Excel file on exit
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter("pandas_table.xlsx", engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object. Turn off the default
    # header and index and skip one row to allow us to insert a user defined
    # header.
    df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)

    # Get the xlsxwriter worksheet object.
    worksheet = writer.sheets['Sheet1']

    # Get the dimensions of the dataframe.
    (max_row, max_col) = df.shape

    # Create a list of column headers, to use in add_table().
    column_settings = [{'header': header} for header in df.columns]

    # Add the table: row 0 is the table header, rows 1..max_row hold the data.
    worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})

    # Make the columns wider for clarity.
    worksheet.set_column(0, max_col - 1, 12)
Output:
Update: I've added a similar example to the XlsxWriter docs: Example: Pandas Excel output with a worksheet table
You can't do it with to_excel. A workaround is to open the generated xlsx file and add the table there with openpyxl:
import openpyxl
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.table import Table

df = pd.DataFrame({'Col1': [1, 2, 3], 'Col2': list('abc')})
filename = 'so58326392.xlsx'
sheetname = 'mySheet'

with pd.ExcelWriter(filename) as writer:
    # Every table header must be a non-empty string, so name the index.
    if not df.index.name:
        df.index.name = 'Index'
    df.to_excel(writer, sheet_name=sheetname)

# Re-open the generated file and declare the written range as a table.
wb = openpyxl.load_workbook(filename=filename)
# The index is written as column A, so the sheet holds df.shape[1] + 1
# columns; the extra row accounts for the header line.
last_col = get_column_letter(df.shape[1] + 1)
tab = Table(displayName="df", ref=f'A1:{last_col}{len(df) + 1}')
wb[sheetname].add_table(tab)
wb.save(filename)
Please note that all table headers must be strings. If you have an unnamed index (which is the default), the first cell (A1) will be empty, which leads to file corruption. To avoid this, give your index a name (as shown above) or export the DataFrame without the index using:
df.to_excel(writer, sheet_name=sheetname, index=False)
Another workaround, if you don't want to save, re-open, and re-save, is to use xlsxwriter. It can write ListObject tables directly, but does not do so directly from a dataframe, so you need to break out the parts:
import pandas as pd
import xlsxwriter as xl

df = pd.DataFrame({'Col1': [1, 2, 3], 'Col2': list('abc')})
filename = 'output.xlsx'
sheetname = 'Table'
tablename = 'TEST'

(rows, cols) = df.shape
# add_table() wants the body as a list of rows and the header as a list of
# {'header': name} dicts.
data = df.to_dict('split')['data']
headers = [{'header': col} for col in df.columns]

# The context manager closes (and therefore writes) the workbook even if
# building the table raises.
with xl.Workbook(filename) as wb:
    # Use the sheet name defined above instead of the default "Sheet1".
    ws = wb.add_worksheet(sheetname)
    # Row 0 is the header row; rows 1..rows hold the data.
    ws.add_table(0, 0, rows, cols - 1,
                 {'name': tablename,
                  'data': data,
                  'columns': headers})
The add_table() function expects 'data' as a list of lists, where each sublist represents a row of the dataframe, and 'columns' as a list of dicts for the header where each column is specified by a dictionary of the form {'header': 'ColumnName'}.
I created a package to write properly formatted excel tables from pandas: pandas-xlsx-tables
# Third-party helper package (pandas-xlsx-tables); it must be installed separately.
from pandas_xlsx_tables import df_to_xlsx_table
import pandas as pd
# Small demo frame mixing integers, strings and timestamps.
data = [10, 20, 30, 40, 50, 60, 70, 80]
df = pd.DataFrame({'Rank': data,
'Country': data,
'Population': data,
'Strings': [f"n{n}" for n in data],
'Datetimes': [pd.Timestamp.now() for _ in range(len(data))]})
# NOTE(review): presumably "my_table" names the output table/file — confirm
# the exact meaning of each argument against the package documentation.
df_to_xlsx_table(df, "my_table", index=False, header_orientation="diagonal")
You can also do the reverse with xlsx_table_to_df
Based on the answer of @jmcnamara, but as a convenient function and using a "with" statement:
import pandas as pd


def to_excel(df: pd.DataFrame, excel_name: str, sheet_name: str, startrow=1, startcol=0):
    """Export a pandas DataFrame as a formatted Excel table.

    Args:
        df: Frame to export; its index is not written.
        excel_name: Path of the .xlsx file to create.
        sheet_name: Name of the worksheet that receives the table.
        startrow: Zero-based sheet row of the table header.
        startcol: Zero-based sheet column of the first table column.
    """
    with pd.ExcelWriter(excel_name, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    startcol=startcol, header=True, index=False)
        worksheet = writer.sheets[sheet_name]
        max_row, max_col = df.shape
        column_settings = [{'header': header} for header in df.columns]
        # The header occupies startrow and the data the max_row rows below it.
        # A table needs at least one data row, so an empty frame still gets one.
        last_row = startrow + max(max_row, 1)
        last_col = startcol + max_col - 1
        worksheet.add_table(startrow, startcol, last_row, last_col,
                            {'columns': column_settings})
        # Widen exactly the table's columns (last_col is inclusive).
        worksheet.set_column(startcol, last_col, 21)

xlsxwriter - Conditional formatting based on column name of the dataframe

I have a dataframe as below. I want to apply conditional formatting on column "Data2" using the column name. I know how to define format for a specific column but I am not sure how to define it based on column name as shown below.
So basically I want to do the same formatting on column name(because the order of column might change)
df1 = pd.DataFrame({'Data1': [10, 20, 30],
'Data2': ["a", "b", "c"]})
writer = pd.ExcelWriter('pandas_filter.xlsx', engine='xlsxwriter', )
workbook = writer.book
df1.to_excel(writer, sheet_name='Sheet1', index=False)
worksheet = writer.sheets['Sheet1']
blue = workbook.add_format({'bg_color':'#000080', 'font_color': 'white'})
red = workbook.add_format({'bg_color':'#E52935', 'font_color': 'white'})
l = ['B2:B500']
for columns in l:
worksheet.conditional_format(columns, {'type': 'text',
'criteria': 'containing',
'value': 'a',
'format': blue})
worksheet.conditional_format(columns, {'type': 'text',
'criteria': 'containing',
'value': 'b',
'format': red})
writer.save()
Using XlsxWriter's xl_col_to_name, we can get the Excel column letter from the (zero-based) position of the column in the DataFrame.
from xlsxwriter.utility import xl_col_to_name
target_col = xl_col_to_name(df1.columns.get_loc("Data2"))
l = [f'{target_col}2:{target_col}500']
for columns in l:
Using openpyxl's get_column_letter, we can likewise get the Excel column letter from the position of the column in the DataFrame.
from openpyxl.utils import get_column_letter
target_col = get_column_letter(df1.columns.get_loc("Data2") + 1) # add 1 because get_column_letter index start from 1
l = [f'{target_col}2:{target_col}500']
for columns in l:
...

Convert Dictionary to CSV Python

I have a dictionary that's in the following format:
mydict = {'item1': ['label1_item', 'label2_item', 'label3_item', 'label4_item'], ...
'item999': ['label1_item999', 'label2_item999', 'label3_item999', 'label4_item999']}
This is how I'm currently outputting the dictionary:
filename = datetime.now().strftime('output_-%Y-%m-%d-%H-%M.csv')
df = pd.DataFrame(mydict)
df.to_csv(filename,encoding='utf-8', header = ['label1', 'label2', 'label3', 'label4'], sep=',')
I want to label the first column "item", but I am unable to label the first column, I have labels for columns 2 (label1)- columns 5 (label4). How do I modify my script to do so?
Not clear what you want, so I am assuming you want rows to be labeled ['label1', 'label2', 'label3', 'label4'] and columns to be labeled ['item', 'item99']
Reset index:
df.index = ['label1', 'label2', 'label3', 'label4']
Save:
df.to_csv(filename,encoding='utf-8', sep=',', header=['item', 'item99'])
Edit:
Based on your comment:
your dataframe needs to be transposed:
df = pd.DataFrame(mydict).T
which yields:
0 1 2 3
item1 label1_item label2_item label3_item label4_item
item999 label1_item999 label2_item999 label3_item999 label4_item999
then save:
df.to_csv(filename,encoding='utf-8', header=['label1', 'label2', 'label3', 'label4'], sep=',')

Resources