Format certain rows after writing to excel file - python-3.x

I have some code which compares two excel files and determines any new rows (new_rows) added or any rows which were deleted (dropped_rows). It then uses xlsxwriter to write this to a excel sheet. The bit of code I am having trouble with is that it is supposed to then iterate through the rows and if the row was a new row or a dropped row it is supposed to format it a certain way. For whatever reason this part of the code isn't working correct and being ignored.
I've tried a whole host of different syntax to make this work but no luck.
UPDATE
After some more trial and error the issue seems to be caused by the index column. It is a Case Number column and the values have a prefix like "Case_123, Case_456, Case_789, etc..". This seems to be the root of the issue. But not sure how to solve for it.
grey_fmt = workbook.add_format({'font_color': '#E0E0E0'})
highlight_fmt = workbook.add_format({'font_color': '#FF0000', 'bg_color':'#B1B3B3'})
new_fmt = workbook.add_format({'font_color': '#32CD32','bold':True})
# set format over range
## highlight changed cells
worksheet.conditional_format('A1:J10000', {'type': 'text',
'criteria': 'containing',
'value':'→',
'format': highlight_fmt})
# highlight new/changed rows
for row in range(dfDiff.shape[0]):
if row+1 in newRows:
worksheet.set_row(row+1, 15, new_fmt)
if row+1 in droppedRows:
worksheet.set_row(row+1, 15, grey_fmt)
the last part # highlight new/changed rows is the bit that is not working. The conditional format portion works fine.
the rest of the code:
import pandas as pd
from pathlib import Path
def excel_diff(path_OLD, path_NEW, index_col):
df_OLD = pd.read_excel(path_OLD, index_col=index_col).fillna(0)
df_NEW = pd.read_excel(path_NEW, index_col=index_col).fillna(0)
# Perform Diff
dfDiff = df_NEW.copy()
droppedRows = []
newRows = []
cols_OLD = df_OLD.columns
cols_NEW = df_NEW.columns
sharedCols = list(set(cols_OLD).intersection(cols_NEW))
for row in dfDiff.index:
if (row in df_OLD.index) and (row in df_NEW.index):
for col in sharedCols:
value_OLD = df_OLD.loc[row,col]
value_NEW = df_NEW.loc[row,col]
if value_OLD==value_NEW:
dfDiff.loc[row,col] = df_NEW.loc[row,col]
else:
dfDiff.loc[row,col] = ('{}→{}').format(value_OLD,value_NEW)
else:
newRows.append(row)
for row in df_OLD.index:
if row not in df_NEW.index:
droppedRows.append(row)
dfDiff = dfDiff.append(df_OLD.loc[row,:])
dfDiff = dfDiff.sort_index().fillna('')
print(dfDiff)
print('\nNew Rows: {}'.format(newRows))
print('Dropped Rows: {}'.format(droppedRows))
# Save output and format
fname = '{} vs {}.xlsx'.format(path_OLD.stem,path_NEW.stem)
writer = pd.ExcelWriter(fname, engine='xlsxwriter')
dfDiff.to_excel(writer, sheet_name='DIFF', index=True)
df_NEW.to_excel(writer, sheet_name=path_NEW.stem, index=True)
df_OLD.to_excel(writer, sheet_name=path_OLD.stem, index=True)
# get xlsxwriter objects
workbook = writer.book
worksheet = writer.sheets['DIFF']
worksheet.hide_gridlines(2)
worksheet.set_default_row(15)
# define formats
date_fmt = workbook.add_format({'align': 'center', 'num_format': 'yyyy-mm-dd'})
center_fmt = workbook.add_format({'align': 'center'})
number_fmt = workbook.add_format({'align': 'center', 'num_format': '#,##0.00'})
cur_fmt = workbook.add_format({'align': 'center', 'num_format': '$#,##0.00'})
perc_fmt = workbook.add_format({'align': 'center', 'num_format': '0%'})
grey_fmt = workbook.add_format({'font_color': '#E0E0E0'})
highlight_fmt = workbook.add_format({'font_color': '#FF0000', 'bg_color':'#B1B3B3'})
new_fmt = workbook.add_format({'font_color': '#32CD32','bold':True})
# set format over range
## highlight changed cells
worksheet.conditional_format('A1:J10000', {'type': 'text',
'criteria': 'containing',
'value':'→',
'format': highlight_fmt})
# highlight new/changed rows
for row in range(dfDiff.shape[0]):
if row+1 in newRows:
worksheet.set_row(row+1, 15, new_fmt)
if row+1 in droppedRows:
worksheet.set_row(row+1, 15, grey_fmt)
# save
writer.save()
print('\nDone.\n')
def main():
path_OLD = Path('file1.xlsx')
path_NEW = Path('file2.xlsx')
# get index col from data
df = pd.read_excel(path_NEW)
index_col = df.columns[0]
print('\nIndex column: {}\n'.format(index_col))
excel_diff(path_OLD, path_NEW, index_col)
if __name__ == '__main__':
main()

Related

Pandas - how to create a new dataframe from the columns and values of an old dataframe?

I have a CSV file in which I have tweets with the following column names: File, User, Date 1, month, day, Tweet, Permalink, Retweet count, Likes count, Tweet value, Language, Location.
I want to create a new data frame with tweets from certain cities. I can do it but only for the last city on my list (Girona). So it doesn't add all the rows. Here is my code:
import pandas as pd
import os
path_to_file = "populismo_merge.csv"
df = pd.read_csv(path_to_file, encoding='utf-8', sep=',')
values = df[df['Location'].str.contains("A Coruña",na=False)]
values = df[df['Location'].str.contains("Alava",na=False)]
values = df[df['Location'].str.contains("Albacete",na=False)]
values = df[df['Location'].str.contains("Alicante",na=False)]
values = df[df['Location'].str.contains("Almería",na=False)]
values = df[df['Location'].str.contains("Asturias",na=False)]
values = df[df['Location'].str.contains("Avila",na=False)]
values = df[df['Location'].str.contains("Badajoz",na=False)]
values = df[df['Location'].str.contains("Barcelona",na=False)]
values = df[df['Location'].str.contains("Burgos",na=False)]
values = df[df['Location'].str.contains("Cáceres",na=False)]
values = df[df['Location'].str.contains("Cádiz",na=False)]
values = df[df['Location'].str.contains("Cantabria",na=False)]
values = df[df['Location'].str.contains("Castellón",na=False)]
values = df[df['Location'].str.contains("Ceuta",na=False)]
values = df[df['Location'].str.contains("Ciudad Real",na=False)]
values = df[df['Location'].str.contains("Córdoba",na=False)]
values = df[df['Location'].str.contains("Cuenca",na=False)]
values = df[df['Location'].str.contains("Formentera",na=False)]
values = df[df['Location'].str.contains("Girona",na=False)]
values.to_csv(r'populismo_ciudad.csv', index = False)
Many thanks!!!
Use isin:
import pandas as pd
import os
path_to_file = "populismo_merge.csv"
df = pd.read_csv(path_to_file, encoding='utf-8', sep=',')
cities = ['A Coruña', 'Alava', 'Albacete', 'Alicante', 'Almería',
'Asturias', 'Avila', 'Badajoz', 'Barcelona', 'Burgos',
'Cáceres', 'Cádiz', 'Cantabria', 'Castellón', 'Ceuta',
'Ciudad Real', 'Córdoba', 'Cuenca', 'Formentera', 'Girona']
values = df[df['Location'].isin(cities)]
values.to_csv(r'populismo_ciudad.csv', index = False)
You are overwriting the values variable each time. A more concise answer would be along the lines of.
values= df[df['LocationName'].isin(["A Coruña", "Alava", ......)]

How to create new columns based off specific conditions?

I have a multi-index dataframe. The index's are represented by an ID and date. The 3 columns I have are cost, revenue, and expenditure.
I want to create 3 new columns based off certain conditions.
1) The first new column I would want to create would be based off the condition, for the 3 most previous dates per ID, if the cost column decreases consistently, label the new row values as 'NEG', if not then label it 'No'.
2) The second column I would want to create would be based off the condition, for the 3 most recent dates, if the revenue column decreases consistently, label the new row values as 'NEG', if not then label it 'No'.
3) The third column I would want to create would be based off the condition, for the 3 most recent dates, if the expenditure column increases consistently, label the new row value as 'POS' or if it stays the same label the new row value as 'STABLE'.
idx = pd.MultiIndex.from_product([['001', '002', '003','004'],
['2017-06-30', '2017-12-31', '2018-06-30','2018-12-31','2019-06-30']],
names=['ID', 'Date'])
col = ['Cost', 'Revenue','Expenditure']
dict2 = {'Cost':[12,6,-2,-10,-16,-10,14,12,6,7,4,2,1,4,-4,5,7,9,8,1],
'Revenue':[14,13,2,1,-6,-10,14,12,6,7,4,2,1,4,-4,5,7,9,18,91],
'Expenditure':[17,196,20,1,-6,-10,14,12,6,7,4,2,1,4,-4,5,7,9,18,18]}
df = pd.DataFrame(dict2,idx,col)
i have tried creating a function then applying it to my DF but i keep getting errors...
the solution i want to end up with would look like this..
idx = pd.MultiIndex.from_product([['001', '002', '003','004'],
['2017-06-30', '2017-12-31', '2018-06-30','2018-12-31','2019-06-30']],
names=['ID', 'Date'])
col = ['Cost', 'Revenue','Expenditure', 'Cost Outlook', 'Revenue Outlook', 'Expenditure Outlook']
dict3= {'Cost': [12,6,-2,-10,-16,
-10,14,12,6,7,
4,2,1,4,-4,
5,7,9,8,1],
'Cost Outlook': ['no','no','NEG','NEG','NEG',
'no','no','no','NEG','NEG',
'no','no','NEG','no','no',
'no','no','no','no','NEG'],
'Revenue':[14,13,2,1,-6,
-10,14,12,6,7,
4,2,1,4,-4,
5,7,9,18,91],
'Revenue Outlook': ['no','no','NEG','NEG','NEG',
'no','no','no','NEG','NEG',
'no','no','NEG','no','no',
'no','no','no','no','no'],
'Expenditure':[17,196,1220,1220, -6,
-10,14,120,126,129,
4,2,1,4,-4,
5,7,9,18,18],
'Expenditure Outlook':['no','no','POS','POS','no',
'no','no','POS','POS','POS',
'no','no','no','no','no',
'no','no','POS','POS','STABLE']
}
df_new = pd.DataFrame(dict3,idx,col)
Here's what I would do:
# update Cost and Revenue Outlooks
# because they have similar conditions
for col in ['Cost', 'Revenue']:
groups = df.groupby('ID')
outlook = f'{col} Outlook'
df[outlook] = groups[col].diff().lt(0)
# moved here
df[outlook] = np.where(groups[outlook].rolling(2).sum().eq(2), 'NEG', 'no')
# update Expenditure Outlook
col = 'Expenditure'
outlook = f'{col} Outlook'
s = df.groupby('ID')[col].diff()
df[outlook] = np.select( (s.eq(0).groupby(level=0).rolling(2).sum().eq(2),
s.gt(0).groupby(level=0).rolling(2).sum().eq(2)),
('STABLE', 'POS'), 'no')
See if this does the job:
is_descending = lambda a: np.all(a[:-1] > a[1:])
is_ascending = lambda a: np.all(a[:-1] <= a[1:])
df1 = df.reset_index()
df1["CostOutlook"] = df1.groupby("ID").Cost.rolling(3).apply(is_descending).fillna(0).apply(lambda x: "NEG" if x > 0 else "no").to_list()
df1["RevenueOutlook"] = df1.groupby("ID").Revenue.rolling(3).apply(is_descending).fillna(0).apply(lambda x: "NEG" if x > 0 else "no").to_list()
df1["ExpenditureOutlook"] = df1.groupby("ID").Expenditure.rolling(3).apply(is_ascending).fillna(0).apply(lambda x: "POS" if x > 0 else "no").to_list()
df1 = df1.set_index(["ID", "Date"])
Note: The requirement for "STABLE" is not handled.
Edit:
This is alternative solution:
is_descending = lambda a: np.all(a[:-1] > a[1:])
def is_ascending(a):
if np.all(a[:-1] <= a[1:]):
if a[-1] == a[-2]:
return 2
return 1
return 0
for col in ['Cost', 'Revenue']:
outlook = df[col].unstack(level="ID").rolling(3).apply(is_descending).fillna(0).replace({0.0:"no", 1.0:"NEG"}).unstack().rename(f"{col} outlook")
df = df.join(outlook)
col = "Expenditure"
outlook = df[col].unstack(level="ID").rolling(3).apply(is_ascending).fillna(0).replace({0.0:"no", 1.0:"POS", 2.0:"STABLE"}).unstack().rename(f"{col} outlook")
df = df.join(outlook)

Saving loop output to multiple excel sheets

I have a csv file full of multiple years of water data. I've broken up each water year into it's own data frame. Now I want to do some math to those water years then save each water year to it's own excel sheet.
The math part of the code is working, but I'm having trouble with the final step of naming and saving the output of the loop correctly. Right now I have it creating the excel file and creating the sheet names correctly, but the loop just saves the final iteration to all the sheets. I've googled around but I can't get any other of the similar questions answers to work. This is my first python program so advice would be appreciated.
import pandas as pd
with open(r'wft.csv') as csvfile:
tdata = pd.read_csv(csvfile)
tdata['date'] = pd.to_datetime(tdata['date'], format='%m/%d/%Y %H:%M')
tdata = tdata.set_index(['date'])
wy2015 = tdata.loc['2014-10-1 00:00' : '2015-7-1 00:00']
wy2016 = tdata.loc['2015-10-1 00:00' : '2016-7-1 00:00']
wy2017 = tdata.loc['2016-10-1 00:00' : '2017-7-1 00:00']
writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']
for df in wyID:
df = df.sort_values(by=['turbidity'], ascending=False)
df['rank'] = df['turbidity'].rank(method = 'first', ascending=0)
df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
for name in seq:
df.to_excel(writer, sheet_name= name)
writer.save()
Issues in your code
writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']
for df in wyID: # outer loop that figures out wy20xx
df = df.sort_values(by=['turbidity'], ascending=False)
df['rank'] = df['turbidity'].rank(method = 'first', ascending=0)
df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
for name in seq: # you loop through all the names and write all sheets every time. you want to be writing just one
df.to_excel(writer, sheet_name= name)
writer.save()
Instead try this.
for i, df in enumerate(wyID): # outer loop that figures out wy20xx
df = df.sort_values(by=['turbidity'], ascending=False)
df['rank'] = df['turbidity'].rank(method = 'first', ascending=0)
df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
df.to_excel(writer, sheet_name= seq[i]) # writes to correct wy20xx sheet
writer.save() # Now you're done writing the excel

Using Python to delete rows in a csv file that contain certain chars

I have a csv file that I'm trying to clean up. I am trying to look at the first column and delete any rows that have anything other than chars for that row in the first column (I'm working on cleaning up rows where the first column has a ^ or . for now). It seems all my attempts either do nothing or nuke the whole csv file.
Interestingly enough, I have code that can identify the problem rows and it seems to work fine
def FindProblemRows():
with open('Data.csv') as csvDataFile:
ProblemRows = []
csvReader = csv.reader(csvDataFile)
data = [row for row in csv.reader(csvDataFile)]
length = len(data)
for i in range (0,length):
if data[i][0].find('^')!=-1 or data[i][0].find('.')!=-1:
ProblemRows.append(i)
return (ProblemRows)
Below I have my latest three failed attempts. Where am I going wrong and what should I change? Which of these comes closest?
'''
def Clean():
with open("Data.csv", "w", newline='') as f:
data = list(csv.reader(f))
writer = csv.writer(f)
Problems = FindProblemRows()
data = list(csv.reader(f))
length = len(data)
for row in data:
for i in Problems:
for j in range (0, length):
if row[j] == i:
writer.writerow(row)
Problems.remove(i)
def Clean():
Problems = FindProblemRows()
with open('Data.csv') as csvDataFile:
csvReader = csv.reader(csvDataFile)
data = [row for row in csv.reader(csvDataFile)]
length = len(data)
width = len(data[0])
with open("Data.csv","r") as csvFile:
csvReader = csv.reader( csvFile )
with open("CleansedData.csv","w") as csvResult:
csvWrite = csv.writer( csvResult )
for i in Problems:
for j in range (0, length):
if data[j] == i:
del data[j]
for j in range (0, length):
csvWrite.writerow(data[j])
'''
def Clean():
with open("Data.csv", 'r') as infile , open("CleansedData.csv", 'w') as outfile:
data = [row for row in infile]
for row in infile:
for column in row:
if "^" not in data[row][0]:
if "." not in data[row][0]:
outfile.write(data[row])
Update
Now I have:
def Clean():
df = pd.read_csv('Data.csv')
df = df['^' not in df.Symbol]
df = df['.' not in df.Symbol]
but I get KeyError: True
Shouldn't that work?
You should check whether the column Symbol contains any of the characters of interest. Method contains takes a regular expression:
bad_rows = df.Symbol.str.contains('[.^]')
df_clean = df[~bad_rows]

How to extract formatted cells and their corresponding rows in Excel

SO, my problem is as follows:
Using this script written below, I have successfully formatted certain cells in the extracted excel sheet. It is intended to accept any excel file as long as it is entered correctly and is in the same format (which for my purposes will always be in the same format).
My next step (my problem): is how to I grab the cells that have been formatted and their corresponding rows, and then print those rows into a new file.
Using a very similar program I am able to take the chosen formatted excel file.
But do not know what my course
def ready():
print("Type the name of the excel file you wish to analyze.")
while True:
try:
a_input = input(">> ")
df = pd.read_excel("{}".format(a_input), parse_cols = "AS")
df2 = pd.read_excel("{}".format(a_input), parse_cols = "B")
df3 = pd.read_excel("{}".format(a_input), parse_cols = "A")
df4 = pd.read_excel("{}".format(a_input), parse_cols = "D")
df5 = pd.read_excel("{}".format(a_input), parse_cols = "C")
except IOError:
print("The file name was either entered incorrectly or is not in the directory")
else:
print("--------------------File Grab Successful--------------------")
break
time.sleep(2)
print("Grabbing the required data.....\n\n")
df.set_index('Last Date of Support', inplace=True)
df2.set_index('Product Series', inplace=True)
df3.set_index('Product Family', inplace=True)
df4.set_index('Item Name', inplace=True)
df5.set_index('Item Type', inplace=True)
time.sleep(2)
print("--------------Creating Excel Document with required data------------------\n\n")
time.sleep(2)
print("Type the name you want for the excel file that will be created. Be sure to add the file extension on the end (EX: filename.xlsx)")
b_input = input(">> ")
writer = pd.ExcelWriter('{}'.format(b_input), engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', startrow=-1, startcol=0)
df2.to_excel(writer, sheet_name='Sheet1', startrow=-1, startcol=4)
df3.to_excel(writer, sheet_name='Sheet1', startrow=-1, startcol=2)
df4.to_excel(writer, sheet_name='Sheet1', startrow=-1, startcol=3)
df5.to_excel(writer, sheet_name='Sheet1', startrow=-1, startcol=1)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
date = datetime.datetime.strptime('2018-01-01', "%Y-%m-%d")
format1 = workbook.add_format({'bg_color': '#FFC7CE',
'font_color': '#9C0006'})
worksheet.conditional_format('A2:A20000', {'type': 'date',
'criteria': 'less than',
'value': date,
'format': format1})
writer.save()
print("Goodbye!")
time.sleep(5)
ready()
Using a very similar program I am able to take the chosen formatted excel file.
But do not know what my course of action would be.
Here is an example of what I have tried.
format1 = workbook.add_format({'bg_color': '#FFC7CE',
'font_color': '#9C0006'})
for i in df == format1:
print(i)
But i keep getting an error.
Sorry for the edit post,
I clicked advance before completed.
The loop you wrote is incorrect. That's because df==format1 is equivalent to the boolean True. If you want to print i only when some condition holds, do:
upperBound = 1000
for i in range(upperBound):
if df==format1:
print(i)

Resources