Combine Multiple Workbooks into One - excel

I have a varying number of input .xlsx documents, each containing 12 sheets (the sheet names are the same in every document). I need to combine these into one .xlsx document that keeps the original sheet names, with the data from all documents for each sheet appended to the corresponding sheet.
For example, see my original output:
Original Output
Desired Output
Currently, I am not adding the inputFile name anywhere and just trying to merge into one workbook. However, I keep receiving an error:
error
def createEmptyWorkbook(self, outputFileName):
    """Create an empty workbook on disk and return it re-opened.

    A fresh ``openpyxl.Workbook`` is saved to *outputFileName* and the
    workbook loaded back from that same path is returned.
    """
    logging.info('creating empty workbook: %s' % (outputFileName))
    # write a brand-new workbook to disk first ...
    openpyxl.Workbook().save(outputFileName)
    # ... then hand back the copy loaded from that file
    return openpyxl.load_workbook(filename=outputFileName)
def combine_sheets(self, inputFiles):
    """Merge the sheets of every input workbook into one output workbook.

    For each workbook in *inputFiles*, every worksheet is copied into the
    sheet with the same title in ``combined_ncoa_report.xlsx`` (created in
    ``self.processingDir``). Rows from later workbooks are appended below
    the rows that are already there. The output is saved once at the end.

    Args:
        inputFiles: iterable of paths to the .xlsx files to combine.

    Raises:
        ValueError: if a cell value cannot be written to the output sheet.
    """
    logging.info('combining ncoa reports to one workbook')
    outputFile = os.path.join(self.processingDir, 'combined_ncoa_report.xlsx')
    ncoa_combined_report = self.createEmptyWorkbook(outputFile)

    for inputFile in inputFiles:
        logging.info('reading ncoa report: %s' % (os.path.split(inputFile)[-1]))
        # load entire input file into memory
        input_wb = openpyxl.load_workbook(filename=inputFile)

        for worksheet in input_wb.worksheets:
            logging.info('working on sheet: %s' % (worksheet.title))
            # Ask the workbook each time: a name list captured up front goes
            # stale as soon as the first sheet is created.
            if worksheet.title in ncoa_combined_report.sheetnames:
                # Fetch the worksheet object; the title string alone has
                # neither max_row nor cell().
                currentSheet = ncoa_combined_report[worksheet.title]
            else:
                logging.info('creating sheet: %s' % (worksheet.title))
                currentSheet = ncoa_combined_report.create_sheet(worksheet.title)

            # max_row reports 1 even when the sheet holds no data, so an
            # untouched sheet must not push the first file down by a row.
            sheetIsEmpty = (
                currentSheet.max_row == 1
                and currentSheet.max_column == 1
                and currentSheet['A1'].value is None
            )
            # Computed once per sheet; re-reading max_row while writing would
            # make the offset grow with every row written.
            rowOffset = 0 if sheetIsEmpty else currentSheet.max_row

            for row, entry in enumerate(worksheet.iter_rows(), start=1):
                logging.info('working on row: %s' % (row))
                for column, cell in enumerate(entry, start=1):
                    try:
                        currentSheet.cell(row=row + rowOffset, column=column).value = cell.value
                    except Exception:
                        logging.critical('could not add row:%s, cell:%s' % (row, cell.coordinate))
                        raise ValueError('could not add row:%s, cell:%s' % (row, cell.coordinate))

    # save new file
    ncoa_combined_report.save(outputFile)
I am not sure why I am getting the error or what I need to update to correct it. Any guidance is appreciated.

I think I found the issue with this portion of the code. I found that openpyxl.utils provides functions for getting the coordinate, column, and row of a cell, which allowed me to append the data at the correct locations. Hopefully this will help someone else in the future.
# Copy every cell of the input worksheet into currentSheet, shifted down by
# outputSheetMaxRow, and write a 'Project' / file-name marker row for each file.
# NOTE(review): this fragment relies on worksheet, currentSheet, inputFile,
# inputFileCount and outputSheetMaxRow being set by surrounding code not shown.
for line, entry in enumerate(worksheet, start=1):
#logging.info('working on row: %s' % (row))
for cell in entry:
#try:
# split the cell coordinate into its column letter(s) and row number
xy = openpyxl.utils.coordinate_from_string(cell.coordinate) # returns ('A',4)
# convert the column letter(s) to a 1-based column index
col = openpyxl.utils.column_index_from_string(xy[0]) # returns 1
rowCord = xy[1]
# add cell value to output file
#currentSheet[cell.coordinate].value
# first row of the first input file: marker goes on row 1 of the output sheet
if line == 1 and inputFileCount == 1:
currentSheet.cell(row=1, column=1).value = 'Project'
currentSheet.cell(row=1, column=2).value = os.path.split(inputFile)[-1]
# first row of any later input file: marker goes two rows below the existing data
if line == 1 and inputFileCount > 1:
currentSheet.cell(row=outputSheetMaxRow + 2, column=1).value = 'Project'
currentSheet.cell(row=outputSheetMaxRow + 2, column=2).value = os.path.split(inputFile)[-1]
# NOTE(review): the original indentation is lost; as written this else pairs
# with the second if only, so it also runs for row 1 of the first file -- confirm intent.
else:
currentSheet.cell(row=outputSheetMaxRow + rowCord + 1, column=col).value = cell.value #, value=cell

Related

Python string finder in excel file using lookup table

I need a function that returns a matched string from an Excel table, where the strings to match come from a lookup table (e.g. a list of a few strings): if the value returned by the function matches one of the strings in that list, print OK. Here is my example:
Let's assume that I got the following table (test_file.xlsx), I want to iterate through the whole worksheet (it's way bigger just pasted a couple columns/rows).
So, here only the "O" column is matching within the desired lookup table.
lookup_table = ['MM1XX', 'MM2XX', 'MC2XX', 'MC3XX', 'MC4XX', 'MS1XX', 'MS2XX', 'MS3XX', 'MS4XX']
This is what I've meant:
import openpyxl
# Path
wb = openpyxl.load_workbook('test_file.xlsx')
# active worksheet data
ws = wb.active
lookup_table = ['MM1XX', 'MM2XX', 'MC2XX', 'MC3XX', 'MC4XX', 'MS1XX', 'MS2XX', 'MS3XX', 'MS4XX']
def wordfinder(worksheet=None, terms=None):
    """Return the first cell value that equals one of the lookup terms.

    The sheet is scanned row by row, left to right, and the search stops at
    the first hit so only one match is ever returned.

    Args:
        worksheet: sheet to scan; defaults to the module-level ``ws``.
        terms: iterable of values to look for; defaults to the module-level
            ``lookup_table``.

    Returns:
        The matching cell value, or ``None`` when no cell matches.
    """
    if worksheet is None:
        worksheet = ws
    # a set gives O(1) membership tests for every cell visited
    wanted = set(lookup_table if terms is None else terms)
    for row in worksheet.iter_rows():
        for cell in row:
            if cell.value in wanted:
                print("SUCCESS! Found match!")
                return cell.value
    return None
# wordfinder() returns a single value (or None), so test whether that value
# is one of the lookup entries instead of comparing it to the whole list.
match = wordfinder()
if match in lookup_table:
    print("Match")
else:
    print("No matches found")
I look forward to your suggestions. I'd like the function to return only one matched element, if one exists (not all of them), and to compare it against the lookup table.

Is there Python code that can access a cell in Excel and change its letters to their alphabetical opposites?

I'm new to Python and I need to write a program that changes the letters in a cell to their opposites in the alphabet. It also needs to determine how many names are in the column and which row the name list starts at, so that it can change all of the names. The purpose is to be able to change the names without ever looking at the name list, for privacy reasons. I'm currently using PyCharm and openpyxl, if anyone is wondering. The picture shows the before and after of how it should look. I have made a few attempts, but I can't come up with a way to change the letters. I also tried a replacement mapping (replacement = {'Danial': 'Wzmrzo'}), but that requires me to look at the name list in order to change the letters.
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter
# Ask which workbook (name without the .xlsx extension) and which sheet to open.
print("Type the file name:")
DF = input()
workbook_path = DF + '.xlsx'
wb = load_workbook(workbook_path)

print("Sheet Name:")
sht = input()
ws = wb[sht]

# Ask for the column holding the names and collect its cell values.
NC = input("Where is the Name Column?")
column = ws[NC]
column_list = [cell.value for cell in column]
print(column_list)

wb.save(workbook_path)
Before
After
Warning: I'm not too familiar with openpyxl and how it accesses rows and columns, and the API seems to have changed a lot in the last few years. This should give you an idea of how to make it work, but it might not work exactly as written, depending on your version.
To find the name column you could use
# 1-based index of the column whose header cell is "Name"; None until found
name_col = None
# loop along the top row looking for "Name"
# (start=1 because Excel rows/cols are 1-indexed)
for index, header_cells in enumerate(ws.iter_cols(max_row=1), start=1):
    if header_cells[0].value == "Name":
        name_col = index
        break

if name_col is None:
    print("'Name' column not found.")
else:
    # an if/else body needs at least one statement, so keep a placeholder
    pass  # insert name changing code here
To change the names you could use (insert this in the code above)
import string

# Translation table mapping every ASCII letter to its alphabet opposite
# (A<->Z, B<->Y, ... and the same for lowercase). Characters that are not in
# the table -- spaces, hyphens, digits -- are left unchanged by translate().
_MIRROR = str.maketrans(
    string.ascii_uppercase + string.ascii_lowercase,
    string.ascii_uppercase[::-1] + string.ascii_lowercase[::-1],
)

# loop down the name column; min_row=2 skips the header row
for row_cells in ws.iter_rows(min_row=2, min_col=name_col, max_col=name_col):
    cell = row_cells[0]
    # empty or non-text cells have nothing to translate
    if isinstance(cell.value, str):
        cell.value = cell.value.translate(_MIRROR)

To extract content of 1st column (all rows) from an .xlsx file and replace it with the extracted information from each column

I have to replace first entire column (all rows) with information extracted from each column itself. Last digit is missing for each column with my code.
I have coded but had to save the output to a different file. I am unable to figure out how to replace the first column of the existing file itself. I need one file with the required output only.
# Read column A of output.xlsx, cut a piece out of every value, and write the
# pieces to column 0 of a new workbook, FINAL.xls.
# NOTE(review): openpyxl and xlwt are used here but no import is shown in this snippet.
fname = 'output.xlsx'
wb = openpyxl.load_workbook(fname)
sheet = wb.active
print('The sheet title is: ', sheet.title)
# all cells of column A (header cell included)
row_a = sheet['A']
d = []
for cell in row_a:
a = cell.value
d.append(a)
print(d)
s = []
for i in d:
# NOTE(review): with the default step of 1, a slice that starts at -1 and
# stops at -8 is always empty -- the intended slice is probably different.
i = i[-1:-8]
s.append(i)
print('The list of account numbers is: ', s)
# results go to a separate xlwt workbook, not back into output.xlsx
wc = xlwt.Workbook()
ws = wc.add_sheet('Sheet1')
row=0
col=0
list_d = s
for item in list_d:
ws.write(row, col, item)
row+=1
wc.save('FINAL.xls')
I suggest using python's builtin string.split method:
import openpyxl
fname = 'output.xlsx'
wb = openpyxl.load_workbook(fname)
sheet = wb.active

# values of column A, top to bottom (header included)
d = [cell.value for cell in sheet['A']]

# keep only the text after the last '.', i.e. the account number
s = [text.rsplit('.', 1)[-1] for text in d]
s[0] = 'Account'  # column header: 'Account' instead of 'Name'

# write the extracted values back over column A of the same sheet
col = 1
for row, item in enumerate(s, start=1):
    sheet.cell(row, col).value = item

wb.save(fname)
I also added list comprehensions, which are a more Pythonic way of creating arrays from data in many cases.

Read file and output specific fields to CSV file

I'm trying to search for data based on a key word and export that data to an Excel or text file.
When I "print" the variable/list it works no problem. When I try and output the data to a file it only outputs the last entry. I think something is wrong with the iteration, but I can't figure it out.
import xlsxwriter
# Scan a config file for lines containing the search term and write field [4]
# of each such line to column A of Test.xlsx.
#Paths
xls_output_path = 'C:\\Data\\'
config = 'C:\\Configs\\filename.txt'
excel_inc = 0 #used to increment the excel columns so not everything
#is written in "A1"
# the file object returned by open() is never closed explicitly
lines = open(config,"r").read().splitlines()
search_term = "ACL"
for i, line in enumerate(lines):
if search_term in line:
split_lines = line.split(' ') #Split lines via a space.
# when i == 0, lines[i - 1] is lines[-1], i.e. the last line of the file
linebefore = lines[i - 1] #Print the line before the search term
linebefore_split = linebefore.split(' ') #Split the line before via
#space
from_obj = linebefore_split[2] #[2] holds the data I need
to_object = split_lines[4] #[4] holds the data I need
print(len(split_lines)) #Prints each found line with no
#problem.
excel_inc = excel_inc + 1 #Increments for column A so not all of
#the data is placed in A1
excel_inc_str = str(excel_inc) #Change type to string so it can
#concatenate.
# NOTE(review): indentation is lost here; if the next two lines sit inside the
# loop, every match starts a new Test.xlsx and only the last write survives.
workbook = xlsxwriter.Workbook(xls_output_path + 'Test.xlsx') #Creates the xls file
worksheet = workbook.add_worksheet()
worksheet.write('A' + excel_inc_str, split_lines[4]) #Write data from
#split_lines[4]
#to column A
workbook.close()
I created this script so it will go and find all lines in the "config" file with the keyword "ACL".
It then has the ability to print the line before and the actual line the data is found. This works great.
My next step is outputting the data to an excel spreadsheet. This is where I get stuck.
The script only prints the very last item in the column A row 10.
I need help figuring out why it'll print the data correctly, but it won't output it to an excel spreadsheet or even a .txt file.
Try this - I moved your workbook and worksheet definitions outside the loop, so it doesn't keep getting redefined.
import xlsxwriter
# Scan a config file for lines containing the search term and write field [4]
# of each matching line to consecutive rows of column A in Test.xlsx.
#Paths
xls_output_path = 'C:\\Data\\'
config = 'C:\\Configs\\filename.txt'
search_term = "ACL"

# Read the whole config up front; the context manager closes the file handle.
with open(config, "r") as config_file:
    lines = config_file.read().splitlines()

excel_inc = 0  # 1-based row counter so each match lands in its own row (A1, A2, ...)

# One workbook and one worksheet for the whole run. Leaving the with-block
# closes the workbook (which writes the file) even if the loop raises.
with xlsxwriter.Workbook(xls_output_path + 'Test.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    for i, line in enumerate(lines):
        if search_term not in line:
            continue
        split_lines = line.split(' ')  # Split the line on single spaces.
        # lines[i - 1] would silently wrap to the LAST line when i == 0,
        # so the first line is treated as having no predecessor.
        linebefore = lines[i - 1] if i > 0 else ''
        linebefore_split = linebefore.split(' ')
        # [2] holds the data of interest on the preceding line (not written out yet)
        from_obj = linebefore_split[2] if len(linebefore_split) > 2 else None
        to_object = split_lines[4]  # [4] holds the data of interest on the matching line
        print(len(split_lines))
        excel_inc += 1
        worksheet.write('A' + str(excel_inc), to_object)

How to automatically copy text out of excel files

I have 250 Microsoft Excel (.xls) files, all in a folder. I need to do the following:
for each file:
open the file
switch to a specific tab in the file
extract the text from rows 15-100 on that tab
save the text in a text file somewhere
I assume this can be automated somehow, but I have no idea how. Where do I start looking to figure out how to do this? I really don't want to open 250 excel files and copy text out by hand, as that would take hours. :(
Since you already have Excel, you can create an Excel macro in a separate workbook to do this; just make sure that workbook is outside of the directory you are parsing. You'll need to add a reference for the FileSystemObject, which should be found in C:\Windows\System32\scrrun.dll.
Option Explicit

' Appends column A, rows 15-100, of the first sheet of every .xls workbook in
' C:\FolderToScan to a single text file in that folder.
' Requires a reference to the FileSystemObject library (scrrun.dll).
Sub ExtractData()
    Const FOLDER_PATH As String = "C:\FolderToScan"
    Const FIRST_ROW As Long = 15
    Const LAST_ROW As Long = 100

    Dim fso As New FileSystemObject
    Dim oFile As File
    Dim oFolder As Folder
    Dim sFileOutput As String
    Dim fNum As Integer
    Dim excelFile As Excel.Workbook
    Dim excelWorksheet As Excel.Worksheet
    Dim i As Long

    sFileOutput = FOLDER_PATH & "\ExcelOutput.txt"
    Set oFolder = fso.GetFolder(FOLDER_PATH)

    ' Open the output file once instead of once per workbook.
    fNum = FreeFile()
    Open sFileOutput For Append As #fNum

    For Each oFile In oFolder.Files
        ' Case-insensitive extension test so that "REPORT.XLS" is not skipped.
        If LCase$(fso.GetExtensionName(oFile.Name)) = "xls" Then
            ' Nothing is modified, so open read-only.
            Set excelFile = Workbooks.Open(Filename:=oFile.Path, ReadOnly:=True)
            Set excelWorksheet = excelFile.Sheets(1)
            'Or:
            ' Set excelWorksheet = excelFile.Sheets("Name of your sheet")
            For i = FIRST_ROW To LAST_ROW
                Write #fNum, excelWorksheet.Cells(i, 1).Value
            Next i
            ' Close without the "save changes?" prompt so the batch never stalls.
            excelFile.Close SaveChanges:=False
            Set excelFile = Nothing
        End If
    Next oFile

    Close #fNum
End Sub
That can be solved quickly using Python and the xlrd module. I copied the following example from activestate.com; it is easy to adapt to your needs.
## {{{ http://code.activestate.com/recipes/483742/ (r3)
class readexcel(object):
    """Simple OS-independent class for extracting data from Excel files
    using the xlrd module found at http://www.lexicon.net/sjmachin/xlrd.htm

    Versions of Excel supported: 2004, 2002, XP, 2000, 97, 95, 5, 4, 3
    xlrd version tested: 0.5.2

    Data is extracted by creating an iterator object which can be used to
    return data one row at a time. The default extraction method assumes
    that the worksheet is in tabular format with the first nonblank row
    containing variable names and all subsequent rows containing values.
    This method returns a dictionary which uses the variable names as keys
    for each piece of data in the row. Data can also be extracted with
    each row represented by a list.

    Extracted data is represented fairly logically. By default dates are
    returned as strings in "yyyy/mm/dd" format or "yyyy/mm/dd hh:mm:ss",
    as appropriate. However, dates can be returned as a tuple containing
    (Year, Month, Day, Hour, Min, Second) which is appropriate for usage
    with mxDateTime or DateTime. Numbers are returned as either INT or
    FLOAT, whichever is needed to support the data. Text, booleans, and
    error codes are also returned as appropriate representations.

    Quick Example:
        xl = readexcel('testdata.xls')
        sheetnames = xl.worksheets()
        for sheet in sheetnames:
            print(sheet)
            for row in xl.getiter(sheet):
                pass  # Do something here
    """

    def __init__(self, filename):
        """Parse *filename* into memory and index every worksheet.

        This may take a little while because the whole file is parsed.

        Raises:
            NameError: if *filename* is not an existing file.
        """
        import os.path
        if not os.path.isfile(filename):
            raise NameError("%s is not a valid filename" % filename)
        import xlrd
        self.__filename__ = filename
        self.__book__ = xlrd.open_workbook(filename)
        self.__sheets__ = {}
        self.__sheetnames__ = []
        for name in self.__book__.sheet_names():
            uniquevars = []
            firstrow = 0
            sheet = self.__book__.sheet_by_name(name)
            for row in range(sheet.nrows):
                types, values = sheet.row_types(row), sheet.row_values(row)
                if all(value == '' for value in values):
                    continue
                # First nonblank row: build the list of unique variable names
                # used as dictionary keys during extraction. Blank or
                # duplicate names are replaced with "F#".
                unknown = 1
                for var in self.__formatrow__(types, values, False):
                    # Compare the string form, because that is what is stored;
                    # otherwise a repeated numeric header is never detected.
                    label = str(var)
                    if label in uniquevars or label == '':
                        label = 'F' + str(unknown)
                        unknown += 1
                    uniquevars.append(label)
                firstrow = row + 1
                break
            self.__sheetnames__.append(name)
            self.__sheets__[name] = {
                'rows': sheet.nrows,
                'cols': sheet.ncols,
                'firstrow': firstrow,
                'variables': uniquevars[:],
            }

    def getiter(self, sheetname, returnlist=False, returntupledate=False):
        """Return a generator object which yields the lines of a worksheet.

        By default each line is a dictionary; returnlist=True yields lists
        instead. returntupledate=True returns dates as tuples of
        (Year, Month, Day, Hour, Min, Second) instead of as a string.

        Raises:
            NameError: if *sheetname* is not a worksheet of this file.
        """
        if sheetname not in self.__sheets__:
            raise NameError("%s is not present in %s" % (sheetname,
                                                         self.__filename__))
        if returnlist:
            return __iterlist__(self, sheetname, returntupledate)
        return __iterdict__(self, sheetname, returntupledate)

    def worksheets(self):
        """Return a list of the worksheets in the Excel file."""
        return self.__sheetnames__

    def nrows(self, worksheet):
        """Return the number of rows in a worksheet."""
        return self.__sheets__[worksheet]['rows']

    def ncols(self, worksheet):
        """Return the number of columns in a worksheet."""
        return self.__sheets__[worksheet]['cols']

    def variables(self, worksheet):
        """Return a list of column names in the worksheet,
        assuming a tabular format of course."""
        return self.__sheets__[worksheet]['variables']

    def __formatrow__(self, types, values, wanttupledate):
        """Internal function used to clean up the incoming Excel data.

        Returns a new list with one cleaned value per (type, value) pair.
        """
        ##  Data Type Codes:
        ##  EMPTY 0
        ##  TEXT 1 a Unicode string
        ##  NUMBER 2 float
        ##  DATE 3 float
        ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
        ##  ERROR 5
        import xlrd
        returnrow = []
        for cell_type, value in zip(types, values):
            if cell_type == 2:
                # whole-number floats become ints
                if value == int(value):
                    value = int(value)
            elif cell_type == 3:
                datetuple = xlrd.xldate_as_tuple(value, self.__book__.datemode)
                if wanttupledate:
                    value = datetuple
                elif datetuple[:3] == (0, 0, 0):
                    # time only, no date component
                    value = "%02d:%02d:%02d" % datetuple[3:]
                elif datetuple[3:] == (0, 0, 0):
                    # date only, no time
                    value = "%04d/%02d/%02d" % datetuple[:3]
                else:
                    # full date
                    value = "%04d/%02d/%02d %02d:%02d:%02d" % datetuple
            elif cell_type == 5:
                value = xlrd.error_text_from_code[value]
            returnrow.append(value)
        return returnrow
def __iterlist__(excel, sheetname, tupledate):
    """Generator behind the list iterator: yield each row of *sheetname*
    as a list of cleaned-up cell values."""
    worksheet = excel.__book__.sheet_by_name(sheetname)
    total_rows = excel.__sheets__[sheetname]['rows']
    index = 0
    while index < total_rows:
        kinds = worksheet.row_types(index)
        cells = worksheet.row_values(index)
        yield excel.__formatrow__(kinds, cells, tupledate)
        index += 1
def __iterdict__(excel, sheetname, tupledate):
    """Generator behind the dictionary iterator: yield each data row of
    *sheetname* as a dict keyed by the sheet's variable names."""
    worksheet = excel.__book__.sheet_by_name(sheetname)
    info = excel.__sheets__[sheetname]
    for index in range(info['firstrow'], info['rows']):
        kinds = worksheet.row_types(index)
        cells = worksheet.row_values(index)
        record = excel.__formatrow__(kinds, cells, tupledate)
        # pad a short row with blanks so every variable gets a value
        shortfall = len(info['variables']) - len(record)
        if shortfall > 0:
            record.extend([''] * shortfall)
        yield dict(zip(info['variables'], record))
## end of http://code.activestate.com/recipes/483742/ }}}

Resources