Python string finder in excel file using lookup table - string

I need a function that returns a matching string from an Excel table. The strings to match come from a lookup table, e.g. a list with a couple of strings: if one of the strings in that list matches the value returned by the function, then print OK. Here is my example:
Let's assume that I got the following table (test_file.xlsx), I want to iterate through the whole worksheet (it's way bigger just pasted a couple columns/rows).
So, here only the "O" column is matching within the desired lookup table.
lookup_table = ['MM1XX', 'MM2XX', 'MC2XX', 'MC3XX', 'MC4XX', 'MS1XX', 'MS2XX', 'MS3XX', 'MS4XX']
This is what I've meant:
import openpyxl
# Path
wb = openpyxl.load_workbook('test_file.xlsx')
# active worksheet data
ws = wb.active
lookup_table = ['MM1XX', 'MM2XX', 'MC2XX', 'MC3XX', 'MC4XX', 'MS1XX', 'MS2XX', 'MS3XX', 'MS4XX']


def wordfinder():
    """Return the first cell value of ``ws`` that is listed in ``lookup_table``.

    The worksheet is scanned row by row, left to right, and the scan stops
    at the first cell whose value equals one of the lookup strings.

    Returns:
        The matching cell value, or ``None`` when no cell matches.
    """
    wanted = set(lookup_table)  # constant-time membership test per cell
    for row in ws.iter_rows():
        for cell in row:
            if cell.value in wanted:
                print("SUCCESS! Found match!")
                return cell.value
    return None


# The function already checks membership in the lookup table, so the caller
# only has to distinguish "found something" from "found nothing".
if wordfinder() is not None:
    print("Match")
else:
    print("No matches found")
I'm hoping for your suggestions. I'd like the function to return only one matched element, if one exists, rather than all of them, and then compare it against the lookup table.

Related

Is there Python code that can access a cell in Excel and change its letters to their opposite form?

I'm new to Python and I need to write a program that changes each letter in a cell to its opposite form (for example, 'Danial' becomes 'Wzmrzo'). It also needs to work out how many names are in the column and which row the name list is in, so that it can change all of the names. The point is to be able to change the names without ever looking at the name list, for privacy reasons. I'm currently using PyCharm and openpyxl, if anyone is wondering. The picture shows the before and after of how it should look. I have made a few attempts, but I can't come up with a way to change the letters. I also tried a replacement dictionary (replacement = {'Danial': 'Wzmrzo'}), but that requires me to look at the name list before I can change the letters.
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter
# Ask for the workbook (file name without the .xlsx extension) and the sheet.
print("Type the file name:")
DF = input()
wb = load_workbook(f"{DF}.xlsx")
print("Sheet Name:")
sht = input()
ws = wb[sht]

# Ask which column holds the names and list every value found in it.
NC = str(input("Where is the Name Column?"))
column = ws[NC]
column_list = [cell.value for cell in column]
print(column_list)

wb.save(f"{DF}.xlsx")
Before
After
Warning: I'm not too familiar with openpyxl and how it accesses rows/columns, and the API seems to have changed a lot in the last few years. So this should give you an idea of how to make it work, but it might not work exactly as written, depending on your version.
To find the name column you could use
# 1-based column number of the "Name" header; stays None until it is found.
name_col = None
# loop along the top row looking for "Name"
# (start=1 because Excel rows/cols are 1-indexed)
for i, x in enumerate(ws.iter_cols(max_row=1), start=1):
    if x[0].value == "Name":
        name_col = i
        break
if name_col is not None:
    # insert name changing code here
    pass  # placeholder so the branch is valid Python until the code is added
else:
    print("'Name' column not found.")
To change the names you could use (insert this in the code above)
# loop down name column
for i, x in enumerate(ws.iter_rows(min_col=name_col, max_col=name_col)):
    # we need to skip the header row so
    if i == 0:
        continue
    name = x[0].value
    if not isinstance(name, str):
        # blank or non-text cell: nothing to swap
        continue
    new_chars = []
    for c in name:
        # Mirror each letter within its own case (a<->z, b<->y, ...);
        # anything that is not an ASCII letter is kept unchanged.
        if "a" <= c <= "z":
            new_c = chr(ord("z") - (ord(c) - ord("a")))
        elif "A" <= c <= "Z":
            new_c = chr(ord("Z") - (ord(c) - ord("A")))
        else:
            new_c = c
        new_chars.append(new_c)
    # Strings are immutable and have no append(), so build the result with join.
    new_name = "".join(new_chars)
    # x[0] is the very cell the name was read from, so write back to it directly.
    x[0].value = new_name

From CSV list to XLSX. Numbers recognise as text not as numbers

I am working with CSV datafile.
From this file I took some specific data. These data are collected into a list that contains strings of words but also numbers (saved as strings, sigh!).
As this:
data_of_interest = ["string1", "string2", "242", "765", "string3", ...]
I create a new XLSX file (it has to be in this format) into which these data are pasted.
The script does the work but on the new XLSX file, the numbers (float and int) are pasted in as text.
I could manually convert their format on excel but it would be time consuming.
Is there a way to do it automatically when writing the new XLSX file?
Here the extract of code I used:
## import library and create excel file and the working sheet
import xlsxwriter
workbook = xlsxwriter.Workbook("newfile.xlsx")
sheet = workbook.add_worksheet('Sheet 1')

## take the data from the list (data_of_interest) from csv file
## paste them inside the excel file, in rows and columns
column = 0
row = 0
for value in data_of_interest:
    # Genuine float/int objects get a numeric cell; everything else
    # (including numbers that are still strings) uses the generic writer.
    if type(value) in (float, int):
        sheet.write_number(row, column, value)
    else:
        sheet.write(row, column, value)
    column += 1
# NOTE(review): indentation was lost in this copy; the row advance is assumed
# to happen once after the values of the current row were written.
row += 1
column = 0
workbook.close()
Is the problem related with the fact that the numbers are already str type in the original list, so the code cannot recognise that they are float or int (and so it doesn't write them as numbers)?
Thank you for your help!
Try int(value) or float(value) before the if block.
All the data you read are strings, so you have to try to convert them to float or int first.
Example:
for value in data_of_interest:
    try:
        # Accept a decimal comma ("3,14"). Only the converted copy is changed,
        # so text that turns out not to be a number is written untouched.
        number = float(value.replace(',', '.'))
    except (AttributeError, ValueError):
        # Not a numeric string (or not a string at all): write it as it is.
        sheet.write(row, column, value)
    else:
        sheet.write_number(row, column, number)
    column += 1
row += 1
column = 0
workbook.close()
The best way to do this with XlsxWriter is to use the strings_to_numbers constructor option:
import xlsxwriter
# strings_to_numbers asks XlsxWriter to store numeric-looking strings as numbers.
workbook = xlsxwriter.Workbook("newfile.xlsx", {'strings_to_numbers': True})
sheet = workbook.add_worksheet('Sheet 1')
data_of_interest = ["string1", "string2", "242", "765", "string3"]
row = 0
# One value per cell, left to right along the first row.
for column, value in enumerate(data_of_interest):
    sheet.write(row, column, value)
workbook.close()
Output: (note that there aren't any warnings about numbers stored as strings):

To extract content of 1st column (all rows) from an .xlsx file and replace it with the extracted information from each column

I have to replace the entire first column (all rows) with information extracted from that column itself. With my code, the last digit is missing from each extracted value.
I have written the code, but I had to save the output to a different file. I am unable to figure out how to replace the first column of the existing file itself. I need a single file containing only the required output.
fname = 'output.xlsx'
wb = openpyxl.load_workbook(fname)
sheet = wb.active
print('The sheet title is: ', sheet.title)

# Values of every cell in column A, top to bottom.
d = [cell.value for cell in sheet['A']]
print(d)

# Keep the trailing characters of each entry. A slice such as [-1:-8] is
# always empty (start lies after stop), and [-8:-1] would drop the last
# character; [-8:] keeps everything up to and including the last one.
# NOTE(review): assumes the account number is the last 8 characters - confirm.
s = ['' if i is None else str(i)[-8:] for i in d]
print('The list of account numbers is: ', s)

# Write the extracted values into column 0 of a new .xls workbook.
wc = xlwt.Workbook()
ws = wc.add_sheet('Sheet1')
col = 0
for row, item in enumerate(s):
    ws.write(row, col, item)
wc.save('FINAL.xls')
I suggest using Python's built-in str.split method:
import openpyxl
fname = 'output.xlsx'
wb = openpyxl.load_workbook(fname)
sheet = wb.active

# Column A values, top to bottom.
d = [cell.value for cell in sheet['A']]
# The text after the last '.' of each entry is kept as the account number.
s = [entry.rsplit('.', 1)[-1] for entry in d]
s[0] = 'Account'  # header: 'Name' becomes 'Account'

# Overwrite column A in place (openpyxl rows/columns are 1-based).
col = 1
for row, item in enumerate(s, start=1):
    sheet.cell(row, col).value = item
wb.save(fname)
I also added list comprehensions, which are a more Pythonic way of creating arrays from data in many cases.

Creating a dictionary from one excel workbook, matching the keys with another workbook, paste values

I hope someone can provide a little help. I'm attempting to pull data from one excel workbook, titled DownTime, and create a dictionary of coil(product) numbers matched with "codes" that coil has experienced. I have been able to accomplish this part, it's pretty straight forward.
The part that is tripping me up, is how to match the coil numbers with a different excel workbook, and paste in the corresponding "codes".
So here is what I have so far:
import openpyxl
from collections import defaultdict
# opening needed workbooks with specific worksheets
DT = openpyxl.load_workbook('DownTime.xlsm')
bl2 = DT['BL2']
# keep_vba=True so the macros of the .xlsm survive when the file is saved again
CS = openpyxl.load_workbook('CoilSummary.xlsm', keep_vba=True)
line = CS['BL2']

col = 32  # column AF, where the codes are pasted

# Creating a dictionary that represents each coil with corresponding codes
# (coil number from column K, code from column D, header row skipped)
code = defaultdict(set)
for row in range(2, bl2.max_row + 1):
    coil = bl2['K' + str(row)].value
    rc = bl2['D' + str(row)].value
    code[coil].add(rc)

# Matching the coil numbers in column B with the dictionary:
# if the coil is a known key, paste its codes in column AF of the same row
for row in range(2, line.max_row + 1):
    cnum = line['B' + str(row)].value
    if cnum in code:
        # A cell cannot hold a set, so the codes are stored as one text value.
        codes_text = ', '.join(sorted(str(rc) for rc in code[cnum]))
        line.cell(row=row, column=col).value = codes_text

# Without save() the pasted values would be discarded when the script ends.
# NOTE(review): this overwrites CoilSummary.xlsm - work on a copy first.
CS.save('CoilSummary.xlsm')
CS.close()
DT.close()
A sample output of the dictionary looks as follows:
{'M30434269': {106, 107, 173}, 'M30434270': {132, 424, 106, 173, 188}, 'M30434271': {194, 426, 202, 106, 173}}
Only there are about 22,000 entries.
So to reiterate what I want to accomplish:
I want to take this dictionary that I made from the workbook DownTime, match the keys with a column in CoilSummary, and if the keys match the cell entry, paste the value into a blank cell at the end of the table.
Example:
"CoilNum" "Date" "Shift" "info1" "info2" "Code"
M30322386 03/03/2017 06:48:30 3 1052 1722 ' '
M30322390 03/03/2017 05:18:26 3 703 1662 ' '
I would like to match the "CoilNum" with the keys in the dictionary, and paste the values into "Code".
I hope I explained that well enough. Any help with the code, or point to a website for reference, would be very much appreciated. I just don't want to have to type all of these codes in by hand!
Thank you!
After much research and trial and error, accidentally corrupting excel files and getting generally frustrated with python and excel, I figured it out. Here is what I have:
# -*- coding: utf-8 -*-
# importing tools needed for the code to work
import pyexcel as pe
from collections import defaultdict
import openpyxl as op
# Module-level state shared by the functions below.
# NOTE(review): coil, rc, next_row, col and temp do not appear to be read by
# the functions visible in this script (write_data uses its own locals).
coil =''
rc = {}
# Filled by open_downtime(): key = value read from column 11 of the DownTime
# sheet, value = list of the entries read seven columns to its left.
code = defaultdict(list)
next_row = 2
col = 33
# Filled by open_coil() with the values read from column 2 of the CoilSummary sheet.
cnum = []
temp = ''
def write_data(code,cnum):
''' Compare the coil numbers in cnum with the keys of code and save the
collected values as rows of a new workbook.

code -- mapping whose keys are coil numbers (as returned by open_downtime()).
cnum -- list of coil numbers (as filled by open_coil()).

The sheet is loaded from "CoilSummaryTesting.xlsx" with pyexcel, the collected
entries are appended to it and the result is saved as "CoilSummaryTest.xlsx".
From there the values can be copied by hand and pasted into the excel file of choice.
Prints the final value of the row counter; there is no return statement.'''
sheet = pe.get_sheet(file_name="CoilSummaryTesting.xlsx")
next_row = 2
lst = []
# NOTE(review): the indentation of this loop nest was lost in this copy, so the
# exact nesting of the statements and the two breaks cannot be confirmed here.
while next_row <= len(cnum):
for key in code.keys():
for step in cnum:
# compared as text on both sides
if str(step) == str(key):
# NOTE(review): this walks the values of every key, not only code[key] - confirm intent.
for val in code.values():
temp = val
lst.append(temp)
next_row+=1
if step!=key:
break
break
for item in lst:
# str(item) is joined character by character, so every character of the
# entry's text (brackets and commas included) ends up separated by a space.
sublist = (" ").join(str(item))
# presumably appends one single-cell row to the pyexcel sheet - verify against the pyexcel docs
sheet.row+= [sublist]
sheet.save_as("CoilSummaryTest.xlsx")
print("\nCoils Compared: ",next_row)
def open_downtime():
    """Collect the downtime codes recorded for each coil.

    Opens sheet 'BL2' of 'DownTime.xlsm' and walks down column 11. For every
    cell visited, the cell one row below supplies the key (coil number) and
    the cell seven columns to the left of that one supplies the value that is
    appended to the module-level ``code`` mapping.

    Returns:
        The module-level ``code`` mapping (key -> list of collected values).
    """
    DT = op.load_workbook('DownTime.xlsm')
    # wb[name] replaces the deprecated get_sheet_by_name()
    bl2 = DT['BL2']
    n = 1
    for column_cells in bl2.iter_cols(min_col=11, max_col=11):
        for colD in column_cells:
            # offset(row=1) reads the row below, which skips the header row.
            # NOTE(review): for the last cell this reads one row past the data,
            # so an empty (None) key is also recorded - confirm this is acceptable.
            coil_cell = colD.offset(row=1, column=0)
            code_cell = colD.offset(row=1, column=-7)
            code[coil_cell.value].append(code_cell.value)
            n += 1
    print('\nNumber of rows in DownTime file: ',n)
    return code
def open_coil():
    """Read the coil numbers to compare and hand them to write_data().

    Opens sheet 'BL2' of 'CoilSummaryTesting.xlsx' and walks down column 2.
    For every cell visited, the value of the cell one row below is appended
    to the module-level ``cnum`` list.

    Returns:
        Whatever ``write_data(open_downtime(), cnum)`` returns.
    """
    i = 1
    CSR = op.load_workbook('CoilSummaryTesting.xlsx')
    # wb[name] replaces the deprecated get_sheet_by_name()
    line_read = CSR['BL2']
    for column_cells in line_read.iter_cols(min_col=2, max_col=2):
        for cell in column_cells:
            # offset(row=1) reads the row below, which skips the header row
            cnum.append(cell.offset(row=1, column=0).value)
            i += 1
    print('\nNumber of rows in CoilSummary file: ',i)
    return write_data(open_downtime(),cnum)
def main():
    """Script entry point: run open_coil(), which drives the remaining steps."""
    open_coil()


if __name__ == "__main__":
    main()
I understand this is probably not the shortest version of this code and there are probably a lot of ways to get it to paste directly into the excel file of my choice, but I couldn't figure that part out yet.
What I did differently was to use pyexcel. This proved to be the easiest option when it came to just pasting values into rows or columns. Using join, I broke the generated list of lists up so that each sublist could be inserted in its own row. For now I have settled on saving the generated rows to a different Excel workbook, because I kept corrupting workbooks during this exploration; however, if anyone knows how to change this code to eliminate the last step of copying the rows and pasting them into the desired workbook, please let me know.

How to automatically copy text out of excel files

I have 250 Microsoft Excel (.xls) files, all in a folder. I need to do the following:
for each file:
open the file
switch to a specific tab in the file
extract the text from rows 15-100 on that tab
save the text in a text file somewhere
I assume this can be automated somehow, but I have no idea how. Where do I start looking to figure out how to do this? I really don't want to open 250 excel files and copy text out by hand, as that would take hours. :(
Since you already have Excel, you can create an Excel macro in a separate workbook to do this; just make sure that workbook is outside of the directory you are parsing. You'll need to add a reference for the FileSystemObject, which should be found in C:\Windows\System32\scrrun.dll.
Option Explicit
Sub ExtractData()
    ' Appends column A, rows 15 to 100, of the first sheet of every .xls
    ' workbook found in C:\FolderToScan to C:\FolderToScan\ExcelOutput.txt.
    ' Requires a project reference to scrrun.dll for FileSystemObject.
    Dim fso As New FileSystemObject
    Dim oFile As File
    Dim oFolder As Folder
    Dim sFileOutput As String
    Dim fNum As Integer
    Dim excelFile As Excel.Workbook
    Dim excelWorksheet As Excel.Worksheet
    Dim i As Integer

    sFileOutput = "C:\FolderToScan\ExcelOutput.txt"
    Set oFolder = fso.GetFolder("C:\FolderToScan")

    ' Open the output once instead of re-opening it for every workbook.
    fNum = FreeFile()
    Open sFileOutput For Append As #fNum

    For Each oFile In oFolder.Files
        ' Compare the extension case-insensitively so "REPORT.XLS" is not skipped.
        If LCase$(fso.GetExtensionName(oFile.Name)) = "xls" Then
            ' Read-only: the workbooks are only read, never modified.
            Set excelFile = Workbooks.Open(Filename:=oFile.Path, ReadOnly:=True)
            Set excelWorksheet = excelFile.Sheets(1)
            'Or:
            ' Set excelWorksheet = excelFile.Sheets("Name of your sheet")
            For i = 15 To 100
                Write #fNum, excelWorksheet.Cells(i, 1)
            Next
            ' Close without the "save changes?" prompt so the batch runs unattended.
            excelFile.Close SaveChanges:=False
            Set excelFile = Nothing
        End If
    Next

    Close #fNum
End Sub
This can be solved quickly with Python and the xlrd module. I copied the following example from activestate.com; it is easy to adapt to your needs.
## {{{ http://code.activestate.com/recipes/483742/ (r3)
class readexcel(object):
    """ Simple OS Independent Class for Extracting Data from Excel Files
        using the xlrd module found at http://www.lexicon.net/sjmachin/xlrd.htm

        Versions of Excel supported: 2004, 2002, XP, 2000, 97, 95, 5, 4, 3
        xlrd version tested: 0.5.2

        Data is extracted by creating a iterator object which can be used to
        return data one row at a time. The default extraction method assumes
        that the worksheet is in tabular format with the first nonblank row
        containing variable names and all subsequent rows containing values.
        This method returns a dictionary which uses the variables names as keys
        for each piece of data in the row.  Data can also be extracted with
        each row represented by a list.

        Extracted data is represented fairly logically. By default dates are
        returned as strings in "yyyy/mm/dd" format or "yyyy/mm/dd hh:mm:ss",
        as appropriate.  However, dates can be return as a tuple containing
        (Year, Month, Day, Hour, Min, Second) which is appropriate for usage
        with mxDateTime or DateTime.  Numbers are returned as either INT or
        FLOAT, whichever is needed to support the data.  Text, booleans, and
        error codes are also returned as appropriate representations.

        Quick Example:
        xl = readexcel('testdata.xls')
        sheetnames = xl.worksheets()
        for sheet in sheetnames:
            print(sheet)
            for row in xl.getiter(sheet):
                # Do Something here
        """

    def __init__(self, filename):
        """ Returns a readexcel object of the specified filename - this may
        take a little while because the file must be parsed into memory

        Raises NameError when filename is not an existing file. """
        import os.path
        # Validate the argument before loading xlrd and parsing anything.
        if not os.path.isfile(filename):
            # Call form of raise: valid on Python 2 and Python 3 alike.
            raise NameError("%s is not a valid filename" % filename)
        import xlrd
        self.__filename__ = filename
        self.__book__ = xlrd.open_workbook(filename)
        self.__sheets__ = {}
        self.__sheetnames__ = []
        for i in self.__book__.sheet_names():
            uniquevars = []
            firstrow = 0
            sheet = self.__book__.sheet_by_name(i)
            for row in range(sheet.nrows):
                types, values = sheet.row_types(row), sheet.row_values(row)
                if any(j != '' for j in values):
                    # Generate a listing of Unique Variable Names for Use as
                    # Dictionary Keys In Extraction. Duplicate Names will
                    # be replaced with "F#"
                    variables = self.__formatrow__(types, values, False)
                    unknown = 1
                    for var in variables:
                        # Compare the stringified name: the list stores
                        # strings, so a numeric header would otherwise never
                        # be recognised as a duplicate.
                        name = str(var)
                        if name == '' or name in uniquevars:
                            name = 'F' + str(unknown)
                            # NOTE(review): indentation was lost in this copy;
                            # the counter is assumed to advance only when a
                            # placeholder name is issued - confirm upstream.
                            unknown += 1
                        uniquevars.append(name)
                    # Data starts on the row after the header row.
                    firstrow = row + 1
                    break
            self.__sheetnames__.append(i)
            self.__sheets__[i] = {
                'rows': sheet.nrows,
                'cols': sheet.ncols,
                'firstrow': firstrow,
                'variables': uniquevars[:],
            }

    def getiter(self, sheetname, returnlist=False, returntupledate=False):
        """ Return an generator object which yields the lines of a worksheet;
        Default returns a dictionary, specifing returnlist=True causes lists
        to be returned.  Calling returntupledate=True causes dates to returned
        as tuples of (Year, Month, Day, Hour, Min, Second) instead of as a
        string

        Raises NameError when sheetname is not a sheet of this workbook. """
        if sheetname not in self.__sheets__:
            raise NameError("%s is not present in %s" % (sheetname,
                                                         self.__filename__))
        if returnlist:
            return __iterlist__(self, sheetname, returntupledate)
        return __iterdict__(self, sheetname, returntupledate)

    def worksheets(self):
        """ Returns a list of the Worksheets in the Excel File """
        return self.__sheetnames__

    def nrows(self, worksheet):
        """ Return the number of rows in a worksheet """
        return self.__sheets__[worksheet]['rows']

    def ncols(self, worksheet):
        """ Return the number of columns in a worksheet """
        return self.__sheets__[worksheet]['cols']

    def variables(self, worksheet):
        """ Returns a list of Column Names in the file,
            assuming a tabular format of course. """
        return self.__sheets__[worksheet]['variables']

    def __formatrow__(self, types, values, wanttupledate):
        """ Internal function used to clean up the incoming excel data

        types and values are the per-cell type codes and raw values of one
        row; the cleaned values are returned as a new list. """
        ##  Data Type Codes:
        ##  EMPTY 0
        ##  TEXT 1 a Unicode string
        ##  NUMBER 2 float
        ##  DATE 3 float
        ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
        ##  ERROR 5
        import xlrd
        returnrow = []
        for cell_type, value in zip(types, values):
            if cell_type == 2:
                # Whole-valued floats are returned as int.
                if value == int(value):
                    value = int(value)
            elif cell_type == 3:
                datetuple = xlrd.xldate_as_tuple(value, self.__book__.datemode)
                if wanttupledate:
                    value = datetuple
                elif datetuple[:3] == (0, 0, 0):
                    # time only no date component
                    value = "%02d:%02d:%02d" % datetuple[3:]
                elif datetuple[3:] == (0, 0, 0):
                    # date only, no time
                    value = "%04d/%02d/%02d" % datetuple[:3]
                else:
                    # full date
                    value = "%04d/%02d/%02d %02d:%02d:%02d" % datetuple
            elif cell_type == 5:
                value = xlrd.error_text_from_code[value]
            returnrow.append(value)
        return returnrow
def __iterlist__(excel, sheetname, tupledate):
    """ Generator used by getiter(): yields each row of the sheet as a list """
    worksheet = excel.__book__.sheet_by_name(sheetname)
    total_rows = excel.__sheets__[sheetname]['rows']
    for index in range(total_rows):
        row_types = worksheet.row_types(index)
        row_values = worksheet.row_values(index)
        yield excel.__formatrow__(row_types, row_values, tupledate)
def __iterdict__(excel, sheetname, tupledate):
    """ Generator used by getiter(): yields each data row as a dictionary
    keyed by the sheet's variable names """
    worksheet = excel.__book__.sheet_by_name(sheetname)
    info = excel.__sheets__[sheetname]
    for index in range(info['firstrow'], info['rows']):
        row_types = worksheet.row_types(index)
        row_values = worksheet.row_values(index)
        cells = excel.__formatrow__(row_types, row_values, tupledate)
        # Pad a Short Row With Blanks if Needed
        shortfall = len(info['variables']) - len(cells)
        if shortfall > 0:
            cells.extend([''] * shortfall)
        yield dict(zip(info['variables'], cells))
## end of http://code.activestate.com/recipes/483742/ }}}

Resources