xlrd named range example? - python-3.x

I have an excel spreadsheet that I am trying to parse with xlrd. The spreadsheet itself makes extensive use of named ranges.
If I use:
for name in book.name_map:
print(name)
I can see all of the names are there.
However I can't make any of the methods work (cell method and area2d). Can anyone give me an example of the syntax to be able to read the cell range that a name is pointing to given the name.
The Excel file is an XLSM file with lots of visual basic that also operates on these named ranges.

I think that the naming support in XLRD is broken for XLSM files but I found an answer by switching to openpyxl. This has a function get_named_ranges() which contains all of the named ranges. The support after that is a bit thin so I wrote my own class to turn the named ranges in my spreadsheet into a class where I can access the same information using the same names.
# -- coding: utf-8 --
"""
Created on Wed Sep 14 09:42:09 2016
#author: ellwood
"""
from openpyxl import load_workbook
class NamedArray(object):
    ''' Named range object

    Wraps one workbook defined name and exposes the sheet, the corner
    addresses and the size of the cell range it points to.

    Attributes set by ``__init__``:
        wb, sheet, loc        -- workbook, sheet title and sheet object
        has_range, has_value  -- True for a multi-cell range / a single cell
        ad_lo, ad_hi          -- top-left and bottom-right address, no '$'
        min_row, max_row, rows, min_col, max_col, cols -- 1-based extents

    ``rows`` and ``cols`` are plain integer attributes.  The original also
    declared ``rows()`` / ``cols()`` methods, but the instance attributes
    shadowed them, so they could never be called; they are dropped here.
    '''
    C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def __init__(self, workbook, named_range_raw):
        ''' Initialise a NameArray object from the named_range_raw information in the given
        workbook

        ``str(named_range_raw)`` must look like ``..."Sheet"!$A$1:$C$3``:
        the sheet title between double quotes, a '!', then the address.
        '''
        self.wb = workbook
        sheet_str, cellrange_str = str(named_range_raw).split('!')
        self.sheet = sheet_str.split('"')[1]
        self.loc = self.wb[self.sheet]
        corners = cellrange_str.replace('$', '').split(':')
        self.has_range = len(corners) > 1
        self.has_value = not self.has_range
        self.ad_lo = corners[0]
        # A single cell is treated as a 1x1 range whose corners coincide.
        self.ad_hi = corners[1] if self.has_range else corners[0]
        self.min_row = self.get_row(self.ad_lo)
        self.max_row = self.get_row(self.ad_hi)
        self.rows = self.max_row - self.min_row + 1
        self.min_col = self.col_to_n(self.ad_lo)
        self.max_col = self.col_to_n(self.ad_hi)
        self.cols = self.max_col - self.min_col + 1

    def size_of(self):
        ''' Returns two dimensional size of named space as (cols, rows)
        '''
        return self.cols, self.rows

    def value(self, r=1, c=1):
        ''' Returns the value at row r, column c (both 1-based, relative to
        the top-left corner of the range).

        For a single-cell name r and c are ignored.
        Raises IndexError when r or c falls outside the range.
        '''
        if self.has_value:
            return self.loc[self.ad_lo].value
        if not 1 <= r <= self.rows:
            raise IndexError('row %r outside 1..%d' % (r, self.rows))
        if not 1 <= c <= self.cols:
            raise IndexError('column %r outside 1..%d' % (c, self.cols))
        coordinate = self.n_to_col(self.min_col + c - 1) + str(self.min_row + r - 1)
        # Index the sheet by coordinate; ws.cell('A1') no longer exists in
        # current openpyxl releases.
        return self.loc[coordinate].value

    def is_range(self):
        ''' if true then name defines a table more than 1 cell
        '''
        return self.has_range

    def is_value(self):
        ''' if true then name defines the location of a single value
        '''
        return self.has_value

    def __str__(self):
        ''' printed description of named space
        '''
        if self.is_range():
            locs = 's ' + self.ad_lo + ':' + self.ad_hi
        else:
            locs = ' ' + self.ad_lo
        return ('named range' + str(self.size_of()) + ' in sheet ' + self.sheet
                + ' # location' + locs)

    @classmethod
    def get_row(cls, ad):
        ''' get row number from cell string
        Cell string is assumed to be in excel format i.e "ABC123" where row is 123
        '''
        row = 0
        for l in ad:
            if l in "1234567890":
                row = row*10 + int(l)
        return row

    @classmethod
    def col_to_n(cls, ad):
        ''' find column number from xl address
        Cell string is assumed to be in excel format i.e "ABC123" where column is ABC
        column number is 1-based: A=1, Z=26, AA=27, ...
        '''
        n = 0
        for l in ad:
            if l in cls.C_CAPS:
                n = n*26 + cls.C_CAPS.find(l)+1
        return n

    @classmethod
    def n_to_col(cls, n):
        ''' make xl column address from 1-based column number
        '''
        ad = ''
        while n > 0:
            # Column letters have no zero digit, so shift by one before
            # dividing; otherwise 26 would come out as 'AZ' instead of 'Z'.
            n, rem = divmod(n - 1, 26)
            ad = cls.C_CAPS[rem] + ad
        return ad
class Struct(object):
    ''' class which turns a dictionary into a structure

    Every keyword argument becomes an attribute of the instance.
    '''

    def __init__(self, **entries):
        self.__dict__.update(entries)

    def __repr__(self):
        # One "name : repr(value)" entry per attribute, newline-separated.
        return '<%s>' % '\n '.join(
            '%s : %s' % (k, repr(v)) for (k, v) in self.__dict__.items())
def get_names(workbook):
    ''' Get a structure containing all of the names in the workbook

    Only defined names starting with 'n_' are kept; the prefix is removed
    and the remainder becomes the attribute name on the returned Struct.
    Each kept name is also printed.
    '''
    # Use the argument, not the module-level ``wb`` the original relied on.
    named_ranges = workbook.get_named_ranges()
    name_list = {}
    for named_range in named_ranges:
        name = named_range.name
        if name.startswith('n_'):
            # only store the names beginning with 'n_'
            name_list[name[2:]] = NamedArray(workbook, str(named_range))
    for item, array in name_list.items():
        print(item, '=', array)
    return Struct(**name_list)
# ------------------
# program example
# -----------------
# Load the macro-enabled workbook used for the demonstration.
wb = load_workbook('test.xlsm', data_only=True)
# Collect the 'n_'-prefixed defined names; each becomes an attribute of n.
n = get_names(wb)
# 'my_name' is the attribute produced by a defined name called 'n_my_name'.
print(n.my_name.value())
One small optimisation is that I prefixed all of the names I was interested in importing with 'n_' so I could then ignore any built-in Excel names. I hope this is useful to someone.

Related

pd.rename key KeyError: 'New_Name'

Edit 12/07/19: The problem was not in fact with the pd.rename function but with the fact that I did not return the pandas dataframe from the function, and as a result the column change did not exist when printing, i.e.
def change_column_names(as_pandas, old_name, new_name):
    """Rename column ``old_name`` to ``new_name`` in place and return the frame."""
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- This was missing
Please see the user comment below to uptick them for finding this error for me.
Alternatively, you can continue reading.
The data can be downloaded from this link, yet I have added a sample dataset. The formatting of the file is not a typical CSV file and I believe this may have been an assessment piece and is related to Hidden Decision Tree article. I have given the portion of the code as it solves the issues surrounding the format of the text file as mentioned above and allows the user to rename the column.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    """Rename column ``old_name`` to ``new_name`` in place.

    Returns None: this is the version from the question, which does not
    hand the frame back to the caller.
    """
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seems to work when I set the variable names inside the rename function.
def change_column_names(as_pandas):
    """Rename 'Unique Pageviews' to 'Page_Views' in place and return the frame."""
    renames = {'Unique Pageviews': 'Page_Views'}
    as_pandas.rename(columns=renames, inplace=True)
    return as_pandas
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the Key Error:
def change_column_names(as_pandas, old_name='Unique Pageviews',
                        new_name='Page_Views'):
    """Rename one column of ``as_pandas`` in place and return the frame.

    This replaces three successive definitions that shadowed one another.
    It accepts both call shapes they offered: ``change_column_names(df)``
    renames 'Unique Pageviews' to 'Page_Views', and
    ``change_column_names(df, old, new)`` renames an arbitrary column.
    A column name that is not present is left untouched by ``rename``.
    """
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas
def open_as_dataframe(file_name_in):
    """Load the comma-separated file ``file_name_in`` (windows-1251) as a DataFrame."""
    return pd.read_csv(file_name_in, encoding='windows-1251')
# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    """Join the given string fields into one comma-separated line.

    Returns ``'a,b,c\\n'`` for ``('a', 'b', 'c')`` and the empty string
    when called without arguments.  Raises TypeError if a field is not a
    string.

    The original wrapped the loop in ``except UnboundLocalError``; that
    handler could never fire, so it is removed.
    """
    if not args:
        return ''
    return ','.join(args) + '\n'
def open_file(file_name, out_name='HDT_data5.txt'):
    """Convert the tab-separated ``file_name`` into the comma-separated ``out_name``.

    Commas are stripped from the first field of each row (the title) so it
    cannot be split when the output is read back as CSV.  Blank input lines
    are skipped.  A missing or unreadable input only prints a message.
    """
    import csv  # local import: this snippet has no module-level csv import

    try:
        with open(file_name, newline='') as csv_file_in, \
                open(out_name, 'w') as csv_file_out:
            for row in csv.reader(csv_file_in, delimiter='\t'):
                if not row:
                    # csv.reader yields [] for blank lines; row[0] would
                    # raise an IndexError that nothing here catches.
                    continue
                row[0] = row[0].replace(',', '')
                try:
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
            print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')
# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the v file contents the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    """Convert the raw data file, load it, rename the page-view column and print it."""
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # The original passed the undefined name ``multi_set`` plus two
    # placeholder column names to a one-argument function.  The rename is
    # done in place, so the return value is not needed here.
    change_column_names(multi_sets)
    print(multi_sets)


main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math
def set_new_columns(as_pandas):
    """Append one zero-filled column per keyword title, in the listed order."""
    new_titles = (
        'Year > 2014', 'Forum', 'Blog', 'Python', 'R',
        'Machine_Learning', 'Data_Science', 'Data',
        'Analytics',
    )
    for title in new_titles:
        # Always insert after the current last column.
        as_pandas.insert(as_pandas.shape[1], title, 0)
def title_length(as_pandas):
    """Append a 'Title_Length' column with the character count of each 'Title'."""
    column_name = 'Title_Length'
    # Create the column at the far right first, then fill it.
    as_pandas.insert(as_pandas.shape[1], column_name, 0)
    as_pandas[column_name] = as_pandas['Title'].map(str).apply(len)
# A difference of logs (logX1 - logX2) can be read as a relative change,
# so this column can be thought of as the percentage change in Page Views.
# map applies the function to every row of the 'Page_Views' column.
def log_page_view(as_pandas):
    """Append a 'Log_Page_Views' column holding log(1 + Page_Views) per row."""

    def _log_plus_one(views):
        return math.log(1 + float(views))

    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(_log_plus_one)
def change_to_numeric(as_pandas):
    """Blank out empty strings and make 'Page_Views' numeric, in place.

    Whitespace-only cells anywhere in the frame become NaN, then
    'Page_Views' is converted with ``errors='coerce'`` so unparsable
    entries also become NaN.  Returns the same frame for convenience.

    The original assigned ``as_pandas.replace(...)`` to the local name,
    which bound it to a copy; the caller's frame was never modified.
    """
    as_pandas.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')
    return as_pandas
def change_column_names(as_pandas):
    """Rename the 'Unique Pageviews' column to 'Page_Views' and return the frame.

    The rename is done in place, so the returned object is the one passed in.
    """
    column_map = {'Unique Pageviews': 'Page_Views'}
    as_pandas.rename(inplace=True, columns=column_map)
    return as_pandas
def open_as_dataframe(file_name_in):
    """Read ``file_name_in`` as CSV text encoded in windows-1251 and return the DataFrame."""
    return pd.read_csv(file_name_in, encoding='windows-1251')
# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    """Build one CSV line from the given string fields.

    The fields are joined with commas and terminated with a newline.  With
    no arguments the result is the empty string.  A non-string field
    raises TypeError.

    The ``except UnboundLocalError`` branch of the original was
    unreachable and has been dropped.
    """
    return ','.join(args) + '\n' if args else ''
def open_file(file_name, out_name='HDT_data5.txt'):
    """Rewrite the tab-separated ``file_name`` as comma-separated ``out_name``.

    Commas inside the first field of a row are removed before writing so
    the title does not spill into extra columns.  Blank lines are skipped.
    If the input cannot be opened a message is printed and nothing is raised.
    """
    import csv

    try:
        with open(file_name, newline='') as csv_file_in, \
                open(out_name, 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                if not row:
                    # A blank line parses to []; indexing it would raise an
                    # IndexError that is not handled below.
                    continue
                row[0] = row[0].replace(',', '')
                try:
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
            print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')
# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the v file contents the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    """Run the whole pipeline: convert, load, rename, enrich and print the data."""
    open_file('HDTdata3.txt')
    multi_sets = change_column_names(open_as_dataframe('HDT_data5.txt'))
    # Each step receives the frame and adds or adjusts columns on it.
    enrichment_steps = (change_to_numeric, log_page_view, title_length,
                        set_new_columns)
    for step in enrichment_steps:
        step(multi_sets)
    print(multi_sets)


main()

Exception Occurred (..) - 'Microsoft Excel', 'SaveAs ... class failed'

This code basically does what it needs to do. But it does throw an error sometimes whereas I would expect the code to work as all data necessary is present.
So what it does is: (1) Read in users (2) Add information to the Excel dashboard, such as header information, word clouds and profile picture.
Then it saves the Excel file. It sometimes, randomly almost, gives an error: (pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, 'Microsoft Excel', 'Open method of Workbooks class failed', 'xlmain11.chm', 0, -2146827284), None)). What part of the code could cause this?
import os
import xlwings as xw
import pandas as pd
import openpyxl
def get_users(file_name):
    """Read all the users from the csv file.

    Returns the first comma-separated field of every line, in file order.
    The file is closed even if reading fails part-way.
    """
    with open(file_name, 'r') as f:
        return [line.strip().split(',')[0] for line in f]
def read_csv_file(file_name):
    """Return csv file with accounts.

    Each line becomes one tuple of its ';'-separated fields.  The file is
    closed even if reading fails part-way.
    """
    with open(file_name, 'r') as f:
        return [tuple(line.strip().split(';')) for line in f]
def write_panda_to_excel(df, start_cell, wb):
    """Write Pandas DataFrame to Excel.

    The frame is assigned to the range starting at ``start_cell`` on the
    'Input' sheet of ``wb``.
    """
    target_range = wb.sheets['Input'].range(start_cell)
    target_range.value = df
def add_word_cloud(name, cell, wb):
    """Add the WordCloud to Sheet2

    ``name`` is an image file in the current working directory.  It is
    placed at the top-left corner of ``cell`` with a fixed 325x155 size.
    """
    sht = wb.sheets['Sheet2']
    # Build the absolute path with os.path.join rather than a hard-coded
    # backslash, so the separator always matches the platform.
    picture_path = os.path.join(os.getcwd(), name)
    rng = sht.range(cell)
    sht.pictures.add(picture_path, top=rng.top, left=rng.left, width=325, height=155)
def add_profile_picture(user, cell, wb):
    """Add the user's profile picture to Sheet1.

    The picture is the first file in the current working directory whose
    name starts with ``user + '.'``.  It is placed at the top-left corner
    of ``cell`` with a fixed 70x90 size.

    Raises IndexError when no such file exists.
    """
    sht = wb.sheets['Sheet1']
    picture = [f for f in os.listdir('.') if f.startswith(user + '.')][0]
    # os.path.join picks the platform separator instead of a literal '\\'.
    picture_path = os.path.join(os.getcwd(), picture)
    rng = sht.range(cell)
    sht.pictures.add(picture_path, top=rng.top, left=rng.left, width=70, height=90)
def _append_line(path, text):
    """Append ``text`` followed by a newline to the log file at ``path``."""
    with open(path, 'a') as log_file:
        log_file.write(text + '\n')


# Resolve every directory once, relative to the parent of the start-up
# directory.  The original navigated with relative os.chdir() calls, so
# one failure left the process in the wrong directory for all later users.
root_dir = os.path.abspath('..')
accounts_dir = os.path.join(root_dir, 'FolderA')
data_dir = os.path.join(root_dir, 'Data')
template_dir = os.path.join(root_dir, 'Folder5', 'FolderE')
log_dir = os.path.join(root_dir, 'Folder6')

app = xw.App(visible=False)
try:
    # Read users
    file_name = 'accounts_file.csv'
    users = get_users(os.path.join(accounts_dir, file_name))
    os.chdir(data_dir)
    for i, user in enumerate(users):
        wb = None
        try:
            print(100 * '-')
            print(len(users), i+1, user)
            # open the dashboard template
            wb = xw.Book(os.path.join(template_dir,
                                      'Twitter - Individuele Rapportage.xlsm'))
            # The picture helpers look files up in the current directory.
            user_dir = os.path.join(data_dir, user)
            os.chdir(user_dir)
            # Remove file if exists
            for stale in [e for e in os.listdir('.') if e.endswith('.xlsm')]:
                os.remove(stale)
            # add the csv data, profile picture and word cloud to the dashboard
            df = pd.read_csv(user + '_header_info.csv', sep=',')
            write_panda_to_excel(df, 'A1', wb)
            try:
                add_profile_picture(user, 'L20', wb)
            except Exception:
                # A missing picture is logged and the dashboard is still built.
                _append_line(os.path.join(log_dir,
                                          'Twitter - Profile picture Error.txt'),
                             str(user))
            add_word_cloud(user + '_WC.png', 'Y46', wb)
            try:
                wb.save(os.path.join(user_dir, 'Twitter' + user + '.xlsm'))
            except Exception:
                _append_line(os.path.join(log_dir,
                                          'Twitter - Dashboard Generation Errors.txt'),
                             str(user))
        except OSError as exception:
            print(exception)
            # Append, so earlier failures are not overwritten by later ones.
            _append_line(os.path.join(data_dir, 'dash_errors.txt'), user)
        finally:
            # Always release the template.  The original skipped close()
            # when save() failed, leaving the workbook open in the hidden
            # Excel instance for the next iteration to trip over.
            if wb is not None:
                wb.close()
            os.chdir(data_dir)
finally:
    # Shut down the hidden Excel process started above.
    app.quit()

Properly using dataclasses to return values of items

The project is to sort items - using a particular algorithm - into boxes. I am having trouble after assigning each items to the proper class, to return to another function and use and modify the data held within the object in the data class.
My testing file looks like this:
17 10 4
Abacus 3
Blender 5
Chessboard 3
Dishes 6
My classes:
from dataclasses import dataclass


@dataclass
class InventoryItem:
    """An item to be packed: its name and its weight."""
    name: str
    weight: float
from dataclasses import dataclass, field


@dataclass
class BoxInventory:
    """A box with a weight limit, the capacity still free, and its contents."""
    name: str
    maxWeight: float
    remainingWeight: float
    # The field is annotated as a dict but defaulted to "".  A default
    # factory gives every box its own empty dict.
    contents: dict = field(default_factory=dict)
"""
def listContents(self, contents):
self.listContents = contents
def remainingWeight(self, remainingWeight):
self.remainingWeight = remainingWeight
def addItemWeight(self, itemWeight):
self.remainingWeight -= itemWeight
def addItemList(self, itemName, itemWeight, contents):
self.contents = contents[itemName] = contents[itemWeight]
"""
Here is where I read my text file and transfer it to a class:
"""
Take the given txt file and format into proper list for boxes and items
:param filename: The filename of the text file
:return: Send lists to to be used by an algo.
"""
with open(filename, 'r') as myFile: # Open the correct file
itemDict = {}
boxDict = {}
myList = [line.split() for line in myFile.readlines()]
boxLine = ' '.join(myList[0])
for line in range(1, len(myList)):
lines = ''.join(myList[line])
itemName = lines[:-1]
weight = lines[len(lines) - 1:]
item = InventoryItem(itemName, int(weight))
itemDict[itemName] = [item]
boxString = ""
count = 0
for char in boxLine:
if char != " ":
boxString = boxString + char
else:
boxName = "Box" + str(count)
box = BoxInventory(boxName, int(boxString), int(boxString))
boxDict[boxName] = [box]
boxString = ""
count += 1
myReturn = {}
myReturn['boxDict'] = boxDict
myReturn['itemDict'] = itemDict
return myReturn
Non-implemented algorithm:
def roomiest(myReturnDict):
    """
    Intended algorithm (not implemented yet): for each item, find the box
    with the greatest remaining allowed weight that can support the item
    and place the item in that box.
    :param myReturnDict: mapping with the parsed data under the keys
                         "itemDict" and "boxDict"
    :return: nothing yet.  The planned result is: whether all items fit (1);
             items per box with individual weights (2); box name with max
             weight (3); items with their weights that were left behind (4)
    """
    itemList, boxList = (myReturnDict.get(key) for key in ("itemDict", "boxDict"))
My problem is that I do not know how to read the parsed data from my
fileReader function in my algorithm function.
Your input function is a little strange as you're storing the objects in a list of length 1 inside a dictionary. So your data looks like:
'Dishes': [InventoryItem(name='Dishes', weight=6)]
instead of
'Dishes': InventoryItem(name='Dishes', weight=6)
You might have a reason for it, but changing itemDict[itemName] = [item] to itemDict[itemName] = item makes your code a little easier to follow (and the same for boxDict[boxName] = [box]). With that change you can access the parsed data easily with the following:
for item_name, item in itemList.items():
print(item.name)
print(item.weight)
This iterates through the itemList dictionary, getting the key, value pairs which in this case is itemName, item (or [item] in your original code. If you don't want to change that, replace item with item[0] in the code above). Then you can access attributes of your Class directly by calling their label.
You can get the box with most space remaining, using
sorted_box_list = (sorted(boxList.values(), key=operator.attrgetter('remainingWeight'), reverse=True))
What I have done is, rather than using a dictionary, I am using a list to pass the data on to a new function.
Text File --> List --> Dict --> List --> sortedList
Here is my new fileReader function:
def fileReader(filename):
    """
    Take the given txt file and format into proper list for boxes and items
    :param filename: The filename of the text file
    :return: dict with 'boxList' (list of BoxInventory) and 'itemList'
             (list of InventoryItem), both in file order

    The first non-blank line lists the box capacities; every following
    line is an item name followed by its weight.  Blank lines are ignored
    and an empty file yields two empty lists.
    """
    with open(filename, 'r') as myFile:  # Open the correct file
        myList = [line.split() for line in myFile if line.strip()]

    itemList = []
    boxList = []
    if myList:
        # Split on whitespace so that every capacity produces a box.  The
        # character-by-character scan of the original never emitted the
        # last one.
        for count, capacity in enumerate(myList[0]):
            boxList.append(BoxInventory("Box" + str(count), int(capacity), int(capacity)))
        # Take the last token as the weight so weights above 9 work.
        for *nameParts, weight in myList[1:]:
            itemList.append(InventoryItem(''.join(nameParts), int(weight)))

    # The original never returned the parsed lists; the return statements
    # had ended up inside roomiest() instead.
    myReturn = {}
    myReturn['boxList'] = boxList
    myReturn['itemList'] = itemList
    return myReturn
I then read and sort the data in each algorithm using this same method:
def roomiest(myReturnDict):
    """
    Prepare the data for the "roomiest" strategy: for each item, the box
    with the greatest remaining allowed weight that can support the item
    should receive it.  Only the sorting step is implemented so far.
    :param myReturnDict: mapping with the parsed data under the keys
                         "itemList" and "boxList"
    :return: dict with 'boxList' sorted by remainingWeight (largest first)
             and 'itemList' sorted by weight (heaviest first)
    """
    # Fall back to empty lists so a missing key does not end in list(None).
    itemData = list(myReturnDict.get("itemList") or [])
    boxData = list(myReturnDict.get("boxList") or [])
    sortedItemList = sorted(itemData, key=lambda x: x.weight, reverse=True)
    sortedBoxList = sorted(boxData, key=lambda x: x.remainingWeight, reverse=True)
    # The original returned ``boxList`` / ``itemList``, which are not
    # defined in this function; return the sorted data instead.
    myReturn = {}
    myReturn['boxList'] = sortedBoxList
    myReturn['itemList'] = sortedItemList
    return myReturn
My dataclasses look like the following:
from dataclasses import dataclass


@dataclass
class InventoryItem:
    """A named item together with its weight."""
    name: str
    weight: float
from dataclasses import dataclass, field


@dataclass
class BoxInventory:
    """A box: its name, weight limit, remaining capacity and packed contents."""
    name: str
    maxWeight: float
    remainingWeight: float
    # A per-instance empty dict replaces the "" default, which did not
    # match the dict annotation.
    contents: dict = field(default_factory=dict)
def itemWeight(item):
    """Print the name and weight of ``item`` and return the weight."""
    report = ("Weight of", item.name, "is: ", item.weight, "\n")
    print(*report)
    return item.weight
def remainWeight(box):
    """Print the name and remaining capacity of ``box`` and return that capacity."""
    report = ("Rem. weight in ", box.name, "is: ", box.remainingWeight, "\n")
    print(*report)
    return box.remainingWeight

openpyxl - overwrite datasheet and preserve pivot table

I've seen a few answers around to this question but none of them are working.
eg: How to write to an existing excel file without breaking formulas with openpyxl?
Docs give nothing away it seems:
http://openpyxl.readthedocs.io/en/latest/api/openpyxl.reader.excel.html
I tried replacing xls.load_workbook with xls.reader.excel.load_workbook but it doesn't change anything.
My current code overwrites the data in the data sheet, but kills the pivot table functionality in the other sheet (the sheet is still there but only with values). Any idea how to keep the pivot table?
import pandas as pd
import openpyxl as xls
from shutil import copyfile
# Work on a copy: everything below writes to output_file only.
template_file = 'openpy_test.xlsx'
output_file = 'openpy_output.xlsx'
copyfile(template_file, output_file)
# Load the copied workbook and hand it to pandas' openpyxl-backed writer.
book = xls.load_workbook(output_file,guess_types=False,data_only=False)
writer = pd.ExcelWriter(output_file,engine='openpyxl')
writer.book = book
# Register the existing sheets with the writer under their titles.
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
# NOTE(review): df is not defined anywhere in this snippet; it is
# presumably the DataFrame holding the new data. Confirm against the caller.
df.to_excel(writer,sheet_name='data',index=False,encoding='utf8')
writer.save()
I have also tried book.save('dummycopy.xlsx'), which also saves with a non-functioning pivot table. So I am sure the problem is related to the load_workbook function.
Package versions:
openpyxl 2.4.10 py36_0
pandas 0.20.3 py36hce827b7_2
i don't think openpyxl supports excel pivot tables currently. I had to switch to using win32com library.
here is a wrapper module i wrote to do specific stuff with pivot tables; it's basically VBA translated to python (record macros and read the VBA, it'll make sense). hope it helps. it's still a work in progress but should be enough for you to work with.
import os, datetime
import win32com.client as win32
win32c = win32.constants
import sys, datetime
letters = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ' #space to compensate for index. if letter is a if column is 1
def Pull_excel_workbook(path = '', filename = '', visible = False):
    '''function to run excel on the given filename

    path     -- directory of the workbook; defaults to the current directory
    filename -- workbook file name (required)
    visible  -- whether the Excel window is shown

    Returns (excel, wb, src) where src is an R1C1 reference covering the
    used range of the active sheet.

    Raises FileNotFoundError when no filename is given.  A failure to open
    the workbook is reported and then re-raised.
    '''
    if path == '':
        path = os.getcwd()
    if filename == '':
        raise FileNotFoundError('Please supply a file')
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    excel.Visible = visible
    try:
        # os.path.join supplies the separator that plain concatenation
        # omitted whenever path had no trailing slash (e.g. os.getcwd()).
        wb = excel.Workbooks.Open(os.path.join(path, filename))
    except Exception:
        print('Try again\n{}'.format(sys.exc_info()))
        # Re-raise: the original carried on and failed on the unbound ``wb``.
        raise
    ws = wb.ActiveSheet
    data = list(ws.UsedRange.Value) #2d list of rows and columns
    # Quote the sheet name so names containing spaces form a valid reference.
    src = "'{}'!R1C1:R{}C{}".format(ws.Name.replace("'", "''"), len(data), len(data[0]))
    return excel, wb, src
def Create_pivottable(wb, src, table_name = 'Pivot'):
    '''creates Pivot Table object in the wb in a new Pivot worksheet

    wb         -- workbook COM object
    src        -- R1C1 reference of the source data
    table_name -- used both as the new sheet's name and the pivot table's name

    Returns the created pivot table object.
    '''
    ws = wb.Sheets.Add() #should also change wb.ActiveSheet to the new one.
    ws.Name = table_name
    tname = ws.Name
    starting_point = (4,1) #row, column
    # The sheet name must be quoted: "RN Pivot!R4C1" is not a valid
    # reference, "'RN Pivot'!R4C1" is.
    destination = "'{}'!R{}C{}".format(tname.replace("'", "''"),
                                       starting_point[0], starting_point[1])
    pc = wb.PivotCaches().Add(SourceType = win32c.xlDatabase,
                              SourceData = src)
    try:
        pt = pc.CreatePivotTable(TableDestination = destination,
                                 TableName = table_name,
                                 DefaultVersion = win32c.xlPivotTableVersion10)
    except Exception:
        # Report what was attempted, then retry with the newer table version.
        print('{}:{}:{}:{}'.format(wb, src, table_name, destination))
        pt = pc.CreatePivotTable(TableDestination = destination,
                                 TableName = table_name,
                                 DefaultVersion = win32c.xlPivotTableVersion15)
    wb.Sheets(ws.Name).Select()
    wb.Sheets(ws.Name).Cells(3,1).Select()
    return pt
def Add_to_Filter(wb, tname, field_name):
    '''Set field_name of pivot table tname (on the active sheet) to the
    xlPageField orientation at position 1.'''
    page_field = wb.ActiveSheet.PivotTables(tname).PivotFields(field_name)
    for attribute, setting in (('Orientation', win32c.xlPageField), ('Position', 1)):
        setattr(page_field, attribute, setting)
def Add_to_Row(wb, tname, field_name, position = 1):
    '''Set field_name of pivot table tname (on the active sheet) to the
    xlRowField orientation at the given position.'''
    row_field = wb.ActiveSheet.PivotTables(tname).PivotFields(field_name)
    for attribute, setting in (('Orientation', win32c.xlRowField), ('Position', position)):
        setattr(row_field, attribute, setting)
def Add_to_Column(wb, tname, field_name, position = 1):
    '''Set field_name of pivot table tname (on the active sheet) to the
    xlColumnField orientation at the given position.

    The original ended with a placeholder branch for position > 1 that
    only assigned an unused string; it had no effect and is removed.
    '''
    field = wb.ActiveSheet.PivotTables(tname).PivotFields(field_name)
    field.Orientation = win32c.xlColumnField
    field.Position = position
def Add_to_Value(wb, tname, field_name, alias = '', calculation = 'xlSum'):
    '''Add field_name as a data field of pivot table tname on the active sheet.

    alias       -- caption passed to AddDataField
    calculation -- either a constant value or the name of a constant
                   (e.g. 'xlSum'), which is resolved through win32c
    '''
    if isinstance(calculation, str):
        # Resolve the name through the constants object itself instead of
        # reaching into its private dict list; unknown names pass through
        # unchanged, as before.
        calculation = getattr(win32c, calculation, calculation)
    pivot_table = wb.ActiveSheet.PivotTables(tname)
    datafield = pivot_table.PivotFields(field_name)
    pivot_table.AddDataField(datafield, alias, calculation)
def LtoC(letter):
    '''Convert a column label to its 1-based number: 'A' -> 1, 'Z' -> 26, 'AA' -> 27.

    Raises ValueError for an empty label or one containing non-letters.
    The original looked the label up as a substring of the alphabet, so
    'AB' came out as 1 and 'AA' raised.
    '''
    label = str(letter).upper()
    if not label or not all('A' <= ch <= 'Z' for ch in label):
        raise ValueError('not a column label: {!r}'.format(letter))
    col = 0
    for ch in label:
        col = col * 26 + (ord(ch) - ord('A') + 1)
    return col
def CtoL(col):
    '''Convert a 1-based column number to its label: 1 -> 'A', 26 -> 'Z', 27 -> 'AA'.

    Raises ValueError for numbers below 1.  The original indexed a
    27-character string, so it stopped at column 26 and returned ' ' for 0.
    '''
    if col < 1:
        raise ValueError('column numbers start at 1, got {!r}'.format(col))
    letter = ''
    while col > 0:
        # Column labels have no zero digit, hence the shift by one.
        col, rem = divmod(col - 1, 26)
        letter = chr(ord('A') + rem) + letter
    return letter
def Format_pretty(wb, tname, row_to_colapse):
    '''makes it look prettier

    Applies the 'PivotStyleMedium9' style to pivot table tname on the
    active sheet, collapses the given row field(s) with repeated labels,
    auto-fits columns A:Z and selects A1.

    row_to_colapse -- one field name, or an iterable of field names
    '''
    sheet = wb.ActiveSheet
    pivot_table = sheet.PivotTables(tname)
    pivot_table.TableStyle2 = 'PivotStyleMedium9'
    # Normalise to a list so one loop serves both call shapes.
    if isinstance(row_to_colapse, str):
        field_names = [row_to_colapse]
    else:
        field_names = row_to_colapse
    for field_name in field_names:
        field = pivot_table.PivotFields(field_name)
        field.ShowDetail = False #collapses
        field.RepeatLabels = True #repeats labels
    sheet.Columns('A:Z').EntireColumn.AutoFit()
    sheet.Range('A1').Select()
def Add_calcd_col(ws, col, row_start, row_end, formula, style = '', col_title = 'default'):
    '''Fill one worksheet column with a title and a per-row formula.

    col and rows should be int
    formula   -- format string; ``formula.format(row)`` is written to each
                 row from row_start + 1 through row_end
    style     -- name of a cell style for the whole column range; the
                 empty default leaves the existing style alone
    col_title -- written to the cell at row_start
    '''
    letter = CtoL(col)
    column_range = ws.Range('{0}{1}:{0}{2}'.format(letter, row_start, row_end))
    column_range.Select()
    ws.Cells(row_start, col).Value = col_title
    for row in range(row_start + 1, row_end + 1):
        ws.Cells(row, col).Value = formula.format(row)
    if style:
        # Only assign a style when one was requested; the original always
        # assigned, including the empty default.
        column_range.Style = style
def Values_to_columns(wb,tname, position = 2):
    '''Set the DataPivotField of pivot table tname (on the active sheet) to
    the xlColumnField orientation at the given position.'''
    settings = (('Orientation', win32c.xlColumnField), ('Position', position))
    for attribute, setting in settings:
        # Look the field up afresh for each assignment, as the original did.
        setattr(wb.ActiveSheet.PivotTables(tname).DataPivotField, attribute, setting)
def WB_save(wb, path, tname, filename):
    '''clean save of the new file

    Saves wb into directory path as "<stem>-<mm.dd.yy>.xlsx", where stem is
    filename without its extension.  tname is accepted for call
    compatibility and is not used.
    '''
    #Format_pretty(wb, tname, 'Division') #that needs to be fixed....
    # splitext copes with any extension length; the original sliced off a
    # fixed five characters, which mangled names such as "report.xls".
    stem = os.path.splitext(filename)[0]
    new_filename = stem + '-{}.xlsx'.format(datetime.date.today().strftime('%m.%d.%y'))
    wb.SaveAs(os.path.join(path, new_filename))
def Pivot_refresh(path, filename, pivot_sheet_name, pivot_table_name = 'Pivot'):
    '''function to refresh the pivot table
    tested and functional with recruiting prod report

    Opens the workbook, selects the pivot sheet, refreshes the pivot table
    found at cell A6 of that sheet and saves a dated copy.
    '''
    excel, wb, _src = Pull_excel_workbook(path = path, filename = filename)
    wb.Sheets(pivot_sheet_name).Select()
    anchor_cell = 'A6' #need a better way for this
    pivot_sheet = excel.Worksheets(pivot_sheet_name)
    pivot_sheet.Range(anchor_cell).PivotTable.RefreshTable()
    WB_save(wb, path, pivot_table_name, filename)
#pivot refresh
#new = filename[:-5] + '-{}.xlsx'.format(2)
#Pivot_refresh(path = path, filename = new, pivot_sheet_name = 'Pivot')
def Hide_columns(wb, tname, start, end):
    '''Hides columns

    start and end may each be a column label or a column number; numbers
    are converted to labels with CtoL.  tname is not used.
    '''
    first, last = (edge if type(edge) is str else CtoL(edge)
                   for edge in (start, end))
    column_span = '{}:{}'.format(first, last)
    wb.ActiveSheet.Columns(column_span).EntireColumn.Hidden = True

Openpyxl - find hidden rows in ReadOnlyWorksheet

I have a problem with detecting which rows are hidden when I open workbook in read-only mode.
It works flawlessly when I set read_only parameter to False while loading workbook, because then I can iterate over row_dimensions to check which rows are hidden - but opening workbook in read-write mode takes much longer (~2 mins vs ~20 secs in read-only mode) and consumes over 1GB of RAM.
Unfortunately read-only worksheets don't have row_dimensions attribute.
Any help is welcome.
The underlying issue is that the parser is used once and discarded after iterating over all the rows. This is how read_only mode can optimize memory allocation and generate rows upon request. Interestingly enough, the parser itself is still creating the row_dimensions with the row attributes in it!
There are a couple of work arounds you could attempt. In lieu of forking and creating an official fix that exposes the ReadOnlyWorksheet parser, I went with monkey patching:
from openpyxl.worksheet._read_only import ReadOnlyWorksheet, WorkSheetParser, EMPTY_CELL
# The override:
class MyReadOnlyWorksheet(ReadOnlyWorksheet):
    """Read-only worksheet that keeps its row parser around.

    The parser used while iterating rows is cached on ``self.parser`` so
    that ``row_is_hidden`` can consult the ``row_dimensions`` it collected.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set by _cells_by_row once row iteration has started.
        self.parser = None

    def row_is_hidden(self, row_index):
        """Return True if row ``row_index`` carries hidden == '1'.

        Raises RuntimeError when iteration has not started yet, or has not
        reached ``row_index`` (compared with the parser's ``row_counter``).
        """
        str_row_index = str(row_index)
        if self.parser and str_row_index in self.parser.row_dimensions:
            return self.parser.row_dimensions[str_row_index].get('hidden') == '1'

        if self.parser is None or row_index > self.parser.row_counter:
            raise RuntimeError('Must generate the row before calling')

        return False

    def _cells_by_row(self, min_col, min_row, max_col, max_row, values_only=False):
        """
        The source worksheet file may have columns or rows missing.
        Missing cells will be created.

        Logically the same as the parent implementation, but saves the
        parser to "self" during row iteration.
        """
        filler = None if values_only else EMPTY_CELL

        max_col = max_col or self.max_column
        max_row = max_row or self.max_row
        empty_row = []
        if max_col is not None:
            empty_row = (filler,) * (max_col + 1 - min_col)

        counter = min_row
        idx = 1
        src = self._get_source()
        try:
            parser = WorkSheetParser(src, self._shared_strings,
                                     data_only=self.parent.data_only,
                                     epoch=self.parent.epoch,
                                     date_formats=self.parent._date_formats)
            ### Cache parser in order to check generated row attrs ###
            self.parser = parser

            for idx, row in parser.parse():
                if max_row is not None and idx > max_row:
                    break

                # some rows are missing
                for _ in range(counter, idx):
                    counter += 1
                    yield empty_row

                # return cells from a row
                if counter <= idx:
                    row = self._get_row(row, min_col, max_col, values_only)
                    counter += 1
                    yield row

            if max_row is not None and max_row < idx:
                for _ in range(counter, max_row+1):
                    yield empty_row
        finally:
            # Close the source even when the consumer stops iterating early
            # or parsing raises; the original only closed it after a
            # complete pass.
            src.close()
# the monkey patch: make the workbook reader build MyReadOnlyWorksheet objects
import openpyxl.reader.excel
openpyxl.reader.excel.ReadOnlyWorksheet = MyReadOnlyWorksheet

# the test drive:
from openpyxl import load_workbook
file_location = '' # load your file
workbook = load_workbook(file_location, data_only=True, keep_links=False, read_only=True)
for worksheet in workbook.worksheets:
    # Rows are numbered from 1; a row can only be queried once generated.
    for i, row in enumerate(worksheet.rows, start=1):
        if worksheet.row_is_hidden(i):
            continue # do not process hidden rows.
This does what you need, but beware! I would add sufficient test coverage before using in production (think things like future version re-keying row_dimension dict, removing row_dimensions from read_only parsing, etc). You can similarly add your own accessors to the worksheet that exposes other row attrs (or return the entire dict).
Happy coding!

Resources