How can I loop through all the XLS files in a folder to find the sheet names, and then replace them?

I am trying to loop through all the XLS files in a folder and replace each worksheet's name with another string. This has to be done for every file in the folder.
I am relatively new to programming; here is my Python code. It runs okay when I do it for one file at a time, but I am unable to get it to work for all the files in the folder.
from xlutils.copy import copy
from xlrd import open_workbook
# open the file
direc = input('Enter file name: ')
rb = open_workbook(direc)
wb = copy(rb)
#index of a sheet
pointSheet = rb.sheet_names()
print(pointSheet)
idx = pointSheet.index(pointSheet)
wb.get_sheet(idx).name = u'RenamedSheet1'
wb.save(direc)
Error message:
Traceback (most recent call last):
File "./Rename.py", line 13, in <module>
idx = pointSheet.index(pointSheet)
ValueError: ['x xxx xxxx xxxxxx'] is not in list
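The ValueError happens because rb.sheet_names() returns a list, and the code passes that whole list to its own .index method, which searches for the list as an element. A tiny illustration with a made-up sheet name:
names = ['Sheet1']
names.index(names)     # ValueError: ['Sheet1'] is not in list
names.index('Sheet1')  # 0 -- .index expects a single element, not the list itself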
My bad! The above code is for testing with a single file. Here is the loop:
import os
from pprint import pprint

files = []
for dirname, dirnames, filenames in os.walk(r'D:\Temp\Final'):
    # collect the path to every subdirectory first.
    for subdirname in dirnames:
        files.append(os.path.join(dirname, subdirname))
    # then collect the path to every file.
    for filename in filenames:
        files.append(os.path.join(dirname, filename))
pprint(files)

for i in range(0, len(files)):
    rb = open_workbook(files[i])
    wb = copy(rb)
    idx = rb.sheet_names().index('5 new bulk rename')
    wb.get_sheet(idx).name = u'RenamedSheet1'
    wb.save(files[i])
print('Operation succeeded!')

Try something like this (untested) for a single file:
from xlutils.copy import copy
from xlrd import open_workbook

# open the file
direc = input('Enter file name: ')
rb = open_workbook(direc)
wb = copy(rb)
# enumerate gives both the index and the name, so every sheet is renamed
for idx, pointSheet in enumerate(rb.sheet_names()):
    print(pointSheet)
    wb.get_sheet(idx).name = u'RenamedSheet1'
wb.save(direc)
And wrap that in another loop using listdir (taken from here):
import os

for file in os.listdir("/mydir"):
    if file.endswith(".xls"):
        # <do what you did for a single file>
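Putting the two together, a minimal sketch (untested; it assumes every .xls sits directly in the folder, and the numeric suffix on the new sheet names is my addition to keep them unique when a workbook has several sheets):
import os
from xlutils.copy import copy
from xlrd import open_workbook

folder = "/mydir"  # hypothetical folder path
for file in os.listdir(folder):
    if file.endswith(".xls"):
        path = os.path.join(folder, file)  # listdir returns bare names, so rebuild the full path
        rb = open_workbook(path)
        wb = copy(rb)
        for idx in range(len(rb.sheet_names())):
            wb.get_sheet(idx).name = u'RenamedSheet{}'.format(idx + 1)
        wb.save(path)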

Related

Attempt to populate list with multiple entries in a for loop only returns a single entry

I have a csv file with URLs I'd like to extract data from, but my script currently only manages to append the last entry. This is the script:
import os
import glob
import time
from urllib.request import urlopen
import pandas as pd
import xml.etree.ElementTree as ET

count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        for row in csvfile['URL']:
            print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
            namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                "zs": "http://www.loc.gov/zing/srw/",
                "": "http://www.loc.gov/MARC21/slim",
            }
            datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
            datafield_attribute_filters = [  # which fields to extract
                {
                    "tag": "100",  # author
                    "ind1": "1",
                    "ind2": " ",
                }]
            # datafield_attribute_filters = []  # Uncomment this line to clear filters (and process each datafield node)
            aut = []
            for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                if datafield_attribute_filters:
                    skip_node = True
                    for attr_dict in datafield_attribute_filters:
                        for k, v in attr_dict.items():
                            if datafield_node.get(k) != v:
                                break
                        else:
                            skip_node = False
                            break
                    if skip_node:
                        continue
                for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                    aut.append(subfield_node.text)  # this gets the author name and title
        print(aut)
    count += 1
and this is the csv file:
URL
0 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783960382850&recordSchema=MARC21-xml
1 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963622106&recordSchema=MARC21-xml
2 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D-&recordSchema=MARC21-xml
3 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783806241280&recordSchema=MARC21-xml
4 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783890296005&recordSchema=MARC21-xml
5 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699111&recordSchema=MARC21-xml
6 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110698930&recordSchema=MARC21-xml
7 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699104&recordSchema=MARC21-xml
8 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963621093&recordSchema=MARC21-xml
9 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783451716034&recordSchema=MARC21-xml
10 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9788791953514&recordSchema=MARC21-xml
When I execute the script, the output is:
['Schmidt, Horst']
but I need the other results as well. How can I do that?
Any help is appreciated.
EDIT: link to the full csv file on Pastebin; the filename is: Reihe-21A51.csv_extract.csv_isbn-dnb2.csv
As @Tranbi pointed out, I had to move the aut = [] outside of the loop.
It is now:
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        aut = []
instead of
            aut = []
            for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
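The underlying pattern: an accumulator list has to be created once, before the loop that appends to it; recreating it inside the loop throws away everything collected so far. A minimal illustration with made-up names:
results = []              # created once, before the loop
for item in ['a', 'b', 'c']:
    # results = []        # creating it here instead would keep only the last item
    results.append(item)
print(results)            # ['a', 'b', 'c']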

Python 3.x | FileNotFoundError: [Errno 2] No such file or directory | writing .csv from .xlsx

I was working on a file-converting function for the xlsx -> csv format. I was able to make the function work when I specified the exact file, but I'm running into issues when I try to iterate the process over a directory. Below is the code:
import csv
import os
import xlrd

def ExceltoCSV(excel_file, csv_file_base_path):
    workbook = xlrd.open_workbook(excel_file)
    ## get the worksheet names
    for sheet_name in workbook.sheet_names():
        print('processing - ' + sheet_name)
        ## extract the data from each worksheet
        worksheet = workbook.sheet_by_name(sheet_name)
        ## create a new csv file, with the name being the original Excel worksheet name; tidied up a bit replacing spaces and dashes
        csv_file_full_path = csv_file_base_path + sheet_name.lower().replace(" - ", "_").replace(" ", "_") + '.csv'
        csvfile = open(csv_file_full_path, 'w')
        ## write into the new csv file, one row at a time
        writetocsv = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        for rownum in range(worksheet.nrows):
            writetocsv.writerow(
                list(x.encode('utf-8') if type(x) == type(u'') else x for x in worksheet.row_values(rownum))
            )
        csvfile.close()
        print(sheet_name + ' has been saved at - ' + csv_file_full_path)
## Paths as strings
p = r'//Network/TestingFolder/'
nf_p = r'//Network/TestingFolder/CSV_Only/'

## directory reference
directory = r'//Network/TestingFolder/'  # for os.listdir() function below
file_list = []

## for iterating over the directory and spitting out the path for each file
## (to be used in conjunction with ExceltoCSV())
for filename in os.listdir(directory):
    if filename.endswith(".xlsx"):  # or filename.endswith(".csv")
        file_path = os.path.join(directory, filename)
        file_list.append(file_path)
    else:
        continue

for paths in file_list:
    print(paths)
    ExceltoCSV(paths, nf_p)
The error occurs at the line csvfile = open(csv_file_full_path, 'w').
The error is: FileNotFoundError: [Errno 2] No such file or directory
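A likely cause, though the post alone doesn't confirm it: open(path, 'w') raises FileNotFoundError when the parent directory of path does not exist, which fits if the CSV_Only/ folder hasn't been created, or if a sheet name injects characters that break the path. A hedged sketch of a defensive open (safe_open_for_write is a hypothetical helper, not from the original code):
import os

def safe_open_for_write(path):
    # create the parent directory first so open() cannot fail on a missing folder
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return open(path, 'w', newline='')  # newline='' is what the csv module expects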

Change order in filenames in a folder

I need to rename a bunch of files in a specific folder. They all end with a date and time, for example "hello 2019-05-22 1310.txt", and I want the date and time of each file to come first so I can sort them. With my code I get an error, and it won't find the directory where all the files are located. What is wrong with the code?
import os
import re
import shutil

dir_path = r'C:\Users\Admin\Desktop\Testfiles'
comp = re.compile(r'\d{4}-\d{2}-\d{2}')
for file in os.listdir(dir_path):
    if '.' in file:
        index = [i for i, v in enumerate(file, 0) if v == '.'][-1]
        name = file[:index]
        ext = file[index + 1:]
    else:
        ext = ''
        name = file
    data = comp.findall(name)
    if len(data) != 0:
        date = comp.findall(name)[0]
        rest_name = ' '.join(comp.split(name)).strip()
        new_name = '{} {}{}'.format(date, rest_name, '.' + ext)
        print('changing {} to {}'.format(name, new_name))
        shutil.move(os.path.join(dir_path, name), os.path.join(dir_path, new_name))
    else:
        print('file {} is not changed'.format(name))
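One thing that stands out (an observation, not a confirmed diagnosis): shutil.move is handed name, which has the extension stripped, so the source path never exists on disk. A sketch under that assumption, also widening the pattern to capture the trailing time digits:
import os
import re
import shutil

dir_path = r'C:\Users\Admin\Desktop\Testfiles'  # path from the question
stamp = re.compile(r'\d{4}-\d{2}-\d{2} \d{4}')  # date plus the 4-digit time

for file in os.listdir(dir_path):
    name, ext = os.path.splitext(file)  # splitext keeps the real filename intact
    found = stamp.search(name)
    if not found:
        print('file {} is not changed'.format(file))
        continue
    rest = (name[:found.start()] + name[found.end():]).strip()
    new_name = '{} {}{}'.format(found.group(), rest, ext)
    print('changing {} to {}'.format(file, new_name))
    # move the original file (with its extension), not the stripped name
    shutil.move(os.path.join(dir_path, file), os.path.join(dir_path, new_name))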

How to open and append nested zip archives into dataframe without extracting?

I am trying to open a large number of csv files found inside several layers of zip files. Given the nature of this project, I am trying to open each one, read_csv it into a dataframe, append that data to an aggregate dataframe, and then continue through the loop.
Example: Folder Directory/First Zip/Second Zip/Third Zip/csv file.csv
My existing code can loop through the contents of the second and third zip files and get the name of each csv file. I am aware that this code could probably be simplified by using glob, but I'm unfamiliar with it.
import os
import pandas as pd
import zipfile, re, io

directory = 'C:/Test/'
os.chdir(directory)
fname = "test" + ".zip"
with zipfile.ZipFile(fname, 'r') as zfile:
    # second level of zip files
    for zipname in zfile.namelist():
        if re.search(r'\.zip$', zipname) != None:
            zfiledata = io.BytesIO(zfile.read(zipname))
            # third level of zip files
            with zipfile.ZipFile(zfiledata) as zfile2:
                for zipname2 in zfile2.namelist():
                    # this zipfile contains xml and csv contents. This filters out the xmls
                    if zipname2.find("csv") > 0:
                        zfiledata2 = io.BytesIO(zfile2.read(zipname2))
                        with zipfile.ZipFile(zfiledata2) as zfile3:
                            fullpath = directory + fname + "/" + zipname + "/" + zipname2 + "/"
                            # csv file names are always the same as their zips. this cleans the string.
                            csvf = zipname2.replace('_csv.zip', ".csv")
                            filehandle = open(fullpath, 'rb')
                            # the above statement is erroring: FileNotFoundError: [Errno 2] No such file or directory
                            zfilehandle = zipfile.ZipFile(filehandle)
                            data = []
                            csvdata = StringIO.StringIO(zfilehandle.read(csvf))
                            df = pd.read_csv(csvdata)
                            data.append(df)
                            print(data.head())
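The open(fullpath, 'rb') call fails because fullpath is a display string of nested archive names, not a real filesystem path; the inner zips exist only in memory. A sketch (untested, following the same three-level layout and the original's assumption that each csv member is named after its zip) that stays in memory the whole way, handing the csv member from zfile3.open straight to pandas:
import io
import zipfile
import pandas as pd

frames = []  # collect one DataFrame per csv

with zipfile.ZipFile("test.zip") as zfile:
    for zipname in zfile.namelist():
        if not zipname.endswith(".zip"):
            continue
        with zipfile.ZipFile(io.BytesIO(zfile.read(zipname))) as zfile2:
            for zipname2 in zfile2.namelist():
                if "csv" not in zipname2:
                    continue
                with zipfile.ZipFile(io.BytesIO(zfile2.read(zipname2))) as zfile3:
                    csvf = zipname2.replace('_csv.zip', '.csv')
                    # read the csv member straight from the in-memory zip
                    with zfile3.open(csvf) as member:
                        frames.append(pd.read_csv(member))

combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(combined.head())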

Re-loop until all matches are found, logic?

I cannot figure out the logic for this. I am attempting to compare a list of matches from 'matches' to the files in a folder. If a file in 'folders' equals a name in 'matches', then do something; but obviously it doesn't try each match against each file. I'm thinking I need to use a while loop, but I don't know how to apply it.
import os
import glob
import os.path

folders = glob.glob('C:\\Corrections\\*.*')
matches = open('filename.txt', 'r')
for each in folders:
    splitname_one = each.split('\\', 3)  # Separate the filename from the path
    filename = splitname_one[3]  # Get filename only
    basefile = filename.split('.', 1)  # Separate filename and file extension
    compare0 = basefile[0]  # assign base file name to compare0
    #print(basefile[0])
    for line in matches:
        match = line.split('.', 1)  # Separate base filename from file extension
        #print(match[1])
        compare1 = match[0]  # assign base file name to compare1
        if compare1 == compare0:
            #os.rename(filename, 'C:\\holder\\' + filename)
            print('We have a match!')
        else:
            print('no match :(')
Note that iterating over the open file object matches exhausts it after the first pass through folders, which is why only the first file ever gets compared; reading everything into sets avoids that. FWIW here's how I might end up doing something like this:
import glob
from os.path import basename, splitext

def file_base(filename):
    return splitext(basename(filename))[0]

folders = set(file_base(f) for f in glob.glob('C:\\Corrections\\*.*'))
with open('filename.txt') as fobj:
    matches = set(file_base(f) for f in fobj.readlines())

print(folders.intersection(matches))
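If the goal is to act on each match (the commented-out os.rename hints at moving matched files to C:\holder), a sketch building on the same set intersection, keeping the full path alongside each base name (variable names here are my own):
import glob
import shutil
from os.path import basename, splitext, join

def file_base(filename):
    return splitext(basename(filename))[0]

paths_by_base = {file_base(f): f for f in glob.glob('C:\\Corrections\\*.*')}
with open('filename.txt') as fobj:
    matches = set(file_base(line.strip()) for line in fobj if line.strip())

for base in paths_by_base.keys() & matches:
    # move each matched file into the holder folder
    src = paths_by_base[base]
    shutil.move(src, join('C:\\holder', basename(src)))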
