Applying function to a list of file-paths and writing csv output to the respective paths - python-3.x

How do I apply a function to a list of file paths I have built, and write an output csv in the same path?
read file in a subfolder -> perform a function -> write file in the
subfolder -> go to next subfolder
#opened xml by filename
with open(r'XML_opsReport 100001.xml', encoding = "utf8") as fd:
Odict_parsedFromFilePath = xmltodict.parse(fd.read())
#func called in func below
def activity_to_df_one_day (list_activity_this_day):
ib_list = [pd.DataFrame(list_activity_this_day[i], columns=list_activity_this_day[i].keys()).drop("#uom") for i in range(len(list_activity_this_day))]
return pd.concat(ib_list)
#Processes parsed xml and writes csv
def activity_to_df_all_days (Odict_parsedFromFilePath, subdir): #writes csv from parsed xml after some processing
nodes_reports = Odict_parsedFromFilePath['opsReports']['opsReport']
list_activity = []
for i in range(len(nodes_reports)):
try:
df = activity_to_df_one_day(nodes_reports[i]['activity'])
list_activity.append(df)
except KeyError:
continue
opsReport = pd.concat(list_activity)
opsReport['dTimStart'] = pd.to_datetime(opsReport['dTimStart'], infer_datetime_format =True)
opsReport.sort_values('dTimStart', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
opsReport.to_csv("subdir\opsReport.csv") #write to the subdir
def scanfolder(): #fetches list of file-paths with desired starting name.
list_files = []
for path, dirs, files in os.walk(r'C:\..\xml_objects'): #directory containing several subfolders
for f in files:
if f.startswith('XML_opsReport'):
list_files.append(os.path.join(path, f))
return list_files
filepaths = scanfolder() #list of file-paths
Every function works well, the xml processing is good, so I am not sharing the xml structure. There are 100+ paths in filepaths , each a different subdirectory. I want to be able to apply above flow in future as well, where I can get filepaths and perform desired actions. It's important to write the csv file to it's sub directory.

To get the directory that a file is in, you can use:
import os
for root, dirs, files, in os.walk(some_dir):
for f in files:
print(root)
output_file = os.path.join(root, "output_file.csv")
print(output_file)
Is that what you're looking for?
Output:
somedir
somedir\output_file.csv
See also Python 3 - travel directory tree with limited recursion depth and Find current directory and file's directory.

Was able to solve with os.path.join.
exceptions_path_list =[]
for i in filepaths:
try:
with open(i, encoding = "utf8") as fd:
doc = xmltodict.parse(fd.read())
activity_to_df_all_days (doc, i)
except ValueError:
exceptions_path_list.append(os.path.dirname(i))
continue
def activity_to_df_all_days (Odict_parsedFromFilePath, filepath):
...
...
...
opsReport.to_csv(os.path.join(os.path.dirname(filepath), "opsReport.csv"))

Related

How to copy merge files of two different directories with different extensions into one directory and remove the duplicated ones

I would need a Python function which performs below action:
I have two directories which in one of them I have files with .xml format and in the other one I have files with .pdf format. To simplify things consider this example:
Directory 1: a.xml, b.xml, c.xml
Directory 2: a.pdf, c.pdf, d.pdf
Output:
Directory 3: a.xml, b.xml, c.xml, d.pdf
As you can see the priority is with the xml files in the case that both extensions have similar names.
I would be thankful for your help.
You need to use the shutil module and the os module to achieve this. This function will work on the following assumption:
A given directory has all files with the same extension
The priority_directory will be the directory with file extensions to be prioritized
The secondary_directory will be the directory with file extensions to be dropped in case of a name collision
Try:
import os,shutil
def copy_files(priority_directory,secondary_directory,destination = "new_directory"):
file_names = [os.path.splitext(filename)[0] for filename in os.listdir(priority_directory)] # get the file names to check for collisions
os.mkdir(destination) # make a new directory
for file in os.listdir(priority_directory): # this loop copies the first direcotory as it is
file_path = os.path.join(priority_directory,file)
dst_path = os.path.join(destination,file)
shutil.copy(file_path,dst_path)
for file in os.listdir(secondary_directory): # this loop checks for collisions and drops files whose name collide
if(os.path.splitext(file)[0] not in file_names):
file_path = os.path.join(secondary_directory,file)
dst_path = os.path.join(destination,file)
shutil.copy(file_path,dst_path)
print(os.listdir(destination))
Let's run it with your direcotry names as arguments:
copy_files('directory_1','directory_2','directory_3')
You can now check a new directory with the name directory_3 will be created with the desired files in it.
This will work for all such similar cases no matter what the extension is.
Note: There should not be a need to do this i guess cause a directory can have two files with the same name as long as the extensions differ.
Rough working solution:
import os
from shutil import copy2
d1 = './d1/'
d2 = './d2/'
d3 = './d3/'
ext_1 = '.xml'
ext_2 = '.pdf'
def get_files(d: str, files: list):
directory = os.fsencode(d)
for file in os.listdir(d):
dup = False
filename = os.fsdecode(file)
if filename[-4:] == ext_2:
for (x, y) in files:
if y == filename[:-4] + ext_1:
dup = True
break
if dup:
continue
files.append((d, filename))
files = []
get_files(d1, files)
get_files(d2, files)
for d, file in files:
copy2(d+file, d3)
I'll see if I can get it to look/perform better.

read_csv won't read from a file list and specified directory

I have an issue with using Python read_csv in a function. If I use the following code, there is no issue. It will read the two files which I can then append to a single dataframe output:
directory = r"\\*my directory*"
files1 = ['001 Data.txt', '002 Data.txt']
def read_data(directory, files):
list_ = []
for file in files:
df = pd.read_csv(directory + '\\' + file, sep='\t', header=0)
...do stuff...
list_.append()
return list_
df_mct20 = read_data(directory, files1) # This will generate my list which I can then concatenate
df_final = pd.concat(df_mct20)
The above code works fine. However, if I call this exact "read_data()" function in a for loop, I get the "No such file or directory" error:
files2 = ['003 Data.txt', '004 Data.txt']
for file in files2:
df2 = read_data(directory, file) # The error shows up here "No such file or directory"
...want to do stuff...
I've tried a number of things and can't seem to get it to work. Any help will be greatly appreciated~
The second input of your function has to be a list of filenames. So you cannot loop in a series of strings into the function itself.
You either need to re-write your function to just take in one filename at a time, or load in your files2 as a list, and not loop the filenames in files2 in.

Python3: Index out of range for script that worked before

the attached script returns:
IndexError: list index out of range
for the line starting with values = {line.split (...)
values=dict()
with open(csv) as f:
lines =f.readlines()
values = {line.split(',')[0].strip():line.split(',')[1].strip() for line in lines}
However, I could use it yesterday for doing exactly the same:
replacing certain text in a dir of xml-files with different texts
import os
from distutils.dir_util import copy_tree
drc = 'D:/Spielwiese/00100_Arbeitsverzeichnis'
backup = 'D:/Spielwiese/Backup/'
csv = 'D:/persons1.csv'
copy_tree(drc, backup)
values=dict()
with open(csv) as f:
lines =f.readlines()
values = {line.split(',')[0].strip():line.split(',')[1].strip() for line in lines}
#Getting a list of the full paths of files
for dirpath, dirname, filename in os.walk(drc):
for fname in filename:
#Joining dirpath and filenames
path = os.path.join(dirpath, fname)
#Opening the files for reading only
filedata = open(path,encoding="Latin-1").read()
for k,v in values.items():
filedata=filedata.replace(k,v)
f = open(path, 'w',encoding="Latin-1")
# We are writing the the changes to the files
f.write(filedata)
f.close() #Closing the files
print("In case something went wrong, you can find a backup in " + backup)
I don't see anything weird and I could, as mentioned before use it before ... :-o
Any ideas on how to fix it?
best Wishes,
K

Having trouble using zipfile.ZipFile.extractall (Already read the docs)

I have a folder with many zipfiles, most of these zipfiles contain shapefiles and some of them have subfolders which contain zipfiles that contain shapefiles. I am trying to extract everything into one main folder wihtout keeping any folder structure. This is where I am now;
import os, zipfile
def getListOfFiles(dirName):
# create a list of file and sub directories
# names in the given directory
listOfFile = os.listdir(dirName)
allFiles = list()
# Iterate over all the entries
for entry in listOfFile:
# Create full path
fullPath = os.path.join(dirName, entry)
# If entry is a directory then get the list of files in this directory
if os.path.isdir(fullPath):
allFiles = allFiles + getListOfFiles(fullPath)
else:
allFiles.append(fullPath)
return allFiles
def main():
dirName = r'C:\Users\myusername\My_Dataset'
# Get the list of all files in directory tree at given path
listOfFiles = getListOfFiles(dirName)
# Print the files
for elem in listOfFiles:
print(elem)
zipfile.ZipFile.extractall(elem)
print("****************")
if __name__ == '__main__':
main()
This script prints all the shapefiles (including the ones under subfolders). Now I need to extract all these listed shapefiles into one main folder. I try zipfile.ZipFile.extractall(elem) but it doesn't work.
line 1611, in extractall
members = self.namelist()
AttributeError: 'str' object has no attribute 'namelist'
Is the error I'm getting. zipfile.ZipFile.extractall(elem) is the line that doesn't work. I imagine it expects one zipfile but I'm trying to feed it a folder (or a list in this case?)
How would I change this script so that it extracts my listed shapefiles into a folder (preferably a new folder)
You need to make an instance of ZipFile first and use extractall on this instance:
for elem in listOfFiles:
my_zipfile = zipfile.ZipFile(elem)
my_zipfile.extractall()
I have added this code block to my script and it works now.
def getfiles(path):
if os.path.isdir(path):
for root, dirs, files in os.walk(path):
for name in files:
yield os.path.join(root, name)
else:
yield path
fromdir = r"C:\Users\username\My_Dataset\new"
for f in getfiles(fromdir):
filename = str.split(f, '/')[-1]
if os.path.isfile(destination + filename):
filename = f.replace(fromdir, "", 1).replace("/", "_")
# os.rename(f, destination+filename)
shutil.copy2(f, r"C:\Users\username\Documents\flatten")

creating corresponding subfolders and writing a portion of the file in new files inside those subfolders using python

I have a folder named "data". It contains subfolders "data_1", "data_2", and "data_3". These subfolders contain some text files. I want to parse through all these subfolders and generate corresponding subfolders with the same name, inside another folder named "processed_data". I want to also generate corresponding files with "processed" as a prefix in the name and want to write all those lines from the original file where "1293" is there in the original files.
I am using the below code but not able to get the required result. Neither the subfolders "data_1", "data_2", and "data_3" nor the files are getting created
import os
folder_name=""
def pre_processor():
data_location="D:\data" # folder containing all the data
for root, dirs, files in os.walk(data_location):
for dir in dirs:
#folder_name=""
folder_name=dir
for filename in files:
with open(os.path.join(root, filename),encoding="utf8",mode="r") as f:
processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
processed_file = open(processed_file_name,"w", encoding="utf8")
for line_number, line in enumerate(f, 1):
if "1293" in line:
processed_file.write(str(line))
processed_file.close()
pre_processor()
You might need to elaborate on the issue you are having; e.g., are the files being created, but empty?
A few things I notice:
1) Your indentation is off (not sure if this is just a copy-paste issue though): the pre_processor function is empty, i.e. you are defining the function at the same level as the declaration, not inside of it.
try this:
import os
folder_name=""
def pre_processor():
data_location="D:\data" # folder containing all the data
for root, dirs, files in os.walk(data_location):
for dir in dirs:
#folder_name=""
folder_name=dir
for filename in files:
with open(os.path.join(root, filename), encoding="utf8",mode="r") as f:
processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
processed_file = open(processed_file_name,"w", encoding="utf8")
for line_number, line in enumerate(f, 1):
if "1293" in line:
processed_file.write(str(line))
processed_file.close()
pre_processor()
2) Check if the processed_data and sub_folders exist; if not, create them first as this will not do so.
Instead of creating the path to the new Folder by hand you could just replace the name of the folder.
Furthermore, you are not creating the subfolders.
This code should work but replace the Linux folder slashes:
import os
folder_name=""
def pre_processor():
data_location="data" # folder containing all the data
for root, dirs, files in os.walk(data_location):
for dir in dirs:
# folder_name=""
folder_name = dir
for filename in files:
joined_path = os.path.join(root, filename)
with open(joined_path, encoding="utf8", mode="r") as f:
processed_folder_name = root.replace("data/", 'processed_data/')
processed_file_name = processed_folder_name+'/processed'+filename
if not os.path.exists(processed_folder_name):
os.makedirs(processed_folder_name)
processed_file = open(processed_file_name, "w", encoding="utf8")
for line in f:
if "1293" in line:
processed_file.write(str(line))
processed_file.close()
pre_processor()

Resources