creating corresponding subfolders and writing a portion of the file in new files inside those subfolders using python - python-3.x

I have a folder named "data". It contains subfolders "data_1", "data_2", and "data_3". These subfolders contain some text files. I want to parse through all these subfolders and generate corresponding subfolders with the same name, inside another folder named "processed_data". I want to also generate corresponding files with "processed" as a prefix in the name and want to write all those lines from the original file where "1293" is there in the original files.
I am using the below code but not able to get the required result. Neither the subfolders "data_1", "data_2", and "data_3" nor the files are getting created
# Asker's original attempt, reproduced as posted; the indentation was lost when the page was extracted.
import os
folder_name=""
def pre_processor():
data_location="D:\data" # folder containing all the data
for root, dirs, files in os.walk(data_location):
for dir in dirs:
#folder_name=""
# NOTE(review): folder_name is reassigned for every entry of dirs, so only the last entry survives this loop.
folder_name=dir
for filename in files:
with open(os.path.join(root, filename),encoding="utf8",mode="r") as f:
# NOTE(review): nothing in this snippet creates the target folder before the file below is opened for writing.
processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
processed_file = open(processed_file_name,"w", encoding="utf8")
for line_number, line in enumerate(f, 1):
# Only lines containing the literal text "1293" are copied to the output file.
if "1293" in line:
processed_file.write(str(line))
processed_file.close()
pre_processor()

You might need to elaborate on the issue you are having; e.g., are the files being created, but empty?
A few things I notice:
1) Your indentation is off (not sure if this is just a copy-paste issue though): the pre_processor function is empty, i.e. you are defining the function at the same level as the declaration, not inside of it.
try this:
import os

folder_name = ""  # retained from the original snippet; pre_processor() no longer relies on it


def pre_processor(data_location=r"D:\data",
                  processed_location=r"D:\processed_data",
                  pattern="1293"):
    """Write the lines containing *pattern* into mirrored "processed" files.

    Every file found under *data_location* gets a counterpart named
    ``"processed" + <original file name>`` inside the matching sub-folder of
    *processed_location*. Only the lines that contain *pattern* are copied.

    The destination sub-folders must already exist: this function does not
    create them, and ``open`` raises ``FileNotFoundError`` when one is missing.

    Args:
        data_location: Root folder holding the input sub-folders.
        processed_location: Root folder that receives the filtered files.
        pattern: Substring a line must contain to be kept.
    """
    for root, _dirs, files in os.walk(data_location):
        # Derive the sub-folder from the directory currently being walked.
        # Remembering the last name seen in ``dirs`` would leave a stale
        # value behind and send every file to the same folder.
        sub_folder = os.path.relpath(root, data_location)
        for filename in files:
            source_path = os.path.join(root, filename)
            target_path = os.path.normpath(
                os.path.join(processed_location, sub_folder, "processed" + filename)
            )
            # Both handles are closed even if reading or writing fails.
            with open(source_path, encoding="utf8", mode="r") as source, \
                    open(target_path, "w", encoding="utf8") as target:
                target.writelines(line for line in source if pattern in line)


pre_processor()
2) Check if the processed_data and sub_folders exist; if not, create them first as this will not do so.

Instead of creating the path to the new Folder by hand you could just replace the name of the folder.
Furthermore, you are not creating the subfolders.
This code should work but replace the Linux folder slashes:
import os

folder_name = ""  # retained from the original snippet; pre_processor() does not use it


def pre_processor(data_location="data", processed_location="processed_data", pattern="1293"):
    """Mirror *data_location* into *processed_location*, keeping matching lines.

    For every file below *data_location* a file named
    ``"processed" + <original file name>`` is written to the same relative
    sub-folder of *processed_location*. Missing sub-folders are created.
    Only lines containing *pattern* are written.

    Args:
        data_location: Root folder holding the input sub-folders.
        processed_location: Root folder that receives the filtered files.
        pattern: Substring a line must contain to be kept.
    """
    for root, _dirs, files in os.walk(data_location):
        # Work with the path relative to the input root instead of replacing
        # the text "data/" in it: that breaks with Windows separators and
        # with any other path component that happens to end in "data".
        relative_folder = os.path.relpath(root, data_location)
        processed_folder_name = os.path.normpath(
            os.path.join(processed_location, relative_folder)
        )
        for filename in files:
            # Created lazily so folders without files are not mirrored.
            os.makedirs(processed_folder_name, exist_ok=True)
            joined_path = os.path.join(root, filename)
            processed_file_name = os.path.join(processed_folder_name, "processed" + filename)
            with open(joined_path, encoding="utf8", mode="r") as source, \
                    open(processed_file_name, "w", encoding="utf8") as target:
                target.writelines(line for line in source if pattern in line)


pre_processor()

Related

How to copy merge files of two different directories with different extensions into one directory and remove the duplicated ones

I would need a Python function which performs below action:
I have two directories which in one of them I have files with .xml format and in the other one I have files with .pdf format. To simplify things consider this example:
Directory 1: a.xml, b.xml, c.xml
Directory 2: a.pdf, c.pdf, d.pdf
Output:
Directory 3: a.xml, b.xml, c.xml, d.pdf
As you can see the priority is with the xml files in the case that both extensions have similar names.
I would be thankful for your help.
You need to use the shutil module and the os module to achieve this. This function will work on the following assumption:
A given directory has all files with the same extension
The priority_directory will be the directory with file extensions to be prioritized
The secondary_directory will be the directory with file extensions to be dropped in case of a name collision
Try:
import os
import shutil


def copy_files(priority_directory, secondary_directory, destination="new_directory"):
    """Merge two directories into *destination*, preferring the first one.

    All files of *priority_directory* are copied. A file of
    *secondary_directory* is copied only when no priority file shares its
    name without the extension. Sub-directories are ignored. The resulting
    directory listing is printed.

    Args:
        priority_directory: Directory whose files always win a name clash.
        secondary_directory: Directory whose clashing files are dropped.
        destination: Output directory; created if missing, reused if present.
    """
    # makedirs tolerates an existing destination and creates missing parents.
    os.makedirs(destination, exist_ok=True)

    taken_stems = set()  # base names already claimed by the priority directory
    for name in os.listdir(priority_directory):
        source = os.path.join(priority_directory, name)
        if not os.path.isfile(source):
            continue  # shutil.copy cannot copy a directory
        taken_stems.add(os.path.splitext(name)[0])
        shutil.copy(source, os.path.join(destination, name))

    for name in os.listdir(secondary_directory):
        source = os.path.join(secondary_directory, name)
        if not os.path.isfile(source):
            continue
        if os.path.splitext(name)[0] in taken_stems:
            continue  # the priority directory already supplied this name
        shutil.copy(source, os.path.join(destination, name))

    print(os.listdir(destination))
Let's run it with your directory names as arguments:
copy_files('directory_1','directory_2','directory_3')
You can now check a new directory with the name directory_3 will be created with the desired files in it.
This will work for all such similar cases no matter what the extension is.
Note: There should not be a need to do this, I guess, because a directory can hold two files with the same name as long as their extensions differ.
Rough working solution:
import os
from shutil import copy2

d1 = './d1/'
d2 = './d2/'
d3 = './d3/'
ext_1 = '.xml'
ext_2 = '.pdf'


def get_files(d: str, files: list):
    """Append ``(d, filename)`` to *files* for each entry of directory *d*.

    An entry ending in ``ext_2`` is skipped when *files* already holds an
    entry with the same base name and the ``ext_1`` extension, so the
    ``ext_1`` version wins whenever it was collected first.

    Args:
        d: Directory to list.
        files: List of ``(directory, filename)`` tuples, extended in place.
    """
    # Set of names collected so far, so each clash check is a single lookup.
    known = {name for _, name in files}
    for filename in os.listdir(d):
        if filename.endswith(ext_2) and filename[:-len(ext_2)] + ext_1 in known:
            continue
        files.append((d, filename))
        known.add(filename)


def main():
    """Collect the files of ``d1`` then ``d2`` and copy them into ``d3``."""
    files = []
    get_files(d1, files)
    get_files(d2, files)
    os.makedirs(d3, exist_ok=True)
    for d, file in files:
        copy2(os.path.join(d, file), d3)


if __name__ == "__main__":
    main()
I'll see if I can get it to look/perform better.

Having trouble using zipfile.ZipFile.extractall (Already read the docs)

I have a folder with many zipfiles. Most of these zipfiles contain shapefiles, and some of them have subfolders which contain zipfiles that contain shapefiles. I am trying to extract everything into one main folder without keeping any folder structure. This is where I am now:
# Asker's original script, reproduced as posted; the indentation was lost when the page was extracted.
import os, zipfile
def getListOfFiles(dirName):
# Recursively collect the full path of every non-directory entry below dirName.
# create a list of file and sub directories
# names in the given directory
listOfFile = os.listdir(dirName)
allFiles = list()
# Iterate over all the entries
for entry in listOfFile:
# Create full path
fullPath = os.path.join(dirName, entry)
# If entry is a directory then get the list of files in this directory
if os.path.isdir(fullPath):
allFiles = allFiles + getListOfFiles(fullPath)
else:
allFiles.append(fullPath)
return allFiles
def main():
dirName = r'C:\Users\myusername\My_Dataset'
# Get the list of all files in directory tree at given path
listOfFiles = getListOfFiles(dirName)
# Print the files
for elem in listOfFiles:
print(elem)
# NOTE: extractall is called on the ZipFile class with a path string, not on a ZipFile instance;
# this is the call that fails with the AttributeError quoted after the script.
zipfile.ZipFile.extractall(elem)
print("****************")
if __name__ == '__main__':
main()
This script prints all the shapefiles (including the ones under subfolders). Now I need to extract all these listed shapefiles into one main folder. I try zipfile.ZipFile.extractall(elem) but it doesn't work.
line 1611, in extractall
members = self.namelist()
AttributeError: 'str' object has no attribute 'namelist'
Is the error I'm getting. zipfile.ZipFile.extractall(elem) is the line that doesn't work. I imagine it expects one zipfile but I'm trying to feed it a folder (or a list in this case?)
How would I change this script so that it extracts my listed shapefiles into a folder (preferably a new folder)
You need to make an instance of ZipFile first and use extractall on this instance:
for elem in listOfFiles:
    # Open each archive as an instance; the context manager closes it
    # again as soon as its contents have been extracted.
    with zipfile.ZipFile(elem) as my_zipfile:
        my_zipfile.extractall()
I have added this code block to my script and it works now.
def getfiles(path):
    """Yield the path of every file below *path*.

    When *path* is a directory, the whole tree is walked and each file is
    yielded as ``os.path.join(root, name)``. When *path* is not a
    directory, *path* itself is yielded once.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for name in files:
                yield os.path.join(root, name)
    else:
        yield path
fromdir = r"C:\Users\username\My_Dataset\new"
destination = r"C:\Users\username\Documents\flatten"

# Flatten the tree below fromdir into destination.
os.makedirs(destination, exist_ok=True)
for f in getfiles(fromdir):
    # basename works with either path separator; splitting on "/" does not.
    filename = os.path.basename(f)
    if os.path.isfile(os.path.join(destination, filename)):
        # Name already taken: fold the relative sub-folder path into the
        # file name so this file does not overwrite the earlier one.
        filename = os.path.relpath(f, fromdir).replace(os.sep, "_")
    # Copy rather than move, so the source tree is left intact.
    shutil.copy2(f, os.path.join(destination, filename))

How to rename the files of different format from the same folder but different subfolder using python

I have one scenario where i have to rename the files in the folder. Please find the scenario,
Example :
Elements(Main Folder)<br/>
2(subfolder-1) <br/>
sample_2_description.txt(filename1)<br/>
sample_2_video.avi(filename2)<br/>
3(subfolder2)
sample_3_tag.jpg(filename1)<br/>
sample_3_analysis.GIF(filename2)<br/>
sample_3_word.docx(filename3)<br/>
I want to modify the names of the files as,
Elements(Main Folder)<br/>
2(subfolder1)<br/>
description.txt(filename1)<br/>
video.avi(filename2)<br/>
3(subfolder2)
tag.jpg(filename1)<br/>
analysis.GIF(filename2)<br/>
word.docx(filename3)<br/>
Could anyone guide on how to write the code?
Recursive directory traversal to rename a file can be based on this answer. All we are required to do is to replace the file name instead of the extension in the accepted answer.
Here is one way - split the file name by _ and use the last index of the split list as the new name
import os
import sys


def rename_files(directory):
    """Strip everything up to the last "_" from each file name below *directory*.

    For example ``sample_2_description.txt`` becomes ``description.txt``.
    Each file stays in its own sub-folder. A file is left untouched when its
    name has no "_", when the shortened name would be empty, or when the
    shortened name is already taken in that folder.

    Args:
        directory: Main folder whose sub-folders hold the files to rename.
    """
    for subdir, _dirs, files in os.walk(directory):
        for filename in files:
            # Split only the file name: splitting the whole path would drop
            # the sub-folder and move the file out of it.
            new_name = filename.split("_")[-1]
            if not new_name or new_name == filename:
                continue
            old_path = os.path.join(subdir, filename)
            new_path = os.path.join(subdir, new_name)
            if os.path.exists(new_path):
                continue  # never overwrite another file
            os.rename(old_path, new_path)


directory = os.path.realpath("/path/to/parent/folder")  # the main folder that holds the sub-folders
rename_files(directory)
Hope this helps.
check below code example for the first filename1, replace path with the actual path of the file:
import os

# Build both paths with os.path.join: a raw string containing "\\" holds two
# literal backslashes, not a single path separator.
os.rename(os.path.join('path', 'sample_2_description.txt'),
          os.path.join('path', 'description.txt'))
print("File Renamed!")

extracting file content inside multiple folders and printing required folder names

I have files and directory structure like this:
C:\data\4512\required_data\121\a.txt
C:\data\4512\required_data\121\b.txt
C:\data\4512\required_data\456\c.txt
C:\data\4512\required_data\456\d.txt
C:\data\9813\required_data\789\e.txt
C:\data\9813\required_data\789\f.txt
C:\data\9813\required_data\301\g.txt
C:\data\9813\required_data\301\h.txt
I want to print the content of the text files
Also I want to print outer_folder number like 4512, inner folder number like 121 and the file name
I was trying some code like this:
# Asker's partial attempt, reproduced as posted (indentation lost in extraction); `os` is not imported in this fragment.
path = "C:\\data"
for root, dirs, files in os.walk(path):
for dir in dirs:
print(dir)
# NOTE(review): the folder name is joined onto `path`, not onto the `root` currently being walked.
data_location = os.path.join(path, dir, "required_data")
for example, for this case:
C:\data\4512\required_data\121\a.txt
Expected output:
file=open("a.txt")
print(file) # content of file
print("outer_number") # 4512
print("inner_number") # 121
print("name_of_file") # a.txt
I want to do this for all files

How to have ZipFile only zip a specified directory - Python 3

When I try to zip a directory with the following code, my directory is zipped and contains all the files I would like zipped, however it is also zipping the root directories for the directory I would like zipped.
(Test is the target directory to be zipped - it contains other directories and files) When unzipping the my_python_files.zip, it unzips with absolute paths:
unzipping my_python_files.zip:
\Users\hhafez\Desktop\Test
when I would like to have:
\Test
I am having trouble trying to find a way to avoid this, any tips would be much appreciated.
# Asker's original code, reproduced as posted (indentation lost in extraction); `os` and `ZipFile` are not imported in this fragment.
def get_all_file_paths(directory):
# Collect os.path.join(root, filename) for every file found by os.walk(directory).
file_paths = []
for root, directories, files in os.walk(directory):
for filename in files:
filepath = os.path.join(root, filename)
file_paths.append(filepath)
return file_paths
def zipfiles():
file_paths = get_all_file_paths(r"C:\Users\hhafez\Desktop\Test")
with ZipFile('my_python_files.zip','w') as myzip:
for file in file_paths:
print(file)
# NOTE: write() is given no arcname, so each entry is stored under the full path collected above.
myzip.write(file)
print('All files zipped successfully!')
zipfiles()
If you want to zip the list of files and don't want the zip archive to contain the absolute path of each file, try this:
def zipFiles(directory, zip_path='my_python_files.zip'):
    """Zip *directory* so that archive names start at the directory itself.

    Each file is stored under its path relative to the parent of
    *directory*, e.g. ``Test/sub/file.txt`` for ``.../Desktop/Test``, and not
    under its absolute path.

    Args:
        directory: Folder to archive.
        zip_path: Archive file to create (overwritten if it exists).
    """
    # Normalise first so a trailing separator cannot change what dirname returns.
    directory = os.path.normpath(directory)
    parentDir = os.path.dirname(directory)  # parent path to strip, derived from the argument
    with ZipFile(zip_path, 'w') as myzip:
        for root, directories, files in os.walk(directory):
            zipFileName = root[len(parentDir):] #always take whats after the parentDir for the filename going in the zip
            for file in files:
                # A leading separator left in zipFileName is removed by ZipFile.write itself.
                myzip.write(os.path.join(root,file), os.path.join(zipFileName,file), compress_type=zipfile.ZIP_DEFLATED)


zipFiles(r"C:\Users\hhafez\Desktop\Test")
This should accomplish what you need. The major difference here is the zipFileName variable.
zipFileName = root[len(parentDir):]
This line strips out the parentDir from the directory that you are crawling through . zipFileName and the name of the file would be the archive name to pass to myzip.write which explains this:
myzip.write(os.path.join(root,file), os.path.join(zipFileName,file), compress_type=zipfile.ZIP_DEFLATED)

Resources