iterating over files in folder using os python - python-3.x

Ultimate Goal: Iterate over many files in a folder to perform a specific set of tasks.
Immediate Goal: Load next file (file2) to perform tasks
Background: I am using the following code
import os
folder = '/Users/eer/Desktop/myfolder/'

# Print the name of every visible regular file that sits directly in `folder`.
# NOTE: the inner listing always reads `folder` itself, so it is repeated once
# for every directory that os.walk visits.
for _subdir, _dirs, _files in os.walk(folder):
    for item in os.listdir(folder):
        is_hidden = item.startswith('.')  # gets rid of the .DS_store file
        if is_hidden or not os.path.isfile(os.path.join(folder, item)):
            continue
        print(item)
Output: print(item)
file1.txt
file2.txt
file3.txt
(etc...)
I am using the following code to open the first file:
# `folder` and `item` are the names set by the listing loop shown earlier.
# The path is built by plain concatenation, which relies on `folder` ending
# with a path separator.
data_path = folder + item
# Opened for reading in text mode; this fragment never closes the handle.
file = open(data_path, "r")
#perform a set of tasks for this file
This works well for opening the first file, file1.txt and performing a set of tasks.
However, I am not sure how to load file2.txt (and eventually file3.txt, etc.) so that I can continue performing the tasks.
Questions:
1) How do I put this code in a for loop? (so I can load, and perform tasks on all the files)?

You can do the file operations in the same loop like:
import os
folder = '/Users/eer/Desktop/myfolder/'

# One pass over os.listdir is enough: wrapping it in os.walk would repeat the
# whole listing (and reopen every file) once per sub-directory found.
# sorted() gives a stable file1, file2, ... order; os.listdir's is arbitrary.
for item in sorted(os.listdir(folder)):
    data_path = os.path.join(folder, item)
    # Skip hidden entries such as .DS_Store and anything that is not a file.
    if item.startswith('.') or not os.path.isfile(data_path):
        continue
    # The with-block closes each file before the next one is opened.
    with open(data_path, "r") as file:
        ...  # use file here

Related

How to copy merge files of two different directories with different extensions into one directory and remove the duplicated ones

I would need a Python function which performs below action:
I have two directories: one contains files in .xml format and the other contains files in .pdf format. To simplify things, consider this example:
Directory 1: a.xml, b.xml, c.xml
Directory 2: a.pdf, c.pdf, d.pdf
Output:
Directory 3: a.xml, b.xml, c.xml, d.pdf
As you can see, the .xml file takes priority when both directories contain a file with the same base name.
I would be thankful for your help.
You need to use the shutil module and the os module to achieve this. This function will work on the following assumption:
A given directory has all files with the same extension
The priority_directory will be the directory with file extensions to be prioritized
The secondary_directory will be the directory with file extensions to be dropped in case of a name collision
Try:
import os,shutil
def copy_files(priority_directory, secondary_directory, destination="new_directory"):
    """Merge two directories into *destination*, preferring the priority one.

    Every regular file in ``priority_directory`` is copied. A file from
    ``secondary_directory`` is copied only when no priority file shares its
    base name (the name without its extension), so ``a.pdf`` is dropped when
    ``a.xml`` exists in the priority directory.

    Args:
        priority_directory: Directory whose files always win a name collision.
        secondary_directory: Directory whose colliding files are skipped.
        destination: Directory to copy into; created (with parents) if missing.

    Prints the final listing of ``destination`` and returns ``None``.
    """
    def regular_files(directory):
        # Sub-directories cannot be handed to shutil.copy, so leave them out.
        return [
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        ]

    priority_files = regular_files(priority_directory)
    # A set gives a constant-time collision check for each secondary file.
    taken_stems = {os.path.splitext(name)[0] for name in priority_files}

    # exist_ok: merging into an already existing folder must not fail.
    os.makedirs(destination, exist_ok=True)

    # The priority directory is copied as it is.
    for name in priority_files:
        shutil.copy(
            os.path.join(priority_directory, name),
            os.path.join(destination, name),
        )

    # Secondary files are copied only when their base name is still free.
    for name in regular_files(secondary_directory):
        if os.path.splitext(name)[0] not in taken_stems:
            shutil.copy(
                os.path.join(secondary_directory, name),
                os.path.join(destination, name),
            )

    print(os.listdir(destination))
Let's run it with your directory names as arguments:
copy_files('directory_1','directory_2','directory_3')
You can now check that a new directory named directory_3 has been created with the desired files in it.
This will work for all such similar cases no matter what the extension is.
Note: I guess there should be no need to do this, because a directory can contain two files with the same name as long as their extensions differ.
Rough working solution:
import os
from shutil import copy2
# Directories to merge (collected in this order) and the directory that
# receives the copies.
d1 = './d1/'
d2 = './d2/'
d3 = './d3/'
# A file ending in ext_2 is skipped by get_files when a file with the same
# stem and the ext_1 extension has already been collected.
ext_1 = '.xml'
ext_2 = '.pdf'
def get_files(d: str, files: list):
    """Collect ``(directory, filename)`` pairs for the entries of *d* into *files*.

    An entry ending in ``ext_2`` is skipped when *files* already holds an entry
    with the same stem and the ``ext_1`` extension, so the directory holding
    the preferred ``ext_1`` files has to be collected first.

    Args:
        d: Directory to list.
        files: List extended in place with ``(d, filename)`` tuples.
    """
    # Names collected so far, kept in a set so each duplicate check is a
    # constant-time lookup instead of a scan over the whole list.
    collected = {name for _, name in files}
    for filename in os.listdir(d):
        if filename.endswith(ext_2):
            # Strip exactly the extension's length rather than a fixed 4 chars.
            preferred_name = filename[:-len(ext_2)] + ext_1
            if preferred_name in collected:
                continue
        files.append((d, filename))
        collected.add(filename)
# Collect d1 first: get_files only drops an ext_2 file when the matching
# ext_1 file is already in the list.
files = []
get_files(d1, files)
get_files(d2, files)

# copy2 does not create the target directory, so make sure it exists.
os.makedirs(d3, exist_ok=True)
for d, file in files:
    # os.path.join works whether or not `d` ends with a path separator.
    copy2(os.path.join(d, file), d3)
I'll see if I can get it to look/perform better.

Select specific files with their name in folders then manipulate them (Python)

I am trying to write a script that selects specific files by name from several folders and then copies those files into a new folder.
More precisely, I have a directory that contains 29 folders, and each folder contains hundreds of '*.fits' files.
I want to select among those fits files those which do not have the numbers '4' or '8' in the last 3 digits before .fits
For example: "ngc6397id016000520jd2456870p5705f002.fits" has '002' as last three digits before the extension .fits
I am kind of lost here as I am pretty new to this, can anyone help ?
Thank you!
import os
import shutil
data_path = ".<your directory with 29 folders>"
# Placeholder: set this to the folder that should receive the selected files.
destination = "<your destination folder>"
# Without an existing directory shutil.copy would write a single file named
# `destination` and overwrite it on every iteration.
os.makedirs(destination, exist_ok=True)

data_dir = os.listdir(data_path)
# for each folder in the directory
for fits_dir in data_dir:
    fits_path = os.path.join(data_path, fits_dir)
    if not os.path.isdir(fits_path):
        continue  # stray files next to the folders cannot be listed
    # for each .fits file in the folder
    for file in os.listdir(fits_path):
        # splitext keeps the whole stem even when the name contains other dots
        stem, extension = os.path.splitext(file)
        if extension != ".fits":
            continue
        last_three = stem[-3:]
        # if neither 4 nor 8 is in the last 3 digits before the extension
        if '4' not in last_three and '8' not in last_three:
            shutil.copy(os.path.join(fits_path, file), destination)
SO is for code help, however, this should get you started.
The below code will print out the desired files by traversing all subdirectories and contained files. There are improvements you must make to this code for it to be reliable; such as error and case checking, however, it will serve to get you going.
import os
TARGET_DIR = r"C:\yourDir"
IGNORE_NUM = ['4', '8']

# Report every .fits file under TARGET_DIR whose last three stem characters
# contain none of the digits listed in IGNORE_NUM. The printed index is the
# file's position within its own directory listing.
for _path, _dirs, names in os.walk(TARGET_DIR):
    for position, name in enumerate(names):
        stem, extension = os.path.splitext(name)
        has_ignored_digit = any(digit in stem[-3:] for digit in IGNORE_NUM)
        if extension == ".fits" and not has_ignored_digit:
            print(f"[{position}] {name}")
From that, it is trivial to copy the file over to your desired directory using the shutil library.
# Placeholder call: `src` and `dst` are not defined in the snippet above,
# which also imports only `os`, so `shutil` still has to be imported.
shutil.copyfile(src, dst)
Combine all of that and you have your script.

extracting file content inside multiple folders and printing required folder names

I have files and directory structure like this:
C:\data\4512\required_data\121\a.txt
C:\data\4512\required_data\121\b.txt
C:\data\4512\required_data\456\c.txt
C:\data\4512\required_data\456\d.txt
C:\data\9813\required_data\789\e.txt
C:\data\9813\required_data\789\f.txt
C:\data\9813\required_data\301\g.txt
C:\data\9813\required_data\301\h.txt
I want to print the content of the text files
Also I want to print outer_folder number like 4512, inner folder number like 121 and the file name
I was trying some code like this:
path = "C:\\data"

# Print the name of every directory found at any depth below `path`.
for _root, subfolders, _files in os.walk(path):
    for name in subfolders:
        print(name)
        # Built from the top-level `path`, not from the directory being
        # walked; the value is not used any further in this attempt.
        data_location = os.path.join(path, name, "required_data")
for example, for this case:
C:\data\4512\required_data\121\a.txt
Expected output:
file=open("a.txt")
print(file) # content of file
print("outer_number") # 4512
print("inner_number") # 121
print("name_of_file") # a.txt
I want to do this for all files

Python Open last file having same extension

I have many folders say "folder1", "folder2" etc
Each folder in turn has many files with different extensions
eg:
file1.txt
file2.txt
file3.txt
file4.txt
file5.txt
file6.txt
file7.txt
file8.txt
file.xlsx
file2.bin
I want to open the last file in every folder.
For instance, here I want to open file8.txt, as it's the last file with the extension "txt".
Can anyone please let me know a generic method to do this.
I am a beginner in python.
Try this:
import glob
import os
list_of_files = glob.glob('/path/to/folder/*.txt')
# max() raises ValueError on an empty list, so only pick a file when the
# pattern actually matched something (e.g. not for a folder without .txt files).
if list_of_files:
    # "Latest" here means the greatest os.path.getctime value.
    latest_file = max(list_of_files, key=os.path.getctime)
    print(latest_file)
    # The with-block closes the file even if the processing raises.
    with open(latest_file, 'r') as f:
        ...  # work with f here
You can run this in a loop for all folders. Let me know if it works.
Considering last means latest timestamp of file this should do :
import os
import glob
def get_latest_file(file_loc, file_nm_str):
    """Return the matching path with the greatest ``os.path.getctime`` value.

    The glob pattern is ``file_loc + "*" + file_nm_str + "*"``, so ``file_loc``
    is used as a plain prefix and should end with a path separator when it
    names a directory.

    Args:
        file_loc: Prefix of the pattern, typically a directory path.
        file_nm_str: Text that must appear in the file name.

    Returns:
        The path, as returned by ``glob.glob``, with the latest ctime.

    Raises:
        ValueError: If nothing matches the pattern.
    """
    pattern = file_loc + "*" + file_nm_str + "*"
    candidates = glob.glob(pattern)
    if not candidates:
        # Same exception type max() would raise, but it names the pattern.
        raise ValueError(f"no files match pattern {pattern!r}")
    return max(candidates, key=os.path.getctime)
You can modify * with required extension.

creating corresponding subfolders and writing a portion of the file in new files inside those subfolders using python

I have a folder named "data". It contains subfolders "data_1", "data_2", and "data_3". These subfolders contain some text files. I want to parse through all these subfolders and generate corresponding subfolders with the same name, inside another folder named "processed_data". I want to also generate corresponding files with "processed" as a prefix in the name and want to write all those lines from the original file where "1293" is there in the original files.
I am using the below code but not able to get the required result. Neither the subfolders "data_1", "data_2", and "data_3" nor the files are getting created
import os
# NOTE(review): this listing has lost its indentation, so the nesting of the
# loops below cannot be recovered from the text alone.
folder_name=""
def pre_processor():
# NOTE: "\d" is not an escape sequence, so the backslash is kept literally.
data_location="D:\data" # folder containing all the data
# Visit data_location and every directory below it.
for root, dirs, files in os.walk(data_location):
for dir in dirs:
#folder_name=""
# Keeps a sub-directory name of the directory currently being visited.
folder_name=dir
for filename in files:
with open(os.path.join(root, filename),encoding="utf8",mode="r") as f:
# The output path is assembled by hand; nothing here creates its folder.
processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
processed_file = open(processed_file_name,"w", encoding="utf8")
# Copy over only the lines that contain "1293".
for line_number, line in enumerate(f, 1):
if "1293" in line:
processed_file.write(str(line))
processed_file.close()
pre_processor()
You might need to elaborate on the issue you are having; e.g., are the files being created, but empty?
A few things I notice:
1) Your indentation is off (not sure if this is just a copy-paste issue though): the pre_processor function is empty, i.e. you are defining the function at the same level as the declaration, not inside of it.
try this:
import os
folder_name=""
def pre_processor():
    """Write the lines containing "1293" of every data file to a processed copy.

    Walks ``D:\\data`` and, for each file, writes
    ``D:\\processed_data\\<sub-folder>\\processed<filename>`` holding only the
    lines that contain "1293". The output folders are not created here and
    must already exist.
    """
    # Escaped backslash: "\d" is not a valid escape sequence.
    data_location = "D:\\data"  # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        # Name the output folder after the folder that actually holds the
        # files; taking the name from `dirs` would pick a sub-folder of
        # `root` (or keep a stale value when `root` has none).
        folder_name = os.path.relpath(root, data_location)
        for filename in files:
            processed_file_name = os.path.join(
                'D:\\processed_data', folder_name, 'processed' + filename
            )
            # Both files are closed even if reading or writing fails.
            with open(os.path.join(root, filename), encoding="utf8", mode="r") as f, \
                    open(processed_file_name, "w", encoding="utf8") as processed_file:
                for line in f:
                    if "1293" in line:
                        processed_file.write(line)
pre_processor()
2) Check if the processed_data and sub_folders exist; if not, create them first as this will not do so.
Instead of creating the path to the new Folder by hand you could just replace the name of the folder.
Furthermore, you are not creating the subfolders.
This code should work but replace the Linux folder slashes:
import os
folder_name=""
def pre_processor():
    """Mirror ``data`` into ``processed_data``, keeping only lines with "1293".

    For every file below ``data`` a file named ``processed<filename>`` is
    written to the same relative folder below ``processed_data``; missing
    output folders are created. Both paths are relative to the current
    working directory.
    """
    data_location = "data"  # folder containing all the data
    output_location = "processed_data"
    for root, dirs, files in os.walk(data_location):
        if not files:
            continue  # only create output folders that will hold a file
        # Mirror the path relative to data_location. A textual
        # replace("data/", ...) misses files directly inside `data`, rewrites
        # any later "data/" in the path, and only works with "/" separators.
        relative_folder = os.path.relpath(root, data_location)
        processed_folder_name = os.path.normpath(
            os.path.join(output_location, relative_folder)
        )
        os.makedirs(processed_folder_name, exist_ok=True)
        for filename in files:
            joined_path = os.path.join(root, filename)
            processed_file_name = os.path.join(
                processed_folder_name, 'processed' + filename
            )
            # Both files are closed even if reading or writing fails.
            with open(joined_path, encoding="utf8", mode="r") as f, \
                    open(processed_file_name, "w", encoding="utf8") as processed_file:
                for line in f:
                    if "1293" in line:
                        processed_file.write(line)
pre_processor()

Resources