How to search certain files from certain subdirectories using python - python-3.x

I have directories like this:
MOL1 MOL2 MOL3 dir1 test1 scripts
The code below lives in the scripts directory, and what I want to do is loop through all directories whose names start with MOL and search them for files whose names start with info:
import os

def get_info():
    parent = os.pardir
    contents = os.listdir(parent)
    list1 = []
    for dir in contents:
        if dir.startswith("MOL"):
            for file in dir:
                path = os.path.join(parent, file)
                if file.startswith("info"):
                    with open(path, "r") as data:
                        text = data.readlines()
                        for num1, row1 in enumerate(text):
                            row1 = row1.rstrip("\n").split()
                            list1.append(row1)
    list2 = []
    # do stuff for list2
    return list2

get_info()
But this does not work. Nothing happens when I run this. How can I get this working?

The code you posted depends on the location of your working directory, i.e. the directory from which you call your script, which is not necessarily the parent folder of the script. You can use the module-level __file__ attribute to build paths relative to the script itself:
from pathlib import Path
script_dir = Path(__file__).resolve().parent
info_files = script_dir.parent.glob('MOL*/info*')
for info_file in info_files:
    # do something with the file
Have a look at the pathlib module which provides a more convenient interface to the file system.
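For example, a minimal sketch of how get_info from the question could be rebuilt on top of that glob pattern, keeping the original line-splitting logic (the list2 processing is left as a placeholder, as in the question):
from pathlib import Path

def get_info():
    script_dir = Path(__file__).resolve().parent
    list1 = []
    # Visit every info* file inside every MOL* directory next to the scripts folder.
    for info_file in script_dir.parent.glob('MOL*/info*'):
        with info_file.open() as data:
            for row in data:
                list1.append(row.rstrip("\n").split())
    list2 = []
    # do stuff for list2, as in the original code
    return list2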

A practical way of debugging this is simply to add print statements so you can see which directories and files the script actually finds.

Related

Find Files with the term "deadbolt" in it and return only first subfolder with os.walk

This script gets a term and a path to a folder. Its goal is to search every subfolder for files whose names contain the term "deadbolt", collect them in a list, and return that list.
So far so good, but in the end I want to delete the first-level subfolder in which the script found a deadbolt file.
So, for example, I have the following folder structure:
d:/Movies/
├─ Movie1/subfolder1Movie1/subfolder2Movie1/movie1.mp4.deadbolt
├─ Movie2/subfolder1Movie2/subfolder2Movie2/movie2.mpeg
├─ Movie3/subfolder1Movie3/subfolder2Movie3/movie3.avi.deadbolt
In this case I provide the path "D:\Movies" and the term "deadbolt" and want the script to return ["Movie1","Movie3"].
I want to delete those folder structures completely, with their subfolders and files. But how can I get the first-level subfolder where a file was found, without using a regex?
import os
import re

def findDeadbolts(searchTerm, search_path):
    results = []
    for root, dir, files in os.walk(search_path, topdown=True):
        for filename in files:
            if searchTerm in filename:
                fullPath = os.path.join(root, filename)
                results.append(fullPath)
                pattern = "(?<=Movies\\\\)[a-zA-Z0-9\_\-\!\?]+"  # Don't want to do it with regex since names can be quite complex
                print(re.search(pattern, fullPath)[0])
    return results

print(findDeadbolts('deadbolt', 'D:\\Movies'))
I found a solution for this.
It uses the parts attribute of pathlib.Path, which gives me every component of a path. Since I know the root path, I can compare lengths with len(): the component right below the root sits at index len(rootParts), because indexing starts at 0.
from pathlib import Path
import os
from shutil import rmtree

foundPath = r"D:\Movies\Movie2\Movie2.avi.deadbolt"
rootPath = r"D:\Movies"

foundParts = Path(foundPath).parts
rootParts = Path(rootPath).parts
folder = foundParts[len(rootParts)]

rmtree(os.path.join(rootPath, folder))
If there is a better solution for this, comment below. ;-)
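Putting the two pieces together, here is a rough sketch (the function name findDeadboltFolders is just illustrative) of how os.walk and Path.parts could be combined so the function returns only the first-level folder names, e.g. ["Movie1", "Movie3"]:
import os
from pathlib import Path

def findDeadboltFolders(searchTerm, search_path):
    rootParts = Path(search_path).parts
    folders = set()
    for root, dirs, files in os.walk(search_path):
        for filename in files:
            if searchTerm in filename:
                fullPath = Path(root, filename)
                # The component right below the root sits at index len(rootParts).
                folders.add(fullPath.parts[len(rootParts)])
    return sorted(folders)

print(findDeadboltFolders('deadbolt', r'D:\Movies'))  # e.g. ['Movie1', 'Movie3']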

How to copy/merge files from two different directories with different extensions into one directory and remove the duplicated ones

I need a Python function which performs the action below:
I have two directories: one contains files in .xml format and the other contains files in .pdf format. To simplify things, consider this example:
Directory 1: a.xml, b.xml, c.xml
Directory 2: a.pdf, c.pdf, d.pdf
Output:
Directory 3: a.xml, b.xml, c.xml, d.pdf
As you can see, the xml files take priority when files with both extensions share the same base name.
I would be thankful for your help.
You need to use the shutil module and the os module to achieve this. The function below works on the following assumptions:
A given directory has all files with the same extension
The priority_directory will be the directory with file extensions to be prioritized
The secondary_directory will be the directory with file extensions to be dropped in case of a name collision
Try:
import os, shutil

def copy_files(priority_directory, secondary_directory, destination="new_directory"):
    file_names = [os.path.splitext(filename)[0] for filename in os.listdir(priority_directory)]  # get the file names to check for collisions
    os.mkdir(destination)  # make a new directory
    for file in os.listdir(priority_directory):  # this loop copies the first directory as it is
        file_path = os.path.join(priority_directory, file)
        dst_path = os.path.join(destination, file)
        shutil.copy(file_path, dst_path)
    for file in os.listdir(secondary_directory):  # this loop checks for collisions and drops files whose names collide
        if os.path.splitext(file)[0] not in file_names:
            file_path = os.path.join(secondary_directory, file)
            dst_path = os.path.join(destination, file)
            shutil.copy(file_path, dst_path)
    print(os.listdir(destination))
Let's run it with your directory names as arguments:
copy_files('directory_1','directory_2','directory_3')
You can now check a new directory with the name directory_3 will be created with the desired files in it.
This will work for all such similar cases no matter what the extension is.
Note: there shouldn't really be a need to do this, I guess, because a directory can contain two files with the same name as long as the extensions differ.
Rough working solution:
import os
from shutil import copy2

d1 = './d1/'
d2 = './d2/'
d3 = './d3/'
ext_1 = '.xml'
ext_2 = '.pdf'

def get_files(d: str, files: list):
    directory = os.fsencode(d)
    for file in os.listdir(d):
        dup = False
        filename = os.fsdecode(file)
        if filename[-4:] == ext_2:
            for (x, y) in files:
                if y == filename[:-4] + ext_1:
                    dup = True
                    break
        if dup:
            continue
        files.append((d, filename))

files = []
get_files(d1, files)
get_files(d2, files)
for d, file in files:
    copy2(d + file, d3)
I'll see if I can get it to look/perform better.
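For what it's worth, a possible tidier variant of the same idea using pathlib (just a sketch, reusing the d1/d2/d3 names and the .xml/.pdf extensions from the snippet above):
from pathlib import Path
from shutil import copy2

d1, d2, d3 = Path('./d1'), Path('./d2'), Path('./d3')
d3.mkdir(exist_ok=True)

taken = {p.stem for p in d1.glob('*.xml')}  # base names already present in the priority directory
for p in d1.glob('*.xml'):
    copy2(p, d3 / p.name)
for p in d2.glob('*.pdf'):
    if p.stem not in taken:  # skip pdfs whose base name collides with an xml
        copy2(p, d3 / p.name)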

For Loop to Move and Rename .html Files - Python 3

I'm asking for help in trying to create a loop to make this script go through all files in a local directory. Currently I have this script working with a single HTML file, but I would like it to pick the first file in the directory and loop until it gets to the last file in the directory.
Another way to help would be adding a line so that a (1), (2), (3), etc. gets appended to the name if it is a duplicate.
Can anyone help with renaming thousands of files using a string that is parsed with BeautifulSoup4? Each file contains a name and reference number at the same position/line; it could be the same name and reference number, or a different reference number with the same name.
import bs4, shutil, os
src_dir = os.getcwd()
print(src_dir)
os.mkdir('subfolder')
os.listdir()
dest_dir = src_dir + "/subfolder"
src_file = os.path.join(src_dir, 'example_filename_here.html')
shutil.copy(src_file, dest_dir)
exampleFile = open('example_filename_here.html')
exampleSoup = bs4.BeautifulSoup(exampleFile.read(), 'html.parser')
elems = exampleSoup.select('.bodycopy')
type(elems)
elems[2].getText()
dst_file = os.path.join(dest_dir, 'example_filename_here.html')
new_dst_file_name = os.path.join(dest_dir, elems[2].getText()+ '.html')
os.rename(dst_file, new_dst_file_name)
os.chdir(dest_dir)
print(elems[2].getText())
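No answer is attached to this one, so here is a rough sketch of how the loop and the duplicate numbering could look, assuming the same .bodycopy selector and that elems[2] holds the desired name (both taken from the snippet above):
import os
import shutil
import bs4

src_dir = os.getcwd()
dest_dir = os.path.join(src_dir, "subfolder")
os.makedirs(dest_dir, exist_ok=True)

for entry in os.listdir(src_dir):
    if not entry.endswith(".html"):
        continue
    with open(os.path.join(src_dir, entry), encoding="utf-8") as fh:
        soup = bs4.BeautifulSoup(fh.read(), "html.parser")
    elems = soup.select(".bodycopy")
    new_name = elems[2].getText().strip()
    # Append (1), (2), ... if a file with that name already exists in the destination.
    candidate = os.path.join(dest_dir, new_name + ".html")
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dest_dir, "{} ({}).html".format(new_name, counter))
        counter += 1
    shutil.copy(os.path.join(src_dir, entry), candidate)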

Having trouble using zipfile.ZipFile.extractall (Already read the docs)

I have a folder with many zipfiles; most of these zipfiles contain shapefiles, and some of them have subfolders which contain zipfiles that contain shapefiles. I am trying to extract everything into one main folder without keeping any folder structure. This is where I am now:
import os, zipfile

def getListOfFiles(dirName):
    # create a list of file and sub directory
    # names in the given directory
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
    return allFiles

def main():
    dirName = r'C:\Users\myusername\My_Dataset'
    # Get the list of all files in directory tree at given path
    listOfFiles = getListOfFiles(dirName)
    # Print the files
    for elem in listOfFiles:
        print(elem)
        zipfile.ZipFile.extractall(elem)
        print("****************")

if __name__ == '__main__':
    main()
This script prints all the shapefiles (including the ones under subfolders). Now I need to extract all these listed shapefiles into one main folder. I try zipfile.ZipFile.extractall(elem) but it doesn't work.
line 1611, in extractall
members = self.namelist()
AttributeError: 'str' object has no attribute 'namelist'
Is the error I'm getting. zipfile.ZipFile.extractall(elem) is the line that doesn't work. I imagine it expects one zipfile but I'm trying to feed it a folder (or a list in this case?)
How would I change this script so that it extracts my listed shapefiles into a folder (preferably a new folder)?
You need to make an instance of ZipFile first and use extractall on this instance:
for elem in listOfFiles:
    my_zipfile = zipfile.ZipFile(elem)
    my_zipfile.extractall()
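If everything should end up in one new folder, extractall also accepts a path argument; a small sketch (the target folder name is just an example). Note that this still keeps the folder structure stored inside each archive, so flattening that completely needs an extra copy step like the one below:
import os
import zipfile

target = r'C:\Users\myusername\My_Dataset_extracted'  # example target folder
os.makedirs(target, exist_ok=True)

for elem in listOfFiles:
    if zipfile.is_zipfile(elem):  # skip anything that is not a zip archive
        with zipfile.ZipFile(elem) as zf:
            zf.extractall(path=target)  # everything lands in the single target folder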
I have added this code block to my script and it works now.
import shutil

def getfiles(path):
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for name in files:
                yield os.path.join(root, name)
    else:
        yield path

fromdir = r"C:\Users\username\My_Dataset\new"
for f in getfiles(fromdir):
    filename = str.split(f, '/')[-1]
    if os.path.isfile(destination + filename):  # 'destination' is assumed to be defined elsewhere in the script
        filename = f.replace(fromdir, "", 1).replace("/", "_")
        # os.rename(f, destination+filename)
    shutil.copy2(f, r"C:\Users\username\Documents\flatten")

Can I force os.walk to visit directories in alphabetical order?

I would like to know if it's possible to force os.walk in python3 to visit directories in alphabetical order. For example, here is a directory and some code that will walk this directory:
ryan:~/bktest$ ls -1 sample
CD01
CD02
CD03
CD04
CD05
--------
def main_work_subdirs(gl):
    for root, dirs, files in os.walk(gl['pwd']):
        if root == gl['pwd']:
            for d2i in dirs:
                print(d2i)
When the python code hits the directory above, here is the output:
ryan:~/bktest$ ~/test.py sample
CD03
CD01
CD05
CD02
CD04
I would like to force walk to visit these dirs in alphabetical order, 01, 02 ... 05. In the python3 doc for os.walk, it says:
When topdown is True, the caller can modify the dirnames list in-place
(perhaps using del or slice assignment), and walk() will only recurse
into the subdirectories whose names remain in dirnames; this can be
used to prune the search, impose a specific order of visiting
Does that mean that I can impose an alphabetical visiting order on os.walk? If so, how?
Yes. You can sort dirs in-place inside the loop:
def main_work_subdirs(gl):
    for root, dirs, files in os.walk(gl['pwd']):
        dirs.sort()
        if root == gl['pwd']:
            for d2i in dirs:
                print(d2i)
I know this has already been answered but I wanted to add one little detail and adding more than a single line of code in the comments is wonky.
In addition to wanting the directories sorted I also wanted the files sorted so that my iteration through "gl" was consistent and predictable. To do this one more sort was required:
for root, dirs, files in os.walk(gl['pwd']):
    dirs.sort()
    for filename in sorted(files):
        print(os.path.join(root, filename))
And, with benefit of learning more about Python, a different (better) way:
from pathlib import Path
# Directories, per original question.
[print(p) for p in sorted(Path(gl['pwd']).glob('**/*')) if p.is_dir()]
# Files, like I usually need.
[print(p) for p in sorted(Path(gl['pwd']).glob('**/*')) if p.is_file()]
This answer is not specific to this question and the problem is a little different but the solution can be used in either case.
Consider having these files ("one1.txt", "one2.txt", "one10.txt"), where the content of each of them is the string "default".
I want to loop through a directory that contains these files and find a specific String in every file and replace it with the name of the file.
If you use the other methods which have already been mentioned here and in other questions (like dirs.sort(), sorted(files) and sorted(dirs)), the result will be something like this:
"one1.txt"--> "one10"
"one2.txt"--> "one1"
"one10.txt" --> "one2"
But we want it to be:
"one1.txt"--> "one1"
"one2.txt"--> "one2"
"one10.txt" --> "one10"
I found this method, which replaces the file contents while visiting the files in natural (human) order:
import re, os, fnmatch

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]

def findReplace(directory, find, replace, filePattern):
    count = 0
    for path, dirs, files in sorted(os.walk(os.path.abspath(directory))):
        dirs.sort()
        for filename in sorted(fnmatch.filter(files, filePattern), key=natural_keys):
            count = count + 1
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            s = s.replace(find, replace + str(count) + ".png")
            with open(filepath, "w") as f:
                f.write(s)
Then run this line:
findReplace(os.getcwd(), "default", "one", "*.xml")
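To connect this back to the original question: the same natural_keys function can also be passed as the key when sorting dirs in-place, so that os.walk descends in natural rather than plain alphabetical order (a sketch reusing gl and natural_keys from above):
import os

for root, dirs, files in os.walk(gl['pwd']):
    # Sort directories in natural (human) order before os.walk recurses into them.
    dirs.sort(key=natural_keys)
    for filename in sorted(files, key=natural_keys):
        print(os.path.join(root, filename))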
