File comparison in two directories - python-3.x

I am comparing all files in two directories. If the comparison ratio is greater than 90%, I want to continue the outer loop and remove the matched file from the second directory, so that the next file in the first directory is not compared against a file that has already been matched.
Here's what I've tried:
for i for i in sorted_files:
    for j in sorted_github_files:
        #pdb.set_trace()
        with open(f'./files/{i}') as f1:
            try:
                text1 = f1.read()
            except:
                pass
        with open(f'./github_files/{j}') as f2:
            try:
                text2 = f2.read()
            except:
                pass
        m = SequenceMatcher(None, text1, text2)
        print("file1:", i, "file2:", j)
        if m.ratio() > 0.90:
            os.remove(f'./github_files/{j}')
            break
I know I cannot change the iterable while the iteration is in progress, which is why it returns a file-not-found error. I don't want to use try/except blocks. Any ideas appreciated.

A couple of things to point out:
Always provide a minimal reproducible example.
Your first for loop is not working since you used `for i for i ...`.
If you want to iterate over the files in list1 (sorted_files) first, read each file outside of the second loop.
I would add the files that match with a ratio over 0.90 to a new list and remove them afterward, so your items do not change during the iteration.
You can find the test data I created and used here.
import os
from difflib import SequenceMatcher
# define your two folders, full paths
first_path = os.path.abspath(r"C:\Users\XYZ\Desktop\testfolder\a")
second_path = os.path.abspath(r"C:\Users\XYZ\Desktop\testfolder\b")
# get files from folder
first_path_files = os.listdir(first_path)
second_path_files = os.listdir(second_path)
# join path and filenames
first_folder = [os.path.join(first_path, f) for f in first_path_files]
second_folder = [os.path.join(second_path, f) for f in second_path_files]
# empty list for matching results
matched_files = []
# iterate over the files in the first folder
for file_one in first_folder:
    # read file content
    with open(file_one, "r") as f:
        file_one_text = f.read()
    # iterate over the files in the second folder
    for file_two in second_folder:
        # read file content
        with open(file_two, "r") as f:
            file_two_text = f.read()
        # match the two file contents
        match = SequenceMatcher(None, file_one_text, file_two_text)
        if match.ratio() > 0.90:
            print(f"Match found ({match.ratio()}): '{file_one}' | '{file_two}'")
            # TODO: here you have to decide if you rather want to remove files from the first or second folder
            matched_files.append(file_two)  # i delete files from the second folder
# remove duplicates from the resulting list
matched_files = list(set(matched_files))
# remove the files
for f in matched_files:
    print(f"Removing file: {f}")
    os.remove(f)
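If you also want the behaviour described in the original question, where a file from the first folder stops comparing after its first match and an already-matched file from the second folder is never compared again, a small tweak of the loops could look like this (a sketch, reusing the variable names from the snippet above):
for file_one in first_folder:
    with open(file_one, "r") as f:
        file_one_text = f.read()
    for file_two in second_folder:
        # skip files that already matched an earlier file from the first folder
        if file_two in matched_files:
            continue
        with open(file_two, "r") as f:
            file_two_text = f.read()
        if SequenceMatcher(None, file_one_text, file_two_text).ratio() > 0.90:
            matched_files.append(file_two)
            break  # continue with the next file in the first folder
The os.remove calls still happen after both loops finish, so nothing is deleted while the folders are being iterated.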

Related

How to copy/merge files of two different directories with different extensions into one directory and remove the duplicated ones

I need a Python function that performs the action below:
I have two directories; one of them contains files in .xml format and the other contains files in .pdf format. To simplify things, consider this example:
Directory 1: a.xml, b.xml, c.xml
Directory 2: a.pdf, c.pdf, d.pdf
Output:
Directory 3: a.xml, b.xml, c.xml, d.pdf
As you can see, the xml files take priority when both extensions share the same file name.
I would be thankful for your help.
You need to use the shutil module and the os module to achieve this. This function will work on the following assumptions:
A given directory has all files with the same extension
The priority_directory will be the directory with file extensions to be prioritized
The secondary_directory will be the directory with file extensions to be dropped in case of a name collision
Try:
import os, shutil
def copy_files(priority_directory, secondary_directory, destination="new_directory"):
    file_names = [os.path.splitext(filename)[0] for filename in os.listdir(priority_directory)]  # get the file names to check for collisions
    os.mkdir(destination)  # make a new directory
    for file in os.listdir(priority_directory):  # this loop copies the first directory as it is
        file_path = os.path.join(priority_directory, file)
        dst_path = os.path.join(destination, file)
        shutil.copy(file_path, dst_path)
    for file in os.listdir(secondary_directory):  # this loop checks for collisions and drops files whose names collide
        if os.path.splitext(file)[0] not in file_names:
            file_path = os.path.join(secondary_directory, file)
            dst_path = os.path.join(destination, file)
            shutil.copy(file_path, dst_path)
    print(os.listdir(destination))
Let's run it with your directory names as arguments:
copy_files('directory_1','directory_2','directory_3')
You can now check that a new directory with the name directory_3 has been created with the desired files in it.
This will work for all such similar cases, no matter what the extensions are.
Note: There should not really be a need to do this, I guess, since a directory can hold two files with the same name as long as the extensions differ.
Rough working solution:
import os
from shutil import copy2
d1 = './d1/'
d2 = './d2/'
d3 = './d3/'
ext_1 = '.xml'
ext_2 = '.pdf'
def get_files(d: str, files: list):
    directory = os.fsencode(d)
    for file in os.listdir(d):
        dup = False
        filename = os.fsdecode(file)
        if filename[-4:] == ext_2:
            for (x, y) in files:
                if y == filename[:-4] + ext_1:
                    dup = True
                    break
        if dup:
            continue
        files.append((d, filename))
files = []
get_files(d1, files)
get_files(d2, files)
for d, file in files:
    copy2(d + file, d3)
I'll see if I can get it to look/perform better.

How to read many files that have a specific format in Python

I am a little bit confused about how to read all lines in many files where the file names range from "datalog.txt.98" to "datalog.txt.120".
This is my code:
import json
file = "datalog.txt."
i = 97
for line in file:
    i += 1
    f = open(line + str(i), 'r')
    for row in f:
        print(row)
Here, you will find an example of one line in one of those files:
I really need your help.
I suggest using a loop for opening multiple files with different formats.
To better understand this project, I would recommend researching the following topics:
for loops,
String manipulation,
Opening a file and reading its content,
List manipulation,
String parsing.
This is one of my favourite beginner guides.
To set the range of the integers at the end of the file names, I would look into Python for loops.
I think this is what you are trying to do
# create a list to store all your file content
files_content = []
# the prefix is of type string
filename_prefix = "datalog.txt."
# loop from 0 to 13
for i in range(0, 14):
    # make the filename variable with the prefix and
    # the integer i which you need to convert to a string type
    filename = filename_prefix + str(i)
    # open the file and read all the lines into a variable
    with open(filename) as f:
        content = f.readlines()
    # append the file content to the files_content list
    files_content.append(content)
To get rid of whitespace from the file parsing, add the missing strip line before appending:
    content = [x.strip() for x in content]
    files_content.append(content)
Here's an example of printing out files_content
for file in files_content:
    print(file)
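Since the question says the file names actually run from datalog.txt.98 to datalog.txt.120, the same pattern should work with the range adjusted; a sketch, assuming those files sit in the working directory:
files_content = []
filename_prefix = "datalog.txt."
# range(98, 121) covers the suffixes 98 through 120 inclusive
for i in range(98, 121):
    filename = filename_prefix + str(i)
    with open(filename) as f:
        # strip trailing newlines while reading
        content = [x.strip() for x in f.readlines()]
    files_content.append(content)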

Applying function to a list of file-paths and writing csv output to the respective paths

How do I apply a function to a list of file paths I have built, and write an output csv in the same path?
read file in a subfolder -> perform a function -> write file in the
subfolder -> go to next subfolder
#opened xml by filename
with open(r'XML_opsReport 100001.xml', encoding="utf8") as fd:
    Odict_parsedFromFilePath = xmltodict.parse(fd.read())
#func called in func below
def activity_to_df_one_day(list_activity_this_day):
    ib_list = [pd.DataFrame(list_activity_this_day[i], columns=list_activity_this_day[i].keys()).drop("#uom") for i in range(len(list_activity_this_day))]
    return pd.concat(ib_list)
#Processes parsed xml and writes csv
def activity_to_df_all_days(Odict_parsedFromFilePath, subdir):  #writes csv from parsed xml after some processing
    nodes_reports = Odict_parsedFromFilePath['opsReports']['opsReport']
    list_activity = []
    for i in range(len(nodes_reports)):
        try:
            df = activity_to_df_one_day(nodes_reports[i]['activity'])
            list_activity.append(df)
        except KeyError:
            continue
    opsReport = pd.concat(list_activity)
    opsReport['dTimStart'] = pd.to_datetime(opsReport['dTimStart'], infer_datetime_format=True)
    opsReport.sort_values('dTimStart', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
    opsReport.to_csv("subdir\opsReport.csv")  #write to the subdir
def scanfolder():  #fetches list of file-paths with desired starting name.
    list_files = []
    for path, dirs, files in os.walk(r'C:\..\xml_objects'):  #directory containing several subfolders
        for f in files:
            if f.startswith('XML_opsReport'):
                list_files.append(os.path.join(path, f))
    return list_files
filepaths = scanfolder()  #list of file-paths
Every function works well and the xml processing is good, so I am not sharing the xml structure. There are 100+ paths in filepaths, each in a different subdirectory. I want to be able to apply the above flow in the future as well, where I can get file paths and perform the desired actions. It's important to write the csv file to its subdirectory.
To get the directory that a file is in, you can use:
import os
for root, dirs, files in os.walk(some_dir):
    for f in files:
        print(root)
        output_file = os.path.join(root, "output_file.csv")
        print(output_file)
Is that what you're looking for?
Output:
somedir
somedir\output_file.csv
See also Python 3 - travel directory tree with limited recursion depth and Find current directory and file's directory.
I was able to solve it with os.path.join.
exceptions_path_list = []
for i in filepaths:
    try:
        with open(i, encoding="utf8") as fd:
            doc = xmltodict.parse(fd.read())
        activity_to_df_all_days(doc, i)
    except ValueError:
        exceptions_path_list.append(os.path.dirname(i))
        continue
def activity_to_df_all_days(Odict_parsedFromFilePath, filepath):
    ...
    ...
    ...
    opsReport.to_csv(os.path.join(os.path.dirname(filepath), "opsReport.csv"))

Count multiple files in a directory with the same name

I'm relatively new to Python and was working on a project where the user can navigate to a folder, after which the program does a count of all the files in that folder with a specific name.
The problem is that I have a folder with over 5000 files, many of them sharing the same name but with different extensions. I wrote code that somewhat does what I want the final version to do, but it's VERY redundant and I can't see myself doing this for over 600 file names.
I wanted to ask if it is possible to make this program "automated", or at least less redundant, so that I don't have to manually type out the names of 600 files to return data for.
Sample code I currently have:
import os, sys
print(sys.version)
file_counting1 = 0
file_counting2 = 0
filepath = input("Enter file path here: ")
if os.path.exists(filepath):
    for file in os.listdir(filepath):
        if file.startswith('expressmail'):
            file_counting1 += 1
    print('expressmail')
    print('Total files found:', file_counting1)
    for file in os.listdir(filepath):
        if file.startswith('prioritymail'):
            file_counting2 += 1
    print('prioritymail')
    print('Total files found:', file_counting2)
Sample Output:
expressmail
Total files found: 3
prioritymail
Total files found: 1
The following script will count occurrences of files with the same name. If the file does not have an extension, the whole filename is treated as the name. It also does not traverse subdirectories, since the original question just asks about files in the given folder.
import os
dir_name = "."
files = next(os.walk(dir_name))[2]  # get all the files directly in the directory
names = [f[:f.rindex(".")] for f in files if "." in f]  # drop the extensions
names += [f for f in files if "." not in f]  # add those without extensions
for name in set(names):  # for each unique name
    print("{}\nTotal files found: {}".format(name, names.count(name)))
If you want to support files in subdirectories, you could use something like
files = [os.path.join(r,file) for r,d,f in os.walk(dir_name) for file in f]
If you don't want to consider files without extensions, just remove the line:
names += [f for f in files if "." not in f]
There are a number of ways you can do what you're trying to do. Partly it depends on whether or not you need to recover the list of extensions for a given duplicated file.
Counter, from the collections module - use this for a simple count of files. Ignore the extensions when building the count.
Use the filename without extension as a dictionary key, and add a list of items as the key's value, where the list holds each occurrence of the file (see the dictionary sketch after the Counter example below).
Here's an example using the Counter class:
import os, sys, collections
c = collections.Counter()
for root, dirs, files in os.walk('/home/myname/hg/2018/'):
    # discard any path data and just use filename
    for names in files:
        name, ext = os.path.splitext(names)
        # discard any extension
        c[name] += 1
# Counter.most_common() gives the values in the form of (entry, count)
# Counter.most_common(x) - pass a value to display only the top x counts
# e.g. Counter.most_common(2) = top 2
for x in c.most_common():
    print(x[0] + ': ' + str(x[1]))
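The second approach mentioned above, a dictionary keyed by the extension-less file name, could look roughly like this (a sketch that also keeps the full path of every occurrence, so you can recover the extensions later):
import os, collections
occurrences = collections.defaultdict(list)
for root, dirs, files in os.walk('.'):
    for filename in files:
        name, ext = os.path.splitext(filename)
        # key on the name without its extension, remember where each copy lives
        occurrences[name].append(os.path.join(root, filename))
for name, paths in occurrences.items():
    print('{}: {} file(s)'.format(name, len(paths)))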
You can use regular expressions:
import os, sys, re
print(sys.version)
filepath = input("Enter file path here: ")
if os.path.exists(filepath):
    allfiles = "\n".join(os.listdir(filepath))
    file_counting1 = len(re.findall("^expressmail", allfiles, re.M))
    print('expressmail')
    print('Total files found:', file_counting1)
    file_counting2 = len(re.findall("^prioritymail", allfiles, re.M))
    print('prioritymail')
    print('Total files found:', file_counting2)

Python: If a string is found, stop searching for that string, search for the next string, and output the matching strings

This code outputs the matching string once for every time it is in the file that is being searched (so I end up with a huge list if the string is there repeatedly). I only want to know if the strings from my list match, not how many times they match. I do want to know which strings match, so a True/False solution does not work. But I only want them listed once each, if they match. I do not really understand what the pattern = '|'.join(keywords) part is doing - I got it from someone else's code to get my file-to-file matching working, but don't know if I need it. Your help would be much appreciated.
# declares the files used
filenames = ['//Katie/Users/kitka/Documents/appreport.txt', '//Dallin/Users/dallin/Documents/appreport.txt',
             '//Aidan/Users/aidan/Documents/appreport.txt']
# parses each file
for filename in filenames:
    # imports the necessary libraries
    import os, time, re, smtplib
    from stat import *  # ST_SIZE etc
    # finds the time the file was last modified and error checks
    try:
        st = os.stat(filename)
    except IOError:
        print("failed to get information about", filename)
    else:
        # creates a list of words to search for
        keywords = ['LoL', 'javaw']
        pattern = '|'.join(keywords)
        # searches the file for the strings in the list, sorts them and returns results
        results = []
        with open(filename, 'r') as f:
            for line in f:
                matches = re.findall(pattern, line)
                if matches:
                    results.append((line, len(matches)))
        results = sorted(results)
        # appends results to the archive file
        with open("GameReport.txt", "a") as f:
            for line in results:
                f.write(filename + '\n')
                f.write(time.asctime(time.localtime(st[ST_MTIME])) + '\n')
                f.write(str(line) + '\n')
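For what it's worth, since the question asks about it: '|'.join(keywords) just builds a single regular-expression pattern in which | means "or", so one re.findall call can search for any of the keywords at once. A minimal illustration:
import re
keywords = ['LoL', 'javaw']
pattern = '|'.join(keywords)
print(pattern)  # LoL|javaw
print(re.findall(pattern, "javaw started, then LoL crashed"))  # ['javaw', 'LoL']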
Untested, but this should work. Note that this only keeps track of which words were found, not which words were found in which files. I couldn't figure out whether or not that's what you wanted.
import fileinput
filenames = [...]
keywords = ['LoL', 'javaw']
# a set is like a list but with no duplicates, so even if a keyword
# is found multiple times, it will only appear once in the set
found = set()
# iterate over the lines of all the files
for line in fileinput.input(files=filenames):
    for keyword in keywords:
        if keyword in line:
            found.add(keyword)
print(found)
EDIT
If you want to keep track of which keywords are present in which files, then I'd suggest keeping a set of (filename, keyword) tuples:
filenames = [...]
keywords = ['LoL', 'javaw']
found = set()
for filename in filenames:
    with open(filename, 'rt') as f:
        for line in f:
            for keyword in keywords:
                if keyword in line:
                    found.add((filename, keyword))
for filename, keyword in found:
    print('Found the word "{}" in the file "{}"'.format(keyword, filename))
