Compare the files and find the duplicate queries - python-3.x

How to compare two directories: the first directory contains 5 SQL files, the second directory contains 2 SQL files. Compare the files and find the duplicate queries using Python.
import os
import filecmp


def find_duplicate_sql_files(folder1, folder2):
    """Compare two folders and report .sql files whose content is identical.

    Parameters:
        folder1: path of the directory whose files drive the comparison.
        folder2: path of the directory checked for same-named duplicates.

    Returns:
        list[str]: names of .sql files present in both folders with
        byte-for-byte identical content (in os.listdir order).
    """
    duplicates = []
    for filename in os.listdir(folder1):
        # Only .sql files can be duplicate queries.
        if not filename.endswith('.sql'):
            continue
        file1 = os.path.join(folder1, filename)
        file2 = os.path.join(folder2, filename)
        # The file must exist (as a regular file) in both folders.
        if os.path.isfile(file1) and os.path.isfile(file2):
            # shallow=False compares actual contents, not just os.stat() data.
            if filecmp.cmp(file1, file2, shallow=False):
                print(f"Duplicate file found: {filename}")
                duplicates.append(filename)
    return duplicates


if __name__ == "__main__":
    # Define the paths of the two folders to compare.
    find_duplicate_sql_files('path/to/folder1', 'path/to/folder2')

Related

How to join a directory with wild search in python

I am trying to read all the CSV files in a particular set of directories. I have subdirectories named 'report' followed by a date, like 'report2021-12-22-14_15' and 'report2022-01-22-11_10'. I am manually joining the path as below
root = os.path.join(base, 'report2021-12-22-14_15' , 'report')
Is there any way I can do a wild search like 'report*' to join the directories so that I will not miss any subdirectories. Below is the snippet
import os
from fnmatch import fnmatch

base = '/Users/user/Desktop/report_files/'
# Walking from `base` visits every date-stamped 'report*' subdirectory
# automatically, so no wildcard join (and no hard-coded date folder) is needed.
pattern = "report.csv"

matched_files = []
# os.walk yields (dirpath, dirnames, filenames) three-tuples; the original
# two-name unpacking (`for path, files in ...`) raised a ValueError.
for path, dirs, files in os.walk(base):
    for name in files:
        if fnmatch(name, pattern):
            # Collect the hits instead of dropping them on the floor.
            matched_files.append(os.path.join(path, name))

How to merge two files from different folders base on few character match in files name

I have two folders with text files. I want to read files from the first folder and check in the second folder whether specific characters match in the file names; if so, merge on column 'Time'. I need to do this for several files.
folder 1:
07k0ms_610s_hh85m_sq150_t40k0_sn183_0
08k0ms_610s_hh85m_sq150_t40k0_sn183_20
011k0ms_610s_hh85m_sq150_t40k0_sn183_-10
folder 2:
07k0m_t40k0_try-0.2
08k0m_t40k0_try-0.2
32k0m_t40k0_try-0.2
Read each file from folder 1 and check whether '07k0m_t40k0', '08k0m_t40k0' or '11k0m_t40k0' matches in the file name; if it does, merge the corresponding folder-2 file into the folder-1 file and save each result as a CSV, one by one.
Try the following:
import glob
import os
import pandas as pd

lst_folders = ['folder_1',
               'folder_2']
lst_str_find = ['07k0m_t40k0', '08k0m_t40k0', '11k0m_t40k0']

lst_files_1 = sorted(glob.glob(lst_folders[0] + '/*.txt'))
lst_files_2 = sorted(glob.glob(lst_folders[1] + '/*.txt'))

for file_1 in lst_files_1:
    # Key = the file name up to the first "s_", e.g.
    # 'folder_1/07k0ms_610s_...' -> '07k0m'.  basename() is portable,
    # unlike slicing on "/"; and the original `file.find("s_")`
    # referenced an undefined name `file` (NameError).
    fname = os.path.basename(file_1)
    str_search = fname[:fname.find("s_")]
    if any(str_search in i for i in lst_str_find):
        for file_2 in lst_files_2:
            # The original tested undefined `file_name`; the extracted
            # key is `str_search`.
            if str_search in file_2:
                print(file_1)
                print(file_2)
                # here load, merge and save file_1 & file_2 - the specific code
                # depends on the structure of your files and the way you want
                # to import them. Should look similar to:
                #
                # merge_1 = pd.read_csv(file_1)
                # merge_2 = pd.read_csv(file_2)
                # merged_file = pd.concat([merge_1, merge_2])
                # merged_file.to_csv(lst_folders[0]+'/merged_'+str_search+'.csv', index=None)
Notes:
read/merge/write might need to be adjusted, depending on the actual
structure of your files, which did not become clear from your post
the code assumes that it lives in the same directory as the folders. If that is not the case, the paths must be adjusted accordingly
Let me know, if it worked :)

extracting file content inside multiple folders and printing required folder names

I have files and directory structure like this:
C:\data\4512\required_data\121\a.txt
C:\data\4512\required_data\121\b.txt
C:\data\4512\required_data\456\c.txt
C:\data\4512\required_data\456\d.txt
C:\data\9813\required_data\789\e.txt
C:\data\9813\required_data\789\f.txt
C:\data\9813\required_data\301\g.txt
C:\data\9813\required_data\301\h.txt
I want to print the content of the text files
Also I want to print outer_folder number like 4512, inner folder number like 121 and the file name
I was trying some code like this:
import os

path = "C:\\data"
# Expected layout: C:\data\<outer>\required_data\<inner>\<file>
for root, dirs, files in os.walk(path):
    for filename in files:
        # Position of this file's folder relative to the scan root,
        # e.g. '4512\\required_data\\121'.
        rel_parts = os.path.relpath(root, path).split(os.sep)
        # Skip anything not matching <outer>/required_data/<inner>.
        if len(rel_parts) != 3 or rel_parts[1] != "required_data":
            continue
        outer_number, inner_number = rel_parts[0], rel_parts[2]
        with open(os.path.join(root, filename)) as f:
            print(f.read())       # content of file
        print(outer_number)       # e.g. 4512
        print(inner_number)       # e.g. 121
        print(filename)           # e.g. a.txt
for example, for this case:
C:\data\4512\required_data\121\a.txt
Expected output:
# For C:\data\4512\required_data\121\a.txt the handling should be:
with open("a.txt") as file:  # `with` closes the file automatically
    print(file.read())       # content of file (print(file) would only
                             # show the file object, not its text)
print("outer_number")        # 4512
print("inner_number")        # 121
print("name_of_file")        # a.txt
I want to do this for all files

Create folders dynamically and write csv files to that folders

I would like to read several input files from a folder, perform some transformations, create folders on the fly, and write the CSVs to the corresponding folders. The point here is that I have an input path like
"Input files\P1_set1\Set1_Folder_1_File_1_Hour09.csv" - for a single patient (This file contains readings of patient (P1) at 9th hour)
Similarly, there are multiple files for each patient and each patient files are grouped under each folder as shown below
So, to read each file, I am using wildcard regex as shown below in code
I have already tried using the glob package and am able to read it successfully but am facing issue while creating the output folders and saving the files. I am parsing the file string as shown below
f = "Input files\P1_set1\Set1_Folder_1_File_1_Hour09.csv"
f[12:] = "P1_set1\Set1_Folder_1_File_1_Hour09.csv"
import glob
import os
import pandas as pd

# os.path.join keeps the pattern portable across separators.
filenames = sorted(glob.glob(os.path.join('Input files', 'P*_set1', '*.csv')))
for f in filenames:
    print(f)  # full input path
    # Folder structure + file name relative to the input root,
    # e.g. 'P1_set1\Set1_Folder_1_File_1_Hour09.csv'.  More robust than
    # the hard-coded f[12:] slice.
    rel = os.path.relpath(f, 'Input files')
    print(rel)
    df_transform = pd.read_csv(f)
    df_transform = df_transform.drop(['Format 10', 'Time', 'Hour'], axis=1)
    out_path = os.path.join('Output', rel)
    # Create the per-patient output folder on demand.  Note the original
    # "Output\" literal escaped its own closing quote (SyntaxError).
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_transform.to_csv(out_path, index=False)
I expect the output folder to have the csv files which are grouped by each patient under their respective folders. The screenshot below shows how the transformed files should be arranged in output folder (same structure as input folder). Please note that "Output" folder is already existing (it's easy to create one folder you know)
So to read files in a folder use os library then you can do
import os
import pandas as pd


def transform_folder(folder_path, output_root="Output"):
    """Transform every CSV in `folder_path` and save it under `output_root`.

    Each CSV is read, the 'Format 10'/'Time'/'Hour' columns are dropped,
    and the result is written to `output_root` (created on demand) under
    the original file name.

    Parameters:
        folder_path: directory containing the input CSV files.
        output_root: directory that receives the transformed CSVs.
    """
    # The original checked os.path.isdir("/home/el") (an unrelated
    # hard-coded path) and ran makedirs on the *input* folder; create the
    # output folder instead, idempotently.
    os.makedirs(output_root, exist_ok=True)
    for entry in os.listdir(folder_path):
        if not entry.endswith(".csv"):
            continue
        # The original read undefined `f` here instead of the loop variable.
        df_transform = pd.read_csv(os.path.join(folder_path, entry))
        df_transform = df_transform.drop(['Format 10', 'Time', 'Hour'], axis=1)
        # Keep just the file name, regardless of any separators in `entry`.
        file_name = os.path.basename(entry)
        df_transform.to_csv(os.path.join(output_root, file_name), index=False)
Let me know if this is what you wanted

creating corresponding subfolders and writing a portion of the file in new files inside those subfolders using python

I have a folder named "data". It contains subfolders "data_1", "data_2", and "data_3". These subfolders contain some text files. I want to parse through all these subfolders and generate corresponding subfolders with the same name, inside another folder named "processed_data". I want to also generate corresponding files with "processed" as a prefix in the name and want to write all those lines from the original file where "1293" is there in the original files.
I am using the below code but not able to get the required result. Neither the subfolders "data_1", "data_2", and "data_3" nor the files are getting created
import os
folder_name=""
def pre_processor():
    # Intent: walk D:\data and copy every line containing "1293" into
    # D:\processed_data\<subfolder>\processed<filename>.
    #
    # NOTE(review): "D:\data" relies on "\d" not being a recognized string
    # escape; a raw string r"D:\data" would be safer -- confirm.
    data_location="D:\data" # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        for dir in dirs:
            #folder_name=""
            # NOTE(review): `folder_name` ends up holding the LAST
            # subdirectory listed under the current `root`, not the folder
            # each file below actually lives in -- outputs can land under
            # the wrong subfolder name.
            folder_name=dir
        for filename in files:
            with open(os.path.join(root, filename),encoding="utf8",mode="r") as f:
                processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
                # NOTE(review): D:\processed_data\<folder_name> is never
                # created, so this open(..., "w") raises FileNotFoundError --
                # the likely reason neither folders nor files appear.
                processed_file = open(processed_file_name,"w", encoding="utf8")
                for line_number, line in enumerate(f, 1):
                    if "1293" in line:
                        processed_file.write(str(line))
                processed_file.close()
pre_processor()
You might need to elaborate on the issue you are having; e.g., are the files being created, but empty?
A few things I notice:
1) Your indentation is off (not sure if this is just a copy-paste issue though): the pre_processor function is empty, i.e. you are defining the function at the same level as the declaration, not inside of it.
try this:
import os
folder_name=""
def pre_processor():
    # Walk D:\data and, for each file, write the lines containing "1293"
    # to D:\processed_data\<folder_name>\processed<filename>.
    data_location="D:\data" # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        for dir in dirs:
            #folder_name=""
            # NOTE(review): this records the last subdirectory of `root`,
            # which is not necessarily the folder containing the files
            # iterated below -- verify the intended pairing.
            folder_name=dir
        for filename in files:
            with open(os.path.join(root, filename), encoding="utf8",mode="r") as f:
                processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
                # NOTE(review): the destination folder must already exist;
                # this open(..., "w") does not create it (see point 2 below
                # in the original answer).
                processed_file = open(processed_file_name,"w", encoding="utf8")
                for line_number, line in enumerate(f, 1):
                    if "1293" in line:
                        processed_file.write(str(line))
                processed_file.close()
pre_processor()
2) Check if the processed_data and sub_folders exist; if not, create them first as this will not do so.
Instead of creating the path to the new Folder by hand you could just replace the name of the folder.
Furthermore, you are not creating the subfolders.
This code should work but replace the Linux folder slashes:
import os


def pre_processor(data_location="data", output_location="processed_data",
                  needle="1293"):
    """Mirror `data_location`'s folder tree under `output_location`,
    writing for each file a 'processed<name>' copy that keeps only the
    lines containing `needle`.

    Parameters (all optional; defaults preserve the original behaviour):
        data_location: root folder containing the data to scan.
        output_location: root folder that receives the mirrored structure.
        needle: substring a line must contain to be kept.
    """
    for root, dirs, files in os.walk(data_location):
        # Map the current folder onto the output tree.  relpath + join is
        # portable across separators, unlike the original
        # root.replace("data/", "processed_data/"), which silently failed
        # on Windows back-slash paths and on the top-level root itself.
        rel = os.path.relpath(root, data_location)
        processed_folder_name = os.path.normpath(
            os.path.join(output_location, rel))
        # Create the destination folder before opening files for writing.
        os.makedirs(processed_folder_name, exist_ok=True)
        for filename in files:
            source_path = os.path.join(root, filename)
            target_path = os.path.join(processed_folder_name,
                                       "processed" + filename)
            # `with` closes both files even if a read/write fails.
            with open(source_path, encoding="utf8") as src, \
                 open(target_path, "w", encoding="utf8") as dst:
                for line in src:
                    if needle in line:
                        dst.write(line)


pre_processor()

Resources