Write each file's contents into its own named list in Python

I have a folder containing multiple .txt files named one.txt, two.txt, three.txt, and so on. I need to read one.txt and write its contents into a list named onefile[], then read two.txt and write its contents into a list named twofile[], and so on. How can I do this?
Update: I tried the code below; now how can I print the values in each list?
import os
from pathlib import Path

def writeinlist(file_path, i):
    # read the file's lines and store them in a dynamically named global list
    with open(file_path, 'r') as output:
        globals()['List%s' % i] = output.readlines()
    print('List%s' % i)

input_path = Path(Path.home(), "Desktop", "NN")
index = 1
for root, dirs, files in os.walk(input_path):
    for file in files:
        file_path = Path(root, file)
        writeinlist(file_path, index)
        index += 1
Update 2: how can I delete the \n from the values?
value_list1 = files_dict['file1']
print('Values of file1 are:')
print(value_list1)
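One way to drop the trailing \n characters, shown as a minimal sketch assuming the files_dict built in the answer below (whose values are lists of raw lines):

value_list1 = [line.rstrip('\n') for line in files_dict['file1']]
print('Values of file1 are:')
print(value_list1)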

I used the following to create a dictionary with dynamic keys (the names of the files); each value is a list whose elements are the lines of the corresponding file.
First, contents of onefile.txt:
First file first line
First file second line
First file third line
Contents of twofile.txt:
Second file first line
Second file second line
My code:
import os
import pprint

files_dict = {}
for file in os.listdir("/path/to/folder"):
    if file.endswith(".txt"):
        key = file.split(".")[0]
        full_filename = os.path.join("/path/to/folder", file)
        with open(full_filename, "r") as f:
            files_dict[key] = f.readlines()
pprint.pprint(files_dict)
Output:
{'onefile': ['First file first line\n',
             'First file second line\n',
             'First file third line'],
 'twofile': ['Second file first line\n', 'Second file second line']}
Another way to do this that's a bit more Pythonic:
import os
import pprint

folder = "/path/to/folder"
files_dict = {}
for file in [f for f in os.listdir(folder) if f.endswith(".txt")]:
    with open(os.path.join(folder, file), "r") as fo:
        files_dict[file.split(".")[0]] = fo.readlines()
pprint.pprint(files_dict)
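With everything in files_dict, printing each list (the asker's update) no longer needs dynamically named variables; a small usage sketch:

for name, lines in files_dict.items():
    print(f'Values of {name} are:')
    for line in lines:
        print(line.rstrip('\n'))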


File comparison in two directories

I am comparing all files in two directories. If the comparison ratio is greater than 90%, I continue the outer loop, and I want to remove the matched file in the second directory so that the next file in the first directory isn't compared against a file that has already been matched.
Here's what I've tried:
for i for i in sorted_files:
    for j in sorted_github_files:
        #pdb.set_trace()
        with open(f'./files/{i}') as f1:
            try:
                text1 = f1.read()
            except:
                pass
        with open(f'./github_files/{j}') as f2:
            try:
                text2 = f2.read()
            except:
                pass
        m = SequenceMatcher(None, text1, text2)
        print("file1:", i, "file2:", j)
        if m.ratio() > 0.90:
            os.remove(f'./github_files/{j}')
            break
I know I cannot change the list while iterating over it; that's why it's returning a file-not-found error. I don't want to use try/except blocks. Any ideas appreciated.
A couple of things to point out:
- Always provide a minimal reproducible example.
- Your first for loop is broken since you wrote `for i for i ...` instead of `for i in ...`.
- If you want to iterate over the files in the first list (sorted_files), read each file outside of the second loop.
- I would add the files that match with a ratio over 0.90 to a new list and remove them afterwards, so the items do not change during iteration.
You can find the test data I created and used here.
import os
from difflib import SequenceMatcher

# define your two folders, full paths
first_path = os.path.abspath(r"C:\Users\XYZ\Desktop\testfolder\a")
second_path = os.path.abspath(r"C:\Users\XYZ\Desktop\testfolder\b")

# get files from folder
first_path_files = os.listdir(first_path)
second_path_files = os.listdir(second_path)

# join path and filenames
first_folder = [os.path.join(first_path, f) for f in first_path_files]
second_folder = [os.path.join(second_path, f) for f in second_path_files]

# empty list for matching results
matched_files = []

# iterate over the files in the first folder
for file_one in first_folder:
    # read file content once, outside the inner loop
    with open(file_one, "r") as f:
        file_one_text = f.read()
    # iterate over the files in the second folder
    for file_two in second_folder:
        # read file content
        with open(file_two, "r") as f:
            file_two_text = f.read()
        # match the two file contents
        match = SequenceMatcher(None, file_one_text, file_two_text)
        if match.ratio() > 0.90:
            print(f"Match found ({match.ratio()}): '{file_one}' | '{file_two}'")
            # TODO: decide whether to remove files from the first or the second folder
            matched_files.append(file_two)  # here we delete files from the second folder

# remove duplicates from the resulting list
matched_files = list(set(matched_files))

# remove the files
for f in matched_files:
    print(f"Removing file: {f}")
    os.remove(f)
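As an aside, if some files are not valid text (the likely reason for the bare try/except in the question), one possible tweak, offered here as an assumption rather than part of the original answer, is to open them with errors='ignore' so read() does not raise on undecodable bytes:

# tolerate non-decodable bytes instead of wrapping read() in try/except
with open(file_one, "r", errors="ignore") as f:
    file_one_text = f.read()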

saving text files to .npy file

I have many text files in a directory with numerical extensions (for example signal_data1.9995100000000001, signal_data1.99961, etc.).
The contents of the files are as given below:
signal_data1.9995100000000001
-1.710951390504200198e+00
5.720409824754981720e-01
2.730176313110273423e+00
signal_data1.99961
-6.710951390504200198e+01
2.720409824754981720e-01
6.730176313110273423e+05
I just want to arrange the above files into a single .npy file as
-1.710951390504200198e+00,5.720409824754981720e-01, 2.730176313110273423e+00
-6.710951390504200198e+01,2.720409824754981720e-01, 6.730176313110273423e+05
So, I want to implement the same procedure for many files of a directory.
I tried a loop as follows:
import numpy as np
import glob
for file in glob.glob('./signal_*'):
    np.savez('data', file)
However, it does not give what I want as depicted above. So here I need help. Thanks in advance.
Here is another way of achieving it:
import os

dirPath = './data/'  # folder where you store your data
with os.scandir(dirPath) as entries:
    output = ""
    for entry in entries:  # read each file in your folder
        dataFile = open(dirPath + entry.name, "r")
        dataLines = dataFile.readlines()
        dataFile.close()
        for line in dataLines:
            output += line.strip() + " "  # strip whitespace & append
        output += '\n'  # after each file, break the line

# save it (note: this writes plain text under a .npy name, not NumPy's binary format)
writeFile = open("a.npy", "w")
writeFile.write(output)
writeFile.close()
You can use np.loadtxt() and np.save():
import glob
import numpy as np

a = np.array([np.loadtxt(f) for f in sorted(glob.glob('./signal_*'))])
np.save('data.npy', a)
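For reference, a quick check of what gets saved, assuming every signal_* file holds the same number of values (required to form a regular 2-D array):

import numpy as np

a = np.load('data.npy')
print(a.shape)  # e.g. (number_of_files, values_per_file)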

merge multiple files into a single file; each new file should start on a new line in the output file

I've written a script to merge multiple files into a single file and create a list from it.
Requirement: file1 + file2 = file3, like below
file1:
37717531209
201128307083
211669759863
496338947094
file2:
348353447295
278262427715
901601149752
333676465561
my output file (not the expected output):
37717531209
201128307083
211669759863
496338947094348353447295
278262427715
901601149752
333676465561
my expected output file:
37717531209
201128307083
211669759863
496338947094
348353447295
278262427715
901601149752
333676465561
My code is:
import glob
import shutil

with open(outputfile, 'wb') as outfile:
    for filename in glob.glob('*.accts'):
        if filename == outputfile:
            # don't want to copy the output into the output
            continue
        with open(filename, 'rb') as readfile:
            shutil.copyfileobj(readfile, outfile)
        #accounts = list(outfile)

with open('accounts.txt') as f:
    acc = list(f)

accounts = []
for element in acc:
    accounts.append(element.strip())
I want each new file to be merged starting from the next line, not continuing on the same line.
You don't appear to be doing anything that requires writing the contents of the input files into an output file: at the end, after creating the output file, you reopen it and process the account numbers it contains. You could simply do this, which requires no output file at all:
accounts = []
for filename in glob.glob('*.accts'):
    with open(filename) as readfile:
        accounts.extend(line.strip() for line in readfile)
# return accounts  (only if this code lives inside a function)
Furthermore, you could probably do away with downloading the *.accts files by using download_fileobj() to download the files into a file-like object (e.g. an io.BytesIO object) and processing from there.
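If the merged output file is still needed, here is a minimal sketch (not part of the original answer; the output name merged.accts.txt is hypothetical) that guards against input files lacking a trailing newline, which is what glued the two lines together in the question:

import glob

with open('merged.accts.txt', 'w') as outfile:
    for filename in glob.glob('*.accts'):
        with open(filename) as readfile:
            for line in readfile:
                # always newline-terminate so the next file starts on a fresh line
                outfile.write(line.rstrip('\n') + '\n')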

Python3: Index out of range for script that worked before

The attached script returns:
IndexError: list index out of range
for the line starting with values = {line.split(...):
values = dict()
with open(csv) as f:
    lines = f.readlines()
    values = {line.split(',')[0].strip(): line.split(',')[1].strip() for line in lines}
However, I could use it yesterday for exactly the same task: replacing certain text in a directory of XML files with different texts.
import os
from distutils.dir_util import copy_tree

drc = 'D:/Spielwiese/00100_Arbeitsverzeichnis'
backup = 'D:/Spielwiese/Backup/'
csv = 'D:/persons1.csv'

copy_tree(drc, backup)

values = dict()
with open(csv) as f:
    lines = f.readlines()
    values = {line.split(',')[0].strip(): line.split(',')[1].strip() for line in lines}

# Getting a list of the full paths of files
for dirpath, dirname, filename in os.walk(drc):
    for fname in filename:
        # Joining dirpath and filenames
        path = os.path.join(dirpath, fname)
        # Opening the files for reading only
        filedata = open(path, encoding="Latin-1").read()
        for k, v in values.items():
            filedata = filedata.replace(k, v)
        f = open(path, 'w', encoding="Latin-1")
        # We are writing the changes to the files
        f.write(filedata)
        f.close()  # Closing the files

print("In case something went wrong, you can find a backup in " + backup)
I don't see anything weird, and as mentioned before, I could use it before ... :-o
Any ideas on how to fix it?
Best wishes,
K
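One likely cause, offered as an assumption rather than a confirmed diagnosis: a blank or comma-less line in persons1.csv makes line.split(',')[1] raise IndexError, because there is no element at index 1. A defensive version of the dictionary build skips such lines:

values = {}
with open(csv) as f:
    for line in f:
        parts = line.split(',')
        if len(parts) >= 2:  # skip blank or malformed lines
            values[parts[0].strip()] = parts[1].strip()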

How to read multiple lines in a txt file in a loop?

I have a directory containing several .txt files. I am interested only in the first 7 characters of each line. I created a dictionary whose keys are the names of the files and whose values are the first 7 characters of each row in each file. Then I wrote two for loops like below:
import glob

files = glob.glob('/Users/andreasportelli/Desktop/sqldata/*')
customer = {"customer_id": [], "order_id": []}  # create a dictionary
for file in files:
    x = open(file, 'r', encoding='ascii', errors='replace')
    customer["customer_id"].append(str(file))
for file in files:
    x = open(file, 'r', encoding='ascii', errors='replace')
    customer["order_id"].append(x.readline(7))
The problem is that this way it only reads the first line, not all lines.
How do I make it iterate over each line in each file?
Thanks.
You only need to loop over the file names once, with a nested loop to grab each line in the file. Hope that helps!
import glob

files = glob.glob('/Users/andreasportelli/Desktop/sqldata/*')
customer = {"customer_id": [], "order_id": []}  # create a dictionary
for file in files:
    customer["customer_id"].append(file)
    for line in open(file, 'r', encoding='ascii', errors='replace'):
        customer["order_id"].append(line[:7])
