My code below lists every file regardless of whether the pattern matches. The file ss.txt doesn't contain the pattern, but it still shows up in the output.
Here is the code:
import os
import re

files = []
pattern = re.compile('my')
for p, d, f in os.walk(r'C:\Users\anaveed\test'):
    for file in f:
        files.append(os.path.join(p, file))

for f in files:
    with open(f, 'r') as x:
        for i in x:
            Var1 = re.search(pattern, i)
            print(f)
    x.close()
Output:
C:\Users\anaveed\test\sample.txt
C:\Users\anaveed\test\testfile.txt
C:\Users\anaveed\test\hoax\a.txt
C:\Users\anaveed\test\hoax\ss.txt
The problem is that print(f) runs for every line of every file, whether or not the search matched, and Var1 is never checked. Track whether any line matched and print the file name only after the whole file has been scanned:

import os
import re

files = []
pattern = re.compile('my')
for p, d, f in os.walk(r'C:\Users\anaveed\test'):
    for file in f:
        files.append(os.path.join(p, file))

for f in files:
    Var1 = False
    with open(f, 'r') as x:   # the with block closes the file automatically
        for i in x:
            if re.search(pattern, i):
                Var1 = True   # remember that at least one line matched
    if Var1:
        print(f)
Output:
C:\Users\anaveed\test\sample.txt
C:\Users\anaveed\test\testfile.txt
C:\Users\anaveed\test\hoax\a.txt
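For reference, the same check can be written more compactly with any(), which stops reading a file at the first matching line. This is a sketch of the same idea, not code from the original post:

import os
import re

pattern = re.compile('my')
for p, d, names in os.walk(r'C:\Users\anaveed\test'):
    for name in names:
        path = os.path.join(p, name)
        with open(path, 'r') as fh:
            # any() short-circuits on the first line that matches
            if any(pattern.search(line) for line in fh):
                print(path)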
I have a data output file in the format below from the script I run.
1. xxx %percentage1
2. yyy %percentage1
.
.
.
I am trying to take only the percentages and append them, line by line, to a new file in the same format (writing the new file once in the process):
1. xxx %percentage1 %percentage2
2. yyy %percentage1 %percentage2
The main idea is that every time I run the code with a new source data file, it should add that run's percentages to the new file, line by line:
1. xxx %percentage1 %percentage2 %percentage3 ...
2. yyy %percentage1 %percentage2 %percentage3 ...
This is what I could come up with:
import os

os.chdir("directory")
f = open("data1", "r")
n = 3
a = f.readlines()
b = []
for i in range(n):
    b.append(a[i].split(" ")[2])

file_lines = []
with open("data1", 'r') as f:
    for t in range(n):
        for x in f.readlines():
            file_lines.append(''.join([x.strip(), b[t], '\n']))
            print(b[t])

with open("data2", 'w') as f:
    f.writelines(file_lines)
With this code I do get the new file, but the appended percentage is the same for every line (it always comes from the first line), and I only get one set of percentages added; rerunning the code overwrites the file instead of adding more columns.
I hope I explained it properly; any help would be appreciated.
You can use a dict as the structure to load and update your data, and pickle that dict to persist the accumulated values between runs.
import pickle
import os

output = 'output'
dump = 'dump'

output_dict = {}
if os.path.exists(dump):
    with open(dump, 'rb') as f:
        output_dict = pickle.load(f)

def read_data(lines):
    """ Builds a dict from a list of lines where the keys are
    a tuple(w1, w2) and the values are w3, where w1, w2 and w3
    are the 3 words composing each line.
    """
    d = {}
    for line in lines:
        elts = line.split()
        assert len(elts) == 3
        d[tuple(elts[:2])] = elts[2]
    return d

def get_data(data):
    """ Recover data from a dict as a list of strings.
    The formatting for each element of the list is the following:
    k[0] k[1] v
    where k and v are the key/values of the data dict.
    """
    lines = []
    for k, v in data.items():
        line = list(k)
        line += [v, '\n']
        lines.append(' '.join(line))
    return lines

def update_data(output_d, new_d):
    """ Update a data dict with new data.
    The values are appended if the key already exists.
    Otherwise a new key/value pair is created.
    """
    for k, v in new_d.items():
        if k in output_d:
            output_d[k] = ' '.join([output_d[k], v])
        else:
            output_d[k] = v

for data_file in ('data1', 'data2', 'data3'):
    with open(data_file) as f:
        d1 = read_data(f.readlines())
    update_data(output_dict, d1)

print("Dumping data", output_dict)
with open(dump, 'wb') as f:
    pickle.dump(output_dict, f)

print("Writing data")
with open(output, 'w') as f:
    f.write('\n'.join(get_data(output_dict)))
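A quick illustration of how the pieces fit together, using hypothetical in-memory lines instead of the data files (the names and percentages are invented for the example):

# Two hypothetical runs; in practice these lines come from data1, data2, ...
run1 = ['1. xxx 10%\n', '2. yyy 20%\n']
run2 = ['1. xxx 15%\n', '2. yyy 25%\n']

acc = {}
update_data(acc, read_data(run1))
update_data(acc, read_data(run2))
print(get_data(acc))
# The key ('1.', 'xxx') now maps to '10% 15%', so each output line
# carries one percentage column per run, e.g. '1. xxx 10% 15% \n'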
I have a large list of replacements, like the one below.
The replacement file list.txt:
人の,NN
人の名前,FF
And the data file text.txt in which to replace:
aaa人の abc 人の名前def ghi
Using list.txt, I want to transform the text like this:
>>> my_func('aaa人の abc 人の名前def ghi')
'aaaNN abc FFdef ghi'
This is my code, but I think it is quite inefficient for processing large amounts of data:
import re

d = {}
with open('list.txt', 'r', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        d[line.split(',')[0]] = line.split(',')[1]

with open('text.txt', 'r', encoding='utf8') as f:
    txt = f.read()

st = 0
lst = []
# [\u4e00-\u9fea\u3040-\u309f] covers the Unicode ranges of Japanese
# characters (CJK ideographs and hiragana)
for match in re.finditer(r"([\u4e00-\u9fea\u3040-\u309f]+)", txt):
    st_m, ed_m = match.span()
    lst.append(txt[st:st_m])     # keep the non-Japanese text as-is
    search = txt[st_m:ed_m]
    rpld = d[search]             # look up the replacement for this run
    lst.append(rpld)
    st = ed_m
lst.append(txt[st:])
print(''.join(lst))
Please let me know a better way.
After seeing your input aaa人の abc 人の名前def ghi, I see you have whitespace in between, so it's not really a word replacement; it's more of a phrase replacement.
You can refer to the edit history to see the old answer in case you want word replacement.
For phrase replacement you can use re (regex) and provide a mapping of replacements. Below is an implementation:
>>> import re
>>> _regex = {r'aaa人の abc 人の名前def ghi': r'人の,NN 人の名前,FF'}
>>> input_string = 'hi aaa人の abc 人の名前def ghi work'
>>> for pattern in _regex.keys():
...     input_string = re.sub(pattern, _regex[pattern], input_string)
...
>>> input_string
'hi 人の,NN 人の名前,FF work'
Below is an object-oriented implementation of the above:

import csv
import re

class RegexCleanser(object):
    _regex = None

    def __init__(self, input_string: str):
        self._input_string = input_string
        self._regex = self._fetch_rows_as_dict_keys(r'C:\Users\adity\Desktop\japsyn.csv')

    @staticmethod
    def _fetch_rows_as_dict_keys(file_path: str) -> dict:
        """
        Reads the data from the file
        :param file_path: the path of the file that holds the lookup data
        :return: the read data
        """
        try:
            word_map = {}
            for line in csv.reader(open(file_path, encoding='UTF-8')):
                word, syn = line
                word_map[word] = syn
            return word_map
        except FileNotFoundError:
            print(f'Could not find the file at {file_path}')

    def clean(self) -> str:
        for pattern in self._regex.keys():
            self._input_string = re.sub(pattern, self._regex[pattern], self._input_string)
        return self._input_string
Usage:
if __name__ == '__main__':
    cleaner = RegexCleanser(r'hi aaa人の abc 人の名前def ghi I dont know this language.')
    clean_string = cleaner.clean()
    print(clean_string)
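For a large replacement list, a common alternative (a sketch under the question's assumptions, not part of the answer above) is to compile one alternation pattern from the table loaded out of list.txt, sorted longest-first so that 人の名前 takes precedence over its prefix 人の, and to substitute through a callback in a single pass:

import re

# Hypothetical table, as it would be loaded from list.txt
d = {'人の': 'NN', '人の名前': 'FF'}

# Longest keys first so longer phrases win over their prefixes
pattern = re.compile('|'.join(
    re.escape(k) for k in sorted(d, key=len, reverse=True)))

def my_func(text):
    # Every occurrence is replaced by its mapped value in one scan
    return pattern.sub(lambda m: d[m.group(0)], text)

print(my_func('aaa人の abc 人の名前def ghi'))  # aaaNN abc FFdef ghi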
I have my own data set that I want to train my model on. I have successfully created the .pkl.gz files, but I don't know how to load them into my model.
I am using Windows 10 and Python 3.5.2 with TensorFlow and TFLearn, and Sublime Text 3 to write the code.
The code I used to create the pickle files:
from numpy import genfromtxt
import gzip
import _pickle as cPickle

# data = sio.loadmat('C:/DeepLearning_lib/Theano/Data/test_x.mat')
train_set_x = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/Kdd_Train_41.csv', delimiter=',')
train_set_y = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/NSL_TrainLabels_mat4.csv', delimiter=',')
valid_set_x = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/Kdd_Valid_41.csv', delimiter=',')
valid_set_y = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/NSL_ValidLabels_int2.csv', delimiter=',')
test_set_x = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/Kdd_Test_41.csv', delimiter=',')
test_set_y = genfromtxt('C:/Users/Jay/Desktop/MachineLearning/dataset/NSL-KDD Processed/NSL_TestLabels_mat5.csv', delimiter=',')

train_set = test_set_x
train_set_labels = test_set_y
valid_set = valid_set_x
valid_set_labels = valid_set_y
test_set = train_set_x
test_set_labels = train_set_y

with gzip.open('C:/Users/Jay/Desktop/Data/train_set.pkl.gz', 'wb') as f:
    cPickle.dump(train_set, f, protocol=2)
with gzip.open('C:/Users/Jay/Desktop/Data/train_set_labels.pkl.gz', 'wb') as f:
    cPickle.dump(train_set_labels, f, protocol=2)
with gzip.open('C:/Users/Jay/Desktop/Data/valid_set_labels.pkl.gz', 'wb') as f:
    cPickle.dump(valid_set_labels, f, protocol=2)
with gzip.open('C:/Users/Jay/Desktop/Data/test_set_labels.pkl.gz', 'wb') as f:
    cPickle.dump(test_set_labels, f, protocol=2)
with gzip.open('C:/Users/Jay/Desktop/Data/valid_set.pkl.gz', 'wb') as f:
    cPickle.dump(valid_set, f, protocol=2)
with gzip.open('C:/Users/Jay/Desktop/Data/test_set.pkl.gz', 'wb') as f:
    cPickle.dump(test_set, f, protocol=2)
The error I get when I use 'rb':
OSError: [Errno 9] peek() on write-only GzipFile object
That error means a read was attempted on a GzipFile object that was opened for writing; open the file fresh in 'rb' mode instead. The following code should reconstruct your train_set:
with gzip.open('C:/Users/Jay/Desktop/Data/train_set.pkl.gz', 'rb') as f:
    train_set = cPickle.load(f)
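To load all six arrays, the same pattern can be repeated in a loop. This is a sketch assuming the file names used above; it uses str.format rather than f-strings because the question mentions Python 3.5:

import gzip
import _pickle as cPickle

names = ['train_set', 'train_set_labels', 'valid_set',
         'valid_set_labels', 'test_set', 'test_set_labels']
data = {}
for name in names:
    path = 'C:/Users/Jay/Desktop/Data/{}.pkl.gz'.format(name)
    with gzip.open(path, 'rb') as f:
        data[name] = cPickle.load(f)  # a NumPy array, ready to feed to the model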
How can I use print formatting to place a run of fill characters (dot leaders) between two fields?
For example, I have the following code:
os.chdir("LOGS\\")
for file in glob.glob('*'):
    with open(file) as f:
        contents = f.read()
    if 'HOST_POWER="ON"' in contents:
        print('{0:38} {1:3}'.format(file[:-4], " = ON"))

for file in glob.glob('*'):
    with open(file) as f:
        contents = f.read()
    if 'HOST_POWER="OFF"' in contents:
        print('{0:38} {1:3}'.format(file[:-4], " = OFF"))
Output:
server1.web.com = ON
server2.web.com = ON
server3334.web.com = OFF
server5332223.web.com = ON
server2233.web.com = ON
server44.web.com = ON
server1133333.web.com = OFF
But I want the output to look like this:
server1.web.com ............ ON
server2.web.com ............ ON
server3334.web.com ......... OFF
server5332223.web.com ...... ON
server2233.web.com ......... ON
server44.web.com ........... ON
server1133333.web.com ...... OFF
server{SPACE}............{SPACE}ON
server{SPACE}............{SPACE}OFF
You could just edit the string before you pass it to print (edited to get exactly the kind of formatting you want):
import glob

def padStr(x, n):
    # append one space, then fill the rest of the field with dots
    x += ' '
    return x + '.' * (n - len(x))

for file in glob.glob('*'):
    with open(file) as f:
        contents = f.read()
    if 'HOST_POWER="ON"' in contents:
        print('%s %s' % (padStr(file[:-4], 38), "ON"))

for file in glob.glob('*'):
    with open(file) as f:
        contents = f.read()
    if 'HOST_POWER="OFF"' in contents:
        print('%s %s' % (padStr(file[:-4], 38), "OFF"))
Output:
blahblahblah ......................... ON
f1 ................................... ON
tes .................................. OFF
Another (slightly messier) option is to fix the string inline before using it as a format argument (note the loop variable is file; f is the file handle):
print('{0} {1:3}'.format((file[:-4] + ' ').ljust(38, '.'), "= ON"))
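The format mini-language can also produce the dot leaders directly, using '.' as the fill character with left alignment. A small sketch with a made-up name:

name = 'server1.web.com'
# '.' is the fill character, '<' left-aligns, 38 is the field width
print('{0:.<38} {1}'.format(name + ' ', 'ON'))
# server1.web.com ...................... ON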
I am filtering my file list using this line:
MyList = filter(lambda x: x.endswith(('.doc','.txt','.dat')), os.listdir(path))
The line above only matches lowercase extensions. Is there an elegant way to make it also match uppercase extensions?
You just need to add a .lower() call in your lambda function:
MyList = filter(lambda x: x.lower().endswith(('.doc','.txt','.dat')), os.listdir(path))
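Note that in Python 3 filter returns a lazy iterator rather than a list, so wrap it in list() if you need to index or reuse the result. A small sketch:

import os

MyList = list(filter(lambda x: x.lower().endswith(('.doc', '.txt', '.dat')),
                     os.listdir(path)))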
I'd prefer to use os.path.splitext with a list comprehension
from os.path import splitext
my_list = [x for x in os.listdir(path) if splitext(x)[1].lower() in {'.doc', '.txt', '.dat'}]
Still a bit much for a single line, so perhaps
from os.path import splitext
def valid_extension(x, valid={'.doc', '.txt', '.dat'}):
    return splitext(x)[1].lower() in valid

my_list = [x for x in os.listdir(path) if valid_extension(x)]
Or use a case-insensitive regular expression:

import os
import re

pat = re.compile(r'[.](doc|txt|dat)$', re.IGNORECASE)
filenames = [filename for filename in os.listdir(path)
             if re.search(pat, filename)]
print(filenames)
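A pathlib variant of the same idea (a sketch; Path.suffix includes the leading dot, so the comparison set stays the same):

from pathlib import Path

exts = {'.doc', '.txt', '.dat'}
# p.suffix is the extension with the dot, e.g. '.TXT', lowercased for the test
my_list = [p.name for p in Path(path).iterdir() if p.suffix.lower() in exts]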