What is an efficient way to replace strings in Python?

I have a large list of replacement pairs, like below.
The replacement file list.txt:
人の,NN
人の名前,FF
And the data file text.txt in which to do the replacements:
aaa人の abc 人の名前def ghi
I want to transform this text as below, using list.txt:
>>> my_func('aaa人の abc 人の名前def ghi')
'aaaNN abc FFdef ghi'
This is my code, but I think it is quite inefficient for processing large data.
import re

d = {}
with open('list.txt', 'r', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        d[line.split(',')[0]] = line.split(',')[1]

with open('text.txt', 'r', encoding='utf8') as f:
    txt = f.read()

st = 0
lst = []
# [\u4e00-\u9fea\u3040-\u309f] covers the Unicode ranges of Japanese characters
for match in re.finditer(r"([\u4e00-\u9fea\u3040-\u309f]+)", txt):
    st_m, ed_m = match.span()
    lst.append(txt[st:st_m])
    search = txt[st_m:ed_m]
    rpld = d[search]
    lst.append(rpld)
    st = ed_m
lst.append(txt[st:])
print(''.join(lst))
Please let me know a better way.

After seeing your input aaa人の abc 人の名前def ghi, I see you have whitespace in between, so it's not really a word replacement; it's more of a phrase replacement.
You can refer to the edit history to see the old answer in case you want word replacement.
Since you have phrase replacements, you can use re (regex) and provide a mapping of replacements. Below is an implementation:
>>> import re
>>> _regex = {r'aaa人の abc 人の名前def ghi': r'人の,NN 人の名前,FF'}
>>> input_string = 'hi aaa人の abc 人の名前def ghi work'
>>> for pattern in _regex.keys():
...     input_string = re.sub(pattern, _regex[pattern], input_string)
...
>>> input_string
'hi 人の,NN 人の名前,FF work'
>>>
Below is an object-oriented implementation of the above:
import csv
import re


class RegexCleanser(object):
    _regex = None

    def __init__(self, input_string: str):
        self._input_string = input_string
        self._regex = self._fetch_rows_as_dict_keys(r'C:\Users\adity\Desktop\japsyn.csv')

    @staticmethod
    def _fetch_rows_as_dict_keys(file_path: str) -> dict:
        """
        Reads the lookup data from the file.
        :param file_path: the path of the file that holds the lookup data
        :return: the read data as a dict
        """
        try:
            word_map = {}
            for line in csv.reader(open(file_path, encoding='UTF-8')):
                word, syn = line
                word_map[word] = syn
            return word_map
        except FileNotFoundError:
            print(f'Could not find the file at {file_path}')

    def clean(self) -> str:
        for pattern in self._regex.keys():
            self._input_string = re.sub(pattern, self._regex[pattern], self._input_string)
        return self._input_string
Usage:
if __name__ == '__main__':
    cleaner = RegexCleanser(r'hi aaa人の abc 人の名前def ghi I dont know this language.')
    clean_string = cleaner.clean()
    print(clean_string)
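One caveat with the loop in clean() (an observation added here, not part of the original answer): patterns are applied in dict order, so a short key such as 人の can pre-empt the longer 人の名前. For a large list.txt, building a single alternation with the longest keys first is both correct and faster than calling re.sub once per pattern. A minimal sketch, assuming the list.txt format from the question:

import re

def make_replacer(mapping: dict):
    # Longest keys first, so '人の名前' wins over its prefix '人の'.
    keys = sorted(mapping, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(k) for k in keys))
    # One pass over the text; each match is resolved via a dict lookup.
    return lambda text: pattern.sub(lambda m: mapping[m.group(0)], text)

replace = make_replacer({'人の': 'NN', '人の名前': 'FF'})
print(replace('aaa人の abc 人の名前def ghi'))  # aaaNN abc FFdef ghi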


Count occurrences of a string pattern in a file

Team,
I am trying to count two patterns in a file and list them as
pattern1: 2
pattern2: 3
#!/usr/bin/python
import os
import re

d = dict()
with open('/home/user/waste/nodes-prod.log', 'r') as file:
    for line in file:
        line = line.strip()
        for word in line.split():
            node1 = re.match(r"team1.*", word)
            type(node1)
            node2 = re.match(r"team2.*", word)
            type(node2)
            if node1 in d:
                d[node1] = d[node1] + 1
            else:
                d[node2] = d[node2] + 1

for key in list(d.keys()):
    print(key, ":", d[key])
my /home/user/waste/nodes-prod.log is below
cat /home/user/waste/nodes-prod.log
team1-develop
team1-work
team2-research1
team2-research2
team2-research3
output
Traceback (most recent call last):
File "read-and-count-words-pattern-fromfile-using-dict-in-python.py", line 17, in <module>
d[node2] = d[node2] + 1
KeyError: <_sre.SRE_Match object; span=(0, 10), match='team2-research1'>
expected:
node1: 2
node2: 3
It is easier if you read the entire text into memory (if that is not burdensome given the size of the file):
import re

with open(fn) as f:
    txt = f.read()

print(f'node 1: {len(re.findall(r"team1.*", txt))}')
print(f'node 2: {len(re.findall(r"team2.*", txt))}')
Prints:
node 1: 2
node 2: 3
If you do want to do line-by-line, you can just keep a counter:
import re

node1, node2 = (0, 0)
with open(fn) as f:
    for line in f:
        if re.search(r"team1.*", line): node1 += 1
        if re.search(r"team2.*", line): node2 += 1

print(f'node 1: {node1}')
print(f'node 2: {node2}')
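Since these patterns are really just literal prefixes, plain substring tests do the same job without regex (a side note added here, not in the original answer):

node1, node2 = 0, 0
with open(fn) as f:
    for line in f:
        # re.search(r"team1.*", line) succeeds exactly when "team1" occurs in line
        if "team1" in line: node1 += 1
        if "team2" in line: node2 += 1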
Better still, you could use a dict keyed on the captured team number, so any team\d+ is counted without naming each one:
nodes = {}
with open(fn) as f:
    for line in f:
        if m := re.search(r"team(\d+).*", line):
            nodes[m.group(1)] = nodes.get(m.group(1), 0) + 1
>>> nodes
{'1': 2, '2': 3}
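If the team numbers are open-ended, collections.Counter expresses the same idea with less bookkeeping (a sketch, reusing fn from above):

import re
from collections import Counter

nodes = Counter()
with open(fn) as f:
    for line in f:
        if m := re.search(r"team(\d+)", line):
            nodes[m.group(1)] += 1  # Counter defaults missing keys to 0

print(dict(nodes))  # for the sample log: {'1': 2, '2': 3}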
#!/usr/bin/python
import os
import re

# dict is the dictionary of counters,
# pattern is the regular expression,
# word is the word to match.
def increment(dict: dict, pattern: str, word: str):
    match = re.match(pattern, word)
    if match:
        # re.match returns a Match object, not a string.
        # .group(n) returns the n-th capture; .group() returns
        # the 0th capture, i.e. the whole match:
        node = match.group()
        # Initialise the counter, if necessary:
        if not node in dict:
            dict[node] = 0
        # Increment the counter:
        dict[node] += 1

# filename is a string that contains a path to the file to parse,
# patterns is a list of patterns to check against,
# the function returns a dictionary of counts.
def scores(filename: str, patterns: list) -> dict:
    # Initialise the dictionary that keeps the counters:
    d = dict()
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            for word in line.split():
                # Check against all patterns:
                for pattern in patterns:
                    increment(d, pattern, word)
    return d

# Patterns to search for.
# Python caches the compiled regular expressions,
# so we don't need to pre-compile them:
patterns = [r"team1.*", r"team2.*"]

# File to parse:
filename = '/home/user/waste/nodes-prod.log'

# This is how a dictionary is iterated when both key and value are needed:
for key, value in scores(filename, patterns).items():
    print(key, ":", value)
def increment(dict: dict, pattern: str, word: str): defines a function that receives a dictionary dict, a pattern, and the word to check against that pattern. The parameters are typed, which is optional in Python.
def scores(filename: str, patterns: list) -> dict: defines a function that receives filename as a string and a list of patterns, and returns a dictionary of match counts.
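The caching remark in the comments can also be made explicit: precompile each pattern once and pass re.Pattern objects around, skipping even the cache lookup. A sketch of that variant (names illustrative, not from the original answer):

import re

# Variant of increment() that takes a precompiled re.Pattern
# instead of a pattern string:
def increment(counters: dict, pattern: re.Pattern, word: str):
    match = pattern.match(word)
    if match:
        node = match.group()
        counters[node] = counters.get(node, 0) + 1

patterns = [re.compile(r"team1.*"), re.compile(r"team2.*")]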

Program doesn't stop iterating through list

from bs4 import BeautifulSoup

def gameinfo():
    lines = []
    html_doc = 'STATIC.html'
    soup = BeautifulSoup(open(html_doc), 'html.parser')
    for mytable in soup.find_all('table'):
        for trs in mytable.find_all('tr'):
            tds = trs.find_all('td')
            row1 = [elem.text.strip() for elem in tds]
            row = str(row1)
            sausage = False
            with open("FIRE.txt", "r+") as file:
                for line in file:
                    if row+"\n" in line:
                        break
                    else:
                        if row.split(",")[:4] == line.split(",")[:4]:
                            print(row)
                            print(line)
                            file.write(line.replace(line+"\n", row+"\n"))
                            print('Already exists with diff date')
                            sausage = True
                            break
                if sausage == False:
                    print(row.split(",")[:4])
                    print(line.split(",")[:4])
                    print(row)
                    print(line)
                    file.write(row+"\n")
                    print('appended')

while True:
    gameinfo()
    gameinfo()
This program is supposed to keep searching the text file FIRE.txt for lines that match the variable row. When I run it, it works okay, but the part of the code that is supposed to check whether the first four elements of the list are the same, and then skip the appending section below, doesn't work. When the program detects that the first 4 elements of row match the first 4 elements of a line in the text file, it should overwrite that line. However, when it detects such a line, it loops forever and never breaks out.
My string looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '175']
and i compare it to a list that looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '170']
and when it sees that the first 4 elements in the lists are the same, it should overwrite the one that was in the text file to begin with, but instead it loops.
Question has changed; most recent version last.
Methinks you want to use the csv module. If you iterate through a csv.reader object instead of the file object directly, you'll get each line as a list.
Example:
import csv

row = ["this", "is", "an", "example"]
with open("FIRE.txt", "r+") as file:
    reader = csv.reader(file)
    for line in reader:
        if line == row:  # each line is a list, so compare lists directly
            break
Alternatively, if you don't need to use this in anything other than Python, you could pickle a collections.OrderedDict with a tuple of the first four items as the keys:
import collections
import pickle
import contextlib

@contextlib.contextmanager
def mutable_pickle(path, default=object):
    try:
        with open(path, "rb") as f:
            obj = pickle.load(f)
    except (IOError, EOFError):
        obj = default()
    try:
        yield obj
    finally:
        with open(path, "wb") as f:
            pickle.dump(obj, f)

with mutable_pickle("fire.bin", default=collections.OrderedDict) as d:
    for row in rows:  # rows: your parsed table rows
        d[tuple(row[:4])] = row
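Reading the pickled dict back later is symmetric (a short sketch, assuming the same fire.bin path):

import pickle

with open("fire.bin", "rb") as f:
    d = pickle.load(f)

for key, row in d.items():
    print(key, row)  # key is the 4-item tuple, row is the full row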

Never resets list

I am trying to create a calorie counter. The standard invocation goes like this:
python3 calories.txt < test.txt
Inside calories.txt the food is in the following format: apples 500
The problem I am having is that whenever I calculate the values for a person, the list never seems to reset to empty.
import sys

food = {}
eaten = {}
finished = {}
total = 0

# mappings
def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1]
            food[key] = value

def calculate(x):
    a = []
    for keys, values in x.items():
        for c in values:
            try:
                a.append(int(food[c]))
            except:
                a.append(100)
        print("before", a)
        a = []
        total = sum(a)  # Problem here
        print("after", a)
        print(total)

def main():
    calories(sys.argv[1])
    for line in sys.stdin:
        lines = line.strip().split(',')
        for c in lines:
            values = lines[0]
            keys = lines[1:]
            eaten[values] = keys
        calculate(eaten)

if __name__ == '__main__':
    main()
Edit - forgot to include what test.txt would look like:
joe,almonds,almonds,blue cheese,cabbage,mayonnaise,cherry pie,cola
mary,apple pie,avocado,broccoli,butter,danish pastry,lettuce,apple
sandy,zuchini,yogurt,veal,tuna,taco,pumpkin pie,macadamia nuts,brazil nuts
trudy,waffles,waffles,waffles,chicken noodle soup,chocolate chip cookie
How to make it easier on yourself:
When reading the calorie data, convert the calories to int() as soon as possible; that way there is no need to do it every time you want to sum something up.
Dictionaries have a .get(key, defaultvalue) accessor, so "if food not found, use 100 as default" is a one-liner without try: ... except:.
This works for me, not using sys.stdin but supplying the second file as a file as well, instead of piping it into the program using <.
I modified some of the parsing to strip whitespace, and calculate now returns a [(name, cal), ...] tuple list.
May it help you to fix it to your liking:
def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1].strip()  # ensure no whitespace sneaks in
            food[key] = int(value)

def getCal(foodlist, defValueUnknown=100):
    """Get total calories for a list of ingredients; unknown ones count as 100."""
    return sum(food.get(x, defValueUnknown) for x in foodlist)

def calculate(x):
    a = []
    for name, foods in x.items():
        a.append((name, getCal(foods)))  # append as (name, calories) tuple
    return a

def main():
    calories(sys.argv[1])
    with open(sys.argv[2]) as f:  # parse as a file, not piped in via sys.stdin
        for line in f:
            lines = line.strip().split(',')
            for c in lines:
                values = lines[0].strip()
                keys = [x.strip() for x in lines[1:]]  # ensure no whitespace sneaks in
                eaten[values] = keys
    calced = calculate(eaten)  # calculate after all lines are read into the dict
    print(calced)
Output:
[('joe', 1400), ('mary', 1400), ('sandy', 1600), ('trudy', 1000)]
Using sys.stdin and piping just led to my console blinking and waiting for manual input - maybe VS related...
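If piping is required after all, the same parsing loop works against sys.stdin; run without a pipe, it just waits for typed input until EOF, which may explain the blinking console (a sketch under that assumption):

import sys

for line in sys.stdin:
    lines = line.strip().split(',')
    name = lines[0].strip()
    eaten[name] = [x.strip() for x in lines[1:]]

print(calculate(eaten))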

How to avoid repetition in my code

I've written code that extracts all the words from two files and only returns the words that are in both files.
However, I have some repetition, and that is not considered good style, so I am wondering if it is possible to avoid this in my code.
import re

def print_common_words(filename_1, filename_2):
    try:
        input_file = open(filename_1, 'r')
        source_string = input_file.read().lower()
        input_file.close()
        all_words1 = set(re.findall('[a-zA-Z]+', source_string))

        input_file = open(filename_2, 'r')                        # Repetition
        source_string = input_file.read().lower()                 # Repetition
        input_file.close()                                        # Repetition
        all_words2 = set(re.findall('[a-zA-Z]+', source_string))  # Repetition

        intersection_list = all_words1.intersection(all_words2)
        union_list = []
        for word in intersection_list:
            union_list += [word]
        union_list.sort()
        for i in union_list:
            print(i)
    except FileNotFoundError:
        print("A file could not be found.")
Use a method to factor out the duplicated code.
def get_file(file):
    input_file = open(file, 'r')
    source_string = input_file.read().lower()
    input_file.close()
    return set(re.findall('[a-zA-Z]+', source_string))
Call it like:
all_words1 = get_file(filename_1)
all_words2 = get_file(filename_2)
Eg:
all_words1 = get_file(filename_1)
all_words2 = get_file(filename_2)
intersection_list = all_words1.intersection(all_words2)
union_list = []
for word in intersection_list:
union_list += [word]
union_list.sort()
for i in union_list:
print(i)
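A further tidy-up (a sketch, not part of the original answer): use a with block so the file is closed even if reading fails, and sort the intersection directly instead of copying it into union_list one word at a time:

import re

def get_words(filename: str) -> set:
    # 'with' closes the file even if read() raises
    with open(filename, 'r') as input_file:
        return set(re.findall('[a-zA-Z]+', input_file.read().lower()))

def print_common_words(filename_1, filename_2):
    try:
        for word in sorted(get_words(filename_1) & get_words(filename_2)):
            print(word)
    except FileNotFoundError:
        print("A file could not be found.")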

How can I simplify and format this function?

So I have this messy code where I wanted to get every word from frankenstein.txt, sort them alphabetically, eliminate one- and two-letter words, and write them into a new file.
def Dictionary():
    d = []
    count = 0
    bad_char = '~!@#$%^&*()_+{}|:"<>?\`1234567890-=[]\;\',./ '
    replace = ' ' * len(bad_char)
    table = str.maketrans(bad_char, replace)
    infile = open('frankenstein.txt', 'r')
    for line in infile:
        line = line.translate(table)
        for word in line.split():
            if len(word) > 2:
                d.append(word)
                count += 1
    infile.close()
    file = open('dictionary.txt', 'w')
    file.write(str(set(d)))
    file.close()

Dictionary()
How can I simplify it and make it more readable? And how can I make the words write vertically in the new file (currently it writes them as one horizontal list)?
abbey
abhorred
about
etc....
A few improvements below:
from string import digits, punctuation

def create_dictionary():
    words = set()
    bad_char = digits + punctuation + '...'  # may need more characters
    replace = ' ' * len(bad_char)
    table = str.maketrans(bad_char, replace)
    with open('frankenstein.txt') as infile:
        for line in infile:
            line = line.strip().translate(table)
            for word in line.split():
                if len(word) > 2:
                    words.add(word)
    with open('dictionary.txt', 'w') as outfile:
        # writelines() does not add newlines itself, so append one per word
        outfile.writelines(word + '\n' for word in sorted(words))
A few notes:
follow the style guide (PEP 8);
string contains constants you can use to provide the "bad characters";
you never used count (which was just len(d) anyway);
use the with context manager for file handling; and
using a set from the start prevents duplicates, but sets aren't ordered (hence sorted).
Using the re module:
import re

words = set()
with open('frankenstein.txt') as infile:
    for line in infile:
        # sets have no extend(); update() adds every item from the iterable
        words.update(x for x in re.split(r'[^A-Za-z]+', line) if len(x) > 2)

with open('dictionary.txt', 'w') as outfile:
    outfile.writelines(word + '\n' for word in sorted(words))
In r'[^A-Za-z]+' in re.split, replace A-Za-z with the characters which you want to include in dictionary.txt.
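For example (an illustration added here), keeping apostrophes so contractions such as don't survive the split:

words.update(x for x in re.split(r"[^A-Za-z']+", line) if len(x) > 2)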
