Count occurrences of a string pattern in a file - python-3.x

Team,
I am trying to count two patterns in a file and list them like this:
pattern1: 2
pattern2: 3
#!/usr/bin/python
import os
import re

d = dict()
with open('/home/user/waste/nodes-prod.log', 'r') as file:
    for line in file:
        line = line.strip()
        for word in line.split():
            node1 = re.match(r"team1.*", word)
            type(node1)
            node2 = re.match(r"team2.*", word)
            type(node2)
            if node1 in d:
                d[node1] = d[node1] + 1
            else:
                d[node2] = d[node2] + 1

for key in list(d.keys()):
    print(key, ":", d[key])
My /home/user/waste/nodes-prod.log is below:
cat /home/user/waste/nodes-prod.log
team1-develop
team1-work
team2-research1
team2-research2
team2-research3
Output:
Traceback (most recent call last):
File "read-and-count-words-pattern-fromfile-using-dict-in-python.py", line 17, in <module>
d[node2] = d[node2] + 1
KeyError: <_sre.SRE_Match object; span=(0, 10), match='team2-research1'>
expected:
node1: 2
node2: 3

It is easier if you read the entire text into memory (if that is not burdensome given the size of the file):
import re

with open(fn) as f:
    txt = f.read()
print(f'node 1: {len(re.findall(r"team1.*", txt))}')
print(f'node 2: {len(re.findall(r"team2.*", txt))}')
Prints:
node 1: 2
node 2: 3
If you do want to do line-by-line, you can just keep a counter:
import re

node1, node2 = (0, 0)
with open(fn) as f:
    for line in f:
        if re.search(r"team1.*", line): node1 += 1
        if re.search(r"team2.*", line): node2 += 1
print(f'node 1: {node1}')
print(f'node 2: {node2}')
Better still, you could use a dict that maps each captured team number from "team\d+" to its count (note that the walrus operator := below requires Python 3.8+):
nodes = {}
with open(fn) as f:
    for line in f:
        if m := re.search(r"team(\d+).*", line):
            nodes[m.group(1)] = nodes.get(m.group(1), 0) + 1
>>> nodes
{'1': 2, '2': 3}
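As a variant, collections.Counter removes the manual initialise-then-increment step; a minimal sketch, assuming the same fn path as above:

from collections import Counter
import re

nodes = Counter()
with open(fn) as f:
    for line in f:
        if m := re.search(r"team(\d+)", line):
            nodes[m.group(1)] += 1

print(nodes)  # for the sample log: Counter({'2': 3, '1': 2})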

#!/usr/bin/python
import os
import re

# d is the dictionary of counters,
# pattern is the regular expression,
# word is the word to match.
def increment(d: dict, pattern: str, word: str):
    match = re.match(pattern, word)
    if match:
        # re.match returns a Match object, not a string.
        # .group(n) returns the n-th capture; .group() returns
        # the 0th capture, i.e. the whole match:
        node = match.group()
        # Initialise the counter, if necessary:
        if node not in d:
            d[node] = 0
        # Increment the counter:
        d[node] += 1

# filename is a string that contains a path to the file to parse,
# patterns is a list of patterns to check against,
# the function returns a dictionary of counts.
def scores(filename: str, patterns: list) -> dict:
    # Initialise the dictionary that keeps the counters:
    d = dict()
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            for word in line.split():
                # Check against all patterns:
                for pattern in patterns:
                    increment(d, pattern, word)
    return d

# Patterns to search for.
# Python caches compiled regular expressions internally,
# so we don't need to pre-compile them:
patterns = [r"team1.*", r"team2.*"]
# File to parse:
filename = '/home/user/waste/nodes-prod.log'
# This is how a dictionary is iterated when both key and value are needed:
for key, value in scores(filename, patterns).items():
    print(key, ":", value)
def increment(d: dict, pattern: str, word: str): defines a function that receives a dictionary d, a regular-expression pattern, and the word to check against that pattern. The parameters are annotated with types, which is optional in Python.
def scores(filename: str, patterns: list) -> dict: defines a function that receives a filename as a string and a list of patterns, and returns a dictionary of match counts.
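Note that increment keys the dictionary by the matched word, so every distinct word gets its own counter. If you instead want counts keyed per pattern (closer to the expected output in the question), here is a small sketch of a variation (increment_by_pattern is a hypothetical name):

def increment_by_pattern(d: dict, pattern: str, word: str):
    # Key the counter by the pattern itself, so all words that
    # match the same pattern share a single counter:
    if re.match(pattern, word):
        d[pattern] = d.get(pattern, 0) + 1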

Related

read and write from and to file using functions

I'm trying to create 2 functions:
readfiles(file_path), which reads a file specified by file_path and returns a list of strings containing each line in the file.
writefiles(lines, file_path), which writes, line by line, the content of the list lines to the file specified by file_path.
When used one after another, the output file should be an exact copy of the input file (including the formatting).
This is what I have so far:
file_path = ("/myfolder/text.txt", "r")

def readfiles(file_path):
    with open file_path as f:
        for line in f:
            return line
            lst = list[]
            lst = line
            lst.append(line)
    return lst

read_file(file_path)
lines = lst []

def writefiles(lines, file_path):
    with open ("file_path", "w") as f:
    for line in lst:
        f.write(line)
        f.write("\n")
I can get it to kind of work when I use this for read
with open("/myfolder/text.txt", "r") as f:
for line in f:
print(line, end='')
and this for write
with open ("/myfolder/text.txt", "w") as f:
for line in f:
f.write(line)
f.write("\n")
But when I try to put them into functions, it all messes up.
I'm not sure why. I know it's a simple question, but it's just not clicking for me. I've read the documentation on it, but I'm not following it fully and am at my wits' end. What's wrong with my functions?
I get varying errors from
lst = list[]
^
SyntaxError: invalid syntax
to
lst or list is not callable
Also I know there are similar questions but the ones I found don't seem to define a function.
The problems with your code are explained in the comments below:
file_path = ("/myfolder/text.txt", "r") # this is a tuple of 2 elements; it should be file_path = "/myfolder/text.txt"

def readfiles(file_path):
    with open file_path as f: # "open" is a function and will throw an error if you use it without parentheses
                              # use open this way: open(file_path, "r")
        for line in f:
            return line # it will return the first line and exit the function
            lst = list[] # "lst = []" is how you define a list in python; also you want to define it outside the loop
            lst = line # you are replacing the list lst with the string in line
            lst.append(line) # will throw an error because lst is a string now and doesn't have the append method
    return lst

read_file(file_path) # should be lines = read_file(file_path)
lines = lst [] # lines is an empty list

def writefiles(lines, file_path):
    with open ("file_path", "w") as f:
    for line in lst: # this line should have 1 more tabulation
        f.write(line) # this line should have 1 more tabulation
        f.write("\n") # this line should have 1 more tabulation
Here's how the code should look:

def readfiles(file_path):
    lst = []
    with open(file_path) as f:
        for line in f:
            lst.append(line.strip("\n"))
    return lst

def writefiles(lines, file_path):
    with open(file_path, "w") as f:
        for line in lines:
            f.write(line + "\n")

file_path = "/myfolder/text.txt"
filepathout = "/myfolder/text2.txt"
lines = readfiles(file_path)
writefiles(lines, filepathout)
A more pythonic way to do it:

# readlines is a built-in method of file objects in python
with open(file_path) as f:
    lines = f.readlines()

# stripping line returns
lines = [line.strip("\n") for line in lines]

# join will convert the list to a string by adding a \n between the list elements
with open(filepathout, "w") as f:
    f.write("\n".join(lines))
Key points:
- the function stops after reaching the return statement (see the sketch below)
- be careful where you define your variables;
  e.g. "lst" defined inside a for loop gets redefined on each iteration
Defining variables:
- for a list: list_var = []
- for a tuple: tup_var = (1, 2)
- for an int: int_var = 3
- for a dictionary: dict_var = {}
- for a string: string_var = "test"
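A minimal sketch of the first key point (first_line and all_lines are hypothetical names): a return inside the loop exits the function on the very first iteration, so only one line is ever read.

def first_line(path):
    with open(path) as f:
        for line in f:
            return line  # exits the function on the first line
    return None  # only reached if the file is empty

def all_lines(path):
    lines = []  # container created before the loop starts
    with open(path) as f:
        for line in f:
            lines.append(line)
    return lines  # return placed after the loop, so all lines are kept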
A couple of learning points here that will help.
In your reading function, you are kinda close. However, you cannot put the return statement in the loop. As soon as the function hits that for the first time, it ends. Also, if you are going to make a container to hold the list of things read, you need to make it before you start your loop. Lastly, don't name anything list. It is a built-in type name (not technically a keyword), and shadowing it breaks calls like list(). If you want to make a new list, just do something like: results = list() or results = []
So in pseudocode, you should (see the sketch after this list):
Make a list to hold results
Open the file as you are now
Make a loop to loop through lines
append to the results list
return the results (outside the loop)
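A minimal sketch of that pseudocode (names are illustrative):

def read_lines(file_path):
    results = []  # make a list to hold results
    with open(file_path) as f:  # open the file as you are now
        for line in f:  # loop through lines
            results.append(line)  # append to the results list
    return results  # return the results (outside the loop)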
Your writefiles is very close. You should be looping through the lines variable that is a parameter of your function. Right now you are referencing lst which is not a parameter of your function.
Good luck!

Iterate N items at a time on a generator with single yield

How do I do that?
islice() returns n items at a time, but I can't figure out how to iterate with it.
Right now I do something like this:
# -*- coding: utf-8 -*-
'''
print 3 lines at a time.
'''
def myread(filename):
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line in f:
            yield line.strip()

filename = 'test.txt'
temp = []
for res_a in myread(filename):
    temp.append(res_a)
    if len(temp) == 3:
        print(temp)
        temp = []
print(temp)
Note that I don't know how big my text file is.
You can use itertools.islice and the two-argument form of iter, e.g.:

from itertools import islice

with open('file') as fin:
    # gen-comp yielding stripped lines
    lines = (line.strip() for line in fin)
    # create a list of at most 3 lines from the file's current position
    # and use an empty list as a sentinel value for when to stop (no more lines)
    for three in iter(lambda: list(islice(lines, 3)), []):
        print(three)
As a function:

def myread(filename):
    with open(filename) as fin:
        lines = (line.strip() for line in fin)
        yield from iter(lambda: list(islice(lines, 3)), [])
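A quick usage sketch, assuming the same test.txt as in the question:

for chunk in myread('test.txt'):
    print(chunk)  # prints up to 3 stripped lines at a time, e.g. ['a', 'b', 'c']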
islice(itr, n) will only return an iterator that runs until it reaches the nth element of itr. You would have to keep rebuilding the islice iterator for every group of n elements you want to return. You might want to try the grouper recipe from the itertools documentation, which avoids this rebuilding:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
To complete the example, you can filter out the fillvalues added to the output groups to get it to replicate the code provided by the OP:
for grp in grouper(myread(filename), 3):
    trimmed_grp = [line for line in grp if line is not None]
    print(trimmed_grp)

Program doesn't stop iterating through list

from bs4 import BeautifulSoup  # import needed for this snippet

def gameinfo():
    lines = []
    html_doc = 'STATIC.html'
    soup = BeautifulSoup(open(html_doc), 'html.parser')
    for mytable in soup.find_all('table'):
        for trs in mytable.find_all('tr'):
            tds = trs.find_all('td')
            row1 = [elem.text.strip() for elem in tds]
            row = str(row1)
            sausage = False
            with open("FIRE.txt", "r+") as file:
                for line in file:
                    if row+"\n" in line:
                        break
                    else:
                        if row.split(",")[:4] == line.split(",")[:4]:
                            print(row)
                            print(line)
                            file.write(line.replace(line+"\n", row+"\n"))
                            print('Already exists with diff date')
                            sausage = True
                            break
                if sausage == False:
                    print(row.split(",")[:4])
                    print(line.split(",")[:4])
                    print(row)
                    print(line)
                    file.write(row+"\n")
                    print('appended')

while True:
    gameinfo()
    gameinfo()
This program is supposed to keep searching the text file FIRE.txt for lines that match the variable row. When I run it, it works okay, but the part of the code that is supposed to check whether the first four elements of the list are the same, and then skip the appending section below, doesn't work. When the program detects that the first 4 elements of a string turned into a list (row) match another string's first 4 elements in the text file, it should overwrite the string in the text file. However, when it detects a list that has the same first 4 elements, it loops forever and never breaks out.
My string looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '175']
and i compare it to a list that looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '170']
and when it sees that the first 4 elements in the list are the same, it should overwrite the one that was in the text file to begin with, but it is looping.
Question has changed; most recent version last.
Methinks you want to use the csv module. If you iterate through a csv.reader object instead of the file object directly, you'll get each line as a list.
Example:
import csv

row = ["this", "is", "an", "example"]
with open("FIRE.txt", "r+") as file:
    reader = csv.reader(file)
    for line in reader:
        if line == row:  # compare the parsed row to the target list
            break
Alternatively, if you don't need to use this in anything other than Python, you could pickle a collections.OrderedDict with a tuple of the first four items as the keys:
import collections
import pickle
import contextlib

@contextlib.contextmanager
def mutable_pickle(path, default=object):
    try:
        with open(path, "rb") as f:
            obj = pickle.load(f)
    except (IOError, EOFError):
        obj = default()
    try:
        yield obj
    finally:
        with open(path, "wb") as f:
            pickle.dump(obj, f)

with mutable_pickle("fire.bin",
                    default=collections.OrderedDict) as d:
    for row in rows:
        d[tuple(row[:4])] = row
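A short usage sketch: because mutable_pickle re-pickles the object on exit, reading the data back later is just another with block:

# Later, re-open the pickled dict to inspect or update entries:
with mutable_pickle("fire.bin", default=collections.OrderedDict) as d:
    for key, row in d.items():
        print(key, "->", row)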

Counting the occurrences of all letters in a txtfile [duplicate]

This question already has answers here:
I'm trying to count all letters in a txt file then display in descending order
(4 answers)
Closed 6 years ago.
I'm trying to open a file and count the occurrences of letters.
So far this is where I'm at:
def frequencies(filename):
    infile = open(filename, 'r')
    wordcount = {}
    content = infile.read()
    infile.close()
    counter = {}
    invalid = "‘'`,.?!:;-_\n—' '"
    for word in content:
        word = content.lower()
        for letter in word:
            if letter not in invalid:
                if letter not in counter:
                    counter[letter] = content.count(letter)
                    print('{:8} appears {} times.'.format(letter, counter[letter]))
Any help would be greatly appreciated.
The best way is to use the numpy package; an example would be like this:
import numpy

text = "xvasdavawdazczxfawaczxcaweac"
text = list(text)
a, b = numpy.unique(text, return_counts=True)
x = sorted(zip(b, a), reverse=True)
print(x)
In your case, you can combine all your words into a single string, then convert the string into a list of characters.
If you want to remove everything except letters, you can use a regex to clean it:

import re

# clean all except letters
content = re.sub(r'[^a-zA-Z]', r'', content)
# convert to a list of chars
content = list(content)
a, b = numpy.unique(content, return_counts=True)
x = sorted(zip(b, a), reverse=True)
print(x)
If you are looking for a solution not using numpy:
invalid = set([ch for ch in "‘'`,.?!:;-_\n—' '"])

def frequencies(filename):
    counter = {}
    with open(filename, 'r') as f:
        for ch in (char.lower() for char in f.read()):
            if ch not in invalid:
                if ch not in counter:
                    counter[ch] = 0
                counter[ch] += 1
    results = [(counter[ch], ch) for ch in counter]
    return sorted(results)

for result in reversed(frequencies(filename)):
    print(result)
I would suggest using collections.Counter instead.
Compact solution:

from collections import Counter
from string import ascii_lowercase  # a-z string

VALID = set(ascii_lowercase)

with open('in.txt', 'r') as fin:
    counter = Counter(char.lower() for line in fin
                      for char in line if char.lower() in VALID)

print(counter.most_common())  # print values in order of most common to least
A more readable solution:

from collections import Counter
from string import ascii_lowercase  # a-z string

VALID = set(ascii_lowercase)

with open('in.txt', 'r') as fin:
    counter = Counter()
    for char in (char.lower() for line in fin for char in line):
        if char in VALID:
            counter[char] += 1

print(counter)
If you don't want to use a Counter, you can just use a dict:

from string import ascii_lowercase  # a-z string

VALID = set(ascii_lowercase)

with open('test.txt', 'r') as fin:
    counter = {}
    for char in (char.lower() for line in fin for char in line):
        if char in VALID:
            # add the letter to the dict;
            # dict.get either fetches the current count or defaults to 0,
            # which saves checking whether the key is already present
            counter[char] = counter.get(char, 0) + 1

# sort the values by occurrence in descending order
data = sorted(counter.items(), key=lambda t: t[1], reverse=True)
print(data)

Markov analysis - Return and recursion role

I am working on the solution to the Markov analysis exercise in Think Python, but I do not understand the role of return in the code block below.
As far as I know, when the code reaches return, the function exits immediately. But isn't it unnecessary in this case? There is a recursive call, random_text(n-i), before the code reaches the return statement, so won't the function exit only when the recursion is finished, i.e. when the for loop is over? The question may seem stupid, but I am a newbie in Python and the recursion stuff is really confusing to me. I tried removing return and it still ran fine.
def random_text(n=100):
    start = random.choice(list(suffix_map.keys()))
    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes == None:
            # if the start isn't in map, we got to the end of the
            # original text, so we have to start again.
            random_text(n-i)
            return
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)
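For reference, here is a minimal sketch (a hypothetical demo function, not from the book) of the fall-through that return prevents: without it, execution continues in the same iteration after the recursive call comes back.

def demo(n):
    for i in range(n):
        if i == 2:
            demo(n - i)  # recurse with the remaining count
            return       # without this line, execution would fall through,
                         # print 'step 2', and keep iterating
        print('step', i)

demo(4)  # prints: step 0, step 1, then (inside the recursive call) step 0, step 1

In random_text, falling through would reach random.choice(suffixes) while suffixes is still None, which raises a TypeError; that branch may simply never trigger on a given run, which is why removing return can appear to work.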
The full code is below, so you can understand what each function does.
from __future__ import print_function, division

import os
os.chdir(r"C:\Users\Hoang-Ngoc.Anh\Documents\WinPython-64bit 3.4.4.2\notebooks\docs")

import sys
import string
import random

# global variables
suffix_map = {}  # map from prefixes to a list of suffixes
prefix = ()      # current tuple of words

def process_file(filename, order=2):
    """Reads a file and performs Markov analysis.
    filename: string
    order: integer number of words in the prefix
    returns: map from prefix to list of possible suffixes.
    """
    fp = open(filename)
    skip_gutenberg_header(fp)
    for line in fp:
        for word in line.rstrip().split():
            process_word(word, order)

def skip_gutenberg_header(fp):
    """Reads from fp until it finds the line that ends the header.
    fp: open file object
    """
    for line in fp:
        if line.startswith('*END*THE SMALL PRINT!'):
            break

def process_word(word, order=2):
    """Processes each word.
    word: string
    order: integer
    During the first few iterations, all we do is store up the words;
    after that we start adding entries to the dictionary.
    """
    global prefix
    if len(prefix) < order:
        prefix += (word,)
        return
    try:
        suffix_map[prefix].append(word)
    except KeyError:
        # if there is no entry for this prefix, make one
        suffix_map[prefix] = [word]
    prefix = shift(prefix, word)

def random_text(n=100):
    """Generates random words from the analyzed text.
    Starts with a random prefix from the dictionary.
    n: number of words to generate
    """
    # choose a random prefix (not weighted by frequency)
    start = random.choice(list(suffix_map.keys()))
    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes == None:
            # if the start isn't in map, we got to the end of the
            # original text, so we have to start again.
            random_text(n-i)
            return
        # choose a random suffix
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)

def shift(t, word):
    """Forms a new tuple by removing the head and adding word to the tail.
    t: tuple of strings
    word: string
    Returns: tuple of strings
    """
    return t[1:] + (word,)

def main(script, filename='emma.txt', n=100, order=2):
    try:
        n = int(n)
        order = int(order)
    except ValueError:
        print('Usage: %s filename [# of words] [prefix length]' % script)
    else:
        process_file(filename, order)
        random_text(n)
        print()

if __name__ == '__main__':
    main(*sys.argv)
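For reference, a minimal sketch of invoking it directly from Python rather than from the command line (the script name 'markov.py' is illustrative; emma.txt is assumed to be in the working directory):

# equivalent to running: python markov.py emma.txt 100 2
main('markov.py', 'emma.txt', 100, 2)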
