File operation using numpy - python-3.x

I am trying to delete a phrase from a text file using numpy. I have tried using nums = [] with nums.append(num1), and opening the file with 'a' instead of 'w' to write it back. With append mode the phrase isn't deleted; with write mode the first run deletes the phrase, the second run deletes the second line (which is not the phrase), and the third run empties the file.
import numpy as np

phrase = 'the dog barked'
num = 0
with open("yourfile.txt") as myFile:
    for num1, line in enumerate(myFile, 1):
        if phrase in line:
            num += num1
        else:
            break
a = np.genfromtxt("yourfile.txt", dtype=None, delimiter="\n", encoding=None)
with open('yourfile.txt', 'w') as f:
    for el in np.delete(a, (num), axis=0):
        f.write(str(el) + '\n')
'''
the bird flew
the dog barked
the cat meowed
'''

I think you can still use nums.append(num1) with 'w' mode. The issue, I think, is that you enumerated myFile's lines starting from 1 instead of from 0, while the numpy array (and np.delete) expects 0-based indices. Changing enumerate(myFile, 1) to enumerate(myFile, 0) seems to fix the issue:
import numpy as np

phrase = 'the dog barked'
nums = []
with open("yourfile.txt") as myFile:
    for num1, line in enumerate(myFile, 0):
        if phrase in line:
            nums.append(num1)
a = np.genfromtxt("yourfile.txt", dtype=None, delimiter="\n", encoding=None)
with open('yourfile.txt', 'w') as f:
    for el in np.delete(a, nums, axis=0):
        f.write(str(el) + '\n')
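To see the off-by-one concretely, here is a small illustration (not from the original post) using an in-memory array with the same three lines:

import numpy as np

a = np.array(['the bird flew', 'the dog barked', 'the cat meowed'])
print(np.delete(a, [1]))  # 0-based: removes 'the dog barked'
print(np.delete(a, [2]))  # a 1-based line number would wrongly remove 'the cat meowed'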

Related

Iterate N items at a time on a generator with single yield

How do I do that? islice() returns n items at a time, but I can't figure out how to keep iterating with it. Right now I do something like this:
# -*- coding: utf-8 -*-
'''
print 3 lines at a time.
'''
def myread(filename):
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line in f:
            yield line.strip()

filename = 'test.txt'
temp = []
for res_a in myread(filename):
    temp.append(res_a)
    if len(temp) == 3:
        print(temp)
        temp = []
print(temp)
Note that I don't know how big my text file is.
You can use itertools.islice and the two-argument form of iter, e.g.:
from itertools import islice

with open('file') as fin:
    # gen-comp yielding stripped lines
    lines = (line.strip() for line in fin)
    # create a list of at most 3 lines from the file's current position,
    # and use an empty list as a sentinel value for when to stop (no more lines)
    for three in iter(lambda: list(islice(lines, 3)), []):
        print(three)
As a function:
def myread(filename):
    with open(filename) as fin:
        lines = (line.strip() for line in fin)
        yield from iter(lambda: list(islice(lines, 3)), [])
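Hypothetical usage, assuming a file named 'test.txt' exists:

for chunk in myread('test.txt'):
    print(chunk)  # a list of up to 3 stripped lines; the last chunk may be shorter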
islice(itr, n) will only return an iterator that runs until it reaches the nth element of itr. You would have to keep rebuilding the islice iterator for every group of n elements you want to return. You might want to try the grouper recipe from the itertools documentation, which avoids this rebuilding:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
To complete the example, you can filter out the fillvalues added to the output groups to get it to replicate the code provided by the OP:
for grp in grouper(myread(filename), 3):
    trimmed_grp = [line for line in grp if line is not None]
    print(trimmed_grp)
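A quick sanity check with hypothetical data (not from the original post): grouping five lines in threes pads the last group with the fillvalue, which the filter above strips out:

lines = ['a', 'b', 'c', 'd', 'e']
for grp in grouper(lines, 3):
    print([line for line in grp if line is not None])
# ['a', 'b', 'c']
# ['d', 'e']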

Program doesn't stop iterating through list

from bs4 import BeautifulSoup

def gameinfo():
    lines = []
    html_doc = 'STATIC.html'
    soup = BeautifulSoup(open(html_doc), 'html.parser')
    for mytable in soup.find_all('table'):
        for trs in mytable.find_all('tr'):
            tds = trs.find_all('td')
            row1 = [elem.text.strip() for elem in tds]
            row = str(row1)
            sausage = False
            with open("FIRE.txt", "r+") as file:
                for line in file:
                    if row+"\n" in line:
                        break
                    else:
                        if row.split(",")[:4] == line.split(",")[:4]:
                            print(row)
                            print(line)
                            file.write(line.replace(line+"\n", row+"\n"))
                            print('Already exists with diff date')
                            sausage = True
                            break
                if sausage == False:
                    print(row.split(",")[:4])
                    print(line.split(",")[:4])
                    print(row)
                    print(line)
                    file.write(row+"\n")
                    print('appended')

while True:
    gameinfo()
    gameinfo()
This program is supposed to keep searching the text file FIRE.txt for lines that match the variable row. When I run it, it mostly works, but the part of the code that is supposed to check whether the first four elements of the list are the same, and then skip the appending section below, doesn't work. When the program detects that the first 4 elements of a string turned into a list (row) match the first 4 elements of another string in the text file, it should overwrite the string in the text file. However, when it detects a list with the same first 4 elements, it loops forever and never breaks out.
My string looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '175']
and I compare it to a list that looks like this:
['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '170']
When it sees that the first 4 elements of the lists are the same, it should overwrite the one that was already in the text file, but instead it loops.
Question has changed; most recent version last.
Methinks you want to use the csv module. If you iterate through a csv.reader object instead of the file object directly, you'll get each line as a list.
Example:
import csv

row = ["this", "is", "an", "example"]
with open("FIRE.txt", "r+") as file:
    reader = csv.reader(file)
    for line in reader:
        if line == row:   # each csv row is already a list of fields
            break
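As a rough sketch (not part of the answer) of the "same listing, different date" check from the question, assuming FIRE.txt holds one comma-separated row of five fields per line, the first four fields can be compared directly as list slices:

import csv

new_row = ['Infield Upper Deck Reserved 529', '$17.29', '4', '2', '175']  # hypothetical scraped row
with open("FIRE.txt") as file:
    for line in csv.reader(file):
        if line == new_row:
            print('exact match, nothing to do')
            break
        if line[:4] == new_row[:4]:
            print('same listing, different date:', line[4], '->', new_row[4])
            break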
Alternatively, if you don't need to use this in anything other than Python, you could pickle a collections.OrderedDict with a tuple of the first four items as the keys:
import collections
import pickle
import contextlib

@contextlib.contextmanager
def mutable_pickle(path, default=object):
    try:
        with open(path, "rb") as f:
            obj = pickle.load(f)
    except (IOError, EOFError):
        obj = default()
    try:
        yield obj
    finally:
        with open(path, "wb") as f:
            pickle.dump(obj, f)

with mutable_pickle("fire.bin", default=collections.OrderedDict) as d:
    for row in rows:            # rows: the rows scraped from the HTML table
        d[tuple(row[:4])] = row
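A hypothetical later read-back (not in the original answer), showing how the tuple key lets you check for an existing listing:

with mutable_pickle("fire.bin", default=collections.OrderedDict) as d:
    key = ('Infield Upper Deck Reserved 529', '$17.29', '4', '2')
    if key in d:
        print('already recorded:', d[key])  # the full stored row; the latest write wins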

Markov analysis - Return and recursion role

I am working on the solution to the Markov analysis exercise in Think Python, but I do not understand the role of return in the block of code below.
As far as I know, when the code reaches return the function exits immediately, but isn't it unnecessary in this case? There is a recursive call, random_text(n-i), before the code reaches the return statement, so won't the function only exit once the recursion is finished, which means when the for loop is over? The question may seem stupid, but I am a newbie in Python and recursion is really confusing to me. I tried removing return and it still ran fine.
def random_text(n=100):
    start = random.choice(list(suffix_map.keys()))
    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes == None:
            # if the start isn't in map, we got to the end of the
            # original text, so we have to start again.
            random_text(n-i)
            return
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)
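Not part of the book's code, but a minimal sketch of the control flow in question: after a recursive call returns, execution continues in the caller, so the return is what stops the caller's loop from carrying on.

def countdown(n):
    if n == 0:
        print('done')
        return
    countdown(n - 1)
    # this line runs only because there is no `return` right after the
    # recursive call above; in random_text, the `return` prevents exactly this
    print('back in the caller, n =', n)

countdown(3)
# done
# back in the caller, n = 1
# back in the caller, n = 2
# back in the caller, n = 3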
The full code is below so you can understand what each function does.
from __future__ import print_function, division

import os
os.chdir(r"C:\Users\Hoang-Ngoc.Anh\Documents\WinPython-64bit 3.4.4.2\notebooks\docs")

import sys
import string
import random

# global variables
suffix_map = {}        # map from prefixes to a list of suffixes
prefix = ()            # current tuple of words


def process_file(filename, order=2):
    """Reads a file and performs Markov analysis.

    filename: string
    order: integer number of words in the prefix

    returns: map from prefix to list of possible suffixes.
    """
    fp = open(filename)
    skip_gutenberg_header(fp)

    for line in fp:
        for word in line.rstrip().split():
            process_word(word, order)


def skip_gutenberg_header(fp):
    """Reads from fp until it finds the line that ends the header.

    fp: open file object
    """
    for line in fp:
        if line.startswith('*END*THE SMALL PRINT!'):
            break


def process_word(word, order=2):
    """Processes each word.

    word: string
    order: integer

    During the first few iterations, all we do is store up the words;
    after that we start adding entries to the dictionary.
    """
    global prefix
    if len(prefix) < order:
        prefix += (word,)
        return

    try:
        suffix_map[prefix].append(word)
    except KeyError:
        # if there is no entry for this prefix, make one
        suffix_map[prefix] = [word]

    prefix = shift(prefix, word)


def random_text(n=100):
    """Generates random words from the analyzed text.

    Starts with a random prefix from the dictionary.

    n: number of words to generate
    """
    # choose a random prefix (not weighted by frequency)
    start = random.choice(list(suffix_map.keys()))

    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes == None:
            # if the start isn't in map, we got to the end of the
            # original text, so we have to start again.
            random_text(n-i)
            return

        # choose a random suffix
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)


def shift(t, word):
    """Forms a new tuple by removing the head and adding word to the tail.

    t: tuple of strings
    word: string

    Returns: tuple of strings
    """
    return t[1:] + (word,)


def main(script, filename='emma.txt', n=100, order=2):
    try:
        n = int(n)
        order = int(order)
    except ValueError:
        print('Usage: %d filename [# of words] [prefix length]' % script)
    else:
        process_file(filename, order)
        random_text(n)
        print()


if __name__ == '__main__':
    main(*sys.argv)

Creating a dictionary to count the number of occurrences of Sequence IDs

I'm trying to write a function to count the number of occurrences of each sequence ID in this file (it's a sample BLAST file).
The picture above is the input file I'm dealing with.
def count_seq(input):
    dic1={}
    count=0
    for line in input:
        if line.startswith('#'):
            continue
        if line.find('hits found'):
            line=line.split('\t')
            if line[1] in dic1:
                dic1[line]+=1
            else:
                dic1[line]=1
    return dic1
Above is my code, which when called just returns empty braces {}.
I'm trying to count how many times each sequence ID (the second element on each of the last 13 lines) occurs, e.g. FO203510.1 occurs 4 times.
Any help would be appreciated immensely, thanks!
Maybe this is what you're after:
def count_seq(input_file):
    dic1={}
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line.startswith('#'):
                line = line.split()
                seq_id = line[1]
                if not seq_id in dic1:
                    dic1[seq_id] = 1
                else:
                    dic1[seq_id] += 1
    return dic1

print(count_seq("blast_file"))
This is a fitting case for collections.defaultdict. Let f be the file object. Assuming the sequences are in the second column, it's only a few lines of code as shown.
from collections import defaultdict

d = defaultdict(int)
seqs = (line.split()[1] for line in f if not line.strip().startswith("#"))
for seq in seqs:
    d[seq] += 1
See if it works!
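Hypothetical end-to-end usage (file name assumed), tying the snippet to an open file object:

from collections import defaultdict

d = defaultdict(int)
with open('blast_file') as f:
    seqs = (line.split()[1] for line in f if not line.strip().startswith("#"))
    for seq in seqs:
        d[seq] += 1
print(dict(d))  # e.g. {'FO203510.1': 4, ...}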

How can I simplify and format this function?

So I have this messy code where I wanted to get every word from frankenstein.txt, sort them alphabetically, eliminate one- and two-letter words, and write them into a new file.
def Dictionary():
    d = []
    count = 0
    bad_char = '~!##$%^&*()_+{}|:"<>?\`1234567890-=[]\;\',./ '
    replace = ' '*len(bad_char)
    table = str.maketrans(bad_char, replace)
    infile = open('frankenstein.txt', 'r')
    for line in infile:
        line = line.translate(table)
        for word in line.split():
            if len(word) > 2:
                d.append(word)
                count += 1
    infile.close()
    file = open('dictionary.txt', 'w')
    file.write(str(set(d)))
    file.close()

Dictionary()
How can I simplify it and make it more readable? Also, how can I make the words write vertically (one per line) in the new file, instead of as one horizontal list, e.g.:
abbey
abhorred
about
etc....
A few improvements below:
from string import digits, punctuation

def create_dictionary():
    words = set()
    bad_char = digits + punctuation + '...'  # may need more characters
    replace = ' ' * len(bad_char)
    table = str.maketrans(bad_char, replace)
    with open('frankenstein.txt') as infile:
        for line in infile:
            line = line.strip().translate(table)
            for word in line.split():
                if len(word) > 2:
                    words.add(word)
    with open('dictionary.txt', 'w') as outfile:
        outfile.writelines(word + '\n' for word in sorted(words))  # writelines adds no newlines itself
A few notes:
follow the style guide (PEP 8);
string contains constants you can use to provide the "bad characters" (see the quick check after these notes);
you never used count (which was just len(d) anyway);
use the with context manager for file handling; and
using a set from the start prevents duplicates, but they aren't ordered (hence sorted).
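For reference, a quick look at those constants (values as defined in CPython's string module):

import string

print(string.digits)       # 0123456789
print(string.punctuation)  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~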
Using the re module:

import re

words = set()
with open('frankenstein.txt') as infile:
    for line in infile:
        # sets have no extend(); update() adds each qualifying word
        words.update(x for x in re.split(r'[^A-Za-z]+', line) if len(x) > 2)
with open('dictionary.txt', 'w') as outfile:
    outfile.writelines(word + '\n' for word in sorted(words))
In r'[^A-Za-z]+' in re.split, replace 'A-Za-z' with the set of characters you want to allow in the words written to dictionary.txt.
