I am trying to print the word and line number(s) where the word occurs in the file in Python. Currently I am getting the correct numbers for second word, but the first word I look up does not print the right line numbers. I must iterate through infile, use a dictionary to store the line numbers, remove new line chars, remove any punctuation & skip over blank lines when pulling the number. I need to add a value that is actually a list, so that I may add the line numbers to the list if the word is contained on multiple lines.
Adjusted code:
# Question's attempt at printing each looked-up word with the lines it occurs on.
# NOTE(review): the parameters are named f and wordf, but the body uses filename,
# split_line and word_list, none of which are defined anywhere in this snippet.
def index(f,wordf):
infile = open(filename, 'r')
dct = {}
# count is incremented once per line read, so it appears to serve as the line number.
count = 0
for line in infile:
count += 1
newLine = line.replace('\n', ' ')
# A line that held only a newline becomes a single space and is skipped.
if newLine == ' ':
continue
for word in wordf:
if word in split_line:
if word in dct:
# NOTE(review): this adds 1 to the stored value; the line number in count is never stored.
dct[word] += 1
else:
dct[word] = 1
for word in word_list:
print('{:12} {},'.format(word,dct[word]))
infile.close()
Current Output:
>>> index('leaves.txt',['cedars','countenance'])
pines [9469, 9835, 10848, 10883],
counter [792, 2092, 2374],
Desired output:
>>> index2('f.txt',['pines','counter','venison'])
pines [530, 9469, 9835, 10848, 10883]
counter [792, 2092, 2374]
There is some ambiguity in how your file is set up, but I think I understand it.
Try this:
import numpy as np # add this import
...
for word in word_f:
if word in split_line:
np_array = np.array(split_line)
item_index_list = np.where(np_array == word)
dct[word] = item_index_list # note, you might want the 'index + 1' instead of the 'index'
for word in word_f:
print('{:12} {},'.format(word,dct[word]))
...
btw, as far as I can tell, you're not using your 'increment' variable.
I think that'll work, let me know if it doesn't and I'll fix it
per request, I made an additional answer (that I think works) without importing another library
def index2(f, word_f):
    """Print, for every word in word_f, the positions at which it occurs in file f.

    Each non-blank line is passed through removePunctuation() and split on
    whitespace. A position is the zero-based index of the word within its own
    line (it is not a line number). Positions from all lines are collected in
    file order and one row per word is printed as
    '<word padded to 12 columns> [positions],'.

    Args:
        f: Path of the text file to scan.
        word_f: Sequence of words to look up; it is iterated once per line
            and once more for printing.

    Raises:
        KeyError: If a word in word_f never occurs in the file, because only
            words that were found get an entry in the result dictionary.
    """
    dct = {}
    # The context manager closes the file even when an exception is raised
    # part-way through the scan.
    with open(f, 'r') as infile:
        for line in infile:
            newLine = line.replace('\n', ' ')
            # A line that held only a newline is now a single space: skip it.
            if newLine == ' ':
                continue
            split_line = removePunctuation(newLine).split()
            for word in word_f:
                for count, word2 in enumerate(split_line):
                    if word2 == word:
                        # setdefault creates the list on first sight of the word.
                        dct.setdefault(word, []).append(count)
    for word in word_f:
        print('{:12} {},'.format(word, dct[word]))
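Do be aware, I don't think this code will handle the case where the words passed in are not in the file. I'm not positive about the file that you're reading from, so I can't be sure, but since only words that were found get a key in dct, I think the final print loop will raise a KeyError (Python raises an exception here rather than seg faulting) if you pass in a word that doesn't exist in the file.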
Note: I took this code from my other post to see if it works, and it seems that it does
def index2():
    """Demonstrate the word-position index on four hard-coded lines.

    For each word in the sample word list, collect the zero-based position of
    every occurrence within its line, then print one row per word with the
    positions joined by ', '.
    """
    word_list = ["work", "many", "lots", "words"]
    infile = [
        "lots of words",
        "many many work words",
        "how come this picture lots work",
        "poem poem more words that rhyme",
    ]
    positions = {}
    for raw in infile:
        # The sample lines contain no newlines, so this is a no-op here.
        text = raw.replace('\n', ' ')
        if text == ' ':
            continue
        # Punctuation is deliberately left alone in this demo.
        tokens = text.split()
        for target in word_list:
            for offset, token in enumerate(tokens):
                if token == target:
                    positions.setdefault(target, []).append(offset)
    for target in word_list:
        # Comma-separated list without a trailing comma.
        print('{:12} {}'.format(target, ", ".join(map(str, positions[target]))))
def main():
    """Entry point: run the index2 demonstration."""
    index2()


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
and the output:
work 2, 5
many 0, 1
lots 0, 4
words 2, 3, 3
and the explanation:
infile = [
"lots of words", # lots at index 0, words at index 2
"many many work words", # many at index 0, many at index 1, work at index 2, words at index 3
"how come this picture lots work", # lots at index 4, work at index 5
"poem poem more words that rhyme" # words at index 3
]
when they get appended in that order, they get the correct word placement position
My biggest error was that I was not properly adding the line number to the counter. I completely used the wrong call, and did nothing to increment the line number as the word was found in the file. The proper format was dct[word] += [count] not dct[word] += 1
def index(filename, word_list):
    """Print each word of word_list with the line numbers on which it occurs.

    Lines are numbered from 1 and blank lines still consume a number. Every
    non-blank line is passed through removePunctuation() and split on
    whitespace; a line number is recorded once per line that contains the
    word, however many times the word appears on it.

    Args:
        filename: Path of the text file to scan.
        word_list: Sequence of words to look up.

    Words that never occur in the file are skipped in the output instead of
    raising KeyError.
    """
    dct = {}
    # The context manager closes the file even if scanning raises.
    with open(filename, 'r') as infile:
        for count, line in enumerate(infile, start=1):
            newLine = line.replace('\n', ' ')
            # A line that held only a newline is now a single space: skip it.
            if newLine == ' ':
                continue
            split_line = removePunctuation(newLine).split()
            for word in word_list:
                if word in split_line:
                    dct.setdefault(word, []).append(count)
    for word in word_list:
        if word in dct:
            print('{:12} {}'.format(word, dct[word]))
Related
I am making a program, that reads a .txt file and prints how many times a certain word has been used:
# Count how often each whitespace-separated token occurs in the file and print the counts, highest first.
filename = 'for_python.txt'
with open(filename) as file:
contents = file.read().split()
# NOTE(review): the name dict shadows the built-in dict type for the rest of the script.
dict = {}
for word in contents:
# Tokens are used verbatim as keys, so attached punctuation makes "word," a different key from "word".
if word not in dict:
dict[word] = 1
else:
dict[word] += 1
# dict is rebound here from the mapping to a list of (word, count) pairs sorted by count, descending.
dict = sorted(dict.items(), key=lambda x: x[1], reverse=True)
for i in dict:
print(i[0], i[1])
It works, but it treats words with commas as different words. Is there an easy and efficient way to solve this?
This is what I did.
# Count how often each word occurs in the file, ignoring punctuation, and
# print the words with their counts, most frequent first.
filename = 'for_python.txt'
with open(filename) as file:
    contents = file.read().splitlines()

# Named `counts` so the built-in dict type is not shadowed.
counts = {}
for sentence in contents:
    # split() without an argument collapses runs of whitespace, so no empty
    # tokens are produced by consecutive spaces.
    for word in sentence.split():
        # Keep only letters and digits so "word," and "word" share one key.
        cleaned_word = "".join(
            character for character in word if character.isalnum()
        )
        # A token made purely of punctuation cleans down to nothing: skip it.
        if not cleaned_word:
            continue
        counts[cleaned_word] = counts.get(cleaned_word, 0) + 1

for word, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print(word, count)
I cannot seem to find where I have gone wrong with this program. It simply takes a file and reverses the text in the file, but for some reason each separate sentence prints on a new line, and I need them all to print on the same line.
Here is my code for reference:
def read_file(filename):
    """Read a text file and split its contents into sentences.

    Words are taken from every line (blank lines contribute nothing) and
    joined with single spaces. A sentence ends at a word whose last character
    is '.', '?' or '!', so a sentence may span several lines. Words that
    follow the last terminator in the file are discarded.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of sentence strings, or None if the file cannot be opened,
        read or decoded.
    """
    sentences = []
    pending = []  # words of the sentence currently being assembled
    try:
        with open(filename, 'r') as infile:
            for line in infile:
                for word in line.split():
                    pending.append(word)
                    if word[-1] in '.?!':
                        sentences.append(' '.join(pending))
                        pending = []
    # Only I/O and decoding problems mean "unreadable"; anything else is a
    # programming error and should surface instead of being swallowed.
    except (OSError, ValueError):
        return None
    return sentences
def reverse_line(sentence):
    """Return sentence with its word order reversed.

    The last character is treated as the terminating punctuation and is kept
    at the end. The remaining text is lower-cased, the original last word is
    title-cased (it becomes the first word of the result) and the words are
    joined with single spaces in reverse order.

    Args:
        sentence: A sentence whose final character is its punctuation mark.

    Returns:
        The reversed sentence. An empty string is returned unchanged, and a
        sentence that contains no words returns just its final character.
    """
    if not sentence:
        return sentence
    punctuation = sentence[-1]
    words = sentence[:-1].lower().split()
    # Nothing but the punctuation mark: there is no word to capitalise.
    if not words:
        return punctuation
    words[-1] = words[-1].title()
    return ' '.join(reversed(words)) + punctuation
def main():
    """Prompt for a file path and print each of its sentences reversed.

    Each reversed sentence is printed with a separate print() call. If the
    file cannot be read, an error message is printed instead.
    """
    filepath = input('File: ')
    sentences = read_file(filepath)
    if sentences is not None:
        for sentence in sentences:
            reverse_sentence = reverse_line(sentence)
            print(reverse_sentence)
    else:
        print('Unable to read data from file: {}'.format(filepath))


main()
You can use the end keyword argument:
print(reverse_sentence, end=' ')
The default value of end is '\n', so print normally writes a newline after its output; passing end=' ' writes a space instead and keeps the sentences on one line.
https://docs.python.org/3.3/library/functions.html#print
I have a text file like this:
This is just
an example of
a textfile
and would like to find the sum of all words that don't contain an "e". This sum is to be printed for every line, and should be the total sum of words in that line.
Currently I have this:
# Question's attempt: add up the lengths of the words that do not contain "e".
# NOTE(review): sys is used but not imported in this snippet — presumably imported elsewhere.
with open(sys.argv[1], "r") as f:
# count is initialised once here and never reset below, so it keeps growing across lines.
count = 0
for line in f:
words = line.split()
for word in words:
if "e" not in word:
# Adds 1 per character, i.e. the length of the word.
for char in word:
count += 1
print(count)
and the output I get is:
4
6
10
12
14
15
when it should be:
10
4
1
You can use the len builtin to get the length of a string. The reason you're getting larger numbers than you expect is that you're not resetting the count variable for each line, and also you're printing after every word, not each line.
# For every line of the file named on the command line, print the combined
# length of the words that do not contain an "e".
with open(sys.argv[1], "r") as f:
    for line in f:
        # Start from zero on every line so each line gets its own total.
        count = 0
        for word in line.split():
            if "e" in word:
                continue
            count += len(word)
        print(count)
You can write this more compactly as
with open(sys.argv[1], "r") as f:
    for line in f:
        # Lengths of the words on this line that contain no 'e'.
        lengths = (len(word) for word in line.split() if 'e' not in word)
        print(sum(lengths))
Hello guys, I am still an amateur in Python and was hoping someone could help me with this problem.
Write a function called longest which will take a string of space separated words and will return the longest one.
For example:
longest("This is Fabulous") => "Fabulous"
longest("F") => "F"
class Test(unittest.TestCase):
    """Unit tests for longest()."""

    def test_longest_word(self):
        """The longest of several space-separated words is returned."""
        self.assertEqual('Fabulous', longest("This is Fabulous"))

    def test_one_word(self):
        """A sentence consisting of one word yields that word."""
        self.assertEqual("This", longest("This"))
This is my solution so far;
# Question's attempt: return the longest word in word_list.
def find_longest_word(word_list):
longest_word = ''
longest_size = 0
for word in word_list:
# NOTE(review): this if statement has no trailing ':', which is what triggers the SyntaxError.
if (len(word) > longest_size)
longest_word = word
longest_size = len(word)
return longest_word
words = input('Please enter a few words')
word_list = words.split()
# NOTE(review): the return value is discarded here, so nothing is shown to the user.
find_longest_word(word_list)
Unfortunately am getting this error when I try to test the code
"File "", line 6
if (len(word) > longest_size)
^
SyntaxError: invalid syntax
Any help would be highly appreciated.
def find_longest_word(myText):
    """Return the first longest word of myText.

    The text is split on single space characters; when several words share
    the maximum length, the one that appears first is returned.
    """
    best = None
    for candidate in myText.split(' '):
        # Strictly longer only, so the earliest word wins a tie.
        if best is None or len(candidate) > len(best):
            best = candidate
    return best


text = "This is Fabulous"
print(find_longest_word(text))  # Fabulous
EDIT: The solution above works if you want one of the longest words and not all of them. For example if my text is "Hey ! How are you ?" It will return just "Hey". If you want it to return ["Hey", "How", "are", "you"]
Better use this.
def find_longest_word(myText):
    """Return every word of myText that has the maximum length.

    The text is split on single space characters and the matching words are
    returned in their original order.
    """
    words = myText.split(' ')
    longest = max(len(word) for word in words)
    return list(filter(lambda word: len(word) == longest, words))


print(find_longest_word("Hey ! How are you ?"))  # ['Hey', 'How', 'are', 'you']
See also, this question
You are missing the : at the end of the if statement
Use the updated code below, I fixed your indentation issues too.
def find_longest_word(word_list):
    """Return the first longest string in word_list.

    An empty word_list yields the empty string.
    """
    # max() keeps the earliest item among equals, matching a strict '>' scan.
    return max(word_list, key=len, default='')
# Ask for a line of text, split it on whitespace and show its longest word.
words = input('Please enter a few words')
word_list = words.split()
# Print the result so that running the script produces visible output.
print(find_longest_word(word_list))
Code sample is incorrect. I get the following message if I try to output:
Error on line 15: print(longest_word("chair", "couch", "table"))
TypeError: longest_word() takes 1 positional argument but 3 were given
So the code looks like this:
# Follow-up attempt in which the function is named longest_word.
def longest_word(word_list):
# NOTE(review): this local variable has the same name as the function and shadows it inside the body.
longest_word = ''
longest_size = 0
for word in word_list:
if (len(word) > longest_size):
longest_word = word
longest_size = len(word)
return longest_word
# NOTE(review): input() accepts at most one prompt argument, so passing three strings raises a TypeError.
words = input("chair", "couch", "table")
word_list = words.split()
# NOTE(review): this calls find_longest_word, but the function defined above is named longest_word.
find_longest_word(word_list)
# longest word in a text
# Print every distinct longest word of the entered text, one per line.
text = input("Enter your text")
# Map each space-separated word to its length; using a dict also collapses
# repeated words into a single entry.
lengths = {}
for token in text.split(" "):
    lengths[token] = len(token)
longest = max(lengths.values())
for token, size in lengths.items():
    if size == longest:
        print(token)
Why are number and totalLength still 0? What am I doing wrong? They should have changed because of the "for line in lines" loop.
# Builds cleantext from the lower-cased characters of s that appear in alphabet.
def cleanedup(s):
alphabet= 'abcdefghijklmnopqrstuvwxyz'
cleantext = ''
for character in s.lower():
if character in alphabet:
cleantext += character
else:
# NOTE(review): this assigns (=) instead of appending (+=), so whatever was accumulated so far is replaced by a single space.
cleantext = ' '
return cleantext
import shelve
# NOTE(review): assumes the 'books' shelf has a 'Pride and Prejudice' entry holding an iterable of text lines — confirm.
shelf = shelve.open('books')
lines = shelf['Pride and Prejudice']
shelf.close()
# number counts the words seen; totalLength sums their lengths.
number = 0
totalLength = 0
for line in lines:
for word in cleanedup(line).split():
number += 1
totalLength += len(word)
print(totalLength, number)
This is the main problem with your script:
for character in s.lower():
...
else:
cleantext = ' '
I'm not sure this is the right position for the else, in your case you've put it after the for loop, so cleantext will be reset each time you run this function because your for loop has no break statement inside. More info on for ... else ...
Although that might not be what you want (question is a little unclear), the following code works:
def cleanedup(s):
    """Return s lower-cased with every character outside a-z replaced by a space.

    Replacing (rather than dropping) the other characters keeps the words
    separated, so the result can be split into words afterwards.

    Args:
        s: The text to clean.

    Returns:
        A string of the same length as s.lower() that contains only the
        letters a-z and spaces.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    return ''.join(
        character if character in alphabet else ' '
        for character in s.lower()
    )


lines = ['lorem ipsum dolor sin amet', 'foo bar']
# number counts the words seen; totalLength sums their lengths.
number = 0
totalLength = 0
for line in lines:
    for word in cleanedup(line).split():
        number += 1
        totalLength += len(word)
print(totalLength, number)
# Output:
# 28 7   (28 = total number of letters, 7 = total number of words)
PS: Next time, provide a more concise example that demonstrates the problem, instead of using an external file.