Search different words between two files - python-3.x

I have two files (txt), for example FILE A and FILE B, and I want to
find all the words in FILE A that do not exist in FILE B,
for example if file A is :
HIS HOUSE IS VERY SMALL
and file B is
HIS DOG IS VERY NICE
I want to write a program that shows me that HOUSE is not
in file B.
I thought of using the SPLIT command and looping over the file,
but since I do not know Python well, can anyone tell me
whether there is another command that can help?

Maybe there is a better solution, but this one below will solve your problem.
import re
def is_letter(s):
    """Return a truthy re.Match if `s` is a single ASCII letter, else None.

    The original pattern '[a-z]|[A-Z]$' anchored only the uppercase branch
    to end-of-string; a single anchored character class is equivalent for
    the single-character inputs this helper receives, and consistent.
    """
    return re.match(r'[A-Za-z]$', s)
def words_only(s):
    """Split free text into a list of uppercase words.

    Every run of non-letter characters is treated as a single separator.
    Returns [''] for input containing no letters (same as the original
    strip/split behaviour on an empty string).
    """
    # One regex pass replaces the original per-character loop, which rebuilt
    # the string on every non-letter (quadratic) and used the non-raw '\s+'
    # escape (a SyntaxWarning on modern Python).
    collapsed = re.sub(r'[^A-Za-z]+', ' ', s)
    return collapsed.strip().upper().split(' ')
# Report every word from file A that does not appear in file B.
# `with` closes the handles (the original left both files open).
with open('file_a.txt', 'r') as fa:
    file_a = words_only(fa.read())
with open('file_b.txt', 'r') as fb:
    file_b = words_only(fb.read())
# set membership is O(1) per word instead of O(n) against the list
words_in_b = set(file_b)
for x in file_a:
    if x not in words_in_b:
        print(x)
file_a.txt
HIS DOG IS VERY SMALL.
His wife is very nice.
file_b.txt
HIS DOG IS VERY NICE.
He is very ugly.

Related

Issue with text file importing and manipulation

I have a txt file with this in it:
GWashington 83
JAdams 86
What I need to do is read the file, add 5 to the numbers and save it to a new file.
# Output file for the adjusted scores (opened here, written to later).
newFile = open('scores2.txt', 'w')
# Read the input file, stripping the trailing newline from each line.
# NOTE(review): neither this handle nor newFile is ever closed; prefer
# `with open(...)` blocks.
stdLines = [line.strip() for line in open('class_scores.txt')]
# NOTE(review): the sample data ("GWashington 83") is space-separated, but
# this splits on commas — presumably ',' should be ' '; confirm the real
# file format.
scrSep = [line.split(',') for line in stdLines]
print(stdLines, scrSep)
def convert_numbers(s):
    """Convert a numeric string to int (when whole) or float; return
    non-numeric or empty input unchanged."""
    # empty string (or other falsy value) passes straight through
    if not s:
        return s
    try:
        as_float = float(s)
        as_int = int(as_float)
    except ValueError:
        # not parseable as a number — hand the original value back
        return s
    # prefer the int form when the value is a whole number
    return as_int if as_float == as_int else as_float
# convert_numbers works on a single string, but scrSep holds one LIST of
# fields per line — mapping whole rows into it would pass a list to float()
# and raise an uncaught TypeError.  Convert field-by-field inside each row.
g = [[convert_numbers(field) for field in row] for row in scrSep]
print(g)  # was print(s): a NameError — `s` exists only inside convert_numbers
print(scrSep)
Thank you in advance for your help.
What should happen with this is: it should open the file, separate the lines and separate the components, so that I can turn the numbers into ints and manipulate them. But strip and split are making it harder for the items to be accessed.
Never mind I fixed it, just had to call from deeper in the list.
So instead of list[x] I had to do list[x][y]
That uhh... slipped my mind.

How to Read Multiple Files in a Loop in Python and get count of matching words

I have two text files and 2 lists (FIRST_LIST, SCND_LIST), and I want to find, for each file, the count of words matching FIRST_LIST and SCND_LIST individually.
FIRST_LIST =
"accessorizes","accessorizing","accessorized","accessorize"
SCND_LIST=
"accessorize","accessorized","accessorizes","accessorizing"
text File1 contains:
This is a very good question, and you have received good answers which describe interesting topics accessorized accessorize.
text File2 contains:
is more applied,using accessorize accessorized,accessorizes,accessorizing
output
File1 first list count=2
File1 second list count=0
File2 first list count=0
File2 second list count=4
I have tried to achieve this functionality with the code below, but I am not able to get the expected output.
Any help is appreciated.
import os
import glob
# glob.glob already returns a list of matching names — no need for the
# original manual append loop that just copied it element by element.
files = glob.glob("*.txt")
# remove Punctuations
import re
def remove_punctuation(line):
    """Delete every character that is neither a word character nor whitespace."""
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', line)
# Accumulate the punctuation-free lines of every file; `with` closes each
# handle (the original leaked them), and the cleaned line is computed once
# instead of twice.  The dead commented-out append has been dropped.
two_files = []
for filename in files:
    with open(filename) as fh:
        for line in fh:
            cleaned = remove_punctuation(line)
            print(cleaned, end='')
            two_files.append(cleaned)
FIRST_LIST = "accessorizes","accessorizing","accessorized","accessorize"
SCND_LIST="accessorize","accessorized","accessorizes","accessorizing"
# Substring matching (`match in value`) over-counts: e.g. "accessorize" is a
# substring of "accessorizes".  Split each line and compare whole words.
c = []
for match in FIRST_LIST:
    if any(match in value.split() for value in two_files):
        print(match)
        c.append(match)
print(c)
d = []
for match in SCND_LIST:
    if any(match in value.split() for value in two_files):
        print(match)
        d.append(match)
print(d)
# (the bare `len(c)` / `len(d)` statements were no-ops and were removed;
# use print(len(c)) / print(len(d)) if the counts should be displayed)
Using Counter and some list comprehension is one of many different approaches to solve your problem.
I assume your sample output is wrong, since some words are part of both lists and both files but are not counted. In addition, I added a second line to the sample strings in order to show how this works with multi-line strings, which might be the typical contents of a given file.
io.StringIO objects emulate your files, but working with real files from your file system works exactly the same since both provide a file-like object or file-like interface:
import io
from collections import Counter
list_a = ["accessorizes", "accessorizing", "accessorized", "accessorize"]
list_b = ["accessorize", "accessorized", "accessorizes", "accessorizing"]
# added a second line to each string just for the sake
file_contents_a = 'This is a very good question, and you have received good answers which describe interesting topics accessorized accessorize.\nThis is the second line in file a'
file_contents_b = 'is more applied,using accessorize accessorized,accessorizes,accessorizing\nThis is the second line in file b'

def _count_lines(file_like):
    """Print, for each line of `file_like`, how many of its words occur in
    list_a and in list_b (duplicated occurrences counted via Counter)."""
    for line_number, line in enumerate(file_like.read().splitlines()):
        # normalize '.' and ',' to spaces before splitting into words
        words = line.replace('.', ' ').replace(',', ' ').split(' ')
        counts = Counter(words)
        in_list_a = sum(v for k, v in counts.items() if k in list_a)
        in_list_b = sum(v for k, v in counts.items() if k in list_b)
        print("Line {}".format(line_number))
        print("- in list a {}".format(in_list_a))
        print("- in list b {}".format(in_list_b))

# using io.StringIO to simulate a file input (--> file-like object)
# you should use `with open(filename) as ...` for real file input
# (the original duplicated this whole loop verbatim for each file;
#  the helper above runs it once per file instead)
_count_lines(io.StringIO(file_contents_a))
_count_lines(io.StringIO(file_contents_b))

# actually, your two lists are the same
lists_are_equal = sorted(list_a) == sorted(list_b)
print(lists_are_equal)

Python Text File Compare and Concatenate

I need help with concatenating two text files based on common strings.
My first txt file looks like this:
Hello abc
Wonders xyz
World abc
And my second txt file looks like this:
abc A
xyz B
abc C
I want my output file to be:
Hello abc A
Wonders xyz B
World abc C
My Code goes something like this:
a = open("file1","r")
b = open("file2","r")
c = open("output","w")
for line in b:
chk = line.split(" ")
for line_new in a:
chk_new = line_new.split(" ")
if (chk_new[0] == chk[1]):
c.write(chk[0])
c.write(chk_new[0])
c.write(chk_new[1])
But when I use this code, I get the output as:
Hello abc A
Wonders xyz B
Hello abc C
Line 3 mismatch occurs. What should I do to get it the correct way?
I'm afraid you are mistaken, your code does not produce the output you say it does.
Partly because a file can only be read once, with the exception being if you move the read cursor back to the beginning of the file (file.seek(0), docs).
Partly because the second element of a line in the first file ends with a newline character, thus you are comparing e.g. "abc" with "abc\n" etc. which will never be true.
Hence the output file will be completely empty.
So how do you solve the problem? Reading a file more than once seems overly complicated, don't do that. I suggest you do something along the lines of:
# open all the files simultaneously
with open('file1', 'r') as f1, open('file2', 'r') as f2, \
        open('output', 'w') as outf:
    # walk the first file line by line, reading the second in lockstep
    for raw_line in f1:
        first_line = raw_line.rstrip()
        # an empty line means there's nothing more to merge
        if not first_line:
            break
        first_tokens = first_line.split(' ')
        # no need to strip the line from the second file
        second_tokens = f2.readline().split(' ')
        # join when the key (2nd token of file1, 1st token of file2) matches
        if first_tokens[1] == second_tokens[0]:
            outf.write(first_line + ' ' + second_tokens[1])
I've tested it on your example input and it produces the correct output (where file1 is the first example file and file2 is the second). If we talk about huge files (millions of lines), this version will be considerably faster than Aaron's. In other cases the performance difference will be negligible.
The open streams aren't safe and you can only read a file once. Do this:
# Read both files fully up front (a file object can only be iterated once),
# splitting each stripped line into its space-separated tokens.
aLines = []
bLines = []
with open("file1","r") as a:
    for line in a:
        aLines.append(line.strip().split(" "))
with open("file2","r") as b:
    for line in b:
        bLines.append(line.strip().split(" "))
with open("output","w") as c:
    # zip pairs the files line-by-line and stops at the shorter one, so the
    # original reverse()/pop() can no longer raise IndexError when file2 is
    # shorter than file1.
    for chk, chk_new in zip(aLines, bLines):
        if chk_new[0] == chk[1]:
            # the original three bare writes produced everything concatenated
            # with no separators; emit "word key value" per line instead
            c.write(chk[0] + " " + chk_new[0] + " " + chk_new[1] + "\n")

Appending csv rows while adding characters from lines from file

I have a .csv file that I am creating, and it is being created by iterating through an input file. My current code for the specific column this question is about looks like this:
# Command-line arguments: input path, output path, and a third data file.
input_filename = sys.argv[1]
output_filename = sys.argv[2]
# Reads the whole third file into one string (the handle is never closed).
f = open(sys.argv[3]).read()
# NOTE(review): `list` here is the builtin type, so this call raises a
# TypeError; presumably a list object was created earlier (outside this
# excerpt) under a name shadowing the builtin — verify and rename it.
# NOTE(review): f[0:2] always takes the first two characters of the whole
# file, not of the current line — this is exactly what the author asks about.
list.append(("A B", f[0:2], "numeric", "A B"))
For the portion of the code 'f[0:2]', rather than having it append the first few characters of f as a whole file (which obviously makes it append the first few characters every time it is appended), I want it to append [0:2] for the next line in f every time the loop is executed. I have tried:
list.append(("A B", f.line[0:2], "numeric", "A B"))
and other similar approaches, to no avail. I hope this question is clear - if not, I am happy to clarify. Any suggestions for putting this stipulation into this append line are appreciated!
Thank you!
It's a little hard for me to guess what you're trying to do here, but is this something like what you're looking for?
Contents of data.txt
abc
def
The code:
# I'm simply replacing your names so I can test this more easily
input_filename = 'input.txt'
output_filename = 'output.txt'
data_filename = 'data.txt'
with open(data_filename) as df:
    # remove surrounding whitespace from each line - assuming you want this
    stripped_lines = (raw.strip() for raw in df)
    # keep only lines with non-whitespace characters left, taking the
    # first two characters of each
    transformed_data = [("A B", text[0:2], "numeric", "A B")
                        for text in stripped_lines if text]
print(transformed_data)
# produces
# [('A B', 'ab', 'numeric', 'A B'), ('A B', 'de', 'numeric', 'A B')]
If you're working with .csv files, I highly recommend the csv library that comes with Python. Let it handle encoding and formatting for you.

a python program that searches text files and prints out mutual lines

I am trying to write a Python program that takes n text files; each file contains names, one name per line, like this:
Steve
Mark
Sarah
What the program should do is print out only the names that exist in all of the input files.
I am new to programming, so I don't really know how to implement this idea, but I thought of recursion. Still, the program seems to run in an infinite loop, and I am not sure what the problem is. Is the implementation wrong? If so, do you have a better idea of how to implement it?
import sys
arguments = sys.argv[1:]
# Map each positional index to an open handle for the named file.
# enumerate replaces the original redundant iter()/next() bookkeeping
# that walked sys.argv[1:] twice.
files = {}
for number, name in enumerate(arguments):
    files[number] = open(name)
def close_files():
    """Close every file handle held in the module-level `files` dict."""
    for handle in files.values():
        handle.close()
def start_next_file(line,files,orderOfFile):
    """Recursively look for `line` in files[orderOfFile:]; once every file has
    been searched successfully, print the match and rewind all files.

    NOTE(review): the rewind (seek(0)) happens only in the all-files-matched
    branch, so a name missing from one file leaves later files partially
    consumed for the next lookup — this looks like the source of the reported
    misbehaviour; confirm by rewinding on every path.
    """
    print('starting next file')
    if orderOfFile < len(files): # to avoid IndexError
        for line_searched in files[orderOfFile]:
            if line_searched.strip():
                # drop the trailing newline character
                # NOTE(review): this also eats the final character of a last
                # line that has no trailing newline — confirm the inputs
                line_searched = line_searched[:-1]
                print('searched line = '+line_searched)
                print('searched compared to = ' + line)
                if line_searched == line:
                    #good now see if that name exists in the other files as well
                    start_next_file(line,files,orderOfFile+1)
    elif orderOfFile >= len(files): # when you finish searching all the files
        print('got ya '+line) #print the name that exists in all the files
        for file in files:
            # to make sure the cursor is at the beginning of the read files
            #so we can loop through them again
            files[file].seek(0)
def start_find_match(files):
    """Drive the search: take each non-blank name from the first file and
    check (via start_next_file) whether it appears in all the other files."""
    orderOfFile = 0
    for line in files[orderOfFile] :
        # for each name in the file see if it exists in all other files
        if line.strip():
            # drop the trailing newline
            # NOTE(review): also eats the last character of a final line that
            # lacks a trailing newline — confirm the input files end with one
            line = line[:-1]
            print ('starting line = '+line)
            start_next_file(line,files,orderOfFile+1)

# run the search over the files opened above, then close them
start_find_match(files)
close_files()
I'm not sure how to fix your code exactly but here's one conceptual way to think about it.
listdir gets all the files in the directory as a list. We narrow that to only .txt files. Next, open, read, split on newlines, and lower to make a larger list containing names. So, files will be a list of lists. Last, find the intersection across all lists using some set logic.
import os
# Collect the names of all .txt files in the current directory.
# str.endswith is clearer (and equivalent) compared to the f[-4:] slice test.
folder = [name for name in os.listdir() if name.endswith('.txt')]
# One inner list of lower-cased names per file; the original enumerate()
# produced an index `i` that was never used.
files = []
for path in folder:
    with open(path) as f:
        files.append([name.lower() for name in f.read().splitlines()])
# names present in every file = intersection of the per-file sets
result = set.intersection(*map(set, files))
Example:
#file1.txt
john
smith
mary
sue
pretesh
ashton
olaf
Elsa
#file2.txt
David
Lorenzo
Cassy
Grant
elsa
Felica
Salvador
Candance
Fidel
olaf
Tammi
Pasquale
#file3.txt
Jaleesa
Domenic
Shala
Berry
Pamelia
Kenneth
Georgina
Olaf
Kenton
Milly
Morgan
elsa
Returns:
{'olaf', 'elsa'}

Resources