How to prepare text in all TXT files in a folder for Python via the terminal? - python-3.x

I have a folder with a lot of TXT files (books) which have many special symbols (multiple spaces, paragraph breaks, #, -, '.', etc.) at the beginning. This causes a great variety of problems when reading the files in Python (pandas). Usually it turns into errors like:
ParserError: Error tokenizing data. C error: Expected 1 fields in line 29, saw 2
or
Found 0 texts.
Can I use some terminal script for text preconditioning? Your assistance will be much appreciated!

example for one file:
and code:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))

You can try unicodedata:
import unicodedata
text = unicodedata.normalize('NFKD', text)
It replaces special Unicode characters with their normalized representations.
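For the original question of preconditioning every TXT file in the folder before loading it, a minimal sketch in Python could look like the following; the folder name, the set of leading characters to strip, and the whitespace collapsing are assumptions, not something given in the question:
import os
import re
import unicodedata

TEXT_DATA_DIR = 'books'  # hypothetical folder holding the TXT files

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    if not name.endswith('.txt'):
        continue
    path = os.path.join(TEXT_DATA_DIR, name)
    with open(path, encoding='utf-8', errors='replace') as f:
        text = f.read()
    # normalize unicode, drop leading special characters, collapse runs of spaces/tabs
    text = unicodedata.normalize('NFKD', text)
    text = text.lstrip(' \n#-.')
    text = re.sub(r'[ \t]+', ' ', text)
    # overwrite the file in place with the cleaned text
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)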

Related

How do I find multiple strings in a text file?

I need all occurrences of the string in the text file to be found and capitalized. I have figured out how to find the string, but finding multiple occurrences is my issue. If you can help me print where the given string appears throughout the file, that would be great, thanks.
import os
import subprocess
i = 1
string1 = 'biscuit eater'
# opens the text file
# if this is the path where my file resides, f will become an absolute path to it
f = os.path.expanduser("/users/acarroll55277/documents/Notes/new_myfile.txt")
# with this form of open, the file will automatically close when exiting the code block
txtfile = open(f, 'r')
# print(f.read()) to print the text document in terminal
# this sets variables flag and index to 0
flag = 0
index = 0
# looks through the file line by line
for line in txtfile:
    index += 1
    # checking if the string is in the line or not
    if string1 in line:
        flag = 1
        break
# checking condition for string found or not
if flag == 0:
    print('string ' + string1 + ' not found')
else:
    print('string ' + string1 + ' found in line ' + str(index))
I believe your approach would work, but it is very verbose and not very Pythonic. Try this out:
import os

string1 = 'biscuit eater'
with open(os.path.expanduser("/users/acarroll55277/documents/Notes/new_myfile.txt"), 'r+') as fptr:
    content = fptr.read()
    # record the (1-based) line numbers that contain the search string
    matches = [i for i, line in enumerate(content.splitlines(), start=1) if string1 in line]
    # capitalize every occurrence and write the result back to the file
    fptr.seek(0)
    fptr.write(content.replace(string1, string1.title()))
    fptr.truncate()
if not matches:
    print(f"string {string1} not found")
for i in matches:
    print(f"string {string1} found in line {i}")
This will now print out a message for every occurrence of your string in the file. In addition, the file is handled safely and closed automatically at the end of the script thanks to the with statement.
You can use the str.replace method. So in the line where you find the string, write line.replace(string1, string1.upper(), 1). The final 1 is there to make the function replace only one occurrence of the string.
Either that, or you read the text file as a single string and use the replace method on that entire string. That saves you the trouble of trying to find the occurrences manually. In that case, you can write:
txtfile = open(f, 'r')
content = txtfile.read()
content = content.replace(string1, string1.upper())
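If the capitalized text should end up back in the file, one way to finish this off is to write the modified string back out; this is a minimal sketch reusing the f and string1 variables from the question, and it assumes overwriting the original file is acceptable:
# read, replace, and write the result back to the same file
with open(f, 'r') as txtfile:
    content = txtfile.read()

content = content.replace(string1, string1.upper())

with open(f, 'w') as txtfile:
    txtfile.write(content)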

Creating a python spellchecker using tkinter

For school, I need to create a spell checker, using python. I decided to do it using a GUI created with tkinter. I need to be able to input a text (.txt) file that will be checked, and a dictionary file, also a text file. The program needs to open both files, check the check file against the dictionary file, and then display any words that are misspelled.
Here's my code:
import tkinter as tk
from tkinter.filedialog import askopenfilename

def checkFile():
    # get the sequence of words from a file
    text = open(file_ent.get())
    dictDoc = open(dict_ent.get())
    for ch in '!"#$%&()*+,-./:;<=>?#[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    words = text.split()
    # make a dictionary of the word counts
    wordDict = {}
    for w in words:
        wordDict[w] = wordDict.get(w,0) + 1
    for k in dictDict:
        dictDoc.pop(k, None)
    misspell_lbl["text"] = dictDoc

# Set-up the window
window = tk.Tk()
window.title("Temperature Converter")
window.resizable(width=False, height=False)

# Setup Layout
frame_a = tk.Frame(master=window)
file_lbl = tk.Label(master=frame_a, text="File Name")
space_lbl = tk.Label(master=frame_a, width=6)
dict_lbl = tk.Label(master=frame_a, text="Dictionary File")
file_lbl.pack(side=tk.LEFT)
space_lbl.pack(side=tk.LEFT)
dict_lbl.pack(side=tk.LEFT)

frame_b = tk.Frame(master=window)
file_ent = tk.Entry(master=frame_b, width=20)
dict_ent = tk.Entry(master=frame_b, width=20)
file_ent.pack(side=tk.LEFT)
dict_ent.pack(side=tk.LEFT)

check_btn = tk.Button(master=window, text="Spellcheck", command=checkFile)

frame_c = tk.Frame(master=window)
message_lbl = tk.Label(master=frame_c, text="Misspelled Words:")
misspell_lbl = tk.Label(master=frame_c, text="")
message_lbl.pack()
misspell_lbl.pack()

frame_a.pack()
frame_b.pack()
check_btn.pack()
frame_c.pack()

# Run the application
window.mainloop()
I want the file to check against the dictionary and display the misspelled words in the misspell_lbl.
The test files I'm using to make it work, and to submit with the assignment are here:
check file
dictionary file
I preloaded the files to the site that I'm submitting this on, so it should just be a matter of entering the file name and extension, not the entire path.
I'm pretty sure the problem is with my function to read and check the file. I've been beating my head against a wall trying to solve this, and I'm stuck. Any help would be greatly appreciated.
Thanks.
The first problem is with how you try to read the files. open(...) will return a _io.TextIOWrapper object, not a string, and this is what causes your error. To get the text from the file, you need to use .read(), like this:
def checkFile():
    # get the sequence of words from a file
    with open(file_ent.get()) as f:
        text = f.read()
    with open(dict_ent.get()) as f:
        dictDoc = f.read().splitlines()
The with open(...) as f part gives you a file object called f, and automatically closes the file when it's done. This is a more concise version of:
f = open(...)
text = f.read()
f.close()
f.read() will get the text from the file. For the dictionary I also added .splitlines() to turn the newline separated text into a list.
I couldn't really see where you'd tried to check for misspelled words, but you can do it with a list comprehension.
misspelled = [x for x in words if x not in dictDoc]
This gets every word which is not in the dictionary file and adds it to a list called misspelled. Altogether, the checkFile function now looks like this, and works as expected:
def checkFile():
    # get the sequence of words from a file
    with open(file_ent.get()) as f:
        text = f.read()
    with open(dict_ent.get()) as f:
        dictDoc = f.read().splitlines()
    for ch in '!"#$%&()*+,-./:;<=>?#[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    words = text.split()
    # make a dictionary of the word counts
    wordDict = {}
    for w in words:
        wordDict[w] = wordDict.get(w,0) + 1
    misspelled = [x for x in words if x not in dictDoc]
    misspell_lbl["text"] = misspelled
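One small optional tweak, not part of the original answer: membership tests against a list are linear, so for a large dictionary file it is usually worth turning dictDoc into a set before the comprehension:
    with open(dict_ent.get()) as f:
        dictDoc = set(f.read().splitlines())  # set membership checks are much faster than list lookups

    misspelled = [x for x in words if x not in dictDoc]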

How to read many files have a specific format in python

I am a little bit confused about how to read all lines in many files whose names range from "datalog.txt.98" to "datalog.txt.120".
This is my code:
import json
file = "datalog.txt."
i = 97
for line in file:
    i += 1
    f = open(line + str(i), 'r')
    for row in f:
        print(row)
Here, you will find an example of one line in one of those files:
I really need your help.
I suggest using a loop for opening multiple files with different formats.
To better understand this project I would recommend researching the following topics
for loops,
String manipulation,
Opening a file and reading its content,
List manipulation,
String parsing.
This is one of my favourite beginner guides.
To set the parameters of the integers at the end of the file name I would look into python for loops.
I think this is what you are trying to do
# create a list to store all your file content
files_content = []
# the prefix is of type string
filename_prefix = "datalog.txt."
# loop over the file numbers 98 to 120 (inclusive)
for i in range(98, 121):
    # build the filename from the prefix and
    # the integer i, which needs to be converted to a string
    filename = filename_prefix + str(i)
    # open the file and read all its lines into a variable
    with open(filename) as f:
        content = f.readlines()
    # append the file content to the files_content list
    files_content.append(content)
To get rid of whitespace (such as trailing newlines) from the parsed lines, add the missing strip line just before the append inside the loop:
    content = [x.strip() for x in content]
    files_content.append(content)
Here's an example of printing out files_content
for file in files_content:
    print(file)
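If some numbers between 98 and 120 have no matching file on disk, a small variation of the loop (a sketch, assuming the files live in the current working directory) can skip them instead of raising FileNotFoundError:
import os

files_content = []
for i in range(98, 121):
    filename = "datalog.txt." + str(i)
    if not os.path.exists(filename):  # skip numbers with no file on disk
        continue
    with open(filename) as f:
        files_content.append([line.strip() for line in f])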

How to iterate over multiple files by name within given range?

So I'm trying to iterate over multiple XML files from a library which contains more than 100k files, and I need to list the files by their last 3 digits.
Expected result is a list of files named from 'asset-PD471090' to 'asset-PD471110' or 'asset-GT888185' to 'asset-GT888209', and so on.
My Code -
import glob
strtid = input('From ID: ')  # First file in range
seps = strtid[-3:]
endid = input('To ID: ')  # Last file in range
eeps = endid[-3:]
FileId = strtid[:5]  # always the same file ID for the whole range
for name in glob.iglob('asset-' + FileId + [seps-eeps] + '.xml', recursive=True):
    print(name)  # iterate over every file in the given range and print the file names
The error I'm getting is
TypeError: unsupported operand type(s) for -: 'str' and 'str'
How do I load a specific range of input files?
As the error tells you, you are trying to use - on strings:
strtid = input('From ID: ') # string
seps = strtid[-3:] # part of a string
endid = input('To ID: ') # string
eeps = endid[-3:] # part of a string
FileId = strtid[:5] # also part of a string
# [seps-eeps]: trying to subtract a string from a string:
for name in glob.iglob('asset-' + FileId + [seps-eeps] + '.xml', recursive=True):
You can convert the string to an integer using int("1234") - it won't help you much though, because then you only have one (wrong) number for your iglob.
If you wanted to give them as a glob pattern you would need to enclose them in string delimiters - and glob does not work that way with number ranges:
"[123-678]" would match a single digit out of 1,2,3,4,5,6,7,8 - not 123 up to 678.
However, you can test your files yourself:
import os

def get_files(directory, prefix, postfix, numbers):
    lp = len(prefix)        # your "asset-GT"
    li = len(postfix) + 4   # your id + ".xml"
    for root, dirs, files in os.walk(directory):
        for file in sorted(files):  # sorted to get files in order, might not need it
            if int(file[lp:len(file)-li]) in numbers:
                yield os.path.join(root, file)

d = "test"
prefix = "asset-GT"  # input("Basename: ")
postfix = "185"      # input("Id: ")

# create demo files to search into
os.makedirs(d)
for i in range(50, 100):
    with open(os.path.join(d, f"{prefix}{i:03}{postfix}.xml"), "w") as f:
        f.write("")

# search params
fromto = "75 92"  # input("From To (space separated numbers): ")
fr, to = map(int, fromto.strip().split())
to += 1  # range upper limit is exclusive, so need to add 1 to include it

all_searched = list(get_files("./test", prefix, postfix, range(fr, to)))
print(*all_searched, sep="\n")
Output:
./test/asset-GT075185.xml
./test/asset-GT076185.xml
./test/asset-GT077185.xml
./test/asset-GT078185.xml
./test/asset-GT079185.xml
./test/asset-GT080185.xml
./test/asset-GT081185.xml
./test/asset-GT082185.xml
./test/asset-GT083185.xml
./test/asset-GT084185.xml
./test/asset-GT085185.xml
./test/asset-GT086185.xml
./test/asset-GT087185.xml
./test/asset-GT088185.xml
./test/asset-GT089185.xml
./test/asset-GT090185.xml
./test/asset-GT091185.xml
./test/asset-GT092185.xml

Python: If a string is found, stop searching for that string, search for the next string, and output the matching strings

This code outputs the matching string once for every time it appears in the file being searched (so I end up with a huge list if the string is there repeatedly). I only want to know whether the strings from my list match, not how many times they match. I do want to know which strings match, so a True/False solution does not work, but I only want each matching string listed once. I do not really understand what the pattern = '|'.join(keywords) part is doing - I got that from someone else's code to get my file-to-file matching working, but I don't know if I need it. Your help would be much appreciated.
# declares the files used
filenames = ['//Katie/Users/kitka/Documents/appreport.txt', '//Dallin/Users/dallin/Documents/appreport.txt',
             '//Aidan/Users/aidan/Documents/appreport.txt']

# parses each file
for filename in filenames:
    # imports the necessary libraries
    import os, time, re, smtplib
    from stat import *  # ST_SIZE etc

    # finds the time the file was last modified and error checks
    try:
        st = os.stat(filename)
    except IOError:
        print("failed to get information about", filename)
    else:
        # creates a list of words to search for
        keywords = ['LoL', 'javaw']
        pattern = '|'.join(keywords)

        # searches the file for the strings in the list, sorts them and returns results
        results = []
        with open(filename, 'r') as f:
            for line in f:
                matches = re.findall(pattern, line)
                if matches:
                    results.append((line, len(matches)))
        results = sorted(results)

        # appends results to the archive file
        with open("GameReport.txt", "a") as f:
            for line in results:
                f.write(filename + '\n')
                f.write(time.asctime(time.localtime(st[ST_MTIME])) + '\n')
                f.write(str(line) + '\n')
Untested, but this should work. Note that this only keeps track of which words were found, not which words were found in which files. I couldn't figure out whether or not that's what you wanted.
import fileinput

filenames = [...]
keywords = ['LoL', 'javaw']

# a set is like a list but with no duplicates, so even if a keyword
# is found multiple times, it will only appear once in the set
found = set()

# iterate over the lines of all the files
for line in fileinput.input(files=filenames):
    for keyword in keywords:
        if keyword in line:
            found.add(keyword)

print(found)
EDIT
If you want to keep track of which keywords are present in which files, then I'd suggest keeping a set of (filename, keyword) tuples:
filenames = [...]
keywords = ['LoL', 'javaw']
found = set()

for filename in filenames:
    with open(filename, 'rt') as f:
        for line in f:
            for keyword in keywords:
                if keyword in line:
                    found.add((filename, keyword))

for filename, keyword in found:
    print('Found the word "{}" in the file "{}"'.format(keyword, filename))
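As a note on the part the question said was unclear: pattern = '|'.join(keywords) simply builds a regular-expression alternation, so re.findall(pattern, line) returns every keyword occurrence in that line. A quick illustration (my own, not from the answers above):
import re

keywords = ['LoL', 'javaw']
pattern = '|'.join(keywords)  # -> 'LoL|javaw', i.e. "match LoL or javaw"
print(re.findall(pattern, 'javaw started, then LoL crashed, then javaw again'))
# prints ['javaw', 'LoL', 'javaw']
If the keywords could ever contain regex metacharacters, they would need to be passed through re.escape before joining.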
