listing filenames in a directory with .docx extension using python - python-3.x

'''This script is to copy text from documents (docx) to simple text file
'''
import sys
import ntpath
import os
from docx import Document
docpath = os.path.abspath(r'C:\Users\Khairul Basar\Documents\CWD Projects\00_WORKING\WL_SLOT1_submission_date_30-03-2018\1-100')
txtpath = os.path.abspath(r'C:\Users\Khairul Basar\Documents\CWD Projects\00_WORKING\WL_SLOT1_submission_date_30-03-2018\Textfiles')
for filename in os.listdir(docpath):
try:
document = Document(os.path.join(docpath, filename))
# print(document.paragraphs)
print(filename)
savetxt = os.path.join(txtpath, ntpath.basename(filename).split('.')[0] + ".txt")
print('Reading ' + filename)
# print(savetxt)
fullText = []
for para in document.paragraphs:
# print(para.text)
fullText.append(para.text)
with open(savetxt, 'wt') as newfile:
for item in fullText:
newfile.write("%s\n" % item)
# with open(savetxt, 'a') as f:
# f.write(para.text)
# print(" ".join([line.rstrip('\n') for line in f]))
# newfile.write(fullText)
# newfile.save()
# newfile.save()
#
# newfile.write('\n\n'.join(fullText))
# newfile.close()
except:
# print(filename)
# document = Document(os.path.join(docpath, filename))
# print(document.paragraphs)
print('Please fix an error')
exit()
# print("Please supply an input and output file. For example:\n"
# # " example-extracttext.py 'My Office 2007 document.docx' 'outp"
# "utfile.txt'")
# Fetch all the text out of the document we just created
# Make explicit unicode version
# Print out text of document with two newlines under each paragraph
print(savetxt)
Above python 3 script is to read Docx file and create txt files. In one directory I have 100s docx files, but it is only creating 19 txt files and then exiting the program. I couldn't figure why?
Docx files are output files from OCR software, all are English text ( no image, tables or graph or something special).
Today again I run the program after removing the Try/Except instruction and result is same:
1.docx
Reading 1.docx
10.docx
Reading 10.docx
100.docx
Reading 100.docx
11.docx
Reading 11.docx
12.docx
Reading 12.docx
13.docx
Reading 13.docx
14.docx
Reading 14.docx
15.docx
Reading 15.docx
16.docx
Reading 16.docx
17.docx
Reading 17.docx
18.docx
Reading 18.docx
Traceback (most recent call last):
File "C:\Users\Khairul Basar\Documents\CWD Projects\docx2txtv2.py", line 26,
in
newfile.write("%s\n" % item)
File "C:\Python36\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0113' in position
77: character maps to
Some other post Here resolve this by .encode("utf-8")
But if i use it then I get b' my text' in every line - which i don't need.
UPDATE fixed
I have made change to following line:
with open(savetxt, 'w', encoding='utf-8') as newfile:
by adding encoding='utf-8'
help i took from this post. post
Thank you who has formated my post in a nice way.

usr2564301 has pointed out to remove Try/except from the code. By doing so i got exact error why it was not working or exiting the program prematurely.
The problem was my Docx has many characters which are beyond 8-bit character set. To convert that non-english characters to English encoding='utf-8' is used.
That solved the problem.
anyway, all credit goes to usr2564301 who is somewhere I don't know.

Related

Running a function on multiple files simultaneously with python

i have a specific function that manipulates text files via input of directory and file name.
The defined function is as below
def nav2xy(target_directory, target_file):
after_rows = f'MOD {target_file}_alines.txt'
after_columns = f'MOD {target_file}_acolumns.txt'
# this segment is used to remove top lines(8 in this case) for work with only the actual data
infile = open(f'{target_directory}/{target_file}', 'r').readlines()
with open(after_rows, 'w') as outfile:
for index, line in enumerate(infile):
if index >= 8:
outfile.write(line)
# this segment removes the necessary columns, in this case leaving only coordinates for gmt use
with open(after_rows) as In, open(after_columns, "w") as Out:
for line in In:
values = line.split()
Out.write(f"{values[4]} {values[5]}\n")
i am searching for a way to run this code once on all files in the chosen directory(could be targeted by name or just do all of them),
should i change the function to use only the file name?
tried running the function this way, to no avail
for i in os.listdir('Geoseas_related_files'):
nav2xy('target_directory', i)
this way works perfectly, although somehow i still get this error with it.
(base) ms-iMac:python gan$ python3 coordinates_fromtxt.py
Traceback (most recent call last):
File "coordinates_fromtxt.py", line 7, in <module>
nav2xy('Geoseas_related_files', str(i))
File "/Users/gadraifman/research/python/GAD_MSC/Nav.py", line 19, in nav2xy
Out.write(f"{values[4]} {values[5]}\n")
IndexError: list index out of range
any help or advice would be a great help,
From what I gather from Iterating through directories with Python, the best way to loop directories is using glob.
I made some extensive other modifications to your code to simplify it and remove the middle step of saving lines to a file just to read them again. If this step is mandatory, then feel free to add it back.
import os, glob
def nav2xy(target_file):
# New file name, just appending stuff.
# "target_file" will contain the path as defined by root_dir + current filename
after_columns = f'{target_file}_acolumns.txt'
with open(target_file, 'r') as infile, open(after_columns, "w") as outfile:
content = infile.readlines()
#
# --- Skip 8 lines here
# |
# v
for line in content[8:]:
# No need to write the lines to a file, just to read them again.
# Process directly
values = line.split()
outfile.write(f"{values[4]} {values[5]}\n")
# I guess this is the dir you want to loop through.
# Maybe an absolute path c:\path\to\files is better.
root_dir = 'Geoseas_related_files/*'
for file_or_dir in glob.iglob(os.path.join(root_dir,"*")):
# Skip directories, if there are any.
if os.path.isfile(file_or_dir):
nav2xy(file_or_dir)

Python3, read specific word from .txt file

I want to know how I can search for a specific word in a .txt file using Python3.
If the word 'pizza' is included in the .txt file, then it should (e.g.) import another python program.
You can iterate through each line in the file with a 'for' loop and check to see if the string is in the line:
f = open('file.txt', 'r')
for line in f:
if 'pizza' in line:
# do whatever you want here
print("Found it!")
else:
pass

UnicodeEncodeError: 'charmap' codec can't encode character '\ufe0f' in position 62: character maps to <undefined>

I am trying to scrape geo locations based on the urls and after about 500 searches and extraction of geo location I am getting encoding error. I have included the encoding utf-8 in the code and also followed the following command in the cmd.
chcp 65001
set PYTHONIOENCODING=utf-8
And yet I am getting the following error :
Traceback (most recent call last):
File "__main__.py", line 33, in <module>
outputfile.write(newline)
File "C:\Program Files\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\ufe0f' in position 62: character maps to <undefined>
I am using Python 3.x updated version on anaconda of all the packages.
#!/usr/bin/python
import sys
from twitter_location import get_location
from google_location import get_coordinates
# Open output file
outputfile = open(sys.argv[2], 'w')
# Read input file
with open(sys.argv[1], 'r', encoding = "utf-8", errors='ignore') as csv:
# Skip headers line
next(csv)
# Loop in lines
for line in csv:
# Extract userid
print (line)
permalink = line.split(',')[-1].strip()
userid = permalink.split('/')[3]
# Get location as string if exists
location = get_location(userid)
if location is None:
print ('user {} can not be reached or do not exposes any location.'.format(userid))
continue
else:
# If location is ok, get coordinates
coordinates = get_coordinates(location)
print ('{}: {}'.format(userid, coordinates))
# Copy current input line and add coordinates at the end
newline = '{},{}\n'.format(line.strip(), coordinates)
# Write in output file
outputfile.write(newline)
I am looking for two things here
Help with the encoding error
I want to put back the input headers in the output + the new column header
My input files have following headers
username date retweets favorites text geo mentions hashtags id permalink
while writing the output I am able to get all the columns + the new geo coordinates column too. But I am not able to put back the headers in the output files.
Appreciate the help, Thanks in advance.

To find a particular string from multiples text files in the same directory

I am finding a string, for example "error", from a multiple text files. The multiple text files are within a similar directory. After finding, it must be able to print that line containing the string.
So far, I have only been successful on searching and printing out the string from one text file.
In the below code, I tried to create a list of the filenames in the directory; the list is called logz, but it printed out nothing. It only worked when the logz in line 10 is listed as a TXT file.
The desired output should be something like this:
Line 0: asdasda error wefrewfawvewvaw
Line 3: awvawvawvaw error afvavavav
Line 6: e ERROR DSCVSVWASEFVEWVWEVW
Here is my code:
import re
import sys
import os
logz = [fn for fn in os.listdir(r'my text file directory') if fn.endswith('.txt')]
err_occur = [] # The list where we will store results.
pattern = re.compile(" error ", re.IGNORECASE)
try: # Try to:
with open ('logz', 'rt') as in_file: # open file for reading text.
for linenum, line in enumerate(in_file):
if pattern.search(line) != None:
err_occur.append((linenum, line.rstrip('/n')))
print("Line ", linenum, ": ", line, sep='')
You can use the following program as an example for writing yours. Replace the '.' in line 5 with the path to your text file directory. Line 9 can be modified as needed to search for words other than 'error' as well. You will need to be running Python 3.6 if you want to use f'' strings (line 10).
import pathlib
def main():
for path in pathlib.Path('.').iterdir():
if path.suffix.lower() == '.txt':
with path.open() as file:
for line, text in enumerate(file):
if 'error' in text.lower():
print(f'Line {line}: {text}')
if __name__ == '__main__':
main()

Merging multiple text files into one and related problems

I'm using Windows 7 and Python 3.4.
I have several multi-line text files (all in Persian) and I want to merge them into one under one condition: each line of the output file must contain the whole text of each input file. It means if there are nine text files, the output text file must have only nine lines, each line containing the text of a single file. I wrote this:
import os
os.chdir ('C:\Dir')
with open ('test.txt', 'w', encoding = 'UTF8') as OutFile:
with open ('news01.txt', 'r', encoding = 'UTF8') as InFile:
while True:
_Line = InFile.readline()
if len (_Line) == 0:
break
else:
_LineString = str (_Line)
OutFile.write (_LineString)
It worked for that one file but it looks like it takes more than one line in output file and also the output file contains disturbing characters like: &amp, &nbsp and things like that. But the source files don't contain any of them.
Also, I've got some other texts: news02.txt, news03.txt, news04.txt ... news09.txt.
Considering all these:
How can I correct my code so that it reads all files one after one, putting each in only one line?
How can I clean these unfamiliar and strange characters or prevent them to appear in my final text?
Here is an example that will do the merging portion of your question:
def merge_file(infile, outfile, separator = ""):
print(separator.join(line.strip("\n") for line in infile), file = outfile)
def merge_files(paths, outpath, separator = ""):
with open(outpath, 'w') as outfile:
for path in paths:
with open(path) as infile:
merge_file(infile, outfile, separator)
Example use:
merge_files(["C:\file1.txt", "C:\file2.txt"], "C:\output.txt")
Note this makes the rather large assumption that the contents of 'infile' can fit into memory. Reasonable for most text files, but possibly quite unreasonable otherwise. If your text files will be very large, you can this alternate merge_file implementation:
def merge_file(infile, outfile, separator = ""):
for line in infile:
outfile.write(line.strip("\n")+separator)
outfile.write("\n")
It's slower, but shouldn't run into memory problems.
Answering question 1:
You were right about the UTF-8 part.
You probably want to create a function which takes multiple files as a tuple of files/strings of file directories or *args. Then, read all input files, and replace all "\n" (newlines) with a delimiter (Default ""). out_file can be in in_files, but makes the assumption that the contents of files can be loaded in to memory. Also, out_file can be a file object, and in_files can be file objects.
def write_from_files(out_file, in_files, delimiter="", dir="C:\Dir"):
import _io
import os
import html.parser # See part 2 of answer
os.chdir(dir)
output = []
for file in in_files:
file_ = file
if not isinstance(file_, _io.TextIOWrapper):
file_ = open(file_, "r", -1, "UTF-8") # If it isn't a file, make it a file
file_.seek(0, 0)
output.append(file_.read().replace("\n", delimiter)) # Replace all newlines
file_.close() # Close file to prevent IO errors # with delimiter
if not isinstance(out_file, _io.TextIOWrapper):
out_file = open(out_file, "w", -1, "UTF-8")
html.parser.HTMLParser().unescape("\n".join(output))
out_file.write(join)
out_file.close()
return join # Do not have to return
Answering question 2:
I think you may of copied from a webpage. This does not happen to me. The &amp and &nbsp are the HTML entities, (&) and ( ). You may need to replace them with their corresponding character. I would use HTML.parser. As you see in above, it turns HTML escape sequences into Unicode literals. E.g.:
>>> html.parser.HTMLParser().unescape("Alpha &lt β")
'Alpha < β'
This will not work in Python 2.x, as in 3.x it was renamed. Instead, replace the incorrect lines with:
import HTMLParser
HTMLParser.HTMLParser().unescape("\n".join(output))

Resources