I need to convert a folder with around 4,000 .txt files into a single .csv with two columns:
(1) Column 1: 'File Name' (as specified in the original folder);
(2) Column 2: 'Content' (which should contain all text present in the corresponding .txt file).
The most similar question to mine here is this one (Combine a folder of text files into a CSV with each content in a cell), but I could not implement any of the solutions presented there.
The last one I tried was the Python code proposed in the aforementioned question by Nathaniel Verhaaren, but I got the exact same error as the question's author (even after implementing some of the suggestions):
import os
import csv

dirpath = 'path_of_directory'
output = 'output_file.csv'

with open(output, 'w') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(['FileName', 'Content'])
    files = os.listdir(dirpath)
    for filename in files:
        with open(dirpath + '/' + filename) as afile:
            csvout.writerow([filename, afile.read()])
            afile.close()
    outfile.close()
Other questions which seemed similar to mine (for example, Python: Parsing Multiple .txt Files into a Single .csv File?, Merging multiple .txt files into a csv, and Converting 1000 text files into a single csv file) do not solve this exact problem I presented (and I could not adapt the solutions presented to my case).
I had a similar requirement, so I wrote the following class:
import os
import pathlib
import glob
import csv
from collections import defaultdict


class FileCsvExport:
    """Generate a CSV file containing the name and contents of all files found"""

    def __init__(self, directory: str, output: str, header = None, file_mask = None, walk_sub_dirs = True, remove_file_extension = True):
        self.directory = directory
        self.output = output
        self.header = header
        self.pattern = '**/*' if walk_sub_dirs else '*'
        if isinstance(file_mask, str):
            self.pattern = self.pattern + file_mask
        self.remove_file_extension = remove_file_extension
        self.rows = 0

    def export(self) -> bool:
        """Return True if the CSV was created"""
        return self.__make(self.__generate_dict())

    def __generate_dict(self) -> defaultdict:
        """Finds all files recursively based on the specified parameters and returns a defaultdict"""
        csv_data = defaultdict(list)
        for file_path in glob.glob(os.path.join(self.directory, self.pattern), recursive = True):
            path = pathlib.Path(file_path)
            if not path.is_file():
                continue
            content = self.__get_content(path)
            name = path.stem if self.remove_file_extension else path.name
            csv_data[name].append(content)
        return csv_data

    @staticmethod
    def __get_content(file_path: str) -> str:
        with open(file_path) as file_object:
            return file_object.read()

    def __make(self, csv_data: defaultdict) -> bool:
        """
        Takes a defaultdict of {k, [v]} where k is the file name and v is a list of file contents.
        Writes out these values to a CSV and returns True when complete.
        """
        with open(self.output, 'w', newline = '') as csv_file:
            writer = csv.writer(csv_file, quoting = csv.QUOTE_ALL)
            if isinstance(self.header, list):
                writer.writerow(self.header)
            for key, values in csv_data.items():
                for duplicate in values:
                    writer.writerow([key, duplicate])
                    self.rows = self.rows + 1
        return True
Which can be used like so:
...
myFiles = r'path/to/files/'
outputFile = r'path/to/output.csv'
exporter = FileCsvExport(directory = myFiles, output = outputFile, header = ['File Name', 'Content'], file_mask = '.txt')
if exporter.export():
    print(f"Export complete. Total rows: {exporter.rows}.")
In my example directory, this returns
Export complete. Total rows: 6.
Note: rows does not count the header if present
This generated the following CSV file:
"File Name","Content"
"Test1","This is from Test1"
"Test2","This is from Test2"
"Test3","This is from Test3"
"Test4","This is from Test4"
"Test5","This is from Test5"
"Test5","This is in a sub-directory"
Optional parameters:
header: Takes a list of strings that will be written as the first line in the CSV. Default None.
file_mask: Takes a string that can be used to specify the file type; for example, .txt will cause it to only match .txt files. Default None.
walk_sub_dirs: If set to False, it will not search in sub-directories. Default True.
remove_file_extension: If set to False, it will cause the file name to be written with the file extension included; for example, File.txt instead of just File. Default True.
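For comparison, if you only need the flat, single-directory case from the question, the core of the above boils down to a few lines. A minimal sketch (the directory and output paths are placeholders):

import csv
import glob
import os

dirpath = 'path_of_directory'  # placeholder: folder containing the .txt files
output = 'output_file.csv'     # placeholder: destination CSV

with open(output, 'w', newline='') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    writer.writerow(['File Name', 'Content'])
    for path in glob.glob(os.path.join(dirpath, '*.txt')):
        with open(path) as afile:
            # One row per file: name without extension, then the full contents
            writer.writerow([os.path.splitext(os.path.basename(path))[0], afile.read()])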
I am trying to find files with the .desktop extension in a specific directory in Python 3. I tried the code snippet below, but it didn't work as I wanted. I want the result to be a single string value.
import os, fnmatch
import configparser

desktopfile = configparser.ConfigParser()

def find(pattern, path):
    result = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, pattern):
                result.append(os.path.join(root, name))
    return result

script_tmp_dir = "/tmp/appiload/appinstall"  # Temporary directory (files are extracted here)
desktopfilea = f"{script_tmp_dir}/squashfs-root/{str(find('*.desktop', f'{script_tmp_dir}/squashfs-root/'))}"
print(desktopfilea)
desktopfile.items()
Result:
/tmp/appiload/appinstall/squashfs-root/['/tmp/appiload/appinstall/squashfs-root/helloworld.desktop']
Use glob.glob instead of writing a function to do this job.
import os, glob
import configparser

desktopfile = configparser.ConfigParser()

script_tmp_dir = "/tmp/appiload/appinstall"  # Temporary directory (files are extracted here)
desktopfilea = glob.glob(f'{script_tmp_dir}/squashfs-root/*.desktop')
# desktopfilea = " ".join(desktopfilea)  # Join them into one string, using a space as the separator
print(str(desktopfilea))
desktopfile.items()
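If the .desktop files can also sit in sub-directories, glob supports recursive matching with the ** wildcard:

desktopfilea = glob.glob(f'{script_tmp_dir}/squashfs-root/**/*.desktop', recursive=True)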
I don't exactly understand what you mean, but I made a simple program that prints all the files with the .desktop extension and saves them to two files: applications.json (as an array) and applications.txt (one name per line).
I also made two versions of the program: one that prints and saves only the file names with extensions, and one that prints and saves the whole path.
File names only:
import os
import json

strApplications = ""
applications = []

for file in os.listdir(os.path.dirname(os.path.realpath(__file__))):
    if file.endswith(".desktop"):
        applications.append(file)
        with open("applications.json", "w") as f:
            json.dump(applications, f)
        strApplications = strApplications + file + "\n"

with open("applications.txt", "w") as f:
    f.write(strApplications)

print(strApplications)
Full file path:
import os
import json
cwd = os.getcwd()
files = [cwd + "\\" + f for f in os.listdir(cwd) if f.endswith(".desktop")]
with open("applications.json", "w") as f:
json.dump(files, f)
with open("applications.txt", "w") as f:
f.write("\n".join(files))
print("\n".join(files))
I have a folder containing multiple .txt files named one.txt, two.txt, three.txt, and so on. I need to read one.txt and write its content into a list named onefile[], then read two.txt and write its content into a list named twofile[], and so on. How can I do this?
Update! I tried this code; now how can I print the values in each list?
import os
from pathlib import Path

def writeinlist(file_path, i):
    multilist = {}
    output = open(file_path, 'r')
    globals()['List%s' % i] = output
    print('List%s' % i)

input_path = Path(Path.home(), "Desktop", "NN")

index = 1
for root, dirs, files in os.walk(input_path):
    for file in files:
        file_path = Path(root, file)
        writeinlist(file_path, index)
        index += 1
Update 2: how can I delete the \n from the values?
value_list1 = files_dict['file1']
print('Values of file1 are:')
print(value_list1)
I used the following to create a dictionary with dynamic keys (the names of the files); each value is a list whose elements are the lines of the corresponding file.
First, contents of onefile.txt:
First file first line
First file second line
First file third line
Contents of twofile.txt:
Second file first line
Second file second line
My code:
import os
import pprint
files_dict = {}
for file in os.listdir("/path/to/folder"):
    if file.endswith(".txt"):
        key = file.split(".")[0]
        full_filename = os.path.join("/path/to/folder", file)
        with open(full_filename, "r") as f:
            files_dict[key] = f.readlines()
pprint.pprint(files_dict)
Output:
{'onefile': ['First file first line\n',
'First file second line\n',
'First file third line'],
'twofile': ['Second file first line\n', 'Second file second line']}
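To address the asker's Update 2 (the trailing \n on each value), one option is to let splitlines() drop the line endings as the dictionary is built, or to strip the stored lists afterwards; a minimal sketch assuming the files_dict built above:

files_dict[key] = f.read().splitlines()  # drops the '\n' endings while reading

# Or, after the fact, strip each stored line:
files_dict = {k: [line.rstrip('\n') for line in v] for k, v in files_dict.items()}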
Another way to do this that's a bit more Pythonic:
import os
import pprint
files_dict = {}
for file in [
    f
    for f in os.listdir("/path/to/folder")
    if f.endswith(".txt")
]:
    with open(os.path.join("/path/to/folder", file), "r") as fo:
        files_dict[file.split(".")[0]] = fo.readlines()
pprint.pprint(files_dict)
The attached script returns:
IndexError: list index out of range
for the line starting with values = {line.split(...):
values = dict()
with open(csv) as f:
    lines = f.readlines()
values = {line.split(',')[0].strip(): line.split(',')[1].strip() for line in lines}
However, I could use it yesterday for doing exactly the same thing: replacing certain text in a directory of XML files with different texts.
import os
from distutils.dir_util import copy_tree

drc = 'D:/Spielwiese/00100_Arbeitsverzeichnis'
backup = 'D:/Spielwiese/Backup/'
csv = 'D:/persons1.csv'

copy_tree(drc, backup)

values = dict()
with open(csv) as f:
    lines = f.readlines()
values = {line.split(',')[0].strip(): line.split(',')[1].strip() for line in lines}

# Getting a list of the full paths of files
for dirpath, dirname, filename in os.walk(drc):
    for fname in filename:
        # Joining dirpath and filenames
        path = os.path.join(dirpath, fname)
        # Opening the files for reading only
        filedata = open(path, encoding="Latin-1").read()
        for k, v in values.items():
            filedata = filedata.replace(k, v)
        f = open(path, 'w', encoding="Latin-1")
        # Writing the changes to the files
        f.write(filedata)
        f.close()  # Closing the files

print("In case something went wrong, you can find a backup in " + backup)
I don't see anything weird, and, as mentioned, I could use it before... :-o
Any ideas on how to fix it?
Best wishes,
K
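For what it's worth, this IndexError usually means some line in the CSV has no comma (a blank trailing line is a common culprit): line.split(',')[0] still exists for such a line, but [1] does not. A defensive sketch of the same comprehension that skips those lines:

values = {
    line.split(',')[0].strip(): line.split(',')[1].strip()
    for line in lines
    if ',' in line  # skip blank or malformed lines that would raise IndexError
}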
I have a folder with a lot of TXT files (books) which have many special symbols (multiple spaces, paragraph marks, #, -, '.', etc.) at the beginning. This causes a great variety of problems when reading the files in Python (pandas). It usually turns into errors like:
ParserError: Error tokenizing data. C error: Expected 1 fields in line 29, saw 2
or
Found 0 texts.
Can I use some terminal script for text preconditioning? Your assistance will be much appreciated!
And the code:
import os
import sys

texts = []         # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []        # list of label ids

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))
You can try unicodedata:
text = unicodedata.normalize('NFKD', text)
It replaces Unicode characters with their normalized representations.
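A minimal sketch of how this could be applied to a whole file before handing it to pandas (the file names are placeholders, and NFKD is just one of the available normalization forms):

import unicodedata

# Read the raw text, normalize it, and write a cleaned copy
with open('book.txt', encoding='utf-8') as src:             # placeholder input path
    text = unicodedata.normalize('NFKD', src.read())

with open('book_clean.txt', 'w', encoding='utf-8') as dst:  # placeholder output path
    dst.write(text)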
How do I apply a function to a list of file paths I have built, and write an output csv in the same path?
read file in a subfolder -> perform a function -> write file in the subfolder -> go to next subfolder
import os

import pandas as pd
import xmltodict

# opened xml by filename
with open(r'XML_opsReport 100001.xml', encoding="utf8") as fd:
    Odict_parsedFromFilePath = xmltodict.parse(fd.read())

# func called in func below
def activity_to_df_one_day(list_activity_this_day):
    ib_list = [pd.DataFrame(list_activity_this_day[i], columns=list_activity_this_day[i].keys()).drop("#uom") for i in range(len(list_activity_this_day))]
    return pd.concat(ib_list)

# Processes parsed xml and writes csv
def activity_to_df_all_days(Odict_parsedFromFilePath, subdir):  # writes csv from parsed xml after some processing
    nodes_reports = Odict_parsedFromFilePath['opsReports']['opsReport']
    list_activity = []
    for i in range(len(nodes_reports)):
        try:
            df = activity_to_df_one_day(nodes_reports[i]['activity'])
            list_activity.append(df)
        except KeyError:
            continue
    opsReport = pd.concat(list_activity)
    opsReport['dTimStart'] = pd.to_datetime(opsReport['dTimStart'], infer_datetime_format=True)
    opsReport.sort_values('dTimStart', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
    opsReport.to_csv("subdir\opsReport.csv")  # write to the subdir

def scanfolder():  # fetches list of file-paths with desired starting name
    list_files = []
    for path, dirs, files in os.walk(r'C:\..\xml_objects'):  # directory containing several subfolders
        for f in files:
            if f.startswith('XML_opsReport'):
                list_files.append(os.path.join(path, f))
    return list_files

filepaths = scanfolder()  # list of file-paths
Every function works well, and the XML processing is good, so I am not sharing the XML structure. There are 100+ paths in filepaths, each in a different subdirectory. I want to be able to apply the above flow in the future as well, where I can get filepaths and perform the desired actions. It's important to write the CSV file to its subdirectory.
To get the directory that a file is in, you can use:
import os
for root, dirs, files in os.walk(some_dir):
    for f in files:
        print(root)
        output_file = os.path.join(root, "output_file.csv")
        print(output_file)
Is that what you're looking for?
Output:
somedir
somedir\output_file.csv
See also Python 3 - travel directory tree with limited recursion depth and Find current directory and file's directory.
I was able to solve it with os.path.join.
exceptions_path_list = []

for i in filepaths:
    try:
        with open(i, encoding="utf8") as fd:
            doc = xmltodict.parse(fd.read())
        activity_to_df_all_days(doc, i)
    except ValueError:
        exceptions_path_list.append(os.path.dirname(i))
        continue

def activity_to_df_all_days(Odict_parsedFromFilePath, filepath):
    ...
    ...
    ...
    opsReport.to_csv(os.path.join(os.path.dirname(filepath), "opsReport.csv"))
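For what it's worth, the same join can also be written with pathlib, which some find clearer; a sketch assuming the same filepath variable:

from pathlib import Path

opsReport.to_csv(Path(filepath).parent / "opsReport.csv")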