Read zip files from HDFS and create a dataframe with file name and file content

I am trying to read zip files located in a WASB location and create a df with the file name and file content. Each zip file contains multiple .rcv files.
I tried doing it, but it gives the content of all the files as an RDD, not in the form I want.
The code below gives the zip file name and its files' contents as a key-value pair. What I expect is each individual file name and its content as a key-value pair, e.g.:
ACARS 20170507/file1.rcv "file content as string"
ACARS 20170507/file2.rcv "file content as string"
import io
import zipfile

def zip_extract(x):
    in_memory_data = io.BytesIO(x[1])
    file_obj = zipfile.ZipFile(in_memory_data, "r")
    files = [i for i in file_obj.namelist()]
    return dict(zip(files, [file_obj.open(file).read() for file in files]))

zips = sc.binaryFiles("hdfs://ACARS 20170507.zip")
files_data = zips.map(zip_extract).collect()
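For the flattened shape described above (one pair per inner .rcv file), a flatMap that emits a list of tuples, instead of a map that returns one dict per zip, is the usual route. A minimal sketch, assuming an active SparkSession (so toDF is available) and UTF-8 file contents; the column names are placeholders:

import io
import os
import zipfile

def zip_extract_files(kv):
    zip_path, zip_bytes = kv
    # e.g. "ACARS 20170507" from ".../ACARS 20170507.zip"
    zip_name = os.path.splitext(os.path.basename(zip_path))[0]
    with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
        # one ("zip name/inner file name", content) pair per archive member
        return [(zip_name + "/" + name,
                 zf.open(name).read().decode("utf-8", errors="replace"))
                for name in zf.namelist()]

zips = sc.binaryFiles("hdfs://ACARS 20170507.zip")
pairs = zips.flatMap(zip_extract_files)         # RDD of (file name, content)
df = pairs.toDF(["file_name", "file_content"])  # needs an active SparkSession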

Related

Converting multiple files in a directory into .txt format. But file names become Binary

So I am creating plagiarism software; for that, I need to convert .pdf, .docx, etc. files into .txt format. I successfully found a way to convert all the files in one directory to another. BUT the problem is, this method is changing the file names into binary values. I need to keep the original file names, which I will need in the next phase.
**Code:**
import os
import uuid
import textract

source_directory = os.path.join(os.getcwd(), "C:/Users/syedm/Desktop/Study/FOUNDplag/Plagiarism-checker-Python/mainfolder")

for filename in os.listdir(source_directory):
    file, extension = os.path.splitext(filename)
    unique_filename = str(uuid.uuid4()) + extension
    os.rename(os.path.join(source_directory, filename), os.path.join(source_directory, unique_filename))

training_directory = os.path.join(os.getcwd(), "C:/Users/syedm/Desktop/Study/FOUNDplag/Plagiarism-checker-Python/trainingdata")

for process_file in os.listdir(source_directory):
    file, extension = os.path.splitext(process_file)
    # We create a new text file name by concatenating the .txt extension to the file UUID
    dest_file_path = file + '.txt'
    # extract text from the file
    content = textract.process(os.path.join(source_directory, process_file))
    # We create and open the new file; "wb" means Write Binary
    write_text_file = open(os.path.join(training_directory, dest_file_path), "wb")
    # write the content and close the newly created file
    write_text_file.write(content)
    write_text_file.close()
Remove the line where you rename the files:

os.rename(os.path.join(source_directory, filename), os.path.join(source_directory, unique_filename))

(In fact, you can drop that whole first loop, since renaming is all it does.) Also, those names are not binary; they are UUIDs.
Cheers
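For reference, a minimal sketch of the processing loop with the renaming step removed, using the paths from the question, so the output .txt files keep the original base names:

import os
import textract

# paths taken from the question
source_directory = "C:/Users/syedm/Desktop/Study/FOUNDplag/Plagiarism-checker-Python/mainfolder"
training_directory = "C:/Users/syedm/Desktop/Study/FOUNDplag/Plagiarism-checker-Python/trainingdata"

for process_file in os.listdir(source_directory):
    file, extension = os.path.splitext(process_file)
    dest_file_path = file + '.txt'  # keeps the original base name
    # extract text and write it as binary, as in the original code
    content = textract.process(os.path.join(source_directory, process_file))
    with open(os.path.join(training_directory, dest_file_path), "wb") as write_text_file:
        write_text_file.write(content)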

How do I check whether a file already contains the text I want to append?

I am currently working on a project. I want to read all the *.pdf files in a directory, extract their text, and append it to a text file. So far so good; I was able to do this.
Now the problem: if I read the same directory again, it appends the same files again. Is there a way to check whether the extracted text is already in the file and, if so, skip it?
My code for this looks like this right now (I have already created the directory variable):
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        file = os.path.join(directory, filename)
        print(file)
        # parse data from file
        file_data = parser.from_file(file)
        # get the file's text content
        text = file_data['content']
        #print(type(text))
        print("len ", len(text))
        #print(text)
        # save to text file
        f = open("test2.txt", "a+", encoding='utf-8')
        f.write(text)
        f.close()
    else:
        continue
Thanks in advance!
One thing you could do is load the file's contents and check whether the text is already in it:

if text not in open("test2.txt").read():
    # write here
else:
    # text is already in file, don't write

(Note the .read(): iterating a file object compares line by line, which would miss multi-line text.)
However, this is very inefficient. A better way is to keep a file listing the filenames you have already written, and check that:
(at the beginning of your code):

files = open("files.txt").read().splitlines()

(before parser.from_file(file)):

if file in files:
    continue  # don't read or write

(after f.close()):

files.append(file)

(after the whole loop has finished):

with open("files.txt", "w") as f:
    f.write("\n".join(files))

(read().splitlines() is used instead of readlines() so the entries do not keep their trailing newlines, which would break the in check.)
Putting it all together:

files = open("files.txt").read().splitlines()

for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        file = os.path.join(directory, filename)
        if file in files:
            continue  # don't read or write
        print(file)
        # parse data from file
        file_data = parser.from_file(file)
        # get the file's text content
        text = file_data['content']
        #print(type(text))
        print("len ", len(text))
        #print(text)
        # save to text file
        f = open("test2.txt", "a+", encoding='utf-8')
        f.write(text)
        f.close()
        files.append(file)
    else:
        continue

with open("files.txt", "w") as f:
    f.write("\n".join(files))

The final write uses "w" rather than "a+"; appending the whole list on every run would duplicate the entries.
Note that you need to create a file named files.txt (it can start out empty) in the current directory.
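If you would rather not create files.txt by hand, a small guard like the sketch below (same names as above, with a set instead of a list for faster lookups) creates the ledger lazily; with a set, use files.add(file) instead of files.append(file):

import os

# tolerate a missing files.txt on the first run
if os.path.exists("files.txt"):
    files = set(open("files.txt").read().splitlines())
else:
    files = set()  # nothing written yet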

How to merge multiple text files located in two different folders and create a new column in the combined file in Python?

All,
I have two folders that each contain ~1000 txt files, say Folder 1 and Folder 2. I would like to combine all the files into one txt file and create a new column called "Label", assigned such that if 001.txt belongs to Folder 1, the Label column will have "Folder 1" as its value. Likewise, if the txt file belongs to Folder 2, the label will be "Folder 2". So far I have the code below, where I managed to merge all the txt files in Folder 1 into Folder1.txt, but that's not what I want.
Folder1=001.txt,002.txt....1000.txt
Folder2=001.txt,002.txt....1000.txt
Reference dataset: polarity dataset v1.0
import fileinput
import glob

file_list = glob.glob("*txt")  # the files that have the .txt extension
with open('Folder1.txt', 'w') as file:
    input_lines = fileinput.input(file_list)
    file.writelines(input_lines)
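One way to get the Label column is to walk both folders explicitly and tag each row with its source folder while writing; a minimal sketch, assuming the folder names above, one row per file, and tab-separated output to a hypothetical combined.txt:

import os

folders = ["Folder1", "Folder2"]
with open("combined.txt", "w", encoding="utf-8") as out:
    out.write("Text\tLabel\n")  # header with the new Label column
    for folder in folders:
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(folder, filename), encoding="utf-8") as f:
                # flatten each file to a single line so it fits one row
                text = f.read().replace("\n", " ").strip()
            out.write(f"{text}\t{folder}\n")  # label = source folder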

Create folders dynamically and write csv files to those folders

I would like to read several input files from a folder, perform some transformations, create folders on the fly, and write the csv files to the corresponding folders. The input path looks like
"Input files\P1_set1\Set1_Folder_1_File_1_Hour09.csv" - for a single patient (this file contains readings of patient P1 at the 9th hour)
Similarly, there are multiple files for each patient, and each patient's files are grouped under their own folder as shown below.
So, to read each file, I am using a wildcard glob pattern as shown in the code below.
I have already tried using the glob package and am able to read the files successfully, but I am facing an issue while creating the output folders and saving the files. I am parsing the file string as shown below:
f = "Input files\P1_set1\Set1_Folder_1_File_1_Hour09.csv"
f[12:] = "P1_set1\Set1_Folder_1_File_1_Hour09.csv"
filenames = sorted(glob.glob('Input files\P*_set1\*.csv'))
for f in filenames:
print(f) #This will print the full path
print(f[12:]) # This print the folder structure along with filename
df_transform = pd.read_csv(f)
df_transform = df_transform.drop(['Format 10','Time','Hour'],axis=1)
df_transform.to_csv("Output\" + str(f[12:]),index=False)
I expect the output folder to contain the csv files grouped by patient under their respective folders, mirroring the input folder structure. Please note that the "Output" folder already exists (it's easy to create one folder, you know).
To read the files in a folder, use the os library:

import os
import pandas as pd

folder_path = "path_to_your_folder"
for x in os.listdir(folder_path):
    df_transform = pd.read_csv(os.path.join(folder_path, x))
    df_transform = df_transform.drop(['Format 10','Time','Hour'], axis=1)
    # create the output folder first if it does not exist yet
    if not os.path.isdir("Output"):
        os.makedirs("Output")
    df_transform.to_csv(os.path.join("Output", x), index=False)

Now, instead of using f[12:], split x inside the for loop:

file_name = x.split('/')[-1]  # if you want filename.csv

Let me know if this is what you wanted.
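Staying closer to the question's glob-based setup, a sketch that recreates the input folder structure under "Output" before writing each transformed csv (makedirs with exist_ok=True creates each patient folder on the fly):

import glob
import os
import pandas as pd

for f in sorted(glob.glob(os.path.join('Input files', 'P*_set1', '*.csv'))):
    rel = os.path.relpath(f, 'Input files')  # e.g. P1_set1\Set1_Folder_1_File_1_Hour09.csv
    out_path = os.path.join('Output', rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)  # create the patient folder
    df_transform = pd.read_csv(f)
    df_transform = df_transform.drop(['Format 10', 'Time', 'Hour'], axis=1)
    df_transform.to_csv(out_path, index=False)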

creating corresponding subfolders and writing a portion of the file into new files inside those subfolders using python

I have a folder named "data". It contains the subfolders "data_1", "data_2", and "data_3", which contain some text files. I want to walk through all these subfolders and generate corresponding subfolders with the same names inside another folder named "processed_data". I also want to generate corresponding files with "processed" as a prefix in the name, and write into them all the lines from the original file that contain "1293".
I am using the code below but am not able to get the required result: neither the subfolders "data_1", "data_2", and "data_3" nor the files are getting created.
import os

folder_name = ""

def pre_processor():
    data_location = "D:\data"  # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        for dir in dirs:
            #folder_name=""
            folder_name = dir
        for filename in files:
            with open(os.path.join(root, filename), encoding="utf8", mode="r") as f:
                processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
                processed_file = open(processed_file_name, "w", encoding="utf8")
                for line_number, line in enumerate(f, 1):
                    if "1293" in line:
                        processed_file.write(str(line))
                processed_file.close()

pre_processor()
You might need to elaborate on the issue you are having; e.g., are the files being created, but empty?
A few things I notice:
1) Your indentation is off (not sure if this is just a copy-paste issue though): the body of pre_processor sits at the same level as the def, not inside it, so the function is effectively empty.
try this:

import os

folder_name = ""

def pre_processor():
    data_location = "D:\\data"  # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        for dir in dirs:
            #folder_name=""
            folder_name = dir
        for filename in files:
            with open(os.path.join(root, filename), encoding="utf8", mode="r") as f:
                processed_file_name = 'D:\\processed_data\\'+folder_name+'\\'+'processed'+filename
                processed_file = open(processed_file_name, "w", encoding="utf8")
                for line_number, line in enumerate(f, 1):
                    if "1293" in line:
                        processed_file.write(str(line))
                processed_file.close()

pre_processor()
2) Check whether processed_data and its subfolders exist; if not, create them first, as this code will not do so.
Instead of building the path to the new folder by hand, you could just replace the folder name in root. Furthermore, you are not creating the subfolders anywhere.
This code should work, but note that it uses Linux-style forward slashes; replace them if you are on Windows:
import os

folder_name = ""

def pre_processor():
    data_location = "data"  # folder containing all the data
    for root, dirs, files in os.walk(data_location):
        for dir in dirs:
            # folder_name=""
            folder_name = dir
        for filename in files:
            joined_path = os.path.join(root, filename)
            with open(joined_path, encoding="utf8", mode="r") as f:
                processed_folder_name = root.replace("data/", 'processed_data/')
                processed_file_name = processed_folder_name+'/processed'+filename
                if not os.path.exists(processed_folder_name):
                    os.makedirs(processed_folder_name)
                processed_file = open(processed_file_name, "w", encoding="utf8")
                for line in f:
                    if "1293" in line:
                        processed_file.write(str(line))
                processed_file.close()

pre_processor()
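As a cross-platform alternative, a sketch using pathlib sidesteps the slash issue entirely; the folder names are taken from the question:

from pathlib import Path

data_location = Path("data")
output_root = Path("processed_data")

for src in data_location.rglob("*"):
    if not src.is_file():
        continue
    rel = src.relative_to(data_location)  # e.g. data_1/somefile.txt
    dst = output_root / rel.parent / ("processed" + src.name)
    dst.parent.mkdir(parents=True, exist_ok=True)  # create the matching subfolder
    with src.open(encoding="utf8") as f, dst.open("w", encoding="utf8") as out:
        for line in f:
            if "1293" in line:
                out.write(line)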
