Split a big file into multiple files in Python 3.x

I want to split the output into multiple files once the size of file_write exceeds 20 MB.
In random_function, I open big_file.txt, remove noise with remove_noise(), and write each cleaned line to outfile.
I am not sure how to split the file based on size in my current implementation. Please find the code below:
(Apologies for not providing the real implementation with an example; it is really complicated.)
I have gone through the example at this link: Split large text file (around 50GB) into multiple files
import os

def parses(lines, my_date_list):
    for line in reversed(list(lines)):
        line = line.strip()
        if not line:
            continue
        date_string = "2019-11-01"  # assumption
        yield date_string, line

def remove_noise(line):
    """Dummy function."""
    return line

def random_function(path, output, cutoff="2019-10-31"):
    my_date_list = []
    if os.path.exists(path):
        with open(path) as f:
            lines = parses(f, my_date_list)
            for date, line in lines:
                if cutoff <= date:
                    results = remove_noise(line)
                    output.write(results + '\n')
                    continue
                else:
                    break
While writing lines to output, I need to check the size. Once the size reaches 20 MB, I want to start writing to a second file (maybe output_2), and so on.
if __name__ == '__main__':
    path = "./big_file.txt"
    file_write = "./write_file.txt"
    with open(file_write, 'w') as outfile:  # open for writing
        random_function(path=path, output=outfile)
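One way to handle the 20 MB limit (a minimal sketch, not code from the question: the RollingWriter name and the file-naming scheme are assumptions for illustration): since random_function only ever calls output.write(), it can be handed any object with a write() method, for example a small wrapper that counts the bytes written and rolls over to write_file_1.txt, write_file_2.txt, and so on once the limit is reached.
import os

class RollingWriter:
    """Write to base_1.txt, base_2.txt, ... starting a new file after max_bytes."""

    def __init__(self, base_path, max_bytes=20 * 1024 * 1024):
        self.base, self.ext = os.path.splitext(base_path)
        self.max_bytes = max_bytes
        self.index = 1
        self.written = 0
        self.handle = open("%s_%d%s" % (self.base, self.index, self.ext), "w")

    def write(self, text):
        # start a new file once the current one has reached the size limit
        if self.written >= self.max_bytes:
            self.handle.close()
            self.index += 1
            self.written = 0
            self.handle = open("%s_%d%s" % (self.base, self.index, self.ext), "w")
        self.handle.write(text)
        self.written += len(text.encode("utf-8"))

    def close(self):
        self.handle.close()
The main block would then pass the wrapper instead of a plain file object:
if __name__ == '__main__':
    writer = RollingWriter("./write_file.txt")
    try:
        random_function(path="./big_file.txt", output=writer)
    finally:
        writer.close()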

Related

How to iterate on keras Dataset and edit content

I am working on this movie classification problem:
https://www.tensorflow.org/tutorials/keras/text_classification
In this example, text files (12,500 files with movie reviews) are read and a batched dataset is prepared like below:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
At the time of standardization:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # I WANT TO REMOVE STOP WORDS HERE, CAN I DO?
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
Problem: I understand that I have the training dataset with labels in the variable raw_train_ds. Now I want to iterate over this dataset, remove stop words from the movie review text, and store the result back in the same variable. I tried to do it in the function custom_standardization, but it gives a type error.
I also tried to use tf.strings.as_strings, but it returns the error
InvalidArgumentError: Value for attr 'T' of string is not in the list of allowed values: int8, int16, int32, int64
Can someone please help with this, or simply explain how to remove stop words from the batched dataset?
It looks like TensorFlow does not currently have built-in support for stop word removal, just basic standardization (lowercasing and punctuation stripping). The TextVectorization layer used in the tutorial supports a custom standardization callback, but I couldn't find any stop word examples.
Since the tutorial downloads the IMDB dataset and reads the text files from disk, you can just do the standardization manually with Python before reading them. This will modify the text files themselves, but you can then read the files normally using tf.keras.preprocessing.text_dataset_from_directory, and the entries will already have the stop words removed.
#!/usr/bin/env python3
import pathlib
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def cleanup_text_files_in_folder(folder_name):
    text_files = []
    for file_path in pathlib.Path(folder_name).glob('*.txt'):
        text_files.append(str(file_path))
    print(f'Found {len(text_files)} files in {folder_name}')

    # Give some kind of status
    i = 0
    for text_file in text_files:
        replace_file_contents(text_file)
        i += 1
        if i % 1000 == 0:
            print("No of files processed =", i)
    return text_files

def replace_file_contents(input_file):
    """
    This will read in the contents of the text file, process it (clean up,
    remove stop words) and overwrite the new 'processed' output to that same file
    """
    with open(input_file, 'r') as file:
        file_data = file.read()
    file_data = process_text_adv(file_data)
    with open(input_file, 'w') as file:
        file.write(file_data)

def process_text_adv(text):
    # review without HTML tags
    text = BeautifulSoup(text, features="html.parser").get_text()
    # review without punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    # lowercase
    text = text.lower()
    # simple split
    text = text.split()
    swords = set(stopwords.words("english"))  # conversion into set for fast searching
    text = [w for w in text if w not in swords]
    # joining of splitted paragraph by spaces and return
    return " ".join(text)

if __name__ == "__main__":
    # Download & untar dataset beforehand, then running this would modify the text files
    # in place. Back up the originals if that's a concern.
    cleanup_text_files_in_folder('aclImdb/train/pos/')
    cleanup_text_files_in_folder('aclImdb/train/neg/')
    cleanup_text_files_in_folder('aclImdb/test/pos/')
    cleanup_text_files_in_folder('aclImdb/test/neg/')
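If modifying the files on disk is not desirable, another option (a sketch, not part of the answer above; it assumes the NLTK stop word list and the tutorial's custom_standardization callback) is to strip stop words inside the callback itself with tf.strings.regex_replace. The pattern is built once in ordinary Python; only the tf.strings ops run in the graph, and RE2 (which backs tf.strings.regex_replace) understands the \b word boundary:
import re
import string

import tensorflow as tf
from nltk.corpus import stopwords

# NLTK's English stop words are plain lowercase words, so no regex escaping is needed
stopword_pattern = r'\b(' + '|'.join(stopwords.words("english")) + r')\b'

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # drop whole-word stop word matches, then collapse the leftover whitespace
    no_stopwords = tf.strings.regex_replace(stripped_html, stopword_pattern, '')
    no_stopwords = tf.strings.regex_replace(no_stopwords, r'\s+', ' ')
    return tf.strings.regex_replace(
        no_stopwords, '[%s]' % re.escape(string.punctuation), '')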

Read from and write to a file using functions

I'm trying to create 2 functions.
readfiles(file_path), which reads the file specified by file_path and returns a list of strings containing each line in the file.
writefiles(lines, file_path), which writes the contents of the list lines, line by line, to the file specified by file_path.
When used one after the other, the output file should be an exact copy of the input file (including the formatting).
This is what I have so far.
file_path = ("/myfolder/text.txt", "r")

def readfiles(file_path):
    with open file_path as f:
        for line in f:
            return line
            lst = list[]
            lst = line
            lst.append(line)
    return lst

read_file(file_path)
lines = lst []

def writefiles(lines, file_path):
    with open ("file_path", "w") as f:
    for line in lst:
    f.write(line)
    f.write("\n")
I can get it to kind of work when I use this for reading:
with open("/myfolder/text.txt", "r") as f:
    for line in f:
        print(line, end='')
and this for writing:
with open("/myfolder/text.txt", "w") as f:
    for line in f:
        f.write(line)
        f.write("\n")
But when I try to put them into functions it all messes up.
I'm not sure why. I know it's a simple question, but it's just not clicking for me. I've read the documentation but I'm not following it fully and am at my wits' end. What's wrong with my functions?
I get varying errors from
lst = list[]
^
SyntaxError: invalid syntax
to
lst or list is not callable
Also I know there are similar questions but the ones I found don't seem to define a function.
The problems with your code are explained in the comments below:
file_path = ("/myfolder/text.txt", "r") # this is a tuple of 2 elements; should be file_path = "/myfolder/text.txt"

def readfiles(file_path):
    with open file_path as f: # "open" is a function and will probably throw an error if you use it without parentheses
                              # use open this way: open(file_path, "r")
        for line in f:
            return line       # it will return the first line and exit the function
            lst = list[]      # "lst = []" is how you define a list in python; also you want to define it outside the loop
            lst = line        # you are replacing the list lst with the string in line
            lst.append(line)  # will throw an error because lst is a string now and doesn't have the append method
    return lst

read_file(file_path)          # should be lines = read_file(file_path)
lines = lst []                # lines is an empty list

def writefiles(lines, file_path):
    with open ("file_path", "w") as f:
    for line in lst:          # this line should have 1 more tabulation
    f.write(line)             # this line should have 1 more tabulation
    f.write("\n")             # this line should have 1 more tabulation
Here's how the code should look:
def readfiles(file_path):
    lst = []
    with open(file_path) as f:
        for line in f:
            lst.append(line.strip("\n"))
    return lst

def writefiles(lines, file_path):
    with open(file_path, "w") as f:
        for line in lines:
            f.write(line + "\n")

file_path = "/myfolder/text.txt"
filepathout = "myfolder/text2.txt"

lines = readfiles(file_path)
writefiles(lines, filepathout)
A more Pythonic way to do it:
# readlines is a built-in method of file objects in python
with open(file_path) as f:
    lines = f.readlines()

# stripping line returns
lines = [line.strip("\n") for line in lines]

# join will convert the list to a string by adding a \n between the list elements
with open(filepathout, "w") as f:
    f.write("\n".join(lines))
Key points:
- the function stops after reaching the return statement
- be careful where you define your variables, e.g. "lst" defined inside a for loop will get redefined after each iteration
Defining variables:
- for a list: list_var = []
- for a tuple: tup_var = (1, 2)
- for an int: int_var = 3
- for a dictionary: dict_var = {}
- for a string: string_var = "test"
A couple of learning points here that will help.
In your reading function, you are kinda close. However, you cannot put the return statement in the loop: as soon as the function hits it for the first time, the function ends. Also, if you are going to make a container to hold the list of things read, you need to make it before you start your loop. Lastly, don't name anything list, since that shadows the built-in type. If you want to make a new list, just do something like: results = list() or results = []
So in pseudocode, you should (see the sketch after this answer):
- make a list to hold results
- open the file as you are now
- make a loop to go through the lines
- append to the results list
- return the results (outside the loop)
Your writefiles is very close. You should be looping through the lines variable that is a parameter of your function. Right now you are referencing lst which is not a parameter of your function.
Good luck!
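Put concretely, a minimal sketch of those steps (essentially the same shape as the corrected functions in the earlier answer, using the results naming suggested here):
def readfiles(file_path):
    results = []                  # container made before the loop
    with open(file_path) as f:
        for line in f:
            results.append(line)  # append inside the loop
    return results                # return once, outside the loop

def writefiles(lines, file_path):
    with open(file_path, "w") as f:
        for line in lines:        # loop over the parameter, not lst
            f.write(line)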

Python: split itertools output to multiple files (BIG output)

So I have created a script that reads lines from a file (1,500 lines),
writes them out 10 per line
(producing every possible combination with product: a b c d a, a b c d b, etc.).
The thing is, the moment I run the script my computer freezes completely (because it writes so much data).
So I thought: is it possible to have the script save to a file every 100 MB and record the current state, so that when I run it again it will actually continue from where it stopped (the last line of the 100 MB file)?
Or if you have another solution I would love to hear it :P
Here's the script:
from itertools import product

with open('file.txt', 'r') as f:
    content = f.readlines()

comb = product(content, repeat=10)
new_content = [elem for elem in list(comb)]

with open('log.txt', 'w') as f:
    for line in new_content:
        f.write(str(line) + '\n')
The line
new_content = [elem for elem in list(comb)]
takes the generator and transforms it into a list in memory, twice. The result is the same as just doing
new_content = list(comb)
Your computer freezes up because this will use all of the available RAM.
Since you use new_content only for iterating over it, you could just iterate over the initial generator directly instead:
from itertools import product

with open('file.txt', 'r') as f:
    content = f.readlines()

comb = product(content, repeat=10)

with open('log.txt', 'w') as f:
    for line in comb:
        f.write(str(line) + '\n')
But now this will fill up your hard disk, since with an input size of 1,500 lines it will produce 57665039062500000000000000000000 (1500**10) lines of output.
I would open the file in a separate function and yield a line at a time - that way you're never going to blow your memory.
def read_file(filename):
    with open(filename, "r") as f:
        for line in f:
            yield line
Then you can use this in your code:
with open("log.txt", "w") as out:
    for line in read_file("file.txt"):
        out.write(line)

Issue opening large number of files in Python

I'm trying to process a pipe-separated text file with the following format:
18511|1|2587198|2004-03-31|0|100000|0|1.97|0.49988|100000||||
18511|2|2587198|2004-06-30|0|160000|0|3.2|0.79669|60000|60|||
18511|3|2587198|2004-09-30|0|160000|0|2.17|0.79279|0|0|||
18511|4|2587198|2004-09-30|0|160000|0|1.72|0.79118|0|0|||
18511|5|2587198|2005-03-31|0|0|0|0|0|-160000|-100|||19
18511|1|2587940|2004-03-31|0|240000|0|0.78|0.27327|240000||||
18511|2|2587940|2004-06-30|0|560000|0|1.59|0.63576|320000|133.33||24|
18511|3|2587940|2004-09-30|0|560000|0|1.13|0.50704|0|0|||
18511|4|2587940|2004-09-30|0|560000|0|0.96|0.50704|0|0|||
18511|5|2587940|2005-03-31|0|0|0|0|0|-560000|-100|||14
For each line, I want to isolate the second field and write that line to a file with that field as part of the filename, e.g. issue1.txt, issue2.txt, where the number is the second field in the file excerpt above. This number can be in the range 1 to 56. My code is shown below:
with open('d:\\tmp\issueholding.txt') as f, open('d:\\tmp\issue1.txt', 'w') as out_f1,\
open('d:\\tmp\issue2.txt', 'w') as out_f2,open('d:\\tmp\issue3.txt', 'w') as out_f3,\
open('d:\\tmp\issue4.txt', 'w') as out_f4,open('d:\\tmp\issue5.txt', 'w') as out_f5,\
open('d:\\tmp\issue6.txt', 'w') as out_f6,open('d:\\tmp\issue7.txt', 'w') as out_f7,\
open('d:\\tmp\issue8.txt', 'w') as out_f8,open('d:\\tmp\issue9.txt', 'w') as out_f9,\
open('d:\\tmp\issue10.txt', 'w') as out_f10,open('d:\\tmp\issue11.txt', 'w') as out_f11,\
open('d:\\tmp\issue12.txt', 'w') as out_f12,open('d:\\tmp\issue13.txt', 'w') as out_f13,\
open('d:\\tmp\issue14.txt', 'w') as out_f14,open('d:\\tmp\issue15.txt', 'w') as out_f15,\
open('d:\\tmp\issue16.txt', 'w') as out_f16,open('d:\\tmp\issue17.txt', 'w') as out_f17,\
open('d:\\tmp\issue18.txt', 'w') as out_f18,open('d:\\tmp\issue19.txt', 'w') as out_f19,\
open('d:\\tmp\issue20.txt', 'w') as out_f20,open('d:\\tmp\issue21.txt', 'w') as out_f21,\
open('d:\\tmp\issue22.txt', 'w') as out_f22,open('d:\\tmp\issue23.txt', 'w') as out_f23,\
open('d:\\tmp\issue24.txt', 'w') as out_f24,open('d:\\tmp\issue25.txt', 'w') as out_f25,\
open('d:\\tmp\issue32.txt', 'w') as out_f32,open('d:\\tmp\issue33.txt', 'w') as out_f33,\
open('d:\\tmp\issue34.txt', 'w') as out_f34,open('d:\\tmp\issue35.txt', 'w') as out_f35,\
open('d:\\tmp\issue36.txt', 'w') as out_f36,open('d:\\tmp\issue37.txt', 'w') as out_f37,\
open('d:\\tmp\issue38.txt', 'w') as out_f38,open('d:\\tmp\issue39.txt', 'w') as out_f39,\
open('d:\\tmp\issue40.txt', 'w') as out_f40,open('d:\\tmp\issue41.txt', 'w') as out_f41,\
open('d:\\tmp\issue42.txt', 'w') as out_f42,open('d:\\tmp\issue43.txt', 'w') as out_f43,\
open('d:\\tmp\issue44.txt', 'w') as out_f44,open('d:\\tmp\issue45.txt', 'w') as out_f45,\
open('d:\\tmp\issue46.txt', 'w') as out_f46,open('d:\\tmp\issue47.txt', 'w') as out_f47,\
open('d:\\tmp\issue48.txt', 'w') as out_f48,open('d:\\tmp\issue49.txt', 'w') as out_f49,\
open('d:\\tmp\issue50.txt', 'w') as out_f50,open('d:\\tmp\issue51.txt', 'w') as out_f51,\
open('d:\\tmp\issue52.txt', 'w') as out_f52,open('d:\\tmp\issue53.txt', 'w') as out_f53,\
open('d:\\tmp\issue54.txt', 'w') as out_f54,open('d:\\tmp\issue55.txt', 'w') as out_f55,\
open('d:\\tmp\issue56.txt', 'w') as out_f56:
    for line in f:
        field1_end = line.find('|') + 1
        field2_end = line.find('|', field1_end)
        f2 = line[field1_end:field2_end]
        out_f56.write(line)
My two issues are:
1) When trying to run the above, I get the following error message:
File "", line unknown
SyntaxError: too many statically nested blocks
2) How do I change the line out_f56.write(line) so that I can use the variable f2 to select the output file rather than hard-coding it?
I am running this in a Jupyter notebook with Python 3 under Windows. To be clear, the input file has approximately 235 million records, so performance is key.
I appreciate any help or suggestions.
Try something like this (see comments in code for explanation):
with open(R"d:\tmp\issueholding.txt") as f:
    for line in f:
        # splitting line into list of strings at '|' character
        fields = line.split('|')
        # defining output file name according to issue code in second field
        # NB: list indexes are zero-based, therefore use 1
        out_name = R"d:\tmp\issue%s.txt" % fields[1]
        # opening output file and writing current line to it
        # NB: make sure you use the 'a+' mode to append to the existing file
        with open(out_name, 'a+') as ff:
            ff.write(line)
To avoid opening files repeatedly inside the reading loop, you could do the following:
from collections import defaultdict

with open(R"D:\tmp\issueholding.txt") as f:
    # setting up dictionary to hold lines grouped by issue code
    # using a defaultdict here to automatically create a list when inserting
    # the first item
    collected_issues = defaultdict(list)
    for line in f:
        # splitting line into list of strings at '|' character and retrieving
        # current issue code from second token
        issue_code = line.split('|')[1]
        # appending current line to list of collected lines associated with
        # current issue code
        collected_issues[issue_code].append(line)
    else:
        for issue_code in collected_issues:
            # defining output file name according to issue code
            out_name = R"D:\tmp\issue%s.txt" % issue_code
            # opening output file and writing collected lines to it
            with open(out_name, 'a+') as ff:
                ff.write("".join(collected_issues[issue_code]))
This of course creates an in-memory dictionary holding all lines retrieved from the input file. Given your specification, this could very well not be feasible on your machine. An alternative would be to split up the input file and process it chunk by chunk instead. This can be achieved by defining a corresponding generator that reads a defined number of lines (here: 1000) from the input file. A possible final solution could then look like this:
from itertools import islice
from collections import defaultdict

def get_chunk_of_lines(file, N):
    """
    Retrieves N lines from the specified opened file.
    """
    return [x.strip() for x in islice(file, N)]

def collect_issues(lines):
    """
    Collects and groups issues from the specified lines.
    """
    collected_issues = defaultdict(list)
    for line in lines:
        # splitting line into list of strings at '|' character and retrieving
        # current issue code from second token
        issue_code = line.split('|')[1]
        # appending current line to list of collected lines associated with
        # current issue code
        collected_issues[issue_code].append(line)
    return collected_issues

def export_grouped_issues(issues):
    """
    Exports the collected and grouped issues.
    """
    for issue_code in issues:
        # defining output file name according to issue code
        out_name = R"D:\tmp\issue%s.txt" % issue_code
        # opening output file and writing collected lines to it
        # (the chunked lines were stripped, so the newlines are re-added here)
        with open(out_name, 'a+') as f:
            f.write("\n".join(issues[issue_code]) + "\n")

with open(R"D:\tmp\issueholding.txt") as issue_src:
    chunk_cnt = 0
    while True:
        # retrieving 1000 input lines at a time
        line_chunk = get_chunk_of_lines(issue_src, 1000)
        # exiting the while loop if no chunk is left
        if not line_chunk:
            break
        chunk_cnt += 1
        print("+ Working on chunk %d" % chunk_cnt)
        # collecting, grouping and exporting issues
        issues = collect_issues(line_chunk)
        export_grouped_issues(issues)
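If keeping all output files open at once is acceptable (there are at most 56 issue codes), another option is a dictionary of file handles managed by contextlib.ExitStack. This avoids both the 56-way nested with statement and reopening files inside the loop; a sketch, assuming the same paths as above:
from contextlib import ExitStack

with open(R"D:\tmp\issueholding.txt") as f, ExitStack() as stack:
    out_files = {}  # issue code -> open file handle
    for line in f:
        issue_code = line.split('|', 2)[1]
        if issue_code not in out_files:
            # open each output file only once; ExitStack closes them all at the end
            out_files[issue_code] = stack.enter_context(
                open(R"D:\tmp\issue%s.txt" % issue_code, 'w'))
        out_files[issue_code].write(line)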

ZipFile cracker works with a few combinations and crashes when many combinations are used

I've found many examples of zip crackers written in Python, but unfortunately they were either written in Python 2 or had functionality I do not need (i.e. use of dictionaries saved in files). I was interested in checking how long and how much memory it would take to break, let's say, a password of 5-10 different symbols (a-z, a-zA-Z, a-zA-Z1-10, etc.). Afterwards I can try different libraries and techniques (threads, etc.) to improve the performance of the code and, hopefully, get a better understanding of Python mechanics in the process.
Here is my program. It works well when it tries 2-character passwords (a-zA-Z) and crashes with longer passwords.
import os
import shutil
import zipfile
from itertools import permutations, combinations_with_replacement
from string import ascii_letters

# passgen() yields passwords to be checked
def passgen(passminlength, passmaxlength, searchdict):
    prevpwd = []
    for n in range(passminlength, passmaxlength):
        for p in combinations_with_replacement(searchdict, n):
            for k in permutations(p, n):
                pwd_tmp = ''.join(k)
                if prevpwd != pwd_tmp:  # without this check passgen() yields
                    prevpwd = pwd_tmp   # recurring password combinations
                    yield pwd_tmp       # due to the logic behind permutations()

if __name__ == '__main__':
    zFile = zipfile.ZipFile("secret.zip", 'r')  # encrypted file to crack
    pwd = None  # password to find
    output_directory = os.path.curdir + "/unzip_tmp"  # output tmp folder for extracted files
    if not os.path.isdir(output_directory):  # if it exists - delete, otherwise - create
        os.makedirs('unzip_tmp')
    else:
        shutil.rmtree(output_directory)
        os.makedirs('unzip_tmp')
    searchdict = list(ascii_letters)  # list with symbols for brute force: a-zA-Z
    passminlength = 1
    passmaxlength = 3  # code works with passmaxlength=2, doesn't - with passmaxlength=3
    pwd_tmp = passgen(passminlength, passmaxlength, searchdict)  # pwd_tmp is an iterator
    while True:
        try:
            tmp = next(pwd_tmp)
        except StopIteration:  # iterator is empty -> quit while-loop
            break
        print("trying..:%s" % tmp)
        zFile.setpassword(bytes(tmp, 'ascii'))
        try:
            zFile.extractall(output_directory)
            pwd = tmp
            break  # password is found -> quit while-loop
        except RuntimeError:  # checked password is wrong -> go again through the while loop
            print("wrong password:%s" % tmp)
    print("password is:%s" % pwd)
The program crashes with passmaxlength=3 on the line with zFile.extractall(output_directory).
The error is:
Traceback (most recent call last):
  File "C:\Experiments\Eclipse\Workspace\messingaroundpython\zipcracker.py", line 48, in <module>
    zFile.extractall(output_directory)
  File "C:\Python34\lib\zipfile.py", line 1240, in extractall
    self.extract(zipinfo, path, pwd)
  File "C:\Python34\lib\zipfile.py", line 1228, in extract
    return self._extract_member(member, path, pwd)
  File "C:\Python34\lib\zipfile.py", line 1292, in _extract_member
    shutil.copyfileobj(source, target)
  File "C:\Python34\lib\shutil.py", line 67, in copyfileobj
    buf = fsrc.read(length)
  File "C:\Python34\lib\zipfile.py", line 763, in read
    data = self._read1(n)
  File "C:\Python34\lib\zipfile.py", line 839, in _read1
    data = self._decompressor.decompress(data, n)
zlib.error: Error -3 while decompressing data: invalid distance too far back
I am stuck. Any idea what I could be missing?
Update: It seems to be a bug in zlib. I added an exception catcher to ignore bad combinations:
except RuntimeError:  # checked password is wrong -> go again through the while loop
    print("wrong password:%s" % tmp)
except Exception as err:
    print("zlib bug, wrong password?:%s" % tmp)
Now the program can process much longer passwords.
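A narrower variant (a sketch, with the rest of the loop unchanged and import zlib added at the top) is to catch zlib.error and zipfile.BadZipFile explicitly instead of a bare Exception, since a wrong password that slips past the initial check typically surfaces as one of those:
except RuntimeError:                      # zipfile rejected the password
    print("wrong password:%s" % tmp)
except (zlib.error, zipfile.BadZipFile):  # bad password produced corrupt data
    print("wrong password (decompression error):%s" % tmp)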
