I want to split a zip file with Python and then join the split files together. I found this code, but I cannot join the split files - python-3.x

Thanks to @Jeronimo
Split a zip archive into multiple chunks
outfile = archive_name
packet_size = int(1.5 * 1024**3)  # bytes
with open(outfile, "rb") as output:
    filecount = 0
    while True:
        data = output.read(packet_size)
        print(len(data))
        if not data:
            break  # we're done
        with open("{}{:03}".format(outfile, filecount), "wb") as packet:
            packet.write(data)
        filecount += 1
After splitting the archive, I cannot join the chunks back together.

Fortunately, I solved this problem myself:
outfile = "archive_name"
packet_size = int(1024*1024*100) # bytes
filenumbers=9 #number of files you want to join
for i in range(filenumbers):
with open("{}.zip{:03}".format(outfile, i), "rb") as packet:
col=packet.read(packet_size)
with open("{}02.zip".format(outfile), "ab+") as mainpackage:
mainpackage.write(col)
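A more flexible variant (just a sketch, assuming the archive_name.zipNNN naming used above; the joined output file name is my own placeholder) discovers the chunks with glob instead of hard-coding their number, and streams each chunk in full so the read size no longer has to match the chunk size:
import glob
import shutil

outfile = "archive_name"
# Collect the chunk files and sort them so they are appended in order.
parts = sorted(glob.glob("{}.zip[0-9][0-9][0-9]".format(outfile)))
with open("{}_joined.zip".format(outfile), "wb") as mainpackage:
    for part in parts:
        with open(part, "rb") as packet:
            # copyfileobj copies the whole chunk in buffered pieces,
            # so chunks larger than any fixed packet_size are still handled.
            shutil.copyfileobj(packet, mainpackage)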

Related

Why is SFTP parallel upload of a file giving me recursively appended data?

I am trying to split a file into chunks and use threads to write it to a file on the SFTP server.
The file name: hi.txt
The data:
hi....
file upload check.
The code to upload:
threads_count = 4
data_ap = {}
size = os.path.getsize(local_path)
part_size = int(size / threads_count)
lock = threading.Lock()

def open_ssh():
    # ssh connection codes
    return ssh

def upload_part(num, offset, part_size, remote_path_part):
    # print(f"Running thread {num}")
    try:
        ssh = open_ssh()
        sftp = ssh.open_sftp()
        with open(local_path, "rb") as fl:
            fl.seek(offset)
            with lock:
                with sftp.open(remote_path_part, "ab") as fr:
                    fr.set_pipelined(True)
                    size = 0
                    while size < part_size:
                        s = 32768
                        if size + s > part_size:
                            s = part_size - size
                        data = fl.read(s)
                        data_ap[num] = data
                        print(data)
                        # print({offset : data})
                        fr.write(str(data_ap))
                        # fr.write(data)
                        size += len(data)
                        if len(data) == 0:
                            break
    except (paramiko.ssh_exception.SSHException) as x:
        print(f"Thread {num} failed: {x}")
    # print(f"Thread {num} done")
    # ssh.close()

# start_time = time.time()
print("Starting")
offset = 0
threads = []
part_filenames = []
for num in range(threads_count):
    if num == threads_count - 1:
        part_size = size - offset
    # remote_path_part = f"{remote_path}.{num}"
    args = (num, offset, part_size, remote_path)
    # print(f"Starting thread {num} offset {offset} size {part_size} " + f"part name {remote_path}")
    thread = threading.Thread(target=upload_part, args=args)
    threads.append(thread)
    part_filenames.append(remote_path)
    thread.start()
    # print(f"Started thread {num}")
    offset += part_size
for num in range(len(threads)):
    # print(f"Waiting for thread {num}")
    threads[num].join()
print("All thread done")
Now I have two problems.
First:
The data does not end up in the correct order: because the file is divided into chunks and written by separate threads, the chunks appear in a different order in the uploaded file.
The uploaded data:
upload check. code to uploadhi ... the
Second:
To solve the above issue, I thought of using a dictionary where the key is the thread number and the value is the data, and on download I would reconstruct the file in key order. But the data gets appended recursively, like this.
The uploaded data:
{0: b'file uploa'}{0: b'file uploa', 1: b'd check.\nc'}{0: b'file uploa', 1: b'd check.\nc', 3: b' if appends'}{0: b'file uploa', 1: b'd check.\nc', 3: b' if appends', 2: b'heck again'}
How can I fix this?
It would be preferable to fix the first problem, so that I do not have to use a dictionary to rearrange the data after upload.
Reference for the upload code
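One common fix, sketched below rather than taken from an accepted answer: instead of appending under a shared lock, have each thread write its slice at its own offset in the remote file. paramiko's SFTP file objects support seek(), so the result no longer depends on thread scheduling. This sketch assumes local_path, remote_path and open_ssh() as defined in the question, and that the remote file is created once beforehand (e.g. opened with "wb" and closed) so it can be reopened in "r+" mode.
def upload_part(num, offset, part_size, remote_path):
    # Upload one slice of the local file to the same offset of the remote file.
    ssh = open_ssh()              # assumed to return a connected SSHClient, as in the question
    sftp = ssh.open_sftp()
    try:
        with open(local_path, "rb") as fl, sftp.open(remote_path, "r+b") as fr:
            fl.seek(offset)
            fr.seek(offset)       # write this thread's data at its own offset
            remaining = part_size
            while remaining > 0:
                data = fl.read(min(32768, remaining))
                if not data:
                    break
                fr.write(data)    # write raw bytes, not a dictionary rendered as a string
                remaining -= len(data)
    finally:
        sftp.close()
        ssh.close()
With this approach the lock and the data_ap dictionary are no longer needed.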

How can I merge two files with numbers into a new file and make it sorted?

Code:
# combining the two files
filenames = ["numbers1.txt", "numbers2.txt"]
with open("allNumbers.txt", "w") as al_no:
    # iterating through the filenames list
    for f in filenames:
        with open(f) as infile:
            for line in infile:
                al_no.write(line)
There are two approaches you can use.
The first approach is to loop through the files, append the lines to a list, sort the list, and then write it out to the file.
filenames = ["numbers1.txt", "numbers2.txt"]

# Step 1: merge the lines into a list
lines = []
for f in filenames:
    with open(f) as infile:
        for line in infile:
            lines.append(line)

# Step 2: write the list out to the file in a sorted order
with open("allNumbers.txt", "w") as all_no:
    all_no.write(''.join(sorted(lines, key=lambda x: int(x.strip()))))
It is more succinct (and Pythonic) to use list comprehensions instead:
filenames = ["numbers1.txt", "numbers2.txt"]
lines = [line for sublist in [open(f).readlines() for f in filenames] for line in sublist]
with open("allNumbers.txt", "w") as all_no:
    all_no.write(''.join(sorted(lines, key=lambda x: int(x.strip()))))
Remember that when sorting, you need to use the key argument to sorted to ensure a numeric sort is done, rather than the default lexicographic sort.
This code assumes that each line in the source files contains a number, which is likely given the current approach you've taken.
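As a quick illustration (my own example, not part of the original answer), a plain string sort puts "10" before "9", while the numeric key gives the intended order:
values = ["9\n", "10\n", "2\n"]
print(sorted(values))                                # ['10\n', '2\n', '9\n'] - lexicographic
print(sorted(values, key=lambda x: int(x.strip())))  # ['2\n', '9\n', '10\n'] - numeric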
You did not provide a detailed description or sample files, so I worked from an assumption about the format.
Try this code:
# Read data from file1
with open('numbers1.txt') as fp:
    data = fp.read()

# Read data from file2
with open('numbers2.txt') as fp:
    data2 = fp.read()

# Merge the 2 files
data += "\n"
data += data2

# Convert str to list
list_data = data.split()

# Convert each item to int
list_data = list(map(int, list_data))

# Sort the list
list_data.sort()

# Save to file
with open('allNumbers.txt', 'w') as file:
    for data in list_data:
        file.write("%s\n" % data)
You can adapt the structure to your needs.
Good luck!

Running a Python script for files in a folder

There are 15 text files in a folder and I am trying to extract certain parts of each file and output them to a new file.
I am able to process each file individually by changing the file name and appending the result to the output file, but that means copying the same code 15 times and changing only the file name each time.
import glob, os

lst = []
filelist = glob.glob('/C:/Users/bridaly/Documents/PythonTest/Python_Test_ENdata_3080_v20150914/input/*')
for file in filelist:
    if os.path.isfile(file):
        for line in filelist:
            line = line.strip()
            if not (
                line.startswith("APPEND") or line.startswith("_") or
                line.startswith("SAP") or line.startswith("~") or
                line.startswith("INCLUDE") or line.startswith("ABAP") or
                line.strip() == "" or line.startswith("Field") or
                line.startswith("Short")
            ):
                y = line.replace(' ', ' ')
                # print(y)
                z = y.replace('X', '')
                # print(z)
                w = "|".join(z.split())
                # print(w)
                x = w.split("|", 3)[:4]
                # print(x)
                x.insert(0, './input/01BKPF')
                # print(x)
                if len(x) >= 4:
                    t = [s.replace('|', ' ') for s in x]
                    # print(t)
                    print("|".join(t))
                    lst.append("|".join(t))

# Output script
output_file = open('Output_Final.txt', 'w')
for l in lst:
    output_file.write(l)
    output_file.write('\n')
output_file.close()
"""
The output should extract what's written in the code but for each file and append it to the output file. I have gotten the correct output by copying the code 15 times but I just want to use it once as it is more efficient.
files = glob.glob('path')
for file in files:
    file_name = os.path.basename(file)
    print(file_name)
You can iterate over each file this way and run your extraction logic inside the loop (see the sketch below).
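A minimal sketch of that idea (my own, with the path and the filtering reduced to placeholders): match the files once, then iterate over each file's lines inside the loop instead of iterating over the list of file names.
import glob
import os

lst = []
for path in glob.glob('./input/*'):  # placeholder pattern; use the real input folder here
    if not os.path.isfile(path):
        continue
    with open(path) as infile:
        for line in infile:  # iterate over the file's lines, not over the file list
            line = line.strip()
            if line and not line.startswith(("APPEND", "_", "SAP", "~",
                                             "INCLUDE", "ABAP", "Field", "Short")):
                lst.append(line)  # replace with the extraction logic from the question

with open('Output_Final.txt', 'w') as output_file:
    for l in lst:
        output_file.write(l + '\n')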

how to speed up looping over 4GB tab delimited text file

It took me over 3 minutes to loop over a 4gb text file, counting the number of lines, number of words and chars per line as I go. Is there a faster way to do this?
This is my code:
import time
import csv
import sys

csv.field_size_limit(sys.maxsize)

i = 0
countwords = {}
countchars = {}
start = time.time()
with open("filename.txt", "r", encoding="utf-8") as file:
    for line in csv.reader(file, delimiter="\t"):
        i += 1
        countwords[i] = len(str(line).split())
        countchars[i] = len(str(line))
        if i % 10000 == 0:
            print(i)
end = time.time()
if i > 0:
    print(i)
    print(sum(countwords.values()) / i)
    print(sum(countchars.values()) / i)
    print(end - start)
From my limited testing (on a Unix dictionary file) I get only a minor speedup using numpy, but any win is a win. I'm not sure that csv.reader is a good way of parsing tab-delimited text, and I have not checked whether it gives better speed.
import time
import numpy

# Holds count of words and letters per line of input
countwords = numpy.array([])
countchars = numpy.array([])

# Holds total count of words and letters per file
word_sum = 0
char_sum = 0

start = time.time()
file_in = open("filename.txt", "rt", encoding="utf-8")
for line in file_in:
    # cleanup the line, split it into fields by TAB character
    line = line.strip()
    fields = line.split('\t')
    # Count the fields, and the letters of each field's content
    field_count = len(fields)
    char_count = len(line) - field_count  # don't count the '\t' chars too
    # keep a separate count of the fields and letters by line
    # (numpy.append returns a new array, so the result must be re-assigned)
    countwords = numpy.append(countwords, field_count)
    countchars = numpy.append(countchars, char_count)
    # Keep a running total to save summation at the end
    word_sum += field_count
    char_sum += char_count
file_in.close()
end = time.time()

print("Total Words:   %3d" % (word_sum))
print("Total Letters: %3d" % (char_sum))
print("Elapsed Time:  %.2f" % (end - start))
You can avoid allocating extra data by keeping running totals instead of dictionaries:
import time
import csv
import sys
import itertools

csv.field_size_limit(sys.maxsize)

countwords = 0
countchars = 0
start = time.time()
i = 0
with open("filename.txt", "r", encoding="utf-8") as file:
    for i, line in enumerate(csv.reader(file, delimiter="\t"), start=1):
        words = str(line).split()  # we allocate just 1 extra string
        wordsLen = len(words)
        countwords += wordsLen
        # to avoid a possible allocation we iterate through the chars of the words
        # we already have, then add the spaces in between, which is wordsLen - 1
        countchars += sum(1 for _ in itertools.chain.from_iterable(words)) + wordsLen - 1
        if i % 10000 == 0:
            print(i)
end = time.time()
if i > 0:
    print(i)
    print(countwords / i)
    print(countchars / i)
    print(end - start)
I managed to write another fast version (using an idea I saw in a different thread), but it currently has a disadvantage compared to Kingsley's numpy code: it does not keep per-line data, only aggregate totals. In any case, here it is:
import time

start = time.time()
f = open("filename.txt", 'rb')
lines = 0
charcount = 0
wordcount = 0
# i = 10000
buf_size = 1024 * 1024
read_f = f.raw.read
buf = read_f(buf_size)
while buf:
    lines += buf.count(b'\t')
    '''while lines/i > 1:
        print(i)
        i += 10000'''
    charcount += len(buf.strip())
    wordcount += len(buf.strip().split())
    buf = read_f(buf_size)
end = time.time()
print(end - start)
print(lines)
print(charcount / lines)
print(wordcount / lines)
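For reference, a small variant of the buffered approach (my own sketch, not the code above) that counts newline bytes per buffer, which is the usual way to get a true line count from raw reads:
import time

start = time.time()
line_count = 0
buf_size = 1024 * 1024
with open("filename.txt", "rb") as f:
    buf = f.raw.read(buf_size)
    while buf:
        line_count += buf.count(b'\n')  # count line endings rather than tab characters
        buf = f.raw.read(buf_size)
print(line_count, "lines in", time.time() - start, "seconds")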

Split a zip archive into multiple chunks

I'm trying to create a zip archive of a possibly huge folder.
For this purpose I'm using the python zipfile module, but as far as I can see there is no option to split the created archive into multiple chunks with a max size.
The zipped archive is supposed to be sent via Telegram, which has a size limit of 1.5 GB per file. Therefore I need to split the resulting zip archive.
I would really like to not use a subprocess and shell commands for creating this archive.
My current code looks like this:
from zipfile import ZipFile, ZIP_LZMA
import os

def create_zip(archive_name, directory):
    """Create a zip file from given dir path."""
    with ZipFile(archive_name, "w", ZIP_LZMA) as target_zip_file:
        for root, _, files in os.walk(directory):
            for file_to_zip in files:
                absolute_path = os.path.join(root, file_to_zip)
                zip_file_name = absolute_path[len(directory) + len(os.sep):]
                target_zip_file.write(absolute_path, zip_file_name)
    return target_zip_file
Thanks in advance.
Here is what I use to send files to a Telegram channel via a Telegram bot.
The file size limit is 50 MB when uploading through a bot.
The file size limit is 1500 MB when uploading through the Telegram client, but you may add some text or other info, so 1495 MB is safer.
#! /usr/bin/python3
# -*- coding:utf-8 -*-
# apt-get install p7zip-full
import subprocess
import os
import math
import logzero

logger = logzero.logger

MAX_SPLIT_SIZE = 1495


def file_split_7z(file_path, split_size=MAX_SPLIT_SIZE):
    file_path_7z_list = []
    # if the origin file is a 7z file, rename it
    origin_file_path = ""
    if os.path.splitext(file_path)[1] == ".7z":
        origin_file_path = file_path
        file_path = os.path.splitext(origin_file_path)[0] + ".7zo"
        os.rename(origin_file_path, file_path)
    # do 7z compress
    fz = os.path.getsize(file_path) / 1024 / 1024
    pa = math.ceil(fz / split_size)
    head, ext = os.path.splitext(os.path.abspath(file_path))
    archive_head = "".join((head, ext.replace(".", "_"))) + ".7z"
    for i in range(pa):
        check_file_name = "{}.{:03d}".format(archive_head, i + 1)
        if os.path.isfile(check_file_name):
            logger.debug("remove existing file | {}".format(check_file_name))
            os.remove(check_file_name)
    cmd_7z = ["7z", "a", "-v{}m".format(split_size), "-y", "-mx0", archive_head, file_path]
    proc = subprocess.Popen(cmd_7z, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if b"Everything is Ok" not in out:
        logger.error("7z output | {}".format(out.decode("utf-8")))
        logger.error("7z error | {}".format(err.decode("utf-8")))
        return file_path_7z_list
    for i in range(pa):
        file_path_7z_list.append("{}.{:03d}".format(archive_head, i + 1))
    # if the origin file was a 7z file, rename it back
    if origin_file_path:
        os.rename(file_path, origin_file_path)
    return file_path_7z_list


def do_file_split(file_path, split_size=MAX_SPLIT_SIZE):
    """Calculate the split size.

    Example: if the max split size is 1495 and the file size is 2000,
    then the number of parts should be ceil(2000 / 1495) = 2,
    so the split size should be 1000 + 1000 rather than 1495 + 505.
    As the file size increases, the upload risk increases too.
    """
    file_size = os.path.getsize(file_path) / 2 ** 20
    split_part = math.ceil(file_size / split_size)
    new_split_size = math.ceil(file_size / split_part)
    logger.info("file size | {} | split num | {} | split size | {}".format(file_size, split_part, new_split_size))
    file_path_7z_list = file_split_7z(file_path, split_size=new_split_size)
    return file_path_7z_list
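A brief usage sketch (the file name below is just a placeholder): do_file_split returns the list of generated part files, which you would then upload one by one.
parts = do_file_split("/tmp/backup.zip")
for part in parts:
    print("would upload:", part)  # e.g. /tmp/backup_zip.7z.001, /tmp/backup_zip.7z.002, ...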
In case you don't find a better, native way with zipfile, you could still write the file splitting algorithm yourself. Something like this:
outfile = archive_name
packet_size = int(1.5 * 1024**3)  # bytes
with open(outfile, "rb") as output:
    filecount = 0
    while True:
        data = output.read(packet_size)
        print(len(data))
        if not data:
            break  # we're done
        with open("{}{:03}".format(outfile, filecount), "wb") as packet:
            packet.write(data)
        filecount += 1
And something similar to put it back together on the receiver's side.
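For completeness, a matching sketch of the receiving side (assuming the archive_name000, archive_name001, ... chunk names produced above; the rejoined output name is just an example):
import glob

parts = sorted(glob.glob("{}[0-9][0-9][0-9]".format(archive_name)))
with open("rejoined_" + archive_name, "wb") as joined:
    for part in parts:
        with open(part, "rb") as packet:
            joined.write(packet.read())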
