Byte Compression from file to image - python-3.x

I have been trying to compress files based on their byte content, store the compressed result as a PNG image, and then reverse the process to get the original file back.
I have tested every compression method I can and none seem to work well for my test files.
The big test file I am using is a PNG of 7,187 KB (7,358,681 bytes), and the best compression I can get out of it is with blosc, which results in 7,191 KB (7,362,976 bytes).
That is using binary compression only. When I use PIL to open the image and extract the pixels, I can shrink it by almost 70%, down to 2,676 KB (2,740,053 bytes).
code:
from PIL import Image
from PIL.PngImagePlugin import PngInfo
import io, math, os
import zlib, bz2, pylzma, lzma, blosc  ## all tested

def __file_to_bytes(self, fname):
    # raw file bytes: works for any file type, but gains no compression
    with open(fname, 'rb') as fp:
        bts = fp.read()
    return bts

def __image_to_bytes(self, fname):
    # decoded pixel bytes: compresses much better, but only works for images
    with Image.open(fname) as fp:
        w, h = fp.size  ## used this to re-create the image after
        bts = fp.tobytes()
    return bts

def compresser(self, algo, fname, outname):
    bts = self.__file_to_bytes(fname)    # file to bytes = better compatibility and no compression
    # bts = self.__image_to_bytes(fname) # png to bytes = better compression
    compressed_bytes = algo.compress(bts)
    # store the compressed bytes (not the raw ones) in the output image
    compressed_image_output = self.__to_image(outname, compressed_bytes, fname)
    return compressed_image_output

def divide_chunks(self, lst, n):
    # chunking helper referenced below (not shown in the original post; assumed standard implementation)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def __bytes_to_rgb(self, bts):
    padding_len = 0  ## counts the black bytes added to make the data a perfect square
    p = [int(b) for b in bts]
    img = list(self.divide_chunks(p, 3))
    while len(img[-1]) % 3 != 0:
        img[-1].append(0)
        padding_len += 1
    for i in range(len(img)):
        img[i] = tuple(img[i])
    size = math.ceil(math.sqrt(len(img)))
    for i in range(size ** 2 - len(img)):
        img.append((0, 0, 0))
        padding_len += 3
    p = (255, padding_len // 255, padding_len % 255)
    img[-1] = p  ## encodes the padding length into the last pixel as 255*y + z
    return img, size

def __rgb_to_image(self, arr, fname, size, ofname):
    metadata = PngInfo()
    metadata.add_text("filename", ofname)
    output = Image.new('RGB', (size, size))
    output.putdata(arr)
    output.save(fname, pnginfo=metadata)
    return output

def __to_image(self, output_fname, bts, ofname):
    img, size = self.__bytes_to_rgb(bts)
    output_img = self.__rgb_to_image(img, output_fname, size, ofname)
    return output_img
Although I am using a PNG as the input file here, this could be any type of file, so the compression needs to be lossless. There is no time constraint, however, so it doesn't matter how slow the method is.
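For the reverse direction the question mentions but doesn't show, a minimal decode sketch may help (it assumes the padding-marker scheme of __bytes_to_rgb above, and that padding_len >= 3 so the marker pixel never overwrites real data):

def image_to_file(png_name, out_name, algo):
    # reverse pipeline: pixels -> byte stream -> strip padding -> decompress -> file
    with Image.open(png_name) as img:
        pixels = list(img.getdata())
    marker = pixels[-1]                        # (255, y, z) encodes the padding as 255*y + z
    padding_len = 255 * marker[1] + marker[2]
    flat = [c for px in pixels for c in px]    # flatten RGB tuples back into bytes
    data = bytes(flat[:len(flat) - padding_len])
    with open(out_name, 'wb') as fp:
        fp.write(algo.decompress(data))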


Why are albumentations Augmentations (Yolo / YoloV5) altering Bounding Boxes if no augmentations are being placed?

I was using the Albumentations library to perform data augmentations on an object detection dataset that I intended to train a YoloV5 model on.
I have to perform the augmentations separately and save the images locally to disk, and when I do, I notice that some of the returned bounding boxes aren't generated properly.
My augmentations are set up in a separate aug.py file, shown below (augmentations purposefully removed in debugging attempts, see below):
import albumentations as A
import cv2

PROB = 0.5
bbp = A.BboxParams(format="yolo")

horizontal_flip_transform = A.Compose([
], bbox_params=bbp)

vertical_flip_transform = A.Compose([
], bbox_params=bbp)

pixel_dropout_transform = A.Compose([
], bbox_params=bbp)

random_rotate = A.Compose([
], bbox_params=bbp)

# NOTE: THIS METHOD IMPLIES THAT THE IMAGE WIDTHS MUST BE AT LEAST 50 PIXELS
# Remove this aug to remove this constraint
random_crop = A.Compose([
], bbox_params=bbp)

augs = [horizontal_flip_transform, vertical_flip_transform, pixel_dropout_transform, random_rotate, random_crop]

def get_augmentations():
    return augs
And the relevant parts of my implementation for performing the augmentations and saving them to disk are below:
import os
import numpy as np
from PIL import Image
from aug import get_augmentations  # the aug.py shown above

def run_augments_on_image(img_name, bboxes, max_images_to_generate=500):
    ret = []
    img = np.array(Image.open(img_name), dtype=np.uint8)
    transforms = get_augmentations()
    for i in range(min(len(transforms), max_images_to_generate)):
        transformed = transforms[i](image=img, bboxes=bboxes)
        ret.append((transformed["image"], transformed["bboxes"]))
    return ret

def run_and_save_augments_on_image_sets(batch_img_names, bboxes_urls, max_images_to_generate, dataset_dir, trainval):
    num_images = 0
    for i in range(len(batch_img_names)):
        bboxes = []
        with open(os.path.join(dataset_dir, trainval, 'labels', bboxes_urls[i]), 'r') as f:
            for row in f:
                x = row.strip().split(' ')
                x.append(row[0])  # move the class id (first char of the row, assumed single digit) ...
                x.pop(0)          # ... to the end, so each box is [x, y, w, h, class]
                x[0] = float(x[0])
                x[1] = float(x[1])
                x[2] = float(x[2])
                x[3] = float(x[3])
                bboxes.append(x)
        trans = run_augments_on_image(os.path.join(dataset_dir, trainval, 'images', batch_img_names[i]), bboxes)
        img_index = len(os.listdir(os.path.join(dataset_dir, 'train', 'images'))) + len(os.listdir(os.path.join(dataset_dir, 'valid', 'images'))) + 1
        for j in range(len(trans)):
            img_trans, bboxes_trans = trans[j]
            # [i], not [j]: reuse the source image's file extension
            Image.fromarray(img_trans).save(os.path.join(dataset_dir, trainval, 'images', f'image-{img_index}.{batch_img_names[i].split(".")[-1]}'))
            with open(os.path.join(dataset_dir, trainval, 'labels', f'image-{img_index}.txt'), 'w') as f:
                for boxs in bboxes_trans:
                    print(f'{boxs[-1]} {boxs[0]} {boxs[1]} {boxs[2]} {boxs[3]}', file=f)
            num_images += 1
            img_index += 1
            if num_images >= max_images_to_generate:
                break
        if num_images >= max_images_to_generate:
            break
For testing purposes (some of the bounding boxes were off), I removed all the actual augmentations, expecting the input label (one example shown below) to be equal to the augmented label, since no augmentations were applied. But, as you can see, the two labels are different.
img-original.txt
0 0.5662285714285714 0.2740066225165563 0.5297714285714286 0.4837913907284769
img-augmented.txt
0 0.51488 0.47173333333333334 0.6405099999999999 0.6527333333333334
(The labels above are in normalized xywh YOLO format)
Why is albumentations altering the labels? None of the augmentations in aug.py contain anything.
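One way to pin this down is a standalone version of exactly that test: an empty Compose with bbox_params should hand the box back unchanged, so if the printed values differ, the conversion inside the pipeline itself is responsible. A minimal sketch (the image size is arbitrary; the box is the one from img-original.txt):

import numpy as np
import albumentations as A

img = np.zeros((480, 640, 3), dtype=np.uint8)          # dummy image, size is arbitrary
boxes = [[0.5662285714285714, 0.2740066225165563,
          0.5297714285714286, 0.4837913907284769, 0]]  # [x, y, w, h, class]
identity = A.Compose([], bbox_params=A.BboxParams(format="yolo"))
out = identity(image=img, bboxes=boxes)
print(boxes[0])
print(out["bboxes"][0])  # any difference comes from the pipeline's own yolo <-> internal conversion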

Python 3 Multiprocessing and openCV problem with dictionary sharing between processes

I would like to use multiprocessing to compute the SIFT extraction and SIFT matching for object detection.
For now, I have a problem: the function's output is never inserted into the dictionary.
I'm using the Manager class, and the images are opened inside the function, but it does not work.
Finally, my idea is:
Compute the keypoints for every reference image, then use them as parameters for a second function that compares and matches them against the keypoints and descriptors of the test image.
My code is:
# %% Import Section
import cv2
import numpy as np
from matplotlib import pyplot as plt
import os
from datetime import datetime
from multiprocessing import Process, cpu_count, Manager, Lock
import argparse
# %% path section
tests_path = 'TestImages/'
references_path = 'ReferenceImages2/'
result_path = 'ResultParametrizer/'
# %% Number of processors
cpus = cpu_count()
# %% parameter section
eps = 1e-7
useTwo = False  # using the m and n keypoints; works better with False
# good point parameters
distance_coefficient = 0.75
# gms parameters
gms_thresholdFactor = 3
gms_withRotation = True
gms_withScale = True
# flann parameters
flann_trees = 5
flann_checks = 50
# %% Lock
lock = Lock()
# %% function definition
def keypointToDictionaries(keypoint):
    x, y = keypoint.pt
    pt = float(x), float(y)
    angle = float(keypoint.angle) if keypoint.angle is not None else None
    size = float(keypoint.size) if keypoint.size is not None else None
    response = float(keypoint.response) if keypoint.response is not None else None
    class_id = int(keypoint.class_id) if keypoint.class_id is not None else None
    octave = int(keypoint.octave) if keypoint.octave is not None else None
    return {
        'pt': pt,  # key named 'pt' so it round-trips with dictionariesToKeypoint below
        'angle': angle,
        'size': size,
        'response': response,
        'class_id': class_id,
        'octave': octave
    }
def dictionariesToKeypoint(dictionary):
    kp = cv2.KeyPoint()
    kp.pt = dictionary['pt']
    kp.angle = dictionary['angle']
    kp.size = dictionary['size']
    kp.response = dictionary['response']
    kp.octave = dictionary['octave']
    kp.class_id = dictionary['class_id']
    return kp
def rootSIFT(dictionary, image_name, image_path, eps=eps):
    # SIFT init
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    sift = cv2.xfeatures2d.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    descriptors /= (descriptors.sum(axis=1, keepdims=True) + eps)
    descriptors = np.sqrt(descriptors)
    print('Finished computing, PID: ', os.getpid())
    lock.acquire()
    # cv2.KeyPoint objects are not picklable, so they cannot cross the Manager proxy
    # directly: convert them to plain dictionaries first. Also note that mutating a
    # nested dict inside a Manager dict does not propagate, so reassign the whole entry.
    entry = dictionary[image_name]
    entry['keypoints'] = [keypointToDictionaries(kp) for kp in keypoints]
    entry['descriptors'] = descriptors
    dictionary[image_name] = entry
    lock.release()
def featureMatching(reference_image, reference_descriptors, reference_keypoints, test_image, test_descriptors,
                    test_keypoints, flann_trees=flann_trees, flann_checks=flann_checks):
    # FLANN parameters
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=flann_trees)
    search_params = dict(checks=flann_checks)  # or pass an empty dictionary
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    flann_matches = flann.knnMatch(reference_descriptors, test_descriptors, k=2)
    matches_copy = []
    for i, (m, n) in enumerate(flann_matches):
        if m.distance < distance_coefficient * n.distance:
            matches_copy.append(m)
    gsm_matches = cv2.xfeatures2d.matchGMS(reference_image.shape, test_image.shape, keypoints1=reference_keypoints,
                                           keypoints2=test_keypoints, matches1to2=matches_copy,
                                           withRotation=gms_withRotation, withScale=gms_withScale,
                                           thresholdFactor=gms_thresholdFactor)
    return gsm_matches
# %% Starting reference list file creation
if __name__ == '__main__':  # required for multiprocessing on spawn-based platforms (macOS/Windows)
    reference_init = datetime.now()
    print('Start reference file list creation')
    reference_image_process_list = []
    manager = Manager()
    reference_image_dictionary = manager.dict()
    reference_image_list = manager.list()
    for root, directories, files in os.walk(references_path):
        for file in files:
            if file.endswith('.DS_Store'):
                continue
            reference_image_path = os.path.join(root, file)
            reference_name = file.split('.')[0]
            image = cv2.imread(reference_image_path, cv2.IMREAD_GRAYSCALE)
            reference_image_dictionary[reference_name] = {
                'image': image,
                'keypoints': None,
                'descriptors': None
            }
            # pass the managed dictionary (not the list) so rootSIFT can fill it in
            proc = Process(target=rootSIFT, args=(reference_image_dictionary, reference_name, reference_image_path))
            reference_image_process_list.append(proc)
            proc.start()
    for proc in reference_image_process_list:
        proc.join()
    reference_end = datetime.now()
    reference_time = reference_end - reference_init
    print('End reference file list creation, time required: ', reference_time)
I faced pretty much the same error. In my case, it seems the code hangs at detectAndCompute, not when creating the dictionary. For some reason, SIFT feature extraction is not multiprocessing-safe (to my understanding this is the case on Macs, but I am not totally sure).
I found this in a GitHub thread. Many people say it works, but I couldn't get it to work. (Edit: I tried this later, and it works fine.)
Instead I used multithreading, which is pretty much the same code and works perfectly. Of course, you need to take the multithreading vs. multiprocessing trade-offs into account.
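For reference, a minimal sketch of that multithreading swap (reference_paths is a hypothetical list of (name, path) pairs; the extraction is the same rootSIFT math as above, minus the Manager machinery). Since OpenCV generally releases the GIL inside its native calls, the threads can still run the extraction in parallel:

import threading
import cv2
import numpy as np
from concurrent.futures import ThreadPoolExecutor

results = {}
results_lock = threading.Lock()

def extract(name, path):
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    sift = cv2.xfeatures2d.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    descriptors /= (descriptors.sum(axis=1, keepdims=True) + 1e-7)
    descriptors = np.sqrt(descriptors)
    with results_lock:
        # threads share memory, so cv2.KeyPoint objects can be stored directly
        results[name] = (keypoints, descriptors)

with ThreadPoolExecutor() as pool:
    for name, path in reference_paths:  # hypothetical (name, path) pairs
        pool.submit(extract, name, path)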

How can the code be modified so that multiple images can be read and stored in an array, to be used for LSB steganography?

The problem is that this code only handles one image, and I need to extend it so that multiple images (their width, height, etc.) can be stored.
I am not fluent in Python. I worked with it about 4 years ago, but by now I have forgotten most of the syntax.
def __init__(self, im):
    self.image = im
    self.height, self.width, self.nbchannels = im.shape
    self.size = self.width * self.height
    self.maskONEValues = [1, 2, 4, 8, 16, 32, 64, 128]
    # Masks used to set a bit to one, e.g. 1 -> 00000001, 2 -> 00000010 ... used with bitwise OR
    self.maskONE = self.maskONEValues.pop(0)  # will be used to do bitwise operations
    self.maskZEROValues = [254, 253, 251, 247, 239, 223, 191, 127]
    # Masks used to set a bit to zero, e.g. 254 -> 11111110, 253 -> 11111101 ... used with bitwise AND
    self.maskZERO = self.maskZEROValues.pop(0)
    self.curwidth = 0   # current width position
    self.curheight = 0  # current height position
    self.curchan = 0    # current channel position
I want to store multiple images (their width, height, etc.) from a file path (that contains these images) in an array.
Try:
from PIL import Image
import os

# This list will store the data of the images
Image_data = []
dir_path = r"C:\Users\vasudeos\Pictures"
for file in os.listdir(dir_path):
    if file.lower().endswith(".png"):
        # Creating the image file object
        img = Image.open(os.path.join(dir_path, file))
        # Getting dimensions of the image
        x, y = img.size
        # Getting channels of the image
        channel = img.mode
        img.close()
        # Adding the data of the image file to our list
        Image_data.append((channel, (x, y)))
print(Image_data)
Just change the dir_path variable to the directory of your image files. This code stores the color channel and dimensions of each image in a separate tuple unique to that file, and adds the tuple to a list.
P.S.:
Tuple format = (channels, dimensions)
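If the goal is to feed each of those images into the LSB class from the question, a sketch along the same lines (LSBMatrix is a hypothetical name for the class whose __init__ is shown above; cv2.imread is used because im.shape implies a NumPy array):

import os
import cv2

def load_carriers(dir_path):
    # build one LSBMatrix per image file found in the directory
    carriers = []
    for file in os.listdir(dir_path):
        if file.lower().endswith(".png"):
            im = cv2.imread(os.path.join(dir_path, file))  # height x width x channels
            if im is not None:
                carriers.append(LSBMatrix(im))
    return carriers

carriers = load_carriers(r"C:\Users\vasudeos\Pictures")
print([(c.width, c.height, c.nbchannels) for c in carriers])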

How to generate heat map on the Whole Slide Images (.svs format) using some probability values?

I am trying to generate a heat map, or probability map, for Whole Slide Images (WSIs) using probability values. I have coordinate points (which determine areas on the WSIs) and corresponding probability values.
Basic introduction to WSIs: WSIs are large in size (almost 100,000 x 100,000 pixels), so they can't be opened with a normal image viewer; they are processed using the OpenSlide library instead.
I have seen previous Stack Overflow posts related to heat maps and followed several of them, but as WSIs are processed differently, I was unable to figure out how to apply those solutions here.
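For orientation, a minimal OpenSlide sketch (the file name is a placeholder) showing how a WSI is usually opened and sampled without ever decoding the full-resolution image:

import openslide

slide = openslide.OpenSlide("slide.svs")           # placeholder path
print(slide.level_dimensions)                      # pyramid levels, largest first
thumb = slide.get_thumbnail((1024, 1024))          # small PIL image for quick viewing
region = slide.read_region((0, 0), 0, (299, 299))  # (x, y) at level 0, patch size
slide.close()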
To generate a heat map on WSIs, follow the instructions below.
First of all, extract image patches and save their coordinates; use the code below for patch extraction. The code requires some changes depending on your requirements. It has been copied from: patch extraction code link
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import logging
try:
    import Image
except ImportError:
    from PIL import Image
import math
import numpy as np
import openslide
import os
from time import strftime, gmtime

parser = argparse.ArgumentParser(description='Extract a series of patches from a whole slide image')
parser.add_argument("-i", "--image", dest='wsi', nargs='+', required=True, help="path to a whole slide image")
parser.add_argument("-p", "--patch_size", dest='patch_size', default=299, type=int, help="pixel width and height for patches")
parser.add_argument("-b", "--grey_limit", dest='grey_limit', default=0.8, type=float, help="greyscale value to determine if there is sufficient tissue present [default: `0.8`]")
parser.add_argument("-o", "--output", dest='output_name', default="output", help="Name of the output file directory [default: `output/`]")
parser.add_argument("-v", "--verbose",
                    dest="logLevel",
                    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                    default="INFO",
                    help="Set the logging level")
args = parser.parse_args()

if args.logLevel:
    logging.basicConfig(level=getattr(logging, args.logLevel))

wsi = ' '.join(args.wsi)

""" Set global variables """
mean_grey_values = args.grey_limit * 255
number_of_useful_regions = 0
wsi = os.path.abspath(wsi)
outname = os.path.abspath(args.output_name)
basename = os.path.basename(wsi)
level = 0

def main():
    img, num_x_patches, num_y_patches = open_slide()
    logging.debug('img: {}, num_x_patches = {}, num_y_patches: {}'.format(img, num_x_patches, num_y_patches))
    for x in range(num_x_patches):
        for y in range(num_y_patches):
            img_data = img.read_region((x * args.patch_size, y * args.patch_size), level,
                                       (args.patch_size, args.patch_size))
            print_pics(x * args.patch_size, y * args.patch_size, img_data, img)
    # the counter tracks useful regions, so subtract from 100 to report the uninformative share
    pc_uninformative = 100 - number_of_useful_regions / (num_x_patches * num_y_patches) * 100
    pc_uninformative = round(pc_uninformative, 2)
    logging.info('Completed patch extraction of {} images.'.format(number_of_useful_regions))
    logging.info('{}% of the image is uninformative\n'.format(pc_uninformative))

def print_pics(x_top_left, y_top_left, img_data, img):
    if x_top_left % 100 == 0 and y_top_left % 100 == 0 and x_top_left != 0:
        pc_complete = round(x_top_left / img.level_dimensions[0][0], 2) * 100
        logging.info('{:.2f}% Complete at {}'.format(pc_complete, strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())))
        # exit()  # present in the copied snippet; disabled here so extraction continues past the first progress report
    img_data_np = np.array(img_data)
    """ Convert to grayscale """
    grey_img = rgb2gray(img_data_np)
    if np.mean(grey_img) < mean_grey_values:
        logging.debug('Image grayscale = {} compared to threshold {}'.format(np.mean(grey_img), mean_grey_values))
        global number_of_useful_regions
        number_of_useful_regions += 1
        wsi_base = os.path.basename(wsi)
        wsi_base = wsi_base.split('.')[0]
        img_name = wsi_base + "_" + str(x_top_left) + "_" + str(y_top_left) + "_" + str(args.patch_size)
        # write_img_rotations(img_data_np, img_name)
        logging.debug('Saving {} {} {}'.format(x_top_left, y_top_left, np.mean(grey_img)))
        save_image(img_data_np, 1, img_name)

def gen_x_and_y(xlist, ylist, img):
    for x in xlist:
        for y in ylist:
            img_data = img.read_region((x * args.patch_size, y * args.patch_size), level,
                                       (args.patch_size, args.patch_size))
            yield (x, y, img_data)

def open_slide():
    """
    The first level is always the main image.
    Get the width and height tuple for the first level.
    """
    logging.debug('img: {}'.format(wsi))
    img = openslide.OpenSlide(wsi)
    img_dim = img.level_dimensions[0]
    """
    Determine what the patch size should be, and how many iterations it will take to get through the WSI
    """
    num_x_patches = int(math.floor(img_dim[0] / args.patch_size))
    num_y_patches = int(math.floor(img_dim[1] / args.patch_size))
    remainder_x = img_dim[0] % num_x_patches
    remainder_y = img_dim[1] % num_y_patches
    logging.debug('The WSI shape is {}'.format(img_dim))
    logging.debug('There are {} x-patches and {} y-patches to iterate through'.format(num_x_patches, num_y_patches))
    return img, num_x_patches, num_y_patches

def validate_dir_exists():
    if os.path.isdir(outname) == False:
        os.mkdir(outname)
    logging.debug('Validated {} directory exists'.format(outname))
    if os.path.exists(wsi):
        logging.debug('Found the file {}'.format(wsi))
    else:
        logging.debug('Could not find the file {}'.format(wsi))
        exit()

def rgb2gray(rgb):
    """Converts an RGB image into grayscale """
    r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
    return gray

def save_image(img, j, img_name):
    tmp = os.path.join(outname, img_name + "_" + str(j) + ".png")
    try:
        im = Image.fromarray(img)
        im.save(tmp)
    except Exception:
        print('Could not save {}'.format(tmp))
        exit()

if __name__ == '__main__':
    validate_dir_exists()
    main()
Secondly, generate a probability value for each patch.
Finally, replace all the pixel values within each patch's coordinates with its probability value and display the result using a color map.
This is the basic idea of generating a heat map on WSIs. You can modify the code and the concept to get the heat map you are after.
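As a rough illustration of that final step, here is a minimal sketch (patch size, WSI dimensions, coordinates, and probabilities are all placeholders) that paints one probability per patch into a downsampled grid and renders it with a color map:

import numpy as np
import matplotlib.pyplot as plt

patch_size = 299
wsi_w, wsi_h = 100000, 100000                      # placeholder WSI dimensions
patch_probs = [((0, 0), 0.12), ((299, 0), 0.87),   # placeholder ((x, y), probability)
               ((0, 299), 0.45)]                   # pairs, one per extracted patch

heat = np.full((wsi_h // patch_size, wsi_w // patch_size), np.nan)  # one cell per patch
for (x, y), p in patch_probs:
    heat[y // patch_size, x // patch_size] = p

plt.imshow(heat, cmap='jet', vmin=0.0, vmax=1.0)   # NaN cells stay blank
plt.colorbar(label='probability')
plt.savefig('heatmap.png')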
We have developed a Python package for processing whole-slide images:
https://github.com/amirakbarnejad/PyDmed
Here is a tutorial for getting heatmaps for whole-slide-images:
https://amirakbarnejad.github.io/Tutorial/tutorial_section5.html.
Also here is a sample notebook that gets heatmaps for WSIs using PyDmed:
Link to the sample notebook.
The benefit of PyDmed is that it is multiprocessed: the dataloader sends a stream of patches to the GPU(s), and the StreamWriter writes them to disk in a separate process, so it is highly efficient. The running time of course depends on the machine, the size of the WSIs, etc. On a good machine with a good GPU, PyDmed can generate heatmaps for ~120 WSIs in one day.

How to list files inside tar in AWS S3 without downloading it?

While looking around for ideas I found https://stackoverflow.com/a/54222447/264822 for zip files, which I think is a very clever solution. But it relies on zip files having a Central Directory, which tar files don't have.
I thought I could follow the same general principle and expose the S3 file to tarfile through the fileobj parameter:
import boto3
import io
import tarfile

class S3File(io.BytesIO):
    def __init__(self, bucket_name, key_name, s3client):
        super().__init__()
        self.bucket_name = bucket_name
        self.key_name = key_name
        self.s3client = s3client
        self.offset = 0

    def close(self):
        return

    def read(self, size):
        print('read: offset = {}, size = {}'.format(self.offset, size))
        start = self.offset
        end = self.offset + size - 1
        try:
            # ranged GET: fetch only the bytes tarfile asks for
            s3_object = self.s3client.get_object(Bucket=self.bucket_name, Key=self.key_name,
                                                 Range="bytes=%d-%d" % (start, end))
        except Exception:
            return bytearray()
        self.offset = self.offset + size
        result = s3_object['Body'].read()
        return result

    def seek(self, offset, whence=0):
        if whence == 0:
            print('seek: offset {} -> {}'.format(self.offset, offset))
            self.offset = offset
        return self.offset

    def tell(self):
        return self.offset

s3file = S3File(bucket_name, file_name, s3client)
tarf = tarfile.open(fileobj=s3file)
names = tarf.getnames()
for name in names:
    print(name)
This works fine except the output looks like:
read: offset = 0, size = 2
read: offset = 2, size = 8
read: offset = 10, size = 8192
read: offset = 8202, size = 1235
read: offset = 9437, size = 1563
read: offset = 11000, size = 3286
read: offset = 14286, size = 519
read: offset = 14805, size = 625
read: offset = 15430, size = 1128
read: offset = 16558, size = 519
read: offset = 17077, size = 573
read: offset = 17650, size = 620
(continued)
tarfile is just reading the whole file anyway, so I haven't gained anything. Is there any way of making tarfile read only the parts of the file it needs? The only alternative I can think of is re-implementing the tar file parsing (see the sketch after this list) so that it:
Reads the 512 bytes header and writes this into a BytesIO buffer.
Gets the size of the file following and writes zeroes into the BytesIO buffer.
Skips over the file to the next header.
But this seems overly complicated.
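For a plain (uncompressed) tar that header walk is actually quite small. A sketch under those assumptions (standard tar layout: 512-byte headers, octal size field at offset 124, member data padded to 512-byte blocks; it ignores pax/GNU long-name records, which would show up as extra entries), where ranged_read stands in for any ranged reader such as S3File.read above:

def list_tar_names(ranged_read):
    # ranged_read(offset, size) -> bytes, e.g. a thin wrapper over an S3 ranged GET
    names = []
    offset = 0
    while True:
        header = ranged_read(offset, 512)
        if len(header) < 512 or header == b'\x00' * 512:        # end-of-archive marker
            break
        name = header[0:100].rstrip(b'\x00').decode()
        size = int(header[124:136].strip(b'\x00 ') or b'0', 8)  # size field, octal
        names.append(name)
        data_blocks = (size + 511) // 512   # member data is padded to 512-byte blocks
        offset += 512 + data_blocks * 512   # jump to the next header
    return names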
My mistake. I'm actually dealing with tar.gz files, but I assumed that zip and tar.gz are similar. They're not: a tar is an archive file which is then compressed with gzip, so to read the tar you have to decompress it first. My idea of pulling bits out of the tar file won't work.
What does work is:
s3_object = s3client.get_object(Bucket=bucket_name, Key=file_name)
wholefile = s3_object['Body'].read()
fileobj = io.BytesIO(wholefile)
tarf = tarfile.open(fileobj=fileobj)
names = tarf.getnames()
for name in names:
    print(name)
I suspect the original code will work for a plain tar file, but I don't have one to try it on.
I just tested your original code on a tar file and it works quite well.
Here is my sample output (truncated). I made some minor changes to display the total downloaded bytes and the seek step size in kB (published at this gist). This is for a 1 GB tar file containing 321 files (average size per file is 3 MB):
read: offset = 0, size = 2, total download = 2
seek: offset 2 -> 0 (diff = -1 kB)
read: offset = 0, size = 8192, total download = 8194
seek: offset 8192 -> 0 (diff = -9 kB)
read: offset = 0, size = 8192, total download = 16386
seek: offset 8192 -> 0 (diff = -9 kB)
read: offset = 0, size = 512, total download = 16898
<TarInfo 'yt.txt' at 0x7fbbed639ef0>
seek: offset 512 -> 7167 (diff = 6 kB)
read: offset = 7167, size = 1, total download = 16899
read: offset = 7168, size = 512, total download = 17411
<TarInfo 'yt_cache/youtube-sigfuncs' at 0x7fbbed639e20>
read: offset = 7680, size = 512, total download = 17923
...
<TarInfo 'yt_vids/whistle_dolphins-SZTC_zT9ijg.m4a' at 0x7fbbecc697a0>
seek: offset 1004473856 -> 1005401599 (diff = 927 kB)
read: offset = 1005401599, size = 1, total download = 211778
read: offset = 1005401600, size = 512, total download = 212290
None
322
So this downloads 212 kB for a 1 GB tar file in order to get a list of its 321 filenames, in about 2 minutes on Colab and 1.5 minutes on EC2 in the same region as the bucket.
In comparison, it takes 17 seconds to download the full file on Colab and 1 second to list the files in it with tar -tf file.tar. So if I'm optimizing for execution time, I'd rather just download the full file and work on it locally. Otherwise, there might be some optimization that could be done in your original code? IDK.
OTOH, fetching a single file is more efficient than the 2 minutes above if it sits at the beginning of the tar, but as slow as listing all the file names if it sits at the end. I couldn't do that with the getmember() function, though, because it seems to internally call getmembers(), which has to go through the full file. Instead, I rolled my own while loop to find the file and abort the search once it's found:
bucket_name, file_name = "bucket", "file.tar"
import boto3
s3client = boto3.client("s3")
s3file = S3File(bucket_name, file_name, s3client)
import tarfile
with tarfile.open(mode="r", fileobj=s3file) as tarf:
    tarinfo = tarf.next()
    while tarinfo is not None:  # next() returns None at the end of the archive
        if tarinfo.name == name_search:
            break
        tarinfo = tarf.next()
I think a future direction for this would be to have tarfile.open(...) cache the offsets of each file, so that a subsequent open doesn't go through the full file again. Once that's done, a first pass through the tar file would allow downloading individual files from the tar in S3 without scanning through the full file again for each file.
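tarfile in fact already exposes enough to hand-roll that cache: each TarInfo records where its data starts (offset_data, an internal but long-stable attribute) and how long it is. A sketch of the idea, reusing S3File and the bucket/key names above:

# one pass: record where each member's data starts and how long it is
offsets = {}
with tarfile.open(mode="r", fileobj=S3File(bucket_name, file_name, s3client)) as tarf:
    for member in tarf:
        offsets[member.name] = (member.offset_data, member.size)

# later: pull a single member out of S3 with one ranged GET, no rescanning
def fetch_member(name):
    start, size = offsets[name]
    resp = s3client.get_object(Bucket=bucket_name, Key=file_name,
                               Range="bytes=%d-%d" % (start, start + size - 1))
    return resp['Body'].read()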
Side note, couldn't you have just run gunzip on the tar.gz to get the tar to test on?
