How can I use BytesIO as a pandas.read_csv data source

How can I use BytesIO as a pandas.read_csv data source - python-3.x

I am trying to perform a csv data parsing using pandas.read_csv(bytes, chunksize=n) where bytes is a ongoing stream of data which I want to receive from a database CLOB field, reading it by chunks.
reader = pandas.read_csv(io.BytesIO(b'1;qwer\n2;asdf\n3;zxcv'), sep=';', chunksize=2)
for row_chunk in reader:
print(row_chunk)
Code above is working fine, but I want to use some updatable stream instead of fixed io.BytesIO(b'...')
I tried to redefine read method like this
class BlobIO(io.BytesIO):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._chunk_size = 4
self._file_data_table = 'my_table'
self._job_id = 'job_id'
self._get_raw_sql = """
select dbms_lob.substr(body, {0}, {1})
from {2}
where job_id = '{3}'
"""
dsn_tns = cx_Oracle.makedsn('host', 'port', 'service_name')
self.ora_con = cx_Oracle.connect('ora_user', 'ora_pass', dsn_tns)
self.res = b''
self.ora_cur = self.ora_con.cursor()
self.chunker = self.get_chunk()
next(self.chunker)
def get_chunk(self):
returned = 0
sended = (yield)
self._chunk_size = sended or self._chunk_size
while True:
to_exec = self._get_raw_sql.format(
self._chunk_size,
returned + 1,
self._file_data_table,
self._job_id)
self.ora_cur.execute(to_exec)
self.res = self.ora_cur.fetchall()[0][0]
returned += self._chunk_size
yield self.res
sended = (yield self.res)
self._chunk_size = sended or self._chunk_size
if not self.res:
break
def read(self, nbytes=None):
if nbytes:
self.chunker.send(nbytes)
else:
self.chunker.send(self._chunk_size)
try:
to_return = next(self.chunker)
except StopIteration:
self.ora_con.close()
to_return = b''
return to_return
buffer = BlobIO()
reader = pandas.read_csv(buffer, encoding='cp1251', sep=';', chunksize=2)
but it looks like I'm doing something completely wrong because pd.read_csv never got executed here at the last line and I don't understand what is happening there.
Maybe creating buffer = BytesIO(b'') and then writing new data to the buffer buffer.write(new_chunk_from_db) could be a better approach but I don't understand when exactly should I call such a write action.
I believe I can create a temporary file with the contents of a CLOB which I can then pass to read_csv, but I really want to skip this step and read data directly from database.
Please give me some directions.

cx_Oracle provides native way to read LOBs. Seems like overriding BytesIO read with cx_Oracle LOB read does the job:
class BlobIO(BytesIO):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.res = b''
self.ora_con = db.get_conn()
self.ora_cur = self.ora_con.cursor()
self.ora_cur.execute("select lob from table")
self.res = self.ora_cur.fetchall()[0][0]
self.offset = 1
def read(self, size=None):
r = self.res.read(self.offset, size)
self.offset += size
# size + 1 should be here to perform nonoverlaping reads
# but looks like panadas C parser uses some kind of overlaping
# because while testing size+1 - parser occasionally missed some bytes
if not r:
self.ora_cur.close()
self.ora_con.close()
return r
blob_buffer = BlobIO()
reader = pandas.read_csv(
blob_buffer,
chunksize=JobContext.rchunk_size)
for row_chunk in reader:
print(row_chunk)

Related

Returning a generator in a with statement

I wanted to create a wrapper function over pandas.read_csv to change the default separator and format the file a specific way. This is the code I had :
def custom_read(path, sep="|", **kwargs):
if not kwargs.get("chunksize", False):
df_ = pd.read_csv(path, sep=sep, **kwargs)
return format_df(df_, path)
else:
with pd.read_csv(path, sep=sep, **kwargs) as reader:
return (format_df(chunk, path) for chunk in reader)
It turns out that this segfaults when used like so :
L = [chunk.iloc[:10, :] for chunk in custom_read(my_file)]
From what I understood off the backtrace, the generator is created, then the file is closed and the segfault happens when the generator tries to read from the now closed file.
I could avoid the segfault with a minor refactoring :
def custom_read(path, sep="|", **kwargs):
if not kwargs.get("chunksize", False):
df_ = pd.read_csv(path, sep=sep, **kwargs)
return format_df(df_, path)
else:
reader = pd.read_csv(path, sep=sep, **kwargs)
return (format_df(chunk, path) for chunk in reader)
I couldn't find anything on the particular usecase of generators in with clauses, is it something to avoid ? Is this supposed not to work or is this a bug of some kind ?
Is there a way to avoid this error but still use the encouraged with statement ?

You could use a generator which keeps the file open. See the following example:
import os
def lines_format(lines):
return "\n".join(f"*{line.strip()}*" for line in lines)
def chunk_gen(file, chunksize):
with open(file, mode='r') as f:
while True:
lines = f.readlines(chunksize)
if not lines:
break
yield lines_format(lines)
def get_formatted_pages(file, chunksize=0):
if chunksize > 0:
return chunk_gen(file, chunksize)
else:
with open(file, mode='r') as f:
lines = f.readlines()
return [lines_format(lines)]
with open("abc.txt", mode='w') as f:
f.write(os.linesep.join('abc'))
pages = get_formatted_pages("abc.txt")
for i, page in enumerate(pages, start=1):
print(f"Page {i}")
print(page)
pages = get_formatted_pages("abc.txt", chunksize=2)
for i, page in enumerate(pages, start=1):
print(f"Page {i}")
print(page)
Edit:
In your pandas.read_csv use case, this would look like
import pandas as pd
df = pd.DataFrame({'char': list('abc'), "num": range(3)})
df.to_csv('abc.csv')
def gen_chunk(file, chunksize):
with pd.read_csv(file, chunksize=chunksize, index_col=0) as reader:
for chunk in reader:
yield format_df(chunk)
def format_df(df):
# do something
df['char'] = df['char'].str.capitalize()
return df
def get_formatted_pages(file, chunksize=0):
if chunksize > 0:
return gen_chunk(file, chunksize)
else:
return [format_df(pd.read_csv(file, index_col=0))]
list(get_formatted_pages('abc.csv', chunksize=2))

Django - Batch Actions in a ListView - Select Rows and Return a Zip Archive in Response

Ok, first off, I know this is not the best way to handle serving files on a prod server.
But this site will be accessed by a small group of users, the request for these files
should be minimal, and the static files being served are very small - so I am trying to keep things simple.
I have a ListView Table where the user can select rows and then perform batch actions on the objects (eg. Delete, Export CSVs, Send CSV data to another location, etc). I've created a class based view to handle each of these batch actions. The only action that I can't get to work is when the user requests a download of a ZIP archive of the CSV data.
When the ZIP export action is selected from the ListView, the selected rows are written as individual CSV files into to a static file location, and then zipped into a single ZIP archive. But the FileResponse never returns the archive to the user as a download.
I've tried different ways of handling the response - redirecting to a new URL,
reverse to the ListView, different URL patterns, etc.
the method shown below that handles the create and zip of the CSV is called: batch_csv_zip
Can someone please point out where I'm going wrong. thanks.
class BatchActionView(LoginRequiredMixin, FormMixin, SuccessMessageMixin, TemplateView):
"""
Takes a list of pks to perform a batch action
"""
model = SlateDoc
template_name = 'slatedoc_batch_action.html'
permission_required = ('slatedoc.can_run_batch_actions')
permission_denied_message = "Permission Denied"
def dispatch(self, request, *args, **kwargs):
if not request.user.has_perm(permission_required):
messages.error(self.request, self.permission_denied_message)
return HttpResponseRedirect(request.META.get('HTTP_REFERER'))
else:
handler = getattr(self, request.method.lower(), self.http_method_not_allowed)
return handler(request, *args, **kwargs)
def post(self, request, *args, **kwargs):
if request.method == 'POST':
input = request.POST.get('inputString')
input_dict = eval(input)
modalID = input_dict["modalID"]
pkList = input_dict["pks"].split(",")
self.batch_actions(request, modalID, pkList)
return HttpResponseRedirect(self.get_success_url())
def get_success_url(self):
input = self.request.POST.get('inputString')
input_dict = eval(input)
modalID = input_dict["modalID"]
pks = input_dict["pks"]
if modalID == "csvModalSubmit":
return reverse_lazy('slatedoc-list')
# return reverse('batch-actions')
# return reverse('slatedoc-download', kwargs={'pks': pks})
# return reverse('slatedoc-download', kwargs={'file': f"{zipfilename}.zip"})
else:
return reverse_lazy('slatedoc-list')
def batch_actions(self, request, modalID, pkList):
"""
Check the modalID data from POST to trigger a Batch Action
"""
if modalID == "deleteModalSubmit":
self.batch_delete(pkList)
self.success_message = 'SlateDocs successfully deleted!'
if modalID == "csvModalSubmit":
self.batch_csv_zip(pkList)
self.success_message = 'SlateDocs successfully exported as a ZIP!'
if modalID == "vantageModalSubmit":
self.batch_csv_to_vantage(request, pkList)
self.success_message = 'SlateDocs successfully sent to Vantage!'
messages.success(self.request, self.success_message)
# return HttpResponseRedirect(self.get_success_url())
def batch_delete(self, pkList):
"""
Perform a soft delete for each pk in the pkList
"""
for pk in pkList:
slatedoc = SlateDoc.objects.get(id=pk)
self.object = slatedoc
slatedoc.soft_delete()
return
def batch_csv_zip(self, pkList):
"""
Generate CSV for each PK and then return a ZIP archive of the data
"""
t = time.time()
datetime = time.strftime('%Y%m%d%H%M', time.localtime(t))
csv_filepath = f"ngceng/static/docs/csv"
zippath = f"ngceng/static/docs/zip"
zipfilename = f"{datetime}_slatedoc_csv"
os.mkdir(os.path.join(csv_filepath,datetime))
for pk in pkList:
item = SlateDoc.objects.get(pk=pk)
slatedoc_resource = SlateDocResource()
queryset = SlateDoc.objects.filter(pk=pk)
dataset = slatedoc_resource.export(queryset)
filename = item.filename
filename_list = re.split('(\-+)|(\_+)', filename)
filepath = f"ngceng/static/docs/csv/{datetime}/{filename_list[0]}.csv"
with open(filepath, 'w', newline='') as f:
writer = csv.writer(f, delimiter=',')
writer.writerow(fieldnames_export)
writer.writerows(dataset)
f.close
# csvfilelist = [f for f in os.listdir(os.path.dirname(filepath)) if f.endswith(".csv")]
csv_f = os.path.abspath(os.path.join(csv_filepath, datetime))
zip_f = os.path.abspath(os.path.join(zippath, f"{zipfilename}.zip"))
os.chdir(zippath)
shutil.make_archive(zipfilename, 'zip', csv_f)
# os.remove(csv_f)
response = FileResponse(open(zip_f, 'rb').read())
response['content_type'] = 'application/zip'
response['Content-Disposition'] = f"attachment; filename = {zipfilename}.zip"
response['input'] = input
response['zipfilename'] = zipfilename
return response
# return HttpResponseRedirect(reverse('slatedoc-download', kwargs={'pks': ",".join(pkList)}))
# return HttpResponseRedirect(reverse('slatedoc-download', kwargs={'file': f"{zipfilename}.zip"}))
def batch_csv_to_vantage(self, request, pkList):
"""
Generate a CSV file for each pk in the pk List, send these CSVs to Vantage.
"""
for pk in pkList:
export_csv_to_vantage(request, pk)
return HttpResponseRedirect(self.get_success_url())

Wrap an io.BufferedIOBase such that it becomes seek-able

I was trying to craft a response to a question about streaming audio from a HTTP server, then play it with PyGame. I had the code mostly complete, but hit an error where the PyGame music functions tried to seek() on the urllib.HTTPResponse object.
According to the urlib docs, the urllib.HTTPResponse object (since v3.5) is an io.BufferedIOBase. I expected this would make the stream seek()able, however it does not.
Is there a way to wrap the io.BufferedIOBase such that it is smart enough to buffer enough data to handle the seek operation?
import pygame
import urllib.request
import io
# Window size
WINDOW_WIDTH = 400
WINDOW_HEIGHT = 400
# background colour
SKY_BLUE = (161, 255, 254)
### Begin the streaming of a file
### Return the urlib.HTTPResponse, a file-like-object
def openURL( url ):
result = None
try:
http_response = urllib.request.urlopen( url )
print( "streamHTTP() - Fetching URL [%s]" % ( http_response.geturl() ) )
print( "streamHTTP() - Response Status [%d] / [%s]" % ( http_response.status, http_response.reason ) )
result = http_response
except:
print( "streamHTTP() - Error Fetching URL [%s]" % ( url ) )
return result
### MAIN
pygame.init()
window = pygame.display.set_mode( ( WINDOW_WIDTH, WINDOW_HEIGHT ) )
pygame.display.set_caption("Music Streamer")
clock = pygame.time.Clock()
done = False
while not done:
# Handle user-input
for event in pygame.event.get():
if ( event.type == pygame.QUIT ):
done = True
# Keys
keys = pygame.key.get_pressed()
if ( keys[pygame.K_UP] ):
if ( pygame.mixer.music.get_busy() ):
print("busy")
else:
print("play")
remote_music = openURL( 'http://127.0.0.1/example.wav' )
if ( remote_music != None and remote_music.status == 200 ):
pygame.mixer.music.load( io.BufferedReader( remote_music ) )
pygame.mixer.music.play()
# Re-draw the screen
window.fill( SKY_BLUE )
# Update the window, but not more than 60fps
pygame.display.flip()
clock.tick_busy_loop( 60 )
pygame.quit()
When this code runs, and Up is pushed, it fails with the error:
streamHTTP() - Fetching URL [http://127.0.0.1/example.wav]
streamHTTP() - Response Status [200] / [OK]
io.UnsupportedOperation: seek
io.UnsupportedOperation: File or stream is not seekable.
io.UnsupportedOperation: seek
io.UnsupportedOperation: File or stream is not seekable.
Traceback (most recent call last):
File "./sound_stream.py", line 57, in <module>
pygame.mixer.music.load( io.BufferedReader( remote_music ) )
pygame.error: Unknown WAVE format
I also tried re-opening the the io stream, and various other re-implementations of the same sort of thing.

Seeking seeking
According to the urlib docs, the urllib.HTTPResponse object (since v3.5) is an io.BufferedIOBase. I expected this would make the stream seek()able, however it does not.
That's correct. The io.BufferedIOBase interface doesn't guarantee the I/O object is seekable. For HTTPResponse objects, IOBase.seekable() returns False:
>>> import urllib.request
>>> response = urllib.request.urlopen("http://httpbin.org/get")
>>> response
<http.client.HTTPResponse object at 0x110870ca0>
>>> response.seekable()
False
That's because the BufferedIOBase implementation offered by HTTPResponse is wrapping a socket object, and sockets are not seekable either.
You can't wrap an BufferedIOBase object in a BufferedReader object and add seeking support. The Buffered* wrapper objects can only wrap RawIOBase types, and they rely on the wrapped object to provide seeking support. You would have to emulate seeking at raw I/O level, see below.
You can still provide the same functionality at a higher level, but take into account that seeking on remote data is a lot more involved; this isn't a simple change a simple OS variable that represents a file position on disk operation. For larger remote file data, seeking without backing the whole file on disk locally could be as sophisticated as using HTTP range requests and local (in memory or on-disk) buffers to balance sound play-back performance and minimising local data storage. Doing this correctly for a wide range of use-cases can be a lot of effort, so is certainly not part of the Python standard library.
If your sound files are small
If your HTTP-sourced sound files are small enough (a few MB at most) then just read the whole response into an in-memory io.BytesIO() file object. I really do not think it is worth making this more complicated than that, because the moment you have enough data to make that worth pursuing your files are large enough to take up too much memory!
So this would be more than enough if your sound files are smaller (no more than a few MB):
from io import BytesIO
import urllib.error
import urllib.request
def open_url(url):
try:
http_response = urllib.request.urlopen(url)
print(f"streamHTTP() - Fetching URL [{http_response.geturl()}]")
print(f"streamHTTP() - Response Status [{http_response.status}] / [{http_response.reason}]")
except urllib.error.URLError:
print("streamHTTP() - Error Fetching URL [{url}]")
return
if http_response.status != 200:
print("streamHTTP() - Error Fetching URL [{url}]")
return
return BytesIO(http_response.read())
This doesn't require writing a wrapper object, and because BytesIO is a native implementation, once the data is fully copied over, access to the data is faster than any Python-code wrapper could ever give you.
Note that this returns a BytesIO file object, so you no longer need to test for the response status:
remote_music = open_url('http://127.0.0.1/example.wav')
if remote_music is not None:
pygame.mixer.music.load(remote_music)
pygame.mixer.music.play()
If they are more than a few MB
Once you go beyond a few megabytes, you could try pre-loading the data into a local file object. You can make this more sophisticated by using a thread to have shutil.copyfileobj() copy most of the data into that file in the background and give the file to PyGame after loading just an initial amount of data.
By using an actual file object, you can actually help performance here, as PyGame will try to minimize interjecting itself between the SDL mixer and the file data. If there is an actual file on disk with a file number (the OS-level identifier for a stream, something that the SDL mixer library can make use of), then PyGame will operate directly on that and so minimize blocking the GIL (which in turn will help the Python portions of your game perform better!). And if you pass in a filename (just a string), then PyGame gets out of the way entirely and leaves all file operations over to the SDL library.
Here's such an implementation; this should, on normal Python interpreter exit, clean up the downloaded files automatically. It returns a filename for PyGame to work on, and finalizing downloading the data is done in a thread after the initial few KB has been buffered. It will avoid loading the same URL more than once, and I've made it thread-safe:
import shutil
import urllib.error
import urllib.request
from tempfile import NamedTemporaryFile
from threading import Lock, Thread
INITIAL_BUFFER = 1024 * 8 # 8kb initial file read to start URL-backed files
_url_files_lock = Lock()
# stores open NamedTemporaryFile objects, keeping them 'alive'
# removing entries from here causes the file data to be deleted.
_url_files = {}
def open_url(url):
with _url_files_lock:
if url in _url_files:
return _url_files[url].name
try:
http_response = urllib.request.urlopen(url)
print(f"streamHTTP() - Fetching URL [{http_response.geturl()}]")
print(f"streamHTTP() - Response Status [{http_response.status}] / [{http_response.reason}]")
except urllib.error.URLError:
print("streamHTTP() - Error Fetching URL [{url}]")
return
if http_response.status != 200:
print("streamHTTP() - Error Fetching URL [{url}]")
return
fileobj = NamedTemporaryFile()
content_length = http_response.getheader("Content-Length")
if content_length is not None:
try:
content_length = int(content_length)
except ValueError:
content_length = None
if content_length:
# create sparse file of full length
fileobj.seek(content_length - 1)
fileobj.write(b"\0")
fileobj.seek(0)
fileobj.write(http_response.read(INITIAL_BUFFER))
with _url_files_lock:
if url in _url_files:
# another thread raced us to this point, we lost, return their
# result after cleaning up here
fileobj.close()
http_response.close()
return _url_files[url].name
# store the file object for this URL; this keeps the file
# open and so readable if you have the filename.
_url_files[url] = fileobj
def copy_response_remainder():
# copies file data from response to disk, for all data past INITIAL_BUFFER
with http_response:
shutil.copyfileobj(http_response, fileobj)
t = Thread(daemon=True, target=copy_response_remainder)
t.start()
return fileobj.name
Like the BytesIO() solution, the above returns either None or a value ready for passing to pass to pygame.mixer.music.load().
The above will probably not work if you try to immediately set an advanced playing position in your sound files, as later data may not yet have been copied into the file. It's a trade-off.
Seeking and finding third party libraries
If you need to have full seeking support on remote URLs and don't want to use on-disk space for them and don't want to have to worry about their size, you don't need to re-invent the HTTP-as-seekable-file wheel here. You could use an existing project that offers the same functionality. I found two that offer io.BufferedIOBase-based implementations:
smart_open
httpio
Both use HTTP Range requests to implement seeking support. Just use httpio.open(URL) or smart_open.open(URL) and pass that directly to pygame.mixer.music.load(); if the URL can't be opened, you can catch that by handling the IOError exception:
from smart_open import open as url_open # or from httpio import open
try:
remote_music = url_open('http://127.0.0.1/example.wav')
except IOError:
pass
else:
pygame.mixer.music.load(remote_music)
pygame.mixer.music.play()
smart_open uses an in-memory buffer to satisfy reads of a fixed size, but creates a new HTTP Range request for every call to seek that changes the current file position, so performance may vary. Since the SDL mixer executes a few seeks on audio files to determine their type, I expect this to be a little slower.
httpio can buffer blocks of data and so might handle seeks better, but from a brief glance at the source code, when actually setting a buffer size the cached blocks are never evicted from memory again so you'd end up with the whole file in memory, eventually.
Implementing seeking ourselves, via io.RawIOBase
And finally, because I'm not able to find efficient HTTP-Range-backed I/O implementations, I wrote my own. The following implements the io.RawIOBase interface, specifically so you can then wrap the object in a io.BufferedIOReader() and so delegate caching to a caching buffer that will be managed correctly when seeking:
import io
from copy import deepcopy
from functools import wraps
from typing import cast, overload, Callable, Optional, Tuple, TypeVar, Union
from urllib.request import urlopen, Request
T = TypeVar("T")
#overload
def _check_closed(_f: T) -> T: ...
#overload
def _check_closed(*, connect: bool, default: Union[bytes, int]) -> Callable[[T], T]: ...
def _check_closed(
_f: Optional[T] = None,
*,
connect: bool = False,
default: Optional[Union[bytes, int]] = None,
) -> Union[T, Callable[[T], T]]:
def decorator(f: T) -> T:
#wraps(cast(Callable, f))
def wrapper(self, *args, **kwargs):
if self.closed:
raise ValueError("I/O operation on closed file.")
if connect and self._fp is None or self._fp.closed:
self._connect()
if self._fp is None:
# outside the seekable range, exit early
return default
try:
return f(self, *args, **kwargs)
except Exception:
self.close()
raise
finally:
if self._range_end and self._pos >= self._range_end:
self._fp.close()
del self._fp
return cast(T, wrapper)
if _f is not None:
return decorator(_f)
return decorator
def _parse_content_range(
content_range: str
) -> Tuple[Optional[int], Optional[int], Optional[int]]:
"""Parse a Content-Range header into a (start, end, length) tuple"""
units, *range_spec = content_range.split(None, 1)
if units != "bytes" or not range_spec:
return (None, None, None)
start_end, _, size = range_spec[0].partition("/")
try:
length: Optional[int] = int(size)
except ValueError:
length = None
start_val, has_start_end, end_val = start_end.partition("-")
start = end = None
if has_start_end:
try:
start, end = int(start_val), int(end_val)
except ValueError:
pass
return (start, end, length)
class HTTPRawIO(io.RawIOBase):
"""Wrap a HTTP socket to handle seeking via HTTP Range"""
url: str
closed: bool = False
_pos: int = 0
_size: Optional[int] = None
_range_end: Optional[int] = None
_fp: Optional[io.RawIOBase] = None
def __init__(self, url_or_request: Union[Request, str]) -> None:
if isinstance(url_or_request, str):
self._request = Request(url_or_request)
else:
# copy request objects to avoid sharing state
self._request = deepcopy(url_or_request)
self.url = self._request.full_url
self._connect(initial=True)
def readable(self) -> bool:
return True
def seekable(self) -> bool:
return True
def close(self) -> None:
if self.closed:
return
if self._fp:
self._fp.close()
del self._fp
self.closed = True
#_check_closed
def tell(self) -> int:
return self._pos
def _connect(self, initial: bool = False) -> None:
if self._fp is not None:
self._fp.close()
if self._size is not None and self._pos >= self._size:
# can't read past the end
return
request = self._request
request.add_unredirected_header("Range", f"bytes={self._pos}-")
response = urlopen(request)
self.url = response.geturl() # could have been redirected
if response.status not in (200, 206):
raise OSError(
f"Failed to open {self.url}: "
f"{response.status} ({response.reason})"
)
if initial:
# verify that the server supports range requests. Capture the
# content length if available
if response.getheader("Accept-Ranges") != "bytes":
raise OSError(
f"Resource doesn't support range requests: {self.url}"
)
try:
length = int(response.getheader("Content-Length", ""))
if length >= 0:
self._size = length
except ValueError:
pass
# validate the range we are being served
start, end, length = _parse_content_range(
response.getheader("Content-Range", "")
)
if self._size is None:
self._size = length
if (start is not None and start != self._pos) or (
length is not None and length != self._size
):
# non-sensical range response
raise OSError(
f"Resource at {self.url} served invalid range: pos is "
f"{self._pos}, range {start}-{end}/{length}"
)
if self._size and end is not None and end + 1 < self._size:
# incomplete range, not reaching all the way to the end
self._range_end = end
else:
self._range_end = None
fp = cast(io.BufferedIOBase, response.fp) # typeshed doesn't name fp
self._fp = fp.detach() # assume responsibility for the raw socket IO
#_check_closed
def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
relative_to = {
io.SEEK_SET: 0,
io.SEEK_CUR: self._pos,
io.SEEK_END: self._size,
}.get(whence)
if relative_to is None:
if whence == io.SEEK_END:
raise IOError(
f"Can't seek from end on unsized resource {self.url}"
)
raise ValueError(f"whence value {whence} unsupported")
if -offset > relative_to: # can't seek to a point before the start
raise OSError(22, "Invalid argument")
self._pos = relative_to + offset
# there is no point in optimising an existing connection
# by reading from it if seeking forward below some threshold.
# Use a BufferedIOReader to avoid seeking by small amounts or by 0
if self._fp:
self._fp.close()
del self._fp
return self._pos
# all read* methods delegate to the SocketIO object (itself a RawIO
# implementation).
#_check_closed(connect=True, default=b"")
def read(self, size: int = -1) -> Optional[bytes]:
assert self._fp is not None # show type checkers we already checked
res = self._fp.read(size)
if res is not None:
self._pos += len(res)
return res
#_check_closed(connect=True, default=b"")
def readall(self) -> bytes:
assert self._fp is not None # show type checkers we already checked
res = self._fp.readall()
self._pos += len(res)
return res
#_check_closed(connect=True, default=0)
def readinto(self, buffer: bytearray) -> Optional[int]:
assert self._fp is not None # show type checkers we already checked
n = self._fp.readinto(buffer)
self._pos += n or 0
return n
Remember that this is a RawIOBase object, which you really want to wrap in a BufferReader(). Doing so in open_url() looks like this:
def open_url(url, *args, **kwargs):
return io.BufferedReader(HTTPRawIO(url), *args, **kwargs)
This gives you fully buffered I/O, with full support seeking, over a remote URL, and the BufferedReader implementation will minimise resetting the HTTP connection when seeking. I've found that using this with the PyGame mixer, only single HTTP connection is made, as all the test seeks are within the default 8KB buffer.

If your fine with using the requests module (which supports streaming) instead of urllib, you could use a wrapper like this:
class ResponseStream(object):
def __init__(self, request_iterator):
self._bytes = BytesIO()
self._iterator = request_iterator
def _load_all(self):
self._bytes.seek(0, SEEK_END)
for chunk in self._iterator:
self._bytes.write(chunk)
def _load_until(self, goal_position):
current_position = self._bytes.seek(0, SEEK_END)
while current_position < goal_position:
try:
current_position = self._bytes.write(next(self._iterator))
except StopIteration:
break
def tell(self):
return self._bytes.tell()
def read(self, size=None):
left_off_at = self._bytes.tell()
if size is None:
self._load_all()
else:
goal_position = left_off_at + size
self._load_until(goal_position)
self._bytes.seek(left_off_at)
return self._bytes.read(size)
def seek(self, position, whence=SEEK_SET):
if whence == SEEK_END:
self._load_all()
else:
self._bytes.seek(position, whence)
Then I guess you can do something like this:
WINDOW_WIDTH = 400
WINDOW_HEIGHT = 400
SKY_BLUE = (161, 255, 254)
URL = 'http://localhost:8000/example.wav'
pygame.init()
window = pygame.display.set_mode( ( WINDOW_WIDTH, WINDOW_HEIGHT ) )
pygame.display.set_caption("Music Streamer")
clock = pygame.time.Clock()
done = False
font = pygame.font.SysFont(None, 32)
state = 0
def play_music():
response = requests.get(URL, stream=True)
if (response.status_code == 200):
stream = ResponseStream(response.iter_content(64))
pygame.mixer.music.load(stream)
pygame.mixer.music.play()
else:
state = 0
while not done:
for event in pygame.event.get():
if ( event.type == pygame.QUIT ):
done = True
if event.type == pygame.KEYDOWN and state == 0:
Thread(target=play_music).start()
state = 1
window.fill( SKY_BLUE )
window.blit(font.render(str(pygame.time.get_ticks()), True, (0,0,0)), (32, 32))
pygame.display.flip()
clock.tick_busy_loop( 60 )
pygame.quit()
using a Thread to start streaming.
I'm not sure this works 100%, but give it a try.

Create sub stream in python

How can I create a "sub stream" in python. Let's say I have an file opened for reading. I want to return a file-like object that you can use to read only part of that file.
with open(filename, 'rb') as f:
start = 0x34
size = 0x20
return Substream(f, start, size) # <-- How do I do this?
Seeking to 0 on this object should go to "start" on the f object. Further more reading past size should trigger eof behavior. Hope this makes sense. How do I accomplish this?

A quick subclass of io.RawIOBase seems to do the trick, at least for my use case. I understand this is not a full implementation of the io.RawIOBase interface, but it gets the job done.
class Substream(io.RawIOBase):
"""Represents a view of a subset of a file like object"""
def __init__(self, file: io.RawIOBase, start, size):
self.file = file
self.start = start
self.size = size
self.p = 0
def seek(self, offset, origin=0):
if origin == 0:
self.p = offset
elif origin == 1:
self.p += offset
# TODO: origin == 2
else:
raise ValueError("Unexpected origin: {}".format(origin))
def read(self, n):
prev = self.file.tell()
self.file.seek(self.start + self.p)
data = self.file.read(n if self.p + n <= self.size else self.size - self.p)
self.p += len(data)
self.file.seek(prev)
return data
Use it like so
with open(filename) as f:
print(Substream(f, 10, 100).read(10))
I wonder if this can be done on file descriptor level instead somehow...?

How could I create my own custom .zip format in python34?

I want to make my own zip file similar to this one but in python 3.4

If you are interested in creating your own ZIP file, you might be interested in checking out the following two files. The first provides a GUI interface for compressing and decompressing directories, and the second has classes that use a custom serialization format. Use this idea to create your own system.
Archive3.py
#! /usr/bin/env python3
"""Provide an efficient alternative to zipping directories into archives.
This simple application is designed to serialize and deserialize directories
and their files into BZ2 compressed files. No active progress is shown, but
the window reappears once the requested operations have been completed."""
# Import the compression library along with tools to create a GUI on screen.
import bz2
import pathlib
import tkinter.filedialog
import tkinter.messagebox
import tkinter.ttk
# Import the custom serialization module to handle directories.
import daf_stream
# Include supplemental information along with a public API definition.
__author__ = 'Stephen "Zero" Chappell <Noctis.Skytower#gmail.com>'
__date__ = '21 February 2017'
__version__ = 3, 0, 0
__all__ = 'Application', 'find_parent_of_type'
class Application(tkinter.ttk.Frame):
"""Application(master=None, **kw) -> Application instance"""
# Define some of the options used when dealing with directories and files.
DIRECTORY_OPTIONS = dict(
initialdir=pathlib.Path.home(),
mustexist=True
)
FILE_OPTIONS = dict(
defaultextension='.caa',
filetypes=['Archive .caa'],
initialdir=pathlib.Path.home()
)
#classmethod
def main(cls):
"""Create a root window and display an Application instance in it."""
tkinter.NoDefaultRoot()
root = tkinter.Tk()
root.title('Archive 3')
root.resizable(False, False)
frame = cls(root)
frame.grid(sticky=tkinter.NSEW)
root.mainloop()
def __init__(self, master=None, **kw):
"""Initialize the Application's instance attributes."""
super().__init__(master, **kw)
self.label = self.compress_button = self.decompress_button = None
self.create_widgets()
self.configure_widgets()
def create_widgets(self):
"""Build the controls the application will shown on screen."""
self.label = tkinter.ttk.Label(
self,
text='''\
WARNING:
This program is not backward-compatible and
cannot be used with the Archive 2.0 program.''',
justify=tkinter.CENTER
)
self.compress_button = tkinter.ttk.Button(
self,
text='Compress Directory to File',
command=lambda: self.wrapper(self.compress_directory)
)
self.decompress_button = tkinter.ttk.Button(
self,
text='Decompress File to Directory',
command=lambda: self.wrapper(self.decompress_file)
)
def configure_widgets(self):
"""Set up the controls so they show up in their frame."""
options = dict(padx=5, pady=5, sticky=tkinter.NSEW)
self.label.grid(row=0, column=0, **options)
self.compress_button.grid(row=1, column=0, **options)
self.decompress_button.grid(row=2, column=0, **options)
def wrapper(self, method):
"""Handle the root window, execute the method, and show any errors."""
root = find_parent_of_type(self, tkinter.Tk)
root.withdraw()
try:
method()
except Exception as error:
tkinter.messagebox.showerror(
'Exception',
f'{type(error).__name__}: {error}',
master=self
)
root.deiconify()
def compress_directory(self):
"""Pick a source and serialize it to the destination."""
source = tkinter.filedialog.askdirectory(
parent=self,
title='Where is the directory you want to archive?',
**self.DIRECTORY_OPTIONS
)
if source:
destination = tkinter.filedialog.asksaveasfilename(
confirmoverwrite=True,
parent=self,
title='Where should the compressed file be saved?',
**self.FILE_OPTIONS
)
if destination:
with bz2.open(destination, 'wb') as destination:
daf_stream.Serializer(destination).run(source)
def decompress_file(self):
"""Pick a source and deserialize it to the destination."""
source = tkinter.filedialog.askopenfilename(
multiple=False,
parent=self,
title='Where is the file you want to decompress?',
**self.FILE_OPTIONS
)
if source:
destination = tkinter.filedialog.askdirectory(
parent=self,
title='Where should the data archive be loaded?',
**self.DIRECTORY_OPTIONS
)
if destination:
with bz2.open(source, 'rb') as source:
daf_stream.Deserializer(source).run(destination)
def find_parent_of_type(widget, desired_type):
"""Retrieve the control's parent that is of the desired type."""
while True:
widget = widget.master
if widget is None:
raise AttributeError('cannot find parent of desired type')
if isinstance(widget, desired_type):
return widget
if __name__ == '__main__':
Application.main()
daf_stream.py
#! /usr/bin/env python3
"""Provide a simple directory and file serialization protocol.
This module implements two classes that can handle the DFS (Directory &
File Serialization) file format. Both classes can deal with file-like
objects and stream directories and files to and from the file system."""
# Import other modules needed for this module to work properly.
import abc
import collections
import enum
import io
import pathlib
# Include supplemental information along with a public API definition.
__author__ = 'Stephen "Zero" Chappell <Noctis.Skytower#gmail.com>'
__date__ = '9 February 2017'
__version__ = 3, 0, 0
__all__ = 'Serializer', 'Deserializer'
# The organization of the serialized data is fairly simple as shown below.
SERIALIZATION_FORMAT = '''\
Directory
Header
0,aaa,b,c,dd (Bit Mapping)
0 = Directory
a = Pointer Length
b = Name Size Length
c = Content Flag
d = Type Code
00 = Separator
01 = Reserved
10 = Reserved
11 = Genuine
Pointer to Parent
Name Size
Name
---------------------------------
File
Header
1,aaa,b,ccc (Bit Mapping)
1 = File
a = Pointer Length
b = Name Size Length
c = Data Size Length
Pointer to Parent
Name Size
Name
Data Size
Data
'''
#enum.unique
class _RecordType(enum.IntEnum):
"""Enumeration of the different types a record may represent."""
DIRECTORY = 0b0
FILE = 0b1
#enum.unique
class _DirectoryTypeCode(enum.IntEnum):
"""Enumeration of codes directories may specify for their type."""
SEPARATOR = 0b00
RESERVED_A = 0b01
RESERVED_B = 0b10
GENUINE = 0b11
# Define the necessary components used to describe a bit field.
_BitField = collections.namedtuple('_BitField', 'offset, width')
class _Common(abc.ABC):
"""Abstract class for supporting Serializer and Deserializer classes."""
# Define a few static attributes for use in derived classes.
BUFFER_SIZE = 1 << 20
BYTE_WIDTH = 8
BYTE_MASK = (1 << BYTE_WIDTH) - 1
NAME_ENCODING = 'utf_8' # Set to 'mbcs' for Archive 2.0 compatibility.
NULL_BYTE = b'\0'
# Define the bit fields used in header bytes.
RECORD_TYPE = _BitField(7, 1)
POINTER_LENGTH = _BitField(4, 3)
NAME_SIZE_LENGTH = _BitField(3, 1)
CONTENT_FLAG = _BitField(2, 1)
DIRECTORY_TYPE_CODE = _BitField(0, 2)
FILE_DATA_SIZE_LENGTH = _BitField(0, 3)
#abc.abstractmethod
def __init__(self, stream):
"""Initialize the _Common instance's attributes."""
self._stream = stream
self._header = None
#classmethod
def _int_to_bytes(cls, integer):
"""Convert a number into a byte string of variable length."""
if integer:
array = bytearray()
while integer:
array.insert(0, integer & cls.BYTE_MASK)
integer >>= cls.BYTE_WIDTH
return bytes(array)
return cls.NULL_BYTE
#classmethod
def _bytes_to_int(cls, array):
"""Convert a byte string of variable length into a number."""
integer = 0
for byte in array:
integer <<= cls.BYTE_WIDTH
integer |= byte
return integer
#staticmethod
def _write(file, buffer):
"""Write buffer to file until it is completely written."""
while True:
written = file.write(buffer)
if written is None:
raise IOError('nothing could be written to the file')
if written == len(buffer):
break
buffer = buffer[written:]
class Serializer(_Common):
"""Serializer(destination) -> Serializer instance"""
def __init__(self, destination):
"""Initialize the Serializer instance's attributes."""
super().__init__(destination)
self._started = False
self._pointer = None
def run(self, source, keep_zombies=True):
"""Dump the source file or directory contents onto the destination."""
path = pathlib.Path(source).resolve()
zombies = []
if path.is_dir():
self._prime_run()
self._acquire_dir(path, self.NULL_BYTE, keep_zombies, zombies)
elif path.is_file():
self._prime_run()
self._acquire_file(path, self.NULL_BYTE, keep_zombies, zombies)
else:
raise ValueError('source must be a dir or a file')
return zombies
def _prime_run(self):
"""Reset some attributes before a serialization run."""
self._pointer = 0
if self._started:
self._write(self._stream, self.NULL_BYTE)
else:
self._started = True
def _acquire_dir(self, source, parent, keep_zombies, zombies):
"""Serialize a directory."""
try:
paths = tuple(source.iterdir())
except OSError:
zombies.append(source)
if not keep_zombies:
return
paths = ()
self._write_complete_dir_header(source, parent, bool(paths))
if paths:
self._pointer += 1
parent = self._int_to_bytes(self._pointer)
for path in paths:
if path.is_dir():
self._acquire_dir(path, parent, keep_zombies, zombies)
elif path.is_file():
self._acquire_file(path, parent, keep_zombies, zombies)
def _write_complete_dir_header(self, source, parent, content):
"""Record all directory information except its contents."""
name = source.name.encode(self.NAME_ENCODING)
name_size = self._int_to_bytes(len(name))
self._write_dir_header_byte(parent, name_size, content)
self._write(self._stream, parent)
self._write(self._stream, name_size)
self._write(self._stream, name)
def _write_dir_header_byte(self, pointer, name_size, content):
"""Record the directory header byte using the correct format."""
self._header = 0
self._set_bits(_RecordType.DIRECTORY, self.RECORD_TYPE)
self._set_bits(len(pointer) - 1, self.POINTER_LENGTH)
self._set_bits(len(name_size) - 1, self.NAME_SIZE_LENGTH)
self._set_bits(content, self.CONTENT_FLAG)
self._set_bits(_DirectoryTypeCode.GENUINE, self.DIRECTORY_TYPE_CODE)
self._write(self._stream, bytes([self._header]))
def _set_bits(self, integer, bit_field):
"""Help build the header byte while checking certain arguments."""
if not 0 <= integer < 1 << bit_field.width:
raise ValueError('integer does not fit in width numbers of bits')
self._header |= integer << bit_field.offset
def _acquire_file(self, source, parent, keep_zombies, zombies):
"""Serialize a file."""
restore_point = self._stream.tell()
try:
with source.open('rb') as file:
file_length = file.seek(0, io.SEEK_END)
self._write_complete_file_header(source, parent, file_length)
future_data = file.seek(0, io.SEEK_END)
if future_data != file_length:
raise OSError('source changed size after writing header')
file.seek(0, io.SEEK_SET)
while future_data:
buffer = file.read(min(future_data, self.BUFFER_SIZE))
if not buffer:
raise OSError('source file ended with remaining data')
self._write(self._stream, buffer)
future_data -= len(buffer)
if file.seek(0, io.SEEK_END) != file_length:
raise OSError('file changed size during serialization')
except OSError:
self._stream.seek(restore_point, io.SEEK_SET)
self._stream.truncate()
zombies.append(source)
if keep_zombies:
self._write_complete_file_header(source, parent, 0)
def _write_complete_file_header(self, source, parent, file_length):
"""Record all file information except its data."""
name = source.name.encode(self.NAME_ENCODING)
name_size = self._int_to_bytes(len(name))
data_size = self._int_to_bytes(file_length)
self._write_file_header_byte(parent, name_size, data_size)
self._write(self._stream, parent)
self._write(self._stream, name_size)
self._write(self._stream, name)
self._write(self._stream, data_size)
def _write_file_header_byte(self, pointer, name_size, data_size):
"""Record the file header byte using the correct format."""
self._header = 0
self._set_bits(_RecordType.FILE, self.RECORD_TYPE)
self._set_bits(len(pointer) - 1, self.POINTER_LENGTH)
self._set_bits(len(name_size) - 1, self.NAME_SIZE_LENGTH)
self._set_bits(len(data_size) - 1, self.FILE_DATA_SIZE_LENGTH)
self._write(self._stream, bytes([self._header]))
class Deserializer(_Common):
"""Deserializer(source) -> Deserializer instance"""
def __init__(self, source):
"""Initialize the Deserializer instance's attributes."""
super().__init__(source)
self._finished = False
self._parents = None
#property
def finished(self):
"""Check if the object has reached the end of the file yet."""
return self._finished
def run(self, destination):
"""Load the source file-like object onto the destination directory."""
if self._finished:
raise EOFError('end of file was found')
self._parents = [pathlib.Path(destination).resolve()]
starting_run = True
while True:
byte = self._stream.read(1)
if not byte:
self._finished = True
if starting_run:
raise IOError('unexpected file termination detected')
break
self._header = byte[0]
if self._get_bits(self.RECORD_TYPE) == _RecordType.FILE:
self._release_file()
else:
type_code = self._get_bits(self.DIRECTORY_TYPE_CODE)
if type_code == _DirectoryTypeCode.GENUINE:
self._release_dir()
elif type_code == _DirectoryTypeCode.SEPARATOR:
if starting_run:
raise IOError('empty record detected')
break
else:
raise IOError('reserved directory type code detected')
starting_run = False
def _get_bits(self, bit_field):
"""Extract width number of bits from header starting at offset."""
return self._header >> bit_field.offset & (1 << bit_field.width) - 1
def _release_dir(self):
"""Deserialize a directory."""
pointer_length = self._get_bits(self.POINTER_LENGTH) + 1
name_size_length = self._get_bits(self.NAME_SIZE_LENGTH) + 1
content_flag = bool(self._get_bits(self.CONTENT_FLAG))
# After decoding the header byte, read and process the remaining data.
pointer = self._bytes_to_int(self._read(pointer_length))
name_size = self._bytes_to_int(self._read(name_size_length))
name = self._read(name_size).decode(self.NAME_ENCODING)
path = self._parents[pointer] / name
path.mkdir()
if content_flag:
self._parents.append(path)
def _release_file(self):
"""Deserialize a file."""
pointer_length = self._get_bits(self.POINTER_LENGTH) + 1
name_size_length = self._get_bits(self.NAME_SIZE_LENGTH) + 1
data_size_length = self._get_bits(self.FILE_DATA_SIZE_LENGTH) + 1
# After decoding the header byte, read and process the remaining data.
pointer = self._bytes_to_int(self._read(pointer_length))
name_size = self._bytes_to_int(self._read(name_size_length))
name = self._read(name_size).decode(self.NAME_ENCODING)
with (self._parents[pointer] / name).open('wb') as destination:
future_data = self._bytes_to_int(self._read(data_size_length))
while future_data:
buffer = self._stream.read(min(future_data, self.BUFFER_SIZE))
if not buffer:
raise IOError('end of file was found')
self._write(destination, buffer)
future_data -= len(buffer)
def _read(self, future_data):
"""Read at least as many bytes from the source as requested."""
if future_data:
buffer = bytearray()
while future_data:
data = self._stream.read(future_data)
if not data:
raise IOError('end of file was found')
buffer.extend(data)
future_data -= len(data)
return buffer
raise IOError('request for zero bytes found')

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How can I use BytesIO as a pandas.read_csv data source - python-3.x

Related

Returning a generator in a with statement

Django - Batch Actions in a ListView - Select Rows and Return a Zip Archive in Response

Wrap an io.BufferedIOBase such that it becomes seek-able

Create sub stream in python

How could I create my own custom .zip format in python34?

Categories

Resources