Python: Write content to an open stream - python

I am working with an API that takes an open binary file as a parameter and then performs blocking reads on that until EOF.
Rather than opening an existing file (io.open mode 'rb') I want to pass it a stream that I write calculated/constructed content to - in effect I want something that is conceptually a unidirectional pipe where the output is delivered via an inputstream that is interchangeable with an open file.
I looked at BufferedRWPair but the few examples I could find all violate it's warnings not to use the same object for the input and output sides.
If anyone has an appropriate example or better suggestion, it's welcome!
I've looked at BufferedRandom based on comments here, but I'm obviously doing something wrong as....
import io
buf = io.BufferedRandom(io.BytesIO())
buf.write("a")
buf.write("b")
buf.flush()
while True:
print "reading"
a = buf.read(1024)
if not a: break
print "read: {}".format(a)
buf.close()
This exits after the first read
update
This admittedly messy example shows the solution, having to maintain independent read and write positions
import io
buf = io.BufferedRandom(io.BytesIO())
read = 0
wrote = 0
buf.seek(wrote)
wrote += buf.write(b"a")
wrote += buf.write(b"b")
buf.seek(read)
data = buf.read(1)
read += len(data)
buf.seek(wrote)
wrote += buf.write(b"c")
print "read: {}".format(data)
buf.seek(read)
data = buf.read(512)
read += len(data)
wrote += buf.write(b"d")
buf.seek(wrote)
wrote += buf.write(b"efghihjlmnop")
while data:
print "read: {}".format(data)
buf.seek(read)
data = buf.read(1024)
read += len(data)
buf.close()

Comment: ... allow me to interleave reads and writes to the stream without ... managing the current read and write positions myself.
This is the behave of io.BufferedRandom.
But you can encapsulate the logic into a own class StreamRW(io.BufferedRandom),
for instance:
class StreamRW(io.BufferedRandom):
def __init__(self, raw):
super().__init__(raw)
self.seek(0)
def read(self, size=1):
super().seek(self.read_offset)
data = super().read(size)
self.read_offset = self.tell()
return data
def write(self, data):
super().seek(self.write_offset)
written = super().write(data)
self.write_offset = self.tell()
return written
def seek(self, offset):
super().seek(offset)
self.read_offset = self.write_offset = self.tell()
#Usage:
buf = StreamRW(io.BytesIO())
...
Further code as below, but without buf.seek(0)!
You have to use buf.seek(0) to rewind the file position.
Note: I have to use binary prefix b""!
This is working for me:
import io
buf = io.BufferedRandom(io.BytesIO())
buf.write(b"a")
buf.write(b"b")
buf.seek(0)
while True:
print "reading"
a = buf.read(1024)
if not a: break
print "read: {}".format(a)
buf.close()
Output:
read: b'ab'
Tested with Python: 3.4.2 and 2.7.9

Related

Pandas Read Continuously Growing CSV File

I have a continuously growing CSV File, that I want to periodically read. I am also only interested in new values.
I was hoping to do something like:
file_chunks = pd.read_csv('file.csv', chunksize=1)
while True:
do_something(next(file_chunks))
time.sleep(0.1)
in a frequency, that is faster than the .csv file is growing.
However, as soon as the iterator does not return a value once, it "breaks" and does not return values, even if the .csv file has grown in the meantime.
Is there a way to read continuously growing .csv files line by line?
you could build a try: except: around it or make and if statement that checks if file_chunks is not none first.
Like this it shouldnt break anymore and he only sleeps when he there are no more chunks left.
while True:
file_chunks = pd.read_csv('file.csv', chunksize=1)
while True:
try:
do_something(next(file_chunks))
except:
time.sleep(0.1)
This is easier to do with the standard csv module where you can write your own line iterator that knows how to read an updating file. This generator would read in binary mode so that it can track file position, close the file at EOF and poll its size for appended data. This can fail if the reader gets a partial file update because the other side hasn't flushed yet, or if a CSV cell contains and embedded new line that invalidates the reader's assumption that a binary mode newline always terminates a row.
import csv
import time
import os
import threading
import random
def rolling_reader(filename, poll_period=.1, encoding="utf-8"):
pos = 0
while True:
while True:
try:
if os.stat(filename).st_size > pos:
break
except FileNotFoundError:
pass
time.sleep(poll_period)
fp = open(filename, "rb")
fp.seek(pos)
for line in fp:
if line.strip():
yield line.decode("utf-8")
pos = fp.tell()
# ---- TEST - thread updates test.csv periodically
class GenCSVThread(threading.Thread):
def __init__(self, csv_name):
super().__init__(daemon=True)
self.csv_name = csv_name
self.start()
def run(self):
val = 1
while True:
with open(self.csv_name, "a") as fp:
for _ in range(random.randrange(4)):
fp.write(",".join(str(val) for _ in range(4)) + "\n")
val += 1
time.sleep(random.random())
if os.path.exists("test.csv"):
os.remove("test.csv")
test_gen = GenCSVThread("test.csv")
reader = csv.reader(rolling_reader("test.csv"))
for row in reader:
print(row)
A platform dependent update would be to use a facility such as inotify to trigger reads off of a file close operation to reduce the risk of partial data.

How to manage stream requests with base64 image file into json data respose?

I make a requests.post() call to the server, which replies me with a json, in this json there are some keys and also the base64 file.
This is an example of a response from the server:
The server responds like this:
'success' is the key to understanding if access with private data is
correct.
'message' is the key in case success is False (In this case being
success == True, the message is not shown
'data' is the dictionary key that contains the fileName and the
base64 format file
So:
{'success': True,
'message': '',
'data': {'fileName': 'Python_logo_and_wordmark.svg.png',
'file': 'iVBORw0KGgoAAAANSUhEUgAABLAAAA....'}} #To limit the space, I cut the very long bytes example
So the respose in json also contains the file, which I need to decode with base64.b64decode(r.json()['data']['file'])
Everything ok, I can get my file and decrypt it correctly.
The problem is that with large files I would like to use the stream method like this:
file = "G:\Python_logo_and_wordmark.svg.png"
if os.path.isfile(file):
os.remove(file)
def get_chunk(chunk):
# Try to decode the base64 file (Chunked)
# is this a wrong approach?
chunk = chunk.decode("ascii")
chunk = chunk.replace('"', '')
if "file" in chunk:
chunk = chunk.split('file:')[1]
elif "}}" in chunk:
chunk = chunk.split('}}')[0]
else:
chunk = chunk
chunk += "=" * ((4 - len(chunk) % 4) % 4)
chunk_decoded = base64.b64decode(chunk)
return chunk_decoded
r = requests.post(url=my_url, json=my_data, stream=True)
iter_content = r.iter_content(chunk_size=64)
while True:
chunk = next(iter_content, None)
if not chunk:
break
chunk_decoded = get_chunk(chunk)
with open(file, "ab") as file_object:
file_object.write(chunk_decoded)
iter_content chunks return this:
b'{"success":true,"message":"","data":{"fileName":"Python_logo_and'
b'_wordmark.svg.png","file":"iVBORw0KGgoAAAANSUhEUgAABLAAAAFkCAYAA'
b'AAwtsJRAAAABGdBTUEAALGPC\\/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAA'
b'dTAAAOpgAAA6mAAAF3CculE8AAAABmJLR0QA\\/wD\\/AP+gvaeTAACAAElEQVR42u'
b'zdeZwbdf0\\/8Nf7k2Ovdttyt7QIggoth1qUW1AQ5PLeAiK13UwWiqLiBZ4Eb+T6+'
There are errors inherent in padding sometimes in decoding, but after 1 week of trying I preferred to ask this question here, as I am afraid of being wrong approach to this situation.
I would like how to handle this situation in the right way
According to your requirement mentioned in the comment, I'm pointing out the current issues and probable future problems below:
In your get_chunck function, you're doing this:
chunk = chunk.decode("ascii")
chunk = chunk.replace('"', '')
if "file" in chunk:
chunk = chunk.split('file:')[1]
elif "}}" in chunk:
chunk = chunk.split('}}')[0]
else:
chunk = chunk
Now look into the first chunk given by iter_line:
b'{"success":true,"message":"","data":{"fileName":"Python_logo_and'
So, it will fall under the condition if "file" in chunk: as it contains this file string in the fileName. So when it will try to split this based on file:, it will return a list of one element, because the file was in fileName, not as file:. Hence the program will through following error:
Traceback (most recent call last):
File "main.py", line 7, in <module>
chunk = chunk.split('file:')[1]
IndexError: list index out of range
try if "file:" in chunk: instead.
Your program may also fail if the fileName contains something like "prod_file:someName". You have to check for that too.
A chunk that doesn't contain file can contain }}, so it can break what you're trying too achieve too.
You can modify the response server and wrap the start and ending of the file base64 encoded string with unique identifiers so that you can receive the response as below and therefore can identify the start and end of the file with guarantee in this stream approach. For example:
{'success': True,
'message': '',
'data': {'fileName': 'Python_logo_and_wordmark.svg.png',
'file': '0000101100iVBORw0KGgoAAAANSUhEUgAABLAAAA....0000101101'}}
I've appended 0000101100 as starting identifier and 0000101101 as ending. You can trim them off while writing to chunk/file. You can use any other unique identifier format as your own, not conflicting the base64 encoding.
Feel free to ask if there's any further confusion.
I tried to analyze your problem, and can't find solution better than #devReddir provided.
The reason is - it is impossible (or very difficult) to parse data before completely download it.
Workaround may be to save data as is in one big file and parse it by separate worker. That will allow to decrease server memory usage, when downloading file and avoid to loss data.
save file as is
...
while True:
chunk = next(iter_content, None)
if not chunk:
break
with open(file, "ab") as file_object:
file_object.write(chunk)
...
read file in separated worker
import json
import base64
with open("saved_as_is.json") as json_file:
json_object = json.load(json_file)
encoded_base64 = json_object['data']['file']
decoded = base64.b64decode(encoded_base64)
...
Why parse data on the fly is so difficult?
file separator may be splitted by two chunks:
b'... ... ... .., "fi'
b'le": "AAAB... ... .'
Actually \\ is a escape symbol and you must to handle it manually (and don't forget that \\ may be splitted by chunks → b'...\', b'\...'):
b'dTAAAOpgAAA6mAAAF3CculE8AAAABmJLR0QA\\/wD\\/AP+gvaeTAACAAElEQVR42u'
If file is super tiny, chunk line may be look like:
b'"file":"SUPERTINY_BASE64_DECODED", "fileName":"Python_lo'
And chunk.split('file:')[1] will don't work
base64 chunk must be multiple of 4, so if your first chunk (characters after "file":) will be 3 character length, you will be need to read next chunk and add one first character to end of previous chunk for all following iterations
So here is tones of nuances if you will try to parse data manually.
Howevevr, if you want to choose this hard way, here is how to decode base64 chunks.
And here is list of allowed base64 characters
If you want to use #devReddir's solution and store whole data in memory, not sure if here any profit of stream usage at all.
Okay, that is complete working solution:
Server side (main.py):
I added this code to be able run test server that responding json data with base64 encoded file.
Also I added some randomness in response to be able to check if string parsing independent on character position
import base64 as b
import json as j
from fastapi import FastAPI as f
import requests as r
import random as rr
import string as s
import uvicorn as u
banana_url = 'https://upload.wikimedia.org/wikipedia/commons/c/ce/PNG_demo_Banana.png'
banana_b64 = b.encodebytes(
r.get(banana_url, stream=True).raw.read())
banana_b64 = banana_b64.decode('ascii').replace('\n', '').encode('ascii')
def get_response(banana_file, banana_file_name):
random_status = ''
for i in range(rr.randint(3, 30)): random_status += rr.choice(s.ascii_letters)
banana_response = {
'status': random_status,
'data': {
'fileName': banana_file_name.split('/')[-1],
'file': banana_file,
}
}
if len(random_status) % 2 == 0:
banana_response['data']['random_payload'] = 'hello_world'
banana_response['random_payload'] = '%hello_world_again%'
return banana_response
app = f()
#app.get("/")
async def read_root():
resp = get_response(banana_b64, banana_url.split('/')[-1])
print('file length:', len(resp['data']['file']))
return resp
if __name__ == "__main__":
u.run('main:app', host="0.0.0.0", port=8000, reload=True, workers=1)
Client side (file downloader decoder.py):
import requests
import base64
# must be larger than len('"file":')
CHUNK_SIZE = 64
# iterable response
r = requests.get('http://127.0.0.1:8000', stream=True).iter_content(chunk_size=CHUNK_SIZE)
class ChunkParser:
file = None
total_length = 0
def close(self):
if self.file:
self.file.close()
def __init__(self, file_name) -> None:
self.file = open(file_name, 'ab')
def add_chunk(self, chunk):
# remove all escape symbols if existing
chunk = chunk.decode('ascii').replace('\\', '').encode('ascii')
# if chunk size is not multiple of 4, return modulo to be able add it in next chunk
modulo = b''
if not (l := len(chunk)) % 4 == 0:
modulo = chunk[l-(l%4):]
chunk = chunk[:l-(l%4)]
self.file.write(base64.b64decode(chunk))
self.total_length += len(chunk)
return modulo
prev_chunk = None
cur_chunk = None
writing_started = False
last_chunk = False
parser = ChunkParser('temp_file.png')
file_found = False
while True:
# set previous chunk on first iterations before modulo may be returned
if cur_chunk is not None and not writing_started:
prev_chunk = cur_chunk
# get current chunk
cur_chunk = next(r, None)
# skip first iteration
if prev_chunk is None:
continue
# break loop if no data
if not cur_chunk:
break
# concatenate two chunks to avoid b' ... "fil', b'e": ... ' patern
two_chunks = prev_chunk + cur_chunk
# if file key found get real base64 encoded data
if not file_found and '"file":' in two_chunks.decode('ascii'):
file_found = True
# get part after "file" key
two_chunks = two_chunks.decode('ascii').split('"file":')[1].encode('ascii')
if file_found and not writing_started:
# data should be started after first "-quote
# so cut all data before "
if '"' in (t := two_chunks.decode('ascii')):
two_chunks = t[t.find('"')+1:].encode('ascii')
writing_started = True
# handle b' ... "file":', b'"... ' patern
else:
cur_chunk = b''
continue
# check for last data chunk
# "-quote means end of value
if writing_started and '"' in (t := two_chunks.decode('ascii')):
two_chunks = t[:t.find('"')].encode('ascii')
last_chunk = True
if writing_started:
# decode and write data in file
prev_chunk = parser.add_chunk(two_chunks)
# end operation
if last_chunk:
if (l := len(prev_chunk)) > 0:
# if last modulo length is larget than 0, that meaning the data total length is not multiple of 4
# probably data loss appear?
raise ValueError(f'Bad end of data. length is {str(l)} and last characters are {prev_chunk.decode("ascii")}')
break
parser.close()
print(parser.total_length)
Don't forget to compare files after download when testing this script:
# get md5 of downloaded by chunks file
$ md5 temp_file.png
MD5 (temp_file.png) = 806165d96d5f9a25cebd2778ae4a3da2
# get md5 of downloaded file using browser
$ md5 PNG_demo_Banana.png
MD5 (PNG_demo_Banana.png) = 806165d96d5f9a25cebd2778ae4a3da2
You could stream it down to a file like this (pip install base64io):
class decoder():
def __init__(self, fh):
self.fileh = open(fh, 'rb')
self.closed = False
search = ''
start_tag = '"file": "'
for i in range(1024):
search += self.fileh.read(1).decode('UTF8')
if len(start_tag) > len(search)+1:
continue
if search[-len(start_tag):] == start_tag:
break
def read(self, chunk=1200):
data = self.fileh.read(chunk)
if not data:
self.close()
return b''
return data if not data.decode('UTF8').endswith('"}}') else data[:-3]
def close(self):
self.fileh.close()
self.closed = True
def closed(self):
return self.closed
def flush(self):
pass
def write(self):
pass
def readable(self):
return True
And then use the class like this:
from base64io import Base64IO
encoded_source = decoder(fh)
with open("target_file.jpg", "wb") as target, Base64IO(encoded_source) as source:
for line in source:
target.write(line)
But of course you need to change from streaming from local file to streaming from the requests.raw object.

How to play streaming audio using pyglet?

The goal of this question is trying to figure out how to play streaming audio using pyglet. The first is just making sure you're able to play mp3 files using pyglet, that's the purpose of this first snippet:
import sys
import inspect
import requests
import pyglet
from pyglet.media import *
pyglet.lib.load_library('avbin')
pyglet.have_avbin = True
def url_to_filename(url):
return url.split('/')[-1]
def download_file(url, filename=None):
filename = filename or url_to_filename(url)
with open(filename, "wb") as f:
print("Downloading %s" % filename)
response = requests.get(url, stream=True)
total_length = response.headers.get('content-length')
if total_length is None:
f.write(response.content)
else:
dl = 0
total_length = int(total_length)
for data in response.iter_content(chunk_size=4096):
dl += len(data)
f.write(data)
done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
sys.stdout.flush()
url = "https://freemusicarchive.org/file/music/ccCommunity/DASK/Abiogenesis/DASK_-_08_-_Protocell.mp3"
filename = "mcve.mp3"
download_file(url, filename)
music = pyglet.media.load(filename)
music.play()
pyglet.app.run()
If you've installed the libraries pip install pyglet requests and also installed AVBin at this point you should be able to listen the mp3 once it's been downloaded.
Once we've reached this point, I'd like to figure out how to play & buffering the file in a similar way to mostly of the existing web video/audio players using pyglet+requests. This means playing the files without waiting till the file has been downloaded completely.
After reading the pyglet media docs you can see there are available these classes:
media
sources
base
AudioData
AudioFormat
Source
SourceGroup
SourceInfo
StaticSource
StreamingSource
VideoFormat
player
Player
PlayerGroup
I've seen there are another similar SO questions but they haven't been solved properly and their content doesn't provide a lot of relevant details:
Play streaming audio using pyglet
How can I play audio stream without saving it into the file with pyglet?
That's why I've created a new question. How do you play streaming audio using pyglet? Could you provide a little example using the above mcve as a base?
Assuming you don't want to import a new package to do this for you - this can be done with a bit of effort.
First, let's head over to the Pyglet source code and have a look at media.load in media/__init__.py.
"""Load a Source from a file.
All decoders that are registered for the filename extension are tried.
If none succeed, the exception from the first decoder is raised.
You can also specifically pass a decoder to use.
:Parameters:
`filename` : str
Used to guess the media format, and to load the file if `file` is
unspecified.
`file` : file-like object or None
Source of media data in any supported format.
`streaming` : bool
If `False`, a :class:`StaticSource` will be returned; otherwise
(default) a :class:`~pyglet.media.StreamingSource` is created.
`decoder` : MediaDecoder or None
A specific decoder you wish to use, rather than relying on
automatic detection. If specified, no other decoders are tried.
:rtype: StreamingSource or Source
"""
if decoder:
return decoder.decode(file, filename, streaming)
else:
first_exception = None
for decoder in get_decoders(filename):
try:
loaded_source = decoder.decode(file, filename, streaming)
return loaded_source
except MediaDecodeException as e:
if not first_exception or first_exception.exception_priority < e.exception_priority:
first_exception = e
# TODO: Review this:
# The FFmpeg codec attempts to decode anything, so this codepath won't be reached.
if not first_exception:
raise MediaDecodeException('No decoders are available for this media format.')
raise first_exception
add_default_media_codecs()
The critical line here is loaded_source = decoder.decode(...). Essentially, to load audio Pyglet takes a file and hauls it over to a media decoder (eg. FFMPEG), which then returns a list of 'frames' or packets that Pyglet can play with a built-in Player class. If the audio format is compressed (eg. mp3 or aac), Pyglet will use an external library (currently only AVBin is supported) to convert it to raw, decompressed audio. You probably already know some of this.
So if we want to see how we can stuff a stream of bytes into Pyglet's audio engine rather than a file, we'll need to take a look at one of the decoders. For this example, let's use FFMPEG as it's the easiest to access.
In media/codecs/ffmpeg.py:
class FFmpegDecoder(object):
def get_file_extensions(self):
return ['.mp3', '.ogg']
def decode(self, file, filename, streaming):
if streaming:
return FFmpegSource(filename, file)
else:
return StaticSource(FFmpegSource(filename, file))
The 'object' it inherits from is MediaDecoder, found in media/codecs/__init__.py. Back at the load function in media/__init__.py, you'll see pyglet will choose a MediaDecoder based on file extension, then return its decode function with the file as a parameter to get the audio in the form of a packet stream. That packet stream is a Source object; each decoder has its own flavor, in the form of StaticSource or StreamingSource. The former is used to store audio in memory, and the latter to play it immediately. FFmpeg's decoder only supports StreamingSource.
We can see that FFMPEG's is FFmpegSource, also located in media/codecs/ffmpeg.py. We find this Goliath of a class:
class FFmpegSource(StreamingSource):
# Max increase/decrease of original sample size
SAMPLE_CORRECTION_PERCENT_MAX = 10
def __init__(self, filename, file=None):
if file is not None:
raise NotImplementedError('Loading from file stream is not supported')
self._file = ffmpeg_open_filename(asbytes_filename(filename))
if not self._file:
raise FFmpegException('Could not open "{0}"'.format(filename))
self._video_stream = None
self._video_stream_index = None
self._audio_stream = None
self._audio_stream_index = None
self._audio_format = None
self.img_convert_ctx = POINTER(SwsContext)()
self.audio_convert_ctx = POINTER(SwrContext)()
file_info = ffmpeg_file_info(self._file)
self.info = SourceInfo()
self.info.title = file_info.title
self.info.author = file_info.author
self.info.copyright = file_info.copyright
self.info.comment = file_info.comment
self.info.album = file_info.album
self.info.year = file_info.year
self.info.track = file_info.track
self.info.genre = file_info.genre
# Pick the first video and audio streams found, ignore others.
for i in range(file_info.n_streams):
info = ffmpeg_stream_info(self._file, i)
if isinstance(info, StreamVideoInfo) and self._video_stream is None:
stream = ffmpeg_open_stream(self._file, i)
self.video_format = VideoFormat(
width=info.width,
height=info.height)
if info.sample_aspect_num != 0:
self.video_format.sample_aspect = (
float(info.sample_aspect_num) /
info.sample_aspect_den)
self.video_format.frame_rate = (
float(info.frame_rate_num) /
info.frame_rate_den)
self._video_stream = stream
self._video_stream_index = i
elif (isinstance(info, StreamAudioInfo) and
info.sample_bits in (8, 16) and
self._audio_stream is None):
stream = ffmpeg_open_stream(self._file, i)
self.audio_format = AudioFormat(
channels=min(2, info.channels),
sample_size=info.sample_bits,
sample_rate=info.sample_rate)
self._audio_stream = stream
self._audio_stream_index = i
channel_input = avutil.av_get_default_channel_layout(info.channels)
channels_out = min(2, info.channels)
channel_output = avutil.av_get_default_channel_layout(channels_out)
sample_rate = stream.codec_context.contents.sample_rate
sample_format = stream.codec_context.contents.sample_fmt
if sample_format in (AV_SAMPLE_FMT_U8, AV_SAMPLE_FMT_U8P):
self.tgt_format = AV_SAMPLE_FMT_U8
elif sample_format in (AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P):
self.tgt_format = AV_SAMPLE_FMT_S16
elif sample_format in (AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_S32P):
self.tgt_format = AV_SAMPLE_FMT_S32
elif sample_format in (AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP):
self.tgt_format = AV_SAMPLE_FMT_S16
else:
raise FFmpegException('Audio format not supported.')
self.audio_convert_ctx = swresample.swr_alloc_set_opts(None,
channel_output,
self.tgt_format, sample_rate,
channel_input, sample_format,
sample_rate,
0, None)
if (not self.audio_convert_ctx or
swresample.swr_init(self.audio_convert_ctx) < 0):
swresample.swr_free(self.audio_convert_ctx)
raise FFmpegException('Cannot create sample rate converter.')
self._packet = ffmpeg_init_packet()
self._events = [] # They don't seem to be used!
self.audioq = deque()
# Make queue big enough to accomodate 1.2 sec?
self._max_len_audioq = 50 # Need to figure out a correct amount
if self.audio_format:
# Buffer 1 sec worth of audio
self._audio_buffer = \
(c_uint8 * ffmpeg_get_audio_buffer_size(self.audio_format))()
self.videoq = deque()
self._max_len_videoq = 50 # Need to figure out a correct amount
self.start_time = self._get_start_time()
self._duration = timestamp_from_ffmpeg(file_info.duration)
self._duration -= self.start_time
# Flag to determine if the _fillq method was already scheduled
self._fillq_scheduled = False
self._fillq()
# Don't understand why, but some files show that seeking without
# reading the first few packets results in a seeking where we lose
# many packets at the beginning.
# We only seek back to 0 for media which have a start_time > 0
if self.start_time > 0:
self.seek(0.0)
---
[A few hundred lines more...]
---
def get_next_video_timestamp(self):
if not self.video_format:
return
if self.videoq:
while True:
# We skip video packets which are not video frames
# This happens in mkv files for the first few frames.
video_packet = self.videoq[0]
if video_packet.image == 0:
self._decode_video_packet(video_packet)
if video_packet.image is not None:
break
self._get_video_packet()
ts = video_packet.timestamp
else:
ts = None
if _debug:
print('Next video timestamp is', ts)
return ts
def get_next_video_frame(self, skip_empty_frame=True):
if not self.video_format:
return
while True:
# We skip video packets which are not video frames
# This happens in mkv files for the first few frames.
video_packet = self._get_video_packet()
if video_packet.image == 0:
self._decode_video_packet(video_packet)
if video_packet.image is not None or not skip_empty_frame:
break
if _debug:
print('Returning', video_packet)
return video_packet.image
def _get_start_time(self):
def streams():
format_context = self._file.context
for idx in (self._video_stream_index, self._audio_stream_index):
if idx is None:
continue
stream = format_context.contents.streams[idx].contents
yield stream
def start_times(streams):
yield 0
for stream in streams:
start = stream.start_time
if start == AV_NOPTS_VALUE:
yield 0
start_time = avutil.av_rescale_q(start,
stream.time_base,
AV_TIME_BASE_Q)
start_time = timestamp_from_ffmpeg(start_time)
yield start_time
return max(start_times(streams()))
#property
def audio_format(self):
return self._audio_format
#audio_format.setter
def audio_format(self, value):
self._audio_format = value
if value is None:
self.audioq.clear()
The line you'll be interested in here is self._file = ffmpeg_open_filename(asbytes_filename(filename)). This brings us here, once again in media/codecs/ffmpeg.py:
def ffmpeg_open_filename(filename):
"""Open the media file.
:rtype: FFmpegFile
:return: The structure containing all the information for the media.
"""
file = FFmpegFile() # TODO: delete this structure and use directly AVFormatContext
result = avformat.avformat_open_input(byref(file.context),
filename,
None,
None)
if result != 0:
raise FFmpegException('Error opening file ' + filename.decode("utf8"))
result = avformat.avformat_find_stream_info(file.context, None)
if result < 0:
raise FFmpegException('Could not find stream info')
return file
and this is where things get messy: it calls to a ctypes function (avformat_open_input) that when given a file, will grab its details and fill out all the information it needs for our FFmpegSource class. With some work, you should be able to get avformat_open_input to take a bytes object rather than a path to a file which it will open to get the same information. I'd love to do this and include a working example, but I don't have the time right now. You'd then need to make a new ffmpeg_open_filename function utilizing the new avformat_open_input function, and then a new FFmpegSource class utilizing the new ffmpeg_open_filename function. All you need now is a new FFmpegDecoder class utilizing the new FFmpegSource class.
You could then implement this by adding it to your pyglet package directly. After, you'd want to add support for a byte object argument in the load() function (located in media/__init__.py and override the decoder to your new one. And there, you would now be able to stream audio without saving it.
Or, you could simply use a package that already supports it. Python-vlc does. You could use the example here to play whatever audio you'd like from a link. If you aren't doing this just for a challenge, I would strongly recommend you use another package. Otherwise: good luck.

How to get the internal position while reading bzip2 file

I've got a script to decompress and parse data contained in a bunch of very large bzip2 compressed files. Since it can take a while I'd like to have some way to monitor the progress. I know I can get the file size with os.path.getsize(), but bz2.BZ2File.tell() returns the position within the uncompressed data. Is there any way to get the current position within the uncompressed file so I can monitor the progress?
Bonus points if there's a python equivalent to Java's ProgressMonitorInputStream.
If you only need to parse the data in the bziped file, I think it should be possible to avoid to unzip the file before reading it. I have not tested it on bzip, but on gziped files. I hope this is also possible with bziped files.
See for instance :
How to write csv in python efficiently?.
This is the solution I came up with that seems to work.
import bz2
class SimpleBZ2File(object):
def __init__(self,path,readsize=1024):
self.decomp = bz2.BZ2Decompressor()
self.rawinput = open(path,'rb')
self.eof = False
self.readsize = readsize
self.leftover = ''
def tell(self):
return self.rawinput.tell()
def __iter__(self):
while not self.eof:
rawdata = self.rawinput.read(self.readsize)
if rawdata == '':
self.eof = True
else:
data = self.decomp.decompress(rawdata)
if not data:
continue #we need to supply more raw to decompress
newlines = list(data.splitlines(True))
yield self.leftover + newlines[0]
self.leftover = ''
for l in newlines[1:-1]:
yield l
if newlines[-1].endswith('\n'):
yield newlines[-1]
else:
self.leftover = newlines[-1]
if self.leftover:
yield self.leftover
self.rawinput.close()

Create and stream a large archive without storing it in memory or on disk

I want to allow users to download an archive of multiple large files at once. However, the files and the archive may be too large to store in memory or on disk on my server (they are streamed in from other servers on the fly). I'd like to generate the archive as I stream it to the user.
I can use Tar or Zip or whatever is simplest. I am using django, which allows me to return a generator or file-like object in my response. This object could be used to pump the process along. However, I am having trouble figuring out how to build this sort of thing around the zipfile or tarfile libraries, and I'm afraid they may not support reading files as they go, or reading the archive as it is built.
This answer on converting an iterator to a file-like object might help. tarfile#addfile takes an iterable, but it appears to immediately pass that to shutil.copyfileobj, so this may not be as generator-friendly as I had hoped.
I ended up using SpiderOak ZipStream.
You can do it by generating and streaming a zip file with no compression, which is basically to just add the headers before each file's content. You're right, the libraries don't support this, but you can hack around them to get it working.
This code wraps zipfile.ZipFile with a class that manages the stream and creates instances of zipfile.ZipInfo for the files as they come. CRC and size can be set at the end. You can push data from the input stream into it with put_file(), write() and flush(), and read data out of it to the output stream with read().
import struct
import zipfile
import time
from StringIO import StringIO
class ZipStreamer(object):
def __init__(self):
self.out_stream = StringIO()
# write to the stringIO with no compression
self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
self.current_file = None
self._last_streamed = 0
def put_file(self, name, date_time=None):
if date_time is None:
date_time = time.localtime(time.time())[:6]
zinfo = zipfile.ZipInfo(name, date_time)
zinfo.compress_type = zipfile.ZIP_STORED
zinfo.flag_bits = 0x08
zinfo.external_attr = 0600 << 16
zinfo.header_offset = self.out_stream.pos
# write right values later
zinfo.CRC = 0
zinfo.file_size = 0
zinfo.compress_size = 0
self.zipfile._writecheck(zinfo)
# write header to stream
self.out_stream.write(zinfo.FileHeader())
self.current_file = zinfo
def flush(self):
zinfo = self.current_file
self.out_stream.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
self.zipfile.filelist.append(zinfo)
self.zipfile.NameToInfo[zinfo.filename] = zinfo
self.current_file = None
def write(self, bytes):
self.out_stream.write(bytes)
self.out_stream.flush()
zinfo = self.current_file
# update these...
zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
zinfo.file_size += len(bytes)
zinfo.compress_size += len(bytes)
def read(self):
i = self.out_stream.pos
self.out_stream.seek(self._last_streamed)
bytes = self.out_stream.read()
self.out_stream.seek(i)
self._last_streamed = i
return bytes
def close(self):
self.zipfile.close()
Keep in mind that this code was just a quick proof of concept and I did no further development or testing once I decided to let the http server itself deal with this problem. A few things you should look into if you decide to use it is to check if nested folders are archived correctly, and filename encoding (which is always a pain with zip files anyway).
You can stream a ZipFile to a Pylons or Django response fileobj by wrapping the fileobj in something file-like that implements tell(). This will buffer each individual file in the zip in memory, but stream the zip itself. We use it to stream download a zip file full of images, so we never buffer more than a single image in memory.
This example streams to sys.stdout. For Pylons use response.body_file, for Django you can use the HttpResponse itself as a file.
import zipfile
import sys
class StreamFile(object):
def __init__(self, fileobj):
self.fileobj = fileobj
self.pos = 0
def write(self, str):
self.fileobj.write(str)
self.pos += len(str)
def tell(self):
return self.pos
def flush(self):
self.fileobj.flush()
# Wrap a stream so ZipFile can use it
out = StreamFile(sys.stdout)
z = zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED)
for i in range(5):
z.writestr("hello{0}.txt".format(i), "this is hello{0} contents\n".format(i) * 3)
z.close()
Here is the solution from Pedro Werneck (from above) but with a fix to avoid collecting all data in memory (read method is fixed a little bit):
class ZipStreamer(object):
def __init__(self):
self.out_stream = StringIO.StringIO()
# write to the stringIO with no compression
self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
self.current_file = None
self._last_streamed = 0
def put_file(self, name, date_time=None):
if date_time is None:
date_time = time.localtime(time.time())[:6]
zinfo = zipfile.ZipInfo(name, date_time)
zinfo.compress_type = zipfile.ZIP_STORED
zinfo.flag_bits = 0x08
zinfo.external_attr = 0600 << 16
zinfo.header_offset = self.out_stream.pos
# write right values later
zinfo.CRC = 0
zinfo.file_size = 0
zinfo.compress_size = 0
self.zipfile._writecheck(zinfo)
# write header to mega_streamer
self.out_stream.write(zinfo.FileHeader())
self.current_file = zinfo
def flush(self):
zinfo = self.current_file
self.out_stream.write(
struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
zinfo.file_size))
self.zipfile.filelist.append(zinfo)
self.zipfile.NameToInfo[zinfo.filename] = zinfo
self.current_file = None
def write(self, bytes):
self.out_stream.write(bytes)
self.out_stream.flush()
zinfo = self.current_file
# update these...
zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
zinfo.file_size += len(bytes)
zinfo.compress_size += len(bytes)
def read(self):
self.out_stream.seek(self._last_streamed)
bytes = self.out_stream.read()
self._last_streamed = 0
# cleaning up memory in each iteration
self.out_stream.seek(0)
self.out_stream.truncate()
self.out_stream.flush()
return bytes
def close(self):
self.zipfile.close()
then you can use stream_generator function as a stream for a zip file
def stream_generator(files_paths):
s = ZipStreamer()
for f in files_paths:
s.put_file(f)
with open(f) as _f:
s.write(_f.read())
s.flush()
yield s.read()
s.close()
example for Falcon:
class StreamZipEndpoint(object):
def on_get(self, req, resp):
files_pathes = [
'/path/to/file/1',
'/path/to/file/2',
]
zip_filename = 'output_filename.zip'
resp.content_type = 'application/zip'
resp.set_headers([
('Content-Disposition', 'attachment; filename="%s"' % (
zip_filename,))
])
resp.stream = stream_generator(files_pathes)
An option is to use stream-zip (full disclosure: written by me)
Amending its example slightly:
from datetime import datetime
from stream_zip import stream_zip, ZIP_64
def non_zipped_files():
modified_at = datetime.now()
perms = 0o600
# Hard coded in this example, but in real cases could
# for example yield data from a remote source
def file_1_data():
for i in range(0, 1000):
yield b'Some bytes'
def file_2_data():
for i in range(0, 1000):
yield b'Some bytes'
yield 'my-file-1.txt', modified_at, perms, ZIP64, file_1_data()
yield 'my-file-2.txt', modified_at, perms, ZIP64, file_2_data()
zipped_chunks = stream_zip(non_zipped_files())
# Can print each chunk, or return them to a client,
# say using Django's StreamingHttpResponse
for zipped_chunk in zipped_chunks:
print(zipped_chunk)

Categories