I'm trying to upload a file of around 1 GB to Amazon Glacier. Somewhat arbitrarily, I've decided to break it into 32 MB parts and upload them serially.
import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600  # in bytes
size_mb = size / (2**20)  # Convert to megabytes for readability
local_file = 'filename'

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                            archiveDescription=local_file,
                                            partSize=str(2**25))  # 32 mb in bytes
parts = math.floor(size_mb / 32)

with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is bigger than the ones that come before.
        lower = (p * (2**25))
        upper = (((p + 1) * (2**25)) - 1) if (p + 1 < parts) else (size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=multi_up['uploadId'],
                                               range='bytes {}-{}/*'.format(lower, upper),
                                               body=upload)
    checksum = calculate_tree_hash(upload)
    complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                                   checksum=checksum,
                                                   uploadId=multi_up['uploadId'],
                                                   vaultName=vault_name)
This generates an error about the first byte range.
---------------------------------------------------------------------------
InvalidParameterValueException Traceback (most recent call last)
<ipython-input-2-9dd3ac986601> in <module>()
93 uploadId=multi_up['uploadId'],
94 range='bytes {}-{}/*'.format(lower, upper),
---> 95 body=upload)
96 upload_info.append(up_part)
97 checksum = calculate_tree_hash(upload)
~/anaconda/lib/python3.5/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
251 "%s() only accepts keyword arguments." % py_operation_name)
252 # The "self" in this scope is referring to the BaseClient.
--> 253 return self._make_api_call(operation_name, kwargs)
254
255 _api_call.__name__ = str(py_operation_name)
~/anaconda/lib/python3.5/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
555 error_code = parsed_response.get("Error", {}).get("Code")
556 error_class = self.exceptions.from_code(error_code)
--> 557 raise error_class(parsed_response, operation_name)
558 else:
559 return parsed_response
InvalidParameterValueException: An error occurred (InvalidParameterValueException) when calling the UploadMultipartPart operation:
Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600
Can anyone see what I'm doing wrong?
@Michael-sqlbot is quite right: the issue with the Content-Range was that I was passing the whole file instead of a part. I fixed this by using the read() method, but then I discovered a separate issue, which is that (per the docs) the final part has to be the same size as or smaller than the preceding parts. This means using math.ceil() instead of math.floor() to define the number of parts.
The working code is:
import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600  # in bytes
size_mb = size / (2**20)  # Convert to megabytes for readability
local_file = 'filename'
partSize = (2**25)

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                            archiveDescription=local_file,
                                            partSize=str(partSize))  # 32 mb in bytes
parts = math.ceil(size_mb / 32)  # The number of <=32mb parts we need

with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is now smaller than the ones that come before.
        lower = (p * partSize)
        upper = (((p + 1) * partSize) - 1) if (p + 1 < parts) else (size - 1)
        read_size = upper - lower + 1
        file_part = upload.read(read_size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=multi_up['uploadId'],
                                               range='bytes {}-{}/*'.format(lower, upper),
                                               body=file_part)
    checksum = calculate_tree_hash(upload)
    complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                                   checksum=checksum,
                                                   uploadId=multi_up['uploadId'],
                                                   vaultName=vault_name)
Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600
You're telling the API that you're sending the first 32 MiB, but you're actually sending (proposing to send) the entire file, since body=upload, and upload isn't just the first part: it's the entire file. The Content-Length refers to the size of this part's upload, which should be 33554432 (32 MiB).
The docs are admittedly ambiguous...
body (bytes or seekable file-like object) -- The data to upload.
...but the "data to upload" seems to refer to the data for only this part, in spite of the word "seekable."
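In other words, each call should read just that part's slice of the file and pass those bytes as body. A minimal sketch of the idea, reusing the names from the question (this is not a complete upload script, and the checksum/complete steps are omitted):
part_size = 2**25  # 32 MiB

with open(local_file, 'rb') as f:
    for p in range(parts):
        part_data = f.read(part_size)       # only this part's bytes
        lower = p * part_size
        upper = lower + len(part_data) - 1  # len(part_data) is what Content-Length will be
        client.upload_multipart_part(vaultName=vault_name,
                                     uploadId=multi_up['uploadId'],
                                     range='bytes {}-{}/*'.format(lower, upper),
                                     body=part_data)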
Since the follow-up answer from Alex claims it "works", I'm posting another version that worked for me under Python 3.5 and Ubuntu 16.04. I also added some environment variables from our production end-to-end solution.
The original post gave me an error, so I tweaked it and did some cleanup. Hope this helps someone needing this Glacier functionality. Using a shell script with awscli commands was not as clean.
import math
import boto3
import os
from botocore.utils import calculate_tree_hash

vault_name = os.getenv('GLACIER_VAULT_NAME')
file_name = os.getenv('GLACIER_UPLOAD_FILE')

if vault_name is None:
    print('GLACIER_VAULT_NAME environment variable is required. Exiting.')
    exit(1)

if file_name is None:
    print('GLACIER_UPLOAD_FILE environment variable is required. Exiting.')
    exit(2)

chunk_size = 2 ** 25
client = boto3.client('glacier')
client.create_vault(vaultName=vault_name)

upload_obj = client.initiate_multipart_upload(vaultName=vault_name,
                                              archiveDescription=file_name,
                                              partSize=str(chunk_size))
file_size = os.path.getsize(file_name)
parts = math.ceil(file_size / chunk_size)

with open(file_name, 'rb') as upload:
    for p in range(parts):
        lower = p * chunk_size
        upper = lower + chunk_size - 1
        if upper > file_size:
            upper = (file_size - lower) + lower - 1
        file_part = upload.read(chunk_size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               range='bytes {}-{}/{}'.format(lower,
                                                                             upper,
                                                                             file_size),
                                               body=file_part)

# this needs a new file handler because calculate_tree_hash() processes
# the handler in a similar way to the loop above
checksum = calculate_tree_hash(open(file_name, 'rb'))

complete_up = client.complete_multipart_upload(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               archiveSize=str(file_size),
                                               checksum=checksum)
print(complete_up)
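For reference, this version expects both environment variables to be set before it runs, for example: GLACIER_VAULT_NAME=my-vault GLACIER_UPLOAD_FILE=/path/to/backup.tar python3 glacier_upload.py (the vault name, file path and script name here are just placeholders).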
Related
I am doing UART communication using the serial library. I can send and receive data, but when the number of bytes sent reaches 32768 or more, I get the error:
ValueError: byte must be in range(0,256)
function definition:
def Write_to_serial_port(value):
    ser.write([value])
function usage:
#...
for i in data_buf[0:total_len]: # data_buf is a list
    Write_to_serial_port(i)
#...
The error that occurs when the number of bytes sent reaches 32768 is the ValueError shown above.
An alternative serial port write function I tested:
def Write_to_serial_port(value):
    data = struct.pack('>B', value)
    ser.write(data)
Again, the same error occurs when the number of bytes sent reaches 32768.
I also tried periodically flushing the input and output buffers, but it didn't help.
Any ideas on the solution?
EDIT1:
The purpose of the program is to send the bytes of a binary file. In each cycle, I send 128 bytes (from the binary file) plus 13 bytes of CRC, file size, etc. information over the serial port. data_buf's size is 255 bytes, but I am using 141 bytes.
function usage(Extended):
# ...
# Some definitions and assignments
# ...
while(bytes_remaining):
    if(bytes_remaining >= 128):
        len_to_read = 128
    else:
        len_to_read = bytes_remaining

    for x in range(len_to_read):
        file_read_value = bin_file.read(1)
        data_buf[9+x] = int(file_read_value[0])

    data_buf[0] = mem_write_cmd_total_len-1
    data_buf[1] = write_cmd
    data_buf[2] = word_to_byte(base_mem_address,1,1)
    data_buf[3] = word_to_byte(base_mem_address,2,1)
    data_buf[4] = word_to_byte(base_mem_address,3,1)
    data_buf[5] = word_to_byte(base_mem_address,4,1)
    data_buf[6] = gl_bin_file_sector_needed
    data_buf[7] = len_to_read
    data_buf[8] = send_count

    send_count = send_count + 1

    crc32 = get_crc(data_buf,mem_write_cmd_total_len - 4)
    data_buf[9 +len_to_read] = word_to_byte(crc32,1,1)
    data_buf[10+len_to_read] = word_to_byte(crc32,2,1)
    data_buf[11+len_to_read] = word_to_byte(crc32,3,1)
    data_buf[12+len_to_read] = word_to_byte(crc32,4,1)

    for i in data_buf[0:mem_write_cmd_total_len]:
        Write_to_serial_port(i)
#...
EDIT2: I also tried splitting the 40 KB binary file into 128-byte chunk files and sending those. But I got the same error on the 256th file. I guess 256*128 = 32768 can't be a coincidence.
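For context on that last observation: both write variants above have to pack each value into a single byte, which only works for values 0-255. Below is a tiny, serial-free sketch that reproduces the same ValueError and shows the usual masking workaround; it only illustrates the 0-255 limit, and whether one of the header bytes (such as send_count) is the value that overflows is an assumption, not something shown in the question.
for value in (255, 256):
    try:
        bytearray([value])                  # the same byte conversion a list-based write has to do
    except ValueError as exc:
        print("%d -> %s" % (value, exc))    # 256 -> byte must be in range(0, 256)

print(list(bytearray([256 & 0xFF])))        # masking to one byte gives [0]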
When accessing a raw disk on Windows via Python's open(), for whatever reason it does not allow me to read the last 10240 bytes (i.e., the last 5 sectors at 2048 bytes/sector).
When dumping the disc image by other means and comparing the images I can see that the data cannot be assumed to be empty either. In fact, the first of the missing sectors has a UDF Anchor Tag present with related metadata in it. The following sectors are entirely blank.
This is how I dumped the disc contents:
out = open("test.iso", "wb")
with open(r"\\.\D:", "rb") as f:
    while True:
        data = f.read(512)
        if len(data) == 0:
            break
        out.write(data)
If I take that same open() object and tell it to seek to the very end of the disc, it does. So it can clearly reach the sectors at least in terms of seeking. If I then seek back 10240 bytes then attempt to f.read(...), it returns b'' (empty result) and not an error. It doesn't matter what size I tell it to read either. I tried all kinds of sizes, no-arg/default, 1, 12, 255, 512, 2048, 999999, etc.
Another StackOverflow answer on a different (but related) question also reported similar findings on Enhanced Audio Discs but seemingly no discussion was brought up since.
I have tested this on multiple DVD discs from varying kinds of studios and creators, all of which are in great condition with it still occurring.
Example reproducing code (I don't know if it's going to happen to you on your system config/disc/reader):
PyPI dependencies: wmi
WMI also reports the disc size minus 10240, so perhaps it's a Windows issue?
import os
from wmi import WMI

DISC_LETTER = "D:"

c = WMI()
disc_info = next(iter(c.Win32_CDROMDrive(Drive=DISC_LETTER)), None)
if not disc_info:
    raise ValueError("Disc %s not found..." % DISC_LETTER)

disc_size = int(disc_info.size)
disc_size += 10240  # WMI also reports the size without 10240, but it is real!

f = open(r"\\.\%s" % DISC_LETTER, "rb")

f.seek(disc_size)
if f.tell() == disc_size:
    print("Seeked to the end of the disc...")

f.seek(-10240, os.SEEK_CUR)
if f.tell() == disc_size - (2048 * 5):
    print("Seeked 5 sectors before the end of the disc...")

data = f.read(2048 * 5)
print("Data read (len: %d): %s" % (len(data), data))
Any ideas on why this might be would be great, as I have tried everything I could.
It seems this occurs because open(r'\\.\N:') opens the device with restricted boundaries.
My solution was to open the disc with IOCTL instead of open(). Specifically with CreateFile, DeviceIoControl, and FSCTL_ALLOW_EXTENDED_DASD_IO.
handle = win32file.CreateFile(
    r"\\.\D:",
    win32con.MAXIMUM_ALLOWED,
    win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE,
    None,
    win32con.OPEN_EXISTING,
    win32con.FILE_ATTRIBUTE_NORMAL,
    None
)
if handle == win32file.INVALID_HANDLE_VALUE:
    raise RuntimeError("Failed to obtain device handle...")

win32file.DeviceIoControl(handle, winioctlcon.FSCTL_ALLOW_EXTENDED_DASD_IO, None, None)
From here I can use ReadFile and SetFilePointer as replacements for read and seek respectively.
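For example, dumping the last five sectors with the handle from above might look roughly like this. This is only a sketch: it assumes a 2048-byte sector size and that the disc size is a multiple of it, and it keeps error handling to a minimum.
sector_size = 2048                 # assumed DVD sector size
tail = 5 * sector_size

win32file.SetFilePointer(handle, -tail, win32file.FILE_END)  # seek relative to the end
err, data = win32file.ReadFile(handle, tail, None)           # read the final sectors
if err != 0:
    raise IOError("ReadFile failed with code %d" % err)
print(len(data))  # 10240 rather than 0, now that FSCTL_ALLOW_EXTENDED_DASD_IO is applied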
I even worked on a new class that loads it all and allows you to dynamically read and seek without having to worry about sector alignment.
import math
import os
import struct
from typing import Optional

import win32con
import win32file
import winioctlcon


class Win32Device:
    """
    Class to read and seek a Windows Raw Device IO object without bother.
    It deals with getting the full size, allowing full access to all sectors,
    and alignment with the disc's sector size.

    Author: PHOENiX <pragma.exe#gmail.com>
    License: Free, enjoy! This should be a thing open() does by default.
    """

    def __init__(self, target):
        # type: (str) -> None
        self.target = target
        self.sector_size = None
        self.disc_size = None
        self.position = 0
        self.handle = self.get_handle()
        self.geometry = self.get_geometry()

    def __enter__(self):
        return self

    def __exit__(self, *_, **__):
        self.dispose()

    def __len__(self) -> int:
        return self.geometry[-2]

    def dispose(self):
        if self.handle != win32file.INVALID_HANDLE_VALUE:
            win32file.CloseHandle(self.handle)

    def get_target(self):
        # type: () -> str
        """Get UNC target name. Can be `E:` or `PhysicalDriveN`."""
        target = self.target
        if not target.startswith("\\\\.\\"):
            target = rf"\\.\{target}"
        return target

    def get_handle(self):
        # type: () -> int
        """Get a direct handle to the raw UNC target, and unlock its IO capabilities."""
        handle = win32file.CreateFile(
            # https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea
            self.get_target(),  # target
            win32con.MAXIMUM_ALLOWED,  # desired access
            win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE,  # share mode, write needed
            None,  # security attributes
            win32con.OPEN_EXISTING,  # creation disposition
            win32con.FILE_ATTRIBUTE_NORMAL,  # flags and attributes
            None  # template file
        )
        if handle == win32file.INVALID_HANDLE_VALUE:
            raise RuntimeError("Failed to obtain device handle...")
        # elevate accessible sectors, without this the last 5 sectors (in my case) will not be readable
        win32file.DeviceIoControl(handle, winioctlcon.FSCTL_ALLOW_EXTENDED_DASD_IO, None, None)
        return handle

    def get_geometry(self):
        # type: () -> tuple[int, ...]
        """
        Retrieves information about the physical disk's geometry.
        https://learn.microsoft.com/en-us/windows/win32/api/winioctl/ns-winioctl-disk_geometry_ex

        Returns a tuple of:
            Cylinders-Lo
            Cylinders-Hi
            Media Type
            Tracks Per Cylinder
            Sectors Per Track
            Bytes Per Sector
            Disk Size
            Extra Data
        """
        return struct.unpack("8L", win32file.DeviceIoControl(
            self.handle,  # handle
            winioctlcon.IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,  # ioctl api
            b"",  # in buffer
            32  # out buffer
        ))

    def tell(self):
        # type: () -> int
        """Get current (spoofed) position."""
        return self.position

    def _tell(self):
        # type: () -> int
        """Get current real position."""
        if not self.handle:
            self.handle = self.get_handle()
        return win32file.SetFilePointer(self.handle, 0, win32file.FILE_CURRENT)

    def seek(self, offset, whence=os.SEEK_SET):
        # type: (int, int) -> int
        """Seek at any point in the stream, in an aligned way."""
        if whence == os.SEEK_CUR:
            whence = self.tell()
        elif whence == os.SEEK_END:
            whence = len(self)
        to = whence + offset
        closest = self.align(to)  # get as close as we can while being aligned
        if not self.handle:
            self.handle = self.get_handle()
        pos = win32file.SetFilePointer(self.handle, closest, win32file.FILE_BEGIN)
        if pos != closest:
            raise IOError("Seek was not precise...")
        self.position = to  # not actually at this location, read will deal with it
        return to

    def read(self, size=-1):
        # type: (int) -> Optional[bytes]
        """Read any amount of bytes in the stream, in an aligned way."""
        if not self.handle:
            self.handle = self.get_handle()
        sector_size = self.geometry[-3]
        offset = abs(self._tell() - self.tell())
        has_data = b''
        while self._tell() < self.tell() + size:
            res, data = win32file.ReadFile(self.handle, sector_size, None)
            if res != 0:
                raise IOError(f"An error occurred: {res} {data}")
            if len(data) < sector_size:
                raise IOError(f"Read {sector_size - len(data)} less bytes than requested...")
            has_data += data
        # seek to the position wanted + size read, which will then be re-aligned
        self.seek(self.tell() + size)
        return has_data[offset:offset + size]

    def align(self, size, to=None):
        # type: (int, Optional[int]) -> int
        """
        Align size to the closest but floor mod `to` value.

        Examples:
            align(513, to=512)
            >>> 512
            align(1023, to=512)
            >>> 512
            align(1026, to=512)
            >>> 1024
            align(12, to=10)
            >>> 10
        """
        if not to:
            to = self.geometry[-3]  # logical bytes per sector value
        return math.floor(size / to) * to
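For reference, a minimal usage sketch of the class (the drive letter and read size are just examples):
with Win32Device("D:") as dev:
    print(len(dev))                # total size in bytes, from the drive geometry
    dev.seek(-10240, os.SEEK_END)  # last 5 sectors of a 2048-byte/sector disc
    tail = dev.read(10240)
    print(len(tail))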
I am streaming data into a class in chunks. For each chunk of data, two different types of np.convolve() are executed on the same ProcessPoolExecutor. The type of convolve that was called is determined by a return variable.
The order of the data must be maintained, so each future has an associated sequence number. The output function enforces that only data from contiguous futures is returned (not shown below). From what I understand, I am properly calling the ProcessPoolExecutor.shutdown() function, but I am still getting an IOError.
The error is:
$ python processpoolerror.py
ran 5000000 samples in 3.70395112038 sec: 1.34990982265 Msps
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
send(obj)
IOError: [Errno 32] Broken pipe
Sorry it's a bit long, but I have pruned this class down as much as possible while keeping the error. On my machine (Ubuntu 16.04.2 with an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz), the pared-down code always gives this error. In the non-pruned version of this code, the Broken pipe occurs 25% of the time.
If you change the if False near the end of cultivate_pool() to True, so it prints during execution, the error is not thrown. If you reduce the amount of data (the din = chunk * 10000 line in testRate()), the error is not thrown. What am I doing wrong here? Thanks.
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import time


def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1


def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp


class ProcessPoolIssues():
    ## Constructor
    # @param chunk_size how many samples to feed in during input() stage
    def __init__(self, header, chunk_size=500, poolsize=5):
        self.chunk_size = chunk_size  ##! How many samples to feed

        # ProcessPool stuff
        self.poolsize = poolsize
        self.pool = ProcessPoolExecutor(poolsize)
        self.futures = []

        # xcr stage stuff
        self.results0 = []
        self.results0.append((0, -1, np.zeros(chunk_size)))

        # power stage stuff
        self.results1 = []
        self.results1.append((1, -1, np.zeros(chunk_size)))

        self.countin = 0
        self.countout = -1

    def shutdown(self):
        self.pool.shutdown(wait=True)

    ## Returns True if all data has been extracted for given inputs
    def all_done(self):
        return self.countin == self.countout + 1

    ## main function
    # @param packet_chunk an array of chunk_size samples to be computed
    def input(self, packet_chunk):
        assert len(packet_chunk) == self.chunk_size
        fut0 = self.pool.submit(_do_xcorr3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut0)
        fut1 = self.pool.submit(_do_power3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut1)
        self.countin += 1

    # loops through thread pool, copying any results from done threads into results0/1 (and then terminating them)
    def cultivate_pool(self):
        todel = []
        for i, f in enumerate(self.futures):
            # print "checking", f
            if f.done():
                a, b, c = f.result()
                if a == 0:
                    self.results0.append((a, b, c))  # results from one type of future
                elif a == 1:
                    self.results1.append((a, b, c))  # results from another type of future
                todel.append(i)
        # now we need to remove items from futures that are done
        # we need do it in reverse order so we remove items from the end first (thereby not affecting indices as we go)
        for i in sorted(todel, reverse=True):
            del self.futures[i]
            if False:  # change this to true and error goes away
                print "deleting future #", i

    # may return None
    def output(self):
        self.cultivate_pool()  # modifies self.results list
        # wait for both results to be done before clearing
        if len(self.results0) and len(self.results1):
            del self.results0[0]
            del self.results1[0]
            self.countout += 1
        return None


def testRate():
    chunk = 500
    # a value of 10000 will throw: IOError: [Errno 32] Broken pipe
    # smaller values like 1000 do not
    din = chunk * 10000
    np.random.seed(666)
    search = np.random.random(233) + np.random.random(233) * 1j
    input = np.random.random(din) + np.random.random(din) * 1j

    pct = ProcessPoolIssues(search, chunk, poolsize=8)

    st = time.time()
    for x in range(0, len(input), chunk):
        slice = input[x:x + chunk]
        if len(slice) != chunk:
            break
        pct.input(slice)
        pct.output()
    while not pct.all_done():
        pct.output()
    ed = time.time()
    dt = ed - st
    print "ran", din, "samples in", dt, "sec:", din / dt / 1E6, "Msps"

    pct.shutdown()


if __name__ == '__main__':
    testRate()
This is probably happening because you're exceeding the buffer size of the pipe when you try sending in larger chunks at once.
def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1

def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp
The values r1 and cp are very large because you are convolving with the square of the chunks.
Hence, when you try to run this with larger chunk sizes, the pipe's buffer can't handle it. Refer to this for a clearer understanding.
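As a rough sense of scale, a back-of-the-envelope sketch using the numbers from testRate() in the question (this is only an estimate of how much result data flows back from the workers):
# np.convolve(a, b, 'full') has len(a) + len(b) - 1 samples
chunk = 500
n_chunks = 10000
conv_len = chunk + chunk - 1              # 999 samples per result
bytes_per_result = conv_len * 16          # complex128 = 16 bytes per sample
total = bytes_per_result * n_chunks * 2   # two futures per input chunk
print("%.0f MB of results" % (total / 1e6))  # roughly 320 MB over the run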
As for the second part of the question,
if False:  # change this to true and error goes away
    print "deleting future #", i
Found this in the py3 docs:
16.2.4.4. Reentrancy
Binary buffered objects (instances of BufferedReader, BufferedWriter, BufferedRandom and BufferedRWPair) are not reentrant. While reentrant calls will not happen in normal situations, they can arise from doing I/O in a signal handler. If a thread tries to re-enter a buffered object which it is already accessing, a RuntimeError is raised. Note this doesn’t prohibit a different thread from entering the buffered object.
The above implicitly extends to text files, since the open() function will wrap a buffered object inside a TextIOWrapper. This includes standard streams and therefore affects the built-in function print() as well.
What I'm trying in Python
I am trying to copy a large file over a TCP connection in python (3.6). I have two functions: send_chunk (sends a little header, then some data) and recv_chunk (parses that header, returns the data). I split the file I'm sending into chunks and put them on the network as fast as it lets me. Until around 4-5 MB, everything works. Then, recv_chunk receives some incorrect data, and everything is toast.
What works in C
The same operation in C (as demonstrated by netcat) has no problem sending a 100MB file with no errors (also much lower CPU usage). I looked in the netcat code, and I just see normal-old select and read/write calls.
Question of the day:
What could be going wrong? Why does it seem so simple in C but it isn't working in python?
code, for reference:
from time import time

def send_chunk(data, sock):
    if type(data) is str:
        data = bytes(data, 'utf8')
    len_str = "{}".format(len(data))
    len_str_size = chr(len(len_str))
    send_data = bytes(len_str_size+len_str, 'ascii')+data
    total_sent = 0
    total_len = len(send_data)
    while total_sent < total_len:
        data_sent = sock.send(send_data[total_sent:])
        print('%f sending %d'%(time(),total_len))
        if data_sent < total_len:
            print('only sent %d'%data_sent,flush=True)
        total_sent += data_sent

def recv_chunk(sock):
    payload_data = b''; size = 0
    len_data = b''; len_size = 0
    # get the length field size
    len_size = ord(sock.recv(1))
    # get the length field
    while len(len_data) < len_size:
        len_data += sock.recv(len_size-len(len_data))
    size = int(len_data)
    # get the data
    while len(payload_data) < size:
        payload_data += sock.recv(min(size-len(payload_data), 2048))
    return payload_data
Your code works for me, but copying your data many times makes this slow.
Simply use sendall:
def send_chunk(data, sock):
    if isinstance(data, str):
        data = bytes(data, 'utf8')
    sock.sendall(len(data).to_bytes(4, 'little'))
    sock.sendall(data)

def recv_chunk(sock):
    size = b""
    while len(size) < 4:
        size += sock.recv(4-len(size))
    bytes_left = int.from_bytes(size, 'little')
    # get the data
    data = []
    while bytes_left:
        d = sock.recv(bytes_left)
        data.append(d)
        bytes_left -= len(d)
    return b''.join(data)
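For what it's worth, a quick self-contained way to exercise these two functions is a local socket pair. This is only a sketch, and the 10 MiB payload size is arbitrary; the sender runs in a thread because sendall blocks once the socket buffer fills.
import os
import socket
import threading

a, b = socket.socketpair()
payload = os.urandom(10 * 1024 * 1024)  # arbitrary 10 MiB test payload

sender = threading.Thread(target=send_chunk, args=(payload, a))
sender.start()
received = recv_chunk(b)
sender.join()

assert received == payload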
I want to log the total bytes downloaded and uploaded by my Python script.
import requests

total_downloaded_bytes = 0

def bandwidth_hook(r, *args, **kwargs):
    global total_downloaded_bytes
    total_downloaded_bytes += len(r.content)

req = requests.session()
req.hooks = {'response': bandwidth_hook}
The above code doesn't take into account HTTP compression (if I'm right) and the size of headers.
Is there a way to count total uploaded and downloaded bytes from a requests.session? If not, what about a script-wide count?
You can access the r.request object to calculate the outgoing bytes, and you can determine the incoming bytes (compressed or not) by looking at the content-length header of the response. This should suffice for 99% of all requests you normally would make.
Calculating the byte size of headers is easy enough; just add up the key and value lengths, add 4 bytes per header for the colon, space and CRLF, plus 2 more for the final blank line:
def header_size(headers):
    return sum(len(key) + len(value) + 4 for key, value in headers.items()) + 2
There is also the initial line; that's {method} {path_url} HTTP/1.1{CRLF} for requests, and HTTP/1.x {status_code} {reason}{CRLF} for the response. Those lengths are all also available to you.
Total size then is:
request_line_size = len(r.request.method) + len(r.request.path_url) + 12
request_size = request_line_size + header_size(r.request.headers) + int(r.request.headers.get('content-length', 0))
response_line_size = len(r.reason) + 15
response_size = response_line_size + header_size(r.headers) + int(r.headers.get('content-length', 0))
total_size = request_size + response_size
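Putting the pieces together, here is a sketch of a response hook (in the spirit of the question's bandwidth_hook) that accumulates these totals per session. The hook name, the global counter and the httpbin URL are only illustrative, and responses without a content-length header (e.g. chunked ones) are simply counted as 0 body bytes.
import requests

total_bytes = 0

def header_size(headers):
    # same helper as above, repeated here so the sketch is self-contained
    return sum(len(key) + len(value) + 4 for key, value in headers.items()) + 2

def count_bytes(r, *args, **kwargs):
    global total_bytes
    request_line = len(r.request.method) + len(r.request.path_url) + 12
    request_size = request_line + header_size(r.request.headers) + int(r.request.headers.get('content-length', 0))
    response_line = len(r.reason) + 15
    response_size = response_line + header_size(r.headers) + int(r.headers.get('content-length', 0))
    total_bytes += request_size + response_size

session = requests.Session()
session.hooks['response'].append(count_bytes)

session.get('https://httpbin.org/get')
print(total_bytes)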