Python Requests/urllib — monitoring bandwidth usage

I want to log the total bytes downloaded and uploaded by my Python script.
import requests

total_downloaded_bytes = 0

def bandwidth_hook(r, *args, **kwargs):
    global total_downloaded_bytes
    total_downloaded_bytes += len(r.content)

req = requests.session()
req.hooks = {'response': bandwidth_hook}
The above code doesn't take into account HTTP compression (if I'm right) and the size of headers.
Is there a way to count total uploaded and downloaded bytes from a requests.session? If not, what about a script-wide count?

You can access the r.request object to calculate outgoing bytes, and you can determine incoming bytes (compressed or not) by looking at the content-length header of the incoming response. This should suffice for 99% of all requests you normally would make.
Calculating the byte size of headers is easy enough; just add up the key and value lengths, add 4 bytes per header for the ': ' separator and the trailing CRLF, plus 2 more for the blank line that ends the headers:
def header_size(headers):
    return sum(len(key) + len(value) + 4 for key, value in headers.items()) + 2
There is also the initial line; that's {method} {path_url} HTTP/1.1{CRLF} for requests, and HTTP/1.x {status_code} {reason}{CRLF} for the response. Those lengths are all also available to you.
Total size then is:
request_line_size = len(r.request.method) + len(r.request.path_url) + 12
request_size = request_line_size + header_size(r.request.headers) + int(r.request.headers.get('content-length', 0))
response_line_size = len(r.reason) + 15
response_size = response_line_size + header_size(r.headers) + int(r.headers.get('content-length', 0))
total_size = request_size + response_size
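Putting these pieces together, a minimal sketch of a session-wide counter (the hook and counter names are mine; it reuses the header_size() helper and the line-size math above, and responses sent with chunked transfer encoding have no content-length header, so their bodies count as 0 here):
import requests

def header_size(headers):
    return sum(len(key) + len(value) + 4 for key, value in headers.items()) + 2

total_bytes_sent = 0
total_bytes_received = 0

def bandwidth_hook(r, *args, **kwargs):
    global total_bytes_sent, total_bytes_received
    # outgoing: request line + headers + body (content-length is absent for bodiless requests)
    request_line_size = len(r.request.method) + len(r.request.path_url) + 12
    total_bytes_sent += (request_line_size + header_size(r.request.headers)
                         + int(r.request.headers.get('content-length', 0)))
    # incoming: status line + headers + body as reported by content-length
    response_line_size = len(r.reason) + 15
    total_bytes_received += (response_line_size + header_size(r.headers)
                             + int(r.headers.get('content-length', 0)))
    return r

session = requests.Session()
session.hooks = {'response': bandwidth_hook}
session.get('https://httpbin.org/get')  # example request
print(total_bytes_sent, total_bytes_received)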

Related

Error status after sending 32768 bytes from serial port in Python

I am doing UART communication using the serial (pyserial) library. I can send and receive data, but when the number of bytes sent reaches 32768 or more, I get the error:
ValueError: byte must be in range(0,256)
function definition:
def Write_to_serial_port(value):
    ser.write([value])
function usage:
#...
for i in data_buf[0:total_len]:  # data_buf is a list
    Write_to_serial_port(i)
#...
The error occurs once the number of bytes sent reaches 32768.
An alternative serial port write function I tested:
import struct

def Write_to_serial_port(value):
    data = struct.pack('>B', value)
    ser.write(data)
Again, the same error occurs when the number of bytes sent reaches 32768.
I also tried periodically flushing the input and output buffers, but it didn't help.
Any ideas on the solution?
EDIT1:
The purpose of the program is to send the bytes of a binary file. In each cycle I send 128 bytes (from the binary file) plus 13 bytes of metadata (CRC, file size, etc.) over the serial port. data_buf is 255 bytes, but I am only using 141 bytes of it.
function usage(Extended):
# ...
# Some definitions and assignments
# ...
while(bytes_remaining):
    if(bytes_remaining >= 128):
        len_to_read = 128
    else:
        len_to_read = bytes_remaining
    for x in range(len_to_read):
        file_read_value = bin_file.read(1)
        data_buf[9+x] = int(file_read_value[0])
    data_buf[0] = mem_write_cmd_total_len-1
    data_buf[1] = write_cmd
    data_buf[2] = word_to_byte(base_mem_address,1,1)
    data_buf[3] = word_to_byte(base_mem_address,2,1)
    data_buf[4] = word_to_byte(base_mem_address,3,1)
    data_buf[5] = word_to_byte(base_mem_address,4,1)
    data_buf[6] = gl_bin_file_sector_needed
    data_buf[7] = len_to_read
    data_buf[8] = send_count
    send_count = send_count + 1
    crc32 = get_crc(data_buf,mem_write_cmd_total_len - 4)
    data_buf[9 +len_to_read] = word_to_byte(crc32,1,1)
    data_buf[10+len_to_read] = word_to_byte(crc32,2,1)
    data_buf[11+len_to_read] = word_to_byte(crc32,3,1)
    data_buf[12+len_to_read] = word_to_byte(crc32,4,1)
    for i in data_buf[0:mem_write_cmd_total_len]:
        Write_to_serial_port(i)
    # ...
EDIT2: I also tried splitting the 40 KB binary file into 128-byte chunk files and sending those, but I got the same error on the 256th file. I guess 256*128 = 32768 can't be a coincidence.
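That arithmetic points at the sequence counter rather than the serial port itself: data_buf[8] = send_count goes up by one per 128-byte chunk, so on the 256th chunk its value is 256, which no longer fits in a single byte (pyserial converts the list passed to write() into a bytearray, which enforces the 0-255 range). A minimal sketch of the failure and one possible fix; the & 0xFF mask is my suggestion and assumes the receiver tolerates a wrapping sequence number, and the variable names follow the question's code:
import struct

send_count = 256                   # counter value on the 256th chunk (256 * 128 = 32768 bytes sent)

# bytearray([send_count])          # ValueError: byte must be in range(0, 256) -- what ser.write([value]) runs into
# struct.pack('>B', send_count)    # struct.error: the value does not fit in an unsigned byte

# One possible fix: keep only the low 8 bits of the rolling sequence counter
data_buf = [0] * 255
data_buf[8] = send_count & 0xFF    # 256 -> 0, 257 -> 1, ...
print(data_buf[8])                 # prints 0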

Multipart upload to Amazon Glacier: Content-Range incompatible with Content-Length

I'm trying to upload a file around 1 GB in size to Amazon Glacier. Somewhat arbitrarily, I've decided to break it into 32 MB parts and upload them serially.
import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600  # in bytes
size_mb = size / (2**20)  # Convert to megabytes for readability
local_file = 'filename'

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                            archiveDescription=local_file,
                                            partSize=str(2**25))  # 32 mb in bytes
parts = math.floor(size_mb / 32)

with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is bigger than the ones that come before.
        lower = (p * (2**25))
        upper = (((p + 1) * (2**25)) - 1) if (p + 1 < parts) else (size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=multi_up['uploadId'],
                                               range='bytes {}-{}/*'.format(lower, upper),
                                               body=upload)
    checksum = calculate_tree_hash(upload)
    complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                                   checksum=checksum,
                                                   uploadId=multi_up['uploadId'],
                                                   vaultName=vault_name)
This generates an error about the first byte range.
---------------------------------------------------------------------------
InvalidParameterValueException Traceback (most recent call last)
<ipython-input-2-9dd3ac986601> in <module>()
93 uploadId=multi_up['uploadId'],
94 range='bytes {}-{}/*'.format(lower, upper),
---> 95 body=upload)
96 upload_info.append(up_part)
97 checksum = calculate_tree_hash(upload)
~/anaconda/lib/python3.5/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
251 "%s() only accepts keyword arguments." % py_operation_name)
252 # The "self" in this scope is referring to the BaseClient.
--> 253 return self._make_api_call(operation_name, kwargs)
254
255 _api_call.__name__ = str(py_operation_name)
~/anaconda/lib/python3.5/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
555 error_code = parsed_response.get("Error", {}).get("Code")
556 error_class = self.exceptions.from_code(error_code)
--> 557 raise error_class(parsed_response, operation_name)
558 else:
559 return parsed_response
InvalidParameterValueException: An error occurred (InvalidParameterValueException) when calling the UploadMultipartPart operation:
Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600
Can anyone see what I'm doing wrong?
@Michael-sqlbot is quite right: the issue with the Content-Range was that I was passing the whole file instead of a part. I fixed this by using the read() method, but then I discovered a separate issue, which is that (per the docs) the final part has to be the same size as or smaller than the preceding parts. This means using math.ceil() instead of math.floor() to define the number of parts.
The working code is:
import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600  # in bytes
size_mb = size / (2**20)  # Convert to megabytes for readability
local_file = 'filename'
partSize = (2**25)

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                            archiveDescription=local_file,
                                            partSize=str(partSize))  # 32 mb in bytes
parts = math.ceil(size_mb / 32)  # The number of <=32mb parts we need

with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is now smaller than the ones that come before.
        lower = (p * (partSize))
        upper = (((p + 1) * (partSize)) - 1) if (p + 1 < parts) else (size-1)
        read_size = upper-lower+1
        file_part = upload.read(read_size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=multi_up['uploadId'],
                                               range='bytes {}-{}/*'.format(lower, upper),
                                               body=file_part)
    checksum = calculate_tree_hash(upload)
    complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                                   checksum=checksum,
                                                   uploadId=multi_up['uploadId'],
                                                   vaultName=vault_name)
Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600
You're telling the API that you're sending the first 32 MiB, but you're actually sending (proposing to send) the entire file, since body=upload and upload isn't just the first part, it's the entire file. The Content-Length refers to the size of this part upload, which should be 33554432 (32 MiB).
The docs are admittedly ambiguous...
body (bytes or seekable file-like object) -- The data to upload.
...but the "data to upload" seems to refer to the data for only this part, in spite of the word "seekable."
Since the follow-up answer from Alex claims it "works", I'm posting another version that worked for me under Python 3.5 and Ubuntu 16.04. I also added some environment variables from our production end-to-end solution.
The original post gave me an error, so I tweaked it and did some cleanup. Hope this helps someone needing this Glacier functionality. Using a shell script with awscli commands was not as clean.
import math
import boto3
import os
from botocore.utils import calculate_tree_hash

vault_name = os.getenv('GLACIER_VAULT_NAME')
file_name = os.getenv('GLACIER_UPLOAD_FILE')

if vault_name is None:
    print('GLACIER_VAULT_NAME environment variable is required. Exiting.')
    exit(1)

if file_name is None:
    print('GLACIER_UPLOAD_FILE environment variable is required. Exiting.')
    exit(2)

chunk_size = 2 ** 25
client = boto3.client('glacier')
client.create_vault(vaultName=vault_name)

upload_obj = client.initiate_multipart_upload(vaultName=vault_name,
                                              archiveDescription=file_name,
                                              partSize=str(chunk_size))
file_size = os.path.getsize(file_name)
parts = math.ceil(file_size / chunk_size)

with open(file_name, 'rb') as upload:
    for p in range(parts):
        lower = p * chunk_size
        upper = lower + chunk_size - 1
        if upper > file_size:
            upper = (file_size - lower) + lower - 1
        file_part = upload.read(chunk_size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               range='bytes {}-{}/{}'.format(lower,
                                                                             upper,
                                                                             file_size),
                                               body=file_part)

# this needs a new file handler because calculate_tree_hash() processes
# the handler in a similar way to the loop above
checksum = calculate_tree_hash(open(file_name, 'rb'))

complete_up = client.complete_multipart_upload(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               archiveSize=str(file_size),
                                               checksum=checksum)
print(complete_up)

C more reliable for networking than Python?

What I'm trying in Python
I am trying to copy a large file over a TCP connection in python (3.6). I have two functions: send_chunk (sends a little header, then some data) and recv_chunk (parses that header, returns the data). I split the file I'm sending into chunks and put them on the network as fast as it lets me. Until around 4-5 MB, everything works. Then, recv_chunk receives some incorrect data, and everything is toast.
What works in C
The same operation in C (as demonstrated by netcat) has no problem sending a 100 MB file with no errors (and much lower CPU usage). I looked at the netcat code, and I just see plain old select and read/write calls.
Question of the day:
What could be going wrong? Why does it seem so simple in C but it isn't working in python?
code, for reference:
from time import time

def send_chunk(data, sock):
    if type(data) is str:
        data = bytes(data, 'utf8')
    len_str = "{}".format(len(data))
    len_str_size = chr(len(len_str))
    send_data = bytes(len_str_size+len_str, 'ascii')+data
    total_sent = 0
    total_len = len(send_data)
    while total_sent < total_len:
        data_sent = sock.send(send_data[total_sent:])
        print('%f sending %d' % (time(), total_len))
        if data_sent < total_len:
            print('only sent %d' % data_sent, flush=True)
        total_sent += data_sent

def recv_chunk(sock):
    payload_data = b''; size = 0
    len_data = b''; len_size = 0
    # get the length field size
    len_size = ord(sock.recv(1))
    # get the length field
    while len(len_data) < len_size:
        len_data += sock.recv(len_size-len(len_data))
    size = int(len_data)
    # get the data
    while len(payload_data) < size:
        payload_data += sock.recv(min(size-len(payload_data), 2048))
    return payload_data
Your code works for me, but copying your data many times makes this slow.
Simply use sendall:
def send_chunk(data, sock):
    if isinstance(data, str):
        data = bytes(data, 'utf8')
    sock.sendall(len(data).to_bytes(4, 'little'))
    sock.sendall(data)

def recv_chunk(sock):
    size = b""
    while len(size) < 4:
        size += sock.recv(4-len(size))
    bytes_left = int.from_bytes(size, 'little')
    # get the data
    data = []
    while bytes_left:
        d = sock.recv(bytes_left)
        data.append(d)
        bytes_left -= len(d)
    return b''.join(data)
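To sanity-check the two functions above without a real network, something like the following should work; the socketpair/threading harness is mine, not part of the answer:
import socket
import threading

# A connected pair of sockets lets send_chunk/recv_chunk talk to each other in one process.
a, b = socket.socketpair()

payload = b'x' * (5 * 1024 * 1024)  # 5 MB, past the point where the original code broke
result = []

# Receive in a background thread so sendall() never blocks on a full socket buffer.
receiver = threading.Thread(target=lambda: result.append(recv_chunk(b)))
receiver.start()
send_chunk(payload, a)
receiver.join()

assert result[0] == payload
print('round-tripped', len(result[0]), 'bytes')
a.close()
b.close()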

Not receiving any data back from bittorrent peer handshake

I'm having some trouble with the BitTorrent protocol. I'm at the point of sending a handshake message to some peers. I have my client connect to every peer in the list and then send the handshake. Code is below:
peer_id = 'autobahn012345678bit'
peer_id = peer_id.encode('utf-8')
pstr = 'BitTorrent protocol'
pstr = pstr.encode('utf-8')
pstrlen = chr(19)
pstrlen = pstrlen.encode('utf-8')
reserved = chr(0) * 8
reserved = reserved.encode('utf-8')
Those are the variables I'm sending. My msg is:
msg = (pstrlen + pstr + reserved + new.torrent_hash() + peer_id)
Based on the BitTorrent specification, my message is the appropriate length of 49 + len(pstr):
lenmsg = (pstrlen + reserved + new.torrent_hash() + peer_id)
print(lenmsg)
print(len(lenmsg))
Its output is:
b'\x13\x00\x00\x00\x00\x00\x00\x00\x00\x94z\xb0\x12\xbd\x1b\xf1\x1fO\x1d)\xf8\xfa\x1e\xabs\xa8_\xe7\x93autobahn012345678bit'
49
The entire message looks like this:
b'\x13\x00\x00\x00\x00\x00\x00\x00\x00\x94z\xb0\x12\xbd\x1b\xf1\x1fO\x1d)\xf8\xfa\x1e\xabs\xa8_\xe7\x93autobahn012345678bit'
My main problem is that I don't receive any data back. I have socket.settimeout(4) set and it just times out.
The output is incorrect; it is missing 'BitTorrent protocol'.
A proper handshake string is 68 bytes long.
It should be:
\x13BitTorrent protocol\x00\x00\x00\x00\x00\x00\x00\x00\x94z\xb0\x12\xbd\x1b\xf1\x1fO\x1d)\xf8\xfa\x1e\xabs\xa8_\xe7\x93autobahn012345678bit
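For reference, a minimal sketch of building the full 68-byte handshake; the helper name is mine, and info_hash and peer_id are assumed to be the 20-byte values the question already has (new.torrent_hash() and peer_id):
def build_handshake(info_hash, peer_id):
    # <pstrlen><pstr><reserved><info_hash><peer_id> = 49 + len(pstr) = 68 bytes
    pstr = b'BitTorrent protocol'
    reserved = b'\x00' * 8
    handshake = bytes([len(pstr)]) + pstr + reserved + info_hash + peer_id
    assert len(handshake) == 49 + len(pstr)
    return handshake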

Can scraping be applied to this page which is actively recalculating?

I would like to grab satellite positions from the page(s) below, but I'm not sure if scraping is appropriate because the page appears to be updating itself every second using some internal code (it keeps updating after I disconnect from the internet). Background information can be found in my question at Space Stackexchange: A nicer way to download the positions of the Orbcomm-2 satellites.
I need a "snapshot" of four items simultaneously:
UTC time
latitude
longitude
altitude
Right now I use screen shots and manual typing. Since these values are being updated by the page - is conventional web-scraping going to work here? I found a "screen-scraping" tag, should I try to learn about that instead?
I'm looking for the simplest solution to get those four values, I wonder if I can just use urllib or urllib2 and avoid installing something new?
example page: http://www.satview.org/?sat_id=41186U I need to do 41179U through 41189U (the eleven Orbcomm-2 satellites that SpaceX just put in orbit)
One option would be to fire up a real browser and continuously poll the position in an endless loop:
import time
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.satview.org/?sat_id=41186U")

while True:
    location = driver.find_element_by_css_selector("#sat_latlon .texto_track2").text
    latitude, longitude = location.split("\n")[:2]
    print(latitude, longitude)
    time.sleep(1)
Sample output:
(u'-16.57', u'66.63')
(u'-16.61', u'66.67')
...
Here we are using selenium with Firefox; there are drivers for other browsers, including headless ones like PhantomJS.
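If you need all eleven satellites (41179U through 41189U), the same approach can cycle through the sat_id values. A sketch, assuming every page uses the same #sat_latlon .texto_track2 markup; the pages are loaded one after another, so the readings are close together but not truly simultaneous, and the UTC time and altitude would need their own selectors, which I have not looked up:
import time
from selenium import webdriver

sat_ids = ["411{:02d}U".format(n) for n in range(79, 90)]  # 41179U .. 41189U
driver = webdriver.Firefox()

while True:
    for sat_id in sat_ids:
        driver.get("http://www.satview.org/?sat_id={}".format(sat_id))
        location = driver.find_element_by_css_selector("#sat_latlon .texto_track2").text
        latitude, longitude = location.split("\n")[:2]
        print(sat_id, latitude, longitude)
    time.sleep(1)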
No need to scrape. Just look at the source HTML of that page and copy/paste the JavaScript code. None of the positions are fetched remotely; they're all calculated on the fly in the page. So just grab the code and run it yourself!
Space-Track.org's REST API seems built to handle this type of request. Once you have an account there, you can even download a sample script (updated here) to download TLEs:
# STTest.py
#
# Simple Python app to extract resident space object history data from www.space-track.org into a spreadsheet
# (prior to executing, register for a free personal account at https://www.space-track.org/auth/createAccount)
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# For full licencing terms, please refer to the GNU General Public License (gpl-3_0.txt) distributed with this release,
# or see http://www.gnu.org/licenses/gpl-3.0.html.

import requests
import json
import xlsxwriter
import time
import datetime
import getpass
import sys

class MyError(Exception):
    def __init__(self, args):
        Exception.__init__(self, "my exception was raised with arguments {0}".format(args))
        self.args = args

# See https://www.space-track.org/documentation for details on REST queries
# "Find Starlinks" query finds all satellites w/ NORAD_CAT_ID > 40000 & OBJECT_NAME matching STARLINK*, 1 line per sat;
# the "OMM Starlink" query gets all Orbital Mean-Elements Messages (OMM) for a specific NORAD_CAT_ID in JSON format.
uriBase = "https://www.space-track.org"
requestLogin = "/ajaxauth/login"
requestCmdAction = "/basicspacedata/query"
requestFindStarlinks = "/class/tle_latest/NORAD_CAT_ID/>40000/ORDINAL/1/OBJECT_NAME/STARLINK~~/format/json/orderby/NORAD_CAT_ID%20asc"
requestOMMStarlink1 = "/class/omm/NORAD_CAT_ID/"
requestOMMStarlink2 = "/orderby/EPOCH%20asc/format/json"

# Parameters to derive apoapsis and periapsis from mean motion (see https://en.wikipedia.org/wiki/Mean_motion)
GM = 398600441800000.0
GM13 = GM ** (1.0 / 3.0)
MRAD = 6378.137
PI = 3.14159265358979
TPI86 = 2.0 * PI / 86400.0

# Log in to personal account obtained by registering for free at https://www.space-track.org/auth/createAccount
print('\nEnter your personal Space-Track.org username (usually your email address for registration): ')
configUsr = input()
print('Username capture complete.\n')
configPwd = getpass.getpass(prompt='Securely enter your Space-Track.org password (minimum of 15 characters): ')

# Excel Output file name - e.g. starlink-track.xlsx (note: make it an .xlsx file)
configOut = 'STText.xlsx'
siteCred = {'identity': configUsr, 'password': configPwd}

# Use xlsxwriter package to write the .xlsx file
print('Creating Microsoft Excel (.xlsx) file to contain outputs...')
workbook = xlsxwriter.Workbook(configOut)
worksheet = workbook.add_worksheet()
z0_format = workbook.add_format({'num_format': '#,##0'})
z1_format = workbook.add_format({'num_format': '#,##0.0'})
z2_format = workbook.add_format({'num_format': '#,##0.00'})
z3_format = workbook.add_format({'num_format': '#,##0.000'})

# write the headers on the spreadsheet
print('Starting to write outputs to Excel file create...')
now = datetime.datetime.now()
nowStr = now.strftime("%m/%d/%Y %H:%M:%S")
worksheet.write('A1', 'Starlink data from ' + uriBase + " on " + nowStr)
worksheet.write('A3', 'NORAD_CAT_ID')
worksheet.write('B3', 'SATNAME')
worksheet.write('C3', 'EPOCH')
worksheet.write('D3', 'Orb')
worksheet.write('E3', 'Inc')
worksheet.write('F3', 'Ecc')
worksheet.write('G3', 'MnM')
worksheet.write('H3', 'ApA')
worksheet.write('I3', 'PeA')
worksheet.write('J3', 'AvA')
worksheet.write('K3', 'LAN')
worksheet.write('L3', 'AgP')
worksheet.write('M3', 'MnA')
worksheet.write('N3', 'SMa')
worksheet.write('O3', 'T')
worksheet.write('P3', 'Vel')
wsline = 3

def countdown(t, step=1, msg='Sleeping...'):  # in seconds
    pad_str = ' ' * len('%d' % step)
    for i in range(t, 0, -step):
        sys.stdout.write('{} for the next {} seconds {}\r'.format(msg, i, pad_str))
        sys.stdout.flush()
        time.sleep(step)
    print('Done {} for {} seconds! {}'.format(msg, t, pad_str))

# use requests package to drive the RESTful session with space-track.org
print('Interfacing with SpaceTrack.org to obtain data...')
with requests.Session() as session:
    # Need to log in first. NOTE: we get a 200 to say the web site got the data, not that we are logged in.
    resp = session.post(uriBase + requestLogin, data=siteCred)
    if resp.status_code != 200:
        raise MyError(resp, "POST fail on login.")
    # This query picks up all Starlink satellites from the catalog. NOTE: a 401 failure shows you have bad credentials.
    resp = session.get(uriBase + requestCmdAction + requestFindStarlinks)
    if resp.status_code != 200:
        print(resp)
        raise MyError(resp, "GET fail on request for resident space objects.")
    # Use json package to break json-formatted response into a Python structure (a list of dictionaries)
    retData = json.loads(resp.text)
    satCount = len(retData)
    satIds = []
    for e in retData:
        # each e describes the latest elements for one resident space object. We just need the NORAD_CAT_ID...
        catId = e['NORAD_CAT_ID']
        satIds.append(catId)
    # Using our new list of resident space object NORAD_CAT_IDs, we can now get the OMM message
    maxs = 1  # counter for number of sessions we have established without a pause in querying space-track.org
    for s in satIds:
        resp = session.get(uriBase + requestCmdAction + requestOMMStarlink1 + s + requestOMMStarlink2)
        if resp.status_code != 200:
            # If you are getting error 500's here, its probably the rate throttle on the site (20/min and 200/hr)
            # wait a while and retry
            print(resp)
            raise MyError(resp, "GET fail on request for resident space object number " + s + '.')
        # the data here can be quite large, as it's all the elements for every entry for one resident space object
        retData = json.loads(resp.text)
        for e in retData:
            # each element is one reading of the orbital elements for one resident space object
            print("Scanning satellite " + e['OBJECT_NAME'] + " at epoch " + e['EPOCH'] + '...')
            mmoti = float(e['MEAN_MOTION'])
            ecc = float(e['ECCENTRICITY'])
            worksheet.write(wsline, 0, int(e['NORAD_CAT_ID']))
            worksheet.write(wsline, 1, e['OBJECT_NAME'])
            worksheet.write(wsline, 2, e['EPOCH'])
            worksheet.write(wsline, 3, float(e['REV_AT_EPOCH']))
            worksheet.write(wsline, 4, float(e['INCLINATION']), z1_format)
            worksheet.write(wsline, 5, ecc, z3_format)
            worksheet.write(wsline, 6, mmoti, z1_format)
            # do some ninja-fu to flip Mean Motion into Apoapsis and Periapsis, and to get orbital period and velocity
            sma = GM13 / ((TPI86 * mmoti) ** (2.0 / 3.0)) / 1000.0
            apo = sma * (1.0 + ecc) - MRAD
            per = sma * (1.0 - ecc) - MRAD
            smak = sma * 1000.0
            orbT = 2.0 * PI * ((smak ** 3.0) / GM) ** (0.5)
            orbV = (GM / smak) ** (0.5)
            worksheet.write(wsline, 7, apo, z1_format)
            worksheet.write(wsline, 8, per, z1_format)
            worksheet.write(wsline, 9, (apo + per) / 2.0, z1_format)
            worksheet.write(wsline, 10, float(e['RA_OF_ASC_NODE']), z1_format)
            worksheet.write(wsline, 11, float(e['ARG_OF_PERICENTER']), z1_format)
            worksheet.write(wsline, 12, float(e['MEAN_ANOMALY']), z1_format)
            worksheet.write(wsline, 13, sma, z1_format)
            worksheet.write(wsline, 14, orbT, z0_format)
            worksheet.write(wsline, 15, orbV, z0_format)
            wsline = wsline + 1
        maxs = maxs + 1
        print(str(maxs))
        if maxs > 18:
            print('\nSnoozing for 60 secs for rate limit reasons (max 20/min and 200/hr).')
            countdown(60)
            maxs = 1
    session.close()
workbook.close()
print('\nCompleted session.')
