Stream a large file from URL straight into a gzip file - python

I want to stream a large file into a gzip file directly, instead of downloading it all into memory and then compressing. This is how far I have gotten (it does not work). I know how to download a file in Python and save it, and I know how to compress one; it is the streaming part that does not work.
Note: the linked CSV is not large, it is just an example URL.
import requests
import zlib
url = f"http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv"
with requests.get(url, stream=True) as r:
compressor = zlib.compressobj()
with open(save_file_path, 'wb') as f:
f.write(compressor.compress(r.raw))

Alright, I figured it out (note that shutil needs to be imported for the non-gzip branch):
import shutil
with requests.get(url, stream=True, verify=False) as r:
    if save_file_path.endswith('gz'):
        compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        with open(save_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(compressor.compress(chunk))
            f.write(compressor.flush())
    else:
        with open(save_file_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
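For what it's worth, the same streaming compression can also be done without managing a compressobj by letting gzip.open wrap the output file; this is a minimal sketch, assuming the same url and a save_file_path ending in .gz:
import gzip
import requests

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    # gzip.open compresses on the fly as each chunk is written
    with gzip.open(save_file_path, 'wb') as gz:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            gz.write(chunk)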

Related

Use python request to convert bytes gzip to .wav?

r_api = requests.get(url=urll, headers=hederr, stream=True)
request_data = r_api.content
print(request_data)
I got gzip bytes as the output/response and I want to convert them into a .wav file.
I tried this code, but it just writes the gzip bytes to a file named .wav instead of producing a playable .wav.
with open('myfile_1.wav', mode='bx') as f:
    f.write(request_data)
gzip bytes:
How do I convert these gzip bytes to a .wav file?
b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\xedXwP\x93\xdb\xb6...' (gzip-compressed bytes, truncated)
You'll want to decompress the content, and you can use the gzip module to do this. Use gzip.decompress and write the result into a wav file:
import requests
import gzip
r = requests.get(url=urll, headers=hederr, stream=True)
with open('myfile.wav', 'wb') as fh:
    fh.write(gzip.decompress(r.content))
Edit
Since this is apparently a wav.tar.gz file, you'll want to un-tar the decompressed content:
import requests
import gzip
import tarfile
from io import BytesIO
r = requests.get(url=urll, headers=hederr, stream=True)
with open('myfile.wav.tar', 'wb') as fh:
    fh.write(gzip.decompress(r.content))
with tarfile.open('myfile.wav.tar', 'r') as fh:
    fh.extractall()
Where extractall will extract to the current working directory by default, but you can specify a separate directory if you want via fh.extractall(path='/some/path'). To avoid intermediate files, you could do:
with BytesIO(gzip.decompress(r.content)) as fh:
    tf = tarfile.TarFile(fileobj=fh)
    tf.extractall()
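Note that tarfile can also take care of the gzip layer itself via mode "r:gz", so the separate gzip.decompress call can be dropped entirely (a sketch, assuming the same response r):
import tarfile
from io import BytesIO

# "r:gz" tells tarfile to gunzip while reading, no intermediate file or decompress call
with tarfile.open(fileobj=BytesIO(r.content), mode="r:gz") as tf:
    tf.extractall()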

Python getting error "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 10: invalid start byte" [duplicate]

If I have a URL that, when submitted in a web browser, pops up a dialog box to save a zip file, how would I go about catching and downloading this zip file in Python?
As far as I can tell, the proper way to do this is:
import requests, zipfile, StringIO
r = requests.get(zip_file_url, stream=True)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
z.extractall()
of course you'd want to check that the GET was successful with r.ok.
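For example (a small sketch using the same zip_file_url):
r = requests.get(zip_file_url, stream=True)
if not r.ok:
    # or simply call r.raise_for_status()
    raise RuntimeError("download failed: %s %s" % (r.status_code, r.reason))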
For Python 3+, substitute the io module for the StringIO module and use BytesIO instead of StringIO; the Python 3 release notes mention this change.
import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/path/to/destination_directory")
Most people recommend using requests if it is available, and the requests documentation recommends this for downloading and saving raw data from a url:
import requests
def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
Since the answer asks about downloading and saving the zip file, I haven't gone into details regarding reading the zip file. See one of the many answers below for possibilities.
If for some reason you don't have access to requests, you can use urllib.request instead. It may not be quite as robust as the above.
import urllib.request
def download_url(url, save_path):
    with urllib.request.urlopen(url) as dl_file:
        with open(save_path, 'wb') as out_file:
            out_file.write(dl_file.read())
Finally, if you are still using Python 2, you can use urllib2.urlopen.
import urllib2
from contextlib import closing
def download_url(url, save_path):
    with closing(urllib2.urlopen(url)) as dl_file:
        with open(save_path, 'wb') as out_file:
            out_file.write(dl_file.read())
With the help of this blog post, I've got it working with just requests.
The point of the weird stream thing is so we don't need to call content
on large requests, which would require it to all be processed at once,
clogging the memory. The stream avoids this by iterating through the data
one chunk at a time.
url = 'https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_02_tract_500k.zip'
response = requests.get(url, stream=True)
with open('alaska.zip', 'wb') as f:
    for chunk in response.iter_content(chunk_size=512):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
Here's what I got to work in Python 3:
import zipfile, urllib.request, shutil
url = 'http://www....myzipfile.zip'
file_name = 'myzip.zip'
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
with zipfile.ZipFile(file_name) as zf:
    zf.extractall()
Super lightweight solution to save a .zip file to a location on disk (using Python 3.9):
import requests
url = r'https://linktofile'
output = r'C:\pathtofolder\downloaded_file.zip'
r = requests.get(url)
with open(output, 'wb') as f:
    f.write(r.content)
Either use urllib2.urlopen, or you could try using the excellent Requests module and avoid urllib2 headaches:
import requests
results = requests.get('url')
#pass results.content onto secondary processing...
I came here searching for how to save a bzip2 file. Let me paste the code for others who might come looking for this.
url = "http://api.mywebsite.com"
filename = "swateek.tar.gz"
response = requests.get(url, headers=headers, auth=('myusername', 'mypassword'), timeout=50)
if response.status_code == 200:
    with open(filename, 'wb') as f:
        f.write(response.content)
I just wanted to save the file as is.
Thanks to @yoavram for the above solution.
My URL path linked to a zipped folder, and I encountered a BadZipFile error (file is not a zip file). Strangely, if I retried several times it would retrieve the URL and unzip it all of a sudden, so I amended the solution a little bit, using the is_zipfile method as per here:
r = requests.get(url, stream=True)
check = zipfile.is_zipfile(io.BytesIO(r.content))
while not check:
    r = requests.get(url, stream=True)
    check = zipfile.is_zipfile(io.BytesIO(r.content))
else:
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()
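A bounded variant of the same idea avoids looping forever if the URL never returns a valid zip (a sketch, assuming the same url and the requests, zipfile and io imports):
for _ in range(5):  # give up after a handful of attempts
    r = requests.get(url, stream=True)
    if zipfile.is_zipfile(io.BytesIO(r.content)):
        zipfile.ZipFile(io.BytesIO(r.content)).extractall()
        break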
Use the requests, zipfile and io Python packages.
In particular, BytesIO is used to keep the downloaded zip in memory rather than saving it to disk.
import requests
from zipfile import ZipFile
from io import BytesIO
r = requests.get(zip_file_url)
z = ZipFile(BytesIO(r.content))
file = z.extract(a_file_to_extract, path_to_save)
with open(file) as f:
    print(f.read())

twython upload_media from tempfiles

So I was writing a bot that pulls images from wikipedia (with requests) and posts them to twitter (with twython). I found this, which led me to believe I could do something like
import tempfile
import twython
import requests
...
req = requests.get(img_url, stream=True)
with tempfile.TemporaryFile() as img_file:
    for chunk in req:
        img_file.write(chunk)
    resp = twython_client.upload_media(media=img_file)
    return resp['media_id']
But the upload_media call throws 400s. Something like
...
with open('tmp_img_file', 'wb') as img_file:
    for chunk in req:
        img_file.write(chunk)
with open('tmp_img_file', 'rb') as img_file:
    resp = twython_client.upload_media(media=img_file)
os.remove('tmp_img_file')
return resp['media_id']
does work, but isn't "creating a temporary file that gets deleted immediately after use" the whole point of tempfiles? What am I missing/doing wrong?
Writing advances the file position, so you have to do
with tempfile.TemporaryFile() as f:
    f.write(data_to_write)
    f.seek(0)
    read_data = f.read()
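Applied to the original snippet, that would look roughly like this (a sketch, assuming the same img_url and twython_client from the question):
import tempfile
import requests

req = requests.get(img_url, stream=True)
with tempfile.TemporaryFile() as img_file:
    for chunk in req.iter_content(chunk_size=8192):
        img_file.write(chunk)
    img_file.seek(0)  # rewind so upload_media reads from the start instead of EOF
    resp = twython_client.upload_media(media=img_file)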

Download csv file using python 3

I am new to Python. Here is my environment setup:
I have Anaconda 3 (Python 3). I would like to be able to download a CSV file from this website:
https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD
I would like to use the requests library. I would appreciate any help in figuring out how I can use the requests library to download the CSV file to a local directory on my machine.
It is recommended to download the data as a stream and flush it into the target (or an intermediate) local file.
import requests
def download_file(url, output_file, compressed=True):
    """
    compressed: enable response compression support
    """
    # NOTE the stream=True parameter. It enables more optimized, buffered data loading.
    headers = {}
    if compressed:
        headers["Accept-Encoding"] = "gzip"
    r = requests.get(url, headers=headers, stream=True)
    with open(output_file, 'wb') as f:  # open for block writes
        for chunk in r.iter_content(chunk_size=4096):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
        f.flush()  # after all chunks, force a data flush into the output file (optional)
    return output_file
Considering the original post:
remote_csv = "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
local_output_file = "test.csv"
download_file(remote_csv, local_output_file)
# Check file content, just for test purposes:
print(open(local_output_file).read())
Base code was extracted from this post: https://stackoverflow.com/a/16696317/176765
Here, you can have more detailed information about body stream usage with requests lib:
http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow

Download large file in python with requests

Requests is a really nice library. I'd like to use it for downloading big files (>1GB).
The problem is that it's not possible to keep the whole file in memory; I need to read it in chunks. And this is a problem with the following code:
import requests
def DownloadFile(url):
    local_filename = url.split('/')[-1]
    r = requests.get(url)
    f = open(local_filename, 'wb')
    for chunk in r.iter_content(chunk_size=512 * 1024):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
    f.close()
    return
For some reason it doesn't work this way; it still loads the response into memory before it is saved to a file.
With the following streaming code, the Python memory usage is restricted regardless of the size of the downloaded file:
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have a chunk-encoded response, uncomment the if
                # and set the chunk_size parameter to None.
                # if chunk:
                f.write(chunk)
    return local_filename
Note that the number of bytes returned by iter_content is not always exactly the chunk_size; it can vary, is often larger, and is expected to be different on every iteration.
See body-content-workflow and Response.iter_content for further reference.
It's much easier if you use Response.raw and shutil.copyfileobj():
import requests
import shutil
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
This streams the file to disk without using excessive memory, and the code is simple.
Note: According to the documentation, Response.raw will not decode gzip and deflate transfer-encodings, so you will need to do this manually.
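If you do want the transfer-encoding decoded while streaming, one way is to turn on decode_content before copying (a sketch; decode_content is an attribute of the underlying urllib3 response object, not part of the requests API proper):
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        r.raw.decode_content = True  # let urllib3 un-gzip/inflate as copyfileobj reads
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename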
Not exactly what OP was asking, but... it's ridiculously easy to do that with urllib:
from urllib.request import urlretrieve
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
dst = 'ubuntu-16.04.2-desktop-amd64.iso'
urlretrieve(url, dst)
Or this way, if you want to save it to a temporary file:
from urllib.request import urlopen
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
with urlopen(url) as fsrc, NamedTemporaryFile(delete=False) as fdst:
    copyfileobj(fsrc, fdst)
I watched the process:
watch 'ps -p 18647 -o pid,ppid,pmem,rsz,vsz,comm,args; ls -al *.iso'
And I saw the file growing, but memory usage stayed at 17 MB. Am I missing something?
Your chunk size could be too large, have you tried dropping that - maybe 1024 bytes at a time? (also, you could use with to tidy up the syntax)
def DownloadFile(url):
    local_filename = url.split('/')[-1]
    r = requests.get(url)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return
Incidentally, how are you deducing that the response has been loaded into memory?
It sounds as if Python isn't flushing the data to the file; based on other SO questions, you could try f.flush() and os.fsync() to force the file write and free memory:
with open(local_filename, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
Use the wget module of Python instead. Here is a snippet:
import wget
wget.download(url)
Based on Roman's most upvoted answer above, here is my implementation, including a "download as" and "retries" mechanism:
import os
import time
import logging
import requests
from urllib.parse import urlparse

logger = logging.getLogger(__name__)

def download(url: str, file_path='', attempts=2):
    """Downloads a URL content into a file (with large file support by streaming)

    :param url: URL to download
    :param file_path: Local file name to contain the data downloaded
    :param attempts: Number of attempts
    :return: New file path. Empty string if the download failed
    """
    if not file_path:
        file_path = os.path.realpath(os.path.basename(url))
    logger.info(f'Downloading {url} content to {file_path}')
    url_sections = urlparse(url)
    if not url_sections.scheme:
        logger.debug('The given url is missing a scheme. Adding http scheme')
        url = f'http://{url}'
        logger.debug(f'New url: {url}')
    for attempt in range(1, attempts + 1):
        try:
            if attempt > 1:
                time.sleep(10)  # 10 seconds wait time between downloads
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                with open(file_path, 'wb') as out_file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                        out_file.write(chunk)
                logger.info('Download finished successfully')
                return file_path
        except Exception as ex:
            logger.error(f'Attempt #{attempt} failed with error: {ex}')
    return ''
Here is an additional approach for the use-case of async chunked download, without reading all of the file content into memory.
It means that both the read from the URL and the write to the file are implemented with asyncio libraries (aiohttp to read from the URL and aiofiles to write the file).
The following code should work on Python 3.7 and later.
Just edit the SRC_URL and DEST_FILE variables before copying and pasting.
import aiofiles
import aiohttp
import asyncio
async def async_http_download(src_url, dest_file, chunk_size=65536):
    async with aiofiles.open(dest_file, 'wb') as fd:
        async with aiohttp.ClientSession() as session:
            async with session.get(src_url) as resp:
                async for chunk in resp.content.iter_chunked(chunk_size):
                    await fd.write(chunk)

SRC_URL = "/path/to/url"
DEST_FILE = "/path/to/file/on/local/machine"
asyncio.run(async_http_download(SRC_URL, DEST_FILE))
requests is good, but how about a socket-based solution?
def stream_(host):
    import socket
    import ssl
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        with context.wrap_socket(sock, server_hostname=host) as wrapped_socket:
            wrapped_socket.connect((socket.gethostbyname(host), 443))
            wrapped_socket.send(
                "GET / HTTP/1.1\r\nHost:thiscatdoesnotexist.com\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\r\n\r\n".encode())
            resp = b""
            while resp[-4:-1] != b"\r\n\r":
                resp += wrapped_socket.recv(1)
            else:
                resp = resp.decode()
                content_length = int("".join([tag.split(" ")[1] for tag in resp.split("\r\n") if "content-length" in tag.lower()]))
            image = b""
            while content_length > 0:
                data = wrapped_socket.recv(2048)
                if not data:
                    print("EOF")
                    break
                image += data
                content_length -= len(data)
            with open("image.jpeg", "wb") as file:
                file.write(image)
