If I have a link to a CSV on Yahoo Finance: http://ichart.finance.yahoo.com/table.csv?s=LOW&d=4&e=29&f=2014&g=d&a=8&b=22&c=1981&ignore=.csv
how would I write a web scraper to download multiple files based on a list of symbols: [LOW, SPY, AAPL]?
from StringIO import StringIO
from urllib2 import urlopen

for symbol in symbols:
    f = urlopen('http://www.myurl.com' + symbol + 'therestoftheurl')
    p = f.read()
    d = StringIO(p)
    f.close()
Do I need to write the contents of the URL to a file, or will it download automatically into a directory?
You can use a method like this to download files:
import urllib2

file_name = "myfile.xyz"
u = urllib2.urlopen(url)  # url is the address of the file to download
f = open(file_name, 'wb')
block_sz = 4096
# read the response in fixed-size blocks until it is exhausted
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
    f.write(buffer)
f.close()
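Applied to the question, a sketch that loops over the symbol list might look like this (assuming the Yahoo Finance URL pattern from the question; that service may no longer be live):

import urllib2

symbols = ['LOW', 'SPY', 'AAPL']
for symbol in symbols:
    url = 'http://ichart.finance.yahoo.com/table.csv?s=' + symbol + '&d=4&e=29&f=2014&g=d&a=8&b=22&c=1981&ignore=.csv'
    u = urllib2.urlopen(url)
    # write each symbol's CSV to its own file on disk
    with open(symbol + '.csv', 'wb') as f:
        f.write(u.read())
    u.close()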
import requests

# urll and hederr are defined elsewhere in the asker's code
r_api = requests.get(url=urll, headers=hederr, stream=True)
request_data = r_api.content
print(request_data)
I got gzip bytes as the output/response, and I want to convert them into a .wav file.
I tried the code below, but it writes the gzip-compressed wav, not a plain .wav.
with open('myfile_1.wav', mode='bx') as f:
    f.write(request_data)
Here are the gzip bytes.
How do I convert these gzip bytes to a .wav file?
b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\xedXwP\x93\xdb\xb6...' (long gzip-compressed byte string, truncated here; note the \x1f\x8b gzip magic number at the start)
You'll want to decompress the content, and you can use the gzip module to do this. Use gzip.decompress and write the result to a wav file:
import requests
import gzip

r = requests.get(url=urll, headers=hederr, stream=True)
with open('myfile.wav', 'wb') as fh:
    fh.write(gzip.decompress(r.content))
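If you want to be sure the response really is gzip before decompressing, you can check the two-byte gzip magic number, which is exactly the \x1f\x8b visible at the start of the bytes above (a sketch; urll and hederr are the question's own variables):

import requests
import gzip

r = requests.get(url=urll, headers=hederr, stream=True)
if r.content[:2] == b'\x1f\x8b':  # gzip magic number
    data = gzip.decompress(r.content)
else:
    data = r.content  # already plain bytes
with open('myfile.wav', 'wb') as fh:
    fh.write(data)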
Edit
Since this is apparently a wav.tar.gz file, you'll want to un-tar the decompressed content:
import requests
import gzip
import tarfile
from io import BytesIO

r = requests.get(url=urll, headers=hederr, stream=True)
with open('myfile.wav.tar', 'wb') as fh:
    fh.write(gzip.decompress(r.content))
with tarfile.open('myfile.wav.tar', 'r') as fh:  # tarfile.open takes 'r', not 'rb'
    fh.extractall()
Where extractall will extract to the current working directory by default, but you can specify a separate directory if you want via fh.extractall(path='/some/path'). To avoid intermediate files, you could do:
with BytesIO(gzip.decompress(r.content)) as fh:
    tf = tarfile.TarFile(fileobj=fh)
    tf.extractall()
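If you'd rather inspect the archive before extracting, getnames() lists the members (a sketch under the same assumptions as above):

with BytesIO(gzip.decompress(r.content)) as fh:
    with tarfile.open(fileobj=fh) as tf:
        print(tf.getnames())  # see what's inside before extracting
        tf.extractall()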
import json
import requests

def download_file(url):
    r = requests.get(url)
    filename = url.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(r.content)

api_url = 'https://api.fda.gov/download.json'
r = requests.get(api_url)
files = [file['file'] for file in json.loads(r.text)['results']['drug']['event']['partitions']]
count = 1
for file in files:
    download_file(file)
    print(f"{count}/{len(files)} downloaded!")
    count += 1
This is the other code:
import urllib.request, json

with urllib.request.urlopen("https://api.fda.gov/drug/label.json") as url:
    data = json.loads(url.read().decode())
    print(data)
The first snippet just downloads the files. I'm wondering if there's a way to avoid downloading any of the 1000+ files and just display them, so the code can be used locally. The second one prints the JSON in the terminal.
requests.get() and urllib.request.urlopen() both "download" the full response of the URL they are given.
If you do not want to "save" the file to disk, then remove the code that calls f.write().
More specifically,
import json
import requests

api_url = 'https://api.fda.gov/download.json'
r = requests.get(api_url)
files = [file['file'] for file in r.json()['results']['drug']['event']['partitions']]
total_files = len(files)
count = 0
for file in files:
    print(requests.get(file).content)
    print(f"{count+1}/{total_files} downloaded!")
    count += 1
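If you want the data parsed in memory rather than printed as raw bytes, requests can decode JSON responses directly; whether each partition file is plain JSON or compressed depends on the API, so this sketch checks the Content-Type first:

import requests

for file in files:
    resp = requests.get(file)
    if resp.headers.get('Content-Type', '').startswith('application/json'):
        data = resp.json()    # parsed Python objects, nothing written to disk
    else:
        data = resp.content   # raw bytes, e.g. a zipped partition
    print(type(data))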
If I have a URL that, when submitted in a web browser, pops up a dialog box to save a zip file, how would I go about catching and downloading this zip file in Python?
As far as I can tell, the proper way to do this is:
import requests, zipfile, StringIO

r = requests.get(zip_file_url, stream=True)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
z.extractall()
Of course, you'd want to check that the GET was successful with r.ok.
For Python 3+, substitute the io module for the StringIO module and use BytesIO instead of StringIO; the release notes mention this change.
import requests, zipfile, io

r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/path/to/destination_directory")
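Combining this with the r.ok check mentioned above, and peeking at the archive contents before extracting, gives a sketch like:

import requests, zipfile, io

r = requests.get(zip_file_url)
if r.ok:
    z = zipfile.ZipFile(io.BytesIO(r.content))
    print(z.namelist())  # list the archive members first
    z.extractall("/path/to/destination_directory")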
Most people recommend using requests if it is available, and the requests documentation recommends it for downloading and saving raw data from a URL:
import requests

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
Since the question asks about downloading and saving the zip file, I haven't gone into details about reading it. See one of the many other answers for possibilities.
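Usage is just (with a hypothetical URL):

download_url('https://example.com/archive.zip', 'archive.zip')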
If for some reason you don't have access to requests, you can use urllib.request instead. It may not be quite as robust as the above.
import urllib.request

def download_url(url, save_path):
    with urllib.request.urlopen(url) as dl_file:
        with open(save_path, 'wb') as out_file:
            out_file.write(dl_file.read())
Finally, if you are still using Python 2, you can use urllib2.urlopen:
import urllib2
from contextlib import closing

def download_url(url, save_path):
    with closing(urllib2.urlopen(url)) as dl_file:
        with open(save_path, 'wb') as out_file:
            out_file.write(dl_file.read())
With the help of this blog post, I've got it working with just requests.
The point of the stream parameter is that we don't need to call .content on large requests, which would require the whole response to be processed at once, clogging memory. Streaming avoids this by iterating through the data one chunk at a time.
import requests

url = 'https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_02_tract_500k.zip'
response = requests.get(url, stream=True)
with open('alaska.zip', "wb") as f:
    for chunk in response.iter_content(chunk_size=512):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
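If you want a rough progress report while streaming, the Content-Length header gives the total size when the server sends one (a sketch; servers using chunked transfer may omit it):

import requests

response = requests.get(url, stream=True)
total = int(response.headers.get('Content-Length', 0))  # 0 if the header is absent
done = 0
with open('alaska.zip', 'wb') as f:
    for chunk in response.iter_content(chunk_size=512):
        if chunk:
            f.write(chunk)
            done += len(chunk)
            if total:
                print('%d/%d bytes' % (done, total))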
Here's what I got to work in Python 3:
import zipfile, urllib.request, shutil

url = 'http://www....myzipfile.zip'
file_name = 'myzip.zip'
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

# open the zip only after the download file has been closed and flushed
with zipfile.ZipFile(file_name) as zf:
    zf.extractall()
Super lightweight solution to save a .zip file to a location on disk (using Python 3.9):
import requests
url = r'https://linktofile'
output = r'C:\pathtofolder\downloaded_file.zip'
r = requests.get(url)
with open(output, 'wb') as f:
    f.write(r.content)
Either use urllib2.urlopen, or you could try using the excellent Requests module and avoid urllib2 headaches:
import requests
results = requests.get('url')
#pass results.content onto secondary processing...
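For instance, if the URL points at a zip archive, results.content can feed straight into zipfile without touching disk (a sketch):

import io, zipfile

archive = zipfile.ZipFile(io.BytesIO(results.content))
archive.extractall()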
I came here searching for how to save a .bzip2 file. Let me paste the code for others who might come looking for this.
url = "http://api.mywebsite.com"
filename = "swateek.tar.gz"
response = requests.get(url, headers=headers, auth=('myusername', 'mypassword'), timeout=50)
if response.status_code == 200:
with open(filename, 'wb') as f:
f.write(response.content)
I just wanted to save the file as is.
Thanks to @yoavram for the solution above. My URL pointed to a zipped folder and I encountered a BadZipFile error (file is not a zip file). Strangely, if I retried several times it would suddenly retrieve the URL and unzip it, so I amended the solution a little, using the is_zipfile method as per here:
import io
import requests
import zipfile

r = requests.get(url, stream=True)
check = zipfile.is_zipfile(io.BytesIO(r.content))
# keep re-requesting until the response is a valid zip file
while not check:
    r = requests.get(url, stream=True)
    check = zipfile.is_zipfile(io.BytesIO(r.content))
else:
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()
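Note that the loop above retries forever if the server never returns a valid zip; a variant with a retry limit (a sketch) avoids that:

import io, zipfile, requests

for attempt in range(5):  # give up after 5 tries
    r = requests.get(url, stream=True)
    if zipfile.is_zipfile(io.BytesIO(r.content)):
        zipfile.ZipFile(io.BytesIO(r.content)).extractall()
        break
else:
    print('never received a valid zip file')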
Use the requests, zipfile and io Python packages.
In particular, BytesIO is used to keep the unzipped file in memory rather than saving it to disk.
import requests
from zipfile import ZipFile
from io import BytesIO

r = requests.get(zip_file_url)
z = ZipFile(BytesIO(r.content))
file = z.extract(a_file_to_extract, path_to_save)  # extract returns the path of the extracted file
with open(file) as f:
    print(f.read())
Very new to Python and haven't found a specific answer on SO, so apologies in advance if this appears very naive or is answered elsewhere already.
I am trying to print 'IncorporationDate' JSON data from multiple URLs of a public data set. I have the URLs saved in a CSV file (snippet below). I am only getting as far as printing ALL the JSON data from one URL, and I am uncertain how to run that over all of the URLs in the CSV and write just the IncorporationDate values to a CSV.
Any basic guidance or edits are really welcomed!
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
import json

def get_jsonparsed_data(url):
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

url = ("http://data.companieshouse.gov.uk/doc/company/01046514.json")
print(get_jsonparsed_data(url))
import csv
with open('test.csv') as f:
    lis = [line.split() for line in f]
    for i, x in enumerate(lis):
        print()

import StringIO
s = StringIO.StringIO()
with open('example.csv', 'w') as f:
    for line in s:
        f.write(line)
Snippet of csv:
http://business.data.gov.uk/id/company/01046514.json
http://business.data.gov.uk/id/company/01751318.json
http://business.data.gov.uk/id/company/03164710.json
http://business.data.gov.uk/id/company/04403406.json
http://business.data.gov.uk/id/company/04405987.json
Welcome to the Python world.
For making HTTP requests, we commonly use requests because of its dead-simple API.
The code snippet below does what I believe you want:
It grabs the data from each of the URLs you posted.
It creates a new CSV file with each of the IncorporationDate keys.
```
import csv
import requests

COMPANY_URLS = [
    'http://business.data.gov.uk/id/company/01046514.json',
    'http://business.data.gov.uk/id/company/01751318.json',
    'http://business.data.gov.uk/id/company/03164710.json',
    'http://business.data.gov.uk/id/company/04403406.json',
    'http://business.data.gov.uk/id/company/04405987.json',
]

def get_company_data():
    for url in COMPANY_URLS:
        res = requests.get(url)
        if res.status_code == 200:
            yield res.json()

if __name__ == '__main__':
    for data in get_company_data():
        try:
            incorporation_date = data['primaryTopic']['IncorporationDate']
        except KeyError:
            continue
        else:
            with open('out.csv', 'a') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([incorporation_date])
```
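A minor variant: opening out.csv once, before the loop, avoids reopening the file for every row (a sketch, Python 3):
```
import csv

with open('out.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for data in get_company_data():
        try:
            writer.writerow([data['primaryTopic']['IncorporationDate']])
        except KeyError:
            continue
```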
First step, you have to read all the URLs from your CSV:
import csv

with open('text.csv') as f:
    csvReader = csv.reader(f)  # csv.reader needs a file object, not a filename string
    # next(csvReader)  # uncomment if you have a header row in the .CSV file
    all_urls = [row[0] for row in csvReader if row]  # each row is a list; the URL is the first column
Second step, fetch the data from the URL:
import json
from urllib.request import urlopen

def get_jsonparsed_data(url):
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

url_data = get_jsonparsed_data("give_your_url_here")
Third step:
Go through all URLs that you got from CSV file
Get JSON data
Fetch the field you need, in your case "IncorporationDate"
Write it into an output CSV file; I'm naming it IncorporationDates.csv
Code below:
with open('IncorporationDates.csv', 'w') as abc:  # open once, outside the loop, so earlier rows aren't overwritten
    for each_url in all_urls:
        url_data = get_jsonparsed_data(each_url)
        abc.write(url_data['primaryTopic']['IncorporationDate'] + '\n')
I have a long list of .json files that I need to download to my computer. I want to download them as .json files (so no parsing or anything like that at this point).
I have some code that works for small files, but it is pretty buggy. Also it doesn't handle multiple links well.
Appreciate any advice to fix up this code:
import os

filename = 'test.json'
path = "C:/Users//Master"
fullpath = os.path.join(path, filename)

import urllib2
url = 'https://www.premierlife.com/secure/index.json'
response = urllib2.urlopen(url)
webContent = response.read()

f = open(fullpath, 'w')
f.write(webContent)
f.close
It's creating a blank file because the f.close at the end should be f.close().
I took your code, made it into a little function, and then called it in a loop that goes through a .txt file named "list_of_urls.txt" with one URL per line (you can change the delimiter in the split function if you want to format it differently).
import os
import urllib2

def save_json(url):
    filename = url.replace('/', '').replace(':', '')  # strip / and : so the URL becomes a valid filename
    path = "C:/Users/Master"
    fullpath = os.path.join(path, filename)
    response = urllib2.urlopen(url)
    webContent = response.read()
    f = open(fullpath, 'w')
    f.write(webContent)
    f.close()
And then the loop:
f = open('list_of_urls.txt')
p = f.read()
url_list = p.split('\n')  # '\n' is the line-break delimiter; change it to match your formatting
for url in url_list:
    save_json(url)
f.close()
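On Python 3, where urllib2 is gone, the same idea could be written with requests (a sketch; the path is the one from the question):

import os
import requests

def save_json(url):
    filename = url.replace('/', '').replace(':', '')
    fullpath = os.path.join("C:/Users/Master", filename)
    r = requests.get(url)
    with open(fullpath, 'wb') as f:
        f.write(r.content)

with open('list_of_urls.txt') as f:
    for url in f.read().split('\n'):
        if url:  # skip blank lines
            save_json(url)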