Python urllib2 resume download doesn't work when network reconnects

I'm using urllib2 to make a resuming downloader, roughly based on this method. I can end the program and re-start it, and it resumes downloading where it left off, and the finished file ends up the same size as if it had been downloaded all at once.
However, when I test it by disabling and re-enabling the network connection, it doesn't download correctly. The file ends up larger than it should be, and it doesn't work correctly. Is there something I missed, or could this be a urllib2 bug?
import urllib2

opener = urllib2.build_opener()
self.count = 0 # Counts downloaded size.
self.downloading = True
while (not(self.success) and self.downloading):
    try:
        self.Err = ""
        self._netfile = self.opener.open(self.url)
        self.filesize = float(self._netfile.info()['Content-Length'])
        if (os.path.exists(self.localfile) and os.path.isfile(self.localfile)):
            self.count = os.path.getsize(self.localfile)
        print self.count, "of", self.filesize, "downloaded."
        if self.count >= self.filesize:
            # already downloaded
            self.downloading = False
            self.success = True
            self._netfile.close()
            return
        if (os.path.exists(self.localfile) and os.path.isfile(self.localfile)):
            # File already exists, start where it left off:
            # This seems to corrupt the file sometimes?
            self._netfile.close()
            req = urllib2.Request(self.url)
            print "file downloading at byte: ", self.count
            req.add_header("Range", "bytes=%s-" % (self.count))
            self._netfile = self.opener.open(req)
        if (self.downloading): # Don't do it if cancelled, downloading=false.
            next = self._netfile.read(1024)
            self._outfile = open(self.localfile, "ab") # to append binary
            self._outfile.write(next)
            self.readsize = desc(self.filesize) # get size mb/kb
            self.count += 1024
            while (len(next) > 0 and self.downloading):
                next = self._netfile.read(1024)
                self._outfile.write(next)
                self.count += len(next)
            self.success = True
    except IOError, e:
        print e
        self.Err = ("Download error, retrying in a few seconds: " + str(e))
        try:
            self._netfile.close()
        except Exception:
            pass
        time.sleep(8) # Then repeat

I added self._outfile.close() alongside the self._netfile.close() in the IOError handler, and that seems to have fixed it. I guess the corruption was caused by reopening the file for appending without ever closing it.
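For reference, here is a minimal sketch of what the revised handler looks like, reusing the attribute names from the code above (this is my reading of the fix described, not verified code):

    except IOError, e:
        print e
        self.Err = "Download error, retrying in a few seconds: " + str(e)
        try:
            self._netfile.close()
            self._outfile.close() # also close the output file so the next open(..., "ab") appends to flushed data
        except Exception:
            pass
        time.sleep(8) # Then repeat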

Related

Using .hdf5 files only once they are finished writing

I am trying to use .hdf5 files once they are done writing (in my case, trying to emit them). The problem is that I don't have a way to 1) test whether they are finished writing and 2) then send them. The code I have been trying to work with is as follows:
while True:
    event = self._q.get()
    while True:
        try:
            file = h5py.File(event.src_path, "r")
            file.close()
            self.new_file.emit(event.src_path, os.path.basename(event.src_path))
            break
        except OSError:
            if retry_count < max_retry_count:
                retry_count += 1
                print(f"h5 file <{event.src_path}> is locked, retrying {retry_count}/{max_retry_count}")
                time.sleep(retry_interval_seconds)
            else:
                print(f"h5 file <{event.src_path}> reached max retry count, skipping")
        except Exception as err:
            print(f"Got unexpected Error <{type(err).__name__}> while opening <{event.src_path}>")
            traceback.print_exc()
Obviously this is problematic because of the break. But without the break, the try stays in the loop and emits the same file over and over again. This code tests whether the files are done writing perfectly well, but the ability to send them and then keep taking in new files does not work. Any insight is greatly appreciated.
I solved this with the following code:
while True:
    event = self._q.get()
    max_retry_count = 350 # for test purposes for now, but I want an upper bound on verifying a file is finished.
    retry_interval_seconds = .01 # every hundredth of a second it retries the file to see if it finished writing
    retry_count = 0
    if event.event_type == "created" and event.src_path.lower().endswith(".hdf5"):
        while True:
            try:
                file = h5py.File(event.src_path, "r")
                file.close()
            except OSError:
                if retry_count < max_retry_count:
                    retry_count += 1
                    print(f"h5 file <{event.src_path}> is locked, retrying {retry_count}/{max_retry_count}")
                    time.sleep(retry_interval_seconds)
                else:
                    print(f"h5 file <{event.src_path}> reached max retry count, skipping")
                    break # <--- looks useful here
            except Exception as err:
                print(f"Got unexpected Error <{type(err).__name__}> while opening <{event.src_path}>")
                traceback.print_exc()
            else:
                self.new_file.emit(event.src_path, os.path.basename(event.src_path))
                break
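The key detail is the else clause on the try block: it runs only when no exception was raised, so the emit and the final break happen only once the file has opened cleanly. A tiny standalone illustration of that control flow (the file name here is hypothetical):

def opened_cleanly(path):
    try:
        handle = open(path, "rb")
        handle.close()
    except OSError:
        return False # still locked or not finished writing
    else:
        return True # opened without error, safe to emit

print(opened_cleanly("example.hdf5"))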

Python socket file transfer verified with sha256 not working, but only sometimes?

Client side:
def send_file_to_hashed(data, tcpsock):
    time.sleep(1)
    f = data
    flag = 0
    i = 0
    tcpsock.send(hashlib.sha256(f.read()).hexdigest())
    f.seek(0)
    time.sleep(1)
    l = f.read(BUFFER_SIZE - 64)
    while True:
        while (l):
            tcpsock.send(hashlib.sha256(l).hexdigest() + l)
            time.sleep(1)
            hashok = tcpsock.recv(6)
            if hashok == "HASHOK":
                l = f.read(BUFFER_SIZE - 64)
                flag = 1
            if hashok == "BROKEN":
                flag = 0
        if not l:
            time.sleep(1)
            tcpsock.send("DONE")
            break
    return (tcpsock, flag)

def upload(filename):
    flag = 0
    while(flag == 0):
        with open(os.getcwd() + '\\data\\' + filename + '.csv', 'rU') as UL:
            tuplol = send_file_to_hashed(UL, send_to_sock(filename + ".csv", send_to("upload", TCP_IP, TCP_PORT)))
            (sock, flagn) = tuplol
            flag = flagn
            time.sleep(2)
            sock.close()
Server Side:
elif(message == "upload"):
    message = rec_OK(self.sock)
    fis = os.getcwd() + '/data/' + time.strftime("%H:%M_%d_%m_%Y") + "_" + message
    f = open(fis, 'w')
    latest = open(os.getcwd() + '/data/' + message, 'w')
    time.sleep(1)
    filehash = rec_OK(self.sock)
    print("filehash:" + filehash)
    while True:
        time.sleep(1)
        rawdata = self.sock.recv(BUFFER_SIZE)
        log.write("rawdata :" + rawdata + "\n")
        data = rawdata[64:]
        dhash = rawdata[:64]
        log.write("chash: " + dhash + "\n")
        log.write("shash: " + hashlib.sha256(data).hexdigest() + "\n")
        if dhash == hashlib.sha256(data).hexdigest():
            f.write(data)
            latest.write(data)
            self.sock.send("HASHOK")
            log.write("HASHOK\n")
            print "HASHOK"
        else:
            self.sock.send("HASHNO")
            print "HASHNO"
            log.write("HASHNO\n")
        if rawdata == "DONE":
            f.close()
            f = open(fis, 'r')
            if (hashlib.sha256(f.read()).hexdigest() == filehash):
                print "ULDONE"
                log.write("ULDONE")
                f.close()
                latest.close()
                break
            else:
                self.sock.send("BROKEN")
                print hashlib.sha256(f.read()).hexdigest()
                log.write("BROKEN")
                print filehash
                print "BROKEN UL"
                f.close()
So the data upload is working fine in all the tests I ran from my computer; it even worked fine while uploading data over my mobile connection. Still, sometimes people say it takes a long time and they kill it after a few minutes: the data is there on their computers but not on the server. I don't know what is happening, please help!
First of all: this is unrelated to sha.
Streaming over the network is unpredictable. This line
rawdata = self.sock.recv(BUFFER_SIZE)
doesn't guarantee that you read BUFFER_SIZE bytes. In the worst case you may have read only 1 byte. Therefore your server side is completely broken, because it assumes rawdata contains a whole message. It is even worse: if the client sends a command and a hash quickly, you may get e.g. rawdata == 'DONEa2daf78c44(...)', which is mixed output.
The "hanging" part just follows from that. Trace your code and see what happens when the server receives partial/broken messages (I already did that in my imagination :P).
Streaming over the network is almost never as easy as calling sock.send on one side and sock.recv on the other. You need some buffering/framing protocol. For example, you can implement this simple protocol: always interpret the first two bytes as the size of the incoming message, like this:
client (pseudocode)
# convert len of msg into two-byte array
# I am assuming the max size of msg is 65536
buf = bytearray([len(msg) & 255, len(msg) >> 8])
sock.sendall(buf)
sock.sendall(msg)
server (pseudocode)
size = to_int(sock.recv(1))
size += to_int(sock.recv(1)) << 8
# You need two calls to recv since recv(2) can return 1 byte.
# (well, you can try recv(2) with `if` here to avoid additional
# syscall, not sure if worth it)
buffer = bytearray()
while size > 0:
    tmp = sock.recv(size)
    buffer += tmp
    size -= len(tmp)
Now you have the data properly read into the buffer variable, and you can work with it.
WARNING: the pseudocode for the server is simplified. For example, you need to check for an empty recv() result everywhere (including where size is calculated); that is the case when the client disconnects.
So unfortunately there's a lot of work in front of you: you have to rewrite the whole sending and receiving code.
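To make the framing idea concrete, here is a minimal self-contained Python sketch of the same two-byte length prefix; the helper names send_msg, recv_exact and recv_msg are just illustrative, not part of any library:

import socket
import struct

def send_msg(sock, payload):
    # Prefix each message with its length as a 2-byte little-endian integer.
    sock.sendall(struct.pack('<H', len(payload)) + payload)

def recv_exact(sock, size):
    # Keep calling recv() until exactly `size` bytes have arrived.
    buf = bytearray()
    while len(buf) < size:
        chunk = sock.recv(size - len(buf))
        if not chunk:
            raise IOError("peer closed the connection")
        buf += chunk
    return bytes(buf)

def recv_msg(sock):
    # Read the 2-byte length header, then exactly that many payload bytes.
    (size,) = struct.unpack('<H', recv_exact(sock, 2))
    return recv_exact(sock, size)

Used as send_msg(tcpsock, data) on the client and data = recv_msg(self.sock) on the server, every message arrives whole no matter how the network fragments it.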

Indentation Error Python Not Working

I'm trying to run my code and I get this error:
File "C:/trcrt/trcrt.py", line 42
def checkInternet():
^
IndentationError: unexpected unindent
The code is supposed to check the traceroute to a website... I know, it's not very smart, but it's what I was told to do.
I've checked the code using pep8 and everything seems to be fine...
'''
Developer: Roei Edri
File name: trcrt.py
Date: 24.11.17
Version: 1.1.0
Description: Get an url as an input and prints the traceroute to it.
'''
import sys
import urllib2
i, o, e = sys.stdin, sys.stdout, sys.stderr
from scapy.all import *
from scapy.layers.inet import *
sys.stdin, sys.stdout, sys.stderr = i, o, e
def trcrt(dst):
    """
    Check for the route for the given destination
    :param dst: Final destination, in a form of a website.
    :type dst: str
    """
    try:
        pckt = IP(dst=dst)/ICMP()  # Creates the packet
        ip = [p for p in pckt.dst]  # Gets the ip
        print "Tracerouting for {0} : {1}".format(dst, ip[0])
        for ttl in range(1, 40):
            pckt = IP(ttl=ttl, dst=dst)/ICMP()
            timeBefore = time.time()
            reply = sr1(pckt, verbose=0, timeout=5)
            timeAfter = time.time()
            timeForReply = (timeAfter - timeBefore)*1000
            if reply is not None:
                print "{0} : {1} ; Time for reply: {2}".format(ttl,
                                                               reply.src, timeForReply)
                if reply.type == 0:
                    print "Tracerout Completed"
                    break
            else:
                print "{0} ... Request Time Out".format(ttl)

def checkInternet():
    """
    Checks if there is an internet connection
    :return: True if there is an internet connection
    """
    try:
        urllib2.urlopen('http://45.33.21.159', timeout=1)
        return True
    except urllib2.URLError as IntError:
        return False
Thanks for any help...
Btw pep8 says
"module level import not at top of file"
for lines 12,13
The try block is missing its except clause.
try:
    pckt = IP(dst=dst)/ICMP()  # Creates the packet
    ip = [p for p in pckt.dst]  # Gets the ip
    print "Tracerouting for {0} : {1}".format(dst, ip[0])
    for ttl in range(1, 40):
        pckt = IP(ttl=ttl, dst=dst)/ICMP()
        timeBefore = time.time()
        reply = sr1(pckt, verbose=0, timeout=5)
        timeAfter = time.time()
        timeForReply = (timeAfter - timeBefore)*1000
        if reply is not None:
            print "{0} : {1} ; Time for reply: {2}".format(ttl,
                                                           reply.src, timeForReply)
            if reply.type == 0:
                print "Tracerout Completed"
                break
        else:
            print "{0} ... Request Time Out".format(ttl)
except:  # Here : Add the exception you wish to catch
    pass  # handle this exception appropriately
As a general rule, do not use catch-all except clauses, and do not pass on a caught exception; that makes it fail silently.
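For instance, here is a minimal sketch of a more explicit handler around the sr1() call; the choice of socket.error is only an illustration of the pattern, not necessarily what your code needs to catch:

import socket  # needed for socket.error in this illustration

try:
    reply = sr1(pckt, verbose=0, timeout=5)  # sr1 and pckt come from the question's code
except socket.error as exc:
    # Report the failure instead of swallowing it, then re-raise so it is not lost.
    print "sr1 failed: {0}".format(exc)
    raise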
If this is your full code, there are two things to check:
1) Have you mixed tabs and spaces? Make sure that all tabs are converted to spaces (I recommend 4 spaces per tab) for indentation. A good IDE will do this for you.
2) The try: in trcrt(dst) does not have a matching except block.
PEP8 will, by the way, also tell you that function names should be lowercase:
check_internet instead of checkInternet, ...
I will give you the same recommendation that I give to everyone working with me: start using an IDE that marks PEP8 and other errors for you; there are several around. It helps a lot with spotting those errors and trains you to write clean Python code that is easily readable and (if you put comments in it) also reusable and understandable a few years later.

How to request multiple URLs at one time using urllib in Python

I'm writing a program for downloading images from the internet, and I would like to speed it up by making multiple requests at once.
So I wrote the code you can see here at GitHub.
I can request a single webpage like this:
def myrequest(url):
    worked = False
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    while not worked:
        try:
            webpage_read = urlopen(req).read()
            worked = True
        except:
            print("failed to connect to \n{}".format(url))
    return(webpage_read)
url = "http://www.mangahere.co/manga/mysterious_girlfriend_x"
webpage_read = myrequest(url).decode("utf-8")
The while is here because I definitely want to download every single picture, so I keep trying until it works (nothing can go wrong except urllib.error.HTTPError: HTTP Error 504: Gateway Time-out).
My question is: how do I run that multiple times at once?
My idea is to have "a commander" which will run 5 (or 85) Python scripts, give each a URL, and collect the webpage from each of them once they are finished, but this is definitely a silly solution :)
EDIT:
I used _thread, but it doesn't seem to speed up the program. That should have been the solution; am I doing it wrong? That is my new question.
You can use this link to get to my code on GitHub:
def thrue_thread_download_pics(path, url, ep, name):
    lock.acquire()
    global goal
    goal += 1
    lock.release()
    webpage_read = myrequest("{}/{}.html".format(url, ep))
    url_to_pic = webpage_read.decode("utf-8").split('" onerror="')[0].split('<img src="')[-1]
    pic = myrequest(url_to_pic)
    myfile = open("{}/pics/{}.jpg".format(path, name), "wb")
    myfile.write(pic)
    myfile.close()
    global finished
    finished += 1
and I'm using it here:
for url_ep in urls_eps:
    url, maxep = url_ep.split()
    maxep = int(maxep)
    chap = url.split("/")[-1][2:]
    if "." in chap:
        chap = chap.replace(".", "")
    else:
        chap = "{}0".format(chap)
    for ep in range(1, maxep + 1):
        ted = time.time()
        name = "{}{}".format(chap, "{}{}".format((2 - len(str(ep))) * "0", ep))
        if name in downloaded:
            continue
        _thread.start_new_thread(thrue_thread_download_pics, (path, url, ep, name))

checker = -1
while finished != goal:
    if finished != checker:
        checker = finished
        print("{} of {} downloaded".format(finished, goal))
    time.sleep(0.1)
Requests Futures is built on top of the very popular requests library and uses non-blocking IO:
from requests_futures.sessions import FuturesSession
session = FuturesSession()
# These requests will run at the same time
future_one = session.get('http://httpbin.org/get')
future_two = session.get('http://httpbin.org/get?foo=bar')
# Get the first result
response_one = future_one.result()
print(response_one.status_code)
print(response_one.text)
# Get the second result
response_two = future_two.result()
print(response_two.status_code)
print(response_two.text)
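If you would rather stay with the standard library, a similar effect can be had with concurrent.futures. The sketch below reuses the myrequest helper from the question, so that name and the example URLs are assumptions carried over for illustration:

from concurrent.futures import ThreadPoolExecutor, as_completed

urls = [
    "http://httpbin.org/get",
    "http://httpbin.org/get?foo=bar",
]

# Run up to 5 requests at the same time in a pool of worker threads.
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = {pool.submit(myrequest, url): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]
        webpage_read = future.result()
        print("downloaded {} bytes from {}".format(len(webpage_read), url))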

how to use threads to grab multiple chunks of a file concurrently from server but write to disk atomically?

I am stuck on a difficult problem and not able to figure out which way to go. In the attempts I have made all day I have posted many times; this is not a duplicate question, since I need clarity on how I can use multiple threads to grab multiple chunks of a file simultaneously from a server, but write them to disk atomically, by locking the file write operation for single-thread access while the next thread waits for the lock to be released.
import argparse,logging, Queue, os, requests, signal, sys, time, threading
import utils as _fdUtils
DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)
_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)
queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
Update 1:
def _grabAndWriteToDisk(url, saveTo, first=None, queue=None, mode='wb', irange=None):
    """ Function to download file..
        Args:
            url(str): url of file to download
            saveTo(str): path where to save file
            first(int): starting byte of the range
            queue(Queue.Queue): queue object to set status for file download
            mode(str): mode of file to be downloaded
            irange(str): range of byte to download
    """
    fileName = url.split('/')[-1]
    filePath = os.path.join(saveTo, fileName)
    fileSize = int(_fdUtils.getUrlSizeInBytes(url))
    downloadedFileSize = 0 if not first else first
    block_sz = 8192
    irange = irange if irange else '0-%s' % fileSize
    # print mode
    resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
    fileBuffer = resp.raw.read()
    with open(filePath, mode) as fd:
        downloadedFileSize += len(fileBuffer)
        fd.write(fileBuffer)
        status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
        status = status + chr(8)*(len(status)+1)
        sys.stdout.write('%s\r' % status)
        time.sleep(.05)
        sys.stdout.flush()
        if downloadedFileSize == fileSize:
            STOP_REQUEST.set()
            if queue:
                queue.task_done()
            _log.info("Download Completed %s%% for file %s, saved to %s",
                      downloadedFileSize * 100. / fileSize, fileName, saveTo)
class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, queue):
        super(ThreadedFetch, self).__init__()
        self.queue = queue
        self.lock = threading.Lock()

    def run(self):
        threadLimiter.acquire()
        try:
            items = self.queue.get()
            url = items[0]
            saveTo = DESKTOP_PATH if not items[1] else items[1]
            split = items[-1]
            # grab split chunks in separate thread.
            if split > 1:
                maxSplits.acquire()
                try:
                    sizeInBytes = int(_fdUtils.getUrlSizeInBytes(url))
                    byteRanges = _fdUtils.getRange(sizeInBytes, split)
                    mode = 'wb'
                    th = threading.Thread(target=_grabAndWriteToDisk, args=(url, saveTo, first, self.queue, mode, _range))
                    _log.info("Pulling for range %s using %s", _range, th.getName())
                    th.start()
                    # _grabAndWriteToDisk(url, saveTo, first, self.queue, mode, _range)
                    mode = 'a'
                finally:
                    maxSplits.release()
            else:
                while not STOP_REQUEST.isSet():
                    self.setName("primary_%s" % url.split('/')[-1])
                    # if downloading the whole file in a single chunk, no need
                    # to start a new thread, so directly download here.
                    _grabAndWriteToDisk(url, saveTo, 0, self.queue)
        finally:
            threadLimiter.release()
def main(appName, flag='with'):
    args = _fdUtils.getParser()
    urls_saveTo = {}
    if flag == 'with':
        _fdUtils.Watcher()
    elif flag != 'without':
        _log.info('unrecognized flag: %s', flag)
        sys.exit()
    # spawn a pool of threads, and pass them the queue instance;
    # each url will be downloaded concurrently
    for i in xrange(len(args.urls)):
        t = ThreadedFetch(queue)
        t.daemon = True
        t.start()
    split = 4
    try:
        for url in args.urls:
            # TODO: put split as value of url as tuple with saveTo
            urls_saveTo[url] = args.saveTo
        # populate queue with data
        for url, saveTo in urls_saveTo.iteritems():
            queue.put((url, saveTo, split))
        # wait on the queue until everything has been processed
        queue.join()
        _log.info('Finished all downloads.')
    except (KeyboardInterrupt, SystemExit):
        _log.critical('! Received keyboard interrupt, quitting threads.')
I expect multiple threads to grab the chunks, but in the terminal I see the same thread grabbing each range of chunks:
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-1
INFO - Pulling for range 51167-76748 using Thread-1
INFO - Pulling for range 76749-102331 using Thread-1
INFO - Download Completed 100.0% for file 607800main_kepler1200_1600-1200.jpg
but what I am expecting is
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-2
INFO - Pulling for range 51167-76748 using Thread-3
INFO - Pulling for range 76749-102331 using Thread-4
Please do not mark this as a duplicate without understanding it...
Please recommend a better approach if there is one for doing what I am trying to do.
Cheers :/
If you want the downloads to happen simultaneously and only the writing to the file to be atomic, then don't put the lock around downloading and writing to the file, but just around the write part.
As every thread uses its own file object to write, I don't see the need to lock that access anyway. What you have to make sure of is that every thread writes to the correct offset, so you need a seek() call on the file before writing the data chunk. Otherwise you'd have to write the chunks in file order, which would make things more complicated.
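A minimal sketch of that idea follows; the URL, output path, chunk size and the fetch_range helper are all assumptions for illustration, not the code from the question:

import threading
import requests

URL = "http://example.com/bigfile.bin"   # placeholder URL
OUT_PATH = "bigfile.bin"
CHUNK = 1024 * 1024                       # 1 MiB per thread, for illustration

def fetch_range(url, start, end, out_path):
    # Each thread asks the server for its own byte range...
    resp = requests.get(url, headers={'Range': 'bytes=%d-%d' % (start, end)}, stream=True)
    data = resp.raw.read()
    # ...and writes it at the matching offset through its own file object,
    # so no lock is needed around the write.
    with open(out_path, 'r+b') as fd:
        fd.seek(start)
        fd.write(data)

size = int(requests.head(URL).headers['Content-Length'])

# Pre-size the file once so every thread can seek into it.
with open(OUT_PATH, 'wb') as fd:
    fd.truncate(size)

threads = []
for start in range(0, size, CHUNK):
    end = min(start + CHUNK, size) - 1
    threads.append(threading.Thread(target=fetch_range, args=(URL, start, end, OUT_PATH)))
    threads[-1].start()

for t in threads:
    t.join()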
