How to download a single file with multiple threads using the Python requests library

I tried the code below and it throws an error. I had changed it from urllib2 to the requests library, since I can't install the urllib2 module, and when I ran it in PyCharm I got the error shown below.
I need to download a single file with multiple threads using the requests library: with multi-threading, a file can be downloaded in chunks fetched simultaneously by different threads.
Error:
Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 932, in
_bootstrap_inner
self.run()
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:/Users/suresh_ram/PycharmProjects/DownloadManager/multithreaded_downloader.py", line 37, in downloadChunk
dataDict[idx] = open(req,"wb").write(req.content)
TypeError: expected str, bytes or os.PathLike object, not Response
Exception in thread Thread-3:
Traceback (most recent call last):
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:/Users/suresh_ram/PycharmProjects/DownloadManager/multithreaded_downloader.py", line 37, in downloadChunk
dataDict[idx] = open(req,"wb").write(req.content)
TypeError: expected str, bytes or os.PathLike object, not Response
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\Users\suresh_ram\AppData\Local\Programs\Python\Python38\lib\threading.py", line 870, in run
Code:

import os
import threading
import time

import requests

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return
    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeInBytes)
    if not sizeInBytes:
        print("Size cannot be determined.")
        return
    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = requests.get(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = open(req,"wb").write(req.content)

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print('done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    ))
    print("--- %s seconds ---" % str(time.time() - start_time))

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print("Finished Writing file %s" % fileName)
    print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    main("https://bugs.python.org/file47781/Tutorial_EDIT.pdf")

Related

Python Multi threading - Rawlink error on connection.wait()

I am trying to multithread a SignalR connection with Python. The connection itself works, but I am getting a Rawlink error. I need to wait one second in order to receive the message from the client, and I am also using a barrier so the threads execute "simultaneously".
Here is my code
with Session() as session:
    global connection
    connection = Connection("http://sampleSINGALRURL/signalr", session)
    presenceservice = connection.register_hub('ClientRegistration')
    presenceservice1 = connection.register_hub('PresenceClientHub')
    connection.start()
    presenceservice.server.invoke('IdentifyClient', devideIdentity, softwareVersion, IpAddress,
                                  machineName, DeviceType, patientAdmissionGuid, patientID, pairingId)
    presenceservice1.client.on('StaffPresenceNotified', self.get_data1)
    connection.wait(1)
And then my threading functions
def get_clients(self):
    global barrier
    self.connect_to_database1()
    barrier.wait()
    self.get_message_from_client1()
    self.print_data1()

def send_messages(self):
    global MessageNumber
    global machineName
    global staffName
    global request
    machineName = final_result[MessageNumber][0]
    staffName = staff_results[MessageNumber][0]
    MessageNumber += 1
    barrier.wait()
    request = requests.post(
        "http://sampleurl/api/sample")
    return request

def print_response(self):
    global request
    timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    logging.info("Message sent at " + "Time : " + timestamp + " " + machineName)

def Spin_Clients(self, NumMessages):
    for i in range(10):
        self.client_list.append(Thread(target=self.send_messages))
        self.client_list[i].start()
        self.print_response()
        sleep(2)
    for i in range(10):
        self.Message_List.append(Thread(target=self.get_clients))
        self.Message_List[i].start()
    for thread in self.client_list:
        thread.join()
    for thread in self.Message_List:
        thread.join()
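The barrier itself is never created in the snippet; for the barrier.wait() calls above to work it has to be built once before the threads start. A hypothetical sketch (the party count of 10, matching one batch of threads started in Spin_Clients, is an assumption):

from threading import Barrier

# Hypothetical: 10 parties, one per thread in a batch that calls barrier.wait()
barrier = Barrier(10)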
Error logs
All threads have finished
11:41:37.243
Exception in thread Thread-13:
Traceback (most recent call last):
  File "c:\users\appdata\local\programs\python\python37\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "c:\users\appdata\local\programs\python\python37\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Data\LoadTest.py", line 103, in get_clients
    self.get_message_from_client1()
  File "C:\Data\LoadTest.py", line 89, in get_message_from_client1
    connection.wait(1)
  File "c:\users\appdata\local\programs\python\python37\lib\site-packages\signalr\_connection.py", line 60, in wait
    gevent.joinall([self.__greenlet], timeout)
  File "src\gevent\greenlet.py", line 849, in gevent._greenlet.joinall
  File "src\gevent\greenlet.py", line 859, in gevent._greenlet.joinall
  File "src\gevent\_hub_primitives.py", line 198, in gevent.__hub_primitives.wait_on_objects
  File "src\gevent\_hub_primitives.py", line 235, in gevent.__hub_primitives.wait_on_objects
  File "src\gevent\_hub_primitives.py", line 125, in gevent.__hub_primitives._WaitIterator.__iter__
AttributeError: 'NoneType' object has no attribute 'rawlink'
I also tried to use locks but that had the same outcome.
Any idea?

Python TypeError when passing yield outputs to a pool of workers. Want to split a large file into chunks of lines

The code below is returning a strange type error at the job.get() line:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Python36\lib\multiprocessing\pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "G:\emd\mppurger.py", line 41, in process_wrapper
    run(line)
  File "G:\emd\mppurger.py", line 25, in run
    if correct(copy):
  File "G:\emd\mppurger.py", line 4, in correct
    print('Not Equal to 14? ' + item)
TypeError: must be str, not list
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "g:\EMD\mppurger.py", line 76, in <module>
    job.get()
  File "C:\Python36\Lib\multiprocessing\pool.py", line 644, in get
    raise self._value
TypeError: must be str, not list
The only list involved should be the jobs list, and that is just something I iterate over, not an argument to the workers. My assumption is that the error refers to the arguments I am trying to pass to the process_wrapper function.
import multiprocessing as mp

cores = 16
pool = mp.Pool(cores)
jobs = []

# create jobs
for chunkStart, chunkSize in chunkify("out.txt"):
    jobs.append(pool.apply_async(process_wrapper, (chunkStart, chunkSize)))

# wait for all jobs to finish
for job in jobs:
    job.get()

# clean up
pool.close()
My generator function that produces the chunkStart, chunkSize pairs is as follows:
import os

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    with open(fname, 'r') as f:
        chunkEnd = f.tell()
        while True:
            chunkStart = chunkEnd
            f.seek(chunkStart + size, 0)
            f.readline()  # advance to the end of the current line
            chunkEnd = f.tell()
            chunkSize = chunkEnd - chunkStart
            yield chunkStart, chunkSize
            if chunkEnd > fileEnd:
                break
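process_wrapper itself isn't shown in the question; reconstructed from the traceback (it calls run(line) at line 41), a typical worker for these (chunkStart, chunkSize) pairs would look roughly like this. This is a hypothetical sketch, with the file name "out.txt" assumed from the pool setup above:

def process_wrapper(chunkStart, chunkSize):
    # Hypothetical reconstruction: read only this worker's byte range,
    # then process it one line at a time.
    with open("out.txt", 'r') as f:
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            run(line)  # each call gets a single str, never the whole list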
I am going to assume this was an indentation error or something similar: I couldn't find the bug, but after re-writing the code it disappeared.

Python multiprocessing Deadlock using Queue

I have a Python program like the one below.
from multiprocessing import Lock, Process, Queue, current_process
import time

lock = Lock()

def do_job(tasks_to_accomplish, tasks_that_are_done):
    while not tasks_to_accomplish.empty():
        task = tasks_to_accomplish.get()
        print(task)
        lock.acquire()
        tasks_that_are_done.put(task + ' is done by ' + current_process().name)
        lock.release()
        time.sleep(1)
    return True

def main():
    number_of_task = 10
    number_of_processes = 4
    tasks_to_accomplish = Queue()
    tasks_that_are_done = Queue()
    processes = []

    for i in range(number_of_task):
        tasks_to_accomplish.put("Task no " + str(i))

    # creating processes
    for w in range(number_of_processes):
        p = Process(target=do_job, args=(tasks_to_accomplish, tasks_that_are_done))
        processes.append(p)
        p.start()

    # completing process
    for p in processes:
        p.join()

    # print the output
    while not tasks_that_are_done.empty():
        print(tasks_that_are_done.get())

    return True

if __name__ == '__main__':
    main()
Sometimes the program runs perfectly, but sometimes it gets stuck and doesn't complete. When quit manually (Ctrl-C), it produces the following error:
$ python3 multiprocessing_example.py
Task no 0
Task no 1
Task no 2
Task no 3
Task no 4
Task no 5
Task no 6
Task no 7
Task no 8
Task no 9
^CProcess Process-1:
Traceback (most recent call last):
File "multiprocessing_example.py", line 47, in <module>
main()
File "multiprocessing_example.py", line 37, in main
p.join()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 121, in join
res = self._popen.wait(timeout)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/popen_fork.py", line 51, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/popen_fork.py", line 29, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "multiprocessing_example.py", line 9, in do_job
task = tasks_to_accomplish.get()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/queues.py", line 94, in get
res = self._recv_bytes()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
Can someone tell me what is the issue with the program? I am using python 3.6.
Note: a Lock is not needed around a Queue, so this can be dropped:

lock.acquire()
tasks_that_are_done.put(task + ' is done by ' + current_process().name)
lock.release()

From the Queue documentation: "The Queue class in this module implements all the required locking semantics."
Question: ... what is the issue with the program?

You are using Queue.empty() and then Queue.get(); this leads to a deadlock on calling join(), because there is no guarantee that the queue is still non-empty by the time get() runs. Another process can take the last item in between, leaving this process blocked forever inside get().

Deadlock prone:

while not tasks_to_accomplish.empty():
    task = tasks_to_accomplish.get()
Instead of the empty()/get() pair, use for instance:
import queue

while True:
    try:
        task = tasks_to_accomplish.get_nowait()
    except queue.Empty:
        break
    else:
        # Handle task here
        ...
        # task_done() exists only on multiprocessing.JoinableQueue;
        # with the plain Queue used above, drop this call.
        tasks_to_accomplish.task_done()
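Applied to the program above, do_job becomes (a minimal sketch; the rest of the code is unchanged, and since a plain Queue is used there is no task_done()):

import queue  # queue.Empty is what get_nowait() raises

def do_job(tasks_to_accomplish, tasks_that_are_done):
    while True:
        try:
            task = tasks_to_accomplish.get_nowait()
        except queue.Empty:
            break
        print(task)
        # no extra Lock needed: Queue does its own locking
        tasks_that_are_done.put(task + ' is done by ' + current_process().name)
        time.sleep(1)
    return True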

How to create a file when working with threads in python?

Have a look at this code:

import threading
import time

def my_inline_function(number):
    # do some stuff
    download_thread = threading.Thread(target=function_that_writes, args=number)
    download_thread.start()

    # continue doing stuff
    i = 0
    while(i < 10000):
        print str(i) + " : Main thread"
        time.sleep(1)
        i = i + 1

def function_that_writes(number):
    i = number
    file = open("dummy.txt", 'w')
    while (i < 10000):
        string = str(i) + " : child thread"
        file.write(string)
        time.sleep(1)
    file.close()

my_inline_function(5)
function_that_writes(5)
Why does my_inline_function(), which starts a thread, not create a file?
When I call function_that_writes(...) directly, outside a thread, it creates the file just fine.
Why am I getting this behaviour?
You need to supply your argument as a tuple args=(number,):
download_thread = threading.Thread(target=function_that_writes, args=(number,))
The exception is pretty clear here:
Exception in thread Thread-1:
Traceback (most recent call last):
File "/Users/mike/anaconda/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/Users/mike/anaconda/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
TypeError: function_that_writes() argument after * must be an iterable, not int
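With the tuple fix in place, the thread starts and the file appears as soon as open() runs inside it. A quick check (a sketch reusing the functions and imports above):

download_thread = threading.Thread(target=function_that_writes, args=(5,))  # tuple, not bare int
download_thread.start()

time.sleep(1)  # give the thread a moment to reach open()
import os
print os.path.exists("dummy.txt")  # True: the file exists once open() has run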

Sometimes pathos.multiprocessing.Pool can't be terminated correctly

I am trying to use pathos.multiprocessing.Pool in my project.
However, I run into the following problem when I terminate the Pool.
I am on CentOS 6.5, and I'm not sure whether it is caused by pathos.multiprocessing.Pool or by something else. Can anyone help me with it?
Traceback (most recent call last):
File "/usr/local/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/local/lib/python2.7/threading.py", line 1073, in run
self.function(*self.args, **self.kwargs)
File "receiver.py", line 132, in kill_clients
pool.terminate()
File "/usr/local/lib/python2.7/site-packages/multiprocess/pool.py", line 465, in terminate
self._terminate()
File "/usr/local/lib/python2.7/site-packages/multiprocess/util.py", line 207, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/usr/local/lib/python2.7/site-packages/multiprocess/pool.py", line 513, in _terminate_pool
p.terminate()
File "/usr/local/lib/python2.7/site-packages/multiprocess/process.py", line 137, in terminate
self._popen.terminate()
File "/usr/local/lib/python2.7/site-packages/multiprocess/forking.py", line 174, in terminate
os.kill(self.pid, signal.SIGTERM)
OSError: [Errno 3] No such process
The weird thing is that it works well at the beginning, but when the 4th job is received, this problem appears.
class Receiver:
    def __init__(self):
        ....
        self.results = {}

    def kill_clients(self, client_list, pool):
        for client in client_list:
            client.kill()
        pool.terminate()

    def process_result(self, result):
        if result is None:
            self.results = {}
            return
        res = result.split(':')
        if len(res) != 4:
            raise Exception("result with wrong format: %s" % result)
        self.results['%s_%s' % (res[0], res[1])] = {"code": res[3], "msg": res[4]}
    ...

    def handler(self, job):
        self.lg.debug("Receive job in rtmp_start_handler.")
        self.lg.debug("<%s>" % str(job))
        # each client corresponding one process
        cli_counts = job['count']
        pool = Pool(processes=cli_counts)
        clients = []
        try:
            for i in xrange(cli_counts):
                rtmp_cli = RtmpClient(job['case'], i)
                clients.append(rtmp_cli)
            [pool.apply_async(client.run, callback=self.process_result)
                for client in clients]
            pool.close()
            sleep(1)
            self.lg.debug("All clients are started.")
            t = Timer(
                job['timeout'],
                self.kill_clients,
                args=(clients, pool)
            )
            t.start()
            self.lg.debug("Timer is started. timeout %s s" % job['timeout'])
            pool.join()
        except Exception, e:
            self.lg.warning("Exception occurred: %s" % e)
            self.lg.warning(format_exc())
            return "0"
        # here the self.results shall be ready
        return self.parse_results()
The OSError turned out not to be caused by the Pool but by an issue in my own program.
I use Popen to create a subprocess that execs ffmpeg, and it exits immediately (due to another problem), so when I later try to kill the subprocess it no longer exists. That is why the OSError is raised.
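A defensive pattern for that situation is to tolerate children that have already exited when cleaning up. A minimal sketch (safe_kill is a hypothetical helper, not part of pathos):

import errno
import os
import signal

def safe_kill(pid):
    # The child (e.g. a fast-failing ffmpeg) may already be gone,
    # so ignore "No such process" instead of crashing the cleanup path.
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError as e:
        if e.errno != errno.ESRCH:
            raise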
