Here I am retrieving different webpages and counting the frequency of each letter, with the final dictionary as the output. In this case I go through 20 webpages (for i in range(1000,1020):), so I run a loop and create 20 threads. It serves the purpose and saves a lot of time compared to a single thread. What if I wanted to retrieve 1000 webpages: should I use a loop to create 1000 different threads, or is there a way to give each thread a chunk of webpages? Is there any limit on the number of threads I can create?
import time
import urllib.request
from threading import Thread

finished_tasks = 0

def count_letters(url, frequency):
    global finished_tasks
    response = urllib.request.urlopen(url)
    txt = str(response.read())
    for l in txt:
        letter = l.lower()
        if letter in frequency:
            frequency[letter] += 1
    finished_tasks += 1

def main():
    frequency = {}
    for c in 'abcdefghijklmnopqrstuvwxyz':
        frequency[c] = 0
    start_time = time.time()
    for i in range(1000, 1020):
        p = Thread(target=count_letters, args=('https://www.rfc-editor.org/rfc/rfc' + str(i) + '.txt', frequency))
        p.start()
    while finished_tasks < 20:
        time.sleep(0.5)
    end_time = time.time()
    print(end_time - start_time)
    print(frequency)

main()
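A common alternative to one thread per page is a fixed pool of worker threads that pull URLs from a shared queue, so 1000 pages can still be handled by, say, 20 threads. Below is a minimal sketch of that idea, assuming the same RFC URL pattern; the worker count, the per-thread local counts, and the lock-protected merge are illustrative choices, not part of the original code.

import time
import urllib.request
from queue import Queue, Empty
from threading import Thread, Lock

def worker(url_queue, frequency, lock):
    # Count letters locally, then merge once under the lock, so the threads
    # never race on the shared dictionary.
    local = {c: 0 for c in 'abcdefghijklmnopqrstuvwxyz'}
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            break
        try:
            txt = str(urllib.request.urlopen(url).read()).lower()
        except OSError:
            continue  # skip pages that fail to download
        for ch in txt:
            if ch in local:
                local[ch] += 1
    with lock:
        for ch, n in local.items():
            frequency[ch] += n

def main():
    frequency = {c: 0 for c in 'abcdefghijklmnopqrstuvwxyz'}
    lock = Lock()
    url_queue = Queue()
    for i in range(1000, 2000):  # 1000 pages, but still only 20 threads
        url_queue.put('https://www.rfc-editor.org/rfc/rfc' + str(i) + '.txt')
    threads = [Thread(target=worker, args=(url_queue, frequency, lock)) for _ in range(20)]
    start_time = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(time.time() - start_time)
    print(frequency)

main()

In practice the number of threads is limited by memory (each thread needs its own stack) and by how many simultaneous connections the server and your OS will tolerate, which is why a bounded pool like this usually scales better than one thread per URL.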
I am quite new to XML and to writing efficient code, and the code I am using takes quite a long time to run.
I want to extract the elevation for given lat/long values as fast as possible (I have a lot of lat/long points). This is how I tried it:
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def elevation(lat, long):
    query = ('http://openwps.statkart.no/skwms1/wps.elevation2?request=Execute&service=WPS&version=1.0.0'
             f'&identifier=elevation&datainputs=lat={lat};lon={long};epsg=4326')
    parsing = "{http://www.opengis.net/wps/1.0.0}"
    with urlopen(query) as f:
        tree = ET.parse(f)
        root = tree.getroot()
    return float(root.findall(f".//{parsing}Data/*")[0].text)
I use this function on the data I extracted from a CSV file, which contains several datasets within the same file separated by a "new_sheep" line:
df = pd.read_csv("/Users/ninsalv/Documents/Sheepdata/Data.csv", delimiter=';',
                 dtype={"Initial start": "str", "Start": "str", "Stop": "str"})
print(df.head())

dataset = 1
Lat = []
Long = []
temp = 0
for i in range(len(df)):
    if "new_sheep" in df.iloc[i][0]:
        temp += 1
        continue
    if temp == dataset:
        Lat.append(df.iloc[i][3])
        Long.append(df.iloc[i][4])
    if temp > dataset:
        break

step = np.linspace(0, len(Lat), len(Lat))
altitude = []
for i in range(len(Lat)):
    altitude.append(elevation(Lat[i], Long[i]))
    if (i % 100) == 0:
        print("round number ", i)

plt.plot(step, altitude)
This works, but it takes almost a minute to fetch every 100 altitudes, and I have about 7000-15000 points to check in my dataset. Does anybody know a way, whether through XML, pandas, or something else, to make my code faster?
What you need to do is fetch the data (the HTTP requests) in parallel. You can use multithreading for that.
See the example below.
import requests
from requests.sessions import Session
import time
from threading import Thread, local
from queue import Queue

url_list = []  # TODO long list of urls to be populated by your code
q = Queue(maxsize=0)  # Use a queue to store all URLs
for url in url_list:
    q.put(url)

thread_local = local()  # The thread_local will hold a Session object

def get_session() -> Session:
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()  # Create a new Session if not exists
    return thread_local.session

def download_link() -> None:
    '''download link worker, get URL from queue until no url left in the queue'''
    session = get_session()
    while not q.empty():
        url = q.get()
        with session.get(url) as response:
            print(f'Read {len(response.content)} from {url}')
        q.task_done()  # tell the queue, this url downloading work is done

def download_all(urls) -> None:
    '''Start 10 threads, each thread as a wrapper of downloader'''
    thread_num = 10
    for i in range(thread_num):
        t_worker = Thread(target=download_link)
        t_worker.start()
    q.join()  # main thread wait until all url finished downloading

print("start work")
start = time.time()
download_all(url_list)
end = time.time()
print(f'download {len(url_list)} links in {end - start} seconds')
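For the elevation use case above, the same idea can also be expressed with concurrent.futures from the standard library, which hands back results in input order. This is a rough sketch, assuming the elevation() function and the Lat/Long lists built earlier; the worker count of 10 is an arbitrary choice, not something from the original code.

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=10) as executor:
    # map() runs elevation(lat, long) across the worker threads and yields
    # the results in the same order as the input points.
    altitude = list(executor.map(elevation, Lat, Long))

Because the order is preserved, the altitude list can be plotted against step exactly as in the original loop.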
I'm trying to multiprocess an action inside a for x in y loop. Basically, the script makes a request to a site and loads a JSON file containing a list of URLs. Once fetched, another function is called to parse each URL individually. What I've been trying to do is multiprocess this task with multiprocessing.Process() in order to speed things up, since there are lots of URLs to parse. However, my approach doesn't speed up the process at all; it actually runs at the same speed as with no multiprocessing. It seems to get blocked when using proc.join().
This is the code I've been working on:
import json
import requests
import multiprocessing

def ExtractData(id):
    print("Processing ", id)
    result = requests.get('http://example-index.com/' + id)
    result = result.text.split('\n')[:-1]
    for entry in result:
        data = json.loads(entry)['url']
        print("data is:", data)

def ParseJsonAndCall():
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    threads = []
    for results in data:
        print("Processing ", results['url'])
        p = multiprocessing.Process(target=ExtractData, args=(results['id'],))
        threads.append(p)
        p.start()
    for proc in threads:
        proc.join()

ParseJsonAndCall()
Any help would be greatly appreciated!
A Pool may help.
import json
import multiprocessing as mp
import requests

def ParseJsonAndCall():
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    collect_results = []
    with mp.Pool(processes=mp.cpu_count()) as pool:
        for results in data:
            res = pool.apply_async(ExtractData, [results['id']])
            collect_results.append(res)
        for res in collect_results:
            res.get()
Although the print statements in ExtractData() may produce interleaved output when several processes write to the terminal at once.
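If the per-URL results are not needed individually, pool.map is a slightly shorter alternative to apply_async. This is only a sketch under the same assumptions, with ExtractData taken from the question above:

import json
import multiprocessing as mp
import requests

def ParseJsonAndCall():
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    ids = [results['id'] for results in data]
    with mp.Pool(processes=mp.cpu_count()) as pool:
        # ExtractData is assumed to be defined as in the question;
        # map blocks until every id has been processed.
        pool.map(ExtractData, ids)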
I expect to have maybe something like 100k URLs from different domains. I wrote this code, which keeps a list of URLs in all_urls and forms N threads to run as one batch. Currently I'm using the threading module to make these requests in parallel.
import requests
import os
import threading
import time

all_urls = []  # a list of URLs to request, can have up to 100k

global success, fail
success = 0
fail = 0

def func(url_to_request):
    global success, fail
    try:
        r = requests.get(url_to_request, timeout=5)
        c = r.content
        success = success + 1
    except:
        fail = fail + 1
    return

batch_count = 1
N = 200  # number of threads

all_threads_urls = []
time_start = time.time()

for item in all_urls:
    all_threads_urls.append(item)
    if all_urls.index(item) == len(all_urls) - 1 or len(all_threads_urls) == N:
        # call it
        all_threads = []
        for link in all_threads_urls:
            current_thread = threading.Thread(target=func, args=(link,))
            all_threads.append(current_thread)
            current_thread.start()
        for thr in all_threads:
            thr.join()
        all_threads_urls = []  # for the next batch
        time_end = time.time()
        print "Request number", all_urls.index(item) + 1, "Good:", success, "Bad:", fail, "Duration:", round(time_end - time_start, 2), "seconds."
        time_start = time_end
The results are a bit weird: the script starts very fast but then slows down a lot (see image). The printed durations are per batch.
Can someone explain what the bottleneck is here? Is there a better module for this, or is there no way around it?
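Two things are worth checking, offered as a sketch rather than a definitive answer: all_urls.index(item) rescans the list from the beginning on every iteration, which alone gets slower and slower as you move through 100k URLs, and each batch waits for its slowest request before the next batch starts. A fixed set of long-lived worker threads pulling from a queue avoids both; the 200-worker figure below simply mirrors the N from the question, and the counters dictionary and lock are illustrative additions.

import time
from queue import Queue, Empty
from threading import Lock, Thread

import requests

all_urls = []  # the same long list of URLs as in the question
counts = {'success': 0, 'fail': 0}
count_lock = Lock()

def worker(url_queue):
    # Each worker keeps pulling URLs until the queue runs dry, so no thread
    # ever sits idle waiting for a slow request in another thread.
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            break
        try:
            requests.get(url, timeout=5)
            outcome = 'success'
        except requests.RequestException:
            outcome = 'fail'
        with count_lock:
            counts[outcome] += 1

url_queue = Queue()
for url in all_urls:
    url_queue.put(url)

start = time.time()
workers = [Thread(target=worker, args=(url_queue,)) for _ in range(200)]
for t in workers:
    t.start()
for t in workers:
    t.join()
print("Good:", counts['success'], "Bad:", counts['fail'],
      "Duration:", round(time.time() - start, 2), "seconds.")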
I have a program in place that follows logic like this:
At the start of every hour, multiple directories receive a file that is continuously fed data. I'm developing a simple program that reads all the files simultaneously, and I've abstracted the tailing/reading part into a function; let's call it 'tail' for now. The external program feeding the data doesn't always run smoothly. Sometimes a file will come in late, and sometimes the next hour will hit while the program is still feeding the stale file data. I can't afford to lose data. My solution looks something like this, with multiprocessing.Pool and pseudocode in parts of it:
import glob
import multiprocessing
import os
import sys
import time
from datetime import datetime, timedelta

def process_data(logfile):
    num_retries = 5
    while num_retries > 0:
        if os.path.isfile(logfile):
            for record in tail(logfile):
                do_something(record)  # placeholder from the question
        else:
            num_retries -= 1
            time.sleep(30)

def tail(logfile):
    logfile = open(logfile, 'r')
    logfile.seek(0, 2)
    wait_time = 0
    while True:
        line = logfile.readline()
        if line:
            wait_time = 0
            yield line
        else:
            if wait_time >= 360:
                break
            wait_time += 1
            time.sleep(1)
            continue

if __name__ == '__main__':
    start_time = sys.argv[1]
    next_hour = None
    while True:
        logdirs = glob.glob("/opt/logs/plog*")
        current_date = datetime.now()
        current_hour = current_date.strftime('%H')
        current_format = datetime.now().strftime("%Y%m%d%H")
        logfiles = [logdir + '/some/custom/path/tofile.log' for logdir in logdirs]
        if not next_hour:
            next_hour = current_date + timedelta(hours=1)
        if current_hour == next_hour.strftime('%H') or current_hour == start_time:
            start_time = None
            pool = multiprocessing.Pool()
            pool.map(process_data, logfiles)
            pool.close()
            pool.join()
            next_hour = current_date + timedelta(hours=1)
        time.sleep(30)
Here's what I'm observing when I have logging implemented at the process level:
all files in each directory are getting read appropriately
when the next hour hits, there's a delay of 360s (6 minutes) before the next set of files get read
so if hour 4 ends, a new pool doesn't get created for hour 5 until processes for hour 4 finish
What I'm looking for: I'd like to keep using multiprocessing, but I can't figure out why the code inside the main while loop doesn't proceed until the previous Pool of processes finishes. I have tried the hourly logic in other examples without multiprocessing and it worked fine. I'm led to believe this has to do with the Pool class, and I'm hoping for advice on how to create a new Pool for the new hour and begin processing new files even while the previous Pool is still active, even if that means creating a ton of processes.
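One way to keep the hourly loop from blocking, sketched under the assumption that process_data stays as defined above: create the Pool once and submit each hour's files with apply_async, which returns immediately instead of waiting the way pool.map does. The simplified hour trigger below is illustrative, not a drop-in replacement for the original scheduling logic.

import glob
import multiprocessing
import time
from datetime import datetime, timedelta

if __name__ == '__main__':
    pool = multiprocessing.Pool()   # one long-lived pool reused every hour
    pending = []                    # AsyncResult handles for submitted files
    next_hour = datetime.now()      # fire immediately on the first pass
    while True:
        now = datetime.now()
        if now >= next_hour:
            logdirs = glob.glob("/opt/logs/plog*")
            logfiles = [logdir + '/some/custom/path/tofile.log' for logdir in logdirs]
            # apply_async returns right away, so a slow hour never stalls this loop
            pending.extend(pool.apply_async(process_data, (f,)) for f in logfiles)
            next_hour = now + timedelta(hours=1)
        # Forget finished work; never block on jobs that are still running.
        pending = [r for r in pending if not r.ready()]
        time.sleep(30)

If each file ends up being tailed for close to an hour, the pool may need more worker processes than the CPU-count default (multiprocessing.Pool(processes=...)), otherwise the new hour's jobs will queue behind the old ones.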
So I have been trying to multi-thread some internet connections in Python. I have been using the multiprocessing module so I can get around the "Global Interpreter Lock". But it seems that the system only gives Python one open connection port, or at least it only allows one connection to happen at a time. Here is an example of what I am saying.
*Note that this is running on a Linux server.
from multiprocessing import Process, Queue
import urllib
import random

# Generate 10,000 random urls to test and put them in the queue
queue = Queue()
for each in range(10000):
    rand_num = random.randint(1000, 10000)
    url = ('http://www.' + str(rand_num) + '.com')
    queue.put(url)

# Main function for checking to see if generated url is active
def check(q):
    while True:
        try:
            url = q.get(False)
            try:
                request = urllib.urlopen(url)
                del request
                print url + ' is an active url!'
            except:
                print url + ' is not an active url!'
        except:
            if q.empty():
                break

# Then start all the threads (50)
for thread in range(50):
    task = Process(target=check, args=(queue,))
    task.start()
So if you run this, you will notice that it starts 50 instances of the function but only runs one at a time. You may think that the 'Global Interpreter Lock' is doing this, but it isn't. Try changing the function to a mathematical function instead of a network request and you will see that all fifty threads run simultaneously.
So will I have to work with sockets? Or is there something I can do that will give Python access to more ports? Or is there something I am not seeing? Let me know what you think! Thanks!
*Edit
So I wrote this script to test things better with the requests library. It seems as though I had not tested it very well with this before. (I had mainly used urllib and urllib2)
from multiprocessing import Process, Queue
from threading import Thread
from Queue import Queue as Q
import requests
import time

# A main timestamp
main_time = time.time()

# Generate 100 urls to test and put them in the queue
queue = Queue()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)

# Timer queue
time_queue = Queue()

# Main function for checking to see if generated url is active
def check(q, t_q):  # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break

# Then start all the threads (20)
thread_list = []
for thread in range(20):
    task = Process(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)

# Join all the threads so the main process doesn't quit
for each in thread_list:
    each.join()

main_time_end = time.time()

# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break

# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line = "Multiprocessing: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
# A main timestamp
main_time = time.time()

# Generate 100 urls to test and put them in the queue
queue = Q()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)

# Timer queue
time_queue = Queue()

# Main function for checking to see if generated url is active
def check(q, t_q):  # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break

# Then start all the threads (20)
thread_list = []
for thread in range(20):
    task = Thread(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)

# Join all the threads so the main process doesn't quit
for each in thread_list:
    each.join()

main_time_end = time.time()

# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break

# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line = "Standard Threading: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
# Do the same thing all over again, but this time request each url one at a time
# A main timestamp
main_time = time.time()

# Generate 100 urls and test them
timer_list = []
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    t = time.time()
    try:
        request = requests.head(url, timeout=5)
        timer_list.append(time.time() - t)
    except:
        timer_list.append(time.time() - t)

main_time_end = time.time()

# Results of the time
average_response = sum(timer_list) / float(len(timer_list))
total_time = main_time_end - main_time
line = "Not using threads: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
As you can see, the requests really do run in parallel. In fact, most of my tests show that the threading module is faster than the multiprocessing module. (I don't understand why!) Here are some of my results.
Multiprocessing: Average response time: 2.40511314869 sec. -- Total time: 25.6876308918 sec.
Standard Threading: Average response time: 2.2179402256 sec. -- Total time: 24.2941861153 sec.
Not using threads: Average response time: 2.1740363431 sec. -- Total time: 217.404567957 sec.
This was done on my home network; the response time on my server is much faster. I think my question has been answered indirectly, since I was having my problems on a much more complex script. All of the suggestions helped me optimize it very well. Thanks to everyone!
it starts 50 instances on the function but only runs one at a time
You have misinterpreted the results of htop. Only a few, if any, copies of python will be runnable at any given instant. Most of them will be blocked waiting for network I/O.
The processes are, in fact, running in parallel.
Try changing the function to a mathematical function instead of a network request and you will see that all fifty threads run simultaneously.
Changing the task to a mathematical function merely illustrates the difference between CPU-bound (e.g. math) and IO-bound (e.g. urlopen) processes. The former is always runnable, the latter is rarely runnable.
it only prints one at a time. If it was actually running multiple processes it would print many out at once.
It prints one at a time because you are writing lines to a terminal. Because the lines are indistinguishable, you wouldn't be able to tell if they are written all by one thread, or each by a separate thread in turn.
First of all, using multiprocessing to parallelize network I/O is overkill. Using the built-in threading module or a lightweight greenlet library like gevent is a much better option with less overhead. The GIL has nothing to do with blocking I/O calls, so you don't have to worry about that at all.
Secondly, an easy way to see whether your subprocesses/threads/greenlets are running in parallel, if you are monitoring stdout, is to print something at the very beginning of the function, right after the subprocesses/threads/greenlets are spawned. For example, modify your check() function like so:
def check(q):
    print 'Start checking urls!'
    while True:
        ...
If your code is correct, you should see many Start checking urls! lines printed out before any of the url + ' is [not] an active url!' lines appear. It works on my machine, so it looks like your code is correct.
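To illustrate the gevent option mentioned above, here is a rough sketch, assuming urls holds the same generated URL list from the question; the pool size of 50 simply mirrors the 50 processes there. monkey.patch_all() makes the blocking socket calls cooperative, so one process can keep many requests in flight at once:

from gevent import monkey
monkey.patch_all()  # patch sockets before importing the network code

import urllib2
from gevent.pool import Pool

def check(url):
    try:
        urllib2.urlopen(url, timeout=5)
        print url + ' is an active url!'
    except Exception:
        print url + ' is not an active url!'

pool = Pool(50)        # at most 50 requests in flight at any moment
pool.map(check, urls)  # 'urls' stands in for the generated URL list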
It appears that your issue is actually with the serial behavior of gethostbyname(3). This is discussed in this SO thread.
Try this code that uses the Twisted asynchronous I/O library:
import random
import sys
from twisted.internet import reactor
from twisted.internet import defer
from twisted.internet.task import cooperate
from twisted.web import client

SIMULTANEOUS_CONNECTIONS = 25

# Generate 10,000 random urls to test and put them in the queue
pages = []
for each in range(10000):
    rand_num = random.randint(1000, 10000)
    url = ('http://www.' + str(rand_num) + '.com')
    pages.append(url)

# Main function for checking to see if generated url is active
def check(page):
    def successback(data, page):
        print "{} is an active URL!".format(page)

    def errback(err, page):
        print "{} is not an active URL!; errmsg:{}".format(page, err.value)

    d = client.getPage(page, timeout=3)  # timeout in seconds
    d.addCallback(successback, page)
    d.addErrback(errback, page)
    return d

def generate_checks(pages):
    for i in xrange(0, len(pages)):
        page = pages[i]
        #print "Page no. {}".format(i)
        yield check(page)

def work(pages):
    print "started work(): {}".format(len(pages))
    batch_size = len(pages) / SIMULTANEOUS_CONNECTIONS
    for i in xrange(0, len(pages), batch_size):
        task = cooperate(generate_checks(pages[i:i + batch_size]))

print "starting..."
reactor.callWhenRunning(work, pages)
reactor.run()