Python port scanner, how to determine optimal number of threads

I have a multi-threaded Python port scanner where each thread in a loop gets something (an IP address/port pair) from a common queue, does some work on it (connects, does a handshake and grabs the server's version) and loops again.
Here's some partial code:
import threading, queue, multiprocessing

class ScanProcess(multiprocessing.Process):
    threads = []

    def __init__(self, squeue, dqueue, count):
        self.squeue = squeue
        self.dqueue = dqueue
        self.count = count
        self._init_threads()
        super(ScanProcess, self).__init__()

    def _init_threads(self):
        self.threads = [ScanThread(self.squeue, self.dqueue) for _ in range(self.count)]

    def _start_threads(self):
        for thread in self.threads:
            thread.start()

    def _join_threads(self):
        for thread in self.threads:
            thread.join()

    def run(self):
        self._start_threads()
        self._join_threads()

class ScanThread(threading.Thread):
    def __init__(self, squeue, dqueue):
        self.squeue = squeue
        self.dqueue = dqueue
        super(ScanThread, self).__init__()

    def run(self):
        while not self.squeue.empty():
            try:
                target = self.squeue.get(block=False)
                # do the actual work then put the result in dqueue
            except queue.Empty:
                continue

# how many threads/processes
process_count = 2
thread_count = 10

# load tasks from file or network and fill the queues
squeue = multiprocessing.Queue()
dqueue = multiprocessing.Queue()

# create and start everything
processes = [ScanProcess(squeue, dqueue, thread_count) for _ in range(process_count)]
for process in processes:
    process.start()
for process in processes:
    process.join()

# enjoy the show!
The problem I've got is that the number of threads is currently set manually. I'd like to set it automatically to saturate the network connection while not dropping packets, but I have no idea how to begin implementing that. Could anyone summarize how nmap/zmap do it?
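For reference: nmap adapts its parallelism at runtime with TCP-style congestion control, increasing the number of outstanding probes while responses come back cleanly and cutting it sharply when it detects drops (timeouts). zmap goes the other way: it is stateless and simply transmits at a fixed, user-chosen rate. A minimal sketch of that additive-increase/multiplicative-decrease idea applied to a scanner like this one (the constants are illustrative guesses, not nmap's actual values):

import threading

class AIMDController:
    # Additive-increase/multiplicative-decrease cap on concurrency,
    # loosely modelled on TCP congestion control.

    def __init__(self, initial=10, minimum=1, maximum=500):
        self.cap = initial
        self.minimum = minimum
        self.maximum = maximum
        self._lock = threading.Lock()

    def on_success(self):
        # a probe completed normally: gently probe for more headroom
        with self._lock:
            self.cap = min(self.cap + 1, self.maximum)

    def on_timeout(self):
        # a timeout suggests dropped packets: back off hard
        with self._lock:
            self.cap = max(self.cap // 2, self.minimum)

Each ScanThread would call on_success() or on_timeout() after every connection attempt, and the pool would throttle itself so that no more than cap threads are actively pulling from squeue at once, for example by having threads beyond the current cap sleep instead of fetching new targets.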
Any help is appreciated.

Related

Stopping a Python thread while the queue is not empty

I have some code which I hopefully boiled down to a correct MWE.
My goal is to stop the (multiple) threads if a list within the thread has a specific length.
In contrast to the MWE, the number of iterations needed is not known in advance:
from queue import Queue
from threading import Thread

def is_even(n):
    return n % 2 == 0

class MT(Thread):
    def __init__(self, queue):
        super().__init__()
        self.queue = queue
        self.output = []

    def run(self):
        while len(self.output) < 4:
            task = self.queue.get()
            if is_even(task):
                self.output.append(task)
                self.queue.task_done()
            else:
                self.queue.task_done()
        print(self.output)
        print('done')

queue = Queue(10)
threads = 1
thr = []
for th in range(threads):
    thr.append(MT(queue))
for th in thr:
    th.start()
for i in range(100):
    queue.put(i)
queue.join()
for th in thr:
    th.join()
print('finished')
This code will never reach 'finished'...
To quote the documentation:
Queue.join()
Blocks until all items in the queue have been gotten and processed.
You have placed 100 items in the queue. The thread pulls 4 items, and completes. There are still 96 unprocessed items, and nobody is going to pull them. Therefore, queue.join() never returns.
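One way to make the program terminate is to keep consuming until a stop marker arrives, so that every put() eventually gets a matching task_done(). A minimal sketch (the SENTINEL name and the use of None as the marker are illustrative):

from queue import Queue
from threading import Thread

SENTINEL = None  # marker telling the worker to stop

def is_even(n):
    return n % 2 == 0

class MT(Thread):
    def __init__(self, queue):
        super().__init__()
        self.queue = queue
        self.output = []

    def run(self):
        while True:
            task = self.queue.get()
            self.queue.task_done()
            if task is SENTINEL:
                break   # stop marker seen: leave the loop
            if len(self.output) < 4 and is_even(task):
                self.output.append(task)   # collect up to four even numbers
        print(self.output)

queue = Queue(10)
worker = MT(queue)
worker.start()
for i in range(100):
    queue.put(i)
queue.put(SENTINEL)   # one sentinel per worker thread
queue.join()          # every put() now has a matching task_done()
worker.join()
print('finished')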

Python multiprocessing for dataset preparation

I'm looking for shorter ways to prepare my dataset for a machine-learning task. I found that the multiprocessing library might be helpful. However, because I'm a newbie with multiprocessing, I couldn't find a proper approach.
I first wrote some codes like below:
import gc
from multiprocessing import Process, Queue
from tqdm import tqdm

class DatasetReader:
    def __init__(self):
        self.data_list = Read_Data_from_file
        self.data = []

    def _ready_data(self, ex, idx):
        # Some complex functions that take several minutes
        ...

    def _dataset_creator(self, queue):
        for idx, ex in enumerate(self.data_list):
            queue.put(self._ready_data(ex, idx))

    def _dataset_consumer(self, queue):
        total_mem = 0.0
        t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
        for idx in t:
            ins = queue.get()
            self.data.append(ins)
            gc.collect()

    def _build_dataset(self):
        queue = Queue()
        creator = Process(target=self._dataset_creator, args=(queue,))
        consumer = Process(target=self._dataset_consumer, args=(queue,))
        creator.start()
        consumer.start()
        queue.close()
        queue.join_thread()
        creator.join()
        consumer.join()
However, in my opinion, because _dataset_creator processes the data (via _ready_data) in a serial manner, this would not be helpful for reducing the time consumption.
So I modified the code to spawn one process per datum:
class DatasetReader:
    def __init__(self):
        self.data_list = Read_Data_from_file
        self.data = []

    def _ready_data(self, ex, idx):
        # Some complex functions that take several minutes
        ...

    def _dataset_creator(self, ex, idx, queue):
        queue.put(self._ready_data(ex, idx))

    def _dataset_consumer(self, queue):
        total_mem = 0.0
        t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
        for idx in t:
            ins = queue.get()
            self.data.append(ins)
            gc.collect()

    def _build_dataset(self):
        queue = Queue()
        for idx, ex in enumerate(self.data_list):
            p = Process(target=self._dataset_creator, args=(ex, idx, queue,))
            p.start()
        consumer = Process(target=self._dataset_consumer, args=(queue,))
        consumer.start()
        queue.close()
        queue.join_thread()
        consumer.join()
However, this raises errors:
Process Process-18:
Traceback ~~~
RuntimeError: can't start new thread
Traceback ~~~
OSError: [Errno 12] Cannot allocate memory
Could you help me process this complex data in parallel?
EDIT 1:
Thanks to @tdelaney, I was able to reduce the time consumption by generating self.num_worker processes (16 in my experiment):
def _dataset_creator(self, pid, queue):
    for idx, ex in list(enumerate(self.data_list))[pid::self.num_worker]:
        queue.put(self._ready_data(ex, idx))

def _dataset_consumer(self, queue):
    t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
    for _ in t:
        ins = queue.get()
        self.data[ins['idx']] = ins

def _build_dataset(self):
    queue = Queue()
    procs = []
    for pid in range(self.num_worker):
        p = Process(target=self._dataset_creator, args=(pid, queue,))
        procs.append(p)
        p.start()
    consumer = Process(target=self._dataset_consumer, args=(queue,))
    consumer.start()
    queue.close()
    queue.join_thread()
    for p in procs:
        p.join()
    consumer.join()
I'm trying to sketch out what a solution with a multiprocessing pool would look like. I got rid of the consumer process completely because it looks like the parent process is just waiting anyway (and needs the data eventually), so it can be the consumer. So, I set up a pool and use imap_unordered to handle passing the data to the workers.
I guessed that the data processing doesn't really need the DatasetReader at all and moved it out to its own function. On Windows, either the entire DatasetReader object is serialized to the subprocess (including data you don't want) or the child version of the object is incomplete and may crash when you try to use it.
Either way, changes made to a DatasetReader object in the child processes aren't seen in the parent. This can be unexpected if the parent is dependent on updated state in that object. It's best to strictly limit what happens in subprocesses, in my opinion.
from multiprocessing import Pool, get_start_method, cpu_count
from tqdm import tqdm

# moved out of the class (assuming it is not class dependent) so that
# the entire DatasetReader object isn't pickled and sent to
# the child on spawning systems like Microsoft Windows
def _ready_data(idx_ex):
    idx, ex = idx_ex
    # Some complex functions that take several minutes
    result = complex_functions(ex)
    return (idx, result)

class DatasetReader:
    def __init__(self):
        self.data_list = Read_Data_from_file
        self.data = [None] * len(self.data_list)

    def _ready_data_fork(self, idx):
        # on a forking system, call the worker with the object's own data
        return _ready_data((idx, self.data_list[idx]))

    def run(self):
        t = tqdm(total=self.num_data, desc='Building Dataset ',
                 bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) '
                            '[{elapsed}<{remaining},{rate_fmt}{postfix}]')
        pool = Pool(min(cpu_count(), len(self.data_list)))
        if get_start_method() == 'fork':
            # on a forking system, self.data_list already exists in the
            # child process, so we only pass the index
            result_iter = pool.imap_unordered(self._ready_data_fork,
                                              range(len(self.data_list)),
                                              chunksize=1)
        else:
            # on a spawning system, we need to pass the data itself
            result_iter = pool.imap_unordered(_ready_data,
                                              enumerate(self.data_list),
                                              chunksize=1)
        for idx, result in result_iter:
            t.update(1)
            self.data[idx] = result
        pool.close()
        pool.join()
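Usage would then be roughly reader = DatasetReader() followed by reader.run(). Note that Read_Data_from_file, complex_functions, and self.num_data are placeholders carried over from the question, so the code above is a structural sketch rather than something runnable as-is.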

Why are the threads not released after all work is consumed from a Python Queue?

I use a Queue to provide tasks that threads can work on. After all the work from the Queue is done, I see that the threads are still alive, while I expected them to be released. Here is my code. You can see from the console output that the number of active threads keeps increasing after each batch of tasks (on the same queue). How can I release the threads after a batch of work gets done?
import threading
import time
from Queue import Queue

class ThreadWorker(threading.Thread):
    def __init__(self, task_queue):
        threading.Thread.__init__(self)
        self.task_queue = task_queue

    def run(self):
        while True:
            work = self.task_queue.get()
            # do some work
            # do_work(work)
            time.sleep(0.1)
            self.task_queue.task_done()

def get_batch_work_done(works):
    task_queue = Queue()
    for _ in range(5):
        t = ThreadWorker(task_queue)
        t.setDaemon(True)
        t.start()
    for work in range(works):
        task_queue.put(work)
    task_queue.join()
    print 'get batch work done'
    print 'active threads count is {}'.format(threading.activeCount())

if __name__ == '__main__':
    for work_number in range(3):
        print 'start with {}'.format(work_number)
        get_batch_work_done(work_number)
Do a read with a short timeout in a loop and use the exception handling to terminate:
from Queue import Empty

def run(self):
    try:
        while True:
            work = self.task_queue.get(True, 0.1)
            # do some work
            # do_work(work)
            self.task_queue.task_done()
    except Empty:
        print "goodbye"

Send individual data from one thread to multiple threads in Python using Queue

I want to send data from multiple worker threads to one thread, have that thread process all the data received, and then send data back to the worker threads, where each thread receives different data.
I want all of the threads to run over a long time.
I kind of want to use queues and thought about something like this:
from threading import Thread
import Queue
import time
import random

nr_of_workers = 3

class SyncData(Thread):
    def __init__(self, q_in, q_out):
        Thread.__init__(self)
        self.setDaemon(True)
        self.q_in = q_in
        self.q_out = q_out
        self.start()

    def run(self):
        for j in range(4):
            msg_list = []
            for i in range(nr_of_workers):
                msg_list.append(self.q_in.get())
                self.q_in.task_done()
            for i in range(nr_of_workers):
                addr = msg_list[i][0]
                print "addr ", addr
                message = msg_list[i][1]
                message_out = "message from thread addr " + str(addr) + " " + message
                self.q_out.put((addr, message_out))
            time.sleep(2.0)

class ThreadWorker(Thread):
    def __init__(self, q_in, q_out, addr):
        Thread.__init__(self)
        self.addr = addr
        self.q_in = q_in
        self.q_out = q_out
        self.setDaemon(True)
        self.start()

    def run(self):
        for j in range(4):
            self.q_out.put((self.addr, "hello from thread"))
            recv_msg = self.q_in.get()
            self.q_in.task_done()
            print recv_msg[1]

q_sync2workers = Queue.Queue()
q_workers2sync = Queue.Queue()
sync = SyncData(q_workers2sync, q_sync2workers)
threads = []
for i in range(nr_of_workers):
    threads.append(ThreadWorker(q_sync2workers, q_workers2sync, i))
while True:
    pass
where each ThreadWorker sends data over one queue and receives data over a different queue. The only problem is that I want the SyncData object to send different data to different ThreadWorkers, i.e. address each item sent over the queue with an id, so that each ThreadWorker only gets the queue elements meant for that specific ThreadWorker.
I don't really want to have one queue for each ThreadWorker, because my real application will be adding and removing ThreadWorkers during runtime.
Any tips on a good strategy, either using Queues or something closer to best practice?
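One common pattern is a dispatcher that keeps a dict mapping each worker's address to its own small outbound queue. It technically still means one queue per worker, but because the queues live in a dict, workers can be registered and unregistered at runtime, which addresses the dynamic add/remove concern. A minimal sketch (Python 3 module names; Dispatcher and its method names are illustrative):

import queue
import threading

class Dispatcher:
    # Routes (addr, message) pairs to per-worker outbound queues.
    # Workers may register and unregister while the program runs.

    def __init__(self):
        self._outboxes = {}
        self._lock = threading.Lock()

    def register(self, addr):
        # called by a worker on startup; returns its private queue
        with self._lock:
            self._outboxes[addr] = queue.Queue()
            return self._outboxes[addr]

    def unregister(self, addr):
        # called by a worker on shutdown
        with self._lock:
            self._outboxes.pop(addr, None)

    def send(self, addr, message):
        # called by the sync thread to address one specific worker
        with self._lock:
            outbox = self._outboxes.get(addr)
        if outbox is not None:
            outbox.put(message)

Each ThreadWorker would call register(self.addr) when it starts, read only from the queue it got back, and call unregister(self.addr) when it exits; SyncData keeps the single inbound queue exactly as before and uses send() to reply to individual workers.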

Why does my Python multiprocessing code run very slowly?

# multi-processes
from multiprocessing import Process, Queue

class Worker(object):
    def __init__(self, queue):
        self.queue = queue
        self.process_num = 10   # <------------ 10 processes
        self.count = 0

    def start(self):
        for i in range(self.process_num):
            p = Process(target=self.run)
            p.start()
            p.join()

    def run(self):
        while True:
            self.count += 1
            user = self.queue.get()
            # do something not so fast, like time.sleep(1)
            print self.count
            if self.queue.empty():
                break
I use Worker(queue).start() to start the program, but the output is not as fast as I expected (it seems only one process is running).
Is there any problem with my code?
Yes, you're only running one process at a time: you're waiting for each process to terminate before starting the next.
def start(self):
    for i in range(self.process_num):
        p = Process(target=self.run)
        p.start()   # <-- starts a new process
        p.join()    # <-- waits for the process to terminate
In other words, you're starting 10 processes, but the second one won't start until the first one terminates, and so on.
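The immediate fix is to start all the processes first and only join them afterwards, for example (a sketch against the Worker class above):

def start(self):
    procs = [Process(target=self.run) for _ in range(self.process_num)]
    for p in procs:
        p.start()   # launch all workers before waiting on any of them
    for p in procs:
        p.join()    # now wait for all of them to finish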
For what you're trying to do, it may be better not to manage Process objects manually and to use a process Pool instead.
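For example, a minimal sketch of the Pool approach (Python 3 here; handle() stands in for the real per-user work):

from multiprocessing import Pool
import time

def handle(user):
    time.sleep(1)   # stand-in for the slow per-user work
    return user

if __name__ == '__main__':
    # ten workers run concurrently instead of one after another
    with Pool(processes=10) as pool:
        for result in pool.imap_unordered(handle, range(100)):
            print(result)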
