Storing result of a thread or process with concurrent.futures - python

I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os


class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        print("Testing the " + sub + " subnet.")
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        print("Tested " + s_ip_addr)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)
        with open('tested.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerow(this_list)
        file.close()


if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
    f.close()
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    print(results)
    t1 = time.time()
    print(t1 - t0)
    with open("tested.csv", 'r', newline='') as f_input:
        csv_input = csv.DictReader(f_input)
        data = sorted(csv_input, key=lambda row: (row['IP_Addr']))
    f_input.close()
    with open("sorted.csv", 'w', newline='') as f_output:
        csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
        csv_output.writeheader()
        csv_output.writerows(data)
    f_output.close()
    if os.path.exists("tested.csv"):
        os.remove("tested.csv")
    else:
        print("The file does not exist")
I'm using the class to try to create some kind of shared location that each method can see. I have a feeling the instance attribute tested_list is not actually shared between the workers: each worker ends up with its own copy of tested_list rather than appending to a single shared list.
The test_ports module is just a wrapper for some socket operations.
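To make the suspicion concrete, here is a minimal sketch (illustrative names only, not the real checker): a ProcessPoolExecutor ships a pickled copy of the object to each worker process, so appends made there never reach the parent's list, while a ThreadPoolExecutor shares the one instance.
import concurrent.futures

class Collector:
    def __init__(self):
        self.items = []

    def work(self, n):
        # with processes this appends to a copy of the object in the worker
        self.items.append(n)

if __name__ == '__main__':
    c = Collector()
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        list(executor.map(c.work, range(4)))
    print(c.items)   # prints [] - the appends happened in the child processes

    c = Collector()
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        list(executor.map(c.work, range(4)))
    print(c.items)   # prints the four appended values - threads share memory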

I figured out that there is an important difference between concurrent.futures.ProcessPoolExecutor and concurrent.futures.ThreadPoolExecutor: worker processes each operate on their own copy of the object, while threads all share the same instance. ThreadPoolExecutor does exactly what I wanted, preserving data between threads. The new code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time


class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)


if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    t1 = time.time()
    print(t1 - t0)
    sorted_list = (sorted(results, key=lambda x: x[0]))
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(sorted_list)
    f.close()
The end result is a sorted list of opened and closed ssh and rdp ports.
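A variation that would also survive a switch back to ProcessPoolExecutor (for example if the port checks ever become CPU-bound) is to have the worker return its row and collect the return values of executor.map instead of appending to shared state. A rough sketch along the same lines as the code above; test_ports and the row layout are assumed unchanged:
import concurrent.futures
import ipaddress
import test_ports

def ip_search(ip):
    test = test_ports.TestPort()
    s_ip_addr = str(ip)
    return [s_ip_addr, test.test_ssh(s_ip_addr), test.test_rdp(s_ip_addr)]

def subnet_search(sub):
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        return list(executor.map(ip_search, ipaddress.IPv4Network(sub)))

def setup(l_subnets):
    rows = []
    # results come back as return values, not through shared mutable state,
    # so a ProcessPoolExecutor works here too (call this under
    # if __name__ == '__main__': just like the original)
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        for subnet_rows in executor.map(subnet_search, l_subnets):
            rows.extend(subnet_rows)
    return rows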

Related

multiprocessing slower than loop

I'm trying to write a huge amount of data to a csv file. With the normal method it writes about 50 rows per second, but with multiprocessing it drops to about 5 rows per second.
I also added sys.setrecursionlimit(25000), because without it I get an error.
I can feel I'm not doing this right. What is the right way?
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import cchardet
from multiprocessing import Pool
import sys
import time

sys.setrecursionlimit(25000)

csvfileWrite = open("comments.csv", 'a+', newline='', encoding='utf-8')  # declared as a global variable
writer = csv.writer(csvfileWrite, delimiter=';', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)  # declared as a global variable

def kacYildiz(div):  # This function returns a number 0 to 5. Not important.
    yildizSayisi = 0
    yildizYeri = div.find("div", attrs={"class": "RatingPointer-module-1OKF3"})
    yildizlar = yildizYeri.find_all("svg")
    for yildiz in yildizlar:
        sonuc = yildiz.find("path").get("fill")
        if(sonuc == "#f28b00"):
            yildizSayisi += 1
    return yildizSayisi

def takeText(div):
    comment = div.find("span", attrs={"itemprop": "description"}).text
    return comment

def yorumSayfaSayisi(row):  # Returns how many pages the site's comment section has. Not important.
    yorumKismi = "-yorumlari?"
    adres = row[0] + yorumKismi
    r = requests_session.get(adres)
    soup = BeautifulSoup(r.text, "lxml")
    sayfaS = soup.find("ul", attrs={"class": "PaginationBar-module-3qhrm"})
    sayi = sayfaS.find_all("li")[-1].text
    return sayi

def writeToCsv(comments):  # writing comments to the csv file
    global csvfileWrite
    global writer
    textToWrite = takeText(comments)
    writer.writerow([kacYildiz(comments), textToWrite])

if __name__ == '__main__':
    pageNumber = 1
    requests_session = requests.Session()
    comments = list()
    csvfile = open('adresler.csv', newline='')
    reader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for row in reader:
        rowNumber = yorumSayfaSayisi(row)
        for i in range(1, int(rowNumber)):
            comments.clear()
            commetAdress = "-yorumlari?sayfa={}".format(i)
            adress = row[0] + commetAdress
            r = requests_session.get(adress)
            soup = BeautifulSoup(r.text, "lxml")
            page = soup.find_all("div", attrs={"class": "ReviewCard-module-3Y36S"})
            for comment in page:
                comments.append(comment)
            p = Pool(10)
            start = time.process_time()
            p.map(writeToCsv, comments)
            p.terminate()
            p.join()
Try this approach using ThreadPool instead:
from multiprocessing.pool import ThreadPool

def csvYaz(yorumlar):
    global csvfileYaz
    global yazici
    yazi = yorumAl(yorumlar)
    yazici.writerow([kacYildiz(yorumlar), yazi])

# ------main-----
for yorum in yorumSayfasi:
    yorumlar.append(yorum)
threads = ThreadPool(10).map(csvYaz, yorumlar)
for zz in threads:
    print(zz)
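A further variation worth trying (only a sketch; the CSS class and the kacYildiz, takeText and yorumSayfaSayisi helpers are taken from the question and assumed to work): create the pool once instead of once per page, parallelize only the network fetches, and keep every CSV write in the main thread so workers never share a file handle.
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
import requests
import csv

def fetch_page(adress):
    # network I/O releases the GIL, so threads overlap well here
    r = requests.get(adress)
    soup = BeautifulSoup(r.text, "lxml")
    return soup.find_all("div", attrs={"class": "ReviewCard-module-3Y36S"})

if __name__ == '__main__':
    requests_session = requests.Session()   # used by yorumSayfaSayisi from the question
    pool = ThreadPool(10)                    # one pool for the whole run, not one per page
    with open("comments.csv", 'a+', newline='', encoding='utf-8') as out, \
         open('adresler.csv', newline='') as csvfile:
        writer = csv.writer(out, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            pages = int(yorumSayfaSayisi(row))
            adresses = [row[0] + "-yorumlari?sayfa={}".format(i) for i in range(1, pages)]
            # fetch the pages in parallel, write rows only in the main thread
            for page in pool.imap_unordered(fetch_page, adresses):
                for comment in page:
                    writer.writerow([kacYildiz(comment), takeText(comment)])
    pool.close()
    pool.join()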

How to keep writing values into a spreadsheet in a user-controlled loop in Python?

I'm trying to write values into a .csv file every second until the user interrupts the program by pressing a key. I'm able to achieve this with a finite loop.
I tried using raw_input but the program would only write the last value before interruption. What should I change in my code?
Here's the code that works for a finite loop:
import time
import csv


class Timestamp:
    def __init__(self):
        my_file = open('test_csv.csv', 'w+')
        with my_file:
            new_file = csv.writer(my_file)
            for val in range(0, 20):
                with open('test_csv.csv', 'a') as f:
                    date_now = time.strftime('%d/%m/%y')
                    time_now = time.strftime('%H:%M:%S')
                    to_write = [date_now, time_now]
                    csv_file = csv.writer(f)
                    csv_file.writerow(to_write)
                time.sleep(1)


Timestamp()
You can use threads.
import time
import csv
from threading import Thread


def write_loop():
    my_file = open('test_csv.csv', 'w+')
    with my_file:
        new_file = csv.writer(my_file)
        for val in range(0, 20):
            with open('test_csv.csv', 'a') as f:
                date_now = time.strftime('%d/%m/%y')
                time_now = time.strftime('%H:%M:%S')
                to_write = [date_now, time_now]
                csv_file = csv.writer(f)
                csv_file.writerow(to_write)
            time.sleep(1)


t = Thread(target=write_loop)
t.daemon = True
t.start()
input("Press any key to stop")
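If the goal is to keep writing until the user asks it to stop, rather than for a fixed 20 iterations, one option (a sketch, not the only way) is to signal the writer thread with a threading.Event so it can finish its current row and exit cleanly:
import csv
import time
import threading

stop_event = threading.Event()

def write_loop():
    with open('test_csv.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        while not stop_event.is_set():
            writer.writerow([time.strftime('%d/%m/%y'), time.strftime('%H:%M:%S')])
            f.flush()          # make each row visible on disk right away
            time.sleep(1)

t = threading.Thread(target=write_loop)
t.start()
input("Press Enter to stop\n")
stop_event.set()               # ask the loop to finish
t.join()                       # wait for the last row to be written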

Python - multiprocessing while writing to a single result file

I am really new to the multiprocessing package and I am failing to get the task done.
I have lots of calculations to do on a list of objects.
The results I need to write down are saved in those objects, too.
The results should be written to a single file as soon as a process finishes its calculations (the way I at least got it working waits until all calculations are done).
import multiprocessing
import time
import csv


class simpl():
    def __init__(self, name, val):
        self.name = name
        self.val = val


def pot_val(inpt):
    print("Process %s\t ..." % (inpt.name))
    old_v = inpt.val
    inpt.val *= inpt.val
    if old_v != 8:
        time.sleep(old_v)
    print("Process %s\t ... Done" % (inpt.name))


def mp_worker(inpt):
    pot_val(inpt)
    return inpt


def mp_handler(data_list):
    p = multiprocessing.Pool(4)
    with open('results.csv', 'a') as f:
        res = p.map_async(mp_worker, data_list)
        results = (res.get())
        for result in results:
            print("Writing result for ", result.name)
            writer = csv.writer(f, lineterminator='\n', delimiter=";")
            writer.writerow((result.name, result.val))


if __name__ == '__main__':
    data = []
    counter = 0
    for i in range(10):
        data.append(simpl("name" + str(counter), counter))
        counter += 1
    for d in data:
        print(d.name, d.val)
    mp_handler(data)
How can I write the results from the calculations to one single file as they finish, without having to wait for all processes to be done?
You can use imap_unordered
def mp_handler(data_list):
    p = multiprocessing.Pool(4)
    with open('results.csv', 'a') as f:
        writer = csv.writer(f, lineterminator='\n', delimiter=";")
        for result in p.imap_unordered(mp_worker, data_list):
            print("Writing result for ", result.name)
            writer.writerow((result.name, result.val))
With Python 3.3+ it's better to do:
def mp_handler(data_list):
    with multiprocessing.Pool(4) as p:
        with open('results.csv', 'a') as f:
            writer = csv.writer(f, lineterminator='\n', delimiter=";")
            for result in p.imap_unordered(mp_worker, data_list):
                print("Writing result for ", result.name)
                writer.writerow((result.name, result.val))
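If you prefer callbacks over iterating, apply_async also works; the callback is invoked in the parent process as each result arrives, so it is safe to write the file there. A sketch reusing mp_worker from above:
def mp_handler(data_list):
    with open('results.csv', 'a') as f:
        writer = csv.writer(f, lineterminator='\n', delimiter=";")

        def write_result(result):
            # callbacks execute in the parent process as results come in
            print("Writing result for ", result.name)
            writer.writerow((result.name, result.val))

        with multiprocessing.Pool(4) as p:
            for item in data_list:
                p.apply_async(mp_worker, (item,), callback=write_result)
            p.close()
            p.join()   # all tasks and their callbacks finish before we continue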

Can I accelerate python file method `read()` by parallelism?

I have a lot of files (300-500) to read, and I want to accelerate this task.
Ideally it would look like this:
from multiprocessing import Pool
import os
import _io

filelist = map(open, os.listdir())

if __name__ == '__main__':
    with Pool() as pool:
        a = pool.map(_io.TextIOWrapper.read, filelist)
Of course, I got an error:
TypeError: cannot serialize '_io.TextIOWrapper' object
The question is: can I accelerate the I/O with parallelism? If yes, how?
UPDATE conclusion:
I found a way to parallelize it and have tested my code.
I used 22 files, totalling 63.2 MB.
from multiprocessing import Pool
import os
import _io


def my_read(file_name):
    with open(file_name) as f:
        return f.read()


def mul():
    with Pool() as pool:
        a = pool.map(my_read, os.listdir())


def single():
    a = []
    for i in os.listdir():
        with open(i) as f:
            r = f.read()
            a.append(r)


if __name__ == '__main__':
    mul()
    # single()
Sadly, single() takes 0.4 s while mul() takes 0.8 s.
UPDATE 1:
Some people have said this is an I/O-bound task, so I cannot improve it with parallelism. However, I found these words in the Python docs:
However, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
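Based on that note, a thread-based version of the benchmark above is cheap to try (a sketch; whether it beats the serial version depends on the disk, the OS cache and the file sizes):
from concurrent.futures import ThreadPoolExecutor
import os

def my_read(file_name):
    with open(file_name) as f:
        return f.read()

def mul_threads():
    # threads avoid the process start-up and pickling overhead,
    # which is most of what mul() pays for on an I/O-bound job
    with ThreadPoolExecutor(max_workers=8) as pool:
        return list(pool.map(my_read, os.listdir()))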
The full code is here:
My purpose is to convert EPUB files to txt.
I have parallelized char2text and now I want to accelerate readall:
import zipfile
from multiprocessing import Pool
import bs4


def char2text(i):
    soup = bs4.BeautifulSoup(i)
    chapter = soup.body.getText().splitlines()
    chapter = "\n".join(chapter).strip() + "\n\n"
    return chapter


class Epub(zipfile.ZipFile):
    def __init__(self, file, mode='r', compression=0, allowZip64=False):
        zipfile.ZipFile.__init__(self, file, mode, compression, allowZip64)
        if mode == 'r':
            self.opf = self.read('OEBPS/content.opf').decode()
            opf_soup = bs4.BeautifulSoup(self.opf)
            self.author = opf_soup.find(name='dc:creator').getText()
            self.title = opf_soup.find(name='dc:title').getText()
            try:
                self.description = opf_soup.find(name='dc:description').getText()
            except:
                self.description = ''
            try:
                self.chrpattern = opf_soup.find(name='dc:chrpattern').getText()
            except:
                self.chrpattern = ''
            self.cover = self.read('OEBPS/images/cover.jpg')
        elif mode == 'w':
            pass

    def get_text(self):
        self.tempread = ""
        charlist = self.readall(self.namelist())
        with Pool() as pool:
            txtlist = pool.map(char2text, charlist)
        self.tempread = "".join(txtlist)
        return self.tempread

    def readall(self, namelist):
        charlist = []
        for i in namelist:
            if i.startswith('OEBPS/') and i.endswith('.xhtml'):
                r = self.read(i).decode()
                charlist.append(r)
        return charlist

    def epub2txt(self):
        tempread = self.get_text()
        with open(self.title + '.txt', 'w', encoding='utf8') as f:
            f.write(tempread)


if __name__ == "__main__":
    e = Epub("assz.epub")
    import cProfile
    cProfile.run("e.epub2txt()")
Did you try something like:
from multiprocessing import Pool
import os
import _io


def my_read(file_name):
    with open(file_name) as f:
        return _io.TextIOWrapper.read(f)


if __name__ == '__main__':
    with Pool() as pool:
        a = pool.map(my_read, os.listdir('some_dir'))
It sounds more logical to me to open/close the file in the sub-process, and strings are easily serializable.
For your readall method, try:
def readall(self, namelist):
    filter_func = lambda i: i.startswith('OEBPS/') and i.endswith('.xhtml')
    read_fun = lambda i: self.read(i).decode()
    with Pool() as pool:
        a = pool.map(read_fun, filter(filter_func, namelist))
    return a
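One caveat about the readall suggestion above: a multiprocessing Pool has to pickle the callable it maps, and neither lambdas nor the bound method self.read can be pickled, so it is likely to fail before doing any work. Since the method is I/O-bound anyway, a thread pool avoids the pickling entirely; a sketch, assuming the Epub class from the question (ZipFile serializes access to the underlying file internally, so the gain is mainly in sidestepping the pickling error):
from multiprocessing.pool import ThreadPool

def readall(self, namelist):
    wanted = [i for i in namelist if i.startswith('OEBPS/') and i.endswith('.xhtml')]
    # threads share memory, so self.read and the lambda need no pickling
    with ThreadPool() as pool:
        return pool.map(lambda name: self.read(name).decode(), wanted)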

gevent pool getting stuck

I am a gevent newbie, but I think I got it working, in a limited sense. Basically, with a pool of 1 the code proceeds, while with larger pools it gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlets finishing, but not more). What is going wrong? Spawn? Join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, csv, cStringIO, codecs, pickle
import sys
from selenium import webdriver
from time import sleep

import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')

HOSTNAME = 'http://kozbeszerzes.ceu.hu'

driver = webdriver.Chrome()
results = set()

for y in xrange(1998, 2015):
    for p in xrange(0, 9999):
        driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y, p))
        sleep(1)
        if len(driver.find_elements_by_class_name('result')) == 0:
            break
        for e in driver.find_elements_by_class_name('result'):
            link = e.find_element_by_tag_name('a')
            r = link.get_attribute('href').encode('ascii', 'ignore')
            if r[:34] == 'http://kozbeszerzes.ceu.hu/tender/':
                results.add(r)
driver.quit()

with open('list_of_urls', 'wb') as f:
    pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
#    results = pickle.load(f)

entities = set()

header = ('TenderID', 'RequestorName', 'URL', 'Year', 'RequestorID', 'Subject', 'SourceURL', 'EstValue', 'Currency', 'DecisionDate', 'Value', 'VAT')

# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)

f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
        print ('%s succeeded' % res)
        for requestor in tender.findall('requestor'):
            entities.add(HOSTNAME + requestor.get('url'))
        id = tender.get('id')
        reqname = tender.get('requestor')
        url = tender.get('url')
        year = tender.get('year')
        reqid = tender.get('requestor_id')
        subject = tender.get('subject')
        source = tender.get('source_url')
        estval = tender.get('estimated_value')
        for part in tender.findall('./parts/part'):
            winner = part.find('winner')
            entities.add(HOSTNAME + winner.get('url'))
            curr = part.find('currency').text
            date = part.find('decisionDate').text
            value = part.find('value').text
            vat = part.find('vat').text
            row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (res, ex))

jobs = [p.spawn(workres, res) for res in results]
p.join()

f.close()

with open('entities', 'wb') as f:
    pickle.dump(entities, f)

header = ['ID', 'URL', 'Name', 'NominalCity', 'City', 'ZIP', 'Address']

f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workent(ent):
    try:
        ent = parse(urlopen(ent)).getroot()
        print ('%s succeeded' % ent)
        id = ent.get('id')
        url = ent.get('url')
        name = ent.get('name')
        nominalcity = ent.get('city')
        cities = ent.findall('./resolved_addresses/whitelistAddress/city')
        zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
        streets = ent.findall('./resolved_addresses/whitelistAddress/street')
        for a in xrange(0, len(cities)):
            city = cities[a].text
            zip = zips[a].text
            street = streets[a].text
            row = id, url, name, nominalcity, city, zip, street
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (ent, ex))

jobs = [p.spawn(workent, ent) for ent in entities]
p.join()

f.close()
I see several mistakes here.

- Use gevent.sleep(), not time.sleep(), which is blocking.
- Your variable names are too short. You could add descriptions of what each part of the code is supposed to do; for example, the variable 'p' is used twice.
- There are multiple URL fetches, some using urlopen and some using the selenium driver, which is confusing.
- I would use queues between the different workers and have just one worker do the writerow calls and deal with the file access; right now you have multiple greenlets accessing the same file (see the sketch after this list).
- Use fewer list comprehensions; just write out the loops.
- I would put the try/except in workres only around the parse(urlopen()) call; maybe more exceptions are happening there which you currently don't see.
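A sketch of that single-writer pattern with a gevent queue, reusing the writer and results objects from the question (only the plumbing is shown; the parsing stays as it was):
import gevent
from gevent import pool
from gevent.queue import Queue

rows = Queue()

def writer_greenlet(writer):
    # the only greenlet that ever touches the CSV file
    for row in rows:            # iteration ends when StopIteration is queued
        writer.writerow(row)

def workres(res):
    # fetch and parse the tender exactly as in the question, then, instead
    # of calling writer.writerow(row) from the worker, do:
    #     rows.put(row)
    pass

p = pool.Pool(10)
w = gevent.spawn(writer_greenlet, writer)
jobs = [p.spawn(workres, res) for res in results]
p.join()                        # wait for all the workers
rows.put(StopIteration)         # tell the writer that nothing more is coming
w.join()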
more tips for gevent
