I am a gevent newbie, but I think I got it working, at least in a limited sense. With a pool size of 1 the code runs to completion, while with larger pools it gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlets finish, but no more). What is going wrong? The spawn? The join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so that is probably not the issue.
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, sys, csv, cStringIO, codecs, pickle  # sys is needed for the sys.exc_info() calls below
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')
HOSTNAME = 'http://kozbeszerzes.ceu.hu'
driver = webdriver.Chrome()
results = set()
for y in xrange(1998,2015):
for p in xrange(0,9999):
driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y,p))
sleep(1)
if len(driver.find_elements_by_class_name('result'))==0:
break
for e in driver.find_elements_by_class_name('result'):
link = e.find_element_by_tag_name('a')
r = link.get_attribute('href').encode('ascii', 'ignore')
if r[:34]== 'http://kozbeszerzes.ceu.hu/tender/':
results.add(r)
driver.quit()
with open('list_of_urls', 'wb') as f:
pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
# results = pickle.load(f)
entities = set()
header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')
# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)
f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workres(res):
try:
tender = parse(urlopen(res)).getroot()
print ('%s succeeded' % res)
for requestor in tender.findall('requestor'):
entities.add(HOSTNAME + requestor.get('url'))
id = tender.get('id')
reqname = tender.get('requestor')
url = tender.get('url')
year = tender.get('year')
reqid = tender.get('requestor_id')
subject = tender.get('subject')
source = tender.get('source_url')
estval = tender.get('estimated_value')
for part in tender.findall('./parts/part'):
winner = part.find('winner')
entities.add(HOSTNAME + winner.get('url'))
curr = part.find('currency').text
date = part.find('decisionDate').text
value = part.find('value').text
vat = part.find('vat').text
row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (res, ex))
jobs = [p.spawn(workres, res) for res in results]
p.join()
f.close()
with open('entities', 'wb') as f:
pickle.dump(entities, f)
header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workent(ent):
try:
ent = parse(urlopen(ent)).getroot()
print ('%s succeeded' % ent)
id = ent.get('id')
url = ent.get('url')
name = ent.get('name')
nominalcity = ent.get('city')
cities = ent.findall('./resolved_addresses/whitelistAddress/city')
zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
streets = ent.findall('./resolved_addresses/whitelistAddress/street')
for a in xrange(0,len(cities)):
city = cities[a].text
zip = zips[a].text
street = streets[a].text
row = id, url, name, nominalcity, city, zip, street
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (ent, ex))
jobs = [p.spawn(workent, ent) for ent in entities]
p.join()
f.close()
I see several mistakes here.
You call time.sleep(), which blocks, instead of gevent.sleep(), which yields to other greenlets.
Your variable names are too short, and it would help to add comments describing what each part of the code is supposed to do; for example, the name 'p' is reused for both batches of jobs.
Fetching URLs with both urlopen and the selenium driver is confusing.
I would put queues between the different workers and have a single worker do the writerow calls and own the file; right now multiple greenlets access the same file (a minimal sketch of that pattern is below).
Use fewer list comprehensions and just write out the loops.
I would put the try/except in 'workres' only around the parse(urlopen()) call; there may be more exceptions happening that you currently never see.
The gevent documentation has more tips.
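For the queue idea, here is a minimal sketch, not your exact code: parse, urlopen, writer and results are the names from your script, the workers only fetch and parse, and a single writer greenlet owns the CSV file.
import gevent
from gevent import monkey, pool
from gevent.queue import Queue
monkey.patch_all()

rows = Queue()
STOP = object()  # sentinel telling the writer greenlet it can stop

def writer_loop(csv_writer):
    # the only greenlet that ever touches the output file
    while True:
        row = rows.get()
        if row is STOP:
            break
        csv_writer.writerow(row)

def workres(url):
    # fetch and parse only; push finished rows onto the queue
    tender = parse(urlopen(url)).getroot()
    for part in tender.findall('./parts/part'):
        rows.put((tender.get('id'), part.find('value').text))

workers = pool.Pool(10)
writer_greenlet = gevent.spawn(writer_loop, writer)
for res in results:
    workers.spawn(workres, res)
workers.join()    # wait for all fetch/parse greenlets to finish
rows.put(STOP)    # then tell the writer to drain the queue and exit
writer_greenlet.join()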
I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
print("Testing the " + sub + " subnet.")
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
print("Tested " + s_ip_addr)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
with open('tested.csv', 'a') as file:
writer = csv.writer(file)
writer.writerow(this_list)
file.close()
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
f.close()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
print(results)
t1 = time.time()
print(t1-t0)
with open("tested.csv", 'r',newline='') as f_input:
csv_input = csv.DictReader(f_input)
data = sorted(csv_input, key=lambda row: (row['IP_Addr']))
f_input.close()
with open("sorted.csv", 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
csv_output.writeheader()
csv_output.writerows(data)
f_output.close()
if os.path.exists("tested.csv"):
os.remove("tested.csv")
else:
print("The file does not exist")
I'm using the class to try to create some kind of shared location that each method can see. I have a feeling the instance's tested_list is not actually shared between the workers; rather, each worker seems to be seeing its own copy of tested_list and not one shared list.
The test_ports module is just a wrapper for some socket operations.
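The real test_ports module is not shown in the question; for readers who want to run the snippets, a hypothetical stand-in might look like the sketch below (the port numbers are my assumptions; the question's CSV header labels the second check "RDP(443)").
import socket

class TestPort:
    """Hypothetical stand-in for the test_ports wrapper described above."""

    def _is_open(self, ip, port, timeout=1.0):
        # True if a TCP connection to ip:port succeeds within the timeout
        try:
            with socket.create_connection((ip, port), timeout=timeout):
                return True
        except OSError:
            return False

    def test_ssh(self, ip):
        return self._is_open(ip, 22)    # assumed SSH port

    def test_rdp(self, ip):
        return self._is_open(ip, 3389)  # assumed RDP port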
I figured out that there is an important difference between concurrent.futures.ProcessPoolExecutor and concurrent.futures.ThreadPoolExecutor: worker processes do not share memory with the parent, while threads do. ThreadPoolExecutor does exactly what I wanted, preserving data between workers. The new code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
t1 = time.time()
print(t1-t0)
sorted_list = (sorted(results, key=lambda x: x[0]))
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
write.writerows(sorted_list)
f.close()
The end result is a sorted list of open and closed SSH and RDP ports.
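As a side note, here is a minimal demonstration (my own sketch with a made-up Collector class, not part of the script above) of the difference that bit the first version: with ProcessPoolExecutor each worker process appends to its own copy of the object, so the parent's list stays empty, while ThreadPoolExecutor workers share the one object in memory.
import concurrent.futures

class Collector:
    def __init__(self):
        self.items = []

    def work(self, n):
        # append in whatever worker runs this task
        self.items.append(n)

def run(executor_cls):
    c = Collector()
    with executor_cls(max_workers=4) as ex:
        list(ex.map(c.work, range(8)))  # force all tasks to complete
    return len(c.items)

if __name__ == '__main__':
    print(run(concurrent.futures.ThreadPoolExecutor))   # 8: threads share c.items
    print(run(concurrent.futures.ProcessPoolExecutor))  # 0: appends happen in child processes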
I am a beginner to Python and am trying to add a few lines of code to convert JSON to CSV and back to JSON. I have thousands of files (around 300 MB each) to convert and process. With the current program (using 1 CPU), I am not able to use the server's 16 CPUs, and I need suggestions on how to adapt the program for multiprocessing. Below is my code, using Python 3.7.
import json
import csv
import os
os.chdir('/stagingData/Scripts/test')
for JsonFile in os.listdir(os.getcwd()):
PartialFileName = JsonFile.split('.')[0]
j = 1
with open(PartialFileName +".csv", 'w', newline='') as Output_File:
with open(JsonFile) as fileHandle:
i = 1
for Line in fileHandle:
try:
data = json.loads(Line, parse_float=str)
except:
print("Can't load line {}".format(i))
if i == 1:
header = data.keys()
output = csv.writer(Output_File)
output.writerow(header) #Writes header row
i += 1
output.writerow(data.values()) #writes values row
j += 1
I would appreciate suggestions on the multiprocessing logic.
If you have a single big file that you want to process more effectively I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
source_big_file = Path('/path/to/file')
def chunk_file_by_line(source_filepath: Path, chunk_line_size: int = 10_000):
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    with source_filepath.open('r', encoding='utf8') as big:
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_line_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    # the previous chunk is complete; close it and hand it to the caller
                    intermediate_file_handlers[last_chunk_filepath].close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # close and output the final chunk
    if last_chunk_filepath:
        intermediate_file_handlers[last_chunk_filepath].close()
        yield last_chunk_filepath


def json_to_csv(json_filepath: Path) -> Path:
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    with csv_filepath.open('w', encoding='utf8', newline='') as csv_out, \
            json_filepath.open('r', encoding='utf8') as json_in:
        dwriter = None
        for json_line in json_in:
            data = json.loads(json_line)
            if dwriter is None:
                # create the writer (and header row) from the first record's keys
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
            dwriter.writerow(data)
    return csv_filepath


if __name__ == '__main__':  # guard so the worker processes can re-import this module safely
    with ProcessPoolExecutor() as pool:
        futures = []
        for chunk_filepath in chunk_file_by_line(source_big_file):
            future = pool.submit(json_to_csv, chunk_filepath)
            futures.append(future)
        # wait for all to finish
        for future in futures:
            csv_filepath = future.result(timeout=None)  # waits until complete
            print(f'conversion complete> csv filepath: {csv_filepath}')
Since you have many files, the simplest multiprocessing example from the documentation should work for you. https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
import os
from multiprocessing import Pool

def f(JsonFile):
    pass  # open the input and output files and convert, as in the question

if __name__ == '__main__':
    with Pool(16) as p:
        p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
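A hypothetical variant of the snippet above fed by os.scandir(): scandir yields DirEntry objects, so the worker receives entry.path; the directory and the .json filter are assumptions.
import os
from multiprocessing import Pool

def convert(path):
    pass  # open the JSON file at `path`, convert it, and write the CSV, as in the question

if __name__ == '__main__':
    with Pool(16) as p:
        json_paths = (entry.path for entry in os.scandir('/stagingData/Scripts/test')
                      if entry.name.endswith('.json'))
        p.map(convert, json_paths)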
I have to parse 30 days of access logs from the server, keyed by client IP and accessed host, and I need to find the top 10 accessed sites. The log files are around 10-20 GB in total, which takes a long time with single-threaded execution of the script. Initially I wrote a script that worked fine, but it takes a lot of time because of the log size. Then I tried to use the multiprocessing library for parallel processing, but it is not working: it seems my use of multiprocessing repeats the same task instead of processing in parallel. I am not sure what is wrong in the code. Can someone please help? Thank you so much in advance.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing
def ipauth (slave_list, static_ip_list):
file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
count = 1
while (count <=30):
Nth_days = datetime.now() - timedelta(days=count)
date = Nth_days.strftime("%Y%m%d")
yr_month = Nth_days.strftime("%Y/%m")
file_name = 'local2' + '.' + date
with open(slave_list) as file:
for line in file:
string = line.split()
slave = string[0]
proxy = string[1]
log_path = "/LOGS/%s/%s" %(slave, yr_month)
try:
os.path.exists(log_path)
file_read = os.path.join(log_path, file_name)
with open(file_read) as log:
for log_line in log:
log_line = log_line.strip()
if proxy in log_line:
file_record.write(log_line + '\n')
except IOError:
pass
count = count + 1
file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
with open(static_ip_list) as ip:
for line in ip:
with open('/home/access/top10_domain_accessed/logs/combined_log.txt','r') as f:
for content in f:
log_split = content.split()
client_ip = log_split[7]
if client_ip in line:
content = str(content).strip()
file_log.write(content + '\n')
return
if __name__ == '__main__':
slave_list = sys.argv[1]
static_ip_list = sys.argv[2]
jobs = []
for i in range(5):
p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
jobs.append(p)
p.start()
p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing
def chunk_of_lines(fp, n):
    # read n lines from the file, then yield them (one possible implementation is below)
    pass

def process(lines, static_ip_list):
    pass  # do the filtering work on one chunk of lines

p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp, 10):
    p.apply_async(process, [f, static_ip_list])
p.close()
p.join()  # wait for all child processes to finish
There are many ways to implement the chunk_of_lines method: you could iterate over the file's lines with a simple for loop, or do something more advanced such as calling fp.read().
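For example, one possible chunk_of_lines implementation (my sketch, not from the answer) uses itertools.islice to pull n lines at a time:
from itertools import islice

def chunk_of_lines(fp, n):
    """Yield successive lists of up to n lines read from the open file fp."""
    while True:
        lines = list(islice(fp, n))
        if not lines:
            break
        yield lines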
I have a list of 30 million strings, and I want to run a DNS query for all of them using Python. I do not understand how this operation can become memory intensive. I would assume that the threads exit after their job is done, and there is also a timeout configured ({'dns_request_timeout': 1}).
Here is a sneak peek of the machine's resources while running the script:
My code is as follows:
# -*- coding: utf-8 -*-
import dns.resolver
import concurrent.futures
from pprint import pprint
import json
bucket = json.load(open('30_million_strings.json','r'))
def _dns_query(target, **kwargs):
global bucket
resolv = dns.resolver.Resolver()
resolv.timeout = kwargs['function']['dns_request_timeout']
try:
resolv.query(target + '.com', kwargs['function']['query_type'])
with open('out.txt', 'a') as f:
f.write(target + '\n')
except Exception:
pass
def run(**kwargs):
global bucket
temp_locals = locals()
pprint({k: v for k, v in temp_locals.items()})
with concurrent.futures.ThreadPoolExecutor(max_workers=kwargs['concurrency']['threads']) as executor:
future_to_element = dict()
for element in bucket:
future = executor.submit(kwargs['function']['name'], element, **kwargs)
future_to_element[future] = element
for future in concurrent.futures.as_completed(future_to_element):
result = future_to_element[future]
run(function={'name': _dns_query, 'dns_request_timeout': 1, 'query_type': 'MX'},
concurrency={'threads': 15})
try this:
import json
import concurrent.futures
import dns.resolver

def sure_ok(future):
    try:
        with open('out.txt', 'a') as f:
            f.write(str(future.result()[0]) + '\n')
    except Exception:
        pass

with concurrent.futures.ThreadPoolExecutor(max_workers=2500) as executor:
    for element in json.load(open('30_million_strings.json', 'r')):
        resolv = dns.resolver.Resolver()
        resolv.timeout = 1
        future = executor.submit(resolv.query, element + '.com', 'MX')
        future.add_done_callback(sure_ok)
Remove the global bucket, as it is redundant and not needed.
Remove the dictionary that keeps references to the 30+ million futures; it is also redundant and holds every future in memory.
Also, you are probably not using a new enough version of concurrent.futures:
https://github.com/python/cpython/commit/5cbca0235b8da07c9454bcaa94f12d59c2df0ad2
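A further option (my suggestion, not part of the answer above) is to submit the lookups in bounded batches so you never hold more than a few thousand futures at once; the batch size and worker count below are arbitrary, and the file names are the ones from the question.
import json
from itertools import islice
import concurrent.futures
import dns.resolver

def lookup(name):
    resolv = dns.resolver.Resolver()
    resolv.timeout = 1
    resolv.query(name + '.com', 'MX')  # raises on failure or timeout
    return name

def batched(iterable, size):
    # yield lists of at most `size` items from the iterable
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

bucket = json.load(open('30_million_strings.json', 'r'))
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor, \
        open('out.txt', 'a') as out:
    for batch in batched(bucket, 10000):
        futures = [executor.submit(lookup, name) for name in batch]
        for future in concurrent.futures.as_completed(futures):
            try:
                out.write(future.result() + '\n')
            except Exception:
                pass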
I have just completed a script that (sigh) finally works. It searches Twitter for keywords. The results are written to a CSV with 4 columns: keyword, tweet, lat, lon (location). The code that I'm using is:
import tweepy
import csv
keywordList = ['McDonalds', 'Taco Bell', 'Burger King',]
for keyword in keywordList:
result = tweepy.api.search(q=keyword,rpp=1000,page=2, geocode= "34.085422,-117.900879,500mi" )
with open(r'C:\Temp\results.csv', 'a') as acsv:
w = csv.writer(acsv)
for tweet in result:
lat, lon = tweet.geo if tweet.geo else ('', '')
try:
a = tweet.geo['coordinates']
print a[0] , a[1]
print tweet.text
w.writerow((keyword, tweet.text, a[0] , a[1]))
except:
pass
I want to use task manager or Python to run this search every 5 minutes, but it will write duplicates. I was going to use the following code to remove duplicates, but two things happen: results2.csv comes out blank, and when I go to open the CSV it is locked and I have to view it read-only. I tried f1.close(), writer.close(), etc., but it says 'csv.reader' object has no attribute 'close'.
My biggest concern is avoiding duplicates, either by writing only new rows to a new CSV or by somehow deduplicating and rewriting the same table on each search. Any suggestions are much appreciated!
import csv
f1 = csv.reader(open(r'C:\Temp\results.csv', 'rb'))
writer = csv.writer(open(r'C:\Temp\results2.csv', 'wb'))
tweet = set()
for row in f1:
if row[1] not in tweet:
writer.writerow(row)
tweet.add( row[1] )
f1.close()
writer.close()
Here's a refactored version:
Edit: unicode, what fun - I've added a .decode() call in read_csv() and an .encode() call in append_csv(); this should solve your problem (I think - you might need to decide on a string codec).
import tweepy
import csv
from collections import defaultdict
import time
FILE = 'c:/temp/results.csv'
KEYWORDS = ['McDonalds', 'Taco Bell', 'Burger King']
WHERE = "34.085422,-117.900879,500mi"
DELAY = 300 # seconds
def _float(s, err=None):
try:
return float(s)
except ValueError:
return err
def _str(f, err=""):
return err if f is None else str(f)
def read_csv(fname=FILE):
data = defaultdict(dict)
with open(fname, 'rb') as inf:
incsv = csv.reader(inf)
for kw,tw,lat,lon in incsv:
# added .decode() call to handle saved unicode chars
data[kw][tw.decode()] = (_float(lat), _float(lon))
return data
def append_csv(data, fname=FILE):
with open(fname, "ab") as outf:
outcsv = csv.writer(outf)
# added .encode() call to handle saved unicode chars
outcsv.writerows((kw,tw.encode(),_str(lat),_str(lon)) for kw,dat in data.iteritems() for tw,(lat,lon) in dat.iteritems())
def search_twitter(keywords=KEYWORDS, loc=WHERE):
data = defaultdict(dict)
for kw in keywords:
for tweet in tweepy.api.search(q=kw, rpp=1000, page=2, geocode=loc):
data[kw][tweet.text] = tweet.geo if tweet.geo else (None,None)
return data
def calc_update(old_data, new_data):
diff = defaultdict(dict)
for kw,dat in new_data.iteritems():
for tw,loc in dat.iteritems():
if tw not in old_data[kw]:
diff[kw][tw] = old_data[kw][tw] = loc
return old_data, diff
def show_data(data):
for kw,dat in data.iteritems():
for tw,(lat,lon) in dat.iteritems():
print("<{},{}> {} [{}]".format(_str(lat,"???"), _str(lon,"???"), tw, kw))
def main():
data = read_csv()
while True:
new_data = search_twitter()
data,diff = calc_update(data, new_data)
append_csv(diff)
show_data(diff)
time.sleep(DELAY)
if __name__=="__main__":
main()