I have a list of 30 million strings, and I want to run a dns query to all of them using python. I do not understand how this operation can get memory intensive. I would assume that the threads would exit after the job is done, and there is also a timeout of 1 minute as well ({'dns_request_timeout': 1}).
Here is a sneak peek of the machine's resources while running the script:
My code is as follows:
# -*- coding: utf-8 -*-
import dns.resolver
import concurrent.futures
from pprint import pprint
from json import json
bucket = json.load(open('30_million_strings.json','r'))
def _dns_query(target, **kwargs):
global bucket
resolv = dns.resolver.Resolver()
resolv.timeout = kwargs['function']['dns_request_timeout']
try:
resolv.query(target + '.com', kwargs['function']['query_type'])
with open('out.txt', 'a') as f:
f.write(target + '\n')
except Exception:
pass
def run(**kwargs):
global bucket
temp_locals = locals()
pprint({k: v for k, v in temp_locals.items()})
with concurrent.futures.ThreadPoolExecutor(max_workers=kwargs['concurrency']['threads']) as executor:
future_to_element = dict()
for element in bucket:
future = executor.submit(kwargs['function']['name'], element, **kwargs)
future_to_element[future] = element
for future in concurrent.futures.as_completed(future_to_element):
result = future_to_element[future]
run(function={'name': _dns_query, 'dns_request_timeout': 1, 'query_type': 'MX'},
concurrency={'threads': 15})
try this:
def sure_ok(future):
try:
with open('out.txt', 'a') as f:
f.write(str(future.result()[0]) + '\n')
except:
pass
with concurrent.futures.ThreadPoolExecutor(max_workers=2500):
for element in json.load(open('30_million_strings.json','r')):
resolv = dns.resolver.Resolver()
resolv.timeout = 1
future = executor.submit(resolv.query, target + '.com', 'MX')
future.add_done_callback(sure_ok)
remove global bucket as it is redundant, and not needed.
remove reference of the 30+ million futures in a dictionary, also redundant.
also you're probably not using a new enough
version of concurrent.futures:
https://github.com/python/cpython/commit/5cbca0235b8da07c9454bcaa94f12d59c2df0ad2
Related
I am trying to open up some huge json files
papers0 = []
papers1 = []
papers2 = []
papers3 = []
papers4 = []
papers5 = []
papers6 = []
papers7 = []
for x in range(8):
for line in open(f'part_00{x}.json', 'r'):
globals()['papers%s' % x].append(json.loads(line))
However the process above is slow. I wonder if there is some parallelization trick or some other in order to speed it up.
Thank you
If the JSON files are very large then loading them (as Python dictionaries) will be I/O bound. Therefore, multithreading would be appropriate for parallelisation.
Rather than having discrete variables for each dictionary, why not have a single dictionary keyed on the significant numeric part of the filename(s).
For example:
from concurrent.futures import ThreadPoolExecutor as TPE
from json import load as LOAD
from sys import stderr as STDERR
NFILES = 8
JDATA = {}
def get_json(n):
try:
with open(f'part_00{n}.json') as j:
return n, LOAD(j)
except Exception as e:
print(e, file=STDERR)
return n, None
def main():
with TPE() as tpe:
JDATA = dict(tpe.map(get_json, range(NFILES)))
if __name__ == '__main__':
main()
After running this, the dictionary representation of the JSON file part_005.json (for example) would be accessible as JDATA[5]
Note that if an exception arises during accessing or processing of any of the files, the relevant dictionary value will be None
I have over a million json files, and I'm trying to find the fastest way to check first, if they load, and then, if there exists either key_A, key_B, or neither. I thought I might be able to use ray to speed up this process, but opening a file seems to fail with ray.
As a simplification, here's my attempt at just checking whether or not a file will load:
import ray
ray.init()
#ray.remote
class Counter(object):
def __init__(self):
self.good = 0
self.bad = 0
def increment(self, j):
try:
with open(j, 'r') as f:
l = json.load(f)
self.good += 1
except: # all files end up here
self.bad += 1
def read(self):
return (self.good, self.bad)
counter = Counter.remote()
[counter.increment.remote(j) for j in json_paths]
futures = counter.read.remote()
print(ray.get(futures))
But I end up with (0, len(json_paths)) as a result.
For reference, the slightly more complicated actual end goal I have is to check:
new, old, bad = 0,0,0
try:
with open(json_path, 'r') as f:
l = json.load(f)
ann = l['frames']['FrameLabel']['annotations']
first_object = ann[0][0]
except:
bad += 1
return
if 'object_category' in first_object:
new += 1
elif 'category' in first_object:
old += 1
else:
bad += 1
I'd recommend not using Python for this at all, but for example jq.
A command like
jq -c "[input_filename, (.frames.FrameLabel.annotations[0][0]|[.object_category,.category])]" good.json bad.json old.json
outputs
["good.json",["good",null]]
["bad.json",[null,null]]
["old.json",[null,"good"]]
for each of your categories of data, which will be significantly easier to parse.
You can use e.g. the GNU find tool, or if you're feeling fancy, parallel, to come up with the command lines to run.
You could use Python' built-in concurrent module instead to perform your task, which ray might not be best-suited for. Example:
from concurrent.futures import ThreadPoolExecutor
numThreads = 10
def checkFile(path):
return True # parse and check here
with ThreadPoolExecutor(max_workers=numThreads) as pool:
good = sum(pool.map(checkFile, json_paths))
bad = len(json_paths) - good
I want to check if some element is already present in some list, while i am constantly updating that list.I am using multiprocessing to achieve this, but currently my list gets reinitialised every time.Any suggestions on how i could append to the list without it being reinitialized would be very helpful.Thanks in advance.
import multiprocessing as mp
import socket
# Set the default timeout in seconds
timeout = 20
socket.setdefaulttimeout(timeout)
from PIL import Image
import hashlib
import os
image_hash_list=[]
url_list =[]
some_dict=dict()
def getImages(val):
# import pdb;pdb.set_trace()
#Dowload images
f = open('image_files.txt', 'a')
try:
url=val # preprocess the url from the input val
local=url.split('/')[-1] #Filename Generation From Global Varables And Rand Stuffs...
urllib.request.urlretrieve(url,local)
md5hash = hashlib.md5(Image.open(local).tobytes())
image_hash = md5hash.hexdigest()
global image_hash_list
global url_list
if image_hash not in image_hash_list:
image_hash_list.append(image_hash)
some_dict[image_hash] = 0
os.remove(local)
f.write(url+'\n')
return 1
else:
os.remove(local)
print(some_dict.keys())
except Exception as e:
return 0
# if __name__ == '__main__':
files = "Identity.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=12)
res = pool.map(getImages, lst)
print ("tempw")
Here the image_hash_list get reinitialised every time.
Use a Manager to create shared lists and dicts (and other types too): Sharing state betweek processes.
I have to parse 30 days access logs from the server based on client IP and accessed hosts and need to know top 10 accessed sites. The log file will be around 10-20 GB in size which takes lots of time for single threaded execution of script. Initially, I wrote a script which was working fine but it is taking a lot of time to due to large log file size. Then I tried to implement multiprocessing library for parallel processing but it is not working. It seems implementation of multiprocessing is repeating tasks instead of doing parallel processing. Not sure, what is wrong in the code. Can some one please help on this? Thank you so much in advance for your help.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing
def ipauth (slave_list, static_ip_list):
file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
count = 1
while (count <=30):
Nth_days = datetime.now() - timedelta(days=count)
date = Nth_days.strftime("%Y%m%d")
yr_month = Nth_days.strftime("%Y/%m")
file_name = 'local2' + '.' + date
with open(slave_list) as file:
for line in file:
string = line.split()
slave = string[0]
proxy = string[1]
log_path = "/LOGS/%s/%s" %(slave, yr_month)
try:
os.path.exists(log_path)
file_read = os.path.join(log_path, file_name)
with open(file_read) as log:
for log_line in log:
log_line = log_line.strip()
if proxy in log_line:
file_record.write(log_line + '\n')
except IOError:
pass
count = count + 1
file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
with open(static_ip_list) as ip:
for line in ip:
with open('/home/access/top10_domain_accessed/logs/combined_log.txt','r') as f:
for content in f:
log_split = content.split()
client_ip = log_split[7]
if client_ip in line:
content = str(content).strip()
file_log.write(content + '\n')
return
if __name__ == '__main__':
slave_list = sys.argv[1]
static_ip_list = sys.argv[2]
jobs = []
for i in range(5):
p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
jobs.append(p)
p.start()
p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing
def chunk_of_lines(fp, n):
# read n lines from file
# then yield
pass
def process(lines):
pass # do stuff to a file
p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp,10):
p.apply_async(process, [f,static_ip_list])
p.close()
p.join() # Wait for all child processes to close.
There are many ways to implement the chunk_of_lines method, you could iterate over the file lines using a simple for or do something more advance like call fp.read().
I am a gevent newbie, but I think I got it working — in a limited sense. Basically, for pools of 1, the code proceeds, while for larger pools the code gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlet finishing, but not more). What is going wrong? Spawn? Join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, csv, cStringIO, codecs, pickle
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')
HOSTNAME = 'http://kozbeszerzes.ceu.hu'
driver = webdriver.Chrome()
results = set()
for y in xrange(1998,2015):
for p in xrange(0,9999):
driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y,p))
sleep(1)
if len(driver.find_elements_by_class_name('result'))==0:
break
for e in driver.find_elements_by_class_name('result'):
link = e.find_element_by_tag_name('a')
r = link.get_attribute('href').encode('ascii', 'ignore')
if r[:34]== 'http://kozbeszerzes.ceu.hu/tender/':
results.add(r)
driver.quit()
with open('list_of_urls', 'wb') as f:
pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
# results = pickle.load(f)
entities = set()
header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')
# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)
f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workres(res):
try:
tender = parse(urlopen(res)).getroot()
print ('%s succeeded' % res)
for requestor in tender.findall('requestor'):
entities.add(HOSTNAME + requestor.get('url'))
id = tender.get('id')
reqname = tender.get('requestor')
url = tender.get('url')
year = tender.get('year')
reqid = tender.get('requestor_id')
subject = tender.get('subject')
source = tender.get('source_url')
estval = tender.get('estimated_value')
for part in tender.findall('./parts/part'):
winner = part.find('winner')
entities.add(HOSTNAME + winner.get('url'))
curr = part.find('currency').text
date = part.find('decisionDate').text
value = part.find('value').text
vat = part.find('vat').text
row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (res, ex))
jobs = [p.spawn(workres, res) for res in results]
p.join()
f.close()
with open('entities', 'wb') as f:
pickle.dump(entities, f)
header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workent(ent):
try:
ent = parse(urlopen(ent)).getroot()
print ('%s succeeded' % ent)
id = ent.get('id')
url = ent.get('url')
name = ent.get('name')
nominalcity = ent.get('city')
cities = ent.findall('./resolved_addresses/whitelistAddress/city')
zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
streets = ent.findall('./resolved_addresses/whitelistAddress/street')
for a in xrange(0,len(cities)):
city = cities[a].text
zip = zips[a].text
street = streets[a].text
row = id, url, name, nominalcity, city, zip, street
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (ent, ex))
jobs = [p.spawn(workent, ent) for ent in entities]
p.join()
f.close()
I see many mistakes here.
There is not used gevent.sleep() and not time.sleep which is
blocking.
Your variables names are too short. Your could add
descriptions on what each part of code is supposed to do. for example the variable 'p'
is used twice..
There are multiple urls gets using urlopen and the driver module? confusing..
I would use queues between different workers and have just one worker do
write_row calls and deal with the file access now you have multiple green lets accessing
the same file..
use less list compehensions just write out the loops.
I would suggest putting the try except in 'workres' only around the 'parse(urlopen())'
code maybe there are more exceptions happening, which you now don't see.
more tips for gevent