Opening various csv files, processing and then saving them through multiprocessing? - python

I am new to multiprocessing. Here I am simply trying to open multiple CSV files and then save them again through multiprocessing.
def opening_file(x):
    print(x)
    url = 'D:\\Tanmay\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\'
    d = pd.read_csv(url + x)
    d.to_csv('D:\\Tanmay\\trial\\' + x)

all_files = os.listdir('D:\\Tanmay\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\')
processes = []
for i in all_files:
    p = multiprocessing.Process(target=opening_file(i), args=(i,))
    pid = os.getpid()
    print(pid)
    p.start()
    processes.append(p)
for y in processes:
    y.join()
The issue is that I am only getting one process ID, which means that only one process is running. Kindly assist.

The problem is that target=opening_file(i) calls opening_file(i) right away in the parent process and passes its return value (None) as the target. You should pass the function itself to multiprocessing.Process, not its result:
p = multiprocessing.Process(target=opening_file, args=(i,))
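For reference, here is a minimal corrected sketch of the whole loop. The paths are the ones from the question, the SRC/DST names are just shorthand introduced here, and the if __name__ == '__main__' guard is needed on Windows so the child processes can re-import the module safely:

import os
import multiprocessing
import pandas as pd

SRC = 'D:\\Tanmay\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\'
DST = 'D:\\Tanmay\\trial\\'

def opening_file(x):
    # runs in the child; os.getpid() in the parent loop only ever shows the parent's PID
    print(os.getpid(), x)
    d = pd.read_csv(SRC + x)
    d.to_csv(DST + x)

if __name__ == '__main__':
    processes = []
    for name in os.listdir(SRC):
        # pass the callable and its argument separately; Process calls it in the child
        p = multiprocessing.Process(target=opening_file, args=(name,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

Note that printing os.getpid() in the parent loop will always show the same parent PID; print it inside the worker to see the different child PIDs. Starting one process per file is also heavier than using a multiprocessing.Pool, but the sketch above keeps the structure of the original code.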

Related

Reading thousands of JSON files and processing them using Python multiprocessing

I'm trying to read thousands of JSON files from a directory, process each file separately, and store the results in a dictionary. I have already written working code for sequential execution. Now I want to leverage multiprocessing to speed up the whole process.
So far, what I did:
import json
import os
from multiprocessing import Process, Manager

def read_file(file_name):
    '''
    Read the given json file and return data
    '''
    with open(file_name) as file:
        data = json.load(file)
    return data

def do_some_process(data):
    '''
    Some calculation will be done here
    and return the result
    '''
    return some_result

def process_each_file(file, result):
    file_name = file.split('.')[0]
    # reading data from file
    data = read_file('../data/{}'.format(file))
    processed_result = do_some_process(data)
    result[file_name] = processed_result

if __name__ == '__main__':
    manager = Manager()
    result = manager.dict()
    file_list = os.listdir("../data")
    all_process = [Process(target=process_each_file, args=(file, result, ))
                   for file in file_list if file.endswith(".json")]
    for p in all_process:
        p.start()
    for p in all_process:
        p.join()
    '''
    Do some further work with 'result' variable
    '''
When I run this code it shows OSError: [Errno 24] Too many open files.
How can I achieve my goal?
To read and process multiple JSON files using Python's multiprocessing module, you can use the following approach:
import os
import json
from multiprocessing import Pool

# List all the JSON files in the current directory
json_files = [f for f in os.listdir('.') if f.endswith('.json')]

def process_data(data):
    return data

def process_json_file(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    # Process the data here...
    processed_data = process_data(data)
    return processed_data

# Create a pool of workers to process the files concurrently
with Pool() as pool:
    # Apply the processing function to each JSON file concurrently
    results = pool.map(process_json_file, json_files)

# Do something with the results
for result in results:
    print(result)
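If you also need the results keyed by file name, as with the Manager dict in the question, one possible sketch is to return (key, value) pairs from the workers and build the dict in the parent. Nothing is shared between processes, and the pool only keeps a handful of files open at a time, which avoids the "Too many open files" error. The DATA_DIR name and the len(data) placeholder are just illustrations:

import os
import json
from multiprocessing import Pool

DATA_DIR = '../data'  # directory used in the question

def process_each_file(file):
    with open(os.path.join(DATA_DIR, file)) as f:
        data = json.load(f)
    processed_result = len(data)  # replace with the real calculation
    return file.split('.')[0], processed_result

if __name__ == '__main__':
    json_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.json')]
    with Pool() as pool:
        # each worker opens one file at a time, so the open-file limit is never hit
        result = dict(pool.map(process_each_file, json_files))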

Python - Multiprocessing slows down gradually?

I recently started learning about multiprocessing in Python and wrote this code to test it. I have around 1300 CSV files which I simply want to open and then save to a folder with multiprocessing, to test the speed. The issue is that the first 600-700 files are processed and saved in less than 10 seconds, but the next 600-700 files take more than 1 minute. I am really not sure why it's happening. I have 8 cores and 16 GB RAM in my system. Below is my code:
import pandas as pd
import os, time
import multiprocessing
import numpy as np

def csv_processing(p):
    final_df = pd.DataFrame(columns=['File_name', 'col'])
    for file in p:
        url = 'E:\\Ashish\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\'
        output = 'E:\\Testing\\'
        df = pd.read_csv(url + file)
        df.to_csv(output + file)

def split_list_into_prcessess(main_list, req_process):
    index_freq = round(.5 + len(main_list)/req_process)
    splitted_list = [main_list[r*index_freq:(r+1)*index_freq] for r in range(req_process)]
    return [x for x in splitted_list if len(x) > 0]

if __name__ == '__main__':
    start_time = time.time()
    processes = []
    all_files = os.listdir('E:\\Ashish\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\')
    print(len(all_files))
    data = split_list_into_prcessess(all_files, os.cpu_count())
    print(data)
    print(len(data))
    for t in data:
        p = multiprocessing.Process(target=csv_processing, args=(t,))
        p.start()
        processes.append(p)
    for l in processes:
        l.join()
    end_time = time.time()
    time_took = end_time - start_time
    print(time_took)

How to process access log using python multiprocessing library?

I have to parse 30 days of access logs from the server based on client IP and accessed hosts, and need to find the top 10 accessed sites. The log file will be around 10-20 GB in size, which takes a lot of time for single-threaded execution of the script. Initially, I wrote a script which was working fine, but it was taking a lot of time due to the large log file size. Then I tried to implement the multiprocessing library for parallel processing, but it is not working. It seems the multiprocessing implementation is repeating tasks instead of doing parallel processing. I'm not sure what is wrong in the code. Can someone please help with this? Thank you so much in advance for your help.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing

def ipauth (slave_list, static_ip_list):
    file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
    count = 1
    while (count <= 30):
        Nth_days = datetime.now() - timedelta(days=count)
        date = Nth_days.strftime("%Y%m%d")
        yr_month = Nth_days.strftime("%Y/%m")
        file_name = 'local2' + '.' + date
        with open(slave_list) as file:
            for line in file:
                string = line.split()
                slave = string[0]
                proxy = string[1]
                log_path = "/LOGS/%s/%s" %(slave, yr_month)
                try:
                    os.path.exists(log_path)
                    file_read = os.path.join(log_path, file_name)
                    with open(file_read) as log:
                        for log_line in log:
                            log_line = log_line.strip()
                            if proxy in log_line:
                                file_record.write(log_line + '\n')
                except IOError:
                    pass
        count = count + 1
    file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
    with open(static_ip_list) as ip:
        for line in ip:
            with open('/home/access/top10_domain_accessed/logs/combined_log.txt','r') as f:
                for content in f:
                    log_split = content.split()
                    client_ip = log_split[7]
                    if client_ip in line:
                        content = str(content).strip()
                        file_log.write(content + '\n')
    return

if __name__ == '__main__':
    slave_list = sys.argv[1]
    static_ip_list = sys.argv[2]
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
        jobs.append(p)
        p.start()
        p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing

def chunk_of_lines(fp, n):
    # read n lines from file
    # then yield
    pass

def process(lines):
    pass # do stuff to a file

p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp, 10):
    p.apply_async(process, [f, static_ip_list])
p.close()
p.join() # Wait for all child processes to close.
There are many ways to implement the chunk_of_lines method: you could iterate over the file's lines with a simple for loop, or do something more advanced like calling fp.read().
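For example, a possible generator-based sketch of chunk_of_lines (the name and signature follow the stub above; itertools.islice is just one way to group the lines):

from itertools import islice

def chunk_of_lines(fp, n):
    # yield successive lists of up to n lines from the open file object fp
    while True:
        chunk = list(islice(fp, n))
        if not chunk:
            break
        yield chunk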

python multiprocessing pool, make one worker execute a different function

I have to perform some processing on each line of a file, and I have many files present in an input directory. I have to dump the response I get from processing each line (from multiple input files) into a single result file.
I have decided on this flow: dump all the input files into a queue and fork 3-4 workers, where each worker works on a unique file, reads its content, and after processing dumps the response into a writer queue. There will be a separate process which reads this queue and writes the result to an output file.
I have come up with this code:
def write_to_csv(queue):
    file_path = os.path.join(os.getcwd(), 'test_dir', "writer.csv")
    ofile = open(file_path, "w")
    job_writer = csv.writer(ofile, delimiter='\a')
    while 1:
        line = queue.get()
        if line == 'kill':
            print("Kill Signal received")
            break
        if line: job_writer.writerow([str(line).strip()])
    ofile.close()

def worker_main(file_queue, writer_queue):
    print os.getpid(), "working"
    while not file_queue.empty():
        file_name = file_queue.get(True)
        # somewhere in process_file writer_queue.put(line_resp) is called
        # for every line in file_name
        process_file(file_name, writer_queue)

if __name__ == "__main__":
    file_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()
    writer_pool = multiprocessing.Pool(1, write_to_csv, (output_queue,))
    cwd = os.getcwd()
    test_dir = 'test_dir'
    file_list = os.listdir(os.path.join(cwd, test_dir))
    for file_name in file_list:
        file_queue.put(file_name)
    reader_pool = multiprocessing.Pool(3, worker_main, (file_queue, output_queue))
    reader_pool.close()
    reader_pool.join()
    output_queue.put("kill")
    print("Finished execution")
The code is working fine. But I wonder if it is possible to do the same thing with a single multiprocessing Pool, as opposed to using reader_pool and writer_pool as in the code above.
You could do that with apply_async; also, don't set an initializer (write_to_csv or worker_main in your case) when creating the Pool object, or it will run that task by default.
file_queue = multiprocessing.Queue()
output_queue = multiprocessing.Queue()

cwd = os.getcwd()
test_dir = 'test_dir'
file_list = os.listdir(os.path.join(cwd, test_dir))
for file_name in file_list:
    file_queue.put(file_name)

pool = Pool(4)
pool.apply_async(write_to_csv, (output_queue,))
[pool.apply_async(worker_main, (file_queue, output_queue, )) for i in range(3)]
pool.close()
pool.join()
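Two caveats worth checking with this approach: in my experience a plain multiprocessing.Queue cannot be passed as an apply_async argument (it gets pickled into the pool's task queue and typically raises a RuntimeError saying queue objects should only be shared between processes through inheritance), and the writer above still waits for a 'kill' sentinel that is never sent, so pool.join() can hang. A possible sketch that works around both, assuming the question's write_to_csv and worker_main, uses Manager queues and sends the sentinel once the readers are done:

import os
import multiprocessing
from multiprocessing import Pool

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    file_queue = manager.Queue()    # Manager queues can be pickled into pool tasks
    output_queue = manager.Queue()

    file_list = os.listdir(os.path.join(os.getcwd(), 'test_dir'))
    for file_name in file_list:
        file_queue.put(file_name)

    pool = Pool(4)                  # 1 writer + 3 readers
    writer = pool.apply_async(write_to_csv, (output_queue,))
    readers = [pool.apply_async(worker_main, (file_queue, output_queue))
               for _ in range(3)]
    for r in readers:
        r.get()                     # wait for every reader to finish
    output_queue.put('kill')        # now the writer can drain the queue and exit
    writer.get()
    pool.close()
    pool.join()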

Writing to a file with multiprocessing

I'm having the following problem in Python.
I need to do some calculations in parallel whose results I need to write sequentially to a file. So I created a function that receives a multiprocessing.Queue and a file handle, does the calculation, and prints the result to the file:
import multiprocessing
from multiprocessing import Process, Queue
from mySimulation import doCalculation
# doCalculation(pars) is a function I must run for many different sets of parameters and collect the results in a file

def work(queue, fh):
    while True:
        try:
            parameter = queue.get(block = False)
            result = doCalculation(parameter)
            print >>fh, string
        except:
            break

if __name__ == "__main__":
    nthreads = multiprocessing.cpu_count()
    fh = open("foo", "w")
    workQueue = Queue()
    parList = # list of conditions for which I want to run doCalculation()
    for x in parList:
        workQueue.put(x)
    processes = [Process(target = writefh, args = (workQueue, fh)) for i in range(nthreads)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    fh.close()
But the file ends up empty after the script runs. I tried changing the work() function to:
def work(queue, filename):
    while True:
        try:
            fh = open(filename, "a")
            parameter = queue.get(block = False)
            result = doCalculation(parameter)
            print >>fh, string
            fh.close()
        except:
            break
and pass the filename as a parameter. Then it works as I intended. When I try to do the same thing sequentially, without multiprocessing, it also works normally.
Why didn't it work in the first version? I can't see the problem.
Also: can I guarantee that two processes won't try to write to the file simultaneously?
EDIT:
Thanks. I got it now. This is the working version:
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
from random import uniform

def doCalculation(par):
    t = uniform(0, 2)
    sleep(t)
    return par * par # just to simulate some calculation

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block = False)
            print "dealing with ", par, ""
            res = doCalculation(par)
            queueOut.put((par, res))
        except:
            break

def write(queue, fname):
    fhandle = open(fname, "w")
    while True:
        try:
            par, res = queue.get(block = False)
            print >>fhandle, par, res
        except:
            break
    fhandle.close()

if __name__ == "__main__":
    nthreads = multiprocessing.cpu_count()
    fname = "foo"
    workerQueue = Queue()
    writerQueue = Queue()
    parlist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    feedProc = Process(target = feed, args = (workerQueue, parlist))
    calcProc = [Process(target = calc, args = (workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target = write, args = (writerQueue, fname))
    feedProc.start()
    for p in calcProc:
        p.start()
    writProc.start()
    feedProc.join()
    for p in calcProc:
        p.join()
    writProc.join()
You really should use two queues and three separate kinds of processing.
Put stuff into Queue #1.
Get stuff out of Queue #1 and do calculations, putting stuff in Queue #2. You can have many of these, since they get from one queue and put into another queue safely.
Get stuff out of Queue #2 and write it to a file. You must have exactly 1 of these and no more. It "owns" the file, guarantees atomic access, and absolutely assures that the file is written cleanly and consistently.
If anyone is looking for a simple way to do the same, this can help you.
I don't think there are any disadvantages to doing it this way. If there are, please let me know.
import multiprocessing
import re

def mp_worker(item):
    # Do something
    return item, count

def mp_handler():
    cpus = multiprocessing.cpu_count()
    p = multiprocessing.Pool(cpus)
    # The below 2 lines populate the list. This listX will later be accessed in parallel.
    # This can be replaced as long as listX is passed on to the next step.
    with open('ExampleFile.txt') as f:
        listX = [line for line in (l.strip() for l in f) if line]
    with open('results.txt', 'w') as f:
        for result in p.imap(mp_worker, listX):
            # (item, count) tuples from worker
            f.write('%s: %d\n' % result)

if __name__ == '__main__':
    mp_handler()
Source: Python: Writing to a single file with queue while using multiprocessing Pool
There is a mistake in the write worker code: if block is False, the worker will never get any data. It should be as follows:
par, res = queue.get(block = True)
You can check this by adding the line
print "QSize",queueOut.qsize()
after the
queueOut.put((par,res))
With block=False you would see the length of the queue keep increasing until it fills up, unlike with block=True where you always get "1".
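For what it's worth, a common way to avoid depending on timing at all is a sentinel value, the same 'kill' idea used in one of the questions above: the writer blocks on get() until the sentinel arrives and only then exits, so it never matters whether the queue is momentarily empty. A minimal sketch, reusing the queue and process names from the working version (None is the sentinel here):

def write(queue, fname):
    # single writer: drain the queue until the sentinel arrives
    fhandle = open(fname, "w")
    while True:
        item = queue.get()      # block until something is available
        if item is None:        # sentinel: all producers are done
            break
        par, res = item
        fhandle.write("%s %s\n" % (par, res))
    fhandle.close()

# in the parent, after feedProc and all calcProc processes have been joined:
#     writerQueue.put(None)
#     writProc.join()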
