I have a huge zip file with a large number of files. Parsing all these files takes a lot of time, so I thought about using multiprocessing to speed things up. I am not sure how to approach it, as a zipfile.ZipFile in Python is not an iterable.
I am aware that I could extract all contents from the zip file and then iterate over the list of filenames, however, I'd prefer to not have to keep extra free space to hold the extracted data and would like to operate on the ZipFile.
Maybe there is any other solution to this this problem, so I am open to suggestions.
EDIT:
Using the below code technically works, but the problem is that each time the get_content() function runs, it seems the large zip file that I have is being opened again, ultimately taking as long as 15 seconds to reach each file.
import multiprocessing
from zipfile import ZipFile
from multiprocessing import Pool
import time
path = 'zipfile.zip'
def get_file_list(zip_path):
with ZipFile(zip_path, 'r') as zipObj:
listOfiles = zipObj.namelist()
return listOfiles
def get_content(file_name):
start_time = time.time()
with ZipFile(path, 'r') as zipObject:
with zipObject.open(file_name) as file:
content = file.read()
end_time = time.time()
print(f"It took {end_time - start_time} to open this file")
return content
def parse_files():
file_list = get_file_list(path)
with Pool(multiprocessing.cpu_count()) as p:
contents = p.map(get_content, file_list)
print(contents)
parse_files()
import os
import shutil
from zipfile import ZipFile
from multiprocessing import Pool
def create_dummy_zip():
os.mkdir("dummy")
for i in range(100):
with open(f"dummy/{i}.file", "w") as f:
f.write(f"Content: {i}")
shutil.make_archive("dummy", 'zip', "dummy")
shutil.rmtree('dummy')
def delete_dummy():
try:
os.remove("dummy.zip")
shutil.rmtree('dummy')
except:
pass
def get_file_list(zip_path):
with ZipFile(zip_path, 'r') as zipObj:
listOfiles = zipObj.namelist()
return listOfiles
def get_content(file_name):
with ZipFile("dummy.zip", 'r') as zipObject:
with zipObject.open(file_name) as file:
content = file.read()
return content
if __name__ == '__main__':
try:
create_dummy_zip()
file_list = get_file_list("dummy.zip")
with Pool(5) as p:
contents = p.map(get_content, file_list)
print(contents)
delete_dummy()
except:
delete_dummy()
Related
Considering I have a large number of json files, but small in size (about 20000 files around 100 Mbs), reading them the first time with code snippet:
from time import perf_counter
from glob import glob
def load_all_jsons_serial():
t_i = perf_counter()
json_files = glob("*json")
for file in json_files:
with open(file,"r") as f:
f.read()
t_f = perf_counter()
return t_f-t_i
load_all_jsons_serial()
take around 50 seconds.
However, if I rerun the code, it takes less than a second to finish! Could someone please:
Explain this observation. Why does it take longer the first time and less for the nexts runs?
How can I reduce the time for loading for the first time?
I am on a windows 11 machine and run the code in a notebook extension of VSCode.
Thanks.
You can read in parallel with aiofiles. Here is a full example, where i had 1000 json files (200kb each) in folder jsonfiles\async\ and jsonfiles\sync\ to prevent any hard disk or OS level caching. Removing the files and recreated the JSON files again after each run.
from glob import glob
import aiofiles
import asyncio
from time import perf_counter
###
# Synchronous file operation:
###
def load_all_jsons_serial():
json_files = glob("jsonfiles\\sync\\*.json")
for file in json_files:
with open(file,"r") as f:
f.read()
return
t_i = perf_counter()
load_all_jsons_serial()
t_f = perf_counter()
print(f"Synchronous: {t_f - t_i}")
###
# Async file operation
###
async def load_async(files: list[str]):
for file in files:
async with aiofiles.open(file, "r") as f:
await f.read()
return
async def main():
json_files = glob("jsonfiles\\async\\*.json")
no_of_tasks = 10
files_per_task = len(json_files)//no_of_tasks + 1
tasks = []
for i in range(no_of_tasks):
tasks.append(
asyncio.create_task(load_async(
json_files[i*files_per_task : i*files_per_task+files_per_task]))
)
await asyncio.gather(*tasks)
return
t_i = perf_counter()
asyncio.run(main())
t_f = perf_counter()
print(f"Asynchronous: {t_f - t_i}")
It's not exactly science but you can see there is a significant gain in performance:
Synchronous: 13.353551400010474
Asynchronous: 3.1800755000440404
I'm trying to read thousands of json file from directory and process each file separately and store the result in a dictionary. I already write a working code for sequential execution. Now i want to take the leverage of multi-processing for speed up the whole process.
So far what i did -
import json
import os
from multiprocessing import Process, Manager
def read_file(file_name):
'''
Read the given json file and return data
'''
with open(file_name) as file :
data = json.load(file)
return data
def do_some_process(data):
'''
Some calculation will be done here
and return the result
'''
return some_result
def process_each_file(file, result):
file_name = file.split('.')[0]
# reading data from file
data = read_file('../data/{}'.format(file))
processed_result = do_some_process(data)
result[file_name] = processed_result
if __name__ == '__main__':
manager = Manager()
result = manager.dict()
file_list = os.listdir("../data")
all_process = [Process(target=process_each_file, args=(file, result, ))
for file in file_list if file.endswith(".json")]
for p in all_process:
p.start()
for p in all_process:
p.join()
'''
Do some further work with 'rusult' variable
'''
When i run this code it shows OSError: [Errno 24] Too many open files
How can i achive my goal ?
To read and process multiple JSON files using Python's multiprocessing module, you can use the following approach:
import os
import json
from multiprocessing import Pool
# List all the JSON files in the current directory
json_files = [f for f in os.listdir('.') if f.endswith('.json')]
def process_data(data):
return data
def process_json_file(filename):
with open(filename, 'r') as f:
data = json.load(f)
# Process the data here...
processed_data = process_data(data)
return processed_data
# Create a pool of workers to process the files concurrently
with Pool() as pool:
# Apply the processing function to each JSON file concurrently
results = pool.map(process_json_file, json_files)
# Do something with the results
for result in results:
print(result)
I am beginner to Python and trying to add few lines of code to convert json to csv and back to json. Have thousands of files (size 300 MB) to be converted and processed. With current program (using 1 CPU), i am not able to use 16 CPUs of server and need suggestions to fine tune the program for multiprocessing. Below is my code with python 3.7 version.
import json
import csv
import os
os.chdir('/stagingData/Scripts/test')
for JsonFile in os.listdir(os.getcwd()):
PartialFileName = JsonFile.split('.')[0]
j = 1
with open(PartialFileName +".csv", 'w', newline='') as Output_File:
with open(JsonFile) as fileHandle:
i = 1
for Line in fileHandle:
try:
data = json.loads(Line, parse_float=str)
except:
print("Can't load line {}".format(i))
if i == 1:
header = data.keys()
output = csv.writer(Output_File)
output.writerow(header) #Writes header row
i += 1
output.writerow(data.values()) #writes values row
j += 1
Appreciate suggestions on multiprocessing logic
If you have a single big file that you want to process more effectively I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
source_big_file = Path('/path/to/file')
def chunk_file_by_line(source_filepath: Path, chunk_size: int = 10_000):
chunk_line_size = 10_000
intermediate_file_handlers = {}
last_chunk_filepath = None
with source_big_file.open('r', encoding='utf8') as big:
for line_number, line in big:
group = line_number - (line_number % chunk_line_size)
chunk_filename = f'{source_big_file.stem}.g{group}{source_big_file.suffix}'
chunk_filepath = source_big_file.parent / chunk_filename
if chunk_filepath not in intermediate_file_handlers:
file_handler = chuck_filepath.open('w', encoding='utf8')
intermediate_file_handlers[chunk_filepath] = file_handler
if last_chunk_filepath:
last_file_hanlder = intermediate_file_handlers[last_chunk_filepath]
last_file_handler.close()
yield last_chunk_filepath
else:
file_handler = intermediate_file_handlers[chunk_filepath]
file_handler.write(line)
last_chunk_filepath = chunk_filepath
# output last one
yield last_chunk_filepath
def json_to_csv(json_filepath: Path) -> Path:
csv_filename = f'{json_filepath.stem}.csv'
csv_filepath = json_filepath.parent / csv_filename
with csv_filepath.open('w', encoding='utf8') as csv_out, json_filepath.open('r', encoding='utf8') as json_in:
dwriter = csv.DictWriter(csv_out)
headers_written = False
for json_line in json_in:
data = json.loads(json_line)
if not headers_written:
# create header record
headers = {k:k for k in data.keys()}
dwriter.writeline(headers)
headers_written = True
dwriter.writeline(data)
return csv_filepath
with ProcessPoolExecutor() as pool:
futures = []
for chunk_filepath in chuck_file_by_line(source_big_file):
future = pool.submit(json_to_csv, chunk_filepath)
futures.append(future)
# wait for all to finish
for future in futures:
csv_filepath = future.result(timeout=None) # waits until complete
print(f'conversion complete> csv filepath: {csv_filepath}')
Since you have many files, the simplest multiprocessing example from the documentation should work for you. https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
f(JsonFile):
# open input, output files and convert
with Pool(16) as p:
p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
I have to parse 30 days access logs from the server based on client IP and accessed hosts and need to know top 10 accessed sites. The log file will be around 10-20 GB in size which takes lots of time for single threaded execution of script. Initially, I wrote a script which was working fine but it is taking a lot of time to due to large log file size. Then I tried to implement multiprocessing library for parallel processing but it is not working. It seems implementation of multiprocessing is repeating tasks instead of doing parallel processing. Not sure, what is wrong in the code. Can some one please help on this? Thank you so much in advance for your help.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing
def ipauth (slave_list, static_ip_list):
file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
count = 1
while (count <=30):
Nth_days = datetime.now() - timedelta(days=count)
date = Nth_days.strftime("%Y%m%d")
yr_month = Nth_days.strftime("%Y/%m")
file_name = 'local2' + '.' + date
with open(slave_list) as file:
for line in file:
string = line.split()
slave = string[0]
proxy = string[1]
log_path = "/LOGS/%s/%s" %(slave, yr_month)
try:
os.path.exists(log_path)
file_read = os.path.join(log_path, file_name)
with open(file_read) as log:
for log_line in log:
log_line = log_line.strip()
if proxy in log_line:
file_record.write(log_line + '\n')
except IOError:
pass
count = count + 1
file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
with open(static_ip_list) as ip:
for line in ip:
with open('/home/access/top10_domain_accessed/logs/combined_log.txt','r') as f:
for content in f:
log_split = content.split()
client_ip = log_split[7]
if client_ip in line:
content = str(content).strip()
file_log.write(content + '\n')
return
if __name__ == '__main__':
slave_list = sys.argv[1]
static_ip_list = sys.argv[2]
jobs = []
for i in range(5):
p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
jobs.append(p)
p.start()
p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing
def chunk_of_lines(fp, n):
# read n lines from file
# then yield
pass
def process(lines):
pass # do stuff to a file
p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp,10):
p.apply_async(process, [f,static_ip_list])
p.close()
p.join() # Wait for all child processes to close.
There are many ways to implement the chunk_of_lines method, you could iterate over the file lines using a simple for or do something more advance like call fp.read().
I have few text files containing some numbers in each line. I read the file and displayed total of each number in the function.
from Queue import Queue
from threading import Thread
import os
enclosure_queue = Queue()
list=[]
def getAllFiles():
for root, dirs, files in os.walk("C:/Users/test"):
for file in files:
if file.endswith(".txt"):
file_path= os.path.join(root, file)
list.append(file_path)
def calc(i, q):
while True:
file = q.get()
fileData = open(file, 'r')
add=0
for line in fileData:
add= add + int(line)
print str(add) + '\n'
q.task_done()
getAllFiles()
num_fetch_threads = len(list)
for i in range(num_fetch_threads):
worker = Thread(target=calc, args=(i, enclosure_queue,))
worker.setDaemon(True)
worker.start()
for ind_file in list:
enclosure_queue.put(ind_file)
enclosure_queue.join()
It displays the sum of data in lines of individual file, but I need to add up the results.
For example if the calc function's add has 300 , 200 and 500 , I want final result as 1000 . I thought of adding each result to a list and then splitting it in another function and adding them. Is there any better solution ?
No need to use Queue here. Use multiprocessing.Pool.map, and change your calc method accordingly. Also threading.Thread does not return results, whereas multiprocessing.Pool.map returns results.
import multiprocessing
import os
def getAllFiles():
my_files = list()
for root, dirs, files in os.walk("C:/Users/test"):
for file in files:
if file.endswith(".txt"):
file_path= os.path.join(root, file)
my_files.append(file_path)
return my_files
def calc(file):
with open(file, 'r') as f:
return sum(map(int, f.readlines()))
if __name__ == '__main__':
my_files = getAllFiles()
num_fetch_threads = len(my_files)
pool = multiprocessing.Pool(processes=num_fetch_threads)
results = pool.map(calc, my_files)
result = sum(results)
print result