I have a large number of small JSON files (about 20,000 files, roughly 100 MB in total). Reading them for the first time with this code snippet:
from time import perf_counter
from glob import glob

def load_all_jsons_serial():
    t_i = perf_counter()
    json_files = glob("*json")
    for file in json_files:
        with open(file, "r") as f:
            f.read()
    t_f = perf_counter()
    return t_f - t_i

load_all_jsons_serial()
takes around 50 seconds.
However, if I rerun the code, it takes less than a second to finish! Could someone please:
Explain this observation: why does it take longer the first time and less for subsequent runs?
How can I reduce the loading time for the first run?
I am on a Windows 11 machine and run the code in the notebook extension of VS Code.
Thanks.
The speedup on reruns comes from the operating system caching the file contents in memory after the first read; the first, cold read has to hit the disk. To speed up that cold read, you can read in parallel with aiofiles. Here is a full example where I had 1,000 JSON files (200 KB each) in the folders jsonfiles\async\ and jsonfiles\sync\; to prevent any hard-disk or OS-level caching from skewing the comparison, I removed and recreated the JSON files after each run.
from glob import glob
import aiofiles
import asyncio
from time import perf_counter

###
# Synchronous file operation:
###
def load_all_jsons_serial():
    json_files = glob("jsonfiles\\sync\\*.json")
    for file in json_files:
        with open(file, "r") as f:
            f.read()
    return

t_i = perf_counter()
load_all_jsons_serial()
t_f = perf_counter()
print(f"Synchronous: {t_f - t_i}")

###
# Async file operation
###
async def load_async(files: list[str]):
    for file in files:
        async with aiofiles.open(file, "r") as f:
            await f.read()
    return

async def main():
    json_files = glob("jsonfiles\\async\\*.json")
    no_of_tasks = 10
    files_per_task = len(json_files)//no_of_tasks + 1
    tasks = []
    for i in range(no_of_tasks):
        tasks.append(
            asyncio.create_task(load_async(
                json_files[i*files_per_task : i*files_per_task+files_per_task]))
        )
    await asyncio.gather(*tasks)
    return

t_i = perf_counter()
asyncio.run(main())
t_f = perf_counter()
print(f"Asynchronous: {t_f - t_i}")
It's not exactly science but you can see there is a significant gain in performance:
Synchronous: 13.353551400010474
Asynchronous: 3.1800755000440404
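If you would rather avoid the asyncio machinery, a thread pool gives a similar effect for I/O-bound reads, since the threads spend most of their time waiting on the disk. This is a minimal sketch of my own, not part of the original answer; the glob pattern and worker count are assumptions:

from concurrent.futures import ThreadPoolExecutor
from glob import glob

def read_file(path: str) -> str:
    # Plain blocking read; the threads overlap their waiting on disk I/O.
    with open(path, "r") as f:
        return f.read()

def load_all_jsons_threaded(pattern: str = "*.json", workers: int = 10):
    files = glob(pattern)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map returns results in order; list() forces all reads to complete
        return list(pool.map(read_file, files))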
Related
I have a huge zip file with a large number of files. Parsing all these files takes a lot of time, so I thought about using multiprocessing to speed things up. I am not sure how to approach it, as a zipfile.ZipFile in Python is not an iterable.
I am aware that I could extract all contents from the zip file and then iterate over the list of filenames; however, I'd prefer not to need extra free space to hold the extracted data and would like to operate on the ZipFile directly.
Maybe there is another solution to this problem, so I am open to suggestions.
EDIT:
The code below technically works, but the problem is that each time the get_content() function runs, the large zip file seems to be opened again, ultimately taking as long as 15 seconds to reach each file.
import multiprocessing
from zipfile import ZipFile
from multiprocessing import Pool
import time

path = 'zipfile.zip'

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    start_time = time.time()
    with ZipFile(path, 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    end_time = time.time()
    print(f"It took {end_time - start_time} to open this file")
    return content

def parse_files():
    file_list = get_file_list(path)
    with Pool(multiprocessing.cpu_count()) as p:
        contents = p.map(get_content, file_list)
    print(contents)

parse_files()
import os
import shutil
from zipfile import ZipFile
from multiprocessing import Pool

def create_dummy_zip():
    os.mkdir("dummy")
    for i in range(100):
        with open(f"dummy/{i}.file", "w") as f:
            f.write(f"Content: {i}")
    shutil.make_archive("dummy", 'zip', "dummy")
    shutil.rmtree('dummy')

def delete_dummy():
    try:
        os.remove("dummy.zip")
        shutil.rmtree('dummy')
    except:
        pass

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    with ZipFile("dummy.zip", 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    return content

if __name__ == '__main__':
    try:
        create_dummy_zip()
        file_list = get_file_list("dummy.zip")
        with Pool(5) as p:
            contents = p.map(get_content, file_list)
        print(contents)
        delete_dummy()
    except:
        delete_dummy()
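Both versions above reopen the archive inside every get_content() call, which is exactly the overhead described in the EDIT. One way around it (a sketch of mine, not from the original posts) is to open the ZipFile once per worker process via a Pool initializer, so each worker reuses a single handle for all the members it reads; the archive name 'zipfile.zip' is taken from the question:

import multiprocessing
from multiprocessing import Pool
from zipfile import ZipFile

path = 'zipfile.zip'  # archive name from the question

_zip = None  # per-process handle, set once by the initializer

def init_worker(zip_path):
    # Runs once in each worker process; the open archive is reused for every task.
    global _zip
    _zip = ZipFile(zip_path, 'r')

def get_content(file_name):
    # No per-call ZipFile(...) here; the worker's handle is reused.
    with _zip.open(file_name) as file:
        return file.read()

if __name__ == '__main__':
    with ZipFile(path, 'r') as zipObj:
        file_list = zipObj.namelist()
    with Pool(multiprocessing.cpu_count(), initializer=init_worker, initargs=(path,)) as p:
        contents = p.map(get_content, file_list)
    print(len(contents))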
I recently started learning about multiprocessing in Python and wrote this code to test it. I have around 1,300 CSV files which I simply want to open and then save to another folder with multiprocessing, to measure the speed. The issue is that the first 600-700 files are processed and saved in less than 10 seconds, but the next 600-700 files take more than 1 minute. I am really not sure why this happens. I have 8 cores and 16 GB of RAM in my system. Below is my code:
import pandas as pd
import os, time
import multiprocessing
import numpy as np

def csv_processing(p):
    final_df = pd.DataFrame(columns=['File_name', 'col'])
    for file in p:
        url = 'E:\\Ashish\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\'
        output = 'E:\\Testing\\'
        df = pd.read_csv(url + file)
        df.to_csv(output + file)

def split_list_into_prcessess(main_list, req_process):
    index_freq = round(.5 + len(main_list)/req_process)
    splitted_list = [main_list[r*index_freq:(r+1)*index_freq] for r in range(req_process)]
    return [x for x in splitted_list if len(x) > 0]

if __name__ == '__main__':
    start_time = time.time()
    processes = []
    all_files = os.listdir('E:\\Ashish\\Market\\Data\\Processed GFDL_options\\Bank Nifty\\Intraday\\')
    print(len(all_files))
    data = split_list_into_prcessess(all_files, os.cpu_count())
    print(data)
    print(len(data))
    for t in data:
        p = multiprocessing.Process(target=csv_processing, args=(t,))
        p.start()
        processes.append(p)
    for l in processes:
        l.join()
    end_time = time.time()
    time_took = end_time - start_time
    print(time_took)
I have the following code. I want to load the data into memory once and then run the function get_id in parallel. Currently, the data is loaded 8 times, which leads to a memory error. I am also happy about any hints on how to optimize the multiprocessing.
I use Python 3.8 on Windows, with 16 GB RAM and 8 CPUs.
import multiprocessing as mp
import os
import json
import datetime
from dateutil.relativedelta import relativedelta
import re
import time

NUM_CPUS = mp.cpu_count()

os.chdir(r'C:\Users\final_tweets_de')
directory = r'C:\Users\final_tweets_de'
path = r'C:\Users\final_tweets_de'

for file in os.listdir(directory):
    fh = open(os.path.join(path, file), 'r')
    if file == "SARS_CoV.json":
        with open(file, 'r', encoding='utf-8') as json_file:
            data_tweets = json.load(json_file)

def get_id(data_tweets):
    for i in range(0, len(data_tweets)):
        try:
            account = data_tweets[i]['user_screen_name']
            created = datetime.datetime.strptime(data_tweets[i]['date'], '%Y-%m-%d').date()
            until = created + relativedelta(days=10)
            id = data_tweets[i]['id']
            filename = re.search(r'(.*).json', file).group(1) + '_' + 'tweet_id_' + str(id) + '_' + 'user_id_' + str(data_tweets[i]['user_id'])
            try:
                os.system('snscrape twitter-search "(to:'+account+') since:'+created.strftime("%Y-%m-%d")+' until:'+until.strftime("%Y-%m-%d")+' filter:replies" >C:\\Users\\Antworten\\antworten_SARS_CoV.json\\'+filename)
            except:
                continue
        except:
            Exception: logging.exception("f(%r) failed" % (args,))

if __name__ == "__main__":
    pool = mp.Pool(NUM_CPUS)
    get_id(data_tweets)
    pool.close()
    pool.join()
Update
After the comment from @tdelaney, I split the data into smaller pieces and have had no memory errors so far. But the cores are still not fully utilized; the workload is only around 20 percent.
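As an aside on why the data was loaded 8 times: on Windows, multiprocessing spawns fresh interpreters that re-run the module-level code, so a json.load at the top of the script executes again in every worker. A minimal sketch of the chunked pattern (my own illustration, not code from the question; the chunk size and the process_chunk body are placeholders) keeps the load under the __main__ guard and hands the chunks to pool.map so the workers actually receive the work:

import json
import multiprocessing as mp

CHUNK_SIZE = 500  # assumed value; tune so each chunk stays small in memory

def process_chunk(tweets):
    # Placeholder for the per-tweet work (e.g. building the snscrape call).
    return [t.get('id') for t in tweets]

def chunks(seq, size):
    # Yield successive slices of seq with at most size elements each.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

if __name__ == "__main__":
    with open("SARS_CoV.json", "r", encoding="utf-8") as f:
        data_tweets = json.load(f)  # loaded once, in the parent process only
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(process_chunk, chunks(data_tweets, CHUNK_SIZE))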
I am a beginner with Python and am trying to add a few lines of code to convert JSON to CSV and back to JSON. I have thousands of files (300 MB in size) to be converted and processed. With the current program (using 1 CPU), I am not able to use the server's 16 CPUs and need suggestions to fine-tune the program for multiprocessing. Below is my code, using Python 3.7.
import json
import csv
import os

os.chdir('/stagingData/Scripts/test')

for JsonFile in os.listdir(os.getcwd()):
    PartialFileName = JsonFile.split('.')[0]

    j = 1
    with open(PartialFileName + ".csv", 'w', newline='') as Output_File:
        with open(JsonFile) as fileHandle:
            i = 1
            for Line in fileHandle:
                try:
                    data = json.loads(Line, parse_float=str)
                except:
                    print("Can't load line {}".format(i))
                if i == 1:
                    header = data.keys()
                    output = csv.writer(Output_File)
                    output.writerow(header)  # writes header row
                    i += 1
                output.writerow(data.values())  # writes values row
                j += 1
I'd appreciate suggestions on the multiprocessing logic.
If you have a single big file that you want to process more effectively, I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

source_big_file = Path('/path/to/file')

def chunk_file_by_line(source_filepath: Path, chunk_line_size: int = 10_000):
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    with source_filepath.open('r', encoding='utf8') as big:
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_line_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    # the previous chunk is complete: close it and hand it out
                    intermediate_file_handlers[last_chunk_filepath].close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # output the last chunk
    if last_chunk_filepath:
        intermediate_file_handlers[last_chunk_filepath].close()
        yield last_chunk_filepath

def json_to_csv(json_filepath: Path) -> Path:
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    with csv_filepath.open('w', encoding='utf8', newline='') as csv_out, \
            json_filepath.open('r', encoding='utf8') as json_in:
        dwriter = None
        for json_line in json_in:
            data = json.loads(json_line)
            if dwriter is None:
                # create the header record from the first line's keys
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
            dwriter.writerow(data)
    return csv_filepath

if __name__ == '__main__':
    with ProcessPoolExecutor() as pool:
        futures = []
        for chunk_filepath in chunk_file_by_line(source_big_file):
            future = pool.submit(json_to_csv, chunk_filepath)
            futures.append(future)
        # wait for all to finish
        for future in futures:
            csv_filepath = future.result(timeout=None)  # waits until complete
            print(f'conversion complete> csv filepath: {csv_filepath}')
Since you have many files, the simplest multiprocessing example from the documentation should work for you. https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
from multiprocessing import Pool
import os

def f(JsonFile):
    pass  # open the input and output files here and convert

if __name__ == '__main__':
    with Pool(16) as p:
        p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
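For instance, a lazy variant of the same idea using os.scandir() might look like this (a sketch of mine, not from the original answer; the .json filter and chunksize are assumptions):

import os
from multiprocessing import Pool

def f(json_file):
    pass  # open the input and output files here and convert

if __name__ == '__main__':
    with Pool(16) as p, os.scandir(os.getcwd()) as entries:
        names = (entry.name for entry in entries if entry.name.endswith('.json'))
        # imap_unordered pulls directory entries lazily instead of listing them all up front
        for _ in p.imap_unordered(f, names, chunksize=32):
            pass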
I have to parse 30 days of access logs from the server based on client IP and accessed hosts, and I need to know the top 10 accessed sites. The log file will be around 10-20 GB in size, which takes a lot of time for single-threaded execution of the script. Initially, I wrote a script which was working fine, but it takes a lot of time due to the large log file size. Then I tried to use the multiprocessing library for parallel processing, but it is not working: it seems the multiprocessing implementation is repeating tasks instead of processing in parallel. I am not sure what is wrong in the code. Can someone please help with this? Thank you so much in advance for your help.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing

def ipauth(slave_list, static_ip_list):
    file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
    count = 1
    while (count <= 30):
        Nth_days = datetime.now() - timedelta(days=count)
        date = Nth_days.strftime("%Y%m%d")
        yr_month = Nth_days.strftime("%Y/%m")
        file_name = 'local2' + '.' + date
        with open(slave_list) as file:
            for line in file:
                string = line.split()
                slave = string[0]
                proxy = string[1]
                log_path = "/LOGS/%s/%s" % (slave, yr_month)
                try:
                    os.path.exists(log_path)
                    file_read = os.path.join(log_path, file_name)
                    with open(file_read) as log:
                        for log_line in log:
                            log_line = log_line.strip()
                            if proxy in log_line:
                                file_record.write(log_line + '\n')
                except IOError:
                    pass
        count = count + 1

    file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
    with open(static_ip_list) as ip:
        for line in ip:
            with open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'r') as f:
                for content in f:
                    log_split = content.split()
                    client_ip = log_split[7]
                    if client_ip in line:
                        content = str(content).strip()
                        file_log.write(content + '\n')
    return

if __name__ == '__main__':
    slave_list = sys.argv[1]
    static_ip_list = sys.argv[2]
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
        jobs.append(p)
        p.start()
        p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing

def chunk_of_lines(fp, n):
    # read n lines from the file
    # then yield them
    pass

def process(lines, static_ip_list):
    pass  # do stuff with a chunk of lines

# slave_list and static_ip_list come from sys.argv, as in the original script
p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp, 10):
    p.apply_async(process, [f, static_ip_list])
p.close()
p.join()  # Wait for all child processes to finish.
There are many ways to implement the chunk_of_lines method: you could iterate over the file's lines with a simple for loop, or do something more advanced like calling fp.read().
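For instance, a minimal chunk_of_lines based on itertools.islice could look like this (my own sketch of one possible implementation, not code from the answer):

from itertools import islice

def chunk_of_lines(fp, n):
    # Yield lists of up to n lines until the file is exhausted.
    while True:
        chunk = list(islice(fp, n))
        if not chunk:
            break
        yield chunk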