File processing using multiprocessing - Python

I am a beginner with Python and am trying to write a few lines of code to convert JSON to CSV and back to JSON. I have thousands of files (about 300 MB each) to be converted and processed. With the current program (using 1 CPU), I am not able to use the server's 16 CPUs and need suggestions on fine-tuning the program for multiprocessing. Below is my code, running on Python 3.7.
import json
import csv
import os

os.chdir('/stagingData/Scripts/test')

for JsonFile in os.listdir(os.getcwd()):
    PartialFileName = JsonFile.split('.')[0]

    j = 1
    with open(PartialFileName + ".csv", 'w', newline='') as Output_File:
        with open(JsonFile) as fileHandle:
            i = 1
            for Line in fileHandle:
                try:
                    data = json.loads(Line, parse_float=str)
                except:
                    print("Can't load line {}".format(i))
                if i == 1:
                    header = data.keys()
                    output = csv.writer(Output_File)
                    output.writerow(header)  # Writes header row
                    i += 1
                output.writerow(data.values())  # Writes values row
                j += 1
I would appreciate suggestions on the multiprocessing logic.

If you have a single big file that you want to process more effectively, I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

source_big_file = Path('/path/to/file')

def chunk_file_by_line(source_filepath: Path, chunk_size: int = 10_000):
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    with source_filepath.open('r', encoding='utf8') as big:
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    last_file_handler = intermediate_file_handlers[last_chunk_filepath]
                    last_file_handler.close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # close and output the last chunk
    if last_chunk_filepath:
        intermediate_file_handlers[last_chunk_filepath].close()
        yield last_chunk_filepath

def json_to_csv(json_filepath: Path) -> Path:
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    with csv_filepath.open('w', encoding='utf8', newline='') as csv_out, json_filepath.open('r', encoding='utf8') as json_in:
        headers_written = False
        dwriter = None
        for json_line in json_in:
            data = json.loads(json_line)
            if not headers_written:
                # create the writer from the first line's keys and write the header record
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
                headers_written = True
            dwriter.writerow(data)
    return csv_filepath

if __name__ == '__main__':
    with ProcessPoolExecutor() as pool:
        futures = []
        for chunk_filepath in chunk_file_by_line(source_big_file):
            future = pool.submit(json_to_csv, chunk_filepath)
            futures.append(future)
        # wait for all to finish
        for future in futures:
            csv_filepath = future.result(timeout=None)  # waits until complete
            print(f'conversion complete> csv filepath: {csv_filepath}')
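The third step from the list above, merging the processed chunks back into one file, isn't shown; a minimal sketch (assuming each chunk CSV carries its own header row, as json_to_csv produces) could look like this:

from pathlib import Path

def merge_csv_chunks(chunk_csv_paths, merged_filepath: Path):
    # Concatenate the chunk CSVs, keeping the header row only from the first one.
    with merged_filepath.open('w', encoding='utf8') as merged:
        for index, chunk_path in enumerate(chunk_csv_paths):
            with chunk_path.open('r', encoding='utf8') as chunk:
                if index > 0:
                    next(chunk, None)  # skip the repeated header
                for line in chunk:
                    merged.write(line)

# e.g. merge_csv_chunks([f.result() for f in futures], source_big_file.with_suffix('.csv'))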

Since you have many files, the simplest multiprocessing example from the documentation should work for you. https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
from multiprocessing import Pool
import os

def f(JsonFile):
    # open the input and output files and convert one file
    ...

if __name__ == '__main__':
    with Pool(16) as p:
        p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
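A minimal sketch of that scandir variant (the convert worker and the .json filter are placeholders, not from the question; the folder path is the one from the question):

from multiprocessing import Pool
import os

def convert(json_path):
    # hypothetical worker: read the JSON file, write the CSV next to it
    ...

if __name__ == '__main__':
    folder = '/stagingData/Scripts/test'
    with Pool(16) as p:
        entries = (e.path for e in os.scandir(folder)
                   if e.is_file() and e.name.endswith('.json'))
        # imap_unordered pulls from the generator lazily, so workers can start
        # before the whole directory listing has been materialized
        for _ in p.imap_unordered(convert, entries):
            pass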

Related

How to parse a zipped file with multiprocessing?

I have a huge zip file with a large number of files. Parsing all these files takes a lot of time, so I thought about using multiprocessing to speed things up. I am not sure how to approach it, as a zipfile.ZipFile in Python is not an iterable.
I am aware that I could extract all the contents from the zip file and then iterate over the list of filenames; however, I'd prefer not to need extra free space to hold the extracted data and would like to operate on the ZipFile directly.
Maybe there is another solution to this problem, so I am open to suggestions.
EDIT:
The code below technically works, but the problem is that each time the get_content() function runs, the large zip file I have seems to be opened again, ultimately taking as long as 15 seconds to reach each file.
import multiprocessing
from zipfile import ZipFile
from multiprocessing import Pool
import time

path = 'zipfile.zip'

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    start_time = time.time()
    with ZipFile(path, 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    end_time = time.time()
    print(f"It took {end_time - start_time} to open this file")
    return content

def parse_files():
    file_list = get_file_list(path)
    with Pool(multiprocessing.cpu_count()) as p:
        contents = p.map(get_content, file_list)
    print(contents)

parse_files()
import os
import shutil
from zipfile import ZipFile
from multiprocessing import Pool

def create_dummy_zip():
    os.mkdir("dummy")
    for i in range(100):
        with open(f"dummy/{i}.file", "w") as f:
            f.write(f"Content: {i}")
    shutil.make_archive("dummy", 'zip', "dummy")
    shutil.rmtree('dummy')

def delete_dummy():
    try:
        os.remove("dummy.zip")
        shutil.rmtree('dummy')
    except:
        pass

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    with ZipFile("dummy.zip", 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    return content

if __name__ == '__main__':
    try:
        create_dummy_zip()
        file_list = get_file_list("dummy.zip")
        with Pool(5) as p:
            contents = p.map(get_content, file_list)
        print(contents)
        delete_dummy()
    except:
        delete_dummy()
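If reopening the archive inside every get_content() call is what costs the 15 seconds mentioned in the question, one possible variation (a sketch, not part of the answer above) is to open the ZipFile once per worker process via a Pool initializer and reuse it:

import multiprocessing
from multiprocessing import Pool
from zipfile import ZipFile

path = 'zipfile.zip'
worker_zip = None  # one open ZipFile per worker process

def init_worker(zip_path):
    # Runs once in each worker when the pool starts.
    global worker_zip
    worker_zip = ZipFile(zip_path, 'r')

def get_content(file_name):
    with worker_zip.open(file_name) as f:
        return f.read()

def parse_files():
    with ZipFile(path, 'r') as zipObj:
        file_list = zipObj.namelist()
    with Pool(multiprocessing.cpu_count(), initializer=init_worker, initargs=(path,)) as p:
        contents = p.map(get_content, file_list)
    print(contents)

if __name__ == '__main__':
    parse_files()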

How to concatenate a newly added file to pandas dataframe?

I am trying to write a script that will grab a newly added CSV file from a folder and append it to one big file. Basically, I want all of the CSV files added to a particular folder to end up stored in one resulting CSV file. I have the code below, which generates the list of files, and I am selecting the newly added file there:
import os
import time

def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                fs = os.path.getsize(fp)
                total_size += fs
                if f in fh:
                    if fh[f] == fs:
                        # file unchanged
                        pass
                    else:
                        if changed_cb:
                            changed_cb(fp)
                else:
                    # new file
                    if new_cb:
                        new_cb(fp)
                fh[f] = fs
    return total_size

def new_file(fp):
    print("New File {0}!".format(fp))

def changed_file(fp):
    print("File {0} changed!".format(fp))

if __name__ == '__main__':
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/Users/.../Desktop/files', new_file, changed_file)
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
            total = nt
        time.sleep(200)
        print("File list:\n{0}".format(file_history))
        print(list(dict.keys(file_history))[-1])
I don't really know how to create the empty pandas data frame to which the latest added file will be appended on a regular basis (that's why I have the time.sleep there). In the end, I want one big CSV file with all of the files added to it.
Please, help :(
P.S. I am new to Python, so please don't judge if it is super simple..
Are you going to be using Pandas to process the data in the csv or only to concatenate the files?
If you simply want to append each CSV file to the big one, then why not use plain Python file I/O for speed and simplicity, assuming that all the CSV files use the same formatting.
I have updated the new_file method to append to the big CSV using file I/O. I have added an append_pandas function, which is not used but should help you if you must use pandas to do the job. I haven't tested the pandas function; there are more things to consider, like the format of the CSV files. Check out the documentation for more details.
import os
import time
import pandas

def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None, **kwargs):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                fs = os.path.getsize(fp)
                total_size += fs
                if f in fh:
                    if fh[f] == fs:
                        # file unchanged
                        pass
                    else:
                        if changed_cb:
                            changed_cb(fp, **kwargs)
                else:
                    # new file
                    if new_cb:
                        new_cb(fp, **kwargs)
                fh[f] = fs
    return total_size

def is_csv(f):
    # you can add more to check here
    return 'csv' in f

def append_csv(s, d, skip_header=1):
    with open(s, 'r') as readcsv:
        with open(d, 'a') as appendcsv:
            for line in readcsv:
                if skip_header < 1:
                    appendcsv.write(line)
                else:
                    skip_header -= 1
            if "\n" not in line:
                appendcsv.write("\n")

def append_pandas(s, d):
    # I haven't tested this
    pd = pandas.read_csv(s)
    pdb = pandas.read_csv(d)
    newpd = pdb.append(pd)
    newpd.to_csv(d)

def new_file(fp, **kwargs):
    if is_csv(fp):
        print("Appending {0}!".format(fp))
        bcsv = kwargs.get('append_to_csv', '/default/path/to/big.csv')
        skip = kwargs.get('skip_header', 1)
        append_csv(fp, bcsv, skip)

def changed_file(fp, **kwargs):
    print("File {0} changed!".format(fp))

if __name__ == '__main__':
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/tmp/test/', new_file, changed_file, append_to_csv='/tmp/big.csv', skip_header=1)
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
            total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))
I think that pandas.concat() is what you are looking for.
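A minimal sketch of that suggestion (the glob pattern and output path are assumptions, not from the question):

import glob
import pandas as pd

# Read every CSV in the watched folder and stack them into a single frame.
csv_files = sorted(glob.glob('/Users/.../Desktop/files/*.csv'))  # placeholder pattern
frames = (pd.read_csv(f) for f in csv_files)
big = pd.concat(frames, ignore_index=True)
big.to_csv('/Users/.../Desktop/big.csv', index=False)  # placeholder output path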

How to extract one column to another file from a 300GB file

The problem is the huge amount of data, and I have to do it on my personal laptop with 12 GB of RAM. I tried a loop that handles 1M lines per round and used csv.writer, but csv.writer wrote only about 1M lines every two hours. So, are there any other approaches worth trying?
lines = 10000000
for i in range(0, 330):
    list_str = []
    with open(file, 'r') as f:
        line_flag = 0
        for _ in range(i * lines):
            next(f)
        for line in f:
            line_flag = line_flag + 1
            data = json.loads(line)['name']
            if data != former_str:
                list_str.append(data)
                former_str = data
            if line_flag == lines:
                break
    with open(self.path + 'data_range\\names.csv', 'a', newline='') as writeFile:
        writer = csv.writer(writeFile, delimiter='\n')
        writer.writerow(list_str)
    writeFile.close()
Another version:
def read_large_file(f):
    block_size = 200000000
    block = []
    for line in f:
        block.append(line[:-1])
        if len(block) == block_size:
            yield block
            block = []
    if block:
        yield block

def split_files():
    with open(write_file, 'r') as f:
        i = 0
        for block in read_large_file(f):
            print(i)
            file_name = write_name + str(i) + '.csv'
            with open(file_name, 'w', newline='') as f_:
                writer = csv.writer(f_, delimiter='\n')
                writer.writerow(block)
            i += 1
This was after it read a block and was writing it out ... I wonder why the rate of data transmission stayed at about 0.
It should be as simple as this:
import json
import csv

with open(read_file, 'rt') as r, open(write_file, 'wt', newline='') as w:
    writer = csv.writer(w)
    for line in r:
        writer.writerow([json.loads(line)['name']])
I tried putting the loop inside the file block, but it always gives me an error. I guessed we cannot write data into another file while the first file is open?
You totally can write data in one file while reading another. I can't tell you more about your error until you post what it said, though.
There was a bit in your code about former_str which is not covered under "extract one column", so I did not write anything about it.
Would something like this work?
Essentially, it uses a generator to avoid reading the entire file into memory, and writes the data one line at a time.
import jsonlines  # pip install jsonlines
from typing import Generator

def gen_lines(file_path: str, col_name: str) -> Generator[str, None, None]:
    with jsonlines.open(file_path) as f:
        for obj in f:
            yield obj[col_name]

# Here you could also switch to writing a jsonlines file again
with open(output_file, "w") as out:
    for item in gen_lines(your_file_path, col_name_to_extract):
        out.write(f"{item}\n")

Changing the orientation of a reverse sequence in a FASTA file doesn't work

I am trying to get the reverse sequences oriented correctly in a file. This is the code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord

def main(in_file):
    out_file = "%s.fa" % os.path.splitext(in_file)[0]
    with open(out_file, "w") as out_handle:
        # Write records from the BAM file one at a time to the output file.
        # Works lazily as BAM sequences are read so will handle large files.
        SeqIO.write(bam_to_rec(in_file), out_handle, "fasta")

def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords."""
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        if read.is_reverse:
            seq = seq.reverse_complement()
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

if __name__ == "__main__":
    main(*sys.argv[1:])
When I print out the reverse sequences, the code works, but in the output file the sequence is still written in the reverse orientation. Can anyone help me find out what is going wrong?
Here is the link to my infile:
https://www.dropbox.com/sh/68ui8l7nh5fxatm/AABUr82l01qT1nL8I_XgJaeTa?dl=0
Note the ugly counter is just there to limit the output to 10,000 sequences, comparing one file that never reverses with one that reverses when needed.
Here's the output on a couple of sequences; feel free to test it. I think your issue is that yield returns an iterator but you are not iterating over it, unless I am misunderstanding what you are doing:
Original:
SOLEXA-1GA-2:2:93:1281:961#0
GGGTTAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG
Becomes:
SOLEXA-1GA-2:2:93:1281:961#0
CTAACCCTAACCCTAACCCTAACCCTAACCTAACCC
And if not reverse:
Original:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Becomes:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Here's my code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord

def main(in_file):
    out_file = "%s.fa" % os.path.splitext(in_file)[0]
    with open('test_non_reverse.txt', 'w') as non_reverse:
        with open(out_file, "w") as out_handle:
            # Write records from the BAM file one at a time to the output file.
            # Works lazily as BAM sequences are read so will handle large files.
            i = 0
            for s in bam_to_rec(in_file):
                if i == 10000:
                    break
                i += 1
                SeqIO.write(s, out_handle, "fasta")
            i = 0
            for s in convert_to_seq(in_file):
                if i == 10000:
                    break
                i += 1
                SeqIO.write(s, non_reverse, 'fasta')

def convert_to_seq(in_file):
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords."""
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        if read.is_reverse:
            seq = seq.reverse_complement()
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

if __name__ == "__main__":
    main(*sys.argv[1:])

How to read lines from arbitrary BZ2 streams for CSV?

The bz2 module provides a standard open() method from which one can call readline(). However, my situation is one where I have a stream (pointing to a large amount of data) that I want to decompress lines from on the fly. My current implementation is as follows, but I know there must be a more succinct way to do this.
import bz2
import csv

BZ2_BUFFER = ''
BZ2_DECOMPRESSOR = None
BZ2_FILE = None
BZ2_READ_SIZE = 100 * 1024

def bz2_csv_rows(fp):
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    BZ2_BUFFER = ''
    BZ2_DECOMPRESSOR = bz2.BZ2Decompressor()
    BZ2_FILE = fp
    for row in csv.reader(iter(bz2_line_reader, b'')):
        yield row

def bz2_line_reader():
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
    if BZ2_BUFFER is None:
        return None
    while '\n' not in BZ2_BUFFER:
        bindata = BZ2_FILE.read(BZ2_READ_SIZE)
        try:
            data = BZ2_DECOMPRESSOR.decompress(bindata)
        except EOFError:
            break
        except IOError:
            pass
        BZ2_BUFFER += data
        if len(data) < BZ2_READ_SIZE:
            BZ2_FILE = None
            break
    i = BZ2_BUFFER.find('\n')
    if i is None or i < 0:
        line = BZ2_BUFFER
        BZ2_BUFFER = None
        return line
    line = BZ2_BUFFER[:i]
    BZ2_BUFFER = BZ2_BUFFER[i + 1:]
    return line
Thoughts?
Here's something that's a little more succinct, and (in my opinion) it's more readable and gets rid of all those nasty global variables your code uses:
import bz2
import csv
from functools import partial

class BZ2_CSV_LineReader(object):
    def __init__(self, filename, buffer_size=4*1024):
        self.filename = filename
        self.buffer_size = buffer_size

    def readlines(self):
        with open(self.filename, 'rb') as file:
            for row in csv.reader(self._line_reader(file)):
                yield row

    def _line_reader(self, file):
        buffer = ''
        decompressor = bz2.BZ2Decompressor()
        reader = partial(file.read, self.buffer_size)
        for bindata in iter(reader, b''):
            block = decompressor.decompress(bindata).decode('utf-8')
            buffer += block
            if '\n' in buffer:
                lines = buffer.splitlines(True)
                if lines:
                    buffer = '' if lines[-1].endswith('\n') else lines.pop()
                    for line in lines:
                        yield line

if __name__ == '__main__':
    bz2_csv_filename = 'test_csv.bz2'
    for row in BZ2_CSV_LineReader(bz2_csv_filename).readlines():
        print(row)
Maybe it'll be useful: I use Python 3 and I have a large csv.bz2 file.
I handle it this way:
import bz2
import csv

def bz2_csv_rows(fp):
    with bz2.open(fp, mode='rt', newline='') as bzfp:
        for row in csv.reader(bzfp):
            yield row
The key feature is to open the stream in text mode (mode='rt' in the call to bz2.open()) instead of manually searching for "\n" in binary mode. But I'm not sure this will work for non-physical files.
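For what it's worth, bz2.open() also accepts an already open binary file object as its first argument, so the same approach may carry over to non-file streams; a small sketch, with an in-memory BytesIO standing in for an arbitrary stream:

import bz2
import csv
import io

def bz2_csv_rows(stream):
    # bz2.open() takes a binary file object as well as a filename
    with bz2.open(stream, mode='rt', newline='') as bzfp:
        for row in csv.reader(bzfp):
            yield row

# In-memory stream standing in for an arbitrary binary source
compressed = io.BytesIO(bz2.compress(b'a,b,c\n1,2,3\n'))
for row in bz2_csv_rows(compressed):
    print(row)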
