I have a few text files, each containing some numbers, one per line. I read each file and display the total of its numbers in the function.
from Queue import Queue
from threading import Thread
import os

enclosure_queue = Queue()
list = []

def getAllFiles():
    for root, dirs, files in os.walk("C:/Users/test"):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                list.append(file_path)

def calc(i, q):
    while True:
        file = q.get()
        fileData = open(file, 'r')
        add = 0
        for line in fileData:
            add = add + int(line)
        print str(add) + '\n'
        q.task_done()

getAllFiles()
num_fetch_threads = len(list)

for i in range(num_fetch_threads):
    worker = Thread(target=calc, args=(i, enclosure_queue,))
    worker.setDaemon(True)
    worker.start()

for ind_file in list:
    enclosure_queue.put(ind_file)

enclosure_queue.join()
It displays the sum of the numbers in each individual file, but I need to add up those per-file results.
For example, if calc produces 300, 200 and 500, I want the final result to be 1000. I thought of appending each result to a list and then summing it in another function. Is there a better solution?
There is no need to use a Queue here. Use multiprocessing.Pool.map and change your calc method accordingly. threading.Thread does not return results, whereas multiprocessing.Pool.map does.
import multiprocessing
import os

def getAllFiles():
    my_files = list()
    for root, dirs, files in os.walk("C:/Users/test"):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                my_files.append(file_path)
    return my_files

def calc(file):
    with open(file, 'r') as f:
        return sum(map(int, f.readlines()))

if __name__ == '__main__':
    my_files = getAllFiles()
    num_fetch_threads = len(my_files)
    pool = multiprocessing.Pool(processes=num_fetch_threads)
    results = pool.map(calc, my_files)
    result = sum(results)
    print result
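One design note: num_fetch_threads = len(my_files) creates one worker process per file, which can be excessive for a folder with many files. A minimal variation (just a sketch, assuming the getAllFiles and calc functions above) caps the pool at the CPU count instead:

import multiprocessing

if __name__ == '__main__':
    my_files = getAllFiles()
    # one worker per CPU is usually enough; the pool reuses workers across files
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    total = sum(pool.map(calc, my_files))
    pool.close()
    pool.join()
    print(total)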
I have a huge zip file with a large number of files. Parsing all of these files takes a lot of time, so I thought about using multiprocessing to speed things up. I am not sure how to approach it, as a zipfile.ZipFile in Python is not iterable.
I am aware that I could extract all of the contents from the zip file and then iterate over the list of filenames, but I'd prefer not to have to keep extra free space to hold the extracted data and would like to operate on the ZipFile directly.
Maybe there is some other solution to this problem, so I am open to suggestions.
EDIT:
Using the code below technically works, but the problem is that each time the get_content() function runs, the large zip file seems to be opened again, so it ultimately takes as long as 15 seconds to reach each file.
import multiprocessing
from zipfile import ZipFile
from multiprocessing import Pool
import time

path = 'zipfile.zip'

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    start_time = time.time()
    with ZipFile(path, 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    end_time = time.time()
    print(f"It took {end_time - start_time} to open this file")
    return content

def parse_files():
    file_list = get_file_list(path)
    with Pool(multiprocessing.cpu_count()) as p:
        contents = p.map(get_content, file_list)
    print(contents)

parse_files()
import os
import shutil
from zipfile import ZipFile
from multiprocessing import Pool

def create_dummy_zip():
    os.mkdir("dummy")
    for i in range(100):
        with open(f"dummy/{i}.file", "w") as f:
            f.write(f"Content: {i}")
    shutil.make_archive("dummy", 'zip', "dummy")
    shutil.rmtree('dummy')

def delete_dummy():
    try:
        os.remove("dummy.zip")
        shutil.rmtree('dummy')
    except:
        pass

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        return listOfiles

def get_content(file_name):
    with ZipFile("dummy.zip", 'r') as zipObject:
        with zipObject.open(file_name) as file:
            content = file.read()
    return content

if __name__ == '__main__':
    try:
        create_dummy_zip()
        file_list = get_file_list("dummy.zip")
        with Pool(5) as p:
            contents = p.map(get_content, file_list)
        print(contents)
        delete_dummy()
    except:
        delete_dummy()
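The 15-second delay described in the edit comes from reopening the large archive inside every get_content call. One way around that (a sketch of my own, not part of the answer above, reusing the dummy.zip archive from the example) is to open the ZipFile once per worker process via a Pool initializer, so each worker keeps the archive open for its whole lifetime:

from multiprocessing import Pool
from zipfile import ZipFile

zip_object = None  # one open archive per worker process

def init_worker(zip_path):
    # runs once in each worker; the archive stays open for the worker's lifetime
    global zip_object
    zip_object = ZipFile(zip_path, 'r')

def get_file_list(zip_path):
    with ZipFile(zip_path, 'r') as zip_obj:
        return zip_obj.namelist()

def get_content(file_name):
    # no ZipFile(...) here: reuse the archive opened by init_worker
    with zip_object.open(file_name) as f:
        return f.read()

if __name__ == '__main__':
    file_list = get_file_list("dummy.zip")
    with Pool(5, initializer=init_worker, initargs=("dummy.zip",)) as p:
        contents = p.map(get_content, file_list)
    print(contents)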
I am trying to write a script that grabs a newly added csv file from a folder and appends it to one big file. Basically, I want all of the csv files added to a particular folder to end up in one resulting csv file. I have the code below, which generates the list of files, and in which I select the newly added file:
import os
import time

def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                fs = os.path.getsize(fp)
                total_size += fs
                if f in fh:
                    if fh[f] == fs:
                        # file unchanged
                        pass
                    else:
                        if changed_cb:
                            changed_cb(fp)
                else:
                    # new file
                    if new_cb:
                        new_cb(fp)
                fh[f] = fs
    return total_size

def new_file(fp):
    print("New File {0}!".format(fp))

def changed_file(fp):
    print("File {0} changed!".format(fp))

if __name__ == '__main__':
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/Users/.../Desktop/files', new_file, changed_file)
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
        total = nt
        time.sleep(200)
        print("File list:\n{0}".format(file_history))
        print(list(file_history.keys())[-1])
I don't really know how to create the empty pandas data frame to which the latest added file would be appended on a regular basis (that's why I have the time.sleep there). In the end I want one big csv file with all of the files appended to it.
Please, help :(
P.S. I am new to Python, so please don't judge if it is super simple.
Are you going to be using pandas to process the data in the csv files, or only to concatenate them?
If you simply want to append each csv file to the big one, then why not use plain Python io for speed and simplicity? That assumes all of the csv files use the same formatting.
I have updated the new_file method to append to the big csv using io, and added an append_pandas function, which is not used but should help if you must use pandas for the job. I haven't tested the pandas function; there are more things to consider, such as the exact format of the csv files. Check out the pandas documentation for more details.
import os
import time
# import pandas  # only needed if you use append_pandas below

def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None, **kwargs):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                fs = os.path.getsize(fp)
                total_size += fs
                if f in fh:
                    if fh[f] == fs:
                        # file unchanged
                        pass
                    else:
                        if changed_cb:
                            changed_cb(fp, **kwargs)
                else:
                    # new file
                    if new_cb:
                        new_cb(fp, **kwargs)
                fh[f] = fs
    return total_size

def is_csv(f):
    # you can add more checks here
    return 'csv' in f

def append_csv(s, d, skip_header=1):
    with open(s, 'r') as readcsv:
        with open(d, 'a') as appendcsv:
            for line in readcsv:
                if skip_header < 1:
                    appendcsv.write(line)
                else:
                    skip_header -= 1
            if not "\n" in line:
                appendcsv.write("\n")

def append_pandas(s, d):
    # i haven't tested this
    new_frame = pandas.read_csv(s)
    big_frame = pandas.read_csv(d)
    big_frame = big_frame.append(new_frame)
    big_frame.to_csv(d, index=False)

def new_file(fp, **kwargs):
    if is_csv(fp):
        print("Appending {0}!".format(fp))
        bcsv = kwargs.get('append_to_csv', '/default/path/to/big.csv')
        skip = kwargs.get('skip_header', 1)
        append_csv(fp, bcsv, skip)

def changed_file(fp, **kwargs):
    print("File {0} changed!".format(fp))

if __name__ == '__main__':
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/tmp/test/', new_file, changed_file, append_to_csv='/tmp/big.csv', skip_header=1)
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
        total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))
I think that pandas.concat() is what you are looking for
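For completeness, a minimal pandas.concat sketch (the folder and output paths below are placeholders, not taken from the question):

import glob
import pandas as pd

# read every csv currently in the watched folder and write one combined file;
# the paths are placeholders to adapt to your own layout
csv_files = sorted(glob.glob('/path/to/watched/folder/*.csv'))
combined = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
combined.to_csv('/path/to/big.csv', index=False)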
As mentioned, I have multiple txt files:
file1.txt
file2.txt
file3.txt
file4.txt
...
They're all in the same folder, and each file has a few lines. I want to merge them into one file using multithreading in Python.
Here is my code without threads:
import glob

filenames = glob.glob(DATA_DIR + '/*.txt')

with open('final.txt', 'w') as outputfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outputfile.write(line)
            outputfile.write('\n')
Here is a very simple example demonstrating the basic concept. You will have to play with what to do with the Lock() to get the result you need. In short, this code shows that the "threaded" version (use_thread = True) writes the output file in an inconsistent order compared to the "non-threaded" version (use_thread = False), where the result is always consistent.
import threading

use_thread = True

def read_and_write_file(file_name):
    if use_thread:
        print(threading.current_thread().name)
    # note: creating a new Lock() on every call does not actually synchronize anything;
    # experimenting with a shared lock is left to the reader, as mentioned above
    with threading.Lock() as lock:
        with open('file_all.txt', 'a') as out, open(file_name) as in_:
            out.writelines(in_)

if __name__ == '__main__':
    file_names = ['file1.txt', 'file2.txt', 'file3.txt']
    if use_thread:
        for file_name in file_names:
            thread = threading.Thread(target=read_and_write_file, args=(file_name,), name=file_name)
            thread.start()
    else:
        for file_name in file_names:
            read_and_write_file(file_name)
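Following the hint about the Lock: if the goal is consistent output rather than a demonstration of the race, one single module-level lock shared by all threads (a sketch, reusing the same example file names) serializes the appends:

import threading

write_lock = threading.Lock()  # one lock shared by every thread

def read_and_write_file(file_name):
    # only one thread appends to the output file at a time
    with write_lock:
        with open('file_all.txt', 'a') as out, open(file_name) as in_:
            out.writelines(in_)

if __name__ == '__main__':
    threads = [threading.Thread(target=read_and_write_file, args=(name,), name=name)
               for name in ['file1.txt', 'file2.txt', 'file3.txt']]
    for t in threads:
        t.start()
    for t in threads:
        t.join()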
I am a beginner in Python and am trying to add a few lines of code to convert json to csv and back to json. I have thousands of files (size 300 MB) to be converted and processed. With the current program (using 1 CPU), I am not able to use the server's 16 CPUs, and I need suggestions on fine-tuning the program for multiprocessing. Below is my code, written for Python 3.7.
import json
import csv
import os

os.chdir('/stagingData/Scripts/test')

for JsonFile in os.listdir(os.getcwd()):
    PartialFileName = JsonFile.split('.')[0]

    j = 1
    with open(PartialFileName + ".csv", 'w', newline='') as Output_File:
        with open(JsonFile) as fileHandle:
            i = 1
            for Line in fileHandle:
                try:
                    data = json.loads(Line, parse_float=str)
                except:
                    print("Can't load line {}".format(i))
                if i == 1:
                    header = data.keys()
                    output = csv.writer(Output_File)
                    output.writerow(header)  # writes header row
                    i += 1
                output.writerow(data.values())  # writes values row
    j += 1
I would appreciate suggestions on the multiprocessing logic.
If you have a single big file that you want to process more effectively, I suggest the following:
Split file into chunks
Create a process to process each chunk
(if necessary) merge the processed chunks back into a single file (a merge sketch follows the snippet below)
Something like this:
import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

source_big_file = Path('/path/to/file')

def chunk_file_by_line(source_filepath: Path, chunk_size: int = 10_000):
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    with source_filepath.open('r', encoding='utf8') as big:
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    # the previous chunk is complete: close it and hand it out
                    last_file_handler = intermediate_file_handlers[last_chunk_filepath]
                    last_file_handler.close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # output the last one
    yield last_chunk_filepath

def json_to_csv(json_filepath: Path) -> Path:
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    with csv_filepath.open('w', encoding='utf8', newline='') as csv_out, json_filepath.open('r', encoding='utf8') as json_in:
        dwriter = None
        for json_line in json_in:
            data = json.loads(json_line)
            if dwriter is None:
                # create the writer and the header record from the first line's keys
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
            dwriter.writerow(data)
    return csv_filepath

with ProcessPoolExecutor() as pool:
    futures = []
    for chunk_filepath in chunk_file_by_line(source_big_file):
        future = pool.submit(json_to_csv, chunk_filepath)
        futures.append(future)

    # wait for all to finish
    for future in futures:
        csv_filepath = future.result(timeout=None)  # waits until complete
        print(f'conversion complete> csv filepath: {csv_filepath}')
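Step 3 from the list above, merging the processed chunks back into one file, is not shown in the snippet. A minimal sketch of it, assuming csv_filepaths is a list of the paths collected from future.result() above:

# concatenate the per-chunk csv files into one output file, keeping the header
# only from the first chunk; "csv_filepaths" is an assumed name for the list of
# chunk csv paths gathered from the futures
merged_path = source_big_file.with_suffix('.csv')
with merged_path.open('w', encoding='utf8') as merged:
    for index, csv_filepath in enumerate(csv_filepaths):
        with csv_filepath.open('r', encoding='utf8') as chunk:
            lines = chunk.readlines()
            merged.writelines(lines if index == 0 else lines[1:])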
Since you have many files, the simplest multiprocessing example from the documentation should work for you: https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
from multiprocessing import Pool

def f(JsonFile):
    # open the input file, convert it, and write the output file
    ...

with Pool(16) as p:
    p.map(f, os.listdir(os.getcwd()))
You could also try replacing listdir with os.scandir(), which doesn't have to return all directory entries before starting.
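A sketch of that variant, assuming the conversion function f from the snippet above and that the input files carry a .json extension (the question does not say they do):

import os
from multiprocessing import Pool

if __name__ == '__main__':
    # scandir yields directory entries lazily instead of building the full list up front
    json_files = [entry.path for entry in os.scandir('/stagingData/Scripts/test')
                  if entry.is_file() and entry.name.endswith('.json')]
    with Pool(16) as p:
        p.map(f, json_files)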
I wrote this simple code for searching for strings in a collection of files. It works, but it is not optimal:
I need to hardcode the file names in a dictionary
some of my files are about 60 MB in size, so the search takes a while
Could someone please help me optimize my code to do the following:
read all files in a given directory without the need to hardcode the file names
parallelize the search for speed
write the search results to an output.txt file
my_file = {"File1.xml", "File2.xml", "File3.xml"}
my_string = {"John", "Mary", "Clara"}

for f in my_file:
    for s in my_string:
        with open(f) as fp:
            a = fp.read().count(s)
            fp.close()
        print f, ',', s, ',', a
Thank you
1. Read the files:
import os
from Queue import Queue  # queue.Queue on Python 3

files_queue = Queue()
for root, dirs, files in os.walk(start_path):
    for file in files:
        files_queue.put(os.path.join(root, file))  # queue the full path, not the bare name
2. Parallel search:
from Queue import Empty  # queue.Empty on Python 3
from threading import Thread

res_queue = Queue()
threads = []
# "words" is the collection of strings to search for (my_string in the question)

def search(files_queue, words, res_queue):
    while True:
        try:
            file = files_queue.get(block=False)
        except Empty:
            break
        with open(file) as fp:
            content = fp.read()
        results = {}
        for word in words:
            results[word] = content.count(word)
        res_queue.put(results)
        files_queue.task_done()

# use 10 workers
for _ in range(10):
    thread = Thread(target=search, args=(files_queue, words, res_queue))
    threads.append(thread)
    thread.start()
3. Collect results:
# wait until all files are processed
files_queue.join()

# collect the results from the queue
results = []
while not res_queue.empty():
    results.append(res_queue.get())
# profit
...
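And for the third requirement, writing the search results to an output.txt file, a small sketch that aggregates the per-file counts collected above (the per-file dicts do not carry file names, so this reports the total per word across all files):

from collections import Counter

# sum the per-file {word: count} dicts into one total per word
totals = Counter()
for per_file_counts in results:
    totals.update(per_file_counts)

with open('output.txt', 'w') as out:
    for word, count in totals.items():
        out.write('{0}, {1}\n'.format(word, count))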