Is it possible to do dbutils io asynchronously? - python

I've written some code (based on https://stackoverflow.com/a/40199652/529618) that writes partitioned data to blob, and for the most part it's quite quick. The slowest part is that the one CSV file per partition that I have Spark generate is named in a user-unfriendly way, so I do a simple rename operation to clean them up (and delete some excess files). This takes much longer than writing the data in the first place.
# Organize the data into folders matching the specified partitions, with a single CSV per partition
from datetime import datetime

def one_file_per_partition(df, path, partitions, sort_within_partitions, VERBOSE = False):
    extension = ".csv.gz"  # TODO: Support multiple extensions
    start = datetime.now()
    df.repartition(*partitions).sortWithinPartitions(*sort_within_partitions) \
        .write.partitionBy(*partitions).option("header", "true").option("compression", "gzip").mode("overwrite").csv(path)
    log(f"Wrote {get_df_name(df)} data partitioned by {partitions} and sorted by {sort_within_partitions} to:" +
        f"\n  {path}\n  Time taken: {(datetime.now() - start).total_seconds():,.2f} seconds")

    # Recursively traverse all partition subdirectories and rename + move the CSVs to their root
    # TODO: This is very slow, it should be parallelizable
    def traverse(root, remaining_partitions):
        if VERBOSE: log(f"Traversing partitions by {remaining_partitions[0]} within folder: {root}")
        for folder in list_subfolders(root):
            subdirectory = os.path.join(root, folder)
            if(len(remaining_partitions) > 1):
                traverse(subdirectory, remaining_partitions[1:])
            else:
                destination = os.path.join(root, folder[len(f"{remaining_partitions[0]}="):]) + extension
                if VERBOSE: log(f"Moving file\nFrom:{subdirectory}\n  To:{destination}")
                spark_output_to_single_file(subdirectory, destination, VERBOSE)

    log(f"Cleaning up spark output directories...")
    start = datetime.now()
    traverse(path, partitions)
    log(f"Moving output files to their destination took {(datetime.now() - start).total_seconds():,.2f} seconds")

# Convert a single-file spark output folder into a single file at the specified location, and clean up superfluous artifacts
def spark_output_to_single_file(output_folder, destination_path, VERBOSE = False):
    output_files = [x for x in dbutils.fs.ls(output_folder) if x.name.startswith("part-")]
    if(len(output_files) == 0):
        raise FileNotFoundError(f"Could not find any output files (prefixed with 'part-') in the specified spark output folder: {output_folder}")
    if(len(output_files) > 1):
        raise ValueError(f"The specified spark folder has more than 1 output file in the specified spark output folder: {output_folder}\n" +
                         f"We found {len(output_files)}: {[x.name for x in output_files]}\n" +
                         f"This function should only be used for single-file spark outputs.")
    dbutils.fs.mv(output_files[0].path, destination_path)
    # Clean up all the other spark output generated to our temp folder
    dbutils.fs.rm(output_folder, recurse=True)
    if VERBOSE: log(f"Successfully wrote {destination_path}")
Here is a sample output:
2022-04-22 20:36:45.313963 Wrote df_test data partitioned by ['Granularity', 'PORTINFOID'] and sorted by ['Rank'] to: /mnt/.../all_data_by_rank
Time taken: 19.31 seconds
2022-04-22 20:36:45.314020 Cleaning up spark output directories...
2022-04-22 20:37:42.583850 Moving output files to their destination took 57.27 seconds
I believe the reason is that I'm processing the folders sequentially, and if I could simply do it in parallel, it would go much quicker.
The problem is that all IO on Databricks is done with "dbutils", which abstracts away the mounted blob container and makes this sort of thing very easy. I just can't find any information about doing async IO with this utility, though.
Does anyone know how I could attempt to parallelize this activity?

The solution wound up being to abandon dbutils, which does not support parallelism in any way, and instead use os operations, which do:
import os
from datetime import datetime
from pyspark.sql.types import StringType

# Recursively traverse all partition subdirectories and rename + move the outputs to their root
# NOTE: The code to do this sequentially is much simpler, but very slow. The complexity arises from parallelising the file operations
def spark_output_to_single_file_per_partition(root, partitions, output_extension, VERBOSE = False):
    if VERBOSE: log(f"Cleaning up spark output directories...")
    start = datetime.now()

    # Helper to recursively collect information from all partitions and flatten it into a single list
    def traverse_partitions(root, partitions, fn_collect_info, currentPartition = None):
        results = [fn_collect_info(root, currentPartition)]
        return results if len(partitions) == 0 else results + \
            [result for subdir in [traverse_partitions(os.path.join(root, folder), partitions[1:], fn_collect_info, partitions[0]) for folder in list_subfolders(root)] for result in subdir]

    # Get the path of files to rename or delete. Note: We must convert to OS paths because we cannot parallelize use of dbutils
    def find_files_to_rename_and_delete(folder, partition):
        files = [x.name for x in dbutils.fs.ls(folder)]
        renames = [x for x in files if x[0:5] == "part-"]
        deletes = [f"/dbfs{folder}/{x}" for x in files if x[0:1] == "_"]
        if len(renames) > 0 and partition is None: raise Exception(f"Found {len(files)} partition file(s) in the root location: {folder}. Have files already been moved?")
        elif len(renames) > 1: raise Exception(f"Expected at most one partition file, but found {len(files)} in location: {folder}")
        elif len(renames) == 1: deletes.append(f"/dbfs{folder}/")  # The leaf folders (containing partitions) should be deleted after the file is moved
        return (deletes, None if len(renames) == 0 else (f"/dbfs{folder}/{renames[0]}", f"/dbfs{folder.replace(partition + '=', '')}{output_extension}"))

    # Scan the file system to find all files and folders that need to be moved and deleted
    if VERBOSE: log(f"Collecting a list of files that need to be renamed and deleted...")
    actions = traverse_partitions(root, partitions, find_files_to_rename_and_delete)

    # Rename all files in parallel using spark executors
    renames = [rename for (deletes, rename) in actions if rename is not None]
    if VERBOSE: log(f"Renaming {len(renames)} partition files...")
    spark.createDataFrame(renames, ['from', 'to']).foreach(lambda r: os.rename(r[0], r[1]))

    # Delete unwanted spark temp files and empty folders
    deletes = [path for (deletes, rename) in actions for path in deletes]
    delete_files = [d for d in deletes if d[-1] != "/"]
    delete_folders = [d for d in deletes if d[-1] == "/"]
    if VERBOSE: log(f"Deleting {len(delete_files)} spark outputs...")
    spark.createDataFrame(delete_files, StringType()).foreach(lambda r: os.remove(r[0]))
    if VERBOSE: log(f"Deleting {len(delete_folders)} empty folders...")
    spark.createDataFrame(delete_folders, StringType()).foreach(lambda r: os.rmdir(r[0]))

    log(f"Moving output files to their destination and cleaning spark artifacts took {(datetime.now() - start).total_seconds():,.2f} seconds")
This lets you generate partitioned data, with user-friendly names, and clean up all the spark temp files (_started..., _committed..., _SUCCESS) generated in the process.
Usage:
# Organize the data into folders matching the specified partitions, with a single CSV per partition
def dataframe_to_csv_gz_per_partition(df, path, partitions, sort_within_partitions, rename_spark_outputs = True, VERBOSE = False):
    start = datetime.now()
    # Write the actual data to disk using spark
    df.repartition(*partitions).sortWithinPartitions(*sort_within_partitions) \
        .write.partitionBy(*partitions).option("header", "true").option("compression", "gzip").mode("overwrite").csv(path)
    log(f"Wrote {get_df_name(df)} data partitioned by {partitions} and sorted by {sort_within_partitions} to:" +
        f"\n  {path}\n  Time taken: {(datetime.now() - start).total_seconds():,.2f} seconds")
    # Rename outputs and clean up
    spark_output_to_single_file_per_partition(path, partitions, ".csv.gz", VERBOSE)
For what it's worth, I also tried parallelizing with Pool, but the results were not as good. I haven't attempted importing and using any libraries that can do async I/O; I imagine that would perform best.
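For anyone who prefers not to use Spark executors for this, a plain thread pool over the same /dbfs-style OS paths should also work, since the operations are I/O-bound. A minimal sketch, assuming renames is the same list of (source, destination) /dbfs path tuples built inside spark_output_to_single_file_per_partition above (the helper name is hypothetical):
import os
from concurrent.futures import ThreadPoolExecutor

def rename_pair(pair):
    # Hypothetical helper: rename one (source, destination) pair of /dbfs paths
    src, dst = pair
    os.rename(src, dst)

with ThreadPoolExecutor(max_workers=32) as pool:
    # list() forces evaluation so any exception raised in a worker surfaces here
    list(pool.map(rename_pair, renames))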

Related

Is there a better way to do this? Counting Files, and directories via for loop vs map

Folks,
I'm trying to optimize this to help speed up the process...
What I am doing is creating a dictionary of scandir entries...
e.g.
fs_data = {}
for item in Path(fqpn).iterdir():
    # snipped out a bunch of normalization code
    fs_data[item.name.title().strip()] = item
{'file1': <file1 scandisk data>, etc}
and then later using a function to gather the count of files, and directories in the data.
Now I suspect that the new code, using map, could be optimized to be faster than the old code. I suspect the slowdown comes from having to run the list comprehension twice, once for files and once for directories.
But I can't think of a way to optimize it to only have to run once.
Can anyone suggest a way to sum the files, and directories at the same time in the new version? (I could fall back to the old code, if necessary)
But I might be over optimizing at this point?
Any feedback would be welcome.
def new_fs_counts(fs_entries) -> (int, int):
    """
    Quickly count the files vs directories in a list of scandir entries

    Used primarily by sync_database_disk to count a path's files & directories

    Parameters
    ----------
    fs_entries (list) - list of scandir entries

    Returns
    -------
    tuple - (# of files, # of dirs)
    """
    def counter(fs_entry):
        return (fs_entry.is_file(), not fs_entry.is_file())

    mapdata = list(map(counter, fs_entries.values()))
    files = sum(files for files, _ in mapdata)
    dirs = sum(dirs for _, dirs in mapdata)
    return (files, dirs)
vs
def old_fs_counts(fs_entries) -> (int, int):
    """
    Quickly count the files vs directories in a list of scandir entries

    Used primarily by sync_database_disk to count a path's files & directories

    Parameters
    ----------
    fs_entries (list) - list of scandir entries

    Returns
    -------
    tuple - (# of files, # of dirs)
    """
    files = 0
    dirs = 0
    for fs_item in fs_entries:
        is_file = fs_entries[fs_item].is_file()
        files += is_file
        dirs += not is_file
    return (files, dirs)
map is fast here if you map the is_file function directly:
files = sum(map(os.DirEntry.is_file, fs_entries.values()))
dirs = len(fs_entries) - files
(Something with filter might be even faster, at least if most entries aren't files. Or filter with is_dir if that works for you and most entries aren't directories. Or itertools.filterfalse with is_file. Or using itertools.compress. Also, counting True with list.count or operator.countOf instead of summing bools might be faster. But all of these ideas take more code (and some also memory). I'd prefer my above way.)
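For illustration, a couple of those alternatives might look roughly like this (a sketch, assuming fs_entries is the same dict of os.scandir entries as above):
import os
from itertools import filterfalse

# Count True values with list.count instead of summing bools
results = list(map(os.DirEntry.is_file, fs_entries.values()))
files = results.count(True)
dirs = len(results) - files

# Or keep only the non-files (directories) with filterfalse and count them
dirs = sum(1 for _ in filterfalse(os.DirEntry.is_file, fs_entries.values()))
files = len(fs_entries) - dirs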
Okay, map is definitely not the right answer here.
This morning I got up and created a test using timeit...
and it was a bit of a splash of reality to the face.
Without optimizations, new vs old, the new map code was roughly 2x the time.
New : 0.023185124970041215
old : 0.011841499945148826
I really ended up falling for a bit of clickbait, thinking that rewriting with map would gain some efficiency.
For the sake of completeness.
from timeit import timeit
import os
new = '''
def counter(fs_entry):
    files = fs_entry.is_file()
    return (files, not files)

mapdata = list(map(counter, fs_entries.values()))
files = sum(files for files, _ in mapdata)
dirs = sum(dirs for _, dirs in mapdata)
#dirs = len(fs_entries)-files
'''
#dirs = sum(dirs for _, dirs in mapdata)

old = '''
files = 0
dirs = 0
for fs_item in fs_entries:
    is_file = fs_entries[fs_item].is_file()
    files += is_file
    dirs += not is_file
'''

fs_location = '/Volumes/4TB_Drive/gallery/albums/collection1'
fs_data = {}
for item in os.scandir(fs_location):
    fs_data[item.name] = item

print("New : ", timeit(stmt=new, number=1000, globals={'fs_entries':fs_data}))
print("old : ", timeit(stmt=old, number=1000, globals={'fs_entries':fs_data}))
And while I was able to close the gap with some optimizations... (thank you Lee for your suggestion)
New : 0.10864979098550975
old : 0.08246175001841038
It is clear that the for loop solution is easier to read, faster, and just simpler.
The speed difference between new and old, doesn't seem to be map specifically.
The duplicate sum statement added .021, and the biggest slowdown was from the second fs_entry.is_file call, which added about .06 to the timings...

File retention mechanism in a large data storage

Recently I faced a performance problem with mp4 file retention. I have a kind of recorder which saves 1-minute-long mp4 files from multiple RTSP streams. Those files are stored on an external drive in a file tree like this:
./recordings/{camera_name}/{YYYY-MM-DD}/{HH-MM}.mp4
Apart from the video files, there are many other files on this drive which are not considered (unless they have the mp4 extension), as they take up much less space.
The file retention works as follows. Every minute, the Python script responsible for recording checks the external drive's fill level. If the level is above 80%, it scans the whole drive looking for .mp4 files. When the scan is done, it sorts the list of files by creation date and deletes as many of the oldest files as there are cameras.
The part of the code, which is responsible for files retention, is shown below.
total, used, free = shutil.disk_usage("/home")
used_percent = int(used / total * 100)
if used_percent > 80:
    logging.info("SSD usage %s. Looking for the oldest files", used_percent)
    try:
        oldest_files = sorted(
            (
                os.path.join(dirname, filename)
                for dirname, dirnames, filenames in os.walk('/home')
                for filename in filenames
                if filename.endswith(".mp4")
            ),
            key=lambda fn: os.stat(fn).st_mtime,
        )[:len(camera_devices)]
        logging.info("Removing %s", oldest_files)
        for oldest_file in oldest_files:
            os.remove(oldest_file)
            logging.info("%s removed", oldest_file)
    except ValueError as e:
        # no files to delete
        pass
(/home is the external drive's mount point)
The problem is that this mechanism used to work like a charm when I used a 256 or 512 GB SSD. Now I need more space (more cameras and longer storage time), and it takes a lot of time to build the file list on a larger SSD (2 to 5 TB now, maybe 8 TB in the future). The scanning process takes far more than 1 minute, which could be mitigated by running it less often and extending the "to delete" file list. The real problem is that the process itself generates a lot of CPU load (through I/O operations). The performance drop is visible across the whole system: other applications, like some simple computer vision algorithms, run slower, and the CPU load can even cause a kernel panic.
The HW I work on is Nvidia Jetson Nano and Xavier NX. Both devices have problem with performance as I described above.
The question is whether you know of any algorithms or out-of-the-box software for file retention that would work for the case I described. Or maybe there is a way to rewrite my code to make it more reliable and performant?
EDIT:
I was able to lower the os.walk() impact by limiting the space to check. Now I just scan /home/recordings and /home/recognition/, which also shortens the directory tree for the recursive scan. At the same time, I've added .jpg file checking, so now I look for both .mp4 and .jpg. The result is much better with this implementation.
However, I need further optimization. I prepared some test cases, and tested them on 1 TB drive which is 80% filled (media files mostly). I attached profiler results per case below.
#time_measure
def method6():
    paths = [
        "/home/recordings",
        "/home/recognition",
        "/home/recognition/marked_frames",
    ]
    files = []
    for path in paths:
        files.extend((
            os.path.join(dirname, filename)
            for dirname, dirnames, filenames in os.walk(path)
            for filename in filenames
            if (filename.endswith(".mp4") or filename.endswith(".jpg")) and not os.path.islink(os.path.join(dirname, filename))
        ))
    oldest_files = sorted(
        files,
        key=lambda fn: os.stat(fn).st_mtime,
    )
    print(oldest_files[:5])
#time_measure
def method7():
    ext = [".mp4", ".jpg"]
    paths = [
        "/home/recordings/*/*/*",
        "/home/recognition/*",
        "/home/recognition/marked_frames/*",
    ]
    files = []
    for path in paths:
        files.extend((file for file in glob(path) if not os.path.islink(file) and (file.endswith(".mp4") or file.endswith(".jpg"))))
    oldest_files = sorted(files, key=lambda fn: os.stat(fn).st_mtime)
    print(oldest_files[:5])
The original implementation took ~100 s on the same data set.
EDIT2
Comparison of @norok2's proposals
I compared them with method6 and method7 from above. I tried several times with similar results.
Testing method7
['/home/recordings/35e68df5-44b1-5010-8d12-74b892c60136/2022-06-24/17-36-18.jpg', '/home/recordings/db33186d-3607-5055-85dd-7e5e3c46faba/2021-11-22/11-27-30.jpg', '/home/recordings/acce21a2-763d-56fe-980d-a85af1744b7a/2021-11-22/11-27-30.jpg', '/home/recordings/b97eb889-e050-5c82-8034-f52ae2d99c37/2021-11-22/11-28-23.jpg', '/home/recordings/01ae845c-b743-5b64-86f6-7f1db79b73ae/2021-11-22/11-28-23.jpg']
Took 24.73726773262024 s
_________________________
Testing find_oldest
['/home/recordings/35e68df5-44b1-5010-8d12-74b892c60136/2022-06-24/17-36-18.jpg', '/home/recordings/db33186d-3607-5055-85dd-7e5e3c46faba/2021-11-22/11-27-30.jpg', '/home/recordings/acce21a2-763d-56fe-980d-a85af1744b7a/2021-11-22/11-27-30.jpg', '/home/recordings/b97eb889-e050-5c82-8034-f52ae2d99c37/2021-11-22/11-28-23.jpg', '/home/recordings/01ae845c-b743-5b64-86f6-7f1db79b73ae/2021-11-22/11-28-23.jpg']
Took 34.355509757995605 s
_________________________
Testing find_oldest_cython
['/home/recordings/35e68df5-44b1-5010-8d12-74b892c60136/2022-06-24/17-36-18.jpg', '/home/recordings/db33186d-3607-5055-85dd-7e5e3c46faba/2021-11-22/11-27-30.jpg', '/home/recordings/acce21a2-763d-56fe-980d-a85af1744b7a/2021-11-22/11-27-30.jpg', '/home/recordings/b97eb889-e050-5c82-8034-f52ae2d99c37/2021-11-22/11-28-23.jpg', '/home/recordings/01ae845c-b743-5b64-86f6-7f1db79b73ae/2021-11-22/11-28-23.jpg']
Took 25.81963086128235 s
[Chart: timing comparison of method7 (glob()), iglob(), and the Cython version]
You could get an extra few percent speed-up on top of your method7() with the following:
import os
import glob
def find_oldest(paths=("*",), exts=(".mp4", ".jpg"), k=5):
    result = [
        filename
        for path in paths
        for filename in glob.iglob(path)
        if any(filename.endswith(ext) for ext in exts) and not os.path.islink(filename)]
    mtime_idxs = sorted(
        (os.stat(fn).st_mtime, i)
        for i, fn in enumerate(result))
    return [result[mtime_idxs[i][1]] for i in range(k)]
The main improvements are:
use iglob instead of glob -- while it may be of comparable speed, it takes significantly less memory which may help on low end machines
str.endswith() is done before the allegedly more expensive os.path.islink(), which helps reduce the number of such calls thanks to short-circuiting (a related tweak is sketched after this list)
an intermediate list with all the mtimes is produced to minimize the os.stat() calls
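The related tweak, for what it's worth: str.endswith() also accepts a tuple of suffixes, which lets you drop the any() entirely. A sketch of that variant, under the same assumptions and imports as find_oldest above:
def find_oldest_tuple_ext(paths=("*",), exts=(".mp4", ".jpg"), k=5):
    # str.endswith accepts a tuple of suffixes, so no any() is needed
    result = [
        filename
        for path in paths
        for filename in glob.iglob(path)
        if filename.endswith(exts) and not os.path.islink(filename)]
    mtime_idxs = sorted((os.stat(fn).st_mtime, i) for i, fn in enumerate(result))
    return [result[mtime_idxs[i][1]] for i in range(k)]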
This can be sped up even further with Cython:
%%cython --cplus -c-O3 -c-march=native -a
import os
import glob
cpdef find_oldest_cy(paths=("*",), exts=(".mp4", ".jpg"), k=5):
    result = []
    for path in paths:
        for filename in glob.iglob(path):
            good_ext = False
            for ext in exts:
                if filename.endswith(ext):
                    good_ext = True
                    break
            if good_ext and not os.path.islink(filename):
                result.append(filename)
    mtime_idxs = []
    for i, fn in enumerate(result):
        mtime_idxs.append((os.stat(fn).st_mtime, i))
    mtime_idxs.sort()
    return [result[mtime_idxs[i][1]] for i in range(k)]
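(Note that the %%cython cell magic assumes an IPython/Jupyter session with the Cython extension loaded first, e.g.:)
%load_ext Cython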
My tests on the following files:
def gen_files(n, exts=("mp4", "jpg", "txt"), filename="somefile", content="content"):
    for i in range(n):
        ext = exts[i % len(exts)]
        with open(f"{filename}{i}.{ext}", "w") as f:
            f.write(content)

gen_files(10_000)
produces the following:
funcs = find_oldest_OP, find_oldest, find_oldest_cy
timings = []
base = funcs[0]()
for func in funcs:
    res = func()
    is_good = base == res
    timed = %timeit -r 8 -n 4 -q -o func()
    timing = timed.best * 1e3
    timings.append(timing if is_good else None)
    print(f"{func.__name__:>24}  {is_good}  {timing:10.3f} ms")
# find_oldest_OP  True  81.074 ms
# find_oldest  True  70.994 ms
# find_oldest_cy  True  64.335 ms
find_oldest_OP is the following, based on method7() from OP:
def find_oldest_OP(paths=("*",), exts=(".mp4", ".jpg"), k=5):
    files = []
    for path in paths:
        files.extend(
            (file for file in glob.glob(path)
             if not os.path.islink(file) and any(file.endswith(ext) for ext in exts)))
    oldest_files = sorted(files, key=lambda fn: os.stat(fn).st_mtime)
    return oldest_files[:k]
The Cython version seems to point to a ~25% reduction in execution time.
You could use the subprocess module to list all the mp4 files directly, without having to loop through all the files in the directory.
import subprocess as sb
import os

# Note: "dir /b /s" is a Windows shell command; on Linux you would use something like "find /home -name '*.mp4'" instead
files = sb.getoutput(r"dir /b /s .\home\*.mp4").split("\n")
oldest_files = sorted(files, key=lambda fn: os.stat(fn).st_mtime)[:len(camera_devices)]
A quick optimization would be not to bother checking the file creation time and to trust the filename instead.
total, used, free = shutil.disk_usage("/home")
used_percent = int(used / total * 100)
if used_percent > 80:
    logging.info("SSD usage %s. Looking for the oldest files", used_percent)
    try:
        files = []
        for dirname, dirnames, filenames in os.walk('/home/recordings'):
            for filename in filenames:
                files.append((
                    name := os.path.join(dirname, filename),
                    datetime.strptime(
                        re.search(r'\d{4}-\d{2}-\d{2}\/\d{2}-\d{2}', name)[0],
                        "%Y-%m-%d/%H-%M"
                    )
                ))
        # list.sort() returns None, so use sorted() and keep only the paths
        oldest_files = [name for name, _ in sorted(files, key=lambda e: e[1])[:len(camera_devices)]]
        logging.info("Removing %s", oldest_files)
        for oldest_file in oldest_files:
            os.remove(oldest_file)
            # logging.info("%s removed", oldest_file)
        logging.info("Removed")
    except ValueError as e:
        # no files to delete
        pass

A Pythonic way to delete older logfiles

I'm just cleaning up log files once there are more than 50 of them (deleting the oldest first). This is the only thing I've been able to come up with, and I feel like there is a better way to do it. I'm also currently getting a pylint warning for the lambda assigned to get_time.
def clean_logs():
    log_path = "Runtime/logs/"
    max_log_files = 50

    def sorted_log_list(path):
        get_time = lambda f: os.stat(os.path.join(path, f)).st_mtime
        return list(sorted(os.listdir(path), key=get_time))

    del_list = sorted_log_list(log_path)[0:(len(sorted_log_list(log_path)) - max_log_files)]
    for x in del_list:
        pathlib.Path(pathlib.Path(log_path).resolve() / x).unlink(missing_ok=True)

clean_logs()
The two simplified solutions below accomplish different tasks, so both are included for flexibility. Obviously, you can wrap either in a function if you like.
Both code examples break down into the following steps:
Set the date delta (as an epoch reference) for mtime comparison as N days prior to today.
Collect the full path to all files matching a given extension.
Create a generator (or list) to hold the files to be deleted, using mtime as a reference.
Iterate the results and delete all applicable files.
Removing log files older than (n) days:
import os
from datetime import datetime as dt
from glob import glob
# Setup
path = '/tmp/logs/'
days = 5
ndays = dt.now().timestamp() - days * 86400
# Collect all files.
files = glob(os.path.join(path, '*.sql.gz'))
# Choose files to be deleted.
to_delete = (f for f in files if os.stat(f).st_mtime < ndays)
# Delete files older than (n) days.
for f in to_delete:
    os.remove(f)
Keeping the (n) latest log files
To keep the (n) latest log files, simply replace the to_delete definition above with:
n = 50
to_delete = sorted(files, key=lambda x: os.stat(x).st_mtime)[:len(files)-n]
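As for the pylint warning about the lambda in the original question: one way to avoid it is to pass an existing function as the sort key instead of a lambda, for example os.path.getmtime. A small sketch, assuming files already holds full paths as above:
import os

n = 50
to_delete = sorted(files, key=os.path.getmtime)[:len(files) - n]
for f in to_delete:
    os.remove(f)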

sort images based on a cluster correspondances list

I have the following working code to sort images according to a cluster list which is a list of tuples: (image_id, cluster_id).
One image can only be in one and only one cluster (there is never the same image in two clusters for example).
I wonder if there is a way to shorten the "for+for+if+if" loops at the end of the code, as currently, for each file name, I must check every pair in the cluster list, which makes it a little redundant.
import os
import re
import shutil

srcdir = '/home/username/pictures/'
if not os.path.isdir(srcdir):
    print("Error, %s is not a valid directory!" % srcdir)
    return None

pts_cls  # is the list of pairs (image_id, cluster_id)

filelist = [(srcdir+fn) for fn in os.listdir(srcdir) if
            re.search(r'\.jpg$', fn, re.IGNORECASE)]
filelist.sort(key=lambda var: [int(x) if x.isdigit() else
                               x for x in re.findall(r'[^0-9]|[0-9]+', var)])

for f in filelist:
    fbname = os.path.splitext(os.path.basename(f))[0]
    for e, cls in enumerate(pts_cls):  # for each (img_id, clst_id) pair
        if str(cls[0]) == fbname:  # check if image_id corresponds to the file basename on disk
            if cls[1] == -1:  # if cluster_id is -1 (-> noise)
                outdir = srcdir+'cluster_'+'Noise'+'/'
            else:
                outdir = srcdir+'cluster_'+str(cls[1])+'/'
            if not os.path.isdir(outdir):
                os.makedirs(outdir)
            dstf = outdir+os.path.basename(f)
            if os.path.isfile(dstf) == False:
                shutil.copy2(f, dstf)
Of course, as I am pretty new to Python, any other well explained improvements are welcome!
I think you're complicating this far more than needed. Since your image names are unique (there can only be one image_id), you can safely convert pts_cls into a dict and have fast lookups on the spot instead of looping through the list of pairs each and every time. You are also using regex where it's not needed, and you're packing your paths only to unpack them later.
Also, your code would break if an image from your source directory is not in pts_cls, as its outdir would never be set (or worse, its outdir would be the one left over from the previous loop iteration).
I'd streamline it like:
import os
import shutil

src_dir = "/home/username/pictures/"
if not os.path.isdir(src_dir):
    print("Error, %s is not a valid directory!" % src_dir)
    exit(1)  # return is expected only from functions

pts_cls = []  # the list of pairs (image_id, cluster_id), load from wherever...

# convert your pts_cls into a dict - since there cannot be any images in multiple clusters
# the base image name is perfectly ok to use as a key for blazingly fast lookups later
cluster_map = dict(pts_cls)

# get only `.jpg` files; store base name and file name, no need for a full path at this time
files = [(fn[:-4], fn) for fn in os.listdir(src_dir) if fn.lower()[-4:] == ".jpg"]

# no need for sorting based on your code
for name, file_name in files:  # loop through all files
    if name in cluster_map:  # proceed with the file only if in pts_cls
        cls = cluster_map[name]  # get our cluster value
        # get our `cluster_<cluster_id>` or `cluster_Noise` (if cluster == -1) target path
        target_dir = os.path.join(src_dir, "cluster_" + str(cls if cls != -1 else "Noise"))
        target_file = os.path.join(target_dir, file_name)  # get the final target path
        if not os.path.exists(target_file):  # if the target file doesn't exist
            if not os.path.isdir(target_dir):  # make sure our target path exists
                os.makedirs(target_dir, exist_ok=True)  # create the full path if it doesn't
            shutil.copy(os.path.join(src_dir, file_name), target_file)  # copy
UPDATE - If you have multiple 'special' folders for certain cluster IDs (like Noise is for -1) you can create a map like cluster_targets = {-1: "Noise"} where the keys are your cluster IDs and their values are, obviously, the special names. Then you can replace the target_dir generation with: target_dir = os.path.join(src_dir, "cluster_" + str(cluster_targets.get(cls,cls)))
UPDATE #2 - Since your image_id values appear to be integers while filenames are strings, I'd suggest just building your cluster_map dict by converting your image_id parts to strings. That way you'd be comparing like with like, without the danger of a type mismatch:
cluster_map = {str(k): v for k, v in pts_cls}
If you're sure that none of the *.jpg files in your src_dir will have a non-integer in their name you can instead convert the filename into an integer to begin with in the files list generation - just replace fn[:-4] with int(fn[:-4]). But I wouldn't advise that as, again, you never know how your files might be named.

Python: Continuously check size of files being added to list, stop at size, zip list, continue

I am trying to loop through a directory, check the size of each file, and add the files to a list until they reach a certain size (2040 MB). At that point, I want to put the list into a zip archive, and then continue looping through the next set of files in the directory and continue to do the same thing. The other constraint is that files with the same name but different extension need to be added together into the zip, and can't be separated. I hope that makes sense.
The issue I am having is that my code basically ignores the size constraint that I have added, and just zips up all the files in the directory anyway.
I suspect there is some logic issue, but I am failing to see it. Any help would be appreciated. Here is my code:
import os, os.path, zipfile
from time import *

#### Function to create zip file ####
# Add the files from the list to the zip archive
def zipFunction(zipList):
    # Specify zip archive output location and file name
    zipName = "D:\Documents\ziptest1.zip"
    # Create the zip file object
    zipA = zipfile.ZipFile(zipName, "w", allowZip64=True)
    # Go through the list and add files to the zip archive
    for w in zipList:
        # Create the arcname parameter for the .write method. Otherwise the zip file
        # mirrors the directory structure within the zip archive (annoying).
        arcname = w[len(root)+1:]
        # Write the files to a zip
        zipA.write(w, arcname, zipfile.ZIP_DEFLATED)
    # Close the zip process
    zipA.close()
    return

#################################################
#################################################

sTime = clock()
# Set the size counter
totalSize = 0
# Create an empty list for adding files to count MB and make zip file
zipList = []
tifList = []
xmlList = []
# Specify the directory to look at
searchDirectory = "Y:\test"
# Create a counter to check number of files
count = 0

# Set the root, directory, and file name
for root, direc, f in os.walk(searchDirectory):
    # Go through the files in directory
    for name in f:
        # Set the os.path file root and name
        full = os.path.join(root, name)
        # Split the file name from the file extension
        n, ext = os.path.splitext(name)
        # Get size of each file in directory, size is obtained in BYTES
        fileSize = os.path.getsize(full)
        # Add up the total sizes for all the files in the directory
        totalSize += fileSize
        # Convert from bytes to megabytes
        # 1 kilobyte = 1,024 bytes
        # 1 megabyte = 1,048,576 bytes
        # 1 gigabyte = 1,073,741,824 bytes
        megabytes = float(totalSize)/float(1048576)
        if ext == ".tif":  # should be everything that is not equal to XML (could be TIF, PDF, etc.) need to fix this later
            tifList.append(n)  #, fileSize/1048576])
            tifSorted = sorted(tifList)
        elif ext == ".xml":
            xmlList.append(n)  #, fileSize/1048576])
            xmlSorted = sorted(xmlList)
        if full.endswith(".xml") or full.endswith(".tif"):
            zipList.append(full)
            count += 1
        if megabytes == 2040 and len(tifList) == len(xmlList):
            zipFunction(zipList)
        else:
            continue

eTime = clock()
elapsedTime = eTime - sTime
print "Run time is %s seconds" % (elapsedTime)
The only thing I can think of is that there is never an instance where my variable megabytes==2040 exactly. I can't figure out how to make the code stop at that point otherwise though; I wonder if using a range would work? I also tried:
if megabytes < 2040:
    zipList.append(full)
    continue
elif megabytes == 2040:
    zipFunction(zipList)
Your main problem is that you need to reset your file size tally when you archive the current list of files. Eg
if megabytes >= 2040:
    zipFunction(zipList)
    totalSize = 0
BTW, you don't need
else:
    continue
there, since it's the end of the loop.
As for the constraint that you need to keep files together that have the same main file name but different extensions, the only fool-proof way to do that is to sort the file names before processing them.
If you want to guarantee that the total file size in each archive is under the limit you need to test the size before you add the file(s) to the list. Eg,
if (totalSize + fileSize) // 1048576 > 2040:
    zipFunction(zipList)
    totalSize = 0
totalSize += fileSize
That logic will need to be modified slightly to handle keeping a group of files together: you'll need to add the filesizes of each file in the group together into a sub-total, and then see if adding that sub-total to totalSize takes it over the limit.
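A rough sketch of that group-aware check, under stated assumptions: the helper names are hypothetical, files are grouped purely by their base name (same name, different extension), and each archive gets a numbered name derived from a prefix you supply.
import os
import zipfile
from itertools import groupby

def write_zip(paths, zip_name):
    # Write one batch of files into a single archive, flattening the directory structure
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED, allowZip64=True) as zf:
        for p in paths:
            zf.write(p, os.path.basename(p))

def zip_in_groups(file_paths, zip_prefix, limit_bytes=2040 * 1048576):
    # Sort so files sharing a base name sit next to each other, then group on that base name
    file_paths = sorted(file_paths, key=lambda p: os.path.splitext(p)[0])
    batch, batch_size, archive_index = [], 0, 0
    for base, group in groupby(file_paths, key=lambda p: os.path.splitext(p)[0]):
        group = list(group)
        group_size = sum(os.path.getsize(p) for p in group)
        # If adding this whole group would push the batch over the limit, archive the batch first
        if batch and batch_size + group_size > limit_bytes:
            write_zip(batch, "%s_%d.zip" % (zip_prefix, archive_index))
            batch, batch_size = [], 0
            archive_index += 1
        batch.extend(group)
        batch_size += group_size
    if batch:
        write_zip(batch, "%s_%d.zip" % (zip_prefix, archive_index))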
