Forcing os.walk to stop if taking too long - python

I want to find all files in a directory tree with a given file extension. However, some folders are really large and I therefore want to stop this process if it takes too long (say, more than 1 second). My current code looks something like this:
import os
import time

# Collect the relative paths of all files with extension `file_ext` under
# `path`, abandoning the walk once a 1-second time budget is exhausted.
# NOTE(review): the budget is only checked between files/directories, so a
# single huge directory can still overshoot it — this is exactly the
# limitation the question describes.
start_time = time.time()
file_ext = '.txt'
path = 'C:/'
file_list = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(file_ext):
            relDir = os.path.relpath(root, path)
            relFile = os.path.join(relDir, file)
            file_list.append(relFile)
        # check the budget after every file, not only per directory
        if time.time() - start_time > 1:
            break
    if time.time() - start_time > 1:
        break
The problem with this code is that when I get to a really large subfolder, this code does not break until that folder has been completely traversed. If that folder contains many files, it might take much longer time than I would like. Is there any way I can make sure that the code does not run for much longer than the allotted time?
Edit: Note that while it is certainly helpful to find ways to speed up the code (for instance by using os.scandir), this question deals primarily with how to kill a process that is running.

You can do the walk in a subprocess and kill that. Options include multiprocessing.Process but the multiprocessing libs on Windows may need to do a fair amount of work that you don't need. Instead, you can just pipe the walker code into a python subprocess and go from there.
import os
import sys
import threading
import subprocess as subp

# Script executed in a child process: walk TESTPATH and print each matching
# relative path, one per line.  Running it in a separate process lets the
# parent kill the traversal the moment the time budget expires.
walker_script = """
import os
import sys
path = os.environ['TESTPATH']
file_ext = os.environ['TESTFILEEXT']
# let parent know we are going
print('started')
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(file_ext):
            relDir = os.path.relpath(root, path)
            relFile = os.path.join(relDir, file)
            print(relFile)
"""

file_ext = '.txt'
path = 'C:/'
encoding = sys.getdefaultencoding()

# subprocess reads directories... the -E/-s/-S flags cut Python start-up
# cost.  On a POSIX system, forking would be another good option.
# Parameters are passed through the environment rather than argv.
env = {'TESTPATH': path, 'TESTFILEEXT': file_ext}
env.update(os.environ)
proc = subp.Popen([sys.executable, '-E', '-s', '-S', '-'],
                  stdin=subp.PIPE,
                  stdout=subp.PIPE,  # add stderr=subp.DEVNULL to silence errors
                  env=env)
# feed the walker script to the child via stdin
proc.stdin.write(walker_script.encode('utf-8'))
proc.stdin.close()
# wait for the child's start marker so the timer doesn't count start-up time
next(proc.stdout)
# timer kills the directory traversal when the 1-second budget runs out
threading.Timer(1, proc.kill).start()
file_list = [line.decode(encoding).strip() for line in proc.stdout]
print(file_list)

Related

How do i iterate through a directory, and print out the file names and their sizes and make the printout clean looking?

The code I am currently working on enters a directory. Once in that directory I need to iterate through the files in that directory and print the file names and extensions, along with the file size.
# Enter the pets/cats directory and print "name.ext: size" for each entry.
os.chdir(Path('pets', 'cats'))
current = Path.cwd()
for file in os.listdir(current):
    # os.listdir already yields bare file names; the original passed each
    # name through os.path.split(), which returns a ('', 'name.ext') tuple —
    # that tuple repr is where the extra quotes and parentheses came from.
    fileSize = os.path.getsize(file)
    print(file + ': ' + str(fileSize))
The issue I am having is that the printout includes ('' '<filename.ext>'). I want to omit all these extra characters and just have <filename.ext>. Any clues on how I can clean this up?
It looks like you're using the pathlib module, so you can write:
import os
from pathlib import Path

# Report each regular file in dogs/cats together with its size in bytes.
path = Path('dogs', 'cats')
for entry in path.iterdir():
    if entry.is_file():
        stat_result = os.stat(entry)
        print(f'{entry.name}: {stat_result.st_size}')
This would yield output like:
foo.txt: 32
bar.txt: 64
You could use this piece of code for the task:
import glob
import os

# Recursively print the size (in KiB) of every entry under folPath.
folPath = r'dir_address'
search_pattern = "{0}\**\*".format(folPath)
for fPath in glob.glob(search_pattern, recursive=True):
    # stat size is in bytes; divide to report kilobytes
    fSize = os.stat(fPath).st_size / 1024
    print('size of {0} is {1}'.format(fPath, fSize))
I hope this helps you.

Python 3 File Indexing Parallel

I have a question about python 3 file indexing.
I have some code that lists every file on my computer into a file named test.txt.
All I want to do is to parallelize this code for faster processing.
import os
import time

# Index every file reachable from rootDir into test.txt, one absolute path
# per line, and report how long the walk took.
start = time.time()
rootDir = '/'
# Open in text mode with an explicit encoding and let `with` close the file.
# The original wrote "%s" % path.encode('utf-8'), i.e. str(bytes), which
# produced lines that literally read b'/some/path' in the output file.
with open("test.txt", "w", encoding="utf-8") as handle:
    for dirName, subdirList, fileList in os.walk(rootDir):
        for fname in fileList:
            handle.write("%s\n" % os.path.abspath(os.path.join(dirName, fname)))
end = time.time()
print(end - start)
I hope you all can help me.
Thanks a lot and have a nice day.

Start script when new element on directory

I want to convert every new wav file arriving in an input dir to mp3 in another dir. I've been looking at how to convert those files, but I don't know how to add a listener to the input dir, or whether that is even possible.
Edit:
Sorry, I forgot to share the code I already have. I use ffmpeg to convert the audio files.
import os, sys, glob
import subprocess

FFMPEG_PATH = "C:\\ffmpeg\\bin"

# Convert every wav/*.wav to mp3/<name>.mp3 with ffmpeg, skipping files
# that already have a converted counterpart.
fileName = ""
fileExt = ""
wavdir = ""
mp3dir = ""
for file in glob.glob('wav/*.wav'):
    # get the name without .ext
    fileName = os.path.basename(file)
    fileName = fileName.split(".")[0]
    # verify that no mp3 file with the same name exists
    if not os.path.isfile('./mp3/' + fileName + ".mp3"):
        # set vars with the two file locations
        wavdir = file
        mp3dir = "mp3/" + fileName + ".mp3"
        # start the conversion with ffmpeg.  An argument list (not an
        # os.system shell string) means paths containing spaces or shell
        # metacharacters cannot break or hijack the command.
        subprocess.call(["ffmpeg", "-i", wavdir, mp3dir])
I'm not sure if this is the best idea, however you can assign a process or a thread to check if a file was added to the directory every X seconds:
import os
import time

# Poll the directory every `x` seconds and convert any file that has
# appeared since the previous scan.
wav_files_path = "/WAV_dir_path"
prev_files = os.listdir(wav_files_path)
x = 1  # seconds to sleep between scans
while True:
    files = os.listdir(wav_files_path)
    # if files are never deleted, a larger count means something was added
    if len(files) > len(prev_files):
        # NEW FILE(S) ADDED
        added = [f for f in files if f not in prev_files]
        for f in added:
            convert_file(f)
    prev_files = files
    time.sleep(x)
This is highly suboptimal, but should do the job
import os, time

SLEEPTIME = 0.5
TARGET_DIRECTORY = 'path_of_your_folder'

# Naive polling loop: every SLEEPTIME seconds, re-list the directory and
# hand each .wav file to the converter (CONVERT is a placeholder).
while True:
    time.sleep(SLEEPTIME)
    for file in os.listdir(TARGET_DIRECTORY):
        if file.endswith('.wav'):
            CONVERT
Make a while True loop. Inside it, iterate with for item in os.listdir(yourdir), move every item, and then call time.sleep(1) to reduce CPU load.

Need to process all files in a directory, but am only getting one

I have a Python script that is successfully processing single files. I am trying to write a for loop to have it get all the files in a directory that the user inputs. However, it is only processing one of the files in the directory.
The code below is followed by the rest of the script that does the analysis on data. Do I need to somehow close the for loop?
# -- Python 2 snippet (print statement, raw_input) --
# Reads every file in a user-supplied directory, skips a 10-line header,
# and appends one tab-separated summary line per input file to `output`.
# NOTE(review): `output` and the values Voc, Isc, Vmp, Imp, Pmax, Eff,
# IscErr and P2 are defined in the unshown ("abridged") part of the script.
import os
print "Enter the location of the files: "; directory = raw_input()
path = r"%s" % directory
for file in os.listdir(path):
    current_file = os.path.join(path, file)
    data = open(current_file, "rb")
    # Here's an abridged version of the data analysis
    # skip the first 10 lines; `fluff` ends up holding line 10
    for i in range(0, 10):
        fluff = data.readline()
    Input_Parameters = fluff.split("\t")
    output.write("%f\t%f\t%f\t%f\t%.3f\t%.1f\t%.2f\t%.2f\t%s\n" % (Voc, Isc, Vmp, Imp, Pmax, 100 * Eff, IscErr, 100 * (1 - (P2 / Pmax)), file))
    data.close()
In general, if something does not work, you can try to get something simpler working. I removed the data analysis part and kept the code below. This works with me. I noticed that if I have a directory in my path, the open will fail. I am not sure this is the case with you as well.
import os
import sys
path = '.'
for file in os.listdir(path):
current = os.path.join(path, file)
if os.path.isfile(current):
data = open(current, "rb")
print len(data.read())
The current code in your answer looks basically OK to me. I noticed that it's writing to the same output file for each of the files it processes, so that may lead you to think it's not processing them all.
for you debugging:
for file in os.listdir(path):
current_file = os.path.join(path, file)
print current_file
check the output of current_file
BTW: did you indent your code with tabs?
There are inconsistent indent lengths in your code.
This is bad style.

using subprocess over different files python

I've got a problem with a short script, it'd be great if you could have a look!
import os
import subprocess

root = "/Users/software/fmtomov1.0/remaker_lastplot/source_relocation/observed_arrivals_loc3d"

def loop_loc3d(file_in):
    """Run loc3d on one source file and return its exit status.

    `file_in` is the path of the file to process.  The original passed an
    open file object here but never used it, so loc3d always ran with no
    input; passing the path on the command line lets loc3d open it itself.
    """
    return subprocess.call(['loc3d', file_in])

def relocation():
    """Run loc3d over every file in the tree rooted at `root`.

    The original had `return loop_loc3d(...)` inside the loop, so only the
    first file of the first directory was ever processed.
    """
    for subdir, dirs, files in os.walk(root):
        for file in files:
            loop_loc3d(os.path.join(subdir, file))
I think the script is quite easy to understand; it's very simple. However, I'm not getting the desired result. In a few words, I just want 'loc3d' to operate over all the file contents present in the 'observed_arrivals_loc3d' directory, which means that I need to open all the files, and that's what I've actually done. In fact, if I try to 'print files' after:
for subdir, dirs, files in os.walk(root)
I'll get the name of every file. Furthermore, if I try a 'print file_in' after
file_in = open(os.path.join(subdir, file), 'r')
I get something like this line for every file:
<open file '/Users/software/fmtomov1.0/remaker_lastplot/source_relocation/observed_arrivals_loc3d/EVENT2580', mode 'r' at 0x78fe38>
subprocess has been tested alone on only one file and it's working.
Overall I'm getting no errors, just -11, which means absolutely nothing to me. The output from loc3d should be completely different.
So does the code look fine to you? Is there anything I'm missing? Any suggestion?
Thanks for your help!
I assume you would call loc3d filename from the CLI. If so, then:
def loop_loc3d(filename):
    """Invoke ``loc3d <filename>`` and return its exit status."""
    return subprocess.call(['loc3d', filename])

def relocation():
    """Walk ``root`` and run loc3d, returning after the first file found.

    (The ``return`` inside the loop stops the walk after one file — noted
    in the follow-up answer below.)
    """
    for subdir, dirs, files in os.walk(root):
        for file in files:
            return loop_loc3d(os.path.join(subdir, file))
In other words, don't open the file yourself, let loc3d do it.
Currently your relocation method will return after the first iteration (for the first file). You shouldn't need to return at all.
def loop_loc3d(filename):
    """Run ``loc3d`` on *filename* and hand back its return code."""
    return subprocess.call(['loc3d', filename])

def relocation():
    """Apply loc3d to every file in the tree rooted at ``root``."""
    for subdir, dirs, files in os.walk(root):
        for name in files:
            loop_loc3d(os.path.join(subdir, name))
This is only one of the issues. The other is concerning loc3d itself. Try providing the full path for loc3d.
-11 exit code might mean that the command killed by signal Segmentation fault.
It is a bug in loc3d. A well-behaved program should not produce 'Segmentation fault' on any user input.
Feed loc3d only files that it can understand. Print filenames or use subprocess.check_call() to find out which file it doesn't like:
#!/usr/bin/env python
import fnmatch
import os
import subprocess
def loc3d_files(root):
    """Yield the paths under *root* that loc3d should understand.

    Hidden directories are pruned in place so os.walk never descends into
    them, and only names matching ``*some?pattern[0-9][0-9].[ch]`` are
    produced.
    """
    for dirpath, dirs, files in os.walk(root, topdown=True):
        # prune hidden directories (topdown=True makes the in-place
        # assignment effective)
        visible = [d for d in dirs if not d.startswith('.')]
        dirs[:] = visible
        # process only known files
        matching = fnmatch.filter(files, "*some?pattern[0-9][0-9].[ch]")
        for name in matching:
            yield os.path.join(dirpath, name)
# -- Python 2 driver --
# Echo each candidate path, then run loc3d on it.  check_call raises
# CalledProcessError on the first non-zero exit, which pinpoints exactly
# which input file loc3d chokes on.
for path in loc3d_files(root):
    print path
    subprocess.check_call(['loc3d', path]) # raise on any error
Just found out that loc3d, as unutbu said, relies on several variables — in this specific case one called 'observed_arrivals' — that I have to create and delete every time from my directory. In Pythonic terms this means:
import os
import shutil
import subprocess

def loop_loc3d(file_in):
    """Loops loc3d over the source files"""
    # NOTE(review): the argument is unused — loc3d apparently reads its
    # input from the fixed-name file 'observed_arrivals' in the current
    # working directory.
    return subprocess.call(["loc3d"], shell=True)

# directory where loc3d runs
path = "/Users/software/fmtomo/remaker_lastplot/source_relocation"
# staging directory holding the raw arrival files
path2 = "/Users/Programming/working_directory/2test"
new_file_name = 'observed_arrivals'

def define_object_file ():
    # For each entry in the current directory: rename it to the name loc3d
    # expects, copy it next to loc3d, run loc3d there, then clean up both
    # copies and return to the staging directory.
    # NOTE(review): os.rename returns None and shutil.copy returns the
    # destination path, so the first `file_in` assignment is discarded;
    # also each iteration renames another entry onto the same target name.
    for filename in os.listdir("."):
        file_in = os.rename (filename, new_file_name) # get the observal_arrivals file
        file_in = shutil.copy ("/Users/simone/Programming/working_directory/2test/observed_arrivals", "/Users/software/fmtomo/remaker_lastplot/source_relocation")
        os.chdir(path) # goes where loc3d is
        loop_loc3d (file_in)
        os.remove("/Users/software/fmtomo/remaker_lastplot/source_relocation/observed_arrivals")
        os.remove ("/Users/Programming/working_directory/2test/observed_arrivals")
        os.chdir(path2)
Now, this is working very well, so it should answer my question. I guess it's quite easy to understand, it's just copying, changing dir and that kind of stuff.

Categories