Python Watchdog: process existing files on startup

I have a simple Watchdog and Queue process to monitor files in a directory.
Code taken from https://camcairns.github.io/python/2017/09/06/python_watchdog_jobs_queue.html
import time
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
from queue import Queue
from threading import Thread

dir_path = "/data"

def process_queue(q):
    while True:
        if not q.empty():
            event = q.get()
            print("New event %s" % event)
        time.sleep(5)

class FileWatchdog(PatternMatchingEventHandler):
    def __init__(self, queue, patterns):
        PatternMatchingEventHandler.__init__(self, patterns=patterns)
        self.queue = queue

    def process(self, event):
        self.queue.put(event)

    def on_created(self, event):
        self.process(event)

if __name__ == '__main__':
    watchdog_queue = Queue()

    worker = Thread(target=process_queue, args=(watchdog_queue,))
    worker.daemon = True
    worker.start()

    event_handler = FileWatchdog(watchdog_queue, patterns=["*.ini"])
    observer = Observer()
    observer.schedule(event_handler, path=dir_path)
    observer.start()

    try:
        while True:
            time.sleep(2)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
Once the process is running, new files are processed correctly.
However, if I restart the process, any file that already exists in the directory is ignored.
I have tried to create a dict to add to the queue:
for file in os.listdir(dir_path):
    if file.endswith(".ini"):
        file_path = os.path.join(dir_path, file)
        event = {'event_type': 'on_created', 'is_directory': 'False', 'src_path': file_path}
        watchdog_queue.put(event)
but it's expecting an object of type <class 'watchdog.events.FileCreatedEvent'> and I can't work out how to create one.
Alternatively, I can see watchdog.utils.dirsnapshot.DirectorySnapshot in the Watchdog documentation, but I cannot work out how to run it and add the result to the queue.
Any suggestions on how I can add existing files to the queue on startup?

This code should do what you are trying to achieve.
import os
from watchdog.events import FileCreatedEvent

# Loop over the existing files; dir_path is your lookup folder.
for file in os.listdir(dir_path):
    filename = os.path.join(dir_path, file)
    event = FileCreatedEvent(filename)
    watchdog_queue.put(event)
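If you would rather use the DirectorySnapshot approach mentioned in the question, a minimal sketch (assuming a non-recursive watch, with fnmatch standing in for the handler's "*.ini" pattern) could look like this:

from fnmatch import fnmatch
from watchdog.events import FileCreatedEvent
from watchdog.utils.dirsnapshot import DirectorySnapshot

# Take a one-off snapshot of the directory and enqueue a synthetic
# creation event for every existing path matching the pattern.
snapshot = DirectorySnapshot(dir_path, recursive=False)
for path in snapshot.paths:
    if fnmatch(path, "*.ini"):
        watchdog_queue.put(FileCreatedEvent(path))

The worker thread then sees pre-existing files exactly as if the observer had reported them.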

I stumbled over the same problem, and maybe this solution is for you too. At least on Linux it works like a charm.
Add an "on_modified" method:
class FileWatchdog(PatternMatchingEventHandler):
    def __init__(self, queue, patterns):
        PatternMatchingEventHandler.__init__(self, patterns=patterns)
        self.queue = queue

    ...

    def on_modified(self, event):
        self.process(event)
Now, after starting the observer, loop through all files in the directory and "touch" them, so that each one raises a "modified" event.
# Loop over all files; dir_path is your lookup folder.
for file in os.listdir(dir_path):
    filename = os.path.join(dir_path, file)
    os.popen(f'touch {filename}')
No need to add special filters, as your FileWatchdog handler will take care of that.
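If you would rather not shell out (os.popen with an unquoted path breaks on filenames containing spaces), a portable sketch of the same trick using only the standard library:

import os

# Update each file's access/modification time in-process; on Linux
# this raises the same "modified" event as the `touch` command.
for file in os.listdir(dir_path):
    filename = os.path.join(dir_path, file)
    os.utime(filename, None)  # None means "set to the current time"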


Using watchdog to put newly created file names into variables

I would like to use watchdog to find new files that are created in one folder. The file name will then be used in a different function. I am using this code here:
import sys
import time
import logging
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format=' %(message)s')
    path = sys.argv[1] if len(sys.argv) > 1 else '.'
    event_handler = LoggingEventHandler()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
This gives, for example, the following output in the console:
Created file: ./test_file.h5
Modified directory: .
Modified file: ./test_file.h5
Modified directory: .
What I would like is to get the name of a file only when it is newly created, and not printed to the console but rather returned in a variable, so I can use it as input for a different function. Is there a way to do this?
You need to create a custom handler; you can do that by inheriting from FileSystemEventHandler and overriding the event callbacks you want to use.
class CustomHandler(FileSystemEventHandler):
    def __init__(self, callback: Callable):
        self.callback = callback

    def on_created(self, event: Union[DirCreatedEvent, FileCreatedEvent]):
        print(f"Event type: {event.event_type}\nAt: {event.src_path}\n")

        if isinstance(event, FileCreatedEvent):
            file = pathlib.Path(event.src_path)
            print(f"Processing file {file.name}\n")
            self.callback(file)
Available events are:
on_modified(self, event)
on_deleted(self, event)
on_closed(self, event)
on_moved(self, event)
on_any_event(self, event)
The event type passed to each callback varies; for on_created it is safe to assume it is either a DirCreatedEvent or a FileCreatedEvent.
Example code
import time
import pathlib
from typing import Union
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler, DirCreatedEvent, FileCreatedEvent

class CustomHandler(FileSystemEventHandler):
    """Custom handler for Watchdog"""

    def __init__(self):
        # List to store paths of created files
        self.path_strings = []

    # Callback for File/Directory created events, called by the Observer.
    def on_created(self, event: Union[DirCreatedEvent, FileCreatedEvent]):
        print(f"Event type: {event.event_type}\nAt: {event.src_path}")

        # Check whether it's a file creation, not a directory creation.
        if isinstance(event, FileCreatedEvent):
            # If so, do something with event.src_path - the path of the created file.
            self.path_strings.append(event.src_path)
            print(f"Path content: \n{self.path_strings}")

def main():
    # Get the current path as an absolute, POSIX-style path.
    working_path = pathlib.Path(".").absolute().as_posix()

    # Create instances of the observer and CustomHandler.
    observer = Observer()
    handler = CustomHandler()

    # Start the observer; it checks files recursively.
    observer.schedule(handler, path=working_path, recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

if __name__ == '__main__':
    main()
Example output
Event type: created
At: E:/github/ProjectIncubator/single_run_scripts\test.a
Path content:
['E:/github/ProjectIncubator/single_run_scripts\\test.a']
Event type: created
At: E:/github/ProjectIncubator/single_run_scripts\nyan.txt
Path content:
['E:/github/ProjectIncubator/single_run_scripts\\test.a', 'E:/github/ProjectIncubator/single_run_scripts\\nyan.txt']
Old full example code:
import time
import pathlib
import argparse
from typing import Union, Callable
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler, DirCreatedEvent, FileCreatedEvent

class CustomHandler(FileSystemEventHandler):
    def __init__(self, callback: Callable):
        # Store the callback to be called on every on_created event.
        self.callback = callback

    def on_created(self, event: Union[DirCreatedEvent, FileCreatedEvent]):
        print(f"Event type: {event.event_type}\nAt: {event.src_path}\n")

        # Check whether it's a file creation, not a directory creation.
        if isinstance(event, FileCreatedEvent):
            file = pathlib.Path(event.src_path)
            print(f"Processing file {file.name}\n")
            # Call the callback.
            self.callback(file)

def main():
    path: pathlib.Path = args.dir

    # List for newly created files.
    created_files = []

    # Create the callback.
    def callback(path_: pathlib.Path):
        print(f"Adding {path_.name} to list!")
        created_files.append(path_)

    # Create instances of the observer and CustomHandler.
    observer = Observer()
    handler = CustomHandler(callback)
    observer.schedule(handler, path=path.absolute().as_posix(), recursive=True)
    observer.start()
    print("Observer started")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

    print(f"{len(created_files)} new files were created!", "\n".join(p.name for p in created_files), sep="\n")
    input("Press enter to exit!")

if __name__ == '__main__':
    # Get the script's root.
    ROOT = pathlib.Path(__file__).parent

    # Parse arguments - if provided, the given directory is used; otherwise the script's path is used.
    parser = argparse.ArgumentParser(description="Listen for file change in directory.")
    parser.add_argument("dir", metavar="DIR", type=pathlib.Path, default=ROOT, nargs="?", help="Directory to listen for. If omitted, script path is used.")
    args = parser.parse_args()

    main()
Example output (the line breaks are a mess, sorry!)
❯ py .\watchdog_test.py X:\test
Observer started
Event type: created
At: X:/test\새 폴더
Event type: created
At: X:/test\meow.txt
Processing file meow.txt
Adding meow.txt to list!
Event type: created
At: X:/test\meow.txt
Processing file meow.txt
Adding meow.txt to list!
Event type: created
At: X:/test\meow - 복사본.txt
Processing file meow - 복사본.txt
Adding meow - 복사본.txt to list!
Event type: created
At: X:/test\meow - 복사본 (2).txt
Processing file meow - 복사본 (2).txt
Adding meow - 복사본 (2).txt to list!
Event type: created
At: X:/test\meow - 복사본 (3).txt
Processing file meow - 복사본 (3).txt
Adding meow - 복사본 (3).txt to list!
5 new files were created!
meow.txt
meow.txt
meow - 복사본.txt
meow - 복사본 (2).txt
meow - 복사본 (3).txt
Press enter to exit!

How to properly close a file with a signal handler

I'm using the following code to monitor file access from a running job.
When the job is stopped, my code receives a SIGINT.
As this job is very intensive, there's buffered IO that I can't turn off, and I want a precise log.
So I tried to catch SIGINT and flush the file before shutting down my script, and I end up with:
RuntimeError: reentrant call inside <_io.BufferedWriter name=
As I understand from several articles I have read, it's impossible to reliably call write/print/flush inside a signal handler, as they are not reentrant-safe.
My question is: how can I ensure that my file is written properly before shutting down the script?
Here's a simpler version of my script:
import signal
import sys
import os
import time
from time import strftime
import inotify.adapters

separator = ';'
jump = '\n'

logfile_pointer = open("path/to/log/file", 'w')

# Try to close everything nicely.
def signal_handler(signal, frame):
    logfile_pointer.flush()
    logfile_pointer.close()
    sys.exit(0)

# Register the signal handler.
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGHUP, signal_handler)

eventHandler = inotify.adapters.InotifyTrees(["/folder/one", "/folder/two"])

for event in eventHandler.event_gen():
    if event is not None:
        (_, type_names, path, filename) = event
        try:
            timestamp = '%.2f' % (time.time())
            filepath = path + '/' + filename
            logfile_pointer.write("{}{}{}{}{}{}{}{}".format(timestamp, separator, filepath, separator, type_names[0], separator, os.path.getsize(filepath), jump))
        except os.error as e:
            pass
The typical approach here is to have the signal handler set a flag and return without exiting. The main loop checks the flag and, when it's set, cleans up and exits.
In this particular instance, this means you need to have the event producer yield regularly; with the inotify package you can do this by setting a short timeout. This would end up looking like:
[...]

exit_requested = False

def signal_handler(signal, frame):
    # Perhaps check which signal was received...
    global exit_requested
    exit_requested = True

[...]

for event in eventHandler.event_gen(timeout_s=1):
    if exit_requested:
        # Clean up and exit
        ...
    if event:
        ...
When event_gen returns None because it timed out, inotify events which occur before the next call to event_gen are queued rather than lost: inotify events are consumed only when they are read from the inotify file descriptor, and the event handler here keeps that descriptor open.
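For completeness, a minimal self-contained version of this pattern (a sketch, assuming the inotify package used in the question; the watched folders and the log path are placeholders):

import signal
import time
import inotify.adapters

exit_requested = False

def signal_handler(signum, frame):
    # Only set a flag here: no I/O, nothing that isn't reentrant-safe.
    global exit_requested
    exit_requested = True

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGHUP, signal_handler)

watcher = inotify.adapters.InotifyTrees(["/folder/one", "/folder/two"])

with open("/tmp/file_events.log", "w") as logfile:
    while not exit_requested:
        # event_gen(timeout_s=1) yields None regularly and terminates
        # after a quiet second, so the flag is always checked promptly.
        for event in watcher.event_gen(timeout_s=1):
            if exit_requested:
                break
            if event is not None:
                (_, type_names, path, filename) = event
                logfile.write("%.2f;%s/%s;%s\n" % (time.time(), path, filename, type_names[0]))
# Leaving the with-block flushes and closes the log safely.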
I had several issues to solve, one of them being how to stop my script cleanly, since Python's threading model takes some getting used to. Here's my solution.
First, define a thread that will be the inotify watcher:
import os
import sys
import time
import signal
import argparse
import inotify.adapters
from time import strftime
from threading import Thread
from argparse import RawTextHelpFormatter

class EventMonitor(Thread):
    separator = ';'
    jump = '\n'

    def __init__(self, folders, logfile):
        Thread.__init__(self)
        self.stop = False
        self.line_count = 0
        self.alive = True
        # Line-buffered log file; adjust the buffering to taste.
        self.logfile = open(logfile, 'w', buffering=1)
        self.eventHandler = inotify.adapters.InotifyTrees(folders)

    def run(self):
        while not self.stop:
            for event in self.eventHandler.event_gen(timeout_s=3):
                try:
                    if event is not None:
                        (_, type_names, path, filename) = event
                        timestamp = '%.2f' % (time.time())
                        filepath = path + '/' + filename
                        self.logfile.write("{}{}{}{}{}{}{}{}".format(timestamp, self.separator, filepath, self.separator, type_names[0], self.separator, os.path.getsize(filepath), self.jump))
                except os.error:
                    pass
        # One last, shorter pass to drain any remaining queued events.
        for event in self.eventHandler.event_gen(timeout_s=1):
            try:
                if event is not None:
                    (_, type_names, path, filename) = event
                    timestamp = '%.2f' % (time.time())
                    filepath = path + '/' + filename
                    self.logfile.write("{}{}{}{}{}{}{}{}".format(timestamp, self.separator, filepath, self.separator, type_names[0], self.separator, os.path.getsize(filepath), self.jump))
            except os.error:
                pass
        self.logfile.flush()
        self.logfile.close()
        self.alive = False

    def stopped(self):
        if not self.stop:
            self.stop = True
        else:
            print("Event monitoring is already disabled")

    def isAlive(self):
        return self.alive
Then in my main script :
import os
import sys
import time
import signal
import argparse
import traceback
from time import strftime
from CPUMonitor import CPUMonitor
from EventMonitor import EventMonitor
from argparse import RawTextHelpFormatter

# Define the arguments.
parser = argparse.ArgumentParser(description='attache spies on the folders given in argument and generates a CSV log file containing a list of events on files. The file is formatted like this:\ntimestamp;fullpath;event;size\n123456897.25;/path/file;IN_OPEN;0\n123456899.25;/path/file;IN_CLOSE;1234\n.....\nFor more info about inotify events => `man inotify`', formatter_class=RawTextHelpFormatter)
parser.add_argument("-l", "--log-folder", type=str, help="Destination folder for the logs. If no value, /tmp is used", default='/tmp')
parser.add_argument("-e", "--event", help="enable file event watch", action="store_true")
parser.add_argument('folders', metavar='folderpath', type=str, help='a list of folder paths to spy on; if -e is not set this will be ignored.', nargs='*', default=[os.getcwd()])
args = parser.parse_args()

# Try to close everything nicely.
def signal_handler(signal, frame):
    if CPU_thread is not None:
        CPU_thread.stopped()
    if Event_thread is not None:
        Event_thread.stopped()
    print('Kill signal received.{}CPU and event monitoring stopped.{}'.format(jump, jump))
    sys.exit(0)

# Register the signal handler.
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGHUP, signal_handler)

try:
    # Define variables.
    separator = ';'
    jump = '\n'
    logDest = ''
    go = True
    Event_logfile = None
    Event_logfile_debug = None
    Event_thread = None
    CPU_thread = None  # CPU monitoring is not started in this simplified version
    jobname = ''
    check_message = ''

    if not os.path.isdir(args.log_folder):
        go = False
        check_message = check_message + "/!\\ Log folder {} is not a directory. Monitoring won't start{}".format(args.log_folder, jump)
    elif not os.access(args.log_folder, os.W_OK | os.X_OK):
        go = False
        check_message = check_message + "/!\\ Log folder {} is not writable. Monitoring won't start{}".format(args.log_folder, jump)
    else:
        check_message = check_message + "Log folder is a proper directory and can be RW. {}".format(jump)

    if not go:
        print(check_message)
        sys.exit(-2)

    if go:
        event_logfile = args.log_folder + '/Event_' + os.environ['JOB_ID'] + '_' + strftime("%Y-%m-%d_%H:%M:%S") + '-log.txt'
        print('Event logfile: {}{}'.format(event_logfile, jump))
        print('Start monitoring of the events on: {} {}'.format(args.folders, jump))
        Event_thread = EventMonitor(args.folders, event_logfile)
        Event_thread.start()
    else:
        print("Error detected, monitoring hasn't started{}".format(jump))
        sys.exit(-4)

    while Event_thread is not None and Event_thread.isAlive():
        time.sleep(5)

    if Event_thread is not None:
        Event_thread.join()
except Exception as error:
    traceback.print_exc()
    print(str(error))
    sys.exit(-5)
In the thread, as long as it has not been stopped, it looks for events and writes them to the file.
When stopped() is called, the loop times out after 3 seconds without an event; then I run the event loop one last time with a shorter timeout of 1 second. Once all remaining events are processed, the thread stops and isAlive() returns False.
In the main program, when SIGINT or SIGHUP is received, it asks the thread to stop, and the Python script only exits once the thread has stopped properly.
This code works in both Python 2.7.15 and 3.6.7 and above; however, keep in mind that this is a simplified version of my code, so it might not work as-is and may need some adjustment.
PS: thanks to Stephen's answer, which helped me a lot.

python inotify to monitor for IN_CLOSE_WRITE and IN_MOVED_TO events

I am monitoring a directory for new files being moved into it or created.
Upon detecting a new file, I call another Python script to process it.
#!/usr/bin/python
import os
import signal
import sys
import logging
import inotify.adapters
import subprocess

_DEFAULT_LOG_FORMAT = ''
_LOGGER = logging.getLogger(__name__)

def _configure_logging():
    _LOGGER.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    formatter = logging.Formatter(_DEFAULT_LOG_FORMAT)
    ch.setFormatter(formatter)
    _LOGGER.addHandler(ch)

def exit_gracefully(signum, frame):
    signal.signal(signal.SIGINT, original_sigint)
    sys.exit(1)

def main():
    i = inotify.adapters.Inotify()
    i.add_watch(b'/home/sort/tmp')
    try:
        for event in i.event_gen():
            if event is not None:
                if 'IN_MOVED_TO' in event[1] or 'IN_CLOSE_WRITE' in event[1]:
                    (header, type_names, watch_path, filename) = event
                    _LOGGER.info("%s FILENAME=%s/%s",
                                 type_names,
                                 watch_path.decode('utf-8'),
                                 filename.decode('utf-8'))
                    fnp = watch_path.decode('utf-8') + "/" + filename.decode('utf-8')
                    print(fnp)
                    proc = subprocess.Popen([orgpath, fnp], stderr=subprocess.STDOUT, bufsize=1)
                    #proc.communicate()
    finally:
        i.remove_watch(b'/home/sort/tmp')

if __name__ == '__main__':
    _configure_logging()
    orgdir = os.path.dirname(os.path.realpath(sys.argv[0]))
    orgpath = os.path.join(orgdir, "organize.py")
    original_sigint = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, exit_gracefully)
    print("Watching /home/sort/tmp for new files")
    main()
The end goal is to process only one file at a time, because I call an API to scrape metadata, and too many calls to the API in a short period of time could get the key banned or temporarily blocked.
Right now, when I copy more than a single file into the monitored directory, the script gets called on every file at the same time.
Try putting a for loop around the code that runs the Python file:

for file in directory:
    ...code that runs the python file

If it is still running too fast, you can add a timer to throttle the API calls:

import time

for file in directory:
    ...code that runs the python file
    time.sleep(5)
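For what it's worth, the calls can also be serialized in the question's own script: instead of leaving proc.communicate() commented out, wait for each subprocess before handling the next event. A sketch of a hypothetical helper to call from the event loop (orgpath and fnp as in the question):

import subprocess
import sys
import time

def process_one(orgpath, fnp):
    # Run organize.py on a single file and block until it finishes,
    # so files are handled strictly one at a time.
    proc = subprocess.Popen([sys.executable, orgpath, fnp], stderr=subprocess.STDOUT)
    proc.wait()
    time.sleep(5)  # extra breathing room between API calls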

Detect the changes in a file and write them to a file

I wrote a script that detects changes to files located in a specific directory. I'm trying to write all these changes to a changes.txt file; for this purpose I'm using the sys.stdout = open('changes.txt','w') instruction.
The problem is that whenever I run the script and then change and save a file in the directory, an empty file called changes.txt is created. This file is never written to!
#!/usr/bin/python
import time
import sys
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

sys.stdout = open('changes.txt', 'w')

class MyHandler(FileSystemEventHandler):
    def on_modified(self, event):
        print "something happened!"

if __name__ == "__main__":
    event_handler = MyHandler()
    observer = Observer()
    observer.schedule(event_handler, path='.', recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
I'd recommend something like
#!/usr/bin/python
import time
import sys
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class MyHandler(FileSystemEventHandler):
    def __init__(self, f):
        self.f = f

    def on_modified(self, event):
        self.f.write("something happened!\n")
        self.f.flush()

if __name__ == "__main__":
    with open('changes.txt', 'w') as f:
        event_handler = MyHandler(f)
        observer = Observer()
        observer.schedule(event_handler, path='.', recursive=False)
        observer.start()
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
As you can see, control over where your output will be written has been handed to the caller (the one instantiating MyHandler) instead of the callee (on_modified).
This means you can also do
event_handler = MyHandler(sys.stdout)
and see the output on the console instead of having it written to the file.
An additional benefit: using a context manager, you can be sure the file is closed properly, even if errors occur.

Stop monitoring a file with watchdog

I have this small script that monitors a single file (test.txt) with watchdog.
So far I get a screen message each time the file is modified, but I only need the notification the first time, that is, I want to stop monitoring after the first event. Is there any way I can tell watchdog to stop?
Here is my code:
#!/usr/bin/python
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

file_to_scan = "test.txt"

class MyHandler(FileSystemEventHandler):
    def on_modified(self, event):
        if file_to_scan in event.src_path:
            print "Got it!", event.src_path
            #### I want to stop the monitoring here

    def on_created(self, event):
        pass

if __name__ == "__main__":
    event_handler = MyHandler()
    observer = Observer()
    observer.schedule(event_handler, path=".", recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
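One common approach (a sketch, not from the original thread): have the handler set a threading.Event on the first match, and let the main loop shut the observer down. Stopping and joining the observer from inside the handler can deadlock, because the handler runs on the observer's own thread.

import threading
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

file_to_scan = "test.txt"
done = threading.Event()  # set by the handler on the first hit

class MyHandler(FileSystemEventHandler):
    def on_modified(self, event):
        if file_to_scan in event.src_path and not done.is_set():
            print("Got it!", event.src_path)
            done.set()  # tell the main loop to stop the observer

if __name__ == "__main__":
    observer = Observer()
    observer.schedule(MyHandler(), path=".", recursive=False)
    observer.start()
    try:
        # done.wait() wakes once a second, so Ctrl-C is still handled.
        while not done.wait(timeout=1):
            pass
    except KeyboardInterrupt:
        pass
    observer.stop()
    observer.join()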
