How to implement a watchdog in native Python?

I have a tool which generates some reports as HTML files. Since there are many of them, organizing them manually would take a lot of time, which is why I tried making a script that organizes the files automatically according to some rules I have applied.
import os
import re
from endwith import EndsWith  # custom helper class, shown below

filefullname = EndsWith('.html')
allfiles = filefullname.findfile()
report_path = "/home/user/reports/"

while True:
    files = os.listdir("/home/user/")
    if not allfiles:
        continue
    else:
        # folder name = part of the file name before the first underscore
        header = re.match(r"^[^_]+(?=_)", allfiles[0])
        if not os.path.exists(report_path + str(header.group())):
            os.system(f"mkdir {report_path + str(header.group())}")
            os.system(f"mv /home/user/*.html reports/{str(header.group())}")
        else:
            os.system(f"mv /home/user/*.html reports/{str(header.group())}")
This is the main file which does the automation. The class is a custom EndsWith class, because the native str.endswith() returns only a boolean. It runs, but the problem is that it requires a restart to finish the job.
Any suggestions?
P.S. This is the class code:
import os

class EndsWith:
    def __init__(self, extension):
        self.extension = extension

    def findfile(self):
        files = os.listdir("/home/user/")
        file_list = []
        for file in files:
            #print(file)
            if self.extension in file:
                file_list.append(file)
        return file_list
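Since the title asks about watchdog: below is a minimal event-driven sketch using the third-party watchdog package instead of a polling loop. It assumes the same /home/user/ paths and the same before-the-first-underscore rule as the script above; the ReportHandler name is purely illustrative.

import os
import re
import shutil
import time

from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

WATCH_DIR = "/home/user/"
REPORT_DIR = "/home/user/reports/"

class ReportHandler(PatternMatchingEventHandler):
    """Move each newly created .html file into a folder named after its header."""
    def on_created(self, event):
        name = os.path.basename(event.src_path)
        header = re.match(r"^[^_]+(?=_)", name)
        if header is None:
            return
        target = os.path.join(REPORT_DIR, header.group())
        os.makedirs(target, exist_ok=True)
        shutil.move(event.src_path, os.path.join(target, name))

observer = Observer()
observer.schedule(ReportHandler(patterns=["*.html"]), WATCH_DIR, recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
finally:
    observer.stop()
    observer.join()

Because the handler reacts to file-creation events, the script never needs to be restarted to pick up new reports.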

Related

Locating multiple files in large dataset in python

I have a large repository of image files (~2 million, .jpg) with individual ids spread in multiple sub-dirs and I'm trying to locate and copy each image on a list containing a ~1,000 subset of these ids.
I'm still very new to Python so my first thought was to use os.walk to iterate through the 1k subset for each file, to see if any within the subset matched the id. This works, at least theoretically, but it seems incredibly slow at something like 3-5 images a second. The same seems to be the case for running through all of the files looking for one id at a time.
import shutil
import os
import csv

# Wander to Folder, Identify Files
for root, dirs, files in os.walk(ImgFolder):
    for file in files:
        fileName = ImgFolder + str(file)
        # For each file, check dictionary for match
        with open(DictFolder, 'r') as data1:
            csv_dict_reader = csv.DictReader(data1)
            for row in csv.DictReader(data1):
                img_id_line = row['id_line']
                isIdentified = (img_id_line in fileName) and ('.jpg' in fileName)
                # If id_line == file ID, copy file
                if isIdentified:
                    src = fileName + '.jpg'
                    dst = dstFolder + '.jpg'
                    shutil.copyfile(src, dst)
                else:
                    continue
I've been looking at trying to automate query searches instead, but the data is contained on a NAS and I have no easy way of indexing the files to make querying faster. The machine I'm running the code on is Windows 10, so I can't use the Ubuntu find command, which I gather is considerably better at this task.
Any way to speed up the process would be greatly appreciated!
Here's a couple of scripts that should do what you're looking for.
index.py
This script uses pathlib to walk through directories searching for files with a given extension. It will write a TSV file with two columns, filename and filepath.
import argparse
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")
    ext = "*." + args.ext
    index = {}
    with open(args.output, "w") as fh:
        for file in Path(args.input).rglob(ext):
            index[file.name] = file.resolve()
            fh.write(f"{file.name}\t{file.resolve()}\n")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument(
        "input",
        help="Top level folder which will be recursively "
        "searched for files ending with the value "
        "provided to `--ext`",
    )
    p.add_argument("output", help="Output file name for the index tsv file")
    p.add_argument(
        "--ext",
        default="jpg",
        help="Extension to search for. Don't include `*` or `.`",
    )
    main(p.parse_args())
search.py
This script will load the index (the output from index.py) into a dictionary, then load the CSV file into a dictionary, and then for each id_line it will look up the filename in the index and attempt to copy it to the output folder.
import argparse
import csv
import shutil
from collections import defaultdict
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")

    if not Path(args.dest).is_dir():
        Path(args.dest).mkdir(parents=True)

    with open(args.index) as fh:
        index = dict(l.strip().split("\t", 1) for l in fh)
    print(f"Loaded {len(index):,} records")

    csv_dict = defaultdict(list)
    with open(args.csv) as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            for (k, v) in row.items():
                csv_dict[k].append(v)
    print(f"Searching for {len(csv_dict['id_line']):,} files")

    copied = 0
    for file in csv_dict["id_line"]:
        if file in index:
            shutil.copy2(index[file], args.dest)
            copied += 1
        else:
            print(f"!! File {file!r} not found in index")

    print(f"Copied {copied} files to {args.dest}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("index", help="Index file from `index.py`")
    p.add_argument("csv", help="CSV file with target filenames")
    p.add_argument("dest", help="Target folder to copy files to")
    main(p.parse_args())
How to run this:
python index.py --ext "jpg" "C:\path\to\image\folder" "index.tsv"
python search.py "index.tsv" "targets.csv" "C:\path\to\output\folder"
I would try this on one/two folders first to check that it has the expected results.
Under the assumption that file names are unique and file locations don't change, it is possible to create a dictionary that allows searching for a file path in O(1) time. The dictionary creation process will take some time, but you can pickle it on your computer, so you only have to build it once.
A simple script to create the dictionary:
from pathlib import Path
import pickle

root = Path('path/to/root/folder')

# files extensions to index
extensions = {'.jpg', '.png'}

# iterating over whole `root` directory tree and indexing by file name
image = {file.stem: file for file in root.rglob('*.*') if file.suffix in extensions}

# saving the index on your computer for further use
index_path = Path('path/to/index.pickle')
with index_path.open('wb') as file:
    pickle.dump(image, file, pickle.HIGHEST_PROTOCOL)
An example of loading the dictionary:
from pathlib import Path
import pickle

index_path = Path('path/to/index.pickle')
with index_path.open('rb') as file:
    image = pickle.load(file)
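To complete the picture, here is a minimal sketch of the O(1) lookup-and-copy step, reusing the image dictionary loaded above; the targets.csv name, the id_line column, and the destination folder are assumptions based on the question.

import csv
import shutil
from pathlib import Path

dest = Path('path/to/output/folder')
dest.mkdir(parents=True, exist_ok=True)

with open('targets.csv', newline='') as fh:
    for row in csv.DictReader(fh):
        # O(1) dictionary lookup instead of re-walking the directory tree
        source = image.get(row['id_line'])
        if source is None:
            print(f"{row['id_line']} not found in index")
        else:
            shutil.copy2(source, dest / source.name)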

How can I know if I have already imported a module, and reload it if it has been modified?

I made a class containing a model. I also made a Qt GUI which allows me to select the file (.py) of my model class in order to import it and use it.
What I'm looking for is a way to know whether I have already imported a module (corresponding to the file I selected), and to reload it if it has changed.
To import my module from a path, I use:
fileName = QFileDialog.getOpenFileName(self, "Open Data File", "", "data files (*.py)")
if fileName[0] == '':
    return
fileName = str(fileName[0])
abspath = os.path.dirname(os.path.abspath(__file__))
self.fileName = os.path.relpath(fileName, abspath)
(filepath, filename) = os.path.split(fileName)
sys.path.append(filepath)
(shortname, extension) = os.path.splitext(filename)
mod = __import__(shortname)
Instead of mod = __import__(shortname), I need a test to know whether to import or to reload the module.
EDIT
if shortname not in sys.modules:
    mod = __import__(shortname)
else:
    importlib.reload(__import__(shortname))
I tried the previous code; however, I still have an issue. When I do importlib.reload(__import__(shortname)), it seems that when I modify the module between the first import and the second one, I still load the first version of the class. I added self.A = 0 in the class __init__, but I don't have access to it.
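For what it's worth, here is a minimal sketch of the import-or-reload pattern with importlib. The key points are that importlib.reload() returns the updated module object, which has to be reassigned, and that instances created before the reload keep the old class; the Model class name below is illustrative.

import importlib
import sys

def import_or_reload(shortname):
    """Import a module by name, or reload it if it is already in sys.modules."""
    if shortname in sys.modules:
        # reload() returns the refreshed module object; keep that reference
        return importlib.reload(sys.modules[shortname])
    return importlib.import_module(shortname)

mod = import_or_reload(shortname)
model = mod.Model()  # hypothetical class; re-create instances after every reload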

Trying to print only if the operation is executed

I am pretty bad at Python, but I'm trying to learn. I have a script that extracts .zip and .rar files for me, and that works flawlessly. Now the only thing I want to implement is that if the script extracts a .zip or a .rar, it should use Pushbullet to send a notification to my phone. This is being achieved with pushbullet.py.
Anyway here is the script as of now:
import os
from subprocess import check_call
from os.path import join
from pushbullet import Pushbullet
from pyunpack import Archive

pb = Pushbullet("APIkey")
path = "/mnt/synology/Torrents/completed"

for root, dirs, files in os.walk(path):
    if not any(f.endswith(".mkv") for f in files):
        for file in files:
            pth = join(root, file)
            found_r = False
            try:
                if file.endswith(".zip"):
                    push = pb.push_note("NUC", "Extracting")
                    Archive(pth).extractall(root)
                    found_zip = True
                elif not found_r and file.endswith((".rar")):
                    push = pb.push_note("NUC", "Extracting")
                    Archive(pth).extractall(root)
                    found_r = True
                    break
            except:
                pass
Right now it pushes to my phone on every match it finds, which is many matches and not what I want. I want it to push only on a successful extraction.
Does anyone know of a solution?
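One approach, sketched below for just the relevant loop and assuming the same pb, Archive, join, and path as in the script above: run the extraction first and send the push note only after extractall() returns without raising, so skipped or failed files never trigger a notification.

for root, dirs, files in os.walk(path):
    if any(f.endswith(".mkv") for f in files):
        continue
    for file in files:
        if not file.endswith((".zip", ".rar")):
            continue
        pth = join(root, file)
        try:
            Archive(pth).extractall(root)
        except Exception as exc:
            print(f"Failed to extract {pth}: {exc}")
            continue
        # reached only when extractall() succeeded
        pb.push_note("NUC", f"Extracted {file}")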

Recursively create directories prior to opening file for writing

I need to write to a file (truncating it), and the path it is on might not itself exist. For example, I want to write to /tmp/a/b/c/config, but /tmp/a itself might not exist. Then open('/tmp/a/b/c/config', 'w') would obviously not work, since it doesn't create the necessary directories. However, I can work with the following code:
import os

config_value = 'Foo=Bar'  # Temporary placeholder
config_dir = '/tmp/a/b/c'  # Temporary placeholder
config_file_path = os.path.join(config_dir, 'config')

if not os.path.exists(config_dir):
    os.makedirs(config_dir)

with open(config_file_path, 'w') as f:
    f.write(config_value)
Is there a more Pythonic way to do this? Both Python 2.x and Python 3.x would be nice to know (even though I use 2.x in my code, due to dependency reasons).
If you're repeating this pattern in multiple places, you could write your own small context manager that wraps open() and creates any missing directories before the file is opened:
import os

class OpenCreateDirs(object):
    """Like open(), but creates any missing parent directories first."""
    def __init__(self, filename, *args, **kwargs):
        file_dir = os.path.dirname(filename)
        if file_dir and not os.path.exists(file_dir):
            os.makedirs(file_dir)
        self._file = open(filename, *args, **kwargs)
    def __enter__(self):
        return self._file
    def __exit__(self, *exc_info):
        self._file.close()
Then your code becomes:
import os

config_value = 'Foo=Bar'  # Temporary placeholder
config_file_path = os.path.join('/tmp/a/b/c', 'config')

with OpenCreateDirs(config_file_path, 'w') as f:
    f.write(config_value)
The directories are created in __init__(), before open() is called, so by the time with OpenCreateDirs(...) as f: hands you the file object, the full parent path already exists.
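If a reusable wrapper isn't needed, the directory-creation step itself can be collapsed. A minimal sketch using exist_ok (available since Python 3.2), with the Python 2 equivalent shown in comments:

import errno
import os

config_dir = '/tmp/a/b/c'
config_value = 'Foo=Bar'

os.makedirs(config_dir, exist_ok=True)  # Python 3.2+: no error if it already exists

# Python 2 equivalent:
# try:
#     os.makedirs(config_dir)
# except OSError as e:
#     if e.errno != errno.EEXIST:
#         raise

with open(os.path.join(config_dir, 'config'), 'w') as f:
    f.write(config_value)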

Accuracy of stat mtime in Windows

Have an example piece of (Python) code to check if a directory has changed:
import os

def watch(path, fdict):
    """Checks a directory and children for changes"""
    changed = []
    for root, dirs, files in os.walk(path):
        for f in files:
            abspath = os.path.abspath(os.path.join(root, f))
            new_mtime = os.stat(abspath).st_mtime
            if not fdict.has_key(abspath) or new_mtime > fdict[abspath]:
                changed.append(abspath)
                fdict[abspath] = new_mtime
    return fdict, changed
But the accompanying unittest randomly fails unless I add at least a 2 second sleep:
import unittest
import project_creator
import os
import time

class tests(unittest.TestCase):
    def setUp(self):
        os.makedirs('autotest')
        f = open(os.path.join('autotest', 'new_file.txt'), 'w')
        f.write('New file')

    def tearDown(self):
        os.unlink(os.path.join('autotest', 'new_file.txt'))
        os.rmdir('autotest')

    def test_amend_file(self):
        changed = project_creator.watch('autotest', {})
        time.sleep(2)
        f = open(os.path.join('autotest', 'new_file.txt'), 'a')
        f.write('\nA change!')
        f.close()
        changed = project_creator.watch('autotest', changed[0])
        self.assertEqual(changed[1], [os.path.abspath(os.path.join('autotest', 'new_file.txt'))])

if __name__ == '__main__':
    unittest.main()
Is stat really limited to worse than 1 second accuracy? (Edit: apparently so, with FAT)
Is there any (cross platform) way of detecting more rapid changes?
The proper way is to watch a directory instead of polling for changes.
Check out FindFirstChangeNotification Function.
Watch a Directory for Changes is a Python implementation.
If directory watching isn't accurate enough, then probably the only alternative is to intercept file system calls.
Watchdog (http://packages.python.org/watchdog/quickstart.html) is a good project for multi-platform change notification.
If this were Linux, I'd use inotify. There's apparently a Windows inotify equivalent - the Java jnotify library has implemented it - but I don't know if there's a Python implementation.
