Check if directory is empty without using os.listdir - python

I need a function to check if a directory is empty, and it should be as fast as possible because I use it for thousands of directories that can have up to 100k files. I implemented the one below, but something seems to be wrong with the kernel32 module in Python 3 (I get OSError: exception: access violation writing 0xFFFFFFFFCE4A9500 on FindNextFileW, right from the first call):
import os
import ctypes
from ctypes.wintypes import WIN32_FIND_DATAW

def is_empty(fpath):
    ret = True
    loop = True
    fpath = os.path.join(fpath, '*')
    wfd = WIN32_FIND_DATAW()
    handle = ctypes.windll.kernel32.FindFirstFileW(fpath, ctypes.byref(wfd))
    if handle == -1:
        return ret
    while loop:
        if wfd.cFileName not in ('.', '..'):
            ret = False
            break
        loop = ctypes.windll.kernel32.FindNextFileW(handle, ctypes.byref(wfd))
    ctypes.windll.kernel32.FindClose(handle)
    return ret

print(is_empty(r'C:\\Users'))

You can use os.scandir, the iterator version of listdir, and simply return upon "iterating" the first entry, like this:
import os

def is_empty(path):
    with os.scandir(path) as scanner:
        for entry in scanner:  # this loop will have maximum 1 iteration
            return False  # found file, not empty.
    return True  # if we reached here, then empty.
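If you prefer a one-liner, the same short-circuit behaviour can be had with any(), at the cost of not closing the iterator explicitly (a minimal sketch):

import os

def is_empty(path):
    # any() stops at the first entry it sees, so at most one
    # directory entry is fetched; an empty directory yields none.
    return not any(os.scandir(path))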

Related

Appending to list during multiprocessing

I want to check if some element is already present in some list, while I am constantly updating that list. I am using multiprocessing to achieve this, but currently my list gets reinitialised every time. Any suggestions on how I could append to the list without it being reinitialised would be very helpful. Thanks in advance.
import multiprocessing as mp
import socket
import urllib.request  # needed for urlretrieve below
from PIL import Image
import hashlib
import os

# Set the default timeout in seconds
timeout = 20
socket.setdefaulttimeout(timeout)

image_hash_list = []
url_list = []
some_dict = dict()

def getImages(val):
    # import pdb; pdb.set_trace()
    # Download images
    f = open('image_files.txt', 'a')
    try:
        url = val  # preprocess the url from the input val
        local = url.split('/')[-1]  # filename generated from the url
        urllib.request.urlretrieve(url, local)
        md5hash = hashlib.md5(Image.open(local).tobytes())
        image_hash = md5hash.hexdigest()
        global image_hash_list
        global url_list
        if image_hash not in image_hash_list:
            image_hash_list.append(image_hash)
            some_dict[image_hash] = 0
            os.remove(local)
            f.write(url + '\n')
            return 1
        else:
            os.remove(local)
            print(some_dict.keys())
    except Exception as e:
        return 0

# if __name__ == '__main__':
files = "Identity.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=12)
res = pool.map(getImages, lst)
print("tempw")
Here image_hash_list gets reinitialised every time, because each worker process receives its own copy.
Use a Manager to create shared lists and dicts (and other types too): Sharing state between processes.
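A minimal sketch of that approach, adapted to the structure above (the download-and-hash logic is replaced by a hypothetical stand-in that hashes the URL string, so the example stays self-contained; Manager proxies are picklable, so they can be handed to pool workers via functools.partial):

import hashlib
import multiprocessing as mp
from functools import partial

def getImages(image_hash_list, some_dict, val):
    # Hypothetical stand-in for the download-and-hash logic in the question.
    image_hash = hashlib.md5(val.encode()).hexdigest()
    # Note: the membership test and the append are two separate proxy calls,
    # so two workers could still race between them.
    if image_hash not in image_hash_list:
        image_hash_list.append(image_hash)  # visible to every worker
        some_dict[image_hash] = 0
        return 1
    return 0

if __name__ == '__main__':
    manager = mp.Manager()
    image_hash_list = manager.list()  # shared list; no longer reinitialised per process
    some_dict = manager.dict()        # shared dict
    worker = partial(getImages, image_hash_list, some_dict)
    pool = mp.Pool(processes=12)
    res = pool.map(worker, ['http://example.com/a.jpg',
                            'http://example.com/b.jpg'])  # replace with your URL list
    pool.close()
    pool.join()
    print(res)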

grab next .zip file in folder (iterate through zip directory)

Below is my most recent attempt; but alas, I print current_file and it's always the same (first) .zip file in my directory.
Why/how can I iterate this to get to the next file in my zip directory?
My DIRECTORY_LOCATION has 4 zip files in it.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                return archive.extract(file.filename, config.UNZIP_LOCATION)
        return FileNotFoundError
find_file usage:
excel_data = pandas.read_excel(self.find_file())
Update:
I just tried changing return to yield at:
yield archive.extract(file.filename, config.UNZIP_LOCATION)
and now I get the below error at my find_file line:
ValueError: Invalid file path or buffer object type: <class 'generator'>
I then altered it to use the generator object as suggested in the comments, i.e.:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
and now I get this error:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
TypeError: 'generator' object is not callable
Here is my /main.py, if helpful:
"""Start Point"""
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
import sys
import os
import config
import datetime
# from csv import DictWriter
if __name__ == "__main__":
try:
for file in os.listdir(config.DIRECTORY_LOCATION):
if 'VCCS' in file:
PENDING_RECORDS = FindPendingRecords().get_excel_data()
# Do operations on PENDING_RECORDS
# Reads excel to map data from excel to vital
MAP_DATA = FindPendingRecords().get_mapping_data()
# Configures Driver
VITAL_ENTRY = VitalEntry()
# Start chrome and navigate to vital website
VITAL_ENTRY.instantiate_chrome()
# Begin processing Records
VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
except:
print("exception occured")
raise
The following is not tested.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                yield archive.extract(file.filename, config.UNZIP_LOCATION)
This is just your function rewritten with yield instead of return.
I think it should be used in the following way:
for extracted_archive in self.find_file():
    excel_data = pandas.read_excel(extracted_archive)
    # do whatever you want to do with excel_data here
self.find_file() returns a generator and should be used like an iterator (read this answer for more details).
Try to integrate the previous loop into your main script. On each iteration of the loop, a different file is read into excel_data, so the body of the loop is also where you should do whatever you need to do with the data.
Not sure what you mean by:
just one each time the script is executed
Even with yield, if you execute the script multiple times, you will always start from the beginning (and always get the first file). You should read all of the files in the same execution.
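For completeness, a minimal sketch of how that loop might be integrated into the posted main.py (this assumes find_file is a method of FindPendingRecords, as the self. calls suggest, and that pandas is imported):

import pandas

if __name__ == "__main__":
    finder = FindPendingRecords()
    for extracted_archive in finder.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # process each zip's extracted excel file here,
        # one iteration per archive in DIRECTORY_LOCATION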

LRU cache on hard drive python

I want to be able to decorate a function as you would with functools.lru_cache; however, I want the results to be cached on the hard drive and not in memory. Looking around, I get the feeling this is a solved problem, and I was wondering if anyone could point me in the right direction (or at least give me a few more keywords to try googling).
I don't know if this will help or if it matters, but the function is computing images from unique filenames.
Here's some code to get you started:
from pathlib import Path
import pickle
import hashlib
import os

class LRU_Cache:

    def __init__(self, directory, original_function, maxsize=10):
        self.directory = directory
        self.original_function = original_function
        self.maxsize = maxsize
        try:
            os.mkdir(directory)
        except OSError:
            pass

    def __call__(self, *args):
        filename = hashlib.sha1(pickle.dumps(args)).hexdigest()
        fullname = os.path.join(self.directory, filename)
        try:
            with open(fullname, 'rb') as f:
                value = pickle.load(f)
            Path(fullname).touch()
            return value
        except FileNotFoundError:
            pass
        value = self.original_function(*args)
        with open(fullname, 'wb') as f:
            pickle.dump(value, f)
        filenames = os.listdir(self.directory)
        if len(filenames) <= self.maxsize:
            return value
        fullnames = [os.path.join(self.directory, filename)
                     for filename in filenames]
        oldest = min(fullnames, key=lambda fn: os.stat(fn).st_mtime)
        os.remove(oldest)
        return value
It hashes the arguments to create a unique filename for each distinct call, and pickles the function's return value to that file.
Cache hits unpickle the stored result and update the file modification time.
If the cache directory exceeds the target size, the oldest cache file is removed.
Use it like this:
def square(x):
    print('!')
    return x ** 2

sqr = LRU_Cache('square_cache', square, 10)
Now call sqr normally and results will be cached to disk.
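Since the question asks for lru_cache-style decoration, the class can also be applied as a decorator. Here is a minimal sketch using functools.partial; disk_lru_cache is a name introduced here, not part of the answer above:

from functools import partial

def disk_lru_cache(directory, maxsize=10):
    # partial fills in everything except the function itself,
    # so the result can be used as a decorator.
    return partial(LRU_Cache, directory, maxsize=maxsize)

@disk_lru_cache('square_cache', maxsize=10)
def square(x):
    return x ** 2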

Problems with variable referenced before assignment when using os.path.walk

OK. I have some background in Matlab and I'm now switching to Python.
I have this bit of code, under Python 2.6.5 on 64-bit Linux, which walks through directories, finds files named 'GeneralData.dat', retrieves some data from them and stitches them into a new data set:
import pylab as p
import os, re
import linecache as ln

def LoadGenomeMeanSize(arg, dirname, files):
    for file in files:
        filepath = os.path.join(dirname, file)
        if filepath == os.path.join(dirname, 'GeneralData.dat'):
            data = p.genfromtxt(filepath)
            if data[-1,4] != 0.0:  # checking if data set is OK
                data_chopped = data[1000:-1,:]  # removing some of data
                Grand_mean = data_chopped[:,2].mean()
                Grand_STD = p.sqrt((sum(data_chopped[:,4]*data_chopped[:,3]**2) + sum((data_chopped[:,2]-Grand_mean)**2))/sum(data_chopped[:,4]))
            else:
                break
        if filepath == os.path.join(dirname, 'ModelParams.dat'):
            l = re.split(" ", ln.getline(filepath, 6))
            turb_param = float(l[2])
            arg.append((Grand_mean, Grand_STD, turb_param))

GrandMeansData = []
os.path.walk(os.getcwd(), LoadGenomeMeanSize, GrandMeansData)
GrandMeansData = sorted(GrandMeansData, key=lambda data_sort: data_sort[2])
TheMeans = p.zeros((len(GrandMeansData), 3))
i = 0
for item in GrandMeansData:
    TheMeans[i,0] = item[0]
    TheMeans[i,1] = item[1]
    TheMeans[i,2] = item[2]
    i += 1
print TheMeans  # just checking...
# later do some computation on TheMeans in NumPy
And it throws me this (though I would swear it was working a month ago):
Traceback (most recent call last):
  File "/home/User/01_PyScripts/TESTtest.py", line 29, in <module>
    os.path.walk(os.getcwd(), LoadGenomeMeanSize, GrandMeansData)
  File "/usr/lib/python2.6/posixpath.py", line 233, in walk
    walk(name, func, arg)
  File "/usr/lib/python2.6/posixpath.py", line 225, in walk
    func(arg, top, names)
  File "/home/User/01_PyScripts/TESTtest.py", line 26, in LoadGenomeMeanSize
    arg.append((Grand_mean, Grand_STD, turb_param))
UnboundLocalError: local variable 'Grand_mean' referenced before assignment
All right... so I went and did some reading and came up with this global-variable version:
import pylab as p
import os, re
import linecache as ln

Grand_mean = p.nan
Grand_STD = p.nan

def LoadGenomeMeanSize(arg, dirname, files):
    for file in files:
        global Grand_mean
        global Grand_STD
        filepath = os.path.join(dirname, file)
        if filepath == os.path.join(dirname, 'GeneralData.dat'):
            data = p.genfromtxt(filepath)
            if data[-1,4] != 0.0:  # checking if data set is OK
                data_chopped = data[1000:-1,:]  # removing some of data
                Grand_mean = data_chopped[:,2].mean()
                Grand_STD = p.sqrt((sum(data_chopped[:,4]*data_chopped[:,3]**2) + sum((data_chopped[:,2]-Grand_mean)**2))/sum(data_chopped[:,4]))
            else:
                break
        if filepath == os.path.join(dirname, 'ModelParams.dat'):
            l = re.split(" ", ln.getline(filepath, 6))
            turb_param = float(l[2])
            arg.append((Grand_mean, Grand_STD, turb_param))

GrandMeansData = []
os.path.walk(os.getcwd(), LoadGenomeMeanSize, GrandMeansData)
GrandMeansData = sorted(GrandMeansData, key=lambda data_sort: data_sort[2])
TheMeans = p.zeros((len(GrandMeansData), 3))
i = 0
for item in GrandMeansData:
    TheMeans[i,0] = item[0]
    TheMeans[i,1] = item[1]
    TheMeans[i,2] = item[2]
    i += 1
print TheMeans  # just checking...
# later do some computation on TheMeans in NumPy
It does not give error messages. It even gives a file with data... but the data are bloody wrong! I checked some of them manually by running these commands:
import pylab as p
data = p.genfromtxt(filepath)
data_chopped = data[1000:-1,:]
Grand_mean = data_chopped[:,2].mean()
Grand_STD = p.sqrt((sum(data_chopped[:,4]*data_chopped[:,3]**2)
                    + sum((data_chopped[:,2]-Grand_mean)**2))/sum(data_chopped[:,4]))
on selected files. They are different :-(
1) Can anyone explain to me what's wrong?
2) Does anyone know a solution?
I'll be grateful for help :-)
Cheers,
PTR
I would say this condition is not passing:
if filepath == os.path.join(dirname,'GeneralData.dat'):
which means you are not getting GeneralData.dat before ModelParams.dat. Maybe you need to sort the file list alphabetically, or the file is simply not there.
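If ordering is indeed the culprit, an in-place sort at the top of the callback is a quick way to test that theory (a sketch against the question's code; 'GeneralData.dat' sorts before 'ModelParams.dat'):

def LoadGenomeMeanSize(arg, dirname, files):
    files.sort()  # 'GeneralData.dat' now always precedes 'ModelParams.dat'
    for file in files:
        filepath = os.path.join(dirname, file)
        # ... rest of the callback unchanged ...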
I see one issue with the code and the solution that you have provided.
Never hide an issue of "variable referenced before assignment" by just making the variable visible. Try to understand why it happened.
Prior to creating the global variable Grand_mean, you were getting an error saying you were accessing Grand_mean before any value was assigned to it. In such a case, initializing the variable outside the function and marking it as global only serves to hide the issue.
You see erroneous results because you have made the variable visible by making it global, but the issue continues to exist: your Grand_mean was never set to correct data.
This means the section of code under "if filepath == os.path.join(dirname,..." was never executed.
Using global is not the right solution. That only makes sense if you do in fact want to reference and assign to the global Grand_mean name. The need for disambiguation comes from the way the interpreter prescans a function body for assignments: any name assigned anywhere in a function is treated as local for the whole function.
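For instance, this minimal case triggers the same UnboundLocalError:

x = 1

def f():
    print x  # UnboundLocalError: because x is assigned below,
    x = 2    # the compiler treats x as local throughout f

f()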
You should start by assigning a default value to Grand_mean within the scope of LoadGenomeMeanSize(). Only one of the four paths through the loop body actually assigns Grand_mean a semantically correct value within one iteration. You are likely running into a case where
if filepath == os.path.join(dirname,'ModelParams.dat'): is true, but either
if filepath == os.path.join(dirname,'GeneralData.dat'): or if data[-1,4] != 0.0: is not; it's likely the second condition that is failing for you. Move the arg.append(...) call so it only runs once Grand_mean has actually been assigned.
The quick and dirty answer is you probably need to rearrange your code like this:
...
if filepath == os.path.join(dirname,'GeneralData.dat'):
    data = p.genfromtxt(filepath)
    if data[-1,4] != 0.0:  # checking if data set is OK
        data_chopped = data[1000:-1,:]  # removing some of data
        Grand_mean = data_chopped[:,2].mean()
        Grand_STD = p.sqrt((sum(data_chopped[:,4]*data_chopped[:,3]**2) + sum((data_chopped[:,2]-Grand_mean)**2))/sum(data_chopped[:,4]))
        if filepath == os.path.join(dirname,'ModelParams.dat'):
            l = re.split(" ", ln.getline(filepath, 6))
            turb_param = float(l[2])
            arg.append((Grand_mean, Grand_STD, turb_param))
    else:
        break
...
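Putting that advice together, a minimal, untested sketch of a more defensive callback using the question's own names; the values get per-directory defaults and the append only happens once Grand_mean has actually been computed:

def LoadGenomeMeanSize(arg, dirname, files):
    Grand_mean = Grand_STD = None  # per-directory defaults
    if 'GeneralData.dat' in files:
        data = p.genfromtxt(os.path.join(dirname, 'GeneralData.dat'))
        if data[-1,4] != 0.0:  # data set is OK
            data_chopped = data[1000:-1,:]
            Grand_mean = data_chopped[:,2].mean()
            Grand_STD = p.sqrt((sum(data_chopped[:,4]*data_chopped[:,3]**2)
                                + sum((data_chopped[:,2]-Grand_mean)**2))
                               / sum(data_chopped[:,4]))
    # only append when GeneralData.dat was processed successfully
    if 'ModelParams.dat' in files and Grand_mean is not None:
        l = re.split(" ", ln.getline(os.path.join(dirname, 'ModelParams.dat'), 6))
        turb_param = float(l[2])
        arg.append((Grand_mean, Grand_STD, turb_param))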

Python - Check network map

I'm looking for some help on logic; the code is not very Pythonic, I'm still learning. We map the Z: drive to different locations all the time. Here is what I'm trying to accomplish:
1: Check for an old map on Z:, say \\192.168.1.100\old
2: Map the new location to Z:, say \\192.168.1.200\new
3: Make sure the new Z: mapping exists and is still connected
4: If it gets disconnected or unmapped, reconnect it and log it
90% of the code works. If I run it as is, it unmaps the old drive and maps the new drive, but the name of the old drive stays the same, even though it's mapped to the new location and I can browse it. The other problem is I only want to run checkOldDrive one time and then just let checkDrive run. Any advice is appreciated.
#!/usr/bin/python
import pywintypes
import win32com.client
import os.path
import sys
import string
import fileinput
import time
import win32net

##################################################################
# Check for old Z: map and remove it
# Map the new instance of Z:
# Check if the Z: drive exists
# if the drive exists report to status.log we are working
# if the drive DOES NOT exist map it and report errors to the log
###################################################################

def checkDrive():
    if os.path.exists('z:'):
        saveout = sys.stdout
        fsock = open('status.log', 'a')
        sys.stdout = fsock
        print os.getenv("COMPUTERNAME"), " - ", time.ctime(), " - Connected"
        sys.stdout = saveout
        fsock.close()
    else:
        ivvinetwork = win32com.client.Dispatch('Wscript.Network')
        network_drives = ivvinetwork.EnumNetworkDrives()
        for mapped_drive in [network_drives.Item(i)
                             for i in range(0, network_drives.Count() - 1, 2)
                             if network_drives.Item(i)]:
            ivvinetwork.RemoveNetworkDrive(mapped_drive, True, True)
        drive_mapping = [
            ('z:', '\\\\192.168.1.100\\newmap', 'someuser', 'somepass')]
        for drive_letter, network_path, user_name, user_pass in drive_mapping:
            try:
                ivvinetwork.MapNetworkDrive(drive_letter, network_path, True, user_name, user_pass)
                saveout = sys.stdout
                fsock = open('status.log', 'a')
                sys.stdout = fsock
                print os.getenv("COMPUTERNAME"), " - ", time.ctime(), " - ", drive_mapping, "Drive Has Been Mapped"
                sys.stdout = saveout
                fsock.close()
            except Exception, err:
                saveout = sys.stdout
                fsock = open('status.log', 'a')
                sys.stdout = fsock
                print os.getenv("COMPUTERNAME"), " - ", time.ctime(), " - ", err
                sys.stdout = saveout
                fsock.close()

def checkOldDrive():
    if os.path.exists('z:'):
        ivvinetwork = win32com.client.Dispatch('Wscript.Network')
        network_drives = ivvinetwork.EnumNetworkDrives()
        for mapped_drive in [network_drives.Item(i)
                             for i in range(0, network_drives.Count() - 1, 2)
                             if network_drives.Item(i)]:
            ivvinetwork.RemoveNetworkDrive(mapped_drive, True, True)

checkOldDrive()
checkDrive()
I've put together a script based on the one you laid out which I believe accomplishes what you have described.
I've tried to do it in a way that's both Pythonic and follows good programming principles.
In particular, I've done the following:
- modularized much of the functionality into reusable functions
- avoided repetition as much as possible (I did not factor out the hard-coded 'Z:' drive; I leave that to you as an exercise, as you see fit)
- factored the logging definition into one location, so the format, etc. are consistent and not repeated; the logging module made this easy
- moved all code out of the top-level scope (except for some global constants), which allows the script to be run directly or imported by another script as a module
- added some documentation strings to help document what each function does
- kept each function short and succinct, so it can be read more easily on a single screen and in an isolated context
Surely, there is still room for some improvement, but I have tested this script and it is functional. It should provide some good lessons while also helping you accomplish your task. Enjoy.
#!/usr/bin/env python
import os
import time
import win32com.client
import logging

old_mappings = [
    r'\\192.168.1.100\old',
    ]
new_mapping = r'\\192.168.1.200\new'
LOG_FILENAME = 'status.log'

def main():
    """
    Check to see if Z: is mapped to the old server; if so remove it and
    map the Z: to the new server.
    Then, repeatedly monitor the Z: mapping. If the Z: drive exists,
    report to status.log that we are working. Otherwise, re-map it and
    report errors to the log.
    """
    setupLogging()
    replaceMapping()
    monitorMapping()

def replaceMapping():
    if removeMapping():
        createNewMapping()

def setupLogging():
    format = os.environ['COMPUTERNAME'] + " - %(asctime)s - %(message)s"
    logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG, format=format)

def getCredentials():
    """
    Return one of three things:
    - an empty tuple
    - a tuple containing just a username (if a password is not required)
    - a tuple containing username and password
    """
    return ('someuser', 'somepass')

def createNewMapping():
    network = win32com.client.Dispatch('WScript.Network')
    params = (
        'Z:',         # drive letter
        new_mapping,  # UNC path
        True,         # update profile
        )
    params += getCredentials()
    try:
        network.MapNetworkDrive(*params)
        msg = '{params} - Drive has been mapped'
        logging.getLogger().info(msg.format(**vars()))
    except Exception as e:
        msg = 'error mapping {params}'
        logging.getLogger().exception(msg.format(**vars()))

def monitorMapping():
    while True:
        # only check once a minute
        time.sleep(60)
        checkMapping()

def checkMapping():
    if getDriveMappings()['Z:'] == new_mapping:
        msg = 'Drive is still mapped'
        logging.getLogger().info(msg.format(**vars()))
    else:
        replaceMapping()

# From Python 2.6.4 docs
from itertools import izip_longest

def grouper(n, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

def getDriveMappings():
    """
    Return a dictionary of drive letter to UNC paths as mapped on the
    system.
    """
    network = win32com.client.Dispatch('WScript.Network')
    # http://msdn.microsoft.com/en-us/library/t9zt39at%28VS.85%29.aspx
    drives = network.EnumNetworkDrives()
    # EnumNetworkDrives returns an even-length array of drive/unc pairs.
    # Use grouper to convert this to a dictionary.
    result = dict(grouper(2, drives))
    # Potentially several UNC paths will be connected but not assigned
    # to any drive letter. Since only the last will be in the
    # dictionary, remove it.
    if '' in result: del result['']
    return result

def getUNCForDrive(drive):
    """
    Get the UNC path for a mapped drive.
    Throws a KeyError if no mapping exists.
    """
    return getDriveMappings()[drive.upper()]

def removeMapping():
    """
    Remove the old drive mapping. If it is removed, or was not present,
    return True.
    Otherwise, return False or None.
    """
    mapped_drives = getDriveMappings()
    drive_letter = 'Z:'
    if not drive_letter in mapped_drives:
        return True
    if mapped_drives[drive_letter] in old_mappings:
        network = win32com.client.Dispatch('WScript.Network')
        force = True
        update_profile = True
        network.RemoveNetworkDrive(drive_letter, force, update_profile)
        return True
    # return None

if __name__ == '__main__':
    main()
