Extracting "unsigned files" from a directory - python

I have a directory with xml files associated with encrypted P7M files, meaning that for every name.xml there is a name.P7M. But there are some exceptions (the P7M file is absent) and my goal is to detect them using Python.
Here is what I have so far. Can you help me make it more elegant?
import glob
import re
# functions to eliminate the extension from a file name
def is_xml(x):
    a = re.search(r"(.*)\.xml$", x)
    if a:
        return a.group(1)
    else:
        return False
def is_P7M(x):
    a = re.search(r"(.*)\.P7M$", x)
    if a:
        return a.group(1)
    else:
        return False
# putting xml files and P7M files in two sets
setA = set(glob.glob('directory/*.xml'))
setB = set(glob.glob('directory/*.P7M'))
# eliminating extension names (building new sets; reassigning the loop
# variable would not modify the originals)
setA = {is_xml(elt) for elt in setA}
setB = {is_P7M(elt) for elt in setB}
# difference between the two sets: xml names with no matching P7M file
print "unsigned files are:", setA.difference(setB)

A simpler way is to glob for the .xml files, then check using os.path.exists for a .P7M file:
import os, glob
for xmlfile in glob.glob('*.xml'):
    if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M"):
        print xmlfile, "is unsigned"
This code:
Uses glob.glob to get all the xml files.
Uses str.rsplit to split the filename up into name and extension (e.g. "name.xml" to ("name", ".xml")). The second argument stops str.rsplit splitting more than once.
Takes the name of the file and adds the .P7M extension.
Uses os.path.exists to see if the .P7M file is there. If it isn't, the xmlfile is unsigned, so print it out.
If you need them in a list, you can do:
unsigned = [xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")]
Or a set:
unsigned = {xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")}
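For reference, the same check reads nicely on Python 3 with pathlib; a minimal sketch, assuming the files live in directory/ as in the question:
from pathlib import Path
unsigned = [p.name for p in Path('directory').glob('*.xml')
            if not p.with_suffix('.P7M').exists()]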

My solution would be:
import glob
import os
get_name = lambda fname: os.path.splitext(fname)[0]
# note: glob is case-sensitive on most filesystems, so match .P7M as named
xml_names = {get_name(fname) for fname in glob.glob('directory/*.xml')}
p7m_names = {get_name(fname) for fname in glob.glob('directory/*.P7M')}
unsigned = [xml_name + ".xml" for xml_name in
            xml_names.difference(p7m_names)]
print unsigned

Get all the xml names into a dict, removing the extension and using the name as the key, with the value initially set to False; if we find a matching P7M, set the value to True; finally, print all keys that still have a False value.
xmls = glob.glob('directory/*.xml')
p7ms = glob.glob('directory/*.P7M')
# use xml file names as keys by removing the extension
d = {k.rsplit(".", 1)[0]: False for k in xmls}
# go over every .P7M, again removing the extension,
# setting the value to True for every match
for k in p7ms:
    name = k.rsplit(".", 1)[0]
    if name in d:
        d[name] = True
# any value that is False means there is no .P7M match for the xml file
for k, v in d.items():
    if not v:
        print(k)
Or create a set of each and find the difference:
xmls = {x.rsplit(".", 1)[0] for x in glob.glob('directory/*.xml')}
p7ms = {x.rsplit(".", 1)[0] for x in glob.glob('directory/*.P7M')}
print(xmls - p7ms)

Iterate over glob once and populate a dict of filenames by extension. Finally, compute the difference between 'xml' and 'P7M' sets.
import os, glob, collections
fnames = collections.defaultdict(set)
for fname in glob.glob('*'):
    f, e = os.path.splitext(fname)
    fnames[e].add(f)
print fnames['.xml'] - fnames['.P7M']
Note that unlike other suggestions, this makes one single request to the filesystem, which might be important if the FS is slow (e.g. a network mount).
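For comparison, on Python 3 the same single-listing idea could be sketched with os.scandir:
import collections, os
fnames = collections.defaultdict(set)
with os.scandir('.') as it:  # one directory listing
    for entry in it:
        f, e = os.path.splitext(entry.name)
        fnames[e].add(f)
print(fnames['.xml'] - fnames['.P7M'])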


Python: how to iterate thru a folder to find all "docx"

I need help writing a for loop to find all ".docx" files, then edit them and save them under a new name.
I have some code that works for a single file.
I need code that iterates through a folder and picks up only the ".docx" files.
After editing, each file should be saved into another folder, retaining its current file name.
Code below:
import docx
import os
directory = '.'  # dir holding all the *.docx files
extension = '.docx'  # editable files must be "docx"

def replacer(p, replace_dict):
    inline = p.runs  # Specify the list being used
    for j in range(0, len(inline)):
        # Iterate over the dictionary
        for k, v in replace_dict.items():
            if k in inline[j].text:
                inline[j].text = inline[j].text.replace(k, v)
    return p

# Build the dict of replacements
replacements = {'Don Steel': 'TAN',
                '5538AP': '5499AP',
                'Special Mask Build Notes': 'Special Mask Build Notes: Special New receipts for TAN 4X to 5X'}

# Loop over the folder, picking up only the files ending in "docx"
for filename in os.listdir(directory):
    if filename.endswith(extension):
        doc = docx.Document(filename)
        for p in doc.paragraphs:  # iterate over the paragraphs
            p = replacer(p, replacements)
        # save under the original name in the (existing) folder "newOrder"
        doc.save(os.path.join('newOrder', filename))
A simple but effective answer: you could use the glob module, as it has pattern searching built in. See the following example:
import glob
for i in glob.glob(r'E:\drivers\**\*.pdf', recursive=True):
    print(i)
This would look at all the files in E:\drivers\, then look through every subdirectory for PDF files. You can see more at the documentation page: https://docs.python.org/3/library/glob.html
# NEW FUNCTION:
import os
import docx
import glob

def replacer(p, replace_dict):
    inline = p.runs  # Specify the list being used
    for j in range(0, len(inline)):
        # Iterate over the dictionary
        for k, v in replace_dict.items():
            if k in inline[j].text:
                inline[j].text = inline[j].text.replace(k, v)
    return p

# Build the dict of replacements
replacements = {'Don Coffman': 'TOPPAN',
                'MG5459APSIM': 'MG5499AP',
                'Special Mask Build Notes': 'Special Mask Build Notes: TOPPAN 4X to 5X'}

folder = r'C:\Users\LOCAL\Documents\TOPPAN_MOR_BASE_PART'
for i in glob.glob(os.path.join(folder, '*.docx')):
    doc = docx.Document(i)  # Get the file
    for p in doc.paragraphs:  # If needed, iter over paragraphs
        p = replacer(p, replacements)  # Call the new replacer function
    # save each file under its original name in the "NewOrder" subfolder
    doc.save(os.path.join(folder, 'NewOrder', os.path.basename(i)))

Python Sort List By Number in File Name

I have a bunch of files that were downloaded and I'm trying to get the most recently downloaded version for my analysis. Obviously the sorting is based on text rather than numbers, so I'm running into the issue where File 30 comes before File 4. The numbers are within () every time (your normal copied download). How would I sort based on that number?
Filename (1)
Filename (2)
...
Filename (30)
Filename (4)
...
files = glob.glob(r"C:\Users\xxxxx\Downloads\Filename*")
#files = files.sort(reverse=True)
files = sorted(files, reverse = True)
print(files)
exit()
Using Regex with pattern r"\((\d+)\)" to extract the number inside the brackets and then convert to int for sorting.
Ex:
import glob
import re
files = glob.glob(r"C:\Users\xxxxx\Downloads\Filename*")
files = sorted(files, key=lambda x: int(re.search(r"\((\d+)\)", x).group(1)), reverse=True)
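Note that re.search returns None for any name without a "(n)" part (e.g. the very first download), and the lambda would then raise AttributeError. A defensive sketch (treating such names as copy 0 is an assumption):
import glob, re

def copy_number(name):
    m = re.search(r"\((\d+)\)", name)
    return int(m.group(1)) if m else 0  # assumption: no "(n)" means the original copy

files = sorted(glob.glob(r"C:\Users\xxxxx\Downloads\Filename*"),
               key=copy_number, reverse=True)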
You can use key inside the sorted function. The key uses a function that gets the int value between the brackets.
e.g.
import glob

def get_num(x):
    return int(x.split('(')[1].lstrip().split(')')[0])

files = glob.glob(r"C:\Users\xxxxx\Downloads\Filename*")
files = sorted(files, key=get_num, reverse=True)
print(files)
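Either way, since the list is sorted in reverse, the first entry is the highest-numbered copy, i.e. the most recent download (assuming the list is non-empty):
latest = files[0]  # e.g. r"C:\Users\xxxxx\Downloads\Filename (30)"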

Traversing through each directory to identify file types and the count of each type?

I am trying to build a script that can help me traverse all the files in a directory and identify their file types. At the end, the result should print the total count of each file type identified. I am using the magic library to identify the file type based on MIME.
import os
import magic

for filename in os.listdir(os.getcwd()):
    print filename
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        t = m.id_filename(filename)
        print t
The identification piece, pasted above, seems to be working fine, but I am not sure how to store the identified file types and their counts. The output should look like:
filetype1 count
filetype2 count
...
...
Please guide me as to what should be the ideal way of doing it.
You can create a dictionary containing a mapping of each file type to its count. e.g.
file_types = {'filetype1' : 10, 'filetype2': 20, ...}
Note that your current solution will only work on the current directory and not subdirectories.
file_types = {}
for filename in os.listdir(os.getcwd()):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        t = m.id_filename(filename)
        file_types.setdefault(t, 0)
        file_types[t] += 1
...
That should collect and count the types for you.
You could use the Counter class from the collections module. It is basically a variant of a dictionary, with a few additional methods and the advantage that you don't need to initialize it with 0 when counting.
I don't have the magic module that you mention, so here's an example using my_magic as a substitute:
import collections
import os

def my_magic(filename):
    """
    This function is just a placeholder to be used in place of your
    id_filename() method.
    """
    if filename.endswith(".txt"):
        return "TXT"
    elif filename.endswith(".pdf"):
        return "PDF"
    else:
        return "other"

# initialize the counter object:
counter = collections.Counter()
for filename in os.listdir(os.getcwd()):
    print filename
    # substitute the next line with whatever you use to determine the
    # type of the file:
    t = my_magic(filename)
    print t
    # increase the count for the current value of 't':
    counter[t] += 1
# output what is in counter:
for ext, n in counter.items():
    print ext, n
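As a side note, since Counter accepts any iterable, the counting loop can collapse into a single expression once you drop the prints:
counter = collections.Counter(my_magic(f) for f in os.listdir(os.getcwd()))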

Find all text files not containing some text string

I'm on Python 2.7.1 and I'm trying to identify all text files that don't contain some text string.
The program seemed to be working at first but whenever I add the text string to a file, it keeps coming up as if it doesn't contain it (false positive). When I check the contents of the text file, the string is clearly present.
The code I tried to write is
import os

def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    fList = []
    for fol, fols, fils in os.walk(rdir):
        fList.extend([os.path.join(rdir, fol, fil) for fil in fils if fil.endswith(extens) and fil.startswith(start)])
    if fList:
        for fil in fList:
            rFil = open(fil)
            for line in rFil:
                if not cSens:
                    line, sstring = line.lower(), sstring.lower()
                if sstring in line:
                    fList.remove(fil)
                    break
            rFil.close()
    if fList:
        plur = 'files do' if len(fList) > 1 else 'file does'
        print '\nThe following %d %s not contain "%s":\n' % (len(fList), plur, sstring)
        for fil in fList:
            print fil
    else:
        print 'No files were found that don\'t contain %(sstring)s.' % locals()

scanFiles2(rdir=r'C:\temp', sstring='!!syn', extens='.html', start='#', cSens=False)
I guess there's a flaw in the code but I really don't see it.
UPDATE
The code still comes up with many false positives: files that do contain the search string but are identified as not containing it.
Could text encoding be an issue here? I prefixed the search string with U to account for Unicode encoding but it didn't make any difference.
Does Python in some way cache file contents? I don't think so, but that could somewhat account for files still popping up after having been corrected.
Could some kind of malware cause symptoms like these? Seems highly unlikely to me but I'm kinda desperate to get this fixed.
Modifying a list while iterating over it causes unexpected results:
For example:
>>> lst = [1,2,4,6,3,8,0,5]
>>> for n in lst:
...     if n % 2 == 0:
...         lst.remove(n)
...
>>> lst
[1, 4, 3, 0, 5]
Workaround: iterate over a copy.
>>> lst = [1,2,4,6,3,8,0,5]
>>> for n in lst[:]:
...     if n % 2 == 0:
...         lst.remove(n)
...
>>> lst
[1, 3, 5]
Alternatively, you can append each valid file path instead of removing from the whole file list.
Modified version (appending files that do not contain sstring instead of removing):
def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    if not cSens:
        # This only needs to be done once.
        sstring = sstring.lower()
    fList = []
    for fol, fols, fils in os.walk(rdir):
        for fil in fils:
            if not (fil.startswith(start) and fil.endswith(extens)):
                continue
            fil = os.path.join(fol, fil)
            with open(fil) as rFil:
                for line in rFil:
                    if not cSens:
                        line = line.lower()
                    if sstring in line:
                        break
                else:
                    fList.append(fil)
...
list.remove takes O(n) time, while list.append takes O(1). See Time Complexity.
Use with statement if possible.
falsetru already showed you why you should not remove items from a list while looping over it: list iterators do not and cannot update their position when the list is shortened, so after you remove the item at index 3, the next iteration fetches index 4, which now holds the element that used to be at index 5, and the element that was at index 4 is skipped.
List comprehension version using fnmatch.filter() and any() and a filter lambda for case insensitive matching:
import fnmatch
import os

def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    lfilter = (lambda l, s=sstring: s in l) if cSens else (lambda l, s=sstring.lower(): s in l.lower())
    ffilter = '{}*{}'.format(start, extens)
    return [os.path.join(r, fname)
            for r, _, f in os.walk(rdir)
            for fname in fnmatch.filter(f, ffilter)
            if not any(lfilter(l) for l in open(os.path.join(r, fname)))]
but perhaps you'd be better off sticking to a more readable loop:
def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    lfilter = (lambda l, s=sstring: s in l) if cSens else (lambda l, s=sstring.lower(): s in l.lower())
    ffilter = '{}*{}'.format(start, extens)
    result = []
    for root, _, files in os.walk(rdir):
        for fname in fnmatch.filter(files, ffilter):
            fname = os.path.join(root, fname)
            with open(fname) as infh:
                if not any(lfilter(l) for l in infh):
                    result.append(fname)
    return result
Another alternative that opens the searching up for using regular expressions (although just using grep with appropriate options would still be better):
import fnmatch
import mmap
import os
import re

def scan_files(rootdir, search_string, extension, start='', case_sensitive=False):
    rx = re.compile(re.escape(search_string), flags=re.I if not case_sensitive else 0)
    name_filter = start + '*' + extension
    for root, dirs, files in os.walk(rootdir):
        for fname in fnmatch.filter(files, name_filter):
            with open(os.path.join(root, fname)) as fin:
                try:
                    mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
                except ValueError:
                    continue  # empty files etc.... include this or not?
                if not next(rx.finditer(mm), None):
                    yield fin.name
Then use list on that if you want the names materialised or treat it as you would any other generator...
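For example, with the arguments from the question:
unsigned = list(scan_files(r'C:\temp', '!!syn', '.html', start='#'))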
Please do not write a Python program for that. This program already exists. Use grep (capital -L lists the files that do not contain a match):
grep * -ILre 'main' 2> /dev/null
99client/.git/COMMIT_EDITMSG
99client/taxis-android/build/incremental/mergeResources/production/merger.xml
99client/taxis-android/build/incremental/mergeResources/production/inputs.data
99client/taxis-android/build/incremental/mergeResources/production/outputs.data
99client/taxis-android/build/incremental/mergeResources/release/merger.xml
99client/taxis-android/build/incremental/mergeResources/release/inputs.data
99client/taxis-android/build/incremental/mergeResources/release/outputs.data
99client/taxis-android/build/incremental/mergeResources/debug/merger.xml
99client/taxis-android/build/incremental/mergeResources/debug/inputs.data
(...)
http://www.gnu.org/savannah-checkouts/gnu/grep/manual/grep.html#Introduction
If you need the list in Python, simply execute grep from it and collect the result.
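A minimal sketch of that on Python 3, assuming grep is on the PATH (note grep exits with status 1 when nothing is selected, so avoid check=True):
import subprocess

proc = subprocess.run(['grep', '-rIL', 'main', '.'],
                      capture_output=True, text=True)
files_without_match = proc.stdout.splitlines()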

batch renaming 100K files with python

I have a folder with over 100,000 files, all numbered with the same stub, but without leading zeros, and the numbers aren't always contiguous (usually they are, but there are gaps) e.g:
file-21.png,
file-22.png,
file-640.png,
file-641.png,
file-642.png,
file-645.png,
file-2130.png,
file-2131.png,
file-3012.png,
etc.
I would like to batch process this to create padded, contiguous files. e.g:
file-000000.png,
file-000001.png,
file-000002.png,
file-000003.png,
When I parse the folder with for filename in os.listdir('.'): the files don't come up in the order I'd like them to. Understandably they come up
file-1,
file-1x,
file-1xx,
file-1xxx,
etc. then
file-2,
file-2x,
file-2xx,
etc. How can I get it to go through in order of numeric value? I am a complete Python noob, but looking at the docs I'm guessing I could use map to create a new list filtering out only the numerical part, then sort that list, then iterate over it? With over 100K files this could be heavy. Any tips welcome!
import os
import re

thenum = re.compile(r'^file-(\d+)\.png$')

def bynumber(fn):
    mo = thenum.match(fn)
    if mo: return int(mo.group(1))

allnames = os.listdir('.')
allnames.sort(key=bynumber)
Now you have the files in the order you want them and can loop
for i, fn in enumerate(allnames):
    ...
using the progressive number i (which will be 0, 1, 2, ...) padded as you wish in the destination-name.
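For instance, a minimal rename step (the 6-digit width and the 'file-' stub are assumptions based on the example names in the question):
for i, fn in enumerate(allnames):
    os.rename(fn, 'file-%06d.png' % i)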
There are three steps. The first is getting all the filenames. The second is converting the filenames. The third is renaming them.
If all the files are in the same folder, then glob should work.
import glob
filenames = glob.glob("/path/to/folder/*.txt")
Next, you want to change the name of the file. You can pad the number with zeros to do this.
>>> filename = "file-338.txt"
>>> import os
>>> fnpart = os.path.splitext(filename)[0]
>>> fnpart
'file-338'
>>> _, num = fnpart.split("-")
>>> num.rjust(5, "0")
'00338'
>>> newname = "file-%s.txt" % num.rjust(5, "0")
>>> newname
'file-00338.txt'
Now, you need to rename them all. os.rename does just that.
os.rename(filename, newname)
To put it together:
for filename in glob.glob("/path/to/folder/*.txt"):  # loop through each file
    newname = make_new_filename(filename)  # create a function that does step 2, above
    os.rename(filename, newname)
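The make_new_filename helper is left to the reader above; a minimal sketch following step 2 (the 5-digit padding mirrors the earlier example) might be:
import os

def make_new_filename(filename):
    # "file-338.txt" -> "file-00338.txt"
    fnpart, ext = os.path.splitext(filename)
    stub, num = fnpart.rsplit("-", 1)
    return "%s-%s%s" % (stub, num.rjust(5, "0"), ext)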
Thank you all for your suggestions, I will try them all to learn the different approaches. The solution I went for is based on using a natural sort on my filelist, and then iterating that to rename. This was one of the suggested answers but for some reason it has disappeared now so I cannot mark it as accepted!
import os
files = os.listdir('.')
natsort(files)
index = 0
for filename in files:
    os.rename(filename, str(index).zfill(7)+'.png')
    index += 1
where natsort is defined in http://code.activestate.com/recipes/285264-natural-string-sorting/
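If you would rather not pull in the recipe, a rough equivalent for these particular names is to sort on the embedded integer (assuming every name contains exactly one run of digits):
import re
files.sort(key=lambda f: int(re.search(r'\d+', f).group()))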
Why don't you do it as a two-step process? Parse all the files and rename them with padded numbers, then run another script that takes those files, which now sort correctly, and renames them so they're contiguous.
1) Take the number in the filename.
2) Left-pad it with zeros
3) Save name.
import os

def renamer():
    for iname in os.listdir('.'):
        first, second = iname.replace(" ", "").split("-")
        number, ext = second.split('.')
        first, number, ext = first.strip(), number.strip(), ext.strip()
        number = '0'*(6-len(number)) + number  # pad the number to 6 digits
        oname = first + "-" + number + '.' + ext
        os.rename(iname, oname)
    print "Done"
Hope this helps
The simplest method is given below. You can also modify this script for recursive search. It uses the os module: get the file names with os.listdir, then rename with os.rename.
import os

class Renamer:
    def __init__(self, pattern, extension):
        self.ext = extension
        self.pat = pattern

    def rename(self):
        p, e = (self.pat, self.ext)
        number = 0
        for x in os.listdir(os.getcwd()):
            if x.endswith(f".{e}"):
                os.rename(x, f'{p}_{number}.{e}')
                number += 1

if __name__ == "__main__":
    pattern = "myfile"
    extension = "txt"
    r = Renamer(pattern=pattern, extension=extension)
    r.rename()
