Find all text files not containing some text string - python

I'm on Python 2.7.1 and I'm trying to identify all text files that don't contain some text string.
The program seemed to be working at first but whenever I add the text string to a file, it keeps coming up as if it doesn't contain it (false positive). When I check the contents of the text file, the string is clearly present.
The code I tried to write is
def scanFiles2(rdir,sstring,extens,start = '',cSens = False):
    fList = []
    for fol,fols,fils in os.walk(rdir):
        fList.extend([os.path.join(rdir,fol,fil) for fil in fils if fil.endswith(extens) and fil.startswith(start)])
    if fList:
        for fil in fList:
            rFil = open(fil)
            for line in rFil:
                if not cSens:
                    line,sstring = line.lower(), sstring.lower()
                if sstring in line:
                    fList.remove(fil)
                    break
            rFil.close()
    if fList:
        plur = 'files do' if len(fList) > 1 else 'file does'
        print '\nThe following %d %s not contain "%s":\n'%(len(fList),plur,sstring)
        for fil in fList:
            print fil
    else:
        print 'No files were found that don\'t contain %(sstring)s.'%locals()

scanFiles2(rdir = r'C:\temp',sstring = '!!syn',extens = '.html', start = '#', cSens = False)
I guess there's a flaw in the code but I really don't see it.
UPDATE
The code still comes up with many false positives: files that do contain the search string but are identified as not containing it.
Could text encoding be an issue here? I prefixed the search string with U to account for Unicode encoding but it didn't make any difference.
Does Python in some way cache file contents? I don't think so, but that could account for files still popping up after having been corrected.
Could some kind of malware cause symptoms like these? Seems highly unlikely to me but I'm kinda desperate to get this fixed.

Modifying a list while iterating over it causes unexpected results:
For example:
>>> lst = [1,2,4,6,3,8,0,5]
>>> for n in lst:
...     if n % 2 == 0:
...         lst.remove(n)
...
>>> lst
[1, 4, 3, 0, 5]
Workaround: iterate over a copy of the list:
>>> lst = [1,2,4,6,3,8,0,5]
>>> for n in lst[:]:
...     if n % 2 == 0:
...         lst.remove(n)
...
>>> lst
[1, 3, 5]
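The copy can equally be made with list(); a minimal sketch of the same workaround:
>>> lst = [1,2,4,6,3,8,0,5]
>>> for n in list(lst):
...     if n % 2 == 0:
...         lst.remove(n)
...
>>> lst
[1, 3, 5]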
Alternatively, you can append valid file paths instead of removing them from the whole file list.
Modified version (appending files that do not contain sstring instead of removing):
def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    if not cSens:
        # This only needs to be done once.
        sstring = sstring.lower()
    fList = []
    for fol, fols, fils in os.walk(rdir):
        for fil in fils:
            if not (fil.startswith(start) and fil.endswith(extens)):
                continue
            fil = os.path.join(fol, fil)
            with open(fil) as rFil:
                for line in rFil:
                    if not cSens:
                        line = line.lower()
                    if sstring in line:
                        break
                else:
                    fList.append(fil)
...
list.remove takes O(n) time, while list.append takes O(1). See Time Complexity.
Use the with statement where possible.

Falsetru already showed you why you should not remove items from a list while looping over it; list iterators do not and cannot update their counter when the list is shortened, so if the item at index 3 was processed and removed, the item handled at index 4 on the next iteration is the one that previously sat at index 5.
List comprehension version using fnmatch.filter() and any() and a filter lambda for case-insensitive matching:
import fnmatch

def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    lfilter = sstring.__eq__ if cSens else lambda l, s=sstring.lower(): l.lower() == s
    ffilter = '{}*{}'.format(start, extens)
    return [os.path.join(r, fname)
            for r, _, f in os.walk(rdir)
            for fname in fnmatch.filter(f, ffilter)
            if not any(lfilter(l) for l in open(os.path.join(r, fname)))]
but perhaps you'd be better off sticking to a more readable loop:
def scanFiles2(rdir, sstring, extens, start='', cSens=False):
    lfilter = sstring.__eq__ if cSens else lambda l, s=sstring.lower(): l.lower() == s
    ffilter = '{}*{}'.format(start, extens)
    result = []
    for root, _, files in os.walk(rdir):
        for fname in fnmatch.filter(files, ffilter):
            fname = os.path.join(root, fname)
            with open(fname) as infh:
                if not any(lfilter(l) for l in infh):
                    result.append(fname)
    return result
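A minimal usage sketch of the function above, reusing the sample arguments from the question (the path and search string are just those example values):
matches = scanFiles2(r'C:\temp', '!!syn', '.html', start='#', cSens=False)
for fname in matches:
    print fname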

Another alternative that opens the search up to regular expressions (although just using grep with appropriate options would still be better):
import mmap
import os
import re
import fnmatch

def scan_files(rootdir, search_string, extension, start='', case_sensitive=False):
    rx = re.compile(re.escape(search_string), flags=re.I if not case_sensitive else 0)
    name_filter = start + '*' + extension
    for root, dirs, files in os.walk(rootdir):
        for fname in fnmatch.filter(files, name_filter):
            with open(os.path.join(root, fname)) as fin:
                try:
                    mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
                except ValueError:
                    continue  # empty files etc.... include this or not?
                if not next(rx.finditer(mm), None):
                    yield fin.name
Then use list() on that if you want the names materialised, or treat it as you would any other generator...
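For instance, a small usage sketch of the generator with the sample arguments from the question:
for path in scan_files(r'C:\temp', '!!syn', '.html', start='#'):
    print path
# or materialise the results in one go:
unmatched = list(scan_files(r'C:\temp', '!!syn', '.html', start='#'))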

Please do not write a python program for that. This program already exists. Use grep:
grep * -Ilre 'main' 2> /dev/null
99client/.git/COMMIT_EDITMSG
99client/taxis-android/build/incremental/mergeResources/production/merger.xml
99client/taxis-android/build/incremental/mergeResources/production/inputs.data
99client/taxis-android/build/incremental/mergeResources/production/outputs.data
99client/taxis-android/build/incremental/mergeResources/release/merger.xml
99client/taxis-android/build/incremental/mergeResources/release/inputs.data
99client/taxis-android/build/incremental/mergeResources/release/outputs.data
99client/taxis-android/build/incremental/mergeResources/debug/merger.xml
99client/taxis-android/build/incremental/mergeResources/debug/inputs.data
(...)
http://www.gnu.org/savannah-checkouts/gnu/grep/manual/grep.html#Introduction
If you need the list in python, simply execute grep from it and collect the result.
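A hedged sketch of doing exactly that from Python 2; it assumes GNU grep is on the PATH and uses the -L option, which lists files that do not contain a match (what the question asks for):
import subprocess

def files_without_string(rootdir, text):
    # -r: recurse into rootdir, -I: skip binary files, -i: ignore case,
    # -L: print names of files that do NOT contain a match,
    # -F: treat the search text as a fixed string rather than a regex
    proc = subprocess.Popen(['grep', '-rIiLF', text, rootdir],
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate()
    return out.splitlines()

print files_without_string('.', '!!syn')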

Related

Regex on list element in for loop

I have a script that searches through config files and finds all matches of strings from another list as follows:
dstn_dir = "C:/xxxxxx/foobar"
dst_list = []
files = [fn for fn in os.listdir(dstn_dir) if fn.endswith('txt')]
dst_list = []
for file in files:
    parse = CiscoConfParse(dstn_dir + '/' + file)
    for sfarm in search_str:
        int_objs = parse.find_all_children(sfarm)
        if len(int_objs) > 0:
            dst_list.append(["\n", "#" * 40, file + " " + sfarm, "#" * 40])
            dst_list.append(int_objs)
I need to change this part of the code:
for sfarm in search_str:
int_objs = parse.find_all_children(sfarm)
search_str is a list containing strings similar to ['xrout:55','old:23'] and many others.
So it will only find entries that end with the string from the list I am iterating through in sfarm. My understanding is that this would require me to use re and match on something like sfarm$, but I'm not sure how to do this as part of the loop.
Am I correct in saying that sfarm is an iterable? If so, I need to know how to regex on an iterable object in this context.
Strings in python are iterable, so sfarm is an iterable, but that has little meaning in this case. From reading what CiscoConfParse.find_all_children() does, it is apparent that your sfarm is the linespec, which is a regular expression string. You do not need to explicitly use the re module here; just pass sfarm concatenated with '$':
search_str = ['xrout:55','old:23']
...
for sfarm in search_str:
    int_objs = parse.find_all_children(sfarm + '$')  # one of many ways to concat
...
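One caveat, since each list entry is used as a regular expression: if a search string ever contains regex metacharacters, escaping it first keeps the intended literal-match-plus-anchor behaviour. A minimal sketch of that variant:
import re

for sfarm in search_str:
    int_objs = parse.find_all_children(re.escape(sfarm) + '$')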
Please check this code. It uses the glob module to get all "*.txt" files in the folder.
See the glob module documentation for more info on it.
import glob
import re

dst_list = []
search_str = ['xrout:55','old:23']
for file_name in glob.glob(r'C:/Users/dinesh_pundkar\Desktop/*.txt'):
    with open(file_name, 'r') as f:
        text = f.read()
        for sfarm in search_str:
            regex = re.compile('%s$' % sfarm)
            int_objs = regex.findall(text)
            if len(int_objs) > 0:
                dst_list.append(["\n", "#" * 40, file_name + " " + sfarm, "#" * 40])
                dst_list.append(int_objs)
print dst_list
Output:
C:\Users\dinesh_pundkar\Desktop>python a.py
[['\n', '########################################', 'C:/Users/dinesh_pundkar\\De
sktop\\out.txt old:23', '########################################'], ['old:23']]
C:\Users\dinesh_pundkar\Desktop>

Extracting "unsigned files" from a directory

I have a directory with xml files associated with encrypted P7M files, meaning that for every name.xml there is a name.P7M. But there are some exceptions (P7M file is absent) and my goal is to detect them using python.
I'm thinking of the code below. Can you help me make it more elegant?
import glob
import re

# functions to eliminate the extension name
def is_xml(x):
    a = re.search(r"(\s)(.xml)", x)
    if a:
        return a.group(0)
    else:
        return False

def is_P7M(x):
    a = re.search(r"(\s)(.P7M)", x)
    if a:
        return a.group(0)
    else:
        return False

# putting xml files and P7M files in two sets
setA = set(glob.glob('directory/*.xml'))
setB = set(glob.glob('directory/*.P7M'))

# eliminating extension names
for elt in setA:
    elt = is_xml(elt)
for elt in setB:
    elt = is_P7M(elt)

# difference between the two sets. setB is always the larger set
print "unsigned files are:", setB.difference(setA)
A simpler way is to glob for the .xml files, then check using os.path.exists for a .P7M file:
import os, glob

for xmlfile in glob.glob('*.xml'):
    if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M"):
        print xmlfile, "is unsigned"
This code:
Uses glob.glob to get all the xml files.
Uses str.rsplit to split the filename up into name and extension (e.g. "name.xml" to ["name", "xml"]). The second argument stops str.rsplit from splitting more than once.
Takes the name of the file and adds the .P7M extension.
Uses os.path.exists to see if the P7M file is there. If it isn't, the xmlfile is unsigned, so print it out.
If you need them in a list, you can do:
unsigned = [xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")]
Or a set:
unsigned = {xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")}
My solution would be:
import glob
import os

get_name = lambda fname: os.path.splitext(fname)[0]
xml_names = {get_name(fname) for fname in glob.glob('directory/*.xml')}
p7m_names = {get_name(fname) for fname in glob.glob('directory/*.p7m')}

unsigned = [xml_name + ".xml" for xml_name in
            xml_names.difference(p7m_names)]
print unsigned
Get all the xml names into a dict, removing the extension and using the name as the key, with the value initially set to False. If we find a matching P7M, set the value to True; finally, print all keys that still have a False value.
xmls = glob.glob('directory/*.xml')
p7ms = glob.glob('directory/*.P7M')

# use xml file names as keys by removing the extension
d = {k.rsplit(".", 1)[0]: False for k in xmls}

# go over every .P7M again removing the extension,
# setting the value to True for every match
for k in p7ms:
    d[k.rsplit(".", 1)[0]] = True

# any value that is False means there is no .P7M match for the xml file
for k, v in d.items():
    if not v:
        print(k)
Or create a set of each and find the difference:
xmls = {x.rsplit(".",1)[0] for x in glob.glob('directory/*.xml')}
pm7s = {x.rsplit(".",1)[0] for x in glob.glob('directory/*.P7M')}
print(xmls - pm7s)
Iterate over glob once and populate a dict of filenames by extension. Finally, compute the difference between 'xml' and 'P7M' sets.
import os, glob, collections

fnames = collections.defaultdict(set)
for fname in glob.glob('*'):
    f, e = os.path.splitext(fname)
    fnames[e].add(f)

print fnames['.xml'] - fnames['.P7M']
Note that unlike other suggestions, this makes one single request to the filesystem, which might be important if the FS is slow (e.g. a network mount).

Faster operation reading file

I have to process a 15MB txt file (nucleic acid sequence) and find all the different substrings (size 5). For instance:
ABCDEF
would return 2, as we have both ABCDE and BCDEF, but
AAAAAA
would return 1. My code:
control_var = 0
f = open("input.txt", "r")
list_of_substrings = []
while f.read(5) != "":
    f.seek(control_var)
    aux = f.read(5)
    if aux not in list_of_substrings:
        list_of_substrings.append(aux)
    control_var += 1
f.close()
print len(list_of_substrings)
Would another approach be faster (instead of comparing the strings directly from the file)?
Depending on what your definition of a legal substring is, here is a possible solution (the lookahead (?=(...)) lets the matches overlap, so both ABCDE and BCDEF are captured from ABCDEF):
import re

regex = re.compile(r'(?=(\w{5}))')
with open('input.txt', 'r') as fh:
    input = fh.read()

print len(set(re.findall(regex, input)))
Of course, you may replace \w with whatever you see fit to qualify as a legal character in your substring. [A-Za-z0-9], for example, will match all alphanumeric characters.
Here is an execution example:
>>> import re
>>> regex = re.compile(r'(?=(\w{5}))')
>>> input = "ABCDEF GABCDEF"
>>> set(re.findall(regex, input))
set(['GABCD', 'ABCDE', 'BCDEF'])
EDIT: Following your comment above that all characters in the file are valid, excluding the last one (which is \n), it seems that there is no real need for regular expressions here and the iteration approach is much faster. You can benchmark it yourself with this code (note that I slightly modified the functions to reflect your update regarding the definition of a valid substring):
import timeit
import re

FILE_NAME = r'input.txt'

def re_approach():
    return len(set(re.findall(r'(?=(.{5}))', input[:-1])))

def iter_approach():
    return len(set([input[i:i+5] for i in xrange(len(input[:-6]))]))

with open(FILE_NAME, 'r') as fh:
    input = fh.read()

# verify that the output of both approaches is identical
assert set(re.findall(r'(?=(.{5}))', input[:-1])) == set([input[i:i+5] for i in xrange(len(input[:-6]))])

print timeit.repeat(stmt = re_approach, number = 500)
print timeit.repeat(stmt = iter_approach, number = 500)
15MB doesn't sound like a lot. Something like this probably would work fine:
import collections, re

contents = open('input.txt', 'r').read()
counter = collections.Counter(re.findall('.{5}', contents))
print len(counter)
Update
I think user590028 gave a great solution, but here is another option:
contents = open('input.txt', 'r').read()
print set(contents[start:start+5] for start in range(0, len(contents) - 4))
# Or using a dictionary
# dict([(contents[start:start+5],True) for start in range(0, len(contents) - 4)]).keys()
You could use a dictionary, where each key is a substring. It will take care of duplicates, and you can just count the keys at the end.
So: read through the file once, storing each substring in the dictionary, which will handle finding duplicate substrings & counting the distinct ones.
Reading all at once is more i/o efficient, and using a dict() is going to be faster than testing for existence in a list. Something like:
fives = {}
buf = open('input.txt').read()
for x in xrange(len(buf) - 4):
    key = buf[x:x+5]
    fives[key] = 1

for keys in fives.keys():
    print keys
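If only the number of distinct substrings is wanted, as in the question, the dictionary's length gives it directly; a one-line follow-up to the loop above:
print len(fives)  # count of distinct 5-character substrings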

Python script to match a particular text in filename and count the number of such files

In a folder I have files with names like the following:
Q1234_ABC_B02_12232.hl7
12313_SDDD_Q03_44545.hl7
Q43434_SAD_B02_2312.hl7
4324_SDSD_W05_344423423.hl7
3123123_DSD_D06_67578.hl7
and many such files
I need to write a python script to count the number of files whose names begin with "Q" and have "B02" after the second underscore, which means I should get a count of 2 for the names above. I have tried the following script but did not get the desired result.
import re
import os

resultsDict = {}
myString1 = ""
regex = r'[^_]+_([^_]*)_.*'
for file_name in os.listdir("."):
    m = file_name.split("_")
    if len(m) > 2:
        myString = m[2]
        if "B02" in myString:
            myString1 = myString
            if myString1 in resultsDict:
                resultsDict[myString1] += 1
            else:
                resultsDict.update({myString1: 1})
    else:
        print "error in the string! there are less then 2 _"
print resultsDict
I am using python 2.6.6. Any help would be useful.
As of this writing, there are several answers with a wrong regex. One of these is probably better:
r'^Q[^_]*_[^_]*_B02_.*'
r'^Q[^_]*_[^_]*_B02.*'
r'^Q[^_]*_[^_]*_B02(_.*|$)'
If you stick with .* instead, the regex might consume some intermediate underscore, so you are no longer able to enforce that B02 comes after the second _.
After that, testing for matching values (re.match) is a simple loop over the various file names (os.listdir or glob.glob). Here is an example using a list comprehension:
>>> l = [file for file in os.listdir(".") if re.match(r'^Q[^_]*_[^_]*_B02.*', file)]
>>> l
['Q1234_ABC_B02_12232.hl7', 'Q43434_SAD_B02_2312.hl7']
>>> len(l)
2
For better performance you might wish to compile the regex first (re.compile).
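A minimal sketch of that compiled variant, using the same pattern and directory listing as above:
pattern = re.compile(r'^Q[^_]*_[^_]*_B02.*')
l = [f for f in os.listdir(".") if pattern.match(f)]
print len(l)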
A comment by @camh above makes me think that maybe you jumped into Python because you couldn't find a shell-based solution; here is how to do the same thing using only bash:
sh$ shopt -s extglob
sh$ ls Q*([^_])_*([^_])_B02*
Q1234_ABC_B02_12232.hl7 Q43434_SAD_B02_2312.hl7
sh$ ls Q*([^_])_*([^_])_B02* | wc -l
# ^^^^^^^
# This *won't* work if some file names contain '\n' !!!
Use a regular expression:
import re
import os

resultsDict = {}
expression = "^Q.*_.*_B02_.*"
p = re.compile(expression)
for file_name in os.listdir("."):
    if p.match(file_name):
        if file_name in resultsDict:
            resultsDict[file_name] = resultsDict[file_name] + 1
        else:
            resultsDict[file_name] = 1
You can try with this regular expression:
'^Q.*_.*_B02_.*'
This code will match all files in the current directory according to your requirements.
import os
import re

regex = r'^Q\w+_\w+_B02'  # This will match any word character between the underscores
for f in os.listdir("."):
    if re.match(regex, f, re.I):
        print f
A word character is A-Z, a-z, 0-9 or underscore.
A solution with list comprehensions instead of regular expressions. First, get all the file names that start with Q and split them on the underscores:
import os
dirs = [d.split('_') for d in os.listdir(".") if d.startswith('Q')]
Now keep only the entries with two underscores or more:
dirs = [d for d in dirs if len(d) > 2]
Finally, narrow it down:
dirs = [d for d in dirs if d[2] == 'B02']
You could combine the last two comprehensions into one:
dirs = [d for d in dirs if len(d) > 2 and d[2] == 'B02']
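The count the question asks for is then just the length of that list; for the sample file names this prints 2:
print len(dirs)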

batch renaming 100K files with python

I have a folder with over 100,000 files, all numbered with the same stub, but without leading zeros, and the numbers aren't always contiguous (usually they are, but there are gaps) e.g:
file-21.png,
file-22.png,
file-640.png,
file-641.png,
file-642.png,
file-645.png,
file-2130.png,
file-2131.png,
file-3012.png,
etc.
I would like to batch process this to create padded, contiguous files. e.g:
file-000000.png,
file-000001.png,
file-000002.png,
file-000003.png,
When I parse the folder with for filename in os.listdir('.'): the files don't come up in the order I'd like them to. Understandably, they come up as
file-1,
file-1x,
file-1xx,
file-1xxx,
etc. then
file-2,
file-2x,
file-2xx,
etc. How can I get it to go through the files in order of their numeric value? I am a complete Python noob, but looking at the docs I'm guessing I could use map to create a new list containing only the numerical part, then sort that list and iterate over it? With over 100K files this could be heavy. Any tips welcome!
import os
import re

thenum = re.compile(r'^file-(\d+)\.png$')

def bynumber(fn):
    mo = thenum.match(fn)
    if mo:
        return int(mo.group(1))

allnames = os.listdir('.')
allnames.sort(key=bynumber)
Now you have the files in the order you want them and can loop
for i, fn in enumerate(allnames):
    ...
using the progressive number i (which will be 0, 1, 2, ...) padded as you wish in the destination name.
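A minimal sketch of that final loop, assuming the 6-digit padding and the 'file-' stub from the question's example:
for i, fn in enumerate(allnames):
    os.rename(fn, 'file-%06d.png' % i)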
There are three steps. The first is getting all the filenames. The second is converting the filenames. The third is renaming them.
If all the files are in the same folder, then glob should work.
import glob
filenames = glob.glob("/path/to/folder/*.txt")
Next, you want to change the name of the file. You can zero-pad the number to do this.
>>> filename = "file-338.txt"
>>> import os
>>> fnpart = os.path.splitext(filename)[0]
>>> fnpart
'file-338'
>>> _, num = fnpart.split("-")
>>> num.rjust(5, "0")
'00338'
>>> newname = "file-%s.txt" % num.rjust(5, "0")
>>> newname
'file-00338.txt'
Now, you need to rename them all. os.rename does just that.
os.rename(filename, newname)
To put it together:
for filename in glob.glob("/path/to/folder/*.txt"):  # loop through each file
    newname = make_new_filename(filename)  # create a function that does step 2, above
    os.rename(filename, newname)
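A possible sketch of the make_new_filename helper, simply following the interactive steps above (the 5-digit padding and the "file-" stub are the example values used there):
import os

def make_new_filename(filename):
    # "file-338.txt" -> ("file-338", ".txt"), then zero-pad the number part
    stub, ext = os.path.splitext(filename)
    prefix, num = stub.rsplit("-", 1)
    return "%s-%s%s" % (prefix, num.rjust(5, "0"), ext)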
Thank you all for your suggestions, I will try them all to learn the different approaches. The solution I went for is based on using a natural sort on my filelist, and then iterating that to rename. This was one of the suggested answers but for some reason it has disappeared now so I cannot mark it as accepted!
import os

files = os.listdir('.')
natsort(files)
index = 0
for filename in files:
    os.rename(filename, str(index).zfill(7) + '.png')
    index += 1
where natsort is defined in http://code.activestate.com/recipes/285264-natural-string-sorting/
Why don't you do it in a two-step process? Parse all the files and rename them with padded numbers, and then run another script that takes those files, which are now sorted correctly, and renames them so they're contiguous.
1) Take the number in the filename.
2) Left-pad it with zeros.
3) Save the name.
def renamer():
    for iname in os.listdir('.'):
        first, second = iname.replace(" ", "").split("-")
        number, ext = second.split('.')
        first, number, ext = first.strip(), number.strip(), ext.strip()
        number = '0' * (6 - len(number)) + number  # pad the number to be 6 digits long
        oname = first + "-" + number + '.' + ext
        os.rename(iname, oname)
    print "Done"
Hope this helps
The simplest method is given below. You can also modify this script for a recursive search.
Use the os module.
Get the filenames.
Use os.rename.
import os

class Renamer:
    def __init__(self, pattern, extension):
        self.ext = extension
        self.pat = pattern
        return

    def rename(self):
        p, e = (self.pat, self.ext)
        number = 0
        for x in os.listdir(os.getcwd()):
            if str(x).endswith(f".{e}"):
                os.rename(x, f'{p}_{number}.{e}')
                number += 1
        return

if __name__ == "__main__":
    pattern = "myfile"
    extension = "txt"
    r = Renamer(pattern=pattern, extension=extension)
    r.rename()
