Script to remove Python comments/docstrings

Is there a Python script or tool available which can remove comments and docstrings from Python source?
It should take care of cases like:
"""
aas
"""
def f():
m = {
u'x':
u'y'
} # faake docstring ;)
if 1:
'string' >> m
if 2:
'string' , m
if 3:
'string' > m
In the end I came up with a simple script that uses the tokenize module and removes comment tokens. It seems to work pretty well, except that I am not able to remove docstrings in all cases. See if you can improve it to remove docstrings.
import cStringIO
import tokenize

def remove_comments(src):
    """
    This reads tokens using tokenize.generate_tokens and recombines them
    using tokenize.untokenize, skipping comment/docstring tokens in between
    """
    f = cStringIO.StringIO(src)
    class SkipException(Exception): pass
    processed_tokens = []
    last_token = None
    # go thru all the tokens and try to skip comments and docstrings
    for tok in tokenize.generate_tokens(f.readline):
        t_type, t_string, t_srow_scol, t_erow_ecol, t_line = tok
        try:
            if t_type == tokenize.COMMENT:
                raise SkipException()
            elif t_type == tokenize.STRING:
                if last_token is None or last_token[0] in [tokenize.INDENT]:
                    # FIXME: this may remove valid strings too?
                    #raise SkipException()
                    pass
        except SkipException:
            pass
        else:
            processed_tokens.append(tok)
        last_token = tok
    return tokenize.untokenize(processed_tokens)
Also, I would like to test it on a very large collection of scripts with good unit test coverage. Can you suggest such an open source project?

I'm the author of the "mygod, he has written a python interpreter using regex..." (i.e. pyminifier) mentioned at that link below =).
I just wanted to chime in and say that I've improved the code quite a bit using the tokenize module (which I discovered thanks to this question =) ).
You'll be happy to note that the code no longer relies so much on regular expressions and uses tokenize to great effect. Anyway, here's the remove_comments_and_docstrings() function from pyminifier
(Note: it works properly with the edge cases that the previously posted code breaks on):
import cStringIO, tokenize

def remove_comments_and_docstrings(source):
    """
    Returns 'source' minus comments and docstrings.
    """
    io_obj = cStringIO.StringIO(source)
    out = ""
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]
        # The following two conditionals preserve indentation.
        # This is necessary because we're not using tokenize.untokenize()
        # (because it spits out code with copious amounts of oddly-placed
        # whitespace).
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        # Remove comments:
        if token_type == tokenize.COMMENT:
            pass
        # This series of conditionals removes docstrings:
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
                # This is likely a docstring; double-check we're not inside an operator:
                if prev_toktype != tokenize.NEWLINE:
                    # Note regarding NEWLINE vs NL: The tokenize module
                    # differentiates between newlines that end a statement
                    # and newlines inside of operators such as parens, brackets,
                    # and curly braces. Newlines that end a statement are
                    # NEWLINE and newlines inside of operators are NL.
                    # Catch whole-module docstrings:
                    if start_col > 0:
                        # Unlabelled indentation means we're inside an operator
                        out += token_string
                    # Note regarding the INDENT token: The tokenize module does
                    # not label indentation inside of an operator (parens,
                    # brackets, and curly braces) as actual indentation.
                    # For example:
                    # def foo():
                    #     "The spaces before this docstring are tokenize.INDENT"
                    #     test = [
                    #         "The spaces before this string do not get a token"
                    #     ]
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return out
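For reference, a minimal usage sketch of the function above (my addition, not part of pyminifier; the filename is a placeholder and the code is Python 2, matching the cStringIO import):

with open('my_module.py') as f:
    cleaned = remove_comments_and_docstrings(f.read())
print(cleaned)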

This does the job:
""" Strip comments and docstrings from a file.
"""
import sys, token, tokenize
def do_file(fname):
""" Run on just one file.
"""
source = open(fname)
mod = open(fname + ",strip", "w")
prev_toktype = token.INDENT
first_line = None
last_lineno = -1
last_col = 0
tokgen = tokenize.generate_tokens(source.readline)
for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
if 0: # Change to if 1 to see the tokens fly by.
print("%10s %-14s %-20r %r" % (
tokenize.tok_name.get(toktype, toktype),
"%d.%d-%d.%d" % (slineno, scol, elineno, ecol),
ttext, ltext
))
if slineno > last_lineno:
last_col = 0
if scol > last_col:
mod.write(" " * (scol - last_col))
if toktype == token.STRING and prev_toktype == token.INDENT:
# Docstring
mod.write("#--")
elif toktype == tokenize.COMMENT:
# Comment
mod.write("##\n")
else:
mod.write(ttext)
prev_toktype = toktype
last_col = ecol
last_lineno = elineno
if __name__ == '__main__':
do_file(sys.argv[1])
I'm leaving stub comments in the place of docstrings and comments since it simplifies the code. If you remove them completely, you also have to get rid of indentation before them.
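For example, a minimal invocation sketch (an assumption on my part: a file named example.py exists; the ",strip" output suffix comes from the script above):

do_file("example.py")  # writes the stripped copy to "example.py,strip"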

Here is a modification of Dan's solution that makes it run on Python 3, also removes empty lines, and is ready to use:
import io, tokenize, re

def remove_comments_and_docstrings(source):
    io_obj = io.StringIO(source)
    out = ""
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        if token_type == tokenize.COMMENT:
            pass
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
                if prev_toktype != tokenize.NEWLINE:
                    if start_col > 0:
                        out += token_string
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    out = '\n'.join(l for l in out.splitlines() if l.strip())
    return out

with open('test.py', 'r') as f:
    print(remove_comments_and_docstrings(f.read()))

I found an easier way to do this with the ast and astunparse modules (available from pip). ast converts the code text into a syntax tree, and the astunparse module then prints the code back out again without the comments. I had to strip out the docstrings with a simple match, but it seems to work. I've been looking through the output, and so far the only downside of this method is that it strips all blank lines from your code.
import ast, astunparse

with open('my_module.py') as f:
    lines = astunparse.unparse(ast.parse(f.read())).split('\n')
    for line in lines:
        if line.lstrip()[:1] not in ("'", '"'):
            print(line)
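On Python 3.9+ the standard library's ast.unparse can do the same round trip without the third-party package. A sketch of that variant (my assumption, not the poster's code: docstrings are removed by editing the tree rather than by matching quoted lines, and a docstring is left alone when it is a node's only statement so the output stays valid):

import ast

def strip_comments_and_docstrings(source):
    # Parsing discards comments; unparsing re-serializes the tree (Python 3.9+).
    tree = ast.parse(source)
    for node in ast.walk(tree):
        if isinstance(node, (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
            body = node.body
            if (len(body) > 1 and isinstance(body[0], ast.Expr)
                    and isinstance(body[0].value, ast.Constant)
                    and isinstance(body[0].value.value, str)):
                node.body = body[1:]  # drop the docstring statement
    return ast.unparse(tree)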

Try testing each chunk of tokens ending with NEWLINE. I believe the correct pattern for a docstring (including cases where it serves as a comment but isn't assigned to __doc__) is (assuming the match is performed from the start of the file or after a NEWLINE):
( DEDENT+ | INDENT? ) STRING+ COMMENT? NEWLINE
This should handle all the tricky cases: string concatenation, line continuation, module/class/function docstrings, and a comment on the same line after the string. Note that there is a difference between NL and NEWLINE tokens, so we don't need to worry about a string that merely sits alone on a line inside an expression.
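A rough sketch of that chunk test (my addition; the helper name and the choice to report line ranges instead of rewriting the source are assumptions for illustration):

import io
import tokenize

def docstring_line_ranges(source):
    """Flag logical lines matching ( DEDENT+ | INDENT? ) STRING+ COMMENT? NEWLINE."""
    ranges, chunk = [], []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        chunk.append(tok)
        if tok.type != tokenize.NEWLINE:
            continue
        types = [t.type for t in chunk]
        i = 0
        while i < len(types) and types[i] == tokenize.DEDENT:   # DEDENT+
            i += 1
        if i == 0 and types[i] == tokenize.INDENT:              # or INDENT?
            i += 1
        j, has_string = i, False
        while j < len(types) and types[j] == tokenize.STRING:   # STRING+
            has_string, j = True, j + 1
        if has_string and j < len(types) and types[j] == tokenize.COMMENT:  # COMMENT?
            j += 1
        if has_string and j == len(types) - 1:                  # ends at the NEWLINE
            ranges.append((chunk[i].start[0], chunk[-1].end[0]))
        chunk = []
    return ranges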

I've just used the code given by Dan McDougall, and I've found two problems.
There were too many blank lines, so I decided to remove a line whenever there were two consecutive empty lines.
When the Python code was processed, all spaces were missing (except the indentation), so things like "import Anything" turned into "importAnything", which caused problems. I added spaces before and after the reserved Python words that need them. I hope I didn't make any mistakes there.
I think I have fixed both issues by adding a few more lines (before the return):
# Removing unneeded newlines from string
buffered_content = cStringIO.StringIO(content)  # Takes the string generated by Dan McDougall's code as input
content_without_newlines = ""
previous_token_type = tokenize.NEWLINE
for tokens in tokenize.generate_tokens(buffered_content.readline):
    token_type = tokens[0]
    token_string = tokens[1]
    if previous_token_type == tokenize.NL and token_type == tokenize.NL:
        pass
    else:
        # add necessary spaces
        prev_space = ''
        next_space = ''
        if token_string in ['and', 'as', 'or', 'in', 'is']:
            prev_space = ' '
        if token_string in ['and', 'del', 'from', 'not', 'while', 'as', 'elif', 'global', 'or', 'with', 'assert', 'if', 'yield', 'except', 'import', 'print', 'class', 'exec', 'in', 'raise', 'is', 'return', 'def', 'for', 'lambda']:
            next_space = ' '
        content_without_newlines += prev_space + token_string + next_space  # This will be our new output!
    previous_token_type = token_type

I was trying to create a program that would count all lines in a python file, ignoring blank lines, lines with comments and docstrings. Here is my solution:
from collections import Counter

with open(file_path, 'r', encoding='utf-8') as pyt_file:
    count = 0
    docstring = False
    for i_line in pyt_file.readlines():
        cur_line = i_line.rstrip().replace(' ', '')
        if cur_line.startswith('"""') and not docstring:
            marks_counter = Counter(cur_line)
            if marks_counter['"'] == 6:
                count -= 1
            else:
                docstring = True
        elif cur_line.startswith('"""') and docstring:
            count -= 1
            docstring = False
        if len(cur_line) > 0 and not cur_line.startswith('#') and not docstring:
            count += 1
My problem was detecting the docstrings (both one-line and multi-line), so I suppose if you want to delete them you can try the same flag-based approach.
P.S. I understand that this is an old question, but when I was dealing with my problem I couldn't find anything simple and effective.
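For comparison, a rough sketch of the same count done with the tokenize module used elsewhere in this thread (my addition; it counts the physical lines touched by tokens other than comments, blank lines, and bare string statements):

import io
import tokenize

def count_code_lines(source):
    code_lines = set()
    prev_type = tokenize.NEWLINE  # treat the start of the file as a statement boundary
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type in (tokenize.COMMENT, tokenize.NL,
                        tokenize.INDENT, tokenize.DEDENT, tokenize.ENDMARKER):
            continue  # never counts as code, and doesn't end a statement
        if tok.type == tokenize.STRING and prev_type == tokenize.NEWLINE:
            prev_type = tok.type
            continue  # a string opening a statement: docstring-style, skip it
        if tok.type != tokenize.NEWLINE:
            code_lines.update(range(tok.start[0], tok.end[0] + 1))
        prev_type = tok.type
    return len(code_lines)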

Related

How would I format python code using python?

Let's say I've got this code in python:
total=0for i in range(100):print(i)if i > 50:total=total+i
How would I make an algorithm in python to format this python code into the code below:
total=0
for i in range(100):
    print(i)
    if i > 50:
        total=total+i
Assume that everything is nested under each other, such that another statement would be assumed to be inside the if block.
This was quite a fun exercise! I'm running out of juice, so I'm just posting this as is. It works on your example, but probably not on anything more complex.
code_block = "total=0for i in range(100):print(i)if i > 50:total=total+iprint('finished')"
code_block_b = "def okay() {print('ff')while True:print('blbl')break}"
line_break_before = ['for', 'while', 'if', 'print', 'break', '}']
line_break_after = [':', '{']
indent_chars = [':', '{']
unindent_chars = ['}']
# Add line breaks before keywords
for kw in line_break_before:
kw_indexes = [idx for idx in range(len(code_block)) if code_block[idx:idx + len(kw)] == kw]
for kw_idx in kw_indexes[::-1]:
code_block = code_block[:kw_idx] + '\n' + code_block[kw_idx:]
# Add line breaks after other keywords if not present already
for kw in line_break_after:
kw_indexes = [idx for idx in range(len(code_block)) if code_block[idx:idx + len(kw)] == kw]
for kw_idx in kw_indexes[::-1]:
if code_block[kw_idx + 1: kw_idx + 2] != '\n':
code_block = code_block[:kw_idx + 1] + '\n' + code_block[kw_idx + 1:]
# Add indentation
indent = 0
formatted_code_lines = []
for line in code_block.split('\n'):
if line[-1] in unindent_chars:
indent = 0
formatted_code_lines.append(' ' * indent)
if line[-1] in indent_chars:
indent += 4
formatted_code_lines.append(line + '\n')
code_block = ''.join(formatted_code_lines)
print(code_block)
The basic premise for formatting is based around keywords. There are keywords that require a line break before them, and keywords that require a line break after them. After that, the indentation is increased by 4 spaces for every line after each : symbol. I tested some formatting with braces too, in code_block_b.
Output a
total=0
for i in range(100):
    print(i)
    if i > 50:
        total=total+i
Output b
def okay() {
    print('ff')
    while True:
        print('blbl')
        break
}

List index out of range with stanford-nlp

I'm trying to remove all blank lines from a large .txt file but whatever method I use it always returns this traceback:
Traceback (most recent call last):
  File "C:\Users\svp12\PycharmProjects\practiques\main.py", line 53, in <module>
    doc = nlp(texts[line])
IndexError: list index out of range
If I don't remove these spaces, then I get IndexErrors in the subsequent two for loops (or at least I think that's the reason), which is why I'm using the try/except like this:
try:
    for word in doc.sentences[0].words:
        noun.append(word.text)
        lemma.append(word.lemma)
        pos.append(word.pos)
        xpos.append(word.xpos)
        deprel.append(word.deprel)
except IndexError:
    errors += 1
    pass
I'd like to be able to remove all blank lines and not have to dodge IndexErrors like this. Any idea how to fix it?
Here's the whole code:
import io
import stanza
import os

def linecount(filename):
    ffile = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = ffile.read
    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)
    return lines

errors = 0

with io.open('#_Calvia_2018-01-01_2022-04-01.txt', 'r+', encoding='utf-8') as f:
    text = f.read()

# replacing eos with \n, numbers and symbols
texts = text.replace('eos', '.\n')
texts = texts.replace('0', ' ').replace('1', ' ').replace('2', ' ').replace('3', ' ').replace('4', ' ')\
    .replace('5', ' ').replace('6', ' ').replace('7', ' ').replace('8', ' ').replace('9', ' ').replace(',', ' ')\
    .replace('"', ' ').replace('·', ' ').replace('?', ' ').replace('¿', ' ').replace(':', ' ').replace(';', ' ')\
    .replace('-', ' ').replace('!', ' ').replace('¡', ' ').replace('.', ' ').splitlines()

os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")  # removing empty lines to avoid IndexError

nlp = stanza.Pipeline(lang='ca')

nouns = []
lemmas = []
poses = []
xposes = []
heads = []
deprels = []

total_lines = linecount('#_Calvia_2018-01-01_2022-04-01.txt') - 1

for line in range(50):  # range should be total_lines which is 6682
    noun = []
    lemma = []
    pos = []
    xpos = []
    head = []
    deprel = []
    # print('analyzing: '+str(line+1)+' / '+str(len(texts)), end='\r')
    doc = nlp(texts[line])
    try:
        for word in doc.sentences[0].words:
            noun.append(word.text)
            lemma.append(word.lemma)
            pos.append(word.pos)
            xpos.append(word.xpos)
            deprel.append(word.deprel)
    except IndexError:
        errors += 1
        pass
    try:
        for word in doc.sentences[0].words:
            head.extend([lemma[word.head-1] if word.head > 0 else "root"])
    except IndexError:
        errors += 1
        pass
    nouns.append(noun)
    lemmas.append(lemma)
    poses.append(pos)
    xposes.append(xpos)
    heads.append(head)
    deprels.append(deprel)

print(nouns)
print(lemmas)
print(poses)
print(xposes)
print(heads)
print(deprels)
print("errors: " + str(errors))  # weird, seems to be range/2-1
And as a side question, is it worth importing os just for this line (the one removing the blank lines)?
os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")
I can't guarantee that this works because I couldn't test it, but it should give you an idea of how you'd approach this task in Python.
I'm omitting the head processing/the second loop here, that's for you to figure out.
I'd recommend you throw some prints in there and look at the output, make sure you understand what's going on (especially with different data types) and look at examples of applications using Stanford NLP, watch some tutorials online (from start to finish, no skipping), etc.
import stanza
import re

def clean(line):
    # function that does the text cleaning
    line = line.replace('eos', '.\n')
    line = re.sub(r'[\d,"·?¿:;!¡.-]', ' ', line)
    return line.strip()

nlp = stanza.Pipeline(lang='ca')

# instead of individual variables, you could keep the values in a dictionary
# (or just leave them as they are - your call)
values_to_extract = ['text', 'lemma', 'pos', 'xpos', 'deprel']
data = {v: [] for v in values_to_extract}

with open('#_Calvia_2018-01-01_2022-04-01.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # clean the text
        line = clean(line)
        # skip empty lines
        if not line:
            continue
        doc = nlp(line)
        # loop over sentences – this will work even if it's an empty list
        for sentence in doc.sentences:
            # append a new list to the dictionary entries
            for v in values_to_extract:
                data[v].append([])
            for word in sentence.words:
                for v in values_to_extract:
                    # extract the attribute (e.g.,
                    # a surface form, a lemma, a pos tag, etc.)
                    attribute = getattr(word, v)
                    # and add it to its slot
                    data[v][-1].append(attribute)

for v in values_to_extract:
    print('Value:', v)
    print(data[v])
    print()
The error occurs because texts doesn't have 50 lines, so why hardcode 50?
If you just need to remove blank lines, you only have to do text = text.replace("\n\n", "\n").
If you need to remove lines that are just whitespace, you can simply do:
text = '\n'.join(line.rstrip() for line in text.split('\n') if line.strip())

What is the error in this code? I want to replace a set of characters from a text file when I give a word with blanks in it

I want to replace a set of characters from a text file when I give a word with blanks in it. For example:
I gave the line The Language Is _th_n !
It should return python, replacing the _ characters using text from a file like text.txt.
I wrote this code, please check it once:
import re

with open('data/text', 'r', encoding='utf8') as file:
    word_list = file.read()

def solve(message):
    hint = []
    for i in range(15, len(message) - 1):
        if message[i] != '\\':
            hint.append(message[i])
    hint_string = ''
    for i in hint:
        hint_string += i
    hint_replaced = hint_string.replace('_', '!')
    solution = re.findall('^' + hint_replaced + '$', word_list, re.MULTILINE)
    return solution

Why doesn't my Python script print to the console, and why can't I debug it using pdb in Ubuntu?

I am looking into this code.
For training lpr, we can use train.py in the lpr folder.
train.py uses methods and classes in trainer.py, such as CTCUtils, InputData, inference and LPRVocab.
I put print calls inside LPRVocab to see how the code works, as follows.
class LPRVocab:
    @staticmethod
    def create_vocab(train_list_path, val_list_path, use_h_concat=False, use_oi_concat=False):
        print('create_vocab called ')
        [vocab, r_vocab, num_classes] = LPRVocab._create_standard_vocabs(train_list_path, val_list_path)
        if use_h_concat:
            [vocab, r_vocab, num_classes] = LPRVocab._concat_all_hieroglyphs(vocab, r_vocab)
        if use_oi_concat:
            [vocab, r_vocab, num_classes] = LPRVocab._concat_oi(vocab, r_vocab)
        return vocab, r_vocab, num_classes

    @staticmethod
    def _char_range(char1, char2):
        """Generates the characters from `char1` to `char2`, inclusive."""
        for char_code in range(ord(char1), ord(char2) + 1):
            yield chr(char_code)

    # Function for reading special symbols
    @staticmethod
    def _read_specials(filepath):
        characters = set()
        with open(filepath, 'r') as file_:
            for line in file_:
                current_label = line.split(' ')[-1].strip()
                characters = characters.union(re.findall('(<[^>]*>|.)', current_label))
        return characters

    @staticmethod
    def _create_standard_vocabs(train_list_path, val_list_path):
        print('_create_standard_vocabs called ')
        chars = set().union(LPRVocab._char_range('A', 'Z')).union(LPRVocab._char_range('0', '9'))
        print(chars)
        print('for special characters')
        chars = chars.union(LPRVocab._read_specials(train_list_path)).union(LPRVocab._read_specials(val_list_path))
        print(chars)
        print('for list characters')
        chars = list(chars)
        print(chars)
        print('for sort characters')
        chars.sort()
        print(chars)
        print('for append characters')
        chars.append('_')
        print(chars)
        num_classes = len(chars)
        print('num_classes ' + str(num_classes))
        vocab = dict(zip(chars, range(num_classes)))
        print('vocab ')
        print(vocab)
        r_vocab = dict(zip(range(num_classes), chars))
        r_vocab[-1] = ''
        print('r_vocab ')
        print(r_vocab)
        return [vocab, r_vocab, num_classes]
But I don't see any prints in the console.
Then I used
python -m pdb train.py
and set breakpoints inside trainer.py.
The breakpoints are never hit.
Pressing the s key also doesn't step into other files.
Why doesn't debugging work, and why doesn't anything print to the console?
I'm using Python 3.5.
I recommend the following:
Wherever you want to debug, put this:
import ipdb
ipdb.set_trace()
Then, in the IPython console, make an instance of your class and call the method you need to debug; execution will stop at your trace.
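For instance, a minimal sketch of where the trace could go in the class shown above (my illustration; the file paths in the comment are placeholders, not the project's real arguments):

class LPRVocab:
    @staticmethod
    def create_vocab(train_list_path, val_list_path, use_h_concat=False, use_oi_concat=False):
        import ipdb; ipdb.set_trace()  # execution pauses here when the method runs
        ...  # rest of the method unchanged

# then, from an IPython session:
# LPRVocab.create_vocab('train_list.txt', 'val_list.txt')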

Python glob but against a list of strings rather than the filesystem

I want to be able to match a pattern in glob format to a list of strings, rather than to actual files in the filesystem. Is there any way to do this, or convert a glob pattern easily to a regex?
The glob module uses the fnmatch module for individual path elements.
That means the path is split into the directory name and the filename, and if the directory name contains meta characters (contains any of the characters [, * or ?) then these are expanded recursively.
If you have a list of strings that are simple filenames, then just using the fnmatch.filter() function is enough:
import fnmatch
matching = fnmatch.filter(filenames, pattern)
but if they contain full paths, you need to do more work as the regular expression generated doesn't take path segments into account (wildcards don't exclude the separators nor are they adjusted for cross-platform path matching).
You can construct a simple trie from the paths, then match your pattern against that:
import fnmatch
import glob
import os.path
from itertools import product

# Cross-Python dictionary views on the keys
if hasattr(dict, 'viewkeys'):
    # Python 2
    def _viewkeys(d):
        return d.viewkeys()
else:
    # Python 3
    def _viewkeys(d):
        return d.keys()

def _in_trie(trie, path):
    """Determine if path is completely in trie"""
    current = trie
    for elem in path:
        try:
            current = current[elem]
        except KeyError:
            return False
    return None in current

def find_matching_paths(paths, pattern):
    """Produce a list of paths that match the pattern.
    * paths is a list of strings representing filesystem paths
    * pattern is a glob pattern as supported by the fnmatch module
    """
    if os.altsep:  # normalise
        pattern = pattern.replace(os.altsep, os.sep)
    pattern = pattern.split(os.sep)
    # build a trie out of path elements; efficiently search on prefixes
    path_trie = {}
    for path in paths:
        if os.altsep:  # normalise
            path = path.replace(os.altsep, os.sep)
        _, path = os.path.splitdrive(path)
        elems = path.split(os.sep)
        current = path_trie
        for elem in elems:
            current = current.setdefault(elem, {})
        current.setdefault(None, None)  # sentinel
    matching = []
    current_level = [path_trie]
    for subpattern in pattern:
        if not glob.has_magic(subpattern):
            # plain element, element must be in the trie or there are
            # 0 matches
            if not any(subpattern in d for d in current_level):
                return []
            matching.append([subpattern])
            current_level = [d[subpattern] for d in current_level if subpattern in d]
        else:
            # match all next levels in the trie that match the pattern
            matched_names = fnmatch.filter({k for d in current_level for k in d}, subpattern)
            if not matched_names:
                # nothing found
                return []
            matching.append(matched_names)
            current_level = [d[n] for d in current_level for n in _viewkeys(d) & set(matched_names)]
    return [os.sep.join(p) for p in product(*matching)
            if _in_trie(path_trie, p)]
This mouthful can quickly find matches using globs anywhere along the path:
>>> paths = ['/foo/bar/baz', '/spam/eggs/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/foo/bar/*')
['/foo/bar/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/*/bar/b*')
['/foo/bar/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/*/[be]*/b*')
['/foo/bar/baz', '/foo/bar/bar', '/spam/eggs/baz']
On Python 3.4+ you can just use PurePath.match.
pathlib.PurePath(path_string).match(pattern)
On Python 3.3 or earlier (including 2.x), get pathlib from PyPI.
Note that to get platform-independent results (which will depend on why you're running this) you'd want to explicitly state PurePosixPath or PureWindowsPath.
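For example, a small sketch (the paths are made-up sample strings) that pins the matching to POSIX semantics regardless of the platform the code runs on:

from pathlib import PurePosixPath

paths = ['/foo/bar/baz', '/spam/eggs/baz', '/foo/bar/bar']
print([p for p in paths if PurePosixPath(p).match('*/bar/b*')])
# ['/foo/bar/baz', '/foo/bar/bar']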
Good artists copy; great artists steal.
I stole ;)
fnmatch.translate translates globs ? and * to regex . and .* respectively. I tweaked it not to.
import re

def glob2re(pat):
    """Translate a shell PATTERN to a regular expression.

    There is no way to quote meta-characters.
    """
    i, n = 0, len(pat)
    res = ''
    while i < n:
        c = pat[i]
        i = i+1
        if c == '*':
            #res = res + '.*'
            res = res + '[^/]*'
        elif c == '?':
            #res = res + '.'
            res = res + '[^/]'
        elif c == '[':
            j = i
            if j < n and pat[j] == '!':
                j = j+1
            if j < n and pat[j] == ']':
                j = j+1
            while j < n and pat[j] != ']':
                j = j+1
            if j >= n:
                res = res + '\\['
            else:
                stuff = pat[i:j].replace('\\', '\\\\')
                i = j+1
                if stuff[0] == '!':
                    stuff = '^' + stuff[1:]
                elif stuff[0] == '^':
                    stuff = '\\' + stuff
                res = '%s[%s]' % (res, stuff)
        else:
            res = res + re.escape(c)
    # Inline flags must come first on Python 3.11+, so prepend them instead of appending.
    return r'(?ms)' + res + r'\Z'
This one works à la fnmatch.filter; both re.match and re.search work.
def glob_filter(names, pat):
    return (name for name in names if re.match(glob2re(pat), name))
Glob patterns and strings found on this page pass the test.
pat_dict = {
    'a/b/*/f.txt': ['a/b/c/f.txt', 'a/b/q/f.txt', 'a/b/c/d/f.txt', 'a/b/c/d/e/f.txt'],
    '/foo/bar/*': ['/foo/bar/baz', '/spam/eggs/baz', '/foo/bar/bar'],
    '/*/bar/b*': ['/foo/bar/baz', '/foo/bar/bar'],
    '/*/[be]*/b*': ['/foo/bar/baz', '/foo/bar/bar'],
    '/foo*/bar': ['/foolicious/spamfantastic/bar', '/foolicious/bar']
}
for pat in pat_dict:
    print('pattern :\t{}\nstrings :\t{}'.format(pat, pat_dict[pat]))
    print('matched :\t{}\n'.format(list(glob_filter(pat_dict[pat], pat))))
While fnmatch.fnmatch can be used directly to check whether a pattern matches a filename or not, you can also use the fnmatch.translate method to generate the regex out of the given fnmatch pattern:
>>> import fnmatch
>>> fnmatch.translate('*.txt')
'.*\\.txt\\Z(?ms)'
From the documentation:
fnmatch.translate(pattern)
Return the shell-style pattern converted to a regular expression.
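To filter a list of strings with it, compile the translated pattern once and keep only the names it matches, which is essentially what fnmatch.filter does internally. A small sketch (the file names are made up):

import fnmatch
import re

names = ['notes.txt', 'todo.txt', 'script.py']
pattern = re.compile(fnmatch.translate('*.txt'))
print([name for name in names if pattern.match(name)])
# ['notes.txt', 'todo.txt']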
My solution is similar to Nizam's but with a few changes:
Support for ** wildcards
Prevents patterns like [^abc] from matching /
Updated to use fnmatch.translate() from Python 3.8.13 as a base
WARNING:
There are some slight differences to glob.glob() which this solution suffers from (along with most of the other solutions), feel free to suggest changes in the comments if you know how to fix them:
* and ? should not match file names starting with .
** should also match 0 folders when used like /**/
Code:
import re

def glob_to_re(pat: str) -> str:
    """Translate a shell PATTERN to a regular expression.

    Derived from `fnmatch.translate()` of Python version 3.8.13
    SOURCE: https://github.com/python/cpython/blob/v3.8.13/Lib/fnmatch.py#L74-L128
    """
    i, n = 0, len(pat)
    res = ''
    while i < n:
        c = pat[i]
        i = i+1
        if c == '*':
            # -------- CHANGE START --------
            # prevent '*' matching directory boundaries, but allow '**' to match them
            j = i
            if j < n and pat[j] == '*':
                res = res + '.*'
                i = j+1
            else:
                res = res + '[^/]*'
            # -------- CHANGE END ----------
        elif c == '?':
            # -------- CHANGE START --------
            # prevent '?' matching directory boundaries
            res = res + '[^/]'
            # -------- CHANGE END ----------
        elif c == '[':
            j = i
            if j < n and pat[j] == '!':
                j = j+1
            if j < n and pat[j] == ']':
                j = j+1
            while j < n and pat[j] != ']':
                j = j+1
            if j >= n:
                res = res + '\\['
            else:
                stuff = pat[i:j]
                if '--' not in stuff:
                    stuff = stuff.replace('\\', r'\\')
                else:
                    chunks = []
                    k = i+2 if pat[i] == '!' else i+1
                    while True:
                        k = pat.find('-', k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k+1
                        k = k+3
                    chunks.append(pat[i:j])
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                     for s in chunks)
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                i = j+1
                if stuff[0] == '!':
                    # -------- CHANGE START --------
                    # ensure sequence negations don't match directory boundaries
                    stuff = '^/' + stuff[1:]
                    # -------- CHANGE END ----------
                elif stuff[0] in ('^', '['):
                    stuff = '\\' + stuff
                res = '%s[%s]' % (res, stuff)
        else:
            res = res + re.escape(c)
    return r'(?s:%s)\Z' % res
Test Cases:
Here are some test cases comparing the built-in fnmatch.translate() to the above glob_to_re().
import fnmatch

test_cases = [
    # path, pattern, old_should_match, new_should_match
    ("/path/to/foo", "*", True, False),
    ("/path/to/foo", "**", True, True),
    ("/path/to/foo", "/path/*", True, False),
    ("/path/to/foo", "/path/**", True, True),
    ("/path/to/foo", "/path/to/*", True, True),
    ("/path/to", "/path?to", True, False),
    ("/path/to", "/path[!abc]to", True, False),
]

for path, pattern, old_should_match, new_should_match in test_cases:
    old_re = re.compile(fnmatch.translate(pattern))
    old_match = bool(old_re.match(path))
    if old_match is not old_should_match:
        raise AssertionError(
            f"regex from `fnmatch.translate()` should match path "
            f"'{path}' when given pattern: {pattern}"
        )
    new_re = re.compile(glob_to_re(pattern))
    new_match = bool(new_re.match(path))
    if new_match is not new_should_match:
        raise AssertionError(
            f"regex from `glob_to_re()` should match path "
            f"'{path}' when given pattern: {pattern}"
        )
Example:
Here is an example that uses glob_to_re() with a list of strings.
glob_pattern = "/path/to/*.txt"
glob_re = re.compile(glob_to_re(glob_pattern))

input_paths = [
    "/path/to/file_1.txt",
    "/path/to/file_2.txt",
    "/path/to/folder/file_3.txt",
    "/path/to/folder/file_4.txt",
]

filtered_paths = [path for path in input_paths if glob_re.match(path)]
# filtered_paths = ["/path/to/file_1.txt", "/path/to/file_2.txt"]
Never mind, I found it. I want the fnmatch module.
An extension to Veedrac's PurePath.match answer that can be applied to a list of strings:
# Python 3.4+
from pathlib import Path
path_list = ["foo/bar.txt", "spam/bar.txt", "foo/eggs.txt"]
# convert string to pathlib.PosixPath / .WindowsPath, then apply PurePath.match to list
print([p for p in path_list if Path(p).match("ba*")]) # "*ba*" also works
# output: ['foo/bar.txt', 'spam/bar.txt']
print([p for p in path_list if Path(p).match("*o/ba*")])
# output: ['foo/bar.txt']
It is preferable to use pathlib.Path() over pathlib.PurePath(), because then you don't have to worry about the underlying filesystem.
Here is a glob that can deal with escaped punctuation. It does not stop on path separators. I'm posting it here because it matches the title of the question.
To use on a list:
rex = glob_to_re(glob_pattern)
rex = r'(?s:%s)\Z' % rex # Can match newline; match whole string.
rex = re.compile(rex)
matches = [name for name in names if rex.match(name)]
Here's the code:
import re as _re

class GlobSyntaxError(SyntaxError):
    pass

def glob_to_re(pattern):
    r"""
    Given pattern, a unicode string, return the equivalent regular expression.

    Any special character * ? [ ! - ] \ can be escaped by preceding it with
    backslash ('\') in the pattern.  Forward-slashes ('/') and escaped
    backslashes ('\\') are treated as ordinary characters, not boundaries.

    Here is the language glob_to_re understands.
    Earlier alternatives within rules have precedence.
        pattern = item*
        item    = '*' | '?' | '[!' set ']' | '[' set ']' | literal
        set     = element element*
        element = literal '-' literal | literal
        literal = '\' char | char other than \ [ ] and sometimes -
    glob_to_re does not understand "{a,b...}".
    """
    # (Note: the docstring above is r""" ... """ to preserve backslashes.)

    def expect_char(i, context):
        if i >= len(pattern):
            s = "Unfinished %s: %r, position %d." % (context, pattern, i)
            raise GlobSyntaxError(s)

    def literal_to_re(i, context="pattern", bad="[]"):
        if pattern[i] == '\\':
            i += 1
            expect_char(i, "backslashed literal")
        else:
            if pattern[i] in bad:
                s = "Unexpected %r in %s: %r, position %d." \
                    % (pattern[i], context, pattern, i)
                raise GlobSyntaxError(s)
        return _re.escape(pattern[i]), i + 1

    def set_to_re(i):
        assert pattern[i] == '['
        set_re = "["
        i += 1
        try:
            if pattern[i] == '!':
                set_re += '^'
                i += 1
            while True:
                lit_re, i = literal_to_re(i, "character set", bad="[-]")
                set_re += lit_re
                if pattern[i] == '-':
                    set_re += '-'
                    i += 1
                    expect_char(i, "character set range")
                    lit_re, i = literal_to_re(i, "character set range", bad="[-]")
                    set_re += lit_re
                if pattern[i] == ']':
                    return set_re + ']', i + 1
        except IndexError:
            expect_char(i, "character set")  # Trigger "unfinished" error.

    i = 0
    re_pat = ""
    while i < len(pattern):
        if pattern[i] == '*':
            re_pat += ".*"
            i += 1
        elif pattern[i] == '?':
            re_pat += "."
            i += 1
        elif pattern[i] == '[':
            set_re, i = set_to_re(i)
            re_pat += set_re
        else:
            lit_re, i = literal_to_re(i)
            re_pat += lit_re
    return re_pat
Can't say how efficient it is, but it is much less verbose, much less complicated, more complete, and possibly more secure/reliable than other solutions.
Supported syntax:
* -- matches zero or more characters.
** (actually, it's either **/ or /**) -- matches zero or more subdirectories.
? -- matches one character.
[] -- matches one character within brackets.
[!] -- matches one character not within brackets.
Due to escaping with \, only / can be used as a path separator.
Order of operation:
Escape special RE chars in glob.
Generate RE for tokenization of escaped glob.
Replace escaped glob tokens by equivalent RE.
import re
from sys import hexversion, implementation

# Support for insertion-preserving/ordered dicts became language feature in Python 3.7, but works in CPython since 3.6.
if hexversion >= 0x03070000 or (implementation.name == 'cpython' and hexversion >= 0x03060000):
    ordered_dict = dict
else:
    from collections import OrderedDict as ordered_dict

escaped_glob_tokens_to_re = ordered_dict((
    # Order of ``**/`` and ``/**`` in RE tokenization pattern doesn't matter because ``**/`` will be caught first no matter what, making ``/**`` the only option later on.
    # W/o leading or trailing ``/`` two consecutive asterisks will be treated as literals.
    ('/\*\*', '(?:/.+?)*'),  # Edge-case #1. Catches recursive globs in the middle of path. Requires edge case #2 handled after this case.
    ('\*\*/', '(?:^.+?/)*'),  # Edge-case #2. Catches recursive globs at the start of path. Requires edge case #1 handled before this case. ``^`` is used to ensure proper location for ``**/``.
    ('\*', '[^/]*'),  # ``[^/]*`` is used to ensure that ``*`` won't match subdirs, as with naive ``.*?`` solution.
    ('\?', '.'),
    ('\[\*\]', '\*'),  # Escaped special glob character.
    ('\[\?\]', '\?'),  # Escaped special glob character.
    ('\[!', '[^'),  # Requires ordered dict, so that ``\[!`` preceded ``\[`` in RE pattern. Needed mostly to differentiate between ``!`` used within character class ``[]`` and outside of it, to avoid faulty conversion.
    ('\[', '['),
    ('\]', ']'),
))

escaped_glob_replacement = re.compile('(%s)' % '|'.join(escaped_glob_tokens_to_re).replace('\\', '\\\\\\'))

def glob_to_re(pattern):
    return escaped_glob_replacement.sub(lambda match: escaped_glob_tokens_to_re[match.group(0)], re.escape(pattern))

if __name__ == '__main__':
    validity_paths_globs = (
        (True, 'foo.py', 'foo.py'),
        (True, 'foo.py', 'fo[o].py'),
        (True, 'fob.py', 'fo[!o].py'),
        (True, '*foo.py', '[*]foo.py'),
        (True, 'foo.py', '**/foo.py'),
        (True, 'baz/duck/bar/bam/quack/foo.py', '**/bar/**/foo.py'),
        (True, 'bar/foo.py', '**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**'),
        (False, 'bar/baz/foo.py', 'bar/*'),
        (False, 'bar/baz/foo.py', 'bar**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**/foo.py'),
        (True, 'bar/baz/wut/foo.py', 'bar/**/foo.py'),
    )
    results = []
    for seg in validity_paths_globs:
        valid, path, glob_pat = seg
        print('valid:', valid)
        print('path:', path)
        print('glob pattern:', glob_pat)
        re_pat = glob_to_re(glob_pat)
        print('RE pattern:', re_pat)
        match = re.fullmatch(re_pat, path)
        print('match:', match)
        result = bool(match) == valid
        results.append(result)
        print('result was expected:', result)
        print('-'*79)
    print('all results were expected:', all(results))
    print('='*79)
print('='*79)
I wanted to add support for recursive glob patterns, i.e. things/**/*.py and have relative path matching so example*.py doesn't match with folder/example_stuff.py.
Here is my approach:
from fnmatch import translate as fnmatch_translate  # assumed import; the function below references fnmatch_translate
from os import path
import re

def recursive_glob_filter(files, glob):
    # Convert to regex and add start of line match
    pattern_re = '^' + fnmatch_translate(glob)

    # fnmatch does not escape path separators so escape them
    if path.sep in pattern_re and not r'\{}'.format(path.sep) in pattern_re:
        pattern_re = pattern_re.replace('/', r'\/')

    # Replace `*` with one that ignores path separators
    sep_respecting_wildcard = '[^\{}]*'.format(path.sep)
    pattern_re = pattern_re.replace('.*', sep_respecting_wildcard)

    # And now for `**` we have `[^\/]*[^\/]*`, so replace that with `.*`
    # to match all patterns in-between
    pattern_re = pattern_re.replace(2 * sep_respecting_wildcard, '.*')

    compiled_re = re.compile(pattern_re)
    return filter(compiled_re.search, files)
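A usage sketch under the assumptions above (POSIX-style path separator; the file names are made up):

files = ['things/sub/b.py', 'folder/example_stuff.py', 'example_api.py']
print(list(recursive_glob_filter(files, 'things/**/*.py')))  # expect ['things/sub/b.py']
print(list(recursive_glob_filter(files, 'example*.py')))     # expect ['example_api.py']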
