What's the quickest way (in a processing sense) to iterate over a list and validate its items according to a set of flags?
Or, in other words, what's the optimal approach to filtering a list with conditions that depend on configuration.
Example below is on a list of strings and depending on flags I'd like to filter some of them.
class Validator(object):
    """Filter an iterable of words according to the flags set at construction.

    Each enabled flag rejects one class of word:
      digit -- words consisting only of digits
      tag   -- words starting with '<'
      short -- words shorter than 3 characters
    """

    def __init__(self, digit=False, tag=False, short=False):
        self.digit = digit
        self.tag = tag
        self.short = short

    def __call__(self, words):
        """Return the list of words that pass every enabled check."""
        good_words = []
        for word in words:
            # str.isdigit is the real method name; the original `is_digit`
            # raised AttributeError on the first digit check.
            if self.digit and word.isdigit():
                continue
            if self.tag and word[0] == "<":
                continue
            if self.short and len(word) < 3:
                continue
            good_words.append(word)
        return good_words
Use of Validator
val = Validator(digit=True, short=True)
# The validator expects an iterable of words; the original passed the raw
# string, which would iterate character by character.  Split first.
words = "An apple a day is 20 dollars a month".split()
print(val(words))
# ["apple", "day", "dollars", "month"]
To avoid creating a new (and potentially long) list, you can instead make a generator function with yield for each succeeded word instead of return for a whole list:
class Validator(object):
def __init__(self, digit=False, tag=False, short=False, *more_filters):
self.digit = digit
self.tag = tag
self.short = short
self.more_filters= more_filters
def __call__(self, words):
for word in words:
if self.digit:
if word.is_digit(): continue
if self.tag:
if word[0] == "<": continue
if self.short:
if len(word) < 3: continue
if any(f(word) for f in self.more_filters):
continue
yield word
Use:
other_tests = [lambda w: w.startswith('#'), lambda w: w.endswith('?')]
val = Validator(digit=True, short=True, *other_tests)
# Split the sentence into words (the validator iterates its argument), and
# unpack the generator with * so the surviving words print space-separated;
# `print(val(words))` alone would print a generator object.
words = "An apple a day is 20 dollars a month #healty toomuch? livelong!".split()
print(*val(words))
# apple day dollars month livelong!
Element by element approach using python filter
from itertools import filterfalse
class Validator(object):
    """Per-word predicate: calling the instance returns True when the single
    word should be filtered out, making it usable with filter()/filterfalse().
    """

    def __init__(self, digit=False, tag=False, short=False):
        self.digit = digit
        self.tag = tag
        self.short = short

    def __call__(self, word):
        """Return True if any enabled check rejects *word*."""
        if self.digit and word.isdigit():
            return True
        if self.tag and word[0] == "<":
            return True
        return bool(self.short and len(word) < 3)
# Demonstration: filter() keeps the words the validator flags (True),
# itertools.filterfalse() keeps the ones that pass (False).
val = Validator(digit=True, short=True)
words = "An apple a day is 20 dollars a month".split()
assert list(filter(val, words)) == ['An', 'a', 'is', '20', 'a']
assert list(filterfalse(val, words)) == ['apple', 'day', 'dollars', 'month']
Related
def spin_words(sentence):
    """Print each word of *sentence* followed by a space, reversing words
    longer than 5 letters.  Nothing is returned; output goes to stdout.
    """
    for token in sentence.split():
        shown = token[::-1] if len(token) > 5 else token
        print(shown, end=' ')
The problem is asking to take a string and return the same string but, with all the five letter words or more in reversed
def spin_words(sentence):
    """Return *sentence* with every word of five or more letters reversed."""
    return " ".join(w[::-1] if len(w) >= 5 else w for w in sentence.split())
It is quite long so I'm not sure I can ask you to review my code. But I have no choice indeed.
What I'm doing is analyzing the Korean language. To do this, first I collect data and loaded the data via pd.DataFrame. Then, I chunk the text data ('contents' column in the dataframe) into morpheme units by using TextAnalyzer. The below is an example of the process so far:
data = [document1, document2, ...] # dataframe.loc[:, 'contents']
tokenizer = TextAnalyzer()
for item in data:
tokenizer.ma.analyze_wordform_parsed(item) # get sentences in the document, and split the sentence into morphemes
# results
[[(morpheme1 of sentence1, pos1 of sentence1), ..], [(morpheme1 of sentence2, pos1 of sentence2), ...] # for 1 document
Now I have morphemes and their corresponding part of speech. And I need this information to get the original word so I made a dictionary for saving the information into M2W.
For the collected documents, what I want to do is extracting keywords and sentiment words. For the collection of the morphemes, I need to find sentiment words. Therefore, I also split the sentiment words into their corresponding morphemes. The sentiment words are saved in POS_DICT, NEU_DICT and NEG_DICT. The sentiment morphemes and their corresponding POS are:
[[(sentiment morpheme1, POS1), (sentiment morpheme2, POS2), ...), ...], ...] # each inner list represents the morphemes/POS for one sentiment word
So I used the Boyer–Moore algorithm to find the patterns, which are the sentiment words. The work has proceeded through pandas-multiprocess. Below is the entire code of the process, which I made into a class.
class KeywordAnalyzer:
    """Extract keyword and sentiment words from Korean documents.

    Sentiment word lists (positive / neutral / negative) are loaded from
    *sent_dict*; each word is tokenised into a (morpheme, POS) sequence, and
    every analysed sentence is scanned for those sequences with a
    Boyer-Moore/Horspool-style search.
    """

    def __init__(self, morph_path: str = "../rsc/resource/NULL/",
                 sent_dict: str = "../rsc/sentiment_dict/") -> None:
        self.sent_dict = sent_dict
        self.tokenizer = TextAnalyzer(dict_path=morph_path)
        # POS tags that count as keywords.  Per the original Korean comments:
        # NNIN1 proper noun, NNIN2 common noun, NPR predicate noun
        # -- TODO confirm tag meanings against the tagger's tag set.
        self.KEYWORDS_POS = {"NNIN1", "NNIN2", "NPR"}
        # NOTE(review): the original called init_sent_dict twice (before and
        # after setting KEYWORDS_POS); loading the dictionaries once suffices.
        self.init_sent_dict(sent_dict)

    def init_sent_dict(self, sent_dict: str):
        """Load the sentiment dictionaries and the morpheme->word map."""
        self.POS_DICT = set()   # positive sentiment morpheme sequences
        self.NEG_DICT = set()   # negative
        self.NEU_DICT = set()   # neutral
        self.M2W = dict()       # Morph (sequence) to (original) Word

        def get_morphs(file):
            # one sentiment word per line; keep the first analysed sentence
            words = [item.strip() for item in file]
            morphs = [tuple(self.tokenizer.ma.analyze_wordform_parsed(item)[0]) for item in words]
            return words, morphs

        targets = {"POSITIVE": self.POS_DICT,
                   "NEUTRALITY": self.NEU_DICT,
                   "NEGATIVE": self.NEG_DICT}
        for item in Path(sent_dict).glob("*"):
            # The original if/elif chain left `words`/`morphs` unbound (or
            # stale) for files matching no branch, yet still updated M2W
            # afterwards -- a NameError waiting to happen.  Skip such files.
            target = targets.get(item.name.split('_')[1])
            if target is None:
                continue
            with open(item) as f:
                words, morphs = get_morphs(f)
            target.update(morphs)
            self.M2W.update({morph: word for word, morph in zip(words, morphs)})

    def get_keywords(self, data: pd.Series) -> Tuple[List[str], List[str], List[str], List[str]]:
        """Return (positive, neutral, negative, keyword) word lists for one row.

        *data* is a DataFrame row; its 'contents' field is tokenised into
        per-sentence (morpheme, POS) structures.
        """
        tokens = self.tokenizer.morpheme_analyzer(data.loc['contents'])
        pos_words, neu_words, neg_words, rels = [], [], [], []
        for item in tokens:
            # flatten one sentence into a single list of (morph, pos) pairs
            sentence = list(itertools.chain(*item))
            pos_words.extend(self._get_pos_words(sentence))
            neu_words.extend(self._get_neu_words(sentence))
            neg_words.extend(self._get_neg_words(sentence))
            rels.extend(self._get_rel_words(sentence))
        return pos_words, neu_words, neg_words, rels

    def _get_pos_words(self, sentence: List[Tuple[str, str]]) -> List[str]:
        """Positive-dictionary words whose morpheme sequence occurs in *sentence*."""
        return self._match_words(self.POS_DICT, sentence)

    def _get_neu_words(self, sentence: List[Tuple[str, str]]) -> List[str]:
        """Neutral-dictionary words whose morpheme sequence occurs in *sentence*."""
        return self._match_words(self.NEU_DICT, sentence)

    def _get_neg_words(self, sentence: List[Tuple[str, str]]) -> List[str]:
        """Negative-dictionary words whose morpheme sequence occurs in *sentence*."""
        return self._match_words(self.NEG_DICT, sentence)

    def _match_words(self, dictionary, sentence) -> List[str]:
        # shared body of the three lookups above (was copy-pasted three times)
        return [self.M2W[seq] for seq in dictionary if self._boyer_moore(seq, sentence)]

    def _get_rel_words(self, sentence: List[Tuple[str, str]]) -> List[str]:
        """Morphemes of *sentence* whose POS tag marks them as keywords."""
        return [morph for morph, pos in sentence if pos in self.KEYWORDS_POS]

    def _boyer_moore(self, pattern: Iterable, text: Iterable) -> bool:
        """Return True if *pattern* occurs as a contiguous run inside *text*.

        Horspool-style search: on mismatch, shift by how far from the end the
        element under the pattern's last position occurs in the pattern.
        NOTE: despite the Iterable annotations, both arguments must support
        len() and integer indexing (sequences).
        """
        def find(pattern, char):
            # shift distance for `char`: distance from the end of its last
            # occurrence in pattern[:-1]; the full length if absent
            for i in range(len(pattern) - 2, -1, -1):
                if pattern[i] == char:
                    return len(pattern) - i - 1
            return len(pattern)

        M = len(pattern)
        N = len(text)
        i = 0
        while i <= N - M:
            j = M - 1
            while j >= 0:
                if pattern[j] != text[i + j]:
                    move = find(pattern, text[i + M - 1])
                    break
                j = j - 1
            if j == -1:       # inner loop ran off the left end: full match
                return True
            else:
                i += move
        return False
I'm not sure my explanation is straightforward. Thanks for reading.
I have the following python script which does regex matching using 'AND', 'OR' features as well:
# Exception type used by PyBoolRe; str() of the exception is str() of the
# wrapped value.  (The paste lost indentation; bodies appear flattened.)
class PyBoolReException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
# NOTE(review): Python 2 code (old-style `raise Exc, msg` at the parenthesis
# check; `print` statements in the surrounding examples), with indentation
# flattened by the paste.  Known defect, explained later in this thread: the
# generated pattern joins OR alternatives and AND parts by plain
# concatenation, so 'Guido & (Python | Perl)' compiles to
# '.*Python.*|.*Perl.*.*Guido.*' -- the top-level '|' swallows the AND part.
# The fix sketched in the answer: wrap x|y in (?:...) and express x & y as
# the lookahead (?=x)y.
class PyBoolRe:
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
index = scopy.rfind('(')
# NOTE(review): `l` is assigned but never used in this method.
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
replacestr = '(' + newstr + ')'
scopy = scopy.replace(replacestr, '')
# recurse on the string with the innermost-right parens removed
self.__parse(scopy)
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
# NOTE(review): the branch structure below looks redundant -- the
# `index>2` test and the inner range test overlap; effectively this
# returns False only for the first element.  Verify before reuse.
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
# '!foo' becomes the negative lookahead '(?!foo)'
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
# NOTE(review): alternatives are emitted without a (?:...) group, so the
# '|' escapes this sub-expression and takes global priority -- this is
# the bug discussed below.
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
# NOTE(review): AND parts are simply concatenated, which enforces an
# order; a lookahead per part, (?=.*x.*), would be order-independent.
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
if eor:
return s
else:
return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows:
# Python 2 example usage (print statements): simple OR expression -- both
# strings contain one of the alternatives, so both match.
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
Then both s1 & s2 match
But when the search phrase is:
# AND combined with a parenthesised OR -- the case that exposes the
# precedence bug described in the answer below.
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 has "Guido Python" or "Guido Perl". s2 has that but it does not match it. On the other hand, it matches s1, which it should not. Why is that?
Please help!! How can I get it to work??
Your generated expression is
.*Python.*|.*Perl.*.*Guido.*
while it should look like
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision.
1) x|y should be enclosed into (?:...) (at least when used inside another block). Otherwise, | unluckily takes the global priority in the regexp.
2) x & y should be converted into (?=x)y (trailing context may be used to express the and between regular expressions)
My code :
# Assign each word a token: a fresh number for a first occurrence, or the
# (1-based) position of its first occurrence for a repeat.
sent = str(input("Please input a sentence: "))
# `splitsent` was never defined in the original paste; tokenise here.
# Lower-cased so "To" and "to" share a token, matching the sample output
# "1,2,3,4,1,2" for "To be or not to be" -- TODO confirm intent.
splitsent = sent.lower().split()
dl = [0]
for count, v in enumerate(splitsent):
    if splitsent.count(v) < 2:
        dl.append(max(dl) + 1)
    else:
        dl.append(splitsent.index(v) + 1)
dl.remove(0)
print(sent, "\n", dl)
gives the output :
"1,2,3,4,1,2"
with the input:
"To be or not to be"
This is it in it's "compressed" form. How would I take the output,"1,2,3,4,1,2" from an external file and turn it into the "To be or not to be"?
Your method is really not an efficient way of compressing a text file; just use the existing zlib.
But, for the academic exercise, you will want to use pickle to store your dictionary keys such that when you recover it you get the same values. As you want the 'compressed' form to exist between invocations, so that you can successfully decompress a previously 'compressed' file, you will need to allocate an index to each word.
If you want a 'standard' python method, OrderedDict from collections can be used to create an index in this way, new words are added to the end, but unlike conventional dict objects, old ones keep their position. A better method is an OrderedSet, but this is not in standard python, see this recipe.
Case
You also have to decide if 'THIS', 'this' and 'ThIs' are different words or the same word. Perhaps each word token needs a bitfield to indicate if each character is lower or upper case, e.g. 'ThIs' gets a token 15, but a bitfield of 5 "0x1010", producing a tuple of (15,5) in the compressed file.
Punctuation
You will also need to consider punctuation, where a word is thus punctuated you will need a way to represent this in the compressed form, a token for the punctuation character.
But there is a problem with this.
Then when you decompress you will need to recreate the original exactly, so handle punctuation. e.g. "Is this correct?" -> [1,2,3,4] -> "Is this correct ?" or "Is this correct?" without the space.
So for each punctuation you need to indicate how it joins to the previous and next character, e.g.
As punctuation is only ever one character (i.e. one 8 bit number), you may want to consider just putting the character as-is.
Multiple spaces
You will also need to handle multiple spaces.
Example code
This code is incomplete, mostly untested and probably does not handle all use cases, but it illustrates one possible solution to the question.
To use it, create a file called in.txt containing the text you want to compress, then run
python compdict.py -c in.txt out.comp
or
python compdict.py -d out.comp out.txt
or
python compdict.py --list
from ordered_set import OrderedSet #pip install ordered_set
import os
import cPickle as pickle
import string
import argparse
# NOTE(review): Python 2 only -- `print` statements, `cPickle`,
# `string.uppercase`, and the 2.x `str.translate(None, ...)` delete form.
# Indentation was flattened by the paste.  The answer itself calls this
# "incomplete, mostly untested".
# Dictionary-based word compressor: maintains a persistent OrderedSet of
# known words and encodes each word as (index, case-bitfield, conjoin-flag).
class CompDecomp(object):
__DEFAULT_PICKLE_FN__ = "my.dict"
# NOTE(review): computed but never used anywhere in the class.
printable_non_chars = set(string.printable) - set(string.digits) - set(string.ascii_letters)
def __init__(self, fn=None, *args, **kw):
if fn is None:
self.fn = self.__DEFAULT_PICKLE_FN__
else:
self.fn = fn
self.dict = self.loaddict()
# Load the persistent word set, or start a fresh one on first run.
def loaddict(self):
if os.path.exists(self.fn):
pkl = open(self.fn, "rb")
d = pickle.load(pkl)
pkl.close()
else:
d = OrderedSet()
return d
# Persist the word set (called after every new word -- O(dict) per word).
def savedict(self):
pkl = open(self.fn, "wb")
pickle.dump(self.dict, pkl)
pkl.close()
# Encode one word as (index-or-literal, case bitfield, conjoin flag).
def compressword(self, word, conjoin=False):
if word.lower() not in self.dict:
self.dict.append(word.lower())
print "New word: \'%s\'" % word
self.savedict()
index, flag, _ = self.__caseflag__(word, conjoin)
#print index, bin(flag)[2:].zfill(len(word)), conjoin
return index, flag, conjoin
# Inverse of compressword: non-int `index` is passed through as a literal
# (punctuation); the bitfield restores per-character upper/lower case.
def decompressword(self, index, caseflag=0, conjoin=False):
if isinstance(index, int):
word = self.dict[index]
else:
word = index
if caseflag == 0:
return word, conjoin
flag = bin(caseflag)[2:].zfill(len(word))
res = ""
for n, c in enumerate(word):
if flag[n] == '1':
res += c.upper()
else:
res += c.lower()
return res, conjoin
# Build the case bitfield: one bit per character, 1 = uppercase.
def __caseflag__(self, word, conjoin):
index = self.dict.index(word.lower())
if word.lower() == word:
#Word is all lowercase
return (index,0, conjoin)
if word.upper() == word:
#Word is all uppercase
return index, int("1" * len(word), 2), conjoin
res = ""
for c in word:
if c in string.uppercase:
res += "1"
else:
res += "0"
return index, int(res, 2), conjoin
def compressfile(self, fileobj):
with fileobj as f:
data = f.read(-1)
f.close()
words = data.split(" ")
compress = []
for word in words:
#Handle multiple spaces
if word == "":
compress.append(" ")
continue
#Handle puntuation, treat apostrophied words as new words
# NOTE(review): `substr` and `p1` are never used below.
substr = []
p1 = 0
csplit = word.translate(None, string.ascii_letters+'\'')
for n, c in enumerate(csplit):
subword, word = word.split(c, 1)
compress.append(self.compressword(subword, True if n > 0 else False))
compress.append((c, 0, True))
#Handle words
# NOTE(review): when csplit is non-empty, whatever remains of `word`
# after the last punctuation char appears to be dropped (e.g. the
# final "g" of "e.g") -- verify before relying on round-trips.
if len(word) and not len(csplit):
compress.append(self.compressword(word))
return compress
def decompressfile(self, fileobj):
data = pickle.load(fileobj)
decomp = ""
for v in data:
if not isinstance(v,tuple):
print "Bad data %s" % v
continue
if len(v) > 0 and len(v) <= 3:
d, conjoin = self.decompressword(*v)
if len(decomp):
decomp += "" if conjoin else " "
decomp += d
else:
print "Bad data %s (length %d)" % (v, len(v))
return decomp
# Python 2 driver script (print statements).  NOTE(review): '-compress' /
# '-decompress' are single-dash long options while '--list' is double-dash;
# the documented -c/-d invocations work only via argparse's prefix matching.
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Test file compress / decompress')
group = parser.add_mutually_exclusive_group()
parser.add_argument('infile', nargs='?', default=None)
parser.add_argument('outfile', nargs='?', default=None)
group.add_argument('-compress', action='store_true')
group.add_argument('-decompress', action='store_true')
group.add_argument('--list', action='store_true')
args = parser.parse_args()
cd = CompDecomp()
#Invocation
#python dictcompress.py [-h|-c|-d|--list] [<infile>] [<outfile>]
infile, outfile = args.infile, args.outfile
# NOTE(review): a missing input file only prints a warning; execution
# continues and the later open() will raise.
if infile is not None and not os.path.exists(infile):
print "Input file missing"
if outfile is not None:
of = open(outfile, "wb")
else:
of = None
if not args.list:
if args.compress:
print "Compress"
pickle.dump(cd.compressfile(open(infile, "r")), of)
if args.decompress:
print "Decompress"
of.write(cd.decompressfile(open(infile, "r")))
else:
# --list: dump the persisted word dictionary
for k in cd.dict:
print k
if of is not None:
of.close()
So me and my groupmates are trying to make a Markov Model that finds the probability of letter transitions in a text file. In the text file we have a group of words "Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates". In the code we have spaces added to the beginning of the first letter and after the last letter in each word. So the problem we are having is making a function that puts the letter transitions into separate dictionaries. For example all the e transitions(ex: "_e", "ea"...etc, the _ is a space) would go into a dictionary and then the t, s, a, and m.
This is the code we have so far:
import random
import re
# Read the corpus and normalise to lowercase; the context manager closes
# the file even if reading fails (the original relied on an explicit close).
with open("markov.txt", 'r') as inFile:
    file = inFile.read().lower()
# Collapse every run of characters other than a-z, space and apostrophe
# into a single space.
# NOTE(review): the original pattern was '[^[a-z\ \']+' -- the stray '['
# inside the class also preserved literal '[' characters; almost certainly
# a typo, fixed here.
file = re.sub(r"[^a-z ']+", " ", file)
fileTuple = tuple(file.split())
fileList = list(fileTuple)
fileString = file
def addSpaces(atuple):
    """Return the items of *atuple* concatenated, each wrapped in spaces."""
    pieces = []
    for word in atuple:
        pieces.append(' ' + word + ' ')
    return ''.join(pieces)
# Show the corpus with every word wrapped in spaces.
print('The words in the text file:',addSpaces(fileTuple))
# Keys are words wrapped in underscores (visible stand-ins for spaces);
# the empty values are filled in later by findFirstLetter().
fileDict = { }
for i in fileList:
fileDict['_'+i+'_']=''
print("This is a dictionary of the words in the text file with underscores as spaces:",fileDict)
def countTotalWords(atuple):
    """Return the number of items in *atuple* (works for any iterable)."""
    # sum over a generator instead of the hand-rolled counter loop
    return sum(1 for _ in atuple)
# Report the corpus size computed above.
print('Total amount of words:',countTotalWords(fileTuple))
def findFirstLetter(aDict):
    """Set each value of *aDict* (in place) to the first two characters of
    its key (e.g. '_to_' -> '_t'), and return the same dict."""
    for key in aDict:
        aDict[key] = key[:2]
    return aDict
# findFirstLetter mutates fileDict in place: value := first two key chars.
print('The first letters of each word in the file:',findFirstLetter(fileDict))
# Snapshot the dict's values/keys for the counting steps below.
valueList=list(fileDict.values())
keyList=list(fileDict.keys())
def countFirstLetters(alist):
    """Return a dict mapping each item of *alist* to its occurrence count."""
    d = {}
    # dict.get replaces the if/else pair; the original also carried an
    # unused `count` variable, dropped here.
    for character in alist:
        d[character] = d.get(character, 0) + 1
    return d
# Frequency of each '_x' first-letter bigram across the corpus.
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))
def countFirstLettersProbability(alist):
    """Return a dict mapping each item of *alist* to its fraction of the
    total corpus word count (uses the module-level `fileTuple`)."""
    d = {}
    # Hoist the loop-invariant total: the original re-counted the whole
    # corpus once per item.  Accumulating the same 1/total increment keeps
    # float results bit-identical to the original.
    weight = 1 / countTotalWords(fileTuple)
    for character in alist:
        d[character] = d.get(character, 0) + weight
    return d
# Same tally as above, accumulated as probabilities (count / total words).
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))
def countAllLetters(alist):
    """Count the occurrences of every character across all words in *alist*."""
    counts = {}
    for word in alist:
        for ch in word:
            counts[ch] = counts.get(ch, 0) + 1
    return counts
# NOTE(review): the label says "each letter" but this calls countFirstLetters
# on the raw string, counting every character including spaces; the
# countAllLetters function defined just above looks like the intended callee
# (probably with fileList) -- confirm before changing.
print('Total amount of occurences of each letter:',countFirstLetters(fileString))
Here is a solid start; I've rewritten your code as a Markov class.
from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip
def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
    """Collapse each run of non-letter, non-apostrophe characters in *text*
    to a single space.  The pattern is compiled once, deliberately bound as
    a default argument."""
    trimmed = text.strip()
    return reg.sub(' ', trimmed)
def nwise(iterable, n):
    "s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ..."
    # tee n independent iterators, advance the i-th by i positions, then
    # zip them back together to form a sliding window of width n
    args = tee(iterable, n)
    for i, t in enumerate(args):
        for _ in range(i):
            next(t, None)
    # builtin zip: itertools.izip existed only on Python 2 and this post
    # already uses Python 3 print() calls elsewhere
    return zip(*args)
class Markov():
    """Character-level Markov chain model built from example text."""

    CHAINLEN = 3                  # window width: CHAINLEN-1 context chars + 1 next char
    PRE = ' ' * (CHAINLEN - 1)    # padding marking the start/end of a string

    # NOTE(review): the original read '#classmethod' -- the '@' of the
    # decorator was lost in the paste, which would make from_file an
    # instance method receiving the instance as `cls`.
    @classmethod
    def from_file(cls, fname):
        """Alternate constructor: build a model from the lines of a file."""
        with open(fname) as inf:
            return Markov(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

        text
            Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)   # stem (context) -> list of next chars
        self.words = 0
        self.strings = 0
        # On Python 3 every str has __iter__, so the original
        # hasattr(text, '__iter__') test would wrongly feed a single string
        # in character by character; test the type explicitly instead.
        if isinstance(text, str):
            self.add_text(text)
        else:
            for s in text:
                self.add_text(s)

    def add_text(self, text):
        """
        Add a string to the lookup table

        text
            string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        # Slide a CHAINLEN-wide window over the padded text: the first
        # CHAINLEN-1 chars form the key, the last is a possible successor.
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

        upto
            maximum length of string to be generated
        """
        s = Markov.PRE
        res = []
        for i in range(upto + Markov.CHAINLEN):
            ch = choice(self.lookup[s])
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:  # terminal string
                break
        # drop the trailing padding characters from the result
        return ''.join(res[:-(Markov.CHAINLEN - 1)])

    def __str__(self):
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))
def main():
    """Demo: build a model from the sample word list and print random output."""
    # mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))
    # Python 2 print statements converted to calls, matching the print()
    # usage already present elsewhere in this post.
    print(mc.strings, mc.words)
    print(mc)
    for _ in range(10):
        print(mc.gen_text())

if __name__=="__main__":
    main()