Markov Analysis, Formatting - python

I have a program that reads in a big chunk of text from a text file and then randomizes the content to display back as a short story based on that text. The program works, but the last part, where I display the material, is clunky and inefficient. I was wondering if anyone had ideas on how I could more efficiently take the text and display it as a string to the user while allowing it to span multiple lines (wrap text, essentially), so that it is not just a giant string of text continuing to the right of the console.
from __future__ import print_function, division

import sys
import random

# global variables
suffix_map = {}  # map from prefixes to a list of suffixes
prefix = ()      # current tuple of words
big_list = []

def process_file(filename, order=2):
    """Reads a file and performs Markov analysis.

    filename: string
    order: integer number of words in the prefix

    returns: map from prefix to list of possible suffixes.
    """
    fp = open(filename)
    for line in fp:
        for word in line.rstrip().split():
            process_word(word, order)

def process_word(word, order=3):
    """Processes each word.

    word: string
    order: integer

    During the first few iterations, all we do is store up the words;
    after that we start adding entries to the dictionary.
    """
    global prefix
    if len(prefix) < order:
        prefix += (word,)
        return
    try:
        suffix_map[prefix].append(word)
    except KeyError:
        # if there is no entry for this prefix, make one
        suffix_map[prefix] = [word]
    prefix = shift(prefix, word)

def random_text(n=300):
    """Generates random words from the analyzed text.

    Starts with a random prefix from the dictionary.

    n: number of words to generate
    """
    global big_list
    # choose a random prefix (not weighted by frequency)
    start = random.choice(list(suffix_map.keys()))
    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes is None:
            random_text(n - i)
            return
        # choose a random suffix
        word = random.choice(suffixes)
        big_list.append(word + " ")
        start = shift(start, word)

def shift(t, word):
    """Forms a new tuple by removing the head and adding word to the tail.

    t: tuple of strings
    word: string

    Returns: tuple of strings
    """
    return t[1:] + (word,)

def list_to_str_format():
    global big_list
    whole = " ".join(str(i) for i in big_list)
    # 25 words per line
    l1 = big_list[:25]
    l2 = big_list[26:50]
    l3 = big_list[51:75]
    l4 = big_list[76:100]
    l5 = big_list[101:125]
    l6 = big_list[126:150]
    l7 = big_list[151:175]
    l8 = big_list[176:200]
    l9 = big_list[201:225]
    l10 = big_list[226:250]
    l11 = big_list[256:275]
    l12 = big_list[276:300]
    str_1 = " ".join(str(i) for i in l1).capitalize()
    str_2 = " ".join(str(i) for i in l2)
    str_3 = " ".join(str(i) for i in l3)
    str_4 = " ".join(str(i) for i in l4)
    str_5 = " ".join(str(i) for i in l5)
    str_6 = " ".join(str(i) for i in l6)
    str_7 = " ".join(str(i) for i in l7)
    str_8 = " ".join(str(i) for i in l8)
    str_9 = " ".join(str(i) for i in l9)
    str_10 = " ".join(str(i) for i in l10)
    str_11 = " ".join(str(i) for i in l11)
    str_12 = " ".join(str(i) for i in l12)
    print(str_1)
    print(str_2)
    print(str_3)
    print(str_4)
    print(str_5)
    print(str_6)
    print(str_7)
    print(str_8)
    print(str_9)
    print(str_10)
    print(str_11)
    print(str_12)

def main(filename, n=300, order=3):
    try:
        n = int(n)
        order = int(order)
    except ValueError as e:
        print('Usage: %s filename [# of words] [prefix length]' % e)
    else:
        process_file(filename, order)
        random_text(n)
        list_to_str_format()
        print()

main('C:\\Users\\Desktop\\TheBrothersKaramazov.txt')

I took the liberty of changing your joining pattern, which was producing a double space. You must also import the re module:
import re

def list_to_str_format(line_length=80):
    global big_list
    whole = "".join(str(i) for i in big_list)
    regex = re.compile(r'(.*?(\s))*')
    while whole != "":
        break_pos = regex.match(whole[:line_length]).end()
        print(whole[:break_pos])
        whole = whole[break_pos:]
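Alternatively, the standard library's textwrap module already implements this kind of wrapping, so a version of list_to_str_format along these lines (a sketch, assuming big_list holds the words generated by random_text, with their trailing spaces) avoids the manual slicing entirely:

import textwrap

def list_to_str_format(line_length=80):
    # random_text appends a trailing space to each word, so join on ""
    # and let textwrap choose the line breaks
    whole = "".join(big_list)
    print(textwrap.fill(whole, width=line_length))

textwrap.fill wraps on whitespace by default, which is exactly the behaviour being reimplemented by hand above.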

Using a value from another function - python

How can I use the same word as in the function above? If there's no way to do that, how can I use the same random number without using a global variable?

def read_input(random_words):
    word = choose_random_words()
    rand = random.randrange(0, 3)
    letter = find_row_letter(word[rand])
    print(rand)
    user_input = timed_input("Input: %s " % letter)
    return user_input
This function chooses a random word out of a list. This word is then linked to a letter of the alphabet. You can treat timed_input as a normal input, as it is not important for my question.
def handle_input(random_words):
    letter = read_input(random_words)
    word =
    print(letter)
    if word == letter:
        print("Correct")
    else:
        print("False")
In this function I want to check whether the user input from read_input is the same as the word chosen in that function.
Make read_input return the word too
def read_input(random_words):
    word = choose_random_words()
    rand = random.randrange(0, 3)
    letter = find_row_letter(word[rand])
    user_input = timed_input("Input: %s " % letter)
    return user_input, word

def handle_input(random_words):
    letter, word = read_input(random_words)
What you can do is declare the word variable to be global:
def read_input(random_words):
    global word
    word = choose_random_words()
    rand = random.randrange(0, 3)
    letter = find_row_letter(word[rand])
    print(rand)
    user_input = timed_input("Input: %s " % letter)
    return user_input
That allows you to access the word variable from anywhere.
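Putting the first suggestion together, a minimal end-to-end sketch; choose_random_words, find_row_letter, and timed_input stand in for the question's own helpers, which are not shown here:

import random

def read_input(random_words):
    word = choose_random_words()          # question's helper (not shown)
    rand = random.randrange(0, 3)
    letter = find_row_letter(word[rand])  # question's helper (not shown)
    user_input = timed_input("Input: %s " % letter)
    return user_input, word               # hand both values back as a tuple

def handle_input(random_words):
    user_input, word = read_input(random_words)
    if word == user_input:
        print("Correct")
    else:
        print("False")

Returning the tuple keeps the two functions decoupled; the global variable works, but it makes read_input harder to reuse and test.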

Duplicated characters in dict

I wrote this code:
import itertools

black = "Ellie"
s = "E111e"
messagelist = list(s)
a = s.count("1")
dicta = {}
for x in messagelist:
    if not x == "1":
        dicta[x] = messagelist.index(x)
print(dicta)
listo = ['l', 'i', 'j']
result = itertools.combinations_with_replacement(listo, a)
lista2 = []
for each in result:
    a = str(each).replace("(", "")
    a = a.replace(")", "")
    a = a.replace(",", "")
    a = a.replace("'", "")
    a = a.replace(" ", "")
    lista2.append(a)
lista3 = []
for x in lista2:
    listexa = list(x)
    for item in dicta:
        listexa.insert(dicta[item], item)
    listexa = "".join(listexa)
    lista3.append(listexa)
print(lista3)
if black in lista3:
    print("DELETE")
else:
    print("IT'S OKAY")
black = the blacklisted word
s = the user writing it with numbers
The problem is with words that contain more than one equal character, like "finishing", which has two "n" characters: only one "n" will be added to the dict. How can I solve this?
This will do the trick for you:
(I also tweaked the for loop, where you were iterating over combinations - just to keep it pythonish ;) )
import itertools

def repl(txt, pat):
    if len(pat) == 0:
        return txt
    return repl(txt.replace("1", pat[0], 1), pat[1:])

black = "Ellie"
s = "E111e"
messagelist = list(s)
a = s.count("1")
dicta = {}
listo = ['l', 'i', 'j']
result = itertools.combinations_with_replacement(listo, a)
lista2 = []
for each in result:
    lista2.append("".join(each))
lista3 = []
for x in lista2:
    listexa = repl(s, x)
    lista3.append(listexa)
print(lista3)
if black in lista3:
    print("DELETE")
else:
    print("IT'S OKAY")
Consider the function repl, the most important improvement. It leverages the fact that Python's str.replace() can take a third argument defining the maximum number of replacements to perform, and uses it to replace one "1" at a time.
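A quick demonstration of repl on its own:

# each recursive call consumes one pattern letter and fills exactly one "1"
print(repl("E111e", "lij"))  # E111e -> El11e -> Eli1e -> Elije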

Replace a sequence of characters by another one

I have a sequence of characters '-------' and I want to replace each '-' in it by the corresponding letter of 'jaillir'.
How do I do that?
Here is my code
import random

with open("lexique.txt", "r", encoding="utf8") as a:
    words = []
    letters = []
    tirets = []
    for line in a:
        ligne = line[:-1]
        words.append(ligne)

choix = random.choice(words)
tiret = '-' * len(choix)
print(tiret)
print(choix)

accompli = False
while not accompli:
    lettre = input("Entrez une lettre du mot ")
    for t in range(len(tiret)):
        if lettre in choix:
            tiret.replace(tiret[t], lettre[t])
            print(tiret)
I think you need to fix your file-reading code first, even though it is not the question:
with open('lexique.txt', 'r') as f:
    text = f.read()  # get file contents
Next, to replace the ---- by a word. I am assuming that the dashes in your text will only ever be the same length as the word, so:
word = 'word' # any string e.g. word
dashes = '-' * len(word)
So now you can use python's string.replace method like so:
text = text.replace(dashes, word) # every time it finds the sequence of dashes it will be replaced by your word
With a for loop (gradual replacement):
word = 'word'    # any word
tempword = word  # letters of the word still to be placed
new = ''         # the rebuilt text
for i, letter in enumerate(text):
    if letter == '-':
        if i + len(tempword) < len(text):
            characters = [l == '-' for l in text[i:i + len(tempword)]]
            if False not in characters:
                new += tempword[0]
                if len(tempword) > 1:
                    tempword = tempword[1:]
                else:
                    tempword = word
            else:
                new += letter
        else:
            new += letter
    else:
        new += letter
print(new)
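Since the question's loop looks like a hangman-style reveal (uncover every '-' whose position matches the guessed letter), here is a minimal sketch of that specific behaviour, reusing the question's names (choix is the secret word, tiret the current display, lettre the guess). Strings are immutable, so it builds a list and joins it:

def reveal(choix, tiret, lettre):
    shown = list(tiret)       # mutable copy of the display
    for i, c in enumerate(choix):
        if c == lettre:
            shown[i] = c      # uncover every position where the guess occurs
    return "".join(shown)

print(reveal("jaillir", "-------", "i"))  # --i--i-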

Reading a (compressed) file

My code:
sent = str(input("Please input a sentence: "))
splitsent = sent.lower().split()  # split the sentence into words, ignoring case
dl = [0]
for count, v in enumerate(splitsent):
    if splitsent.count(v) < 2:
        dl.append(max(dl) + 1)
    else:
        dl.append(splitsent.index(v) + 1)
dl.remove(0)
print(sent, "\n", dl)
gives the output :
"1,2,3,4,1,2"
with the input:
"To be or not to be"
This is it in its "compressed" form. How would I take the output "1,2,3,4,1,2" from an external file and turn it back into "To be or not to be"?
Your method is really not an efficient way of compressing a text file; just use the existing zlib.
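For comparison, a zlib round trip is only a few lines (standard library; this sketch assumes a text payload encoded to bytes):

import zlib

text = "To be or not to be"
packed = zlib.compress(text.encode("utf-8"))        # bytes in, compressed bytes out
restored = zlib.decompress(packed).decode("utf-8")
assert restored == text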
But, for the academic exercise, you will want to use pickle to store your dictionary keys such that when you recover it you get the same values. As you want the 'compressed' form to exist between invocations, so that you can successfully decompress a previously 'compressed' file, you will need to allocate an index to each word.
If you want a 'standard' python method, OrderedDict from collections can be used to create an index in this way, new words are added to the end, but unlike conventional dict objects, old ones keep their position. A better method is an OrderedSet, but this is not in standard python, see this recipe.
Case
You also have to decide if 'THIS', 'this' and 'ThIs' are different words or the same word. Perhaps each word token needs a bitfield to indicate whether each character is lower or upper case, e.g. 'ThIs' might get token 15 with a case bitfield of 0b1010 (decimal 10), producing the tuple (15, 10) in the compressed file.
Punctuation
You will also need to consider punctuation, where a word is thus punctuated you will need a way to represent this in the compressed form, a token for the punctuation character.
But there is a problem with this: when you decompress, you need to recreate the original exactly, including the punctuation. For example, should "Is this correct?" -> [1, 2, 3, 4] decompress to "Is this correct ?" (with a space) or "Is this correct?" (without)? So for each punctuation token you need to indicate how it joins to the previous and next characters.
As punctuation is only ever one character (i.e. one 8 bit number), you may want to consider just putting the character as-is.
Multiple spaces
You will also need to handle multiple spaces.
Example code
This code is incomplete, mostly untested and probably does not handle all use cases, but it illustrates one possible solution to the question.
To use it, create a file called in.txt containing the text you want to compress, then run
python compdict.py -c in.txt out.comp
or
python compdict.py -d out.comp out.txt
or
python compdict.py --list
from ordered_set import OrderedSet  # pip install ordered_set
import os
import cPickle as pickle
import string
import argparse

class CompDecomp(object):
    __DEFAULT_PICKLE_FN__ = "my.dict"
    printable_non_chars = set(string.printable) - set(string.digits) - set(string.ascii_letters)

    def __init__(self, fn=None, *args, **kw):
        if fn is None:
            self.fn = self.__DEFAULT_PICKLE_FN__
        else:
            self.fn = fn
        self.dict = self.loaddict()

    def loaddict(self):
        if os.path.exists(self.fn):
            pkl = open(self.fn, "rb")
            d = pickle.load(pkl)
            pkl.close()
        else:
            d = OrderedSet()
        return d

    def savedict(self):
        pkl = open(self.fn, "wb")
        pickle.dump(self.dict, pkl)
        pkl.close()

    def compressword(self, word, conjoin=False):
        if word.lower() not in self.dict:
            self.dict.append(word.lower())
            print "New word: '%s'" % word
            self.savedict()
        index, flag, _ = self.__caseflag__(word, conjoin)
        #print index, bin(flag)[2:].zfill(len(word)), conjoin
        return index, flag, conjoin

    def decompressword(self, index, caseflag=0, conjoin=False):
        if isinstance(index, int):
            word = self.dict[index]
        else:
            word = index
        if caseflag == 0:
            return word, conjoin
        flag = bin(caseflag)[2:].zfill(len(word))
        res = ""
        for n, c in enumerate(word):
            if flag[n] == '1':
                res += c.upper()
            else:
                res += c.lower()
        return res, conjoin

    def __caseflag__(self, word, conjoin):
        index = self.dict.index(word.lower())
        if word.lower() == word:
            # word is all lowercase
            return index, 0, conjoin
        if word.upper() == word:
            # word is all uppercase
            return index, int("1" * len(word), 2), conjoin
        res = ""
        for c in word:
            if c in string.uppercase:
                res += "1"
            else:
                res += "0"
        return index, int(res, 2), conjoin

    def compressfile(self, fileobj):
        with fileobj as f:
            data = f.read(-1)
        words = data.split(" ")
        compress = []
        for word in words:
            # handle multiple spaces
            if word == "":
                compress.append(" ")
                continue
            # handle punctuation, treat apostrophied words as new words
            csplit = word.translate(None, string.ascii_letters + '\'')
            for n, c in enumerate(csplit):
                subword, word = word.split(c, 1)
                compress.append(self.compressword(subword, True if n > 0 else False))
                compress.append((c, 0, True))
            # handle words
            if len(word) and not len(csplit):
                compress.append(self.compressword(word))
        return compress

    def decompressfile(self, fileobj):
        data = pickle.load(fileobj)
        decomp = ""
        for v in data:
            if not isinstance(v, tuple):
                print "Bad data %s" % v
                continue
            if len(v) > 0 and len(v) <= 3:
                d, conjoin = self.decompressword(*v)
                if len(decomp):
                    decomp += "" if conjoin else " "
                decomp += d
            else:
                print "Bad data %s (length %d)" % (v, len(v))
        return decomp

if __name__ == "__main__":
    # invocation:
    # python compdict.py [-h|-c|-d|--list] [<infile>] [<outfile>]
    parser = argparse.ArgumentParser(description='Test file compress / decompress')
    group = parser.add_mutually_exclusive_group()
    parser.add_argument('infile', nargs='?', default=None)
    parser.add_argument('outfile', nargs='?', default=None)
    group.add_argument('-c', '--compress', action='store_true')
    group.add_argument('-d', '--decompress', action='store_true')
    group.add_argument('--list', action='store_true')
    args = parser.parse_args()
    cd = CompDecomp()
    infile, outfile = args.infile, args.outfile
    if infile is not None and not os.path.exists(infile):
        print "Input file missing"
    if outfile is not None:
        of = open(outfile, "wb")
    else:
        of = None
    if not args.list:
        if args.compress:
            print "Compress"
            pickle.dump(cd.compressfile(open(infile, "r")), of)
        if args.decompress:
            print "Decompress"
            of.write(cd.decompressfile(open(infile, "r")))
    else:
        for k in cd.dict:
            print k
    if of is not None:
        of.close()

Python: Create multiple dictionaries of letter transitions

So my groupmates and I are trying to make a Markov model that finds the probability of letter transitions in a text file. The text file contains the words "Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates". In the code, spaces are added before the first letter and after the last letter of each word. The problem we are having is writing a function that puts the letter transitions into separate dictionaries. For example, all the e transitions (e.g. "_e", "ea", etc., where _ is a space) would go into one dictionary, and likewise for t, s, a, and m.
This is the code we have so far:
import random
import re

inFile = open("markov.txt", 'r')
file = inFile.read().lower()
inFile.close()
file = re.sub("[^a-z ']+", " ", file)
fileTuple = tuple(file.split())
fileList = list(fileTuple)
fileString = file

def addSpaces(atuple):
    theString = ''
    for i in atuple:
        theString = theString + ' ' + i + ' '
    return theString

print('The words in the text file:', addSpaces(fileTuple))

fileDict = {}
for i in fileList:
    fileDict['_' + i + '_'] = ''
print("This is a dictionary of the words in the text file with underscores as spaces:", fileDict)

def countTotalWords(atuple):
    count = 0
    for i in atuple:
        count = count + 1
    return count

print('Total amount of words:', countTotalWords(fileTuple))

def findFirstLetter(aDict):
    for i in aDict:
        aDict[i] = i[0:2]
    return aDict

print('The first letters of each word in the file:', findFirstLetter(fileDict))

valueList = list(fileDict.values())
keyList = list(fileDict.keys())

def countFirstLetters(alist):
    d = {}
    count = 0
    for character in alist:
        if character in d:
            d[character] += 1
        else:
            d[character] = 1
    return d

print('Total amount of occurrences of each first letter:', countFirstLetters(valueList))

def countFirstLettersProbability(alist):
    d = {}
    count = 0
    for character in alist:
        if character in d:
            d[character] += (1 / countTotalWords(fileTuple))
        else:
            d[character] = (1 / countTotalWords(fileTuple))
    return d

print('Probability that each letter is the first in the word:', countFirstLettersProbability(valueList))

def countAllLetters(alist):
    d = {}
    for word in alist:
        for char in word:
            if char in d:
                d[char] += 1
            else:
                d[char] = 1
    return d

print('Total amount of occurrences of each letter:', countAllLetters(fileList))
Here is a solid start; I've rewritten your code as a Markov class.
from random import choice
import re
from collections import defaultdict
from itertools import chain, tee

def strip_non_alpha(text, reg=re.compile(r"[^a-z']+", re.IGNORECASE)):
    return reg.sub(' ', text.strip())

def nwise(iterable, n):
    "s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ..."
    args = tee(iterable, n)
    for i, t in enumerate(args):
        for j in range(i):
            next(t, None)
    return zip(*args)

class Markov:
    CHAINLEN = 3
    PRE = ' ' * (CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        with open(fname) as inf:
            return Markov(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

        text
            Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0
        if isinstance(text, str):
            self.add_text(text)
        else:
            for s in text:
                self.add_text(s)

    def add_text(self, text):
        """
        Add a string to the lookup table

        text
            string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

        upto
            maximum length of string to be generated
        """
        s = Markov.PRE
        res = []
        for i in range(upto + Markov.CHAINLEN):
            ch = choice(self.lookup[s])
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:  # terminal string
                break
        return ''.join(res[:-(Markov.CHAINLEN - 1)])

    def __str__(self):
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))

def main():
    # mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))
    print(mc.strings, mc.words)
    print(mc)
    for i in range(10):
        print(mc.gen_text())

if __name__ == "__main__":
    main()
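And if all you need are the per-letter transition dictionaries the question asks about, a compact sketch with collections.defaultdict (assuming, as in the question, that each word is padded with a space on both sides so word-start and word-end transitions are counted):

from collections import defaultdict

def letter_transitions(words):
    # transitions['e'] is itself a dict mapping the next letter to a count
    transitions = defaultdict(lambda: defaultdict(int))
    for word in words:
        padded = ' ' + word.lower() + ' '
        for a, b in zip(padded, padded[1:]):
            transitions[a][b] += 1
    return transitions

words = "Steam Teams Meets Teems Eat Ate State Tease Test Mast Mates".split()
counts = letter_transitions(words)
print(dict(counts['e']))  # all transitions recorded out of 'e'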
