While working on a script to correct formatting errors in documents produced by OCR, I ran into an issue where, depending on which loop I run first, the program runs about 80% slower.
Here is a simplified version of the code. I have the following loops to check for uppercase errors (e.g. "posSible"):
def fixUppercase(doc):
    fixedText = ''
    for line in doc.split('\n'):
        fixedLine = ''
        for word in line.split():
            if (
                word.isalpha()
                and (
                    word.isupper()
                    or word.istitle()
                    or word.islower()
                )
            ):
                if word == line.split()[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
            elif (
                word.isalpha()
            ):
                lower = word.lower()
                if word == line.split()[-1]:
                    fixedLine += lower + '\n'
                else:
                    fixedLine += lower + ' '
            else:
                if word == line.split()[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
        fixedText += fixedLine
    return fixedText
The following loop checks for and removes headings:
def headingsFix(doc):
    fixedText = ''
    count = 0
    stopWords = ['on', 'and', 'of', 'as', 'for']
    for line in doc.split('\n'):
        tokenLine = ''
        for word in line.split():
            if word not in stopWords:
                tokenLine += word + " "
        if tokenLine.istitle() and (
            not line.endswith('.')
            and not line.endswith(',')
            and not line.endswith(')')
            and not line.endswith(';')
            and not line.endswith(':')
        ):
            count += 1
        else:
            fixedText += line
    return fixedText
It's the loop in the fixUppercase function that slows down massively. If I run that function first, or remove the other loop entirely, the program is quick; if any other function or loop runs before it, it is slow. Same behavior if both loops are part of one function.
I thought maybe another function or loop was causing the problem by expanding the length of the document, but a check with len() shows the same doc size either way.
headingsFix strips out all the line endings, which you presumably did not intend. However, your question is about why changing the order of transformations results in slower execution, so I'll not discuss fixing that here.
fixUppercase is extremely inefficient at handling lines with many words. It calls line.split() over and over again on the entire book-length string. That isn't terribly slow if each line has maybe a dozen words, but it gets extremely slow if you have one enormous line with tens of thousands of words. I found your program runs vastly faster with this change to split each line only once. (I note that I can't say whether your program is correct as it stands, just that this change should have the same behaviour while being a lot faster. I'm afraid I don't particularly understand why it's comparing each word to see if it's the same as the last word on the line.)
def fixUppercase(doc):
    fixedText = ''
    for line in doc.split('\n'):
        line_words = line.split()  # Split the line once here.
        fixedLine = ''
        for word in line_words:
            if (
                word.isalpha()
                and (
                    word.isupper()
                    or word.istitle()
                    or word.islower()
                )
            ):
                if word == line_words[-1]:  # No need to split here.
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
            elif (
                word.isalpha()
            ):
                lower = word.lower()
                if word == line_words[-1]:  # No need to split here.
                    fixedLine += lower + '\n'
                else:
                    fixedLine += lower + ' '
            else:
                if word == line_words[-1]:  # No need to split here.
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
        fixedText += fixedLine
    return fixedText
Here you can see my timings. I downloaded 'Alice in Wonderland' from Project Gutenberg to use as test input.
annette#DISSONANCE:~/scratch$ wget 'https://www.gutenberg.org/files/11/11-0.txt' -O alice.txt
--2021-06-13 02:06:33-- https://www.gutenberg.org/files/11/11-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 174313 (170K) [text/plain]
Saving to: ‘alice.txt’
alice.txt 100%[============================================================================================================================>] 170.23K 175KB/s in 1.0s
2021-06-13 02:06:35 (175 KB/s) - ‘alice.txt’ saved [174313/174313]
annette#DISSONANCE:~/scratch$ time python slow_ocr_cleanup.py --headings-last < alice.txt > alice1.txt
real 0m0.065s
user 0m0.047s
sys 0m0.016s
annette#DISSONANCE:~/scratch$ time python slow_ocr_cleanup.py --headings-first < alice.txt > alice2.txt
^CTraceback (most recent call last):
File "slow_ocr_cleanup.py", line 117, in <module>
main()
File "slow_ocr_cleanup.py", line 106, in main
doc = fixUppercase(doc)
File "slow_ocr_cleanup.py", line 17, in fixUppercase
if word == line.split()[-1]:
KeyboardInterrupt
real 0m16.856s
user 0m8.438s
sys 0m8.375s
annette#DISSONANCE:~/scratch!1$ time python slow_ocr_cleanup.py --fixed < alice.txt > alice3.txt
real 0m0.058s
user 0m0.047s
sys 0m0.000s
As you can see, running with --headings-first was taking so long that I stopped it early, hence the KeyboardInterrupt in the traceback.
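(If you want to see the cost in isolation, here is a rough micro-benchmark sketch of my own, not part of the test program, comparing re-splitting on every word with splitting once; I won't quote numbers, but the gap grows quadratically with the length of the line.)

import timeit

line = ' '.join(['word'] * 5000)  # one huge line, like the output of headingsFix

def split_every_time():
    for word in line.split():
        _ = (word == line.split()[-1])  # re-splits the whole line for every word

def split_once():
    words = line.split()
    for word in words:
        _ = (word == words[-1])

print('split every time:', timeit.timeit(split_every_time, number=1))
print('split once:      ', timeit.timeit(split_once, number=1))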
Here's the full test program:
import sys

def fixUppercase(doc):
    fixedText = ''
    for line in doc.split('\n'):
        fixedLine = ''
        for word in line.split():
            if (
                word.isalpha()
                and (
                    word.isupper()
                    or word.istitle()
                    or word.islower()
                )
            ):
                if word == line.split()[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
            elif (
                word.isalpha()
            ):
                lower = word.lower()
                if word == line.split()[-1]:
                    fixedLine += lower + '\n'
                else:
                    fixedLine += lower + ' '
            else:
                if word == line.split()[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
        fixedText += fixedLine
    return fixedText

def fixUppercaseFast(doc):
    fixedText = ''
    for line in doc.split('\n'):
        line_words = line.split()
        fixedLine = ''
        for word in line_words:
            if (
                word.isalpha()
                and (
                    word.isupper()
                    or word.istitle()
                    or word.islower()
                )
            ):
                if word == line_words[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
            elif (
                word.isalpha()
            ):
                lower = word.lower()
                if word == line_words[-1]:
                    fixedLine += lower + '\n'
                else:
                    fixedLine += lower + ' '
            else:
                if word == line_words[-1]:
                    fixedLine += word + '\n'
                else:
                    fixedLine += word + ' '
        fixedText += fixedLine
    return fixedText

def headingsFix(doc):
    fixedText = ''
    count = 0
    stopWords = ['on', 'and', 'of', 'as', 'for']
    for line in doc.split('\n'):
        tokenLine = ''
        for word in line.split():
            if word not in stopWords:
                tokenLine += word + " "
        if tokenLine.istitle() and (
            not line.endswith('.')
            and not line.endswith(',')
            and not line.endswith(')')
            and not line.endswith(';')
            and not line.endswith(':')
        ):
            count += 1
        else:
            fixedText += line
    return fixedText

def main():
    doc = sys.stdin.read()
    if '--headings-last' in sys.argv[1:]:
        doc = fixUppercase(doc)
        doc = headingsFix(doc)
    elif '--headings-first' in sys.argv[1:]:
        doc = headingsFix(doc)
        doc = fixUppercase(doc)
    elif '--fixed' in sys.argv[1:]:
        doc = headingsFix(doc)
        doc = fixUppercaseFast(doc)
    else:
        print('Specify --headings-last, --headings-first or --fixed', file=sys.stderr)
        sys.exit(1)
    print(doc, end='')

if __name__ == '__main__':
    main()
You'll note that the string concatenation isn't the source of the problem here, although it's still inadvisable. In some versions of Python there's an optimisation that makes it fast, but in general you can't rely on it. This question and answer explain the problem in more detail, but broadly speaking, repeatedly using + or += to build larger and larger strings in a loop is inefficient because the whole string has to be copied each time, and it gets longer and longer as the loop goes on. It's a notorious pitfall known as Schlemiel the Painter's Algorithm. Better alternatives are str.join or io.StringIO.
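For illustration, here is a sketch of my own (with a hypothetical name, and not behaviour-identical to the code above, since it rejoins words with single spaces and lines with single newlines) of how the accumulation could be done with lists and str.join; an io.StringIO buffer written to in the loop and read out with getvalue() would work similarly.

def fixUppercaseJoin(doc):
    # Collect pieces in lists and join once at the end, instead of
    # growing a string with += on every word.
    fixed_lines = []
    for line in doc.split('\n'):
        fixed_words = []
        for word in line.split():
            if word.isalpha() and not (
                word.isupper() or word.istitle() or word.islower()
            ):
                # Mixed-case OCR artefact such as "posSible": lowercase it.
                fixed_words.append(word.lower())
            else:
                fixed_words.append(word)
        fixed_lines.append(' '.join(fixed_words))
    return '\n'.join(fixed_lines)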
Your fixUppercase() function basically does this:
change every alphabetic word that is not already all lowercase, title case or all uppercase (i.e. mixed-case words such as "posSible") to all lowercase
However, you assume a document only contains '\n' and spaces as whitespace, so tabs (for example) would not survive your code. You could instead split the document into whitespace and non-whitespace runs using regular expressions.
Your main problem is caused by the inefficiency of fixUppercase, so a solution is to fix that.
This would do the same, but more efficiently:
import re

example = """
This is an example.

It Has:\ta fEw examples of thIngs that should be FIXED and CHANGED!

Don't touch this: a123B or this_Is_finE

Did it woRk?
"""

def fixedUpper(doc):
    p = re.compile(r'\s|([^\s]+)')
    # go through all the matches and join them back together into a string when done
    return ''.join(
        # lowercase for any alphabetic substring that does not contain whitespace and isn't a title or all uppercase
        m.group(1).lower() if not (m.group(1) is None or m.group(1).istitle() or m.group(1).isupper()) and m.group(1).isalpha()
        # in all other cases, just leave the match untouched
        else m.group(0)
        for m in p.finditer(doc)
    )

print(repr(fixedUpper(example)))
Output (note how it preserved the whitespace):
"\nThis is an example.\n\nIt Has:\ta few examples of things that should be FIXED and CHANGED!\n\nDon't touch this: a123B or this_Is_finE\n\nDid it woRk?\n"
Also note that this still has the same problem your code does: if there's punctuation at the end of a word, the word isn't fixed, as with woRk?
This is better:
def fixedUpper(doc):
    p = re.compile(r'\s|((\w+)([^\w\s]*))')
    return ''.join(
        m.group(1).lower()
        if not (m.group(2) is None or m.group(2).istitle() or m.group(2).isupper()) and m.group(2).isalpha()
        else m.group(0)
        for m in p.finditer(doc)
    )
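A quick check of my own, using part of the example string above, that trailing punctuation no longer prevents the fix:

print(repr(fixedUpper("Did it woRk?")))
# 'Did it work?'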
Related
I am trying to make a function that automatically generates a response to a selection of an action in a text adventure game. My problem is that I have to replace every second '_' with ' '. I have tried everything I can think of, and whenever I google the question the only solution I find is to use .replace(); however, .replace() replaces every instance of that character. Here is my code; could you please fix this for me and explain how you fixed it?
example_actions = ['[1] Search desk', '[2] Search Cupboard', '[3] Search yard']

def response(avaliable_actions):
    for i in avaliable_actions:
        print(i, end=' ')
        x = avaliable_actions.index(i)
        avaliable_actions[x] = avaliable_actions[x][4:]
    avaliable_actions = ' '.join(avaliable_actions)
    avaliable_actions = avaliable_actions.lower()
    avaliable_actions = avaliable_actions.replace(' ', '_')
    avaliable_actions = list(avaliable_actions)
    count = 0
    for i in avaliable_actions:
        if count == 2:
            count = 0
            index = avaliable_actions.index(i)
            avaliable_actions[index] = ' '
        elif i == '_':
            count += 1
    avaliable_actions = ' '.join(avaliable_actions)
    print('\n\n' + str(avaliable_actions))  # error checking
Here's one approach:
s = 'here_is_an_example_of_a_sentence'
tokens = s.split('_')
result = ' '.join('_'.join(tokens[i:i+2]) for i in range(0,len(tokens),2))
print(result)
The result:
here_is an_example of_a sentence
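A quick check with an extra test string of my own, showing that the slicing also copes with an odd number of pieces:

s = 'one_two_three'
tokens = s.split('_')
print(' '.join('_'.join(tokens[i:i+2]) for i in range(0, len(tokens), 2)))
# one_two three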
Did I understand you correctly that you want to produce something like this?
this_is_a_test -> this is_a test, or this_is a_test?
If so, adapt the following for your needs:
s = "this_is_just_a_test"

def replace_every_nth_char(string, char, replace, n):
    parts = string.split(char)
    result = ""
    for i, part in enumerate(parts):
        result += part
        if i % n == 0:
            result += replace
        else:
            result += char
    return ''.join(result)

res = replace_every_nth_char(s, "_", " ", 2)
print(s, "->", res)
# "this_is_just_a_test" -> "this is_just a_test"
I'm trying to solve a problem that can be found in the book The Coder's Apprentice by Pieter Spronck, in section 13.2.4. This is the code I wrote so far:
english_dutch = {"last":"laatst", "week":"week", "the":"de", "royal":"koninklijk",
"festival":"feast", "hall":"hal", "saw":"zaag", "first":"eerst", "performance":"optreden",
"of":"van", "a":"een", "new":"nieuw", "symphony":"symphonie", "by":"bij",
"one":"een", "world":"wereld", "leading":"leidend", "modern":"modern",
"composer":"componist", "composers:componisten" "two":"twee", "shed":"schuur", "sheds":"schuren"}
text = "Last week The Royal Festival Hall saw the first \
performance of a new symphony by one of the world's leading \
modern composers, Arthur 'Two-Sheds' Jackson."
def clean(t):
    t = t.lower()
    t = t.split()
    new_t = ""
    for word in t:
        new_word = ""
        for letter in word:
            if "a" <= letter <= "z":
                new_word += letter
            if letter == "-":
                new_word += " "
            else:
                continue
        new_t += new_word + " "
    return new_t

def translate(t):
    translation = ""
    for word in t.split():
        if english_dutch.get(word):
            translation += english_dutch[word] + " "
        else:
            translation += word + " "
    return translation

def auto_correct():
    news = ""
    a = translate(clean(text)).split()
    for word in a:
        if len(word) > 1:
            news += word + " "
    print(news)

auto_correct()
It seems to work OK, but when I run it, the words "composers" and "two" are not translated.
You forgot a comma between the word composers and the word two. In addition, you wrote "composers:componisten" instead of "composers":"componisten". Change your dictionary like so:
english_dutch = {"last":"laatst", "week":"week",
"the":"de", "royal":"koninklijk",
"festival":"feast", "hall":"hal",
"saw":"zaag", "first":"eerst",
"performance":"optreden",
"of":"van", "a":"een",
"new":"nieuw", "symphony":"symphonie",
"by":"bij",
"one":"een", "world":"wereld",
"leading":"leidend", "modern":"modern",
"composer":"componist",
"composers":"componisten", "two":"twee", # <- HERE
"shed":"schuur", "sheds":"schuren"}
Why did it pass undetected? Check this:
>>> {"composers:componisten" "two":"twee"}
{'composers:componistentwo': 'twee'}
Because the comma was missing and the colon was inside the string, Python concatenated the two string literals, creating a useless (but valid) key/value pair.
This behaviour is documented here:
Multiple adjacent string literals (delimited by whitespace), possibly using different quoting conventions, are allowed, and their meaning is the same as their concatenation. Thus, "hello" 'world' is equivalent to "helloworld".
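The same implicit concatenation bites in lists of strings too; a missing comma silently merges two items, for example:

stop_words = ['on', 'and'  # <- missing comma
              'of', 'as', 'for']
print(stop_words)
# ['on', 'andof', 'as', 'for']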
Is there another way to have exceptions when capitalizing an entire sentence? I've heard of the skipList method, but it didn't work for my code. See below:
string = input('Enter a string: ')
i = 0
tempString = ' '.join(s[0].upper() + s[1:] for s in string.split(' '))
result = ""
for word in tempString.split():
    if i == 0:
        result = result + word + " "
    elif (len(word) <= 2):
        result = result + word.lower() + " "
    elif (word == "And" or word == "The" or word == "Not"):
        result = result + word.lower() + " "
    else:
        result = result + word + " "
    i = i + 1
print ("\n")
print (result)
Sure. Write a complete list of words that should not be title-cased ("and", "the", "or", "not", etc), and title-case everything else.
skipwords = {'and', 'the', 'or', 'not'}  # the words you never want title-cased
words = s.split(' ')  # s is the input string
result = ' '.join([words[0]] + [w.title() if w not in skipwords else w for w in words[1:]])
Of course this will still miss Mr. Not's last name, which should be capitalized, and some stranger things like "McFinnigan" will be wrong, but language is hard. If you want better than that, you'll probably have to look into NLTK.
You could also rewrite it like this:
skip_words = {w.capitalize(): w for w in 'a in of or to and for the'.split()}
words = string.title().split()
result = ' '.join(skip_words.get(w, w) for w in words).capitalize()
I need to fix this program so that it removes the space before punctuation in the decompressed file. For example, when the original text is decompressed there is a space between the word and the punctuation.
example: cheese ,
should return cheese,
def RemoveSpace(ln): #subroutine used to remove the spaces after the punctuation
    line = ""
    line2 = ""
    puncpst = []
    for g in range(1, len(line)):
        if line[g] == "." or line[g] == "," or line[g] == "!" or line[g] == "?":
            puncpst.append(g) #get the positions of punctuation marks in a list
    for b in range(len(line)):
        if b + 1 not in puncpst:
            line2 = line2 + line[b]
    return line2
The reason the code does not work is the indentation after the if statement. Please correct the indentation as below:
if b + 1 not in puncpst:
    line2 = line2 + line[b]
Another way to handle it is to replace the space directly in the string (note that str.replace returns a new string, so the result must be assigned back):
line = line.replace(" .", ".")
line = line.replace(" ,", ",")
It sounds like your program should be like this:
def RemoveSpace(line):
    puncpst = []
    for g in range(1, len(line)):
        if line[g] == "." or line[g] == "," or line[g] == "!" or line[g] == "?":
            puncpst.append(g) #get the positions of punctuation marks in a list
    ret = ""
    for b in range(len(line)):
        if b + 1 not in puncpst:
            ret += line[b]
    return ret
Your original had def RemoveSpace(ln): but the ln parameter was never used.
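A quick check with the example from the question (my own call, not part of the original answer):

print(RemoveSpace("cheese ,"))
# cheese,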
An improved version, taking a lead from @v.coder, might be like this:
def RemoveSpace2(line):
    punctuation = ['.', ',', '!', '?']
    for p in punctuation:
        original = ' ' + p
        line = line.replace(original, p)
    return line
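And the same check for the replace-based version:

print(RemoveSpace2("cheese ,"))
# cheese,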
Problem statement: Write a function called censor that takes two strings, text and word, as input. It should return the text with the chosen word replaced with asterisks.
Here is my code:
def censor(text, word):
    i = 0
    j = 0
    ans = ""
    while i<len(text):
        while text[j] == word[j]:
            j = j + 1
        if text[j+1] == " " or j+1 == len(text):
            while i<j:
                ans += "*"
                i = i + 1
            ans += " "
            i = i + 1
        else:
            while text[j] != " ":
                j = j + 1
            while i<=j:
                ans += text[i]
                i = i + 1
        i = i + 1
        j = j + 1
    return ans
print censor("how are you? you are not fine.","you")
But I am getting the following error,
Traceback (most recent call last):
File "python", line 27, in <module>
File "python", line 7, in censor
IndexError: string index out of range
This is much more complicated than it needs to be. You can just do this:
def censor(text, censored_word):
    return text.replace(censored_word, '*'*len(censored_word))
>>> censor('How are you? Are you okay?', 'you')
'How are ***? Are *** okay?'
If you don't want the word "youth" to be censored but you do want "you" to be censored, here's how:
def censor(text, censored_word):
    repl = '*'*len(censored_word)
    return ' '.join([repl if word == censored_word else word for word in text.split()])
If you want to have multiple censored words:
def censor(text, censored_words):
    return ' '.join(['*'*len(word) if word in censored_words else word for word in text.split()])
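For example (my own test sentence; note that because the text is split on whitespace, a word with punctuation attached, such as 'you?', is not matched by these whole-word variants):

print(censor('you are not fine', ['you', 'fine']))
# *** are not ****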
When dealing with index errors, it is often helpful to print out the index and figure out why the index has a value not within the required bounds.
It's good to use Python's string replace for replacing substrings. In your case, you can make use of the word's length to match the word in the text:
def censor(text, word):
    i = 0
    j = 0
    ans = ""
    wl = len(word)
    while i<(len(text)):
        if word==text[i:i+wl]:
            ans=ans+'*'*wl
            i=i+wl
        else:
            ans=ans+text[i]
            i = i + 1
    return ans
print censor("how are you? you are not fine.","you")
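For reference, the call above prints:
how are ***? *** are not fine.
Note that because it matches raw substrings, it would also censor 'you' inside longer words:

print(censor("do you like your cat?", "you"))
# do *** like ***r cat?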