In order to fix a bunch of all-uppercase text files, I have written a script that:
Lowers all characters and capitalizes the first word of each line and the first word after a period.
Capitalizes all words that are in a list of city and country names (from another text file)
def lowit(line):
    """Lower-case *line*, then capitalize the first word of each sentence.

    Sentences are assumed to be separated by '. '. Empty fragments
    (an empty line, or consecutive periods) are passed through
    unchanged instead of raising IndexError on ``sentence[0]``.
    """
    line = line.lower()
    sentences = line.split('. ')
    # Guard: ''[0] would raise IndexError for an empty fragment.
    sentences2 = [s[0].capitalize() + s[1:] if s else s for s in sentences]
    string2 = '. '.join(sentences2)
    return string2
def capcico(line, allKeywords):
    """Capitalize every word of *line* that appears in *allKeywords*.

    Punctuation (',', '.', ';') and any trailing end-of-line characters
    are stripped before the keyword lookup, while the returned string
    keeps the original punctuation and newline.
    """
    allWords = line.split(' ')
    original = line.split(' ')
    for i, words in enumerate(allWords):
        words = words.replace(',', '')
        words = words.replace('.', '')
        words = words.replace(';', '')
        # BUG FIX (the issue discussed below): the last word of a line
        # read from a file carries the EOL, so e.g. the lookup key was
        # 'germany' plus a newline and never matched the keyword.
        words = words.rstrip('\r\n')
        if words in allKeywords:
            original[i] = original[i].capitalize()
    return ' '.join(original)
def main():
    """Read ulm.txt, fix its casing, and write the result to fixed.txt.

    Keywords (city/country names) are loaded from allist.txt, one per
    line. All files are managed with ``with`` so they are closed even on
    error (the original never closed the keyword file).
    """
    with open('allist.txt', 'r') as f:
        allKeywords = f.read().split('\n')
    with open('fixed.txt', 'w') as dfile, open('ulm.txt', 'r') as fileinput:
        for line in fileinput:
            low_line = lowit(line)
            # Each input line keeps its own '\n', so no separator is
            # prepended; the original "'\n' + line" produced a leading
            # blank line and doubled every newline.
            dfile.write(capcico(low_line, allKeywords))


if __name__ == '__main__':
    main()
It works, but the problem is that it doesn't capitalize a city/country if there is more than one in the same line:
TOWN IN WUERTTEMBERG, GERMANY.
changes to:
Town in Wuerttemberg, germany.
Any ideas as to what's wrong?
TNX
It is because "germany" is really "germany\n".
Strip the EOL off the word...
# Strip punctuation before the keyword lookup (same as in capcico above).
words = words.replace(',', '')
words = words.replace('.', '')
words = words.replace(';', '')
# Add in this line to strip the EOL
words = words.rstrip('\r\n')
# Alternative whole-file approach: lower-case everything, restore the
# capitalization of known keywords, then re-capitalize each sentence.
# FIX: the original opened all three files without ever closing them;
# every open is now wrapped in `with`. The algorithm is unchanged.

# Input
with open("ulm.txt") as src:
    fileinput = src.read()
##Input lower
filow = fileinput.lower()
#Keywords (one city/country name per line)
with open("allist.txt") as kwfile:
    allKeywords = kwfile.read().split("\n")
for kw in allKeywords:
    # NOTE(review): plain substring replace — a keyword occurring inside
    # a longer word is replaced there too.
    filow = filow.replace(kw.strip().lower(), kw.capitalize())
#Dots: capitalize the first word after every period.
fidots = filow.split(".")
for i, d in enumerate(fidots):
    c = d.strip().capitalize()
    dc = d.replace(c.lower(), c)
    fidots[i] = dc
#Result
with open("fixed.txt", "w") as dfile:
    result = ".".join(fidots)
    dfile.write(result)
Related
I have this project, which is an information search system.
I have an array called Corpus and I want to read the words from it and extract the irregular verbs from it and save it to another file.
I have another matrix containing irregular verbs.
I compared the two matrices and extracted the regular verbs and saved them in a file called "irregular_verbs".
But the problem is when the program is executed nothing is printed in the file "irregular_verbs".
Inside this file I called the corpus array in order to pass it and compare it with the array of regular verbs and if the verb is irregular it is put in the file "irregular_verbs".
second_request.py:
# Build the corpus once at import time: a list of token lists, one per file.
from firstRequest import read_corpus_file_and_delete_stop_words;
corpus = read_corpus_file_and_delete_stop_words();
# print(corpus)
z = []  # NOTE(review): never used below — candidate for removal
irregular_verbs = []  # filled by read_irregular_verbs() below
def read_irregular_verbs(path='C:/Users/Super/PycharmProjects/pythonProject/Files/irregular_verbs.txt'):
    """Return the list of irregular verbs read from *path*.

    Every whitespace-separated token in the file is one entry. The file
    location can now be passed in (the old hard-coded path remains the
    default, so existing calls keep working). A fresh local list is
    returned instead of appending into the module-level
    ``irregular_verbs`` global, which made repeated calls accumulate
    duplicate entries.
    """
    verbs = []
    with open(path, 'r') as file:
        for line in file:
            verbs.extend(line.split())
    return verbs
# print(read_irregular_verbs())
irregular_verbs_file = []  # NOTE(review): never used below
def access_irregular_verbs():
    """Write every corpus token that is an irregular verb to the output file.

    Compares each irregular verb against each token of the global
    ``corpus`` (a list of token lists). NOTE(review): ``word[0]`` is the
    FIRST CHARACTER of a token, so a whole verb ``t`` can only ever
    equal it when the token is a single character — this is the likely
    reason nothing is written to the output file.
    """
    irregular_verbs_list = read_irregular_verbs()
    for t in irregular_verbs_list:
        for words in corpus:
            for word in words:
                if t != word[0]:
                    continue
                else:
                    # Re-opened on every hit; 'a+' appends. No separator
                    # is written between verbs.
                    with open('../Files/irregular_verbs_output.txt', 'a+') as irregular_file:
                        irregular_file.write(t)
    return irregular_verbs_list
print(access_irregular_verbs())
Through this file, I went to a folder named Corpus that contains many files and I saved the elements of these lists in an array.
first.py:
def read_corpus_file_and_delete_stop_words():
    """Tokenize every document under ``dir_path`` with stop words removed.

    Returns one entry per file: the list of tokens not found in the
    combined NLTK + custom stop-word list.

    NOTE(review): relies on ``os``, ``stopwords`` (nltk.corpus) and
    ``word_tokenize`` (nltk.tokenize) being imported elsewhere in the
    original module — they are not imported in this excerpt.
    """
    stop_words_list = stopwords.words('english')
    additional_stopwords = []
    with open("C:/Users/Super/Desktop/IR/homework/Lab4/IR Homework/stop words.txt", 'r') as file:
        for word in file:
            # One stop word per line; split('\n') drops the newline.
            word = word.split('\n')
            additional_stopwords.append(word[0])
    stop_words_list += additional_stopwords
    dir_path = 'C:/Users/Super/Desktop/IR/homework/Lab4/corpus/corpus/'
    # save_dir = "C:/Users/Super/Desktop/IR/homework/Files_Without_SW/"
    files_without_sw = []
    for document in os.listdir(dir_path):
        with open(dir_path + document, "r") as reader:
            # save_file = open(save_dir + document, 'w')
            text = reader.read()
            # Turn common punctuation into spaces before tokenizing.
            text = text.replace('.', ' ').replace(',', ' ')
            text = text.replace(':', ' '). replace('?', ' ').replace('!', ' ')
            # NOTE(review): this replaces a single space with a single
            # space — a no-op; '  ' (two spaces) was probably intended.
            text = text.replace(' ', ' ') # convert double space into single space
            text = text.replace('"', ' ').replace('``', ' ')
            text = text.strip() # remove space at the end
            text_tokens = word_tokenize(text)
            tokens_without_sw = [word for word in text_tokens if
                                 (word not in stop_words_list)]
            # save_file.writelines(["%s " % item for item in tokens_without_sw])
            # print(document, ':', tokens_without_sw)
            files_without_sw.append(tokens_without_sw)
    return files_without_sw
print(read_corpus_file_and_delete_stop_words())
I have this simple code that reads a txt file and accepts a word from the user to check if that word is in the txt document or not. It looks like this works only for a single word. I have to modify this code so that the user can input two or more words. Example; GOING HOME instead of just HOME. Any help please.
# Prompt for a single word and report whether it occurs in the file.
word = input('Enter any word that you want to find in text File :')
f = open("AM30.EB","r")
tokens = f.read().split()
if word not in tokens:
    print('Word not found in Text File')
else:
    print('Word Found in Text File')
I'm not sure this is exactly what you are looking for
# Collect words from the user until they type "1", then report which of
# those words occur in the file.
f = open("AM30.EB","r")
word_list = []
while True:
    entry = input('Enter any word that you want to find in text File or 1 to stop entering words :')
    if entry == "1":
        break
    word_list.append(entry)
file_list = f.read().split()
for candidate in word_list:
    if candidate in file_list:
        print("Found word - {}".format(candidate))
These are case-sensitive solutions!
All words in query separately:
# Report whether ALL queried words occur in the file (case-sensitive).
words = input('Enter all words that you want to find in text File: ').split()
f_data = []
with open("AM30.EB", "r") as f:
    f_data = f.read().split()
# One boolean per queried word: does any file token equal it?
results = [any(tok == w for tok in f_data) for w in words]
print("Found ")
for i, w in enumerate(words):
    print(f"'{w}'", end="")
    if i < len(words) - 1:
        print("and", end="")
print(f": {all(results)}")
Any word in query:
# Report individually which of the queried words occur in the file.
words = input('Enter any word that you want to find in the text File: ').split()
f_data = []
with open("AM30.EB", "r") as f:
    f_data = f.read().split()
results = [any(tok == w for tok in f_data) for w in words]
if any(results):
    for i, w in enumerate(words):
        print(f"Found '{w}': {results[i]}")
Exact phrase in query:
# Report whether the exact phrase occurs anywhere in the file.
phrase = input('Enter a phrase that you want to find in the text File: ')
f_data = ""
with open("AM30.EB", "r") as f:
    f_data = f.read()
was_found = f_data.count(phrase) > 0
print(f"Found '{phrase}': {was_found}")
This is case sensitive and checks for each word individually. Not sure if this is what you were looking for but hope it helps!
# Case-sensitive, word-by-word search; lists every queried word found.
file1 = open('file.txt', 'r').read().split()
userInput = input('Enter any word or words that you want to find in text File :').split()
wordsFoundList = [w for w in userInput if w in file1]
if len(wordsFoundList) == 0:
    print("No words found in text file")
else:
    print("These words were found in text file: " + str(wordsFoundList))
I want to tally up the word freq. from text files. The issue I'm facing is that only the last word is tallied.
def main():
    """Tally word frequencies in a user-chosen text file.

    NOTE(review): ``wordcount = {word: count_1}`` REBINDS the dict on
    every iteration, so only the last distinct word survives — this is
    the bug discussed below (actual output: {'fun': 2}). The intended
    statement is ``wordcount[word] = count_1``.
    """
    rep = input("Enter a text file: ")
    infield = open(rep, 'r')
    s = infield.read()
    punctuation = [',',';','.',':','!',"'","\""]
    for ch in punctuation:
        s = s.replace(ch,' ')
    s = s.split()
    wordcount = {}
    for word in s:
        if word not in wordcount:
            count_1 = s.count(word)
            wordcount = {word:count_1}
            #s.append(w:s.count(w))
    print (wordcount)
main()
Expected: A tallied word count for words in a text file in a key-value format/ a dictionary.
Actual: {'fun': 2}
Fun is the last word of the text file and indeed comes up only twice.
Also, the indentation that is displayed isn't reflective of what I have.
Your problem is here:
wordcount = {word:count_1}
You're overwriting the dictionary on every loop iteration.
Make it:
wordcount[word] = count_1
Though, to be honest, the much better approach is to use the standard library's collections.Counter container.
def main():
    """Read a user-named file and print word counts, most common first."""
    import collections
    rep = input("Enter a text file: ")
    infield = open(rep, 'r')
    text = infield.read()
    # Turn punctuation into spaces so split() yields clean words.
    for mark in [',', ';', '.', ':', '!', "'", '"']:
        text = text.replace(mark, ' ')
    tokens = text.split()
    tally = collections.Counter(tokens)          # <===
    print(tally.most_common())                   # <===
main()
No point in manually doing something that is already done in the standard library (since Python 2.7):
# Counter-based tally using a single regex split instead of chained
# replace() calls.
from collections import Counter
import re
rep = input("Enter a text file: ")
infield = open(rep, 'r')
text = infield.read()
tokens = re.split(r'[ ,;.:!\'"]', text)
wordcount = Counter(tokens)
# re.split leaves '' entries between adjacent delimiters; drop them.
del wordcount['']
print (wordcount)
There is a difference between re.split() and string.split(): the former creates empty words when there are several delimiters in a row, the latter doesn't. That's why del wordcount['']
You had a couple of issues, but the most pressing one was this bit of code:
for word in s:
if word not in wordcount:
count_1 = s.count(word)
wordcount = {word:count_1}
You were setting wordcount to a single-key dictionary at every new word. This is how I would have written it...
def main():
    """Tally word frequencies with a plain dict, one increment per token."""
    punctuation = [',', ';', '.', ':', '!', "'", '"']
    rep = input("Enter a text file: ")
    with open(rep, 'r') as infield:
        text = infield.read()
    for mark in punctuation:
        text = text.replace(mark, ' ')
    wordcount = {}
    for token in text.split():
        if token in wordcount:
            wordcount[token] += 1
        else:
            wordcount[token] = 1
    print(wordcount)
main()
Use wordcount.update({word: count_1}) instead: wordcount = {word:count_1}.
Full example:
# coding: utf-8
PUNCTUATION = [',', ';', '.', ':', '!', "'", "\""]

if __name__ == '__main__':
    wordcount = {}
    rep = input("Enter a text file: ")
    infield = open(rep, 'r')
    text = infield.read()
    # Neutralize punctuation so words split cleanly.
    for mark in PUNCTUATION:
        text = text.replace(mark, ' ')
    tokens = text.split()
    for token in tokens:
        if token not in wordcount:
            # count() scans the whole list once per distinct word.
            occurrences = tokens.count(token)
            wordcount.update({token: occurrences})
    print(wordcount)
My below code is currently checking a text file to see if it can find words in a sentence from my lexicon file, if it does find one it then searches this line to see if it can find a word from a secondary list if both of these conditions are met in a line then this line is printed.
What I am trying to do is set the lexicon word colour to, for example, red, and the words found in the secondary list (CategoryGA) to blue; my purpose for this is to easily identify in the printout where each of the found words came from.
import re
import collections
from collections import defaultdict
from collections import Counter
import sys
from Categories.GainingAccess import GA
# Paths for the chat transcript and the lexicon word list.
Chatpath = "########/Chat1.txt"
Chatfile = Chatpath
lpath = 'Lexicons/######.txt'
lfile = lpath
CategoryGA = GA  # word list imported from Categories.GainingAccess
Hits = []  # filled by readfile() with every matched category word
"""
text_file = open(path, "r")
lines = text_file.read().split()
c = Counter(lines)
for i, j in c.most_common(50):
print(i, j)
"""
# class LanguageModelling:
def readfile():
Word_Hit = None
with open(Chatfile) as file_read:
content = file_read.readlines()
for line_num, line in enumerate(content):
if any(word in line for word in CategoryGA):
Word_Hit = False
for word in CategoryGA:
if line.find(word) != -1:
Word_Hit = True
Hits.append(word)
Cleanse = re.sub('<.*?>', '', line)
print('%s appeared on Line %d : %s' % (word, line_num, Cleanse))
file_read.close()
count = Counter(Hits)
count.keys()
for key, value in count.items():
print(key, ':', value)
def readlex():
    """Print each lexicon line with its leading 'count percent%' prefix removed."""
    with open(lfile) as l_read:
        l_content = l_read.readlines()
        for line in l_content:
            # Matches e.g. '31547 4.7072% ' at the start of a lexicon line.
            r = re.compile(r'^\d+\s+\d+\.\d+%\s*')
            l_Cleanse = r.sub('', line)
            print(l_Cleanse)
    l_read.close()  # redundant: `with` already closed the file
def LanguageDetect():
    """Print chat lines containing both a lexicon word and a category word.

    NOTE(review): ``lex_word`` and ``cat_word`` are loop variables of
    the generator expressions inside any(); they are NOT bound in the
    enclosing scope, so the two assignment lines below raise NameError
    when the condition is true — this is why the colouring never works.
    Also, ``lex_content`` lines still carry their numeric prefix and
    trailing newline (see the lexicon sample above), so a raw lexicon
    line rarely matches a chat line anyway — verify against the data.
    """
    with open(Chatfile) as c_read, open(lfile) as l_read:
        c_content = c_read.readlines()
        lex_content = l_read.readlines()
        for line in c_content:
            Cleanse = re.sub('<.*?>', '', line)
            if any(lex_word in line for lex_word in lex_content) \
                    and \
                    any(cat_word in line for cat_word in CategoryGA):
                lex_word = '\033[1;31m{}\033[1;m'.format(lex_word)
                cat_word = '\033[1;44m{}\033[1;m'.format(cat_word)
                print(Cleanse)
                # print(cat_word)
    c_read.close()  # redundant: `with` already closed both files
    l_read.close()

#readfile()
LanguageDetect()
# readlex()
This is my full code but the issue is occurring in the "LanguageDetect" method my current way of trying by assigning the lex_word & cat_word variables hasn't worked and frankly I'm stumped as to what to try next.
Lexicon:
31547 4.7072% i
25109 3.7466% u
20275 3.0253% you
10992 1.6401% me
9490 1.4160% do
7681 1.1461% like
6293 0.9390% want
6225 0.9288% my
5459 0.8145% have
5141 0.7671% your
5103 0.7614% lol
4857 0.7247% can
then within the readlex method i use:
r = re.compile(r'^\d+\s+\d+\.\d+%\s*')
l_Cleanse = r.sub('', line)
to remove everything before the word/character. I believe this may be the main issue as to why I can't colour the lexicon word, but I am unsure how to fix this.
I think your problem comes from the way you handle the line data, but maybe I did not understand your question clearly.
That should do the trick :
# Minimal demo: wrap every lexicon/category word pair found in `line`
# with ANSI colour escapes (red for lexicon, blue background for category).
lex_content = ['aaa', 'xxx']
CategoryGA = ['ccc', 'ddd']
line = 'abc aaa bbb ccc'
for found_lex in lex_content:
    for found_cat in CategoryGA:
        if not (found_lex in line and found_cat in line):
            continue
        print(found_lex, found_cat)
        line = line.replace(found_lex, '\033[1;31m' + found_lex + '\033[1;m')
        line = line.replace(found_cat, '\033[1;44m' + found_cat + '\033[1;m')
print(line)
Gives the output:
I am trying to write a program that opens a text document and replaces all four letter words with **. I have been messing around with this program for multiple hours now. I can not seem to get anywhere. I was hoping someone would be able to help me out with this one. Here is what I have so far. Help is greatly appreciated!
def censor():
    """Attempt to replace every four-letter word in a file with a mask.

    NOTE(review): broken, as discussed below — opening the same file
    with 'w' truncates it while it is still being read; len(words) is
    the number of words on the line, not a word's length; str.replace()
    is given a list; and ``alist``/``bob`` are undefined names.
    """
    filename = input("Enter name of file: ")
    file = open(filename, 'r')
    file1 = open(filename, 'w')
    for element in file:
        words = element.split()
        if len(words) == 4:
            file1 = element.replace(words, "xxxx")
            alist.append(bob)
    print (file)
    file.close()
Here is a revised version; I don't know if this is much better.
def censor():
    """Second attempt at masking four-letter words.

    NOTE(review): still broken — ``element.replace(i, "xxxx")`` passes
    an int index to str.replace (TypeError), the result rebinds the
    file-object name ``file1`` instead of being written, and opening
    the same file with 'w' truncates it before it is read.
    """
    filename = input("Enter name of file: ")
    file = open(filename, 'r')
    file1 = open(filename, 'w')
    i = 0
    for element in file:
        words = element.split()
        for i in range(len(words)):
            if len(words[i]) == 4:
                file1 = element.replace(i, "xxxx")
            i = i+1
    file.close()
for element in file:
words = element.split()
for word in words:
if len(word) == 4:
etc etc
Here's why:
say the first line in your file is 'hello, my name is john'
then for the first iteration of the loop: element = 'hello, my name is john'
and words = ['hello,','my','name','is','john']
You need to check what is inside each word thus for word in words
Also it might be worth noting that in your current method you do not pay any attention to punctuation. Note the first word in words above...
To get rid of punctuation rather say:
import string
blah blah blah ...
for word in words:
cleaned_word = word.strip(string.punctuation)
if len(cleaned_word) == 4:
etc etc
Here is a hint: len(words) returns the number of words on the current line, not the length of any particular word. You need to add code that would look at every word on your line and decide whether it needs to be replaced.
Also, if the file is more complicated than a simple list of words (for example, if it contains punctuation characters that need to be preserved), it might be worth using a regular expression to do the job.
It can be something like this:
def censor(filename=None):
    """Replace every four-letter word in *filename* with '**', in place.

    The file name may be passed directly; when omitted, the user is
    prompted (keeps the original interactive behavior). Whitespace is
    normalized to single spaces by split/join; each line is rewritten
    with a trailing newline.
    """
    if filename is None:
        filename = input("Enter name of file: ")
    with open(filename, 'r') as f:
        lines = f.readlines()
    newLines = []
    for line in lines:
        words = line.split()
        for i, word in enumerate(words):
            if len(word) == 4:
                # BUG FIX: the original used '==' (a no-op comparison)
                # instead of '=' here, so nothing was ever censored.
                words[i] = '**'
        newLines.append(' '.join(words))
    with open(filename, 'w') as f:
        for line in newLines:
            f.write(line + '\n')
def censor(filename):
    """Takes a file and writes it into file censored.txt with every 4-letter word replaced by xxxx.

    Punctuation marks are first turned into blanks so they cannot tie
    two words together; newline characters stuck to a word are not
    counted towards its length.
    """
    with open(filename) as infile:
        content = infile.read()
    outfile = open('censored.txt', 'w')
    # BUG FIX: str.maketrans(x, y) requires x and y to have equal
    # length; the original mapped six characters onto a single space
    # and raised ValueError. Map each punctuation mark to its own blank.
    table = content.maketrans('.,;:!?', '      ')
    noPunc = content.translate(table) #replace all punctuation marks with blanks, so they won't tie two words together
    wordList = noPunc.split(' ')
    for word in wordList:
        if '\n' in word:
            # Do not let embedded newlines inflate the word length.
            count = word.count('\n')
            wordLen = len(word)-count
        else:
            wordLen = len(word)
        if wordLen == 4:
            censoredWord = word.replace(word, 'xxxx ')
            outfile.write(censoredWord)
        else:
            outfile.write(word + ' ')
    outfile.close()