I need to create a program which removes punctuation and some specific words, collapses duplicates, and returns the remaining words along with the numbers of the lines they appear on. I also need to keep track of the duplicates. For instance,
Python IDLE
Indexer: type in lines, finish with a . at start of line only
It is a briskly blowing wind that blows
from the north, the North of my youth.
The wind is cold too, colder than the
winds of yesteryear.
.
The index is:
brisk 1
blow 1
wind 1, 3, 4
north 2
youth 2
cold 3
yesteryear 4
The Problem: I need to keep track of the line numbers of the remaining words, including their duplicates, and I'm not able to do that.
from string import *

stopWords = ["a", "i", "it", "am", "at", "on", "in", "to", "too", "very",
             "of", "from", "here", "even", "the", "but", "and", "is", "my",
             "them", "then", "this", "that", "than", "though", "so", "are"]

endings = ["es", "ed", "er", "ly"]

punctuation = [".", ",", ":", ";", "!", "?", "&", "'"]

unindexed_sentence = raw_input("type in lines, finish with a . at start of line only").lower()

# removing duplicates
def unique_string(l):
    ulist = []
    ulist2 = []
    [ulist.append(x) for x in l if x not in ulist]
    [ulist2.append(x)]
    global ulist2
    return ulist

unindexed_sentence = ' '.join(unique_string(unindexed_sentence.split()))
unindexed_sentence1 = split(unindexed_sentence, "\n")

list_unindexed = []

# splitting
i = 0
while i < len(unindexed_sentence1):
    list_unindexed += [split(unindexed_sentence1[i])]
    i += 1

countline = 0
i = 0
while i < len(list_unindexed):
    j = 0
    while j < len(list_unindexed[i]):
        if list_unindexed[i][j][0] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:0]
        if list_unindexed[i][j][-1] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-1] == "s":
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-2:] in endings:
            list_unindexed[i][j] = list_unindexed[i][j][:-2]
        if list_unindexed[i][j][-3:] == "ing":
            list_unindexed[i][j] = list_unindexed[i][j][:-3]
        if list_unindexed[i][j] in stopWords:
            del list_unindexed[i][j]
        else:
            j += 1
    i += 1
    countline += 1

def new_line(n):
    split(n, "\n")
    count = 1
    if n[-1] == "\n":
        count += 1
    return count

string1 = str(list_unindexed)
string2 = str(string1)
string2 = '\n'.join(unique_string(string2.split()))
print string2
Is it your homework?
Here are some tips:
Don't do from string import *. You don't need it.
Use data.splitlines() to get a list of lines.
Use enumerate() to get an index, e.g.: for i, line in enumerate(data.splitlines())
Use a dictionary to keep track of all words. Each value could be a list or a set of line numbers.
Don't remove duplicates up front. You can handle them with dictionaries or sets, as in the sketch below.
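Putting those tips together, a minimal sketch might look like this (stemming left out for brevity; the stop-word and punctuation lists are taken from the question, and the sample data is hard-coded in place of the input loop):

punctuation = ".,:;!?&'"
stopWords = ["a", "i", "it", "am", "at", "on", "in", "to", "too", "very",
             "of", "from", "here", "even", "the", "but", "and", "is", "my",
             "them", "then", "this", "that", "than", "though", "so", "are"]

data = """It is a briskly blowing wind that blows
from the north, the North of my youth."""

index = {}  # maps each word to the set of line numbers it appears on

for lineno, line in enumerate(data.splitlines(), 1):
    for word in line.lower().split():
        word = word.strip(punctuation)  # trim punctuation from both ends
        if word and word not in stopWords:
            index.setdefault(word, set()).add(lineno)

for word in sorted(index):
    print(word + " " + ", ".join(str(n) for n in sorted(index[word])))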
What am I doing wrong? I already defined it, but it keeps saying it's not defined.
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from wordcloud import WordCloud

# Pre-process the comments
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    stop_words.update(["a", "an", "and", "are", "as", "at", "be", "by",
                       "for", "from", "has", "he", "in", "is", "it", "its",
                       "of", "on", "that", "the", "to", "was", "were",
                       "will", "with"])
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words
comments_processed = [preprocess_text(comment) for comment in comments]
# Perform sentiment analysis on the comments
sentiments = []
for comment in comments:
    sentiment = TextBlob(comment).sentiment.polarity
    sentiments.append(sentiment)

# Identify the top 3 best and worst things about the product
positive_features = {}
negative_features = {}

for i in range(len(comments)):
    comment = comments[i]
    sentiment = sentiments[i]
    words = preprocess_text(comment)
    for word in words:
        if sentiment > 0:
            if word in positive_features:
                positive_features[word] += 1
            else:
                positive_features[word] = 1
        elif sentiment < 0:
            if word in negative_features:
                negative_features[word] += 1
            else:
                negative_features[word] = 1

top_positive_features = sorted(positive_features, key=positive_features.get, reverse=True)[:3]
top_negative_features = sorted(negative_features, key=negative_features.get, reverse=True)[:3]

# Visualize the results using word clouds
positive_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(positive_features)
negative_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(negative_features)
What is wrong here?
NameError
Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1612\1814734049.py in <module>
63
64 # Visualize the results using word clouds
---> 65 positive_cloud = WordCloud(width=800, height=800,
background_color='white', stopwords=stop_words,
min_font_size=10).generate_from_frequencies(positive_features)
66
67 negative_cloud = WordCloud(width=800, height=800,
background_color='white', stopwords=stop_words,
min_font_size=10).generate_from_frequencies(negative_features)
NameError: name 'stop_words' is not defined
Your stop_words is defined only inside the function preprocess_text(), so its scope is limited to that function.
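A minimal sketch of one possible fix, assuming the same stop list should be used everywhere: build stop_words once at module level, where both preprocess_text() and the WordCloud calls can see it.

from nltk.corpus import stopwords

# Module level: visible to preprocess_text() and to the WordCloud calls below.
stop_words = set(stopwords.words("english"))
stop_words.update(["a", "an", "and", "are", "as", "at", "be", "by",
                   "for", "from", "has", "he", "in", "is", "it", "its",
                   "of", "on", "that", "the", "to", "was", "were",
                   "will", "with"])

def preprocess_text(text):
    # ...same steps as before, just without the local stop_words definition...
    return [word for word in text.lower().split() if word not in stop_words]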
I am new to Python and I want to convert a text file into a JSON file.
Here's what it looks like:
#Q Three of these animals hibernate. Which one does not?
^ Sloth
A Mouse
B Sloth
C Frog
D Snake
#Q What is the literal translation of the Greek word Embioptera, which denotes an order of insects, also known as webspinners?
^ Lively wings
A Small wings
B None of these
C Yarn knitter
D Lively wings
#Q There is a separate species of scorpions which have two tails, with a venomous sting on each tail.
^ False
A True
B False
Contd
.
.
.
.
^ means Answer.
I want it in JSON format, as shown below.
Example:
{
    "questionBank": [
        {
            "question": "Grand Central Terminal, Park Avenue, New York is the worlds",
            "a": "largest railway station",
            "b": "Longest railway station",
            "c": "highest railway station",
            "d": "busiest railway station",
            "answer": "largest railway station"
        },
        {
            "question": "Eritrea, which became the 182nd member of the UN in 1993, is in the continent of",
            "a": "Asia",
            "b": "Africa",
            "c": "Europe",
            "d": "Oceania",
            "answer": "Africa"
        }, Contd.....
    ]
}
I came across a few similar posts and here's what I have tried:
dataset = "file.txt"
data = []
with open(dataset) as ds:
for line in ds:
line = line.strip().split(",")
print(line)
To which the output is:
['']
['#Q What part of their body do the insects from order Archaeognatha use to spring up into the air?']
['^ Tail']
['A Antennae']
['B Front legs']
['C Hind legs']
['D Tail']
['']
['#Q What is the literal translation of the Greek word Embioptera', ' which denotes an order of insects', ' also known as webspinners?']
['^ Lively wings']
['A Small wings']
['B None of these']
['C Yarn knitter']
['D Lively wings']
['']
Contd....
The sentences containing commas get split across several list elements. I tried to use .join, but didn't get the results I was expecting.
Please let me know how to approach this.
dataset = "text.txt"
question_bank = []
with open(dataset) as ds:
for i, line in enumerate(ds):
line = line.strip("\n")
if len(line) == 0:
question_bank.append(question)
question = {}
elif line.startswith("#Q"):
question = {"question": line}
elif line.startswith("^"):
question['answer'] = line.split(" ")[1]
else:
key, val = line.split(" ", 1)
question[key] = val
question_bank.append(question)
print({"questionBank":question_bank})
#for storing json file to local directory
final_output = {"questionBank":question_bank}
with open("output.json", "w") as outfile:
outfile.write(json.dumps(final_output, indent=4))
Rather than handling the lines one at a time, I went with a regex pattern approach.
This is also more reliable, as it will error out if the input data is in a bad format, rather than silently ignoring a grouping which is missing a field.
import re
import json

PATTERN = re.compile(r"""[#]Q (?P<question>.+)\n\^ (?P<answer>.+)\nA (?P<option_a>.+)\nB (?P<option_b>.+)\n(?:C (?P<option_c>.+)\n)?(?:D (?P<option_d>.+))?""")

def parse_qa_group(qa_group):
    """
    Extract question, answer and 2 to 4 options from the input string and return them as a dict.
    """
    # "group" here is a set of question, answer and options.
    matches = PATTERN.search(qa_group)

    # "group" here is a regex group.
    question = matches.group('question')
    answer = matches.group('answer')

    # optional named groups return None when they did not take part in the match
    c = matches.group('option_c')
    d = matches.group('option_d')

    results = {
        "question": question,
        "answer": answer,
        "a": matches.group('option_a'),
        "b": matches.group('option_b'),
    }
    if c:
        results['c'] = c
    if d:
        results['d'] = d
    return results

# Split into groups using the blank line.
qa_groups = question_answer_str.split('\n\n')

# Process each group, building up a list of all results.
all_results = [parse_qa_group(qa_group) for qa_group in qa_groups]

print(json.dumps(all_results, indent=4))
Further details in my gist. Read more on regex Grouping
I left out reading the text and writing a JSON file.
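If it helps, here is a sketch of those omitted parts (the file names are placeholders):

import json

# Read the raw text.
with open("questions.txt") as f:
    question_answer_str = f.read()

# ...run the split/parse from above to get all_results...

# Write the JSON file, wrapped the way the question's example expects.
with open("output.json", "w") as f:
    json.dump({"questionBank": all_results}, f, indent=4)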
I want to remove the bad characters from the troubled test_data list elements and append the fixed strings to a new list, stripped_test_data, but the script does not work.
The following code:
test_data = ["1912", "1929", "1913-1923",
             "(1951)", "1994", "1934",
             "c. 1915", "1995", "c. 1912",
             "(1988)", "2002", "1957-1959",
             "c. 1955.", "c. 1970's",
             "C. 1990-1999"]

bad_chars = ["(", ")", "c", "C", ".", "s", "'", " "]

def strip_characters(data):
    stripped_test_data = []
    for each in data:
        if bad_chars in each:
            tostr = str(each)
            adjusted = tostr.replace(bad_chars, "")
            stripped_test_data.append(tostr)
        else:
            adjusted = each
            stripped_test_data.append(each)
    return stripped_test_data

adjsuted_data = strip_characters(test_data)
when run throws the error:
TypeError                                 Traceback (most recent call last)
<ipython-input-1-d9d5a3a4542a> in <module>()
     20 return stripped_test_data
     21
---> 22 adjsuted_data = strip_characters(test_data)
     23
     24

<ipython-input-1-d9d5a3a4542a> in strip_characters(data)
     11 stripped_test_data = []
     12 for each in data:
---> 13 if bad_chars in each:
     14 tostr = str(each)
     15 adjusted = tostr.replace(bad_chars, "")

TypeError: 'in <string>' requires string as left operand, not list
Can you please help me understand what is wrong with the code and how to proceed?
str.strip can handle multiple characters:
bad_chars_joined = ''.join(bad_chars)
[t.strip(bad_chars_joined) for t in test_data]
Output:
['1912',
'1929',
'1913-1923',
'1951',
'1994',
'1934',
'1915',
'1995',
'1912',
'1988',
'2002',
'1957-1959',
'1955',
'1970',
'1990-1999']
Your code is trying to test membership against the entire list of bad characters when it writes bad_chars in each.
Try this:
test_data = ["1912", "1929", "1913-1923",
             "(1951)", "1994", "1934",
             "c. 1915", "1995", "c. 1912",
             "(1988)", "2002", "1957-1959",
             "c. 1955.", "c. 1970's",
             "C. 1990-1999"]

bad_chars = ["(", ")", "c", "C", ".", "s", "'", " "]

def strip_characters(data):
    stripped_test_data = []
    for each in data:
        # remove every bad character from the string, one character at a time
        for char in bad_chars:
            each = each.replace(char, "")
        stripped_test_data.append(each)
    return stripped_test_data

adjusted_data = strip_characters(test_data)
I am trying to clean a string using re.sub to reduce the text to a time. My initial string is "Durée : 1h30", and I want to delete "Durée : " to get this output: "1h30". However, with my current code the output is this list of strings: ["D", "u", "r", "é", "e", " ", ":", " ", "1", "h", "3", "0"].
for href in response.xpath("//div[@class='item']/a[@class='roll-2']//@href"):
    url = "https://www.louvre.fr" + href.extract()
    yield scrapy.Request(url, callback=self.parse_dir_contents)

lenght = response.xpath("//tbody/tr/td/text()").extract()[1]  # lenght = "Durée : 1h30"
item['lenght'] = [re.sub("Durée : ", "", le) for le in lenght]
Strings are iterable in Python, and inside the list comprehension you are iterating over each character of lenght and running re.sub on those characters separately.
Also, you don't need a regex here. Use str.replace:
item['lenght'] = [lenght.replace('Durée : ', '')]
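A quick check of that fix on its own, with the value taken from the question:

lenght = "Durée : 1h30"  # the string extracted from the page
item = {}
item['lenght'] = [lenght.replace('Durée : ', '')]
print(item['lenght'])  # ['1h30']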
I am currently programming an Artificial Intelligence in Python, with some basic code from ELIZA. I will improve on the code once I get it working. My problem is that when I run the program and enter a query to the computer, there is no response. My code is below.
import string

# OSWALD v1.0

switch = [
    ["I need \(.*\)",
     ["Why do you need %1?",
      "Would it REALLY help you to get %1?",
      "Are you sure you need %1?"]],
    # There is more code with responses.
]

gPats = {
    "am":     "are",
    "was":    "were",
    "i":      "you",
    "i'd":    "you would",
    "i've":   "you have",
    "i'll":   "you will",
    "my":     "your",
    "are":    "am",
    "you've": "I have",
    "you'll": "I will",
    "your":   "my",
    "yours":  "mine",
    "you":    "me",
    "me":     "you",
}

s = input
gKeys = map(lambda x: regex.compile(x[0]), gPats)
gValues = map(lambda x: x[1], gPats)

print("Hello, mortal. My name is Oswald. What would you like to talk about?")

while s == input:
    try: s = input(">")

    def translate(str, dict):
        words = string.split(string.lower(str))
        keys = dict.keys();
        for i in range(0, len(words)):
            if words[i] in keys:
                words[i] = dict[words[i]]
        return print(switch)

    def respond(str, keys, values):
        for i in range(0, len(keys)):
            if input == input:
                respnum = whrandom.randint(0, len(values[word]) - 1)
                resp = values[i][respnum]
                pos = string.find(resp, '%')
                print(string.find(resp, '%'))
                while pos > -1:
                    num = string.atoi(resp[pos+1:pos+2])
                    resp = resp[:pos] + \
                           translate(keys[i].group(num), gReflections) + \
                           resp[pos+2:]
                    pos = string.find(resp, '%')
                if resp[-2:] == '?.': resp = resp[:-2] + '.'
                if resp[-2:] == '??': resp = resp[:-2] + '?'
                print(string.find(resp, '%'))