I read an article named Unsupervised Personality Recognition for Social Network Sites about extracting personality from text. It uses 22 features and five classes representing five personality traits. By counting the features in a text we can tell which classes the text belongs to, which means we can infer the personality behind it.
The article provides a correlation coefficient between every feature and every class. So the score of a class is computed by taking each feature's value, subtracting the mean of that feature over all texts, dividing by the feature's standard deviation, multiplying by the correlation coefficient given in the article, and summing the results. We can then judge whether a text belongs to a class from its score. I set a threshold per class to improve accuracy, but it is still not good enough: my result is around 50-60% accuracy and I don't know how to improve it. Can anyone help me?
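Concretely, the scoring rule I mean is just a correlation-weighted sum of z-scores, roughly like this sketch (the feature values, means, standard deviations and correlations stand in for the ones computed below; it is not my full code):

def class_score(feature_values, means, stds, correlations):
    # z-score each feature and weight it by its correlation with the class
    score = 0.0
    for x, m, s, r in zip(feature_values, means, stds, correlations):
        score += r * (x - m) / s
    return score

# a text is then assigned to the class when class_score(...) exceeds that class's threshold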
import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev

with open('mypersonality_final.csv', newline = '') as csvfile:
    reader = csv.reader(csvfile)
    test = []
    for w in reader:
        test.append(w)
def all_punctuation(text):
    punctuations = ['.', ',', ';', ':']
    count = 0
    for w in text:
        if w in punctuations:
            count += 1
    return count

def count_commas(text):
    count = 0
    for w in text:
        if w == ',':
            count += 1
    return count

def count_pattern(text):
    grammar = RegexpTokenizer(r'\#')
    pattern = grammar.tokenize(text)
    return len(pattern)

def count_exclamation(text):
    grammar = RegexpTokenizer(r'\!')
    pattern = grammar.tokenize(text)
    return len(pattern)

def ex_links(text):
    grammar = RegexpTokenizer(r'http?\S+\w(?:(?:\/[^\s/]*))*|www\.\S+\w(?:(?:\/[^\s/]*))*|ftp\S+\w(?:(?:\/[^\s/]*))*')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_sinpronouns(text):
    sigpronouns = ['i', 'me', 'my', 'mine', 'we']
    count = 0
    for w in text:
        if w.lower() in sigpronouns:
            count += 1
    return count
def negative_particle(text):
    with open('negative-words.txt') as neg:
        neg = neg.read()
    words = nltk.word_tokenize(neg)
    grammar = RegexpTokenizer(r'\w+')
    nopunctuation = grammar.tokenize(text)
    count = 0
    for w in nopunctuation:
        if w.lower() in words:
            count += 1
    return count

def negative_emoticon(text):
    grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
    emoticons = grammar.tokenize(text)
    return len(emoticons)

def numbers(text):
    grammar = RegexpTokenizer(r'\d+')
    pattern = grammar.tokenize(text)
    return len(pattern)

def parenthesis(text):
    pat = '\([^)]*\)'
    parent = re.findall(pat, text)
    return len(parent)

def positive_emoticon(text):
    grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
    emoticons = grammar.tokenize(text)
    return len(emoticons)

def prepositions(text):
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        if w[1] == 'IN':
            count += 1
    return count

def pronouns(text):
    tagged = nltk.pos_tag(text)
    count = 0
    for w in tagged:
        if (w[1] == 'PRP' or w[1] == 'PRP$' or w[1] == 'WP' or w[1] == 'WPR$'):
            count += 1
    return count

def count_question(text):
    grammar = RegexpTokenizer(r'\?')
    pattern = grammar.tokenize(text)
    return len(pattern)

def long_words(text):
    grammar = RegexpTokenizer(r'\w{7,}')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_pronouns(text):
    firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in firstpronouns:
            count += 1
    return count

def swears_count(text):
    with open('swears.txt') as test:
        words = test.read()
    swears = re.sub(r'[^\w+\s]+', '', words)
    swears = swears.split('\n')
    count = 0
    for w in text:
        if w.lower() in swears:
            count += 1
    return count

def typetoken_ratio(text):
    typed = set(text)
    token = text
    ratio = len(typed)/len(token)
    return ratio

def count_words(text):
    grammar = RegexpTokenizer(r'\w+')
    pattern = grammar.tokenize(text)
    return len(pattern)

def firs_pluralpronouns(text):
    pluralpronouns = ['we', 'our', 'ours', 'us']
    count = 0
    for w in text:
        if w.lower() in pluralpronouns:
            count += 1
    return count

def sec_pronouns(text):
    secpronouns = ['you', 'your', 'yours']
    count = 0
    for w in text:
        if w.lower() in secpronouns:
            count += 1
    return count

def mean_freq(text):
    ## grammar = RegexpTokenizer(r'\w+')
    words = word_tokenize(text)
    wordsl = []
    for w in words:
        wordsl.append(w.lower())
    unique = set(wordsl)
    return (len(wordsl)/len(unique))
def mean_std(test):
    f1 = []
    f2 = []
    f3 = []
    f4 = []
    f5 = []
    f6 = []
    f7 = []
    f8 = []
    f9 = []
    f10 = []
    f11 = []
    f12 = []
    f13 = []
    f14 = []
    f15 = []
    f16 = []
    f17 = []
    f18 = []
    f19 = []
    f20 = []
    f21 = []
    f22 = []
    for w in test[1:]:
        f1.append(all_punctuation(word_tokenize(w[1])))
        f2.append(count_commas(word_tokenize(w[1])))
        f3.append(count_pattern(w[1]))
        f4.append(count_exclamation(w[1]))
        f5.append(ex_links(w[1]))
        f6.append(firs_sinpronouns(word_tokenize(w[1])))
        f7.append(negative_particle(w[1]))
        f8.append(negative_emoticon(w[1]))
        f9.append(numbers(w[1]))
        f10.append(parenthesis(w[1]))
        f11.append(positive_emoticon(w[1]))
        f12.append(prepositions(word_tokenize(w[1])))
        f13.append(pronouns(word_tokenize(w[1])))
        f14.append(count_question(w[1]))
        f15.append(long_words(w[1]))
        f16.append(firs_pronouns(word_tokenize(w[1])))
        f17.append(swears_count(word_tokenize(w[1])))
        f18.append(typetoken_ratio(word_tokenize(w[1])))
        f19.append(count_words(w[1]))
        f20.append(firs_pluralpronouns(word_tokenize(w[1])))
        f21.append(sec_pronouns(word_tokenize(w[1])))
        f22.append(mean_freq(w[1]))
    value = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22]
    mean1 = []
    stdev1 = []
    for a in value:
        mean1.append(round(mean(a),2))
        stdev1.append(round(stdev(a),2))
    return (mean1, stdev1)

##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()

savedfile = open('sample_value.pickle', 'rb')
trained = pickle.load(savedfile)
savedfile.close()
def evaluation(test):
    ne = 0
    ns = 0
    na = 0
    nc = 0
    no = 0
    for w in test[1:]:
        z1 = (all_punctuation(word_tokenize(w[1])) - trained[0][0])/(trained[1][0])
        z2 = (count_commas(word_tokenize(w[1]))- trained[0][1])/(trained[1][1])
        z3 = (count_pattern(w[1]) - trained[0][2])/(trained[1][2])
        z4 = (count_exclamation(w[1]) - trained[0][3])/(trained[1][3])
        z5 = (ex_links(w[1]) - trained[0][4])/(trained[1][4])
        z6 = (firs_sinpronouns(word_tokenize(w[1]))- trained[0][5])/(trained[1][5])
        z7 = (negative_particle(w[1])-trained[0][6])/(trained[1][6])
        z8 = (negative_emoticon(w[1]) - trained[0][7])/(trained[1][7])
        z9 = (numbers(w[1])-trained[0][8])/(trained[1][8])
        z10 = (parenthesis(w[1])-trained[0][9])/(trained[1][9])
        z11 = (positive_emoticon(w[1])-trained[0][10])/(trained[1][10])
        z12 = (prepositions(word_tokenize(w[1]))-trained[0][11])/(trained[1][11])
        z13 = (pronouns(word_tokenize(w[1]))-trained[0][12])/(trained[1][12])
        z14 = (count_question(w[1])-trained[0][13])/(trained[1][13])
        z15 = (long_words(w[1])-trained[0][14])/(trained[1][14])
        z16 = (firs_pronouns(word_tokenize(w[1]))-trained[0][15])/(trained[1][15])
        z17 = (swears_count(word_tokenize(w[1]))-trained[0][16])/(trained[1][16])
        z18 = (typetoken_ratio(word_tokenize(w[1]))-trained[0][17])/(trained[1][17])
        z19 = (count_words(w[1])-trained[0][18])/(trained[1][18])
        z20 = (firs_pluralpronouns(word_tokenize(w[1]))-trained[0][19])/(trained[1][19])
        z21 = (sec_pronouns(word_tokenize(w[1]))-trained[0][20])/(trained[1][20])
        z22 = (mean_freq(w[1])-trained[0][21])/(trained[1][21])
        E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
        S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
        A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
        C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
        O = -10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22
        if E>0.65:
            if w[7] =='y':
                ne+=1
        if E<0.65:
            if w[7]=='n':
                ne+=1
        if S>0.75:
            if w[8] == 'y':
                ns +=1
        if S<0.75:
            if w[8] == 'n':
                ns+=1
        if A>0.005:
            if w[9]=='y':
                na+=1
        if A<0.005:
            if w[9]=='n':
                na+=1
        if C>0.58:
            if w[10]=='y':
                nc+=1
        if C<0.58:
            if w[10]=='n':
                nc+=1
        if O>(-0.05):
            if w[11]=='y':
                no+=1
        if O<(-0.05):
            if w[11]=='n':
                no+=1
    print (round((ne/9917)*100,2), round((ns/9917)*100,2),round((na/9917)*100,2),round((nc/9917)*100,2),round((no/9917)*100,2))

evaluation(test)
The sample data is: (screenshot of mypersonality_final.csv omitted)
Related
from app import getPhonemes
import pandas as pd
import sys

triphones = []

def phonemize(sentence):
    tokens = sentence.split(' ')
    phonemes = getPhonemes(tokens)
    return '$'.join(phonemes)

def generateTriphones(phonemes):
    triphones = []
    for i in range(len(phonemes)):
        for j in range(len(phonemes)):
            for k in range(len(phonemes)):
                triphones.append(phonemes[i] + ' ' + phonemes[j] + ' ' + phonemes[k])
    return triphones

def scoreSentence(sentence,phonemes):
    flag = 0
    global triphones
    score = 0
    tokens = sentence.split('$')
    uniqueTokens = set(tokens)
    triphoneticTokens = [token for token in uniqueTokens if token.count(' ') > 1]
    for token in triphoneticTokens:
        for triphone in triphones:
            if token.find(triphone) != -1:
                score += 1
                triphones.remove(triphone)
    if triphones == []:
        flag = -1
    return score, flag

def Process(fil):
    global triphones
    file = open('itudict/vocab.phoneme', 'r',encoding='utf-8')
    data = []
    for line in file:
        data.append(line.strip())
    file.close()
    phonemes = data[4:]
    triphones = generateTriphones(phonemes)
    data = pd.read_csv(fil+'.csv')
    data = data.drop(['score','covered_vocab'],axis=1)
    i = 1
    while len(data) > 0:
        print('Processing File: '+str(i))
        sentencee = data[:10000]
        data = data[10000:]
        sentences = sentencee['sentence'].tolist()
        phonemes = []
        scores = []
        for j in range(len(sentences)):
            if j%1000 == 0:
                print('Processing Sentence: '+str(j))
                print(len(triphones))
            phones = phonemize(sentences[j])
            score, flag = scoreSentence(phones,phonemes)
            if flag == -1:
                data = []
            phonemes.append(phones)
            scores.append(score)
        data['Phonemes'] = phonemes
        data['score'] = scores
        data.to_csv(fil+'phonemized'+str(i)+'.csv', index=False)
        i += 1

if __name__ == '__main__':
    Process(sys.argv[1])
I am trying to generate phonemes for 800,000 sentences. The model I am using is G2P, which phonemizes each sentence; after phonemization I calculate the scores. The triphone list I use for scoring has 2,620,000 entries.
With 800,000 sentences the code is taking days. Can somebody parallelize this code or suggest some other solution?
I want to parallelize this code so it executes faster.
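For reference, most of the time seems to go into the linear scan over the 2,620,000-element triphone list inside scoreSentence. A set-based version (an untested sketch that keeps the same $-separated phoneme format, with the remaining triphones held in a set built from generateTriphones) avoids that scan:

def scoreSentence_fast(sentence, triphone_set):
    # look up each token's contiguous 3-phoneme windows in a set instead of
    # scanning the whole triphone list for every token
    score = 0
    for token in set(sentence.split('$')):        # one token per word
        phones = token.split(' ')
        if len(phones) < 3:
            continue
        for i in range(len(phones) - 2):          # every 3-phoneme window
            tri = ' '.join(phones[i:i + 3])
            if tri in triphone_set:
                score += 1
                triphone_set.discard(tri)         # count each triphone once, like the original
    flag = -1 if not triphone_set else 0
    return score, flag

Splitting the 800,000 sentences into chunks and running them through multiprocessing.Pool could still help, but the shared, shrinking triphone collection makes that harder to do safely than speeding up the scoring itself.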
The code I have written takes 5 different directories as input and runs 5 separate loops. What I want instead is to pass in a single directory, which contains 5 folders, and use a single loop.
Here is the code I have written now:
def find_threshold(dir1, dir2, dir3, dir4, dir5):
#Finding mean for Buisness
business_mean = 0
business_sum = 0
n = 0
index = 0
business_mean_list = []
for path, _, files in os.walk(dir1):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
filename_1 = filepath
for path, _, files in os.walk(dir1):
for file_name in files:
filepath = os.path.join(path, file_name)
#print(f"Checking --> {filepath}")
filename_2 = filepath
# Program to measure the similarity between
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
#filename_1 = "buisness1.txt"
#filename_2 = "world1.txt"
A = open(filename_1, encoding='utf-8')
B = open(filename_2, encoding='utf-8')
X = A.read()
Y = B.read()
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
n += 1
if cosine != 0:
#print("similarity: ", cosine)
#mean += np.mean(cosine)
business_sum += cosine
#print("sum: ", business)
#print("n: ", n)
#else:
#print("similarity is zero")
business_mean = business_sum/n
#print(index)
business_mean_list.insert(index, business_mean)
index += 1
#print("business_mean: ", business_mean)
#print("business mean list: ", mean_list)
business_threshold = min(business_mean_list)
#Finding mean for Entertainment
entertainment_mean = 0
entertainment_sum = 0
n = 0
index = 0
entertainment_mean_list = []
for path, _, files in os.walk(dir2):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
filename_1 = filepath
for path, _, files in os.walk(dir2):
for file_name in files:
filepath = os.path.join(path, file_name)
#print(f"Checking --> {filepath}")
filename_2 = filepath
# Program to measure the similarity between
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
#filename_1 = "buisness1.txt"
#filename_2 = "world1.txt"
A = open(filename_1, encoding='utf-8')
B = open(filename_2, encoding='utf-8')
X = A.read()
Y = B.read()
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
n += 1
if cosine != 0:
#print("similarity: ", cosine)
#mean += np.mean(cosine)
entertainment_sum += cosine
#print("sum: ", entertainment_sum)
#print("n: ", n)
#else:
#print("similarity is zero")
entertainment_mean = entertainment_sum/n
#print(index)
entertainment_mean_list.insert(index, entertainment_mean)
index += 1
#print("entertainment_mean: ", entertainment_mean)
#print("entertainment mean list: ", mean_list)
entertainment_threshold = min(entertainment_mean_list)
#Finding mean for local
local_mean = 0
local_sum = 0
n = 0
index = 0
local_mean_list = []
for path, _, files in os.walk(dir3):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
filename_1 = filepath
for path, _, files in os.walk(dir3):
for file_name in files:
filepath = os.path.join(path, file_name)
#print(f"Checking --> {filepath}")
filename_1 = filepath
# Program to measure the similarity between
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
#filename_1 = "buisness1.txt"
filename_2 = "world1.txt"
A = open(filename_1, encoding='utf-8')
B = open(filename_2, encoding='utf-8')
X = A.read()
Y = B.read()
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
n += 1
if cosine != 0:
#print("similarity: ", cosine)
#mean += np.mean(cosine)
local_sum += cosine
#print("sum: ", local_sum)
#print("n: ", n)
#else:
#print("similarity is zero")
local_mean = local_sum/n
#print(index)
local_mean_list.insert(index, local_mean)
index += 1
#print("local_mean: ", local_mean)
#print("local mean mean list: ", mean_list)
local_threshold = min(local_mean_list)
#Finding mean for sports
sports_mean = 0
sports_sum = 0
n = 0
index = 0
sports_mean_list = []
for path, _, files in os.walk(dir4):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
filename_1 = filepath
for path, _, files in os.walk(dir4):
for file_name in files:
filepath = os.path.join(path, file_name)
#print(f"Checking --> {filepath}")
filename_1 = filepath
# Program to measure the similarity between
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
#filename_1 = "buisness1.txt"
filename_2 = "world1.txt"
A = open(filename_1, encoding='utf-8')
B = open(filename_2, encoding='utf-8')
X = A.read()
Y = B.read()
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
n += 1
if cosine != 0:
#print("similarity: ", cosine)
#mean += np.mean(cosine)
sports_sum += cosine
#print("sum: ", sports_sum)
#print("n: ", n)
#else:
#print("similarity is zero")
sports_mean = sports_sum/n
#print(index)
sports_mean_list.insert(index, sports_mean)
index += 1
#print("sports mean: ", sports_mean)
#print("sports mean list: ", mean_list)
sports_threshold = min(sports_mean_list)
#Finding mean for world
world_mean = 0
world_sum = 0
n = 0
index = 0
world_mean_list = []
for path, _, files in os.walk(dir5):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
filename_1 = filepath
for path, _, files in os.walk(dir5):
for file_name in files:
filepath = os.path.join(path, file_name)
#print(f"Checking --> {filepath}")
filename_1 = filepath
# Program to measure the similarity between
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
#filename_1 = "buisness1.txt"
filename_2 = "world1.txt"
A = open(filename_1, encoding='utf-8')
B = open(filename_2, encoding='utf-8')
X = A.read()
Y = B.read()
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
n += 1
if cosine != 0:
#print("similarity: ", cosine)
#mean += np.mean(cosine)
world_sum += cosine
#print("sum: ", world_sum)
#print("n: ", n)
#else:
#print("similarity is zero")
world_mean = world_sum/n
#print(index)
world_mean_list.insert(index, world_mean)
index += 1
#print("world mean: ", world_mean)
#print("world mean list: ", mean_list)
world_threshold = min(world_mean_list)
return (business_threshold, entertainment_threshold, local_threshold, sports_threshold, world_threshold)
As you can see, I'm passing 5 directories to the method find_threshold and doing the cosine calculation for the 5 directories in 5 separate loops, each of which produces one value, so the method returns 5 values. What I want is to give find_threshold a single directory (it has 5 folders with 100 text files each) and get the same output.
Something like:
def find_threshold(dir):
Can anyone help me write the code for this?
Take any one of your sections and make find_threshold() handle just one directory at a time. Then, when the user specifies a parent folder, you can search it for direct child directories and call find_threshold() on each of them.
Maybe something like this (totally untested):
def find_threshold_by_parent(parent_directory):
    # join with the parent so both the isdir() check and the call get full paths, not bare names
    return [find_threshold(os.path.join(parent_directory, d))
            for d in os.listdir(parent_directory)
            if os.path.isdir(os.path.join(parent_directory, d))]

def find_threshold(child_directory):
    # Finding the mean for one category (e.g. business)
    mean = 0
    total = 0   # running sum of cosines; not called "sum" so the built-in sum() stays usable below
    n = 0
    index = 0
    mean_list = []
    for path, _, files in os.walk(child_directory):
        for file_name in files:
            filepath = os.path.join(path, file_name)
            print(f"Checking --> {filepath}")
            filename_1 = filepath
            for path, _, files in os.walk(child_directory):
                for file_name in files:
                    filepath = os.path.join(path, file_name)
                    #print(f"Checking --> {filepath}")
                    filename_2 = filepath
                    # Measure the similarity between two documents using cosine similarity.
                    from nltk.corpus import stopwords
                    from nltk.tokenize import word_tokenize
                    # X = input("Enter first string: ").lower()
                    # Y = input("Enter second string: ").lower()
                    #filename_1 = "buisness1.txt"
                    #filename_2 = "world1.txt"
                    A = open(filename_1, encoding='utf-8')
                    B = open(filename_2, encoding='utf-8')
                    X = A.read()
                    Y = B.read()
                    # tokenization
                    X_list = word_tokenize(X)
                    Y_list = word_tokenize(Y)
                    # sw contains the list of stopwords
                    sw = stopwords.words('english')
                    l1 = []; l2 = []
                    # remove stop words from the string
                    X_set = {w for w in X_list if not w in sw}
                    Y_set = {w for w in Y_list if not w in sw}
                    # form a set containing keywords of both strings
                    rvector = X_set.union(Y_set)
                    for w in rvector:
                        if w in X_set: l1.append(1)  # create a vector
                        else: l1.append(0)
                        if w in Y_set: l2.append(1)
                        else: l2.append(0)
                    c = 0
                    # cosine formula
                    for i in range(len(rvector)):
                        c += l1[i]*l2[i]
                    cosine = c / float((sum(l1)*sum(l2))**0.5)
                    n += 1
                    if cosine != 0:
                        #print("similarity: ", cosine)
                        #mean += np.mean(cosine)
                        total += cosine
                        #print("sum: ", total)
                        #print("n: ", n)
                    #else:
                        #print("similarity is zero")
            mean = total/n
            #print(index)
            mean_list.insert(index, mean)
            index += 1
            #print("mean: ", mean)
            #print("mean list: ", mean_list)
    threshold = min(mean_list)
    return threshold
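Untested as well, but calling it would look something like this (the folder name is just an example):

# parent folder whose direct sub-folders are the five categories
thresholds = find_threshold_by_parent("news_data")
print(thresholds)  # one threshold per sub-folder, in os.listdir() order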
My Viterbi program has exponential running time. Can you help me find the place I can change to make it a dynamic program? I only need to remember and use the 2 previous tags of the words.
Thanks a lot.
from collections import defaultdict
import sys
import re
import feature_maker as fm
bla = ''
all_states = set()
#distirbuition over all of the corpus
POS_probability = fm.load_obj('probas')
POS_probability['START'] = 1.0
def cpd_tagwords(words, tag):
pattern = re.compile("\W")# to check for .,: etc.
if pattern.match(words) and tag == words:
return 1
elif pattern.match(tag):
return 0
for word in emle.split("\n"):
if word.__contains__(words) and word.__contains__(tag):
return word[word.index(":") + 2:]
#if we dont have data about the word with the tag,just retturn the probability
#to get the tag over all of the word in the corpus.
return POS_probability[tag]
def cpd_tags(early, prev, current):
lambda1 = 0
lambda3 = 0
lambda6 = 0
for word in qmle.split("\n"):
word1 = word.split()
if len(word1) > 0:
if word1[0].__contains__(current): #for tuple of 1
if len(word1) == 2:
lambda1 = word[word.index("]:") + 3:]
if len(word1) > 2 and word1[1].__contains__(prev): #for tuple of 2
if len(word1) == 3:
lambda3 = word[word.index("]:") + 3:]
if len(word1) > 3 and word1[2].__contains__(early): #for tuple of 3
if len(word1) == 4:
lambda6 = word[word.index("]:") + 3:]
return (0.6*float(lambda6)) + (0.3*float(lambda3)) + (0.1*float(lambda1))
#map: popular_copuler['POS'] = list of all pos that can come before it.
popular_copules = fm.load_obj('popular_copules')
# Viterbi Algo
def viterbi(sentence, tags1):
def findSet(index,tag):
if tag == 'ALL':
return tags1
if index in range(1, len(sentence) + 1):
possible_tags = set(popular_copules[tag])
if possible_tags == set([]):
return tags1
return set(popular_copules[tag])
elif index == 0 or index == -1:
return {'START'}
# stores (word:tag) in this whole sentence
sentence_with_tag = defaultdict(str)
# inner function to commpute pi values--start
def pi_viterbi(k, u, v, sentence):#here is the start of the bad sequence
prob = defaultdict(float)
# initialization
if k == 0 and u == 'START' and v == 'START':
return (1., 'START')
else:
for w in findSet(k - 2,u):
prev = pi_viterbi(k - 1, w, u, sentence)[0]
# tuple((w,u,v))
q = cpd_tags(w, u, v)
e = cpd_tagwords(sentence[k - 1].lower(), v)
probability = float(prev) * q * float(e)
prob[tuple((w, u))] = probability
#here is the end of the bad sequence
max_tuple = max(prob.items(), key=lambda x: x[1])
# print (max_tuple[1],max_tuple[0][0])
return max_tuple[1], max_tuple[0][0]
# inner function to commpute pi values--end
sentence_with_tag = list()
backpointer = defaultdict(str)
tags = defaultdict(str)
k = len(sentence)
u_glob = ''
v_glob = ''
glob = 0.
for i in range(1, k + 1):
prob = defaultdict(float)
#for current word we check all the tags
""" changed from for u in findSet(i - 1):"""
for u in findSet(i ,'ALL'):
#going backwards we call findset with u so it gives us only
# tags v that go togeter alot with u(this is purnnig)
""" changed from for v in findSet(i)"""
for v in findSet(i-1,u_glob):
#siwtched u and v
value, w = pi_viterbi(i, v, u, sentence)#the v recursion in the algorithm
prob[tuple((i, u, v))] = value
backpointer[tuple((i, u, v))] = w #bp from the algorithm
max_tuple = max(prob.items(), key=lambda x: x[1])
backpointer[tuple((i, max_tuple[0][1], max_tuple[0][-1]))] = max_tuple[0][1] # bp (k,u,v)= tag w
# sentence_with_tag.append(max_tuple[0][-1])
u_glob = max_tuple[0][-2]
v_glob = max_tuple[0][-1]
glob = max_tuple[1]
print ('Max', max_tuple)
tags[k - 1] = u_glob
tags[k] = v_glob
for i in range((k - 2), 0, -1):
tag = backpointer[tuple(((i + 2), tags[i + 1], tags[i + 2]))]
tags[i] = tag
tag_list = list()
for i in range(1, len(tags) + 1):
tag_list.append(tags[i])
file = open(sys.argv[4], 'w')
file.truncate()
for word in tag_list:
file.write(word)
# tag list as results
return tag_list
file=open(sys.argv[1],"r+")
fQ = open(sys.argv[2], 'r')
qmle = fQ.read()
fQ.close()
f = open("tags.txt",'r+')
tags = f.read()
f.close()
fe = open(sys.argv[3], 'r')
emle = fe.read()
distinct_tags = set()
# what is the list of all tags?
for word in tags.split():
distinct_tags.add(word)
sentence = []
sentence1 = []
sentence1 = file.read()
sentence = sentence1.split()
file.close()
file = open(sys.argv[4], 'w')
file.truncate()
viterbi(sentence, distinct_tags)
how can I reduce the time complexity?
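The blow-up comes from pi_viterbi recomputing the same (k, u, v) subproblems for every caller; caching them is the usual dynamic-programming fix. A rough sketch of the idea (not drop-in code; it assumes sentence, findSet, cpd_tags and cpd_tagwords are visible in the enclosing scope, as they are inside viterbi above):

from functools import lru_cache

@lru_cache(maxsize=None)
def pi(k, u, v):
    # each (k, u, v) subproblem is computed once and then reused
    if k == 0 and u == 'START' and v == 'START':
        return 1.0, 'START'
    best_prob, best_w = 0.0, 'START'
    for w in findSet(k - 2, u):
        prev_prob, _ = pi(k - 1, w, u)
        p = float(prev_prob) * float(cpd_tags(w, u, v)) * float(cpd_tagwords(sentence[k - 1].lower(), v))
        if p >= best_prob:
            best_prob, best_w = p, w
    return best_prob, best_w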
I have some problems with my code: it throws an error when I run it. I'm using Python.
Here's my code:
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import SklearnClassifier
import csv
from sklearn import cross_validation
from sklearn.svm import LinearSVC, SVC
import random
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
posdata = []
with open('positive-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
posdata.append(val[0])
negdata = []
with open('negative-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
negdata.append(val[0])
def word_split(data):
data_new = []
for word in data:
word_filter = [i.lower() for i in word.split()]
data_new.append(word_filter)
return data_new
def word_split_sentiment(data):
data_new = []
for (word, sentiment) in data:
word_filter = [i.lower() for i in word.split()]
data_new.append((word_filter, sentiment))
return data_new
def word_feats(words):
return dict([(word, True) for word in words])
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))
def stopword_filtered_word_feats(words):
return dict([(word, True) for word in words if word not in stopset])
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset])
# Calculating Precision, Recall & F-measure
def evaluate_classifier(featx):
negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = 'svm'
for cl in classifier:
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats)
pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
print ''
print '---------------------------------------'
print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', accuracy
print 'precision', (pos_precision + neg_precision) / 2
print 'recall', (pos_recall + neg_recall) / 2
print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
print ''
## CROSS VALIDATION
trainfeats = negfeats + posfeats
# SHUFFLE TRAIN SET
random.shuffle(trainfeats)
n = 5
for cl in classifier_list:
subset_size = len(trainfeats) / n
accuracy = []
pos_precision = []
pos_recall = []
neg_precision = []
neg_recall = []
pos_fmeasure = []
neg_fmeasure = []
cv_count = 1
for i in range(n):
testing_this_round = trainfeats[i*subset_size:][:subset_size]
training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(training_this_round)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_this_round):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
accuracy.append(cv_accuracy)
pos_precision.append(cv_pos_precision)
pos_recall.append(cv_pos_recall)
neg_precision.append(cv_neg_precision)
neg_recall.append(cv_neg_recall)
pos_fmeasure.append(cv_pos_fmeasure)
neg_fmeasure.append(cv_neg_fmeasure)
cv_count += 1
print '---------------------------------------'
print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', sum(accuracy) / n
print 'precision', (sum(pos_precision)/n + sum(neg_precision)/n) / 2
print 'recall', (sum(pos_recall)/n + sum(neg_recall)/n) / 2
print 'f-measure', (sum(pos_fmeasure)/n + sum(neg_fmeasure)/n) / 2
print ''
evaluate_classifier(word_feats)
It's supposed to do sentiment analysis on a CSV document using an SVM, but when I run the code I get an error. Does anyone have an idea how to fix it?
I really need your help.
I am doing sentiment analysis of movie reviews in Python with scikit-learn and NLTK. I want to set the feature-vector elements for a unigram to 0 (when it has the opposite polarity) whenever a bigram/trigram containing that unigram is non-zero.
For example, for
movie is not bad
the feature vector is ['movie' 'is' 'not' 'bad' 'movie is' 'is not' 'not bad'] = [3 3 1 1 4 2 4],
but I want [3 3 0 0 4 2 4] instead.
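The post-processing I have in mind is roughly this sketch (it ignores the opposite-polarity check for brevity and simply zeroes every unigram that appears inside a non-zero longer n-gram; the vectorizer settings are an example, not my actual setup):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["movie is not bad"]
vec = CountVectorizer(ngram_range=(1, 2))
X = vec.fit_transform(docs).toarray().astype(float)
names = vec.get_feature_names_out()  # get_feature_names() on older scikit-learn

unigram_idx = {name: i for i, name in enumerate(names) if " " not in name}
multi_idx = [i for i, name in enumerate(names) if " " in name]

for row in X:                        # rows are views, so X is modified in place
    for j in multi_idx:
        if row[j] != 0:
            for word in names[j].split(" "):
                if word in unigram_idx:
                    row[unigram_idx[word]] = 0.0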
code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import math
#######################Reading Training Review Phrases and Sentiments###################################
train_list = []
train_sentiment = []
with open('sentences.txt') as f:
content = f.readlines()
for sentence in content:
train_list.append(sentence.rstrip('\n').split("\t")[0])
train_sentiment.append(sentence.rstrip('\n').split("\t")[1])
#######################Number of phrases in each class###################################
ex_pos = pos = neu = neg = ex_neg = 0
ex_pos_phrases = pos_phrases = neu_phrases = neg_phrases = ex_neg_phrases = []
with open('ex_pos.txt', 'r') as ex_posF:
ex_pos_phrases = ex_posF.readlines()
ex_pos = len(ex_pos_phrases)
with open('pos.txt', 'r') as posF:
pos_phrases = posF.readlines()
pos = len(pos_phrases)
with open('neu.txt', 'r') as neuF:
neu_phrases = neuF.readlines()
neu = len(neu_phrases)
with open('neg.txt', 'r') as negF:
neg_phrases = negF.readlines()
neg = len(neg_phrases)
with open('ex_neg.txt', 'r') as ex_negF:
ex_neg_phrases = ex_negF.readlines()
ex_neg = len(ex_neg_phrases)
print(str(ex_neg) + "," + str(neg) + "," + str(neu) + "," + str(pos) + "," + str(ex_pos))
####################### Getting unique Words ###################################
unique_words = []
model = TfidfVectorizer(input=train_list)
train_tfidf = model.fit_transform(train_list)
unique_words = model.get_feature_names()
print("##### Word sentiment matrix ####")
########################## Word sentiment matrix ########################################
word_sentiment = [[0 for x in range(5)] for x in range(len(unique_words)) ]
wordcount = 0
for word in unique_words:
count = 0
for review in ex_neg_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][0] = count
count = 0
for review in neg_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][1] = count
count = 0
for review in neu_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][2] = count
count = 0
for review in ex_pos_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][4] = count
count = 0
for review in pos_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][3] = count
wordcount += 1
print("###The Training feature matrix###")
#################################The feature matrix#######################################
feature_matrix = [[0 for x in range(len(unique_words))] for x in range(len(train_list))]
print(len(feature_matrix))
print(len(feature_matrix[0]))
wordcount = 0
for unique_word in unique_words:
phrasecount = 0
ep = p = nu = en = n = 0
if word_sentiment[wordcount][4] != 0:
ep = .35 * math.log(word_sentiment[wordcount][4]/ex_pos)
if word_sentiment[wordcount][3] != 0:
p = .15 * math.log(word_sentiment[wordcount][3]/pos)
if word_sentiment[wordcount][2] != 0:
nu = 1 * math.log(word_sentiment[wordcount][2]/neu)
if word_sentiment[wordcount][0] != 0:
en = -.35 * math.log(word_sentiment[wordcount][0]/ex_neg)
if word_sentiment[wordcount][1] != 0:
n = -.15 * math.log(word_sentiment[wordcount][1]/neg)
for phrase in train_list:
words = phrase.split(" ")
docwordcount = 0
for word in words:
if word == unique_word:
docwordcount += 1
tfidf = (docwordcount * ep) + (docwordcount * p) + (docwordcount * nu) + (docwordcount * en) + (docwordcount * n)
feature_matrix[phrasecount][wordcount] = tfidf
phrasecount += 1
wordcount += 1
print("###The test feature matrix###")
test_list=[]
test_phraseid =[]
with open('sentences_test.txt') as f:
content = f.readlines()
for sentence in content:
test_list.append(sentence.rstrip('\n').split("\t")[0])
test_phraseid.append(sentence.rstrip('\n').split("\t")[1])
wordcount = 0
test_tfidf = [[0 for x in range(len(unique_words))] for x in range(len(test_list))]
for unique_word in unique_words:
phrasecount = 0
ep = p = nu = en = n = 0
if word_sentiment[wordcount][4] != 0:
ep = .35 * math.log(word_sentiment[wordcount][4]/ex_pos)
if word_sentiment[wordcount][3] != 0:
p = .15 * math.log(word_sentiment[wordcount][3]/pos)
if word_sentiment[wordcount][2] != 0:
nu = 1 * math.log(word_sentiment[wordcount][2]/neu)
if word_sentiment[wordcount][0] != 0:
en = -.35 * math.log(word_sentiment[wordcount][0]/ex_neg)
if word_sentiment[wordcount][1] != 0:
n = -.15 * math.log(word_sentiment[wordcount][1]/neg)
for phrase in test_list:
words = phrase.split(" ")
docwordcount = 0
for word in words:
if word == unique_word:
docwordcount += 1
tfidf = (docwordcount * ep) + (docwordcount * p) + (docwordcount * nu) + (docwordcount * en) + (docwordcount * n)
test_tfidf[phrasecount][wordcount] = tfidf
phrasecount += 1
wordcount += 1
print("###The Linear SVC ###")
self = LinearSVC()
self = LinearSVC.fit(self, feature_matrix, train_sentiment)
test_sentiment = LinearSVC.predict(self, test_tfidf)
with open('output_deltatfidf.csv', 'w') as fil:
fil.write("PhraseId,Sentiment\n")
for x in range(0, len(test_sentiment)):
fil.write(test_phraseid[x] + "," + test_sentiment[x] + "\n")