I import my function using myfuncs but it fails I did test the same file paths only slightly different file name but it fails. The python code was 5 years old not sure if something changed or not.
I did go to this thread and I did the test and it did works successfully for the test and my actual looks to be the same.
function is not defined error in Python
#!/usr/bin/python
# Our function is pulled in here
from myfunction import pyth_test
pyth_test(1,2)
This test works however my actual function file does not both files are in the same directory both imports are from import * same format just slightly different file names which should not matter.
the test to see if import works
from myfunction import *
pyth_test(1,2)
This works successfully
3
however when I try the actual impor of the function I need to use I get this error
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-5-1a2412372452> in <module>
6 f.close()
7
----> 8 bag_of_words = myfuncs.get_bag_of_words(titles_lines)
9 keywords = myfuncs.get_keywords(titles_lines, bag_of_words)
NameError: name 'myfuncs' is not defined
Code Below is what I run to call the function from the file called myfuncs.py
run.py
from myfuncs import *
f = open('s2-titles.txt', encoding="utf8")
titles_lines = f.readlines()
f.close()
bag_of_words = myfuncs.get_bag_of_words(titles_lines)
keywords = myfuncs.get_keywords(titles_lines, bag_of_words)
myfuncs.py
#!/usr/bin/env python
# coding: utf-8
def get_bag_of_words(titles_lines):
# bag of words
bag_of_words = {}
# [1: ]skips the first line which is the header
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
for word in course_bag_of_words:
if word not in course_bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(line):
course_bag_of_words = {}
#split by weirdcombo to prevent weird splits
courseid, title, description = line.split('XXXYYYZZZ')
title = title.lower()
description = description.lower()
wordlist = title.split() + description.split()
if len(wordlist) >=10:
for word in wordlist:
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k,v = kv
vk = v,k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v,k = vk
k_list.append(k)
return k_list
def get_keywords(titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
term_importance = {}
for word in course_bag_of_words:
tf_course =(float(course_bag_of_words[word])/
sum(course_bag_of_words.values())
)
tf_overall = float(bag_of_words[word]) /n
term_importance[word] = tf_course/tf_overall
keywords[courseid] = get_sorted_results(term_importance)
if courseid == '74953':
for word in keywords[courseid]:
print('has importance', term_importance['word'])
return keywords
I was able to figure it out. It looks like i don't need the myfuncts.myfuction() for it to load anymore just .myfuction()
I changed this
from myfuncs import *
f = open('s2-titles.txt', encoding="utf8")
titles_lines = f.readlines()
f.close()
bag_of_words = myfuncs.get_bag_of_words(titles_lines)
keywords = myfuncs.get_keywords(titles_lines, bag_of_words)
Into this and it works throws another error but that's a whole other story. for me to search around SO to figure out now.
from myfuncs import *
f = open('s2-titles.txt', encoding = "utf8")
titles_lines = f.readlines()
f.close()
bag_of_words = get_bag_of_words(titles_lines)
keywords = get_keywords(titles_lines, bag_of_words)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-18-916e24603531> in <module>
5 f.close()
6
----> 7 bag_of_words = get_bag_of_words(titles_lines)
8 keywords = get_keywords(titles_lines, bag_of_words)
~\django\nlp-notebooks\myfuncs.py in get_bag_of_words(titles_lines)
13 bag_of_words[word] = course_bag_of_words[word]
14 else:
---> 15 bag_of_words[word] += course_bag_of_words[word]
16 return bag_of_words
17
KeyError: 'learning'
Related
I'm new to python and snakemake. I'm trying to create a bed file from trf output. I'm reusing code from github, but I don't need to do all of the things the github code does. I've pulled the relevant trf sections from here:
https://github.com/mrvollger/assembly_workflows/blob/master/workflows/mask.smk
When I run the script, I get an error:
No values given for wildcard 'ID,\\d+'.
File "/project/90daydata/cotton_genomics/genomes/GB0085/reference/trf.smk", line 107, in <module>
From what I've read on other posts, the issue isn't really about the wildcard constraint, rather the expand function in line 107. I'm still figuring out how expand works, but I'm missing something and don't understand how to fix it. It is mind-bending to me still. Any help is appreciated.
import os
import sys
import re
import re
import pysam
import pandas as pd
from datetime import date
from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
FTP = FTPRemoteProvider()
HTTP = HTTPRemoteProvider()
today = date.today()
DATE = today.strftime("%Y/%m/%d")
SDIR=os.path.realpath(os.path.dirname(srcdir("env.cfg"))+"/..")
shell.prefix(f"source {SDIR}/env.cfg ; set -eo pipefail; ")
# delete if not debug
DEBUG=True
def tempd(fname):
if(DEBUG):
return(fname)
return(temp(fname))
FASTA = os.path.abspath( config["fasta"] )
FAI = FASTA + ".fai"
assert os.path.exists(FAI), f"Index must exist. Try: samtools faidx {FASTA}"
# WILDCARDS
NIDS = min(200, len(open(FAI).readlines()) )
IDS = [ "{:03}".format(ID+1) for ID in range(NIDS) ]
# IDS = [ 1 ]
#x = range(6)
#for n in x:
# print(n)
for y in range(len(IDS)):
print(IDS[y], "+++")
print(open(FAI).readlines())
SM = "asm"
if("sample" in config): SM = config["sample"]
SPECIES = "human"
if("species" in config): SPECIES = config["species"]
THREADS = 16
if("threads" in config): THREADS = config["threads"]
SMS = [SM]
wildcard_constraints:
SM="|".join(SMS),
ID="\d+",
# FASTA_FMT = f"Masked/temp/{SM}_{{ID}}.fasta"
FASTA_FMT = f"temp/{SM}_{{ID}}.fa"
TRFBED = os.path.abspath(f"{SM}_{{ID}}.trf.bed")
rule split_fasta:
input:
fasta = FASTA,
output:
fastas = tempd(expand(FASTA_FMT, ID=IDS)),
threads: 1
resources:
mem=8
run:
fasta = pysam.FastaFile(input["fasta"])
outs = [open(f,"w+") for f in output.fastas]
outidx = 0
for name in fasta.references:
seq = fasta.fetch(name)
outs[outidx].write( ">{}\n{}\n".format(name, seq) )
outidx += 1
if(outidx == NIDS): outidx = 0
for out in outs:
out.close()
# This runs trf on the temp output
rule run_trf:
input:
fasta = FASTA_FMT,
output:
dat = tempd(FASTA_FMT + ".dat")
benchmark:
FASTA_FMT + ".bench"
resources:
mem=24,
threads: 1
shell:"""
trf {input.fasta} 2 7 7 80 10 50 15 -l 25 -h -ngs > {output.dat}
"""
#for yy in range(len(IDS)):
# print(IDS[yy], "++")
rule trf_bed:
input:
dats = expand(rules.run_trf.output.dat, ID=IDS, SM=SM),
output:
bed = TRFBED,
resources:
mem=8,
threads: 1
run:
trf = []
header = '#chr start end PeriodSize CopyNumber ConsensusSize PercentMatches PercentIndels Score A C G T Entropy Motif Sequence'.split()
for datf in input.dats:
chrom = None
sys.stderr.write( "\r" + datf )
with open(datf, 'r') as dat:
for line in dat:
splitline = line.split()
if( line.startswith("Sequence:") ):
chrom = int(line.split()[1].strip())
#sys.stderr.write(chrom + "\n")
elif( line.startswith("#") ):
chrom = splitline[0][1:].strip() # grab everything after the # in the first word
else:
# Catch index errors when line is blank
try:
# Check if in header sequence (all non-header lines start with an int: start pos)
try:
int(splitline[0])
except ValueError:
continue
trf.append([chrom] + splitline[ 0: (len(header)-1) ] )
except IndexError:
pass
trf = pd.DataFrame(trf, columns=header)
print(trf.shape)
trf["start"] = trf["start"].astype(int)
trf.sort_values(by=["#chr", "start"], inplace=True)
print("done sorting trf")
trf.to_csv(output.bed, sep="\t", index=False)
rule trf:
input:
bed = rules.trf_bed.output.bed
I am working with Gensim FASTText modeling and have the following questions.
The output of "ft_model.save(BASE_PATH + MODEL_PATH + fname)" saves the following 3 files. Is this correct? is there a way to combine all three files?
ft_gensim-v3
ft_gensim-v3.trainables.vectors_ngrams_lockf.npy
ft_gensim-v3.wv.vectors_ngrams.npy
When I attempt to load the training file and then use it, I get the following error from if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
'function' object has no attribute 'wv'
Finally, both models, is there a way not to have to store the output of def read_train(path,label_path) and def lemmetize(df_col)so I do not have to run this part of the code every time I want to train the model or compare?
Thanks for the assistance.
Here is my FastText Train Model
import os
import logging
from config import BASE_PATH, DATA_PATH, MODEL_PATH
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
#Read Training data
import pandas as pd
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
#print(d)
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
word_tokenized_corpus = read_train('Train Data.xlsx','SMEQueryText.csv')
#Train fasttext model
import tempfile
import os
from gensim.models import FastText
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("ft_gensime-v3")
def train_fastText(data, embedding_size = 60, window_size = 40, min_word = 5, down_sampling = 1e-2, iter=100):
ft_model = FastText(word_tokenized_corpus,
size=embedding_size,
window=window_size,
min_count=min_word,
sample=down_sampling,
sg=1,
iter=100)
#with tempfile.NamedTemporaryFile(prefix=BASE_PATH + MODEL_PATH + 'ft_gensim_v2-', delete=False) as tmp:
# ft_model.save(tmp.name, separately=[])
ft_model.save(BASE_PATH + MODEL_PATH + fname)
return ft_model
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = train_fastText(train_data)
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','QueryText.csv')
Here is my Usage Model
import pandas as pd
from gensim.models import FastText
import gensim
from config import BASE_PATH, DATA_PATH, MODEL_PATH
#Read Training data
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
def load_training():
return FT_gensim.load(BASE_PATH + MODEL_PATH +'ft_gensim-v3')
#compare similarity
def compare_similarity(model, real_data, labelled):
maxWord = ''
category = ''
maxSimilaity = 0
#print("train data",labelled[1])
for i in range(len(labelled)):
if model.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
#print('labelled',labelled['QueryText'][i], 'i', i)
maxWord = labelled['QueryText'][i]
category = labelled['Subjectmatter'][i]
maxSimilaity = model.similarity(real_data, labelled['QueryText'][i])
return maxWord, category, maxSimilaity
# Output from Main to excel
from pandas import ExcelWriter
def export_Excel(data, aFile = 'FASTTEXTOutput.xlsx'):
df = pd.DataFrame(data)
writer = ExcelWriter(aFile)
df.to_excel(writer,'Sheet1')
writer.save()
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = load_training
for i in range(len(test_data)):
output_df['test_query'][i] = test_data['query_text'][i]
#<first change>
maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
output_df['Similar word'][i] = maxWord
output_df['category'][i] = category
output_df['similarity'][i] = maxSimilaity
#<second change>
return output_df
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
export_Excel(output)
Here is the full tracible error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-57803b59c0b9> in <module>
1 # run main
2 if __name__ == "__main__":
----> 3 output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
4 export_Excel(output)
<ipython-input-21-17cb88ee0f79> in main(test_path, train_path, labelled)
13 output_df['test_query'][i] = test_data['query_text'][i]
14 #<first change>
---> 15 maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
16 output_df['Similar word'][i] = maxWord
17 output_df['category'][i] = category
<ipython-input-19-84d7f268d669> in compare_similarity(model, real_data, labelled)
6 #print("train data",labelled[1])
7 for i in range(len(labelled)):
----> 8 if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
9 #print('labelled',labelled['QueryText'][i], 'i', i)
10 maxWord = labelled['QueryText'][i]
AttributeError: 'function' object has no attribute 'wv'
You've got three separate, only-vaguely-related questions here. Taking each in order:
Why are there 3 files, and can they be combined?
It's more efficient to store the big raw arrays separately from the main 'pickled' model – and for models above a few gigabytes in size, necessary to work-around 'pickle' implementation limits. So I'd recommend just keeping the default behavior, and keeping the habit of managing/moving/copying the sets of files together.
If your model is small enough, there is something you can try, though. The .save() method has an optional parameter sep_limit which controls the threshold array size, over which arrays are stored as separate files. By setting that much larger, say sep_limit=2*1024*1024*1024 (2GiB), smaller models should save a single file. (But, loading will be slower, you won't have the sometimes-useful option of memory-map loading, and saving may break on oversized models.)
Why is there a AttributeError: 'function' object has no attribute 'wv' error?
Your line of code model = load_training assigns an actual function to the model variable, rather than what you probably intended, the return-value of calling that function with some arguments. That function has no .wv attribute, hence the error. If model were an actual instance of FastText, you'd not get that error.
Can the corpus text be stored to avoid repeat preprocessing and conversion from pandas formats?
Sure, you can just write the text to a file. Roughly:
with open('mycorpus.txt', mode='w') as corpusfile:
for text in word_tokenized_corpus:
corpusfile.write(' '.join(text))
corpusfile.write('\n')
Though in fact, gensim offers a utility function, utils.save_as_line_sentence(), that can do this (& explicitly handles some extra encoding concerns). See:
https://radimrehurek.com/gensim/utils.html#gensim.utils.save_as_line_sentence
The LineSentence utility class in gensim.models.word2vec can stream texts from such a file back for future re-use:
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence
I'm running this code but I seem to keep getting an attribute error and I don't know how this would be fixed. I've included code as well as the shell window when I run it!
# the Count class. The wordleFromObject function takes a Count object as
# input, and calls its getTopWords method.
import string
class Count:
# method to initialize any data structures, such as a dictionary to
# hold the counts for each word, and a list of stop words
def __init__(self):
#print("Initializing Word Counter")
# set the attrbute wordCounts to an empty dictionary
self.wordCounts = {}
infile = open("stop_words.txt", "r")
self.stop_word_dict = {};
for line in infile.readlines():
self.stop_word_dict = 1
# method to add one to the count for a word in the dictionary.
# if the word is not yet in the dictionary, we'll need to add a
# record for the word, with a count of one.
def incCount(self,word):
my_table = str.maketrans('', '', string.punctuation)
self.wordCounts = {}
if word in self.stop_word_dict.keys():
return
else:
self.stop_word_dict += 1
cleaned_word = word.translate(my_table).lower()
if cleaned_word != '':
if cleaned_word in self.wordCounts.keys():
self.wordCounts[cleaned_word] += 1
else:
self.wordCounts[cleaned_word] = 1
# method to look up the count for a word
def lookUpCount(self, word):
return self.wordCounts.get(word.lower(), 0)
def main():
print("Initializing Word Counter")
filename = input("Enter book file:")
infile = open(filename, "r")
counter = Count()
for line in infile.readlines():
words = [word.strip() for word in line.strip().split()]
for word in words:
counter.incCount(word)
infile.close()
# Test code for Part 2 and 3
# Comment this code once you have completed part 3.
print(counter.lookUpCount("alice"))
print(counter.lookUpCount("rabbit"))
print(counter.lookUpCount("and"))
print(counter.lookUpCount("she"))
return
# Test code for Part 4 and 5
# topTen = counter.getTopWords(10)
# print(topTen)
# Test code for Part 5
# Import the wordle module and uncomment the call to the wordle function!
# wordle.wordleFromObject(counter,30)
# run the main program
main()
Error Message:
Initializing Word Counter
Enter book file:Alice.txt
Traceback (most recent call last):
line 69, in <module>
main()
line 50, in main
counter.incCount(word)
line 28, in incCount
if word in self.stop_word_dict.keys():
AttributeError: 'int' object has no attribute 'keys'
for line in infile.readlines():
self.stop_word_dict = 1
In this lines you change your stop_word_dict from dict to int, and later in the code, you are trying to reach dictionary "keys" attribute
I am pretty new to python and this is the first code I have written. Trying to use the NLTK package. The problem comes at the end when trying to execute the label_probdist.prob('positive') line.
This is the error I get;
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use nltk.download() #Download the bookpackage
#open the file that containts wallposts and classifier
with open('Classified.csv' ,'rb') as f:
reader = csv.reader(f)
FBsocial = map(tuple, reader)
import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)
FBSocialData = [] #sorting data
for row in FBsocial:
statement = row[0]
sentiment = row[1]
words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)
#Extracting features of word(list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
all_words = []
for (statement, sentiment) in FBSocialData:
all_words.extend(statement)
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Classifier to decide which feature are relevant
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
return NaiveBayesClassifier(label_probdist, feature_probdist)
#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining variable label_probdist inside function train. Then you are trying to access it outside it's scope. It is not possible. It's a local variable, not a global one.
I was experimenting with python NLTK text classification. Here is the code example i am practicing: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
Here is code:
from nltk import bigrams
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
train_samples = {}
with file ('data/positive.txt', 'rt') as f:
for line in f.readlines():
train_samples[line] = 'pos'
with file ('data/negative.txt', 'rt') as d:
for line in d.readlines():
train_samples[line] = 'neg'
f = open("data/test.txt", "r")
test_samples = f.readlines()
# Error in this code
# def bigramReturner(text):
# tweetString = text.lower()
# bigramFeatureVector = {}
# for item in bigrams(tweetString.split()):
# bigramFeatureVector.append(' '.join(item))
# return bigramFeatureVector
# Updated the code from the stack overflow comment
def bigramReturner (tweetString):
tweetString = tweetString.lower()
#comment the line since the function is not defined
#tweetString = removePunctuation (tweetString)
bigramFeatureVector = []
for item in nltk.unigrams(tweetString.split()):
bigramFeatureVector.append(' '.join(item))
return bigramFeatureVector
def get_labeled_features(samples):
word_freqs = {}
for text, label in train_samples.items():
tokens = text.split()
for token in tokens:
if token not in word_freqs:
word_freqs[token] = {'pos': 0, 'neg': 0}
word_freqs[token][label] += 1
return word_freqs
def get_label_probdist(labeled_features):
label_fd = FreqDist()
for item, counts in labeled_features.items():
for label in ['neg', 'pos']:
if counts[label] > 0:
label_fd.inc(label)
label_probdist = ELEProbDist(label_fd)
return label_probdist
def get_feature_probdist(labeled_features):
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
for label in ['neg', 'pos']:
feature_freqdist[label, token].inc(True, count=counts[label])
feature_freqdist[label, token].inc(None, num_samples - counts[label])
feature_values[token].add(None)
feature_values[token].add(True)
for item in feature_freqdist.items():
print item[0], item[1]
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return feature_probdist
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
but when I run the code I get following error:
Traceback (most recent call last):
File "naive_bigram_1.py", line 87, in <module>
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
File "naive_bigram_1.py", line 30, in bigramReturner
tweetString = removePunctuation (tweetString)
NameError: global name 'removePunctuation' is not defined
I saw the similar question with other error, here I updated as well n-grams with Naive Bayes classifier
You're calling a function removePunctuation that hasn't been defined previously:
def bigramReturner (tweetString):
tweetString = tweetString.lower()
tweetString = removePunctuation (tweetString)
....
I also noticed that you put spaces between your functions' names and the parameters list. Avoid that as it's not really idiomatic Python and could even cause some problems (like your function being evaluated as an object instead of being called).