I am making a document classifier and here is my code:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'resume': message, 'class': classification})
        index.append(filename)
    return DataFrame(rows, index=index)

data = DataFrame({'resume': [], 'class': []})
data = data.append(dataFrameFromDirectory(r'<path>', 'Yes'))
data = data.append(dataFrameFromDirectory(r'<path>', 'No'))
Then I split the data and used a TfidfVectorizer:
tf = TfidfVectorizer(min_df=1, stop_words='english')
data_traintf = tf.fit_transform(data_train)
mnb = MultinomialNB()
mnb.fit(data_traintf, class_train)
After training and testing, I saved my classifier as a pickle file:
import pickle

with open(r'clf.pkl', 'wb') as f:
    pickle.dump(mnb, f)
But when I load it again and try to use the classifier, I get a "TfidfVectorizer - Vocabulary wasn't fitted" error. So I tried using a Pipeline and saved my vectorizer as well:
from sklearn.pipeline import Pipeline

classifier = Pipeline([('tfidf', tf), ('multiNB', mnb)])
with open(r'clf_1.pkl', 'wb') as f:
    pickle.dump(classifier, f)
But I still get the same error. What might be going wrong?
EDIT: The pickle file was stored successfully and on the other end, I loaded the file:
import pickle

with open(r'clf_1.pkl', 'rb') as f:
    clf = pickle.load(f)
And created a test data frame. When I do test_tf = tf.fit(test['resume']) it works fine, but pred = clf.predict(test_tf) gives the error TypeError: 'TfidfVectorizer' object is not iterable.
Do I need to loop through the data frame that has around 15 objects?
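For reference, a loaded Pipeline already contains the fitted vectorizer, so it is normally applied to raw documents rather than to a separately fitted vectorizer or a pre-transformed matrix. A minimal sketch, assuming the pickled pipeline and a test frame with a 'resume' column as above:

import pickle

with open(r'clf_1.pkl', 'rb') as f:
    clf = pickle.load(f)

# The pipeline's TfidfVectorizer transforms the raw text internally,
# then MultinomialNB predicts; no separate fit/transform is needed here.
pred = clf.predict(test['resume'])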
I have about 5,000 .gzip files (~1MB each). Each of these files contains data in a jsonlines format. Here's what it looks like:
{"category_id":39,"app_id":12731}
{"category_id":45,"app_id":12713}
{"category_id":6014,"app_id":13567}
I want to parse these files and convert them to a pandas dataframe. Is there a way to speed up this process? Here's my code, but it's rather slow (about 0.5 s per file):
import pandas as pd
import jsonlines
import gzip
import os
import io

path = 'data/apps/'
files = os.listdir(path)

result = []
for n, file in enumerate(files):
    print(n, file)
    with open(f'{path}/{file}', 'rb') as f:
        data = f.read()
        unzipped_data = gzip.decompress(data)
        decoded_data = io.BytesIO(unzipped_data)
        reader = jsonlines.Reader(decoded_data)
        for line in reader:
            if line['category_id'] == 6014:
                result.append(line)

df = pd.DataFrame(result)
Open the file with gzip.open and iterate over it directly; this should allow you to read each line without loading and decompressing the whole file into memory first.
import pandas as pd
import json
import gzip
import os

path = 'data/apps/'
files = os.listdir(path)

result = []
for n, file in enumerate(files):
    print(n, file)
    with gzip.open(f'{path}/{file}') as f:
        for line in f:
            data = json.loads(line)
            if data['category_id'] == 6014:
                result.append(data)

df = pd.DataFrame(result)
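If decompression and parsing still dominate after switching to gzip.open, the files can also be processed in parallel. A hedged sketch, assuming the same data/apps/ layout and enough cores to make worker processes worthwhile:

import gzip
import json
import os
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

PATH = 'data/apps/'

def parse_file(name):
    # Collect the matching records from one gzip file.
    rows = []
    with gzip.open(os.path.join(PATH, name), 'rt') as f:
        for line in f:
            record = json.loads(line)
            if record['category_id'] == 6014:
                rows.append(record)
    return rows

if __name__ == '__main__':
    files = os.listdir(PATH)
    with ProcessPoolExecutor() as executor:
        chunks = executor.map(parse_file, files)
    df = pd.DataFrame([row for chunk in chunks for row in chunk])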
Currently I am working on a project and using TF-IDF to transform the X_train data, which contains text. When I use count_vectorizer.fit_transform(X_train) I get this error:
Traceback (most recent call last):
  File "train.py", line 100, in <module>
    counts = count_vectorizer.fit_transform(X_train)
  File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 869, in fit_transform
    self.fixed_vocabulary_)
  File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I read other Stack Overflow questions like this one, but I cannot understand how to split the data in X_train.
Here's my train.py file:
import os
import numpy
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

NEWLINE = '\n'

TRAVEL = 'Travel'
OTHER = 'Other'

SOURCES = [
    ('data/travel', TRAVEL),
    ('data/other', OTHER),
]

SKIP_FILES = {'cmds', '.DS_Store'}
SEED = 0  # for reproducibility

def read_files(path):
    # Reads all files in all directories mentioned in SOURCES
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

def build_data_frame(path, classification):
    # Returns a data frame of all the files read using read_files()
    data_frame = DataFrame({'text': [], 'class': []})
    for file_name, text in read_files(path):
        data_frame = data_frame.append(
            DataFrame({'text': [text], 'class': [classification]}, index=[file_name]))
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index))

# Training data
X_train = numpy.asarray(data['text'])

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
I followed all the suggested solutions but still haven't solved the issue. Am I taking the wrong approach to transforming the data? And if my approach is right, why am I getting this error?
Thanks in advance.
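For context: CountVectorizer raises "empty vocabulary" when no tokens survive tokenization, which usually means the documents themselves are empty. Note that read_files() only yields text after the first blank line in each file, so files without a blank-line header separator come through as empty strings. A quick diagnostic sketch, assuming the data frame built above:

# Count how many documents came through read_files() empty.
empty = (data['text'].str.strip() == '').sum()
print('%d of %d documents are empty' % (empty, len(data)))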
This question already has an answer here:
Creating a custom categorized corpus in NLTK and Python
(1 answer)
Closed 6 years ago.
I'm looking to use my own corpus within Visual Studio Code on macOS; I have read probably a hundred forums, and I can't wrap my head around what I'm doing wrong, as I'm pretty new to programming.
This question seems to be the closest thing I can find to what I need to do; however, I am unaware of how to do the following:
"on a Mac it would be in ~/nltk_data/corpora, for instance. And it looks like you also have to append your new corpus to the __init__.py within .../site-packages/nltk/corpus/."
When answering, please be aware that I am using Homebrew and don't want to permanently override the data path, in case I also need to use a stock NLTK corpora data set in the same code.
If needed, I can post my attempt at using PlaintextCorpusReader along with the traceback it produces (below), although for seamless use I would rather not use PlaintextCorpusReader at all and instead simply copy and paste .txt files into an appropriate location, in line with the append approach quoted above.
Thank you.
Traceback (most recent call last):
  File "/Users/jordanXXX/Documents/NLP/bettertrainingdata", line 42, in <module>
    short_pos = open("short_reviews/pos.txt", "r").read
IOError: [Errno 2] No such file or directory: 'short_reviews/pos.txt'
EDIT:
Thank you for your responses.
I have taken your advice and moved the folder out of NLTK's corpora.
I've been doing some experimenting with my folder location and I've gotten different tracebacks.
If you are saying the best way to do it is with PlaintextCorpusReader then so be it; however, maybe for my application I'd want to use CategorizedPlaintextCorpusReader?
sys.argv is definitely not what I meant, so I can read up on that later.
First, here is my code without my attempt to use PlaintextCorpusReader; it results in the above traceback when the folder "short_reviews" (containing the pos.txt and neg.txt files) is outside the NLP folder:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk import word_tokenize

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

# def main():
#     file = open("short_reviews/pos.txt", "r")
#     short_pos = file.readlines()
#     file.close

short_pos = open("short_reviews/pos.txt", "r").read
short_neg = open("short_reviews/neg.txt", "r").read

documents = []

for r in short_pos.split('\n'):
    documents.append((r, "pos"))
for r in short_neg.split('\n'):
    documents.append((r, "neg"))

all_words = []

short_pos_words = word.tokenize(short_pos)
short_neg_words = word.tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())
for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
However, when I move the folder "short_reviews" containing the text files into the NLP folder, using the same code as above and still without PlaintextCorpusReader, the following occurs:
Traceback (most recent call last):
  File "/Users/jordanXXX/Documents/NLP/bettertrainingdata", line 47, in <module>
    for r in short_pos.split('\n'):
AttributeError: 'builtin_function_or_method' object has no attribute 'split'
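(As an aside, that AttributeError comes from the missing call parentheses: .read is the bound method object, while .read() returns the file's text.)

short_pos = open("short_reviews/pos.txt", "r").read()  # note the ()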
When I move the folder "short_reviews" containing the text files into the NLP folder and use the code below, which adds PlaintextCorpusReader, the following traceback occurs:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk import word_tokenize
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'short_reviews'
word_lists = PlaintextCorpusReader(corpus_root, '*')
wordlists.fileids()

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

# def main():
#     file = open("short_reviews/pos.txt", "r")
#     short_pos = file.readlines()
#     file.close

short_pos = open("short_reviews/pos.txt", "r").read
short_neg = open("short_reviews/neg.txt", "r").read

documents = []

for r in short_pos.split('\n'):
    documents.append((r, "pos"))
for r in short_neg.split('\n'):
    documents.append((r, "neg"))

all_words = []

short_pos_words = word.tokenize(short_pos)
short_neg_words = word.tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())
for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
Traceback (most recent call last):
  File "/Users/jordanXXX/Documents/NLP/bettertrainingdata2", line 18, in <module>
    word_lists = PlaintextCorpusReader(corpus_root, '*')
  File "/Library/Python/2.7/site-packages/nltk/corpus/reader/plaintext.py", line 62, in __init__
    CorpusReader.__init__(self, root, fileids, encoding)
  File "/Library/Python/2.7/site-packages/nltk/corpus/reader/api.py", line 87, in __init__
    fileids = find_corpus_fileids(root, fileids)
  File "/Library/Python/2.7/site-packages/nltk/corpus/reader/util.py", line 763, in find_corpus_fileids
    if re.match(regexp, prefix+fileid)]
  File "/usr/local/Cellar/python/2.7.12_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 141, in match
    return _compile(pattern, flags).match(string)
  File "/usr/local/Cellar/python/2.7.12_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 251, in _compile
    raise error, v  # invalid expression
error: nothing to repeat
The answer you refer to contains some very poor (or rather, inapplicable) advice. There is no reason to place your own corpus in nltk_data, or to hack nltk.corpus.__init__.py to load it like a native corpus. In fact, do not do these things.
You should use PlaintextCorpusReader. I don't understand your reluctance to do so, but if your files are plain text, it's the right tool to use. Supposing you have a folder NLP/bettertrainingdata, you can build a reader that will load all .txt files in this folder like this:
myreader = nltk.corpus.reader.PlaintextCorpusReader(r"NLP/bettertrainingdata", r".*\.txt")
If you add new files to the folder, the reader will find and use them. If what you want is to be able to use your script with other folders, then just do so-- you don't need a different reader, you need to learn about sys.argv. If you are after a categorized corpus with pos.txt and neg.txt, then you need a CategorizedPlaintextCorpusReader (which see). If it's something else yet that you want, then please edit your question to explain what you are trying to do.
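For the pos.txt/neg.txt layout described in the question, a minimal CategorizedPlaintextCorpusReader sketch might look like the following; the corpus root and cat_pattern here are assumptions based on the question:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Each file's category is the group that cat_pattern captures from its name.
reader = CategorizedPlaintextCorpusReader(
    'short_reviews',                 # corpus root (assumed location)
    r'(pos|neg)\.txt',               # fileids belonging to the corpus
    cat_pattern=r'(pos|neg)\.txt')

print(reader.categories())           # ['neg', 'pos']
print(reader.words(categories='pos')[:10])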
I am using some sample code (below) to test an NB classifier, and I'm getting the following error from line 22:
_csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
This is a sample row of the csv file:
b8:27:eb:38:72:a7,df598b5eb8f4,5/9/16 14:47,154aec250ef6,-84,outside
Sample of code:
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from sklearn import naive_bayes
import csv
import random
from sklearn import metrics
import urllib

url = "example.com"
webpage = urllib.urlopen(url)

# download the file
#raw_data = urllib.urlopen(url)
datareader = csv.reader(webpage)  # line 22 is this one

ct = 0
for row in datareader:
    ct = ct + 1

webpage = urllib.urlopen(url)
datareader = csv.reader(webpage)
data = np.array(-1 * np.ones((ct, 6), float), object)

k = 0
for row in datareader:
    data[k, :] = np.array(row)
    k = k + 1

featnames = np.array(['unti', 'dongle', 'timestamp', 'tracker', 'rssi', 'label'], str)

keys = [[]] * np.size(data, 1)
numdata = -1 * np.ones_like(data)
for k in range(np.size(data, 1)):
    keys[k], garbage, numdata[:k] = np.unique(data[:, k], True, True)

numrows = np.size(numdata, 0)
numcols = np.size(numdata, 1)
numdata = np.array(numdata, int)
xdata = numdata[:, :-1]
ydata = numdata[:, -1]

lbin = LabelBinarizer()
for k in range(np.size(xdata, 1)):
    if k == 0:
        xdata_ml = lbin.fit_transform(xdata[:, k])
    else:
        xdata_ml = np.hstack((xdata_ml, lbin.fit_transform(xdata[:, k])))
ydata_ml = lbin.fit_transform(ydata)

allIDX = np.arrange(numrows)
random.shuffle(allIDX)
holdout_number = numrows / 10
testIDX = allIDX[0:holdout_number]
trainIDX = allIDX[holdout_number:]

xtest = xdata_ml[testIDX, :]
xtrain = xdata_ml[trainIDX, :]
ytest = ydata[testIDX]
ytrain = ydata[trainIDX]

mnb = naive_bayes.MultinomialNB()
mnb.fit(xtrain, ytrain)
print "Classification accuracy of MNB =", mnb.score(xtest, ytest)
Can anyone help me find the error and suggest a fix?
Are you using Windows? If yes, this can be solved by:
datareader = csv.reader(webpage, dialect=csv.excel_tab)
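If the file itself uses bare carriage-return line endings (classic Mac style), another option is to normalize the payload before handing it to csv.reader. A hedged sketch in the same Python 2 style as the question:

import csv
import urllib

url = "example.com"
payload = urllib.urlopen(url).read()
# splitlines() copes with \n, \r\n and bare \r endings alike,
# and csv.reader accepts any iterable of lines.
datareader = csv.reader(payload.splitlines())
for row in datareader:
    print row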
Some of the answers to CSV new-line character seen in unquoted field error refer to CSV files on a Mac.
Can you try to manually download the file to your Mac and do the following with it as a local file:
1) Save the file as CSV (MS-DOS Comma-Separated)
2) Save the file as CSV (Windows Comma-Separated)
3) Run the following script
with open(csv_filename, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        print ', '.join(row)
An explanation of 'rU': https://www.python.org/dev/peps/pep-0278/
In a Python with universal newline support open() the mode parameter can also be "U", meaning "open for input as a text file with universal newline interpretation". Mode "rU" is also allowed, for symmetry with "rb"
Rationale
Universal newline support is implemented in C, not in Python.
This is done because we want files with a foreign newline
convention to be import-able, so a Python Lib directory can be
shared over a remote file system connection, or between MacPython
and Unix-Python on Mac OS X
import nltk.classify.util
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def word_feats(words):
    return dict([(word, True) for word in words])

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

data = csv.DictReader(open('twitter_data_Trump.csv', 'r'))
print data.fieldnames
for each in data:
    row = {}
    otuput.append(row)
print output
I am very new to Python and am having trouble figuring out how to use the movie reviews classifier to classify my imported .csv file. I keep receiving an error message saying "No such file or directory." Any idea how I can fix this? I am trying to read in a .csv file and basically assign positive or negative labels to the tweets it contains. I am not sure whether my code works and accomplishes the task at hand.
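For what it's worth, a minimal sketch of the missing classification loop, reusing the classifier and word_feats defined above; the column name 'text' is hypothetical and should be replaced with the actual CSV header:

import csv

output = []
with open('twitter_data_Trump.csv', 'r') as f:
    for row in csv.DictReader(f):
        words = row['text'].split()  # 'text' is a hypothetical column name
        label = classifier.classify(word_feats(words))
        output.append((row['text'], label))
print output[:5]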