Here is the code:
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

path = '/opt/datacourse/data/parts'
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path = subdir + os.path.sep + file
        shakes = open(file_path, 'r')
        text = shakes.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(None, string.punctuation)
        token_dict[file] = no_punctuation

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
After running it, this is what I got:
File "D:\Python27\lib\site-packages\sklearn\feature_extraction\text.py", line 751, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
According to others' replies, I've checked text.py and confirmed that min_df = 1 in __init__.
Can anyone tell me what the problem is? Much appreciated.
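Before fitting, it is worth confirming that the loop actually read documents and that the tokenizer returns real tokens; a quick diagnostic sketch (not part of the original script, works on Python 2 and 3):

# how many documents did os.walk actually pick up? 0 means `path` is wrong or empty
print("%d documents read" % len(token_dict))

# peek at what the custom tokenizer produces for one document;
# if this comes back empty or all stop words, fit_transform raises the same error
sample = next(iter(token_dict.values()), "")
print(tokenize(sample)[:20])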
I trained a vectorizer with some data and a custom tokenizer, but when I try to use it in FastAPI it throws an error.
Here is my vectorizer:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_and_tokenize(data):
    # remove html markup
    data = re.sub("(<.*?>)", "", data)
    # remove urls
    data = re.sub(r'http\S+', '', data)
    # remove hashtags and #names
    data = re.sub(r"(#[\d\w\.]+)", '', data)
    data = re.sub(r"(#[\d\w\.]+)", '', data)
    # remove punctuation and non-ascii digits
    data = re.sub("(\\W|\\d)", " ", data)
    # remove whitespace
    data = data.strip()
    # tokenization with nltk
    data = word_tokenize(data)
    # stemming with nltk
    porter = PorterStemmer()
    stem_data = [porter.stem(word) for word in data]
    return stem_data

vect = TfidfVectorizer(tokenizer=preprocess_and_tokenize, sublinear_tf=True, norm='l2', ngram_range=(1, 2))
# train_x was loaded previously; I didn't include it here
vect_transform = vect.fit_transform(train_x)

import pickle
pickle.dump(vect_transform, open("tfid_vectorizer.pk1", "wb"))
I ran the above code in a Jupyter notebook.
When I unpickled the model to test it, I got an error about preprocess_and_tokenize not being found, so I copied the function into the file where I was loading the model and it worked.
But when I tried to make an API with FastAPI, it throws the same error and I don't know why!
I started building the API in VS Code. Here is the code:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import pickle
from fastapi import FastAPI

def preprocess_and_tokenize(data):
    # remove html markup
    data = re.sub("(<.*?>)", "", data)
    # remove urls
    data = re.sub(r'http\S+', '', data)
    # remove hashtags and #names
    data = re.sub(r"(#[\d\w\.]+)", '', data)
    data = re.sub(r"(#[\d\w\.]+)", '', data)
    # remove punctuation and non-ascii digits
    data = re.sub("(\\W|\\d)", " ", data)
    # remove whitespace
    data = data.strip()
    # tokenization with nltk
    data = word_tokenize(data)
    # stemming with nltk
    porter = PorterStemmer()
    stem_data = [porter.stem(word) for word in data]
    return stem_data

app = FastAPI(
    title="Emotion Prediction"
)

# the saved model resides in the AI_models folder
emotion_transform = pickle.load(open('AI_models/tfid_vect.pk1', 'rb'))

@app.get('/log_predict/{text}')
async def log_predict(text: str):
    data = [text]
    data_transformed = emotion_transform.transform(data)
    return {
        'result': data_transformed
    }
As soon as I run the command uvicorn api:app --reload in the terminal, this error is thrown:
...
File "C:\Users\Nima\anaconda3\envs\virtual_workspace\lib\pickle.py", line 331, in _getattribute
raise AttributeError("Can't get attribute {!r} on {!r}"
AttributeError: Can't get attribute 'preprocess_and_tokenize' on <module '__main__' (built-in)>
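For reference, a common workaround for this kind of AttributeError is to define the tokenizer in a module that both the training code and the API can import (functions defined directly in a notebook are pickled under __main__), and to pickle the fitted vectorizer object. A minimal sketch, assuming a hypothetical module name preprocessing.py and the same train_x as above:

# preprocessing.py  (hypothetical module name; any importable module works)
# pickle stores the tokenizer by reference, e.g. "preprocessing.preprocess_and_tokenize",
# so this module must be importable wherever the pickle is loaded
def preprocess_and_tokenize(data):
    ...  # same body as shown above


# training side (notebook or script)
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing import preprocess_and_tokenize

vect = TfidfVectorizer(tokenizer=preprocess_and_tokenize, sublinear_tf=True,
                       norm='l2', ngram_range=(1, 2))
vect_transform = vect.fit_transform(train_x)        # train_x as in the question
with open("tfid_vect.pk1", "wb") as fh:
    pickle.dump(vect, fh)                           # pickle the fitted vectorizer, not the matrix


# api.py
import pickle
from preprocessing import preprocess_and_tokenize   # must be importable here too

with open("AI_models/tfid_vect.pk1", "rb") as fh:
    emotion_transform = pickle.load(fh)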
I am currently working on the task of deleting stop words. This code runs, but I would like to ask how to change it into a loop, that is, how to loop over all the files in a folder and remove stop words from each of them instead of from a single file. I think the change goes around the file1 = open(...) statement, but I don't know how to make it. The code is attached below. Thanks!
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
file1 = open(
r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt")
line = file1.read()
words = word_tokenize(line)
words_witout_stop_words = ["" if word in stop_words else word for word in words]
new_words = " ".join(words_witout_stop_words).strip()
appendFile = open(
r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt", 'w')
appendFile.write(new_words)
appendFile.close()
A simple fix is to list all the files in the folder and run the same code on each of them.
Here is the whole code:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

path = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1"
files = os.listdir(path)
stop_words = set(stopwords.words('english'))

for i in files:
    file1 = open(os.path.join(path, i))
    line = file1.read()
    file1.close()
    words = word_tokenize(line)
    words_witout_stop_words = ["" if word in stop_words else word for word in words]
    new_words = " ".join(words_witout_stop_words).strip()
    appendFile = open(os.path.join(path, i), 'w')
    appendFile.write(new_words)
    appendFile.close()
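If the reports are spread across several year/quarter subfolders, a variant sketch using os.walk covers them all; the root folder and the errors='ignore' read are assumptions, and here stop words are dropped rather than replaced with empty strings:

import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

root = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)"   # hypothetical root containing the year folders
stop_words = set(stopwords.words('english'))

for subdir, dirs, files in os.walk(root):
    for name in files:
        full_path = os.path.join(subdir, name)
        with open(full_path, errors='ignore') as fh:
            words = word_tokenize(fh.read())
        # keep only words that are not in the stop word list, then overwrite the file
        cleaned = " ".join(word for word in words if word not in stop_words)
        with open(full_path, 'w') as fh:
            fh.write(cleaned)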
Currently I am working on a project and using TF-IDF to transform the X_train data, which contains text. When I call count_vectorizer.fit_transform(X_train) I get this error:
Traceback (most recent call last):
File "train.py", line 100, in <module>
counts = count_vectorizer.fit_transform(X_train)
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 869, in fit_transform
self.fixed_vocabulary_)
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I read other Stack Overflow questions like this Link, but I am not able to understand how to split the data of X_train.
Here's my train.py file:
import os
import numpy
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

NEWLINE = '\n'

TRAVEL = 'Travel'
OTHER = 'Other'

SOURCES = [
    ('data/travel', TRAVEL),
    ('data/other', OTHER),
]

SKIP_FILES = {'cmds', '.DS_Store'}
SEED = 0  # for reproducibility

def read_files(path):
    # Reads all files in all directories mentioned in SOURCES
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

def build_data_frame(path, classification):
    # Returns a data frame of all the files read using read_files()
    data_frame = DataFrame({'text': [], 'class': []})
    for file_name, text in read_files(path):
        data_frame = data_frame.append(
            DataFrame({'text': [text], 'class': [classification]}, index=[file_name]))
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(numpy.random.permutation(data.index))

# Training data
X_train = numpy.asarray(data['text'])

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
I followed all the suggested solutions but still haven't solved the issue. Am I taking the wrong approach to transforming the data? And if the approach is right, why am I getting this error?
Thanks in advance.
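One thing worth checking (a diagnostic sketch, not part of train.py): read_files only keeps lines that come after the first blank line, so if the files have no such header separator every document comes back as an empty string, and an all-empty X_train produces exactly this empty-vocabulary error:

# count how many documents actually contain text before calling fit_transform
non_empty = [doc for doc in X_train if isinstance(doc, str) and doc.strip()]
print("%d of %d documents are non-empty" % (len(non_empty), len(X_train)))
print(repr(X_train[0])[:200])   # peek at the first document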
I am making a desktop tool for plagiarism checking between documents. I use stop words, a TF-IDF vectorizer, etc., and cosine similarity to check the similarity between two documents.
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

userinput1 = input("Enter file name:")
myfile1 = open(userinput1).read()
stop_words = set(stopwords.words("english"))
word1 = nltk.word_tokenize(myfile1)
filtration_sentence = []
for w in word1:
    word = word_tokenize(myfile1)
    filtered_sentence = [w for w in word if not w in stop_words]
print(filtered_sentence)

userinput2 = input("Enter file name:")
myfile2 = open(userinput2).read()
stop_words = set(stopwords.words("english"))
word2 = nltk.word_tokenize(myfile2)
filtration_sentence = []
for w in word2:
    word = word_tokenize(myfile2)
    filtered_sentence = [w for w in word if not w in stop_words]
print(filtered_sentence)

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(myfile1, myfile2):
    tfidf = vectorizer.fit_transform([myfile1, myfile2])
    return ((tfidf * tfidf.T).A)[0, 1]

print(cosine_sim(myfile1, myfile2))
But the problem is that I have to check the similarity of the user's input file against a number of files in a folder. I tried my best to access the folder and open the files automatically but did not succeed. Can anyone tell me how to access a folder containing files, open the files one by one, and compare each with the input file? I am using Python 3.4.4 and Windows 7.
As per my understanding, you need to get all the files present in a directory/folder:
import os

fileList = os.listdir('path_to_the_directory')
for eachFile in fileList:
    # join the directory and file name, otherwise open() looks in the current working directory
    with open(os.path.join('path_to_the_directory', eachFile), 'rb') as _fp:
        fileData = _fp.read()
        print("FILE DATA (%s):\n\n%s\n\n" % (_fp.name, fileData))
This will iterate through every file in the directory, open it, and print its contents; replace the print with whatever processing you need on each file.
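Tying that back to the question, here is a sketch that compares the user's input file against every file in a folder using the cosine_sim defined above; the folder prompt and the errors='ignore' reads are assumptions:

import os

userinput1 = input("Enter file name:")
myfile1 = open(userinput1, errors='ignore').read()

folder = input("Enter folder to compare against:")
for name in os.listdir(folder):
    full_path = os.path.join(folder, name)
    if os.path.isfile(full_path):
        other = open(full_path, errors='ignore').read()
        # cosine_sim / vectorizer as defined in the code above
        print(name, cosine_sim(myfile1, other))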
import nltk.classify.util
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def word_feats(words):
    return dict([(word, True) for word in words])

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

data = csv.DictReader(open('twitter_data_Trump.csv', 'r'))
print data.fieldnames

for each in data:
    row = {}
    otuput.append(row)

print output
I am very new to Python and am having trouble figuring out how to use the movie reviews classifier to classify my imported .csv file. I keep receiving an error saying "No such file or directory." Any idea how I can fix this? I am trying to read in a .csv file and assign positive or negative labels to the tweets it contains. I am not sure if my code works and accomplishes the task at hand.
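The "No such file or directory" message just means open() could not find twitter_data_Trump.csv relative to the current working directory. A sketch of the CSV step, assuming the file sits next to the script and has a text column named 'Tweet' (the column name is a guess; match it to your actual header):

import os
import csv

# build an absolute path next to this script instead of relying on the working directory
csv_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'twitter_data_Trump.csv')
print(os.path.exists(csv_path))          # False means the name or location is wrong

output = []
for row in csv.DictReader(open(csv_path, 'r')):
    words = row['Tweet'].split()                    # tokenize however you prefer
    label = classifier.classify(word_feats(words))  # classifier / word_feats from the code above
    output.append((row['Tweet'], label))
print(output[:5])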