I have been trying to self-host, with Apache, an sklearn classifier that I put together. I ended up using joblib to serialize the saved model, then load it in a Flask app. Now, this app worked perfectly when running Flask's built-in development server, but when I set it up on a Debian 9 Apache server, I get a 500 error. Delving into Apache's error.log, I get:
AttributeError: module '__main__' has no attribute 'tokenize'
Now, this is funny to me because while I did write my own tokenizer, the web app gave me no problems when I was running it locally. Furthermore, the saved model that I used was trained on the webserver, so slightly different library versions should not be a problem.
My code for the web app is:
import re
import sys

from flask import Flask, request, render_template
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.externals import joblib

app = Flask(__name__)


# Tokenizer that strips punctuation and lemmatizes
def tokenize(text):
    # text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    if request.method == 'POST':
        result = request.form
        input_text = result['input_text']
        # Load the serialized pipeline (vectorizer + classifier)
        clf = joblib.load("model.pkl.z")
        parameters = clf.named_steps['clf'].get_params()
        predicted = clf.predict([input_text])
        # print(predicted)
        certainty = clf.decision_function([input_text])

        # Is it bonkers?
        if predicted[0]:
            verdict = "Not too nuts!"
        else:
            verdict = "Bonkers!"

        return render_template('result.html', prediction=[input_text, verdict, float(certainty), parameters])


if __name__ == '__main__':
    # app.debug = True
    app.run()
With the .wsgi file being:
import sys
sys.path.append('/var/www/mysite')
from conspiracydetector import app as application
Furthermore, I trained the model with this code:
import logging
import pprint  # Pretty stuff
import re
import sys  # For command line arguments
from time import time  # to show progress

import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.externals import joblib  # In order to save
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


# Tokenizer that does stemming and strips punctuation
def tokenize(text):
    # text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # Display progress logs on stdout
    print("Initializing...")

    # Command line arguments
    save = sys.argv[1]
    training_directory = sys.argv[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    print("Splitting the dataset in training and test set...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    # Also remove stop words
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]
    print("Stop words list loaded...")

    print("Setting up pipeline...")
    pipeline = Pipeline(
        [
            # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1,1))),
            ('vect',
             TfidfVectorizer(tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('clf', LinearSVC(C=5000)),
        ])
    print("Pipeline:", [name for name, _ in pipeline.steps])

    # Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    print("Initializing grid search...")

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:")
    pprint.pprint(parameters)

    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        verbose=True)

    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))

    # Print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))

    # Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # Save model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model Saved! \nPrepare for some awesome stats!")
I must confess that I am pretty stumped, and after tinkering around, searching, and making sure that my server is configured correctly, I felt that perhaps someone here might be able to help.
Any help is appreciated, and if there is any more information that I need to provide, please let me know and I will be happy to.
Also, I am running Python 3.5.3 with nltk and sklearn.
I solved this problem, although imperfectly, by removing my custom tokenizer and falling back on one of sklearn's.
However, I am still in the dark on how to integrate my own tokenizer.
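For the record, the usual explanation for this particular AttributeError is that joblib pickles the tokenizer by reference: since tokenize was defined in the training script, which ran as __main__, the pickle stores it as __main__.tokenize, and under mod_wsgi the Flask module is not __main__, so the lookup fails. The commonly suggested fix is to move the tokenizer into its own importable module and import it from both the training script and the web app. A minimal sketch, assuming a hypothetical module my_tokenizer.py placed in /var/www/mysite:

# my_tokenizer.py (hypothetical module name)
import re

from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def tokenize(text):
    # Strip punctuation, tokenize, then lemmatize each token
    text = re.sub(r'\W+', ' ', text)
    return [WordNetLemmatizer().lemmatize(item) for item in word_tokenize(text)]

Both the training script and the Flask app would then do from my_tokenizer import tokenize and pass it to TfidfVectorizer(tokenizer=tokenize, ...). The saved model then references my_tokenizer.tokenize, which the WSGI process can resolve because /var/www/mysite is appended to sys.path in the .wsgi file.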
Related
I am new to SpaCy and NLP. I am using SpaCy v3.1 and Python 3.9.7 (64-bit).
My objective: to use a pre-trained SpaCy model (en_core_web_sm) and add a set of custom labels to the existing NER labels (GPE, PERSON, MONEY, etc.) so that the model can recognize both the default AND the custom entities.
I've looked at the SpaCy documentation and what I need seems to be an EntityRecogniser, specifically a new pipe.
However, it is not really clear to me at what point in my workflow I should add this new pipe, since in SpaCy 3 the training happens in CLI, and from the docs it's not even clear to me where the pre-trained model is called.
Any tutorials or pointers you might have are highly appreciated.
This is what I think should be done, but I am not sure how:
import spacy
from spacy import displacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
from spacy.pipeline import EntityRecognizer

# Load model
nlp = spacy.load("en_core_web_sm")


# Register custom component and turn a simple function into a pipeline component
@Language.factory('new-ner')
def create_bespoke_ner(nlp, name):
    # Train the new pipeline with custom labels here??
    return LanguageDetector()


# Add custom pipe
custom = nlp.add_pipe("new-ner")
This is what my config file looks like so far. I suspect my new pipe needs to go next to "tok2vec" and "ner".
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
update_with_oracle_cut_size = 100
For Spacy 3.2 I did it this way:
import spacy
import random
from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language


def print_doc_entities(_doc: Doc):
    if _doc.ents:
        for _ent in _doc.ents:
            print(f" {_ent.text} {_ent.label_}")
    else:
        print(" NONE")


def customizing_pipeline_component(nlp: Language):
    # NOTE: Starting from Spacy 3.0, training via Python API was changed. For information see - https://spacy.io/usage/v3#migrating-training-python
    train_data = [
        ('We need to deliver it to Festy.', [(25, 30, 'DISTRICT')]),
        ('I like red oranges', [])
    ]

    # Result before training
    print(f"\nResult BEFORE training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)

    # Disable all pipe components except 'ner'
    disabled_pipes = []
    for pipe_name in nlp.pipe_names:
        if pipe_name != 'ner':
            nlp.disable_pipes(pipe_name)
            disabled_pipes.append(pipe_name)

    print(" Training ...")
    optimizer = nlp.create_optimizer()
    for _ in range(25):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            example = Example.from_dict(doc, {"entities": entity_offsets})
            nlp.update([example], sgd=optimizer)

    # Enable all previously disabled pipe components
    for pipe_name in disabled_pipes:
        nlp.enable_pipe(pipe_name)

    # Result after training
    print(f"Result AFTER training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)


def main():
    nlp = spacy.load('en_core_web_sm')
    customizing_pipeline_component(nlp)


if __name__ == '__main__':
    main()
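If you also want to keep the result, the updated nlp object can be written out with spaCy's standard serialization. A minimal sketch, assuming you call this inside main() right after customizing_pipeline_component(nlp), and that "custom_ner_model" is a directory name of your choosing:

# Persist the fine-tuned pipeline and reload it later
# ("custom_ner_model" is a hypothetical directory name)
nlp.to_disk("custom_ner_model")
nlp_reloaded = spacy.load("custom_ner_model")
print(nlp_reloaded.pipe_names)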
I have created a Streamlit app as a demo of a project on Multilingual Text Classification using mBERT in PyTorch. When I run the app with the command python app.py it works fine, but when I try to use Streamlit with the command streamlit run app.py it throws a PyCUDA error.
Following is the code present in app.py:
import torch
from typing import Text
import streamlit as st
import pandas as pd
from textblob import TextBlob
from inference.inference_onnx import run_onnx_inference
from inference.inference_tensorRT import run_trt_inference
from googletrans import Translator

st.title("LinClass: Multilingual Text Classifier")

input_text = st.text_input('Text:')

####################
# Google Translate API
####################
translator = Translator()
input_text = translator.translate(
    input_text,
    dest="en"
)
input_text = input_text.text

####################
# Select Precision and Inference Method
####################
df = pd.DataFrame()
df["lang"] = ["en"]

precision = st.sidebar.selectbox("Select Precision:",
                                 ("16 Bit", "32 Bit")
                                 )
inference = st.sidebar.selectbox("Inference Method:",
                                 ("ONNX", "TensorRT")
                                 )

if st.button('Show Selected Configuration'):
    st.subheader("Selected Configuration:")
    st.write("Precision: ", precision)
    st.write("Inference: ", inference)

st.subheader("Results")


def result(x):
    """
    Function to classify the comment toxicity based on the probability and given threshold

    params: x(float) - Probability of Toxicity
    """
    if x >= 0.4:
        st.write("Toxic")
    else:
        st.write("Non Toxic")


####################
# Implement Selected Configuration
####################
if precision == "16 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_2GPU.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_bs16.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

if precision == "32 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

####################
# Take Feedback
####################
st.subheader("Feedback:")
feedback = st.radio(
    "Are you satisfied with the results?",
    ('Yes', 'No'))
st.write("Thanks for the Feedback!")
Error
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
Aborted (core dumped)
I am getting the error "PipelineException: No mask_token ([MASK]) found on the input" when I run this line:
fill_mask("Auto Car <mask>.")
I am running it on Colab.
My Code:
from transformers import BertTokenizer, BertForMaskedLM
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer, BertForMaskedLM

paths = [str(x) for x in Path(".").glob("**/*.txt")]
print(paths)

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

from transformers import BertModel, BertConfig

configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config
print(configuration)

model = BertForMaskedLM.from_pretrained("bert-base-uncased")

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=bert_tokenizer,
    file_path="./kant.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer, mlm=True, mlm_probability=0.15
)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=bert_tokenizer,
    device=0,
)

fill_mask("Auto Car <mask>.")  # This line is giving me the error...
The last line is giving me the error mentioned above. Please let me know what I am doing wrong or what I have to do in order to remove this error.
Complete error: f"No mask_token ({self.tokenizer.mask_token}) found on the input"
Even if you have already found the error, here is a recommendation to avoid it in the future. Instead of calling
fill_mask("Auto Car <mask>.")
you can do the following to be more flexible when you use different models:
MASK_TOKEN = tokenizer.mask_token
fill_mask("Auto Car {}.".format(MASK_TOKEN))
If the model implementation changes the token to be identified (some use <mask>, some [MASK]), then you get into trouble. It is better to use f-strings and pass the mask token as the argument; an f-string is also more intuitive to read.
The following code works for me -
from transformers import BertTokenizer, pipeline

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mask_fill = pipeline("fill-mask", model="bert-base-uncased")
mask_fill(f"The gaming laptop is {tokenizer.mask_token} and I have loved playing games on it.", top_k=2)
I have a simple Streamlit app that includes transforms + estimator stored as a pickle file for prediction. The app works well when deployed to the local host. When deployed to Heroku, the web layout works, but the prediction app generates the error "AttributeError: 'ColumnTransformer' object has no attribute '_feature_names_in'".
I used the requirements.txt below, generated by pipreqs:
numpy==1.17.2
pandas==0.25.1
streamlit==0.67.1
Pillow==7.2.0
scikit_learn==0.23.2
From published answers to similar questions, I gather that this could be due to incompatibility of sklearn versions, but I am not sure how to correct it.
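A quick way to test the version-mismatch theory (just a sketch, not specific to this app) is to print the scikit-learn version in both environments and compare:

# check_version.py -- hypothetical helper to compare scikit-learn versions
# between the machine that pickled the model and the Heroku dyno.
# Run it locally and on Heroku (for example with `heroku run python check_version.py`).
import sklearn

print(sklearn.__version__)

If the two versions differ, pinning scikit_learn in requirements.txt to the version that produced the pickle is the usual first thing to try.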
Here is the error message from Heroku:
AttributeError: 'ColumnTransformer' object has no attribute '_feature_names_in'
Here is the code for app.py:
import pandas as pd
import numpy as np
import pickle
import streamlit as st
from PIL import Image
# from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.compose import ColumnTransformer
import warnings

warnings.filterwarnings('ignore')

acc_ix, wt_ix, hpower_ix, cyl_ix = 4, 3, 2, 0


# Custom class inheriting the BaseEstimator and TransformerMixin
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_and_power=True):
        self.acc_and_power = acc_and_power  # new optional variable

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X):
        wt_and_cyl = X[:, wt_ix] * X[:, cyl_ix]  # required new variable
        if self.acc_and_power:
            acc_and_power = X[:, acc_ix] * X[:, hpower_ix]
            return np.c_[X, acc_and_power, wt_and_cyl]  # returns a 2D array
        return np.c_[X, wt_and_cyl]


def predict_mpg_web1(config, regressor):
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    # Note the model is in the form of pipeline_m, including both transforms and the estimator
    # The config is with Origin already in country code
    y_pred = regressor.predict(df)
    return y_pred


# this is the main function in which we define our webpage
def main():
    # giving the webpage a title
    # st.title("MPG Prediction")
    st.write("""
# MPG Prediction App
based on a Random Forest Model built from
"http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    """)

    # here we define some of the front end elements of the web page like
    # the font and background color, the padding and the text to be displayed
    html_temp = """
    <div style ="background-color:yellow;padding:13px">
    <h1 style ="color:black;text-align:center;">What is the mpg of my car? </h1>
    </div>
    """

    # this line allows us to display the front end aspects we have
    # defined in the above code
    st.markdown(html_temp, unsafe_allow_html=True)

    # the following lines create dropdowns and numeric sliders in which the user can enter
    # the data required to make the prediction
    st.sidebar.header('Set My Car Configurations')
    Orig = st.sidebar.selectbox("Select Car Origin", ("India", "USA", "Germany"))
    Cyl = st.sidebar.slider('Cylinders', 3, 6, 8)
    Disp = st.sidebar.slider('Displacement', 68.0, 455.0, 193.0)
    Power = st.sidebar.slider('Horsepower', 46.0, 230.0, 104.0)
    WT = st.sidebar.slider(' Weight', 1613.0, 5140.0, 2970.0)
    Acc = st.sidebar.slider('Acceleration', 8.0, 25.0, 15.57)
    MY = st.sidebar.slider('Model_Year', 70, 82, 76)

    image = Image.open('car.jpg')
    st.image(image, caption='MPG Prediction',
             use_column_width=True)
    st.subheader("Click the 'Predict' button below")

    # loading the saved model
    pickle_in = open('final_model.pkl', 'rb')
    regressor = pickle.load(pickle_in)

    result = ""

    # the below lines ensure that when the button called 'Predict' is clicked,
    # the prediction function defined above is called to make the prediction
    # and store it in the variable result

    # Set up the vehicle configurations
    vehicle = {"Origin": [Orig], "Cylinders": [Cyl], "Displacement": Disp, "Horsepower": [Power],
               "Weight": [WT], "Acceelation": [Acc], "Model Year": [MY]
               }

    if st.button("Predict"):
        result = predict_mpg_web1(vehicle, regressor)
        mpg = int(result[0])
        st.success('The prediction is {}'.format(mpg))


if __name__ == '__main__':
    main()
Is it possible that you are trying to call predict using a ColumnTransformer which has not been fitted yet?
The attribute _feature_names_in is set during the fit_transform call. I have the same sklearn version and the attribute is present, so IMHO it shouldn't be a problem with the version.
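A minimal sketch of that check, assuming scikit-learn 0.23.x (where this private attribute exists; newer releases renamed it feature_names_in_): the attribute only appears once the transformer has been fitted on a DataFrame, so a pipeline pickled before fitting, or a corrupted pickle, would raise exactly this AttributeError.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"Horsepower": [104.0, 130.0], "Weight": [2970.0, 3100.0]})
ct = ColumnTransformer([("num", StandardScaler(), ["Horsepower", "Weight"])])

print(hasattr(ct, "_feature_names_in"))  # False: not fitted yet
ct.fit(df)
print(hasattr(ct, "_feature_names_in"))  # True once fitted (on sklearn 0.23.x)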
I fixed the problem. It turns out that somehow the pickle file for the saved model was corrupted. I regenerated the model and the deployment works.
Thanks to anyone who spent the time reviewing my problem.
Apollo.
I am trying to make a model with the Gensim library. I am using Python 3 and Spyder. I also want to incorporate the Wikipedia corpus. The code is shown below:
import os
import sys
import bz2
import logging
import multiprocessing
import gensim

SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
DATA_PATH = os.path.join(SCRIPT_PATH, 'data/')
MODEL_PATH = os.path.join(SCRIPT_PATH, 'model/')
DICTIONARY_FILEPATH = os.path.join(DATA_PATH, 'wiki-english_wordids.txt.bz2')
WIKI_DUMP_FILEPATH = os.path.join(DATA_PATH, 'enwiki-latest-pages-articles.xml.bz2')

if __name__ == '__main__':
    # Check if the required files have been downloaded
    if not WIKI_DUMP_FILEPATH:
        print('Wikipedia articles dump could not be found..')
        print('Please see README.md for instructions!')
        sys.exit()

    # Get number of available cpus
    cores = multiprocessing.cpu_count()

    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)

    # Initialize logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if not os.path.isfile(DICTIONARY_FILEPATH):
        logging.info('Dictionary has not been created yet..')
        logging.info('Creating dictionary (takes about 9h)..')
        # Construct corpus
        wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH)

        # Remove words occuring less than 20 times, and words occuring in more
        # than 10% of the documents. (keep_n is the vocabulary size)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

        # Save dictionary to file
        wiki.dictionary.save_as_text(DICTIONARY_FILEPATH)
        del wiki

    # Load dictionary from file
    dictionary = gensim.corpora.Dictionary.load_from_text(DICTIONARY_FILEPATH)

    # Construct corpus using dictionary
    wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH, dictionary=dictionary)

    class SentencesIterator:
        def __init__(self, wiki):
            self.wiki = wiki

        def __iter__(self):
            for sentence in self.wiki.get_texts():
                yield list(map(lambda x: x.decode('utf-8'), sentence))

    # Initialize simple sentence iterator required for the Word2Vec model
    sentences = SentencesIterator(wiki)

    logging.info('Training word2vec model..')
    model = gensim.models.Word2Vec(sentences=sentences, size=300, min_count=1, window=5, workers=cores)

    # Save model
    logging.info('Saving model..')
    model.save(os.path.join(MODEL_PATH, 'word2vec.model'))
    logging.info('Done training word2vec model!')
But I am getting the following error:
File "C:/Users/elli/.spyder-py3/temp.py", line 60, in <lambda>
yield list(map(lambda x: x.decode('utf-8'), sentence))
AttributeError: 'str' object has no attribute 'decode'
This code is from GitHub, at this link:
https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py.
I suspect this should be something simple to sort. Could you please advise?
It's a Unicode issue in your SentencesIterator class: your sample code is written for Python 2. For Python 3, you can remove the decode part and make it as follows:
from gensim.models.doc2vec import TaggedDocument


class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument(content, [title])
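If you prefer to keep the Word2Vec-style iterator from the question rather than switching to tagged documents, the minimal Python 3 change is simply to drop the decode call, since get_texts() already yields lists of str there (a sketch based on the code in the question):

class SentencesIterator:
    def __init__(self, wiki):
        self.wiki = wiki

    def __iter__(self):
        # In Python 3 / newer gensim, get_texts() yields lists of str,
        # so no .decode('utf-8') is needed.
        for sentence in self.wiki.get_texts():
            yield list(sentence)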