Gensim Doc2Vec Exception AttributeError: 'str' object has no attribute 'decode' - python

I am trying to build a model with the Gensim library, using Python 3 and Spyder. I also want to incorporate the Wikipedia corpus. The code is shown below:
import os
import sys
import bz2
import logging
import multiprocessing
import gensim

SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
DATA_PATH = os.path.join(SCRIPT_PATH, 'data/')
MODEL_PATH = os.path.join(SCRIPT_PATH, 'model/')
DICTIONARY_FILEPATH = os.path.join(DATA_PATH, 'wiki-english_wordids.txt.bz2')
WIKI_DUMP_FILEPATH = os.path.join(DATA_PATH, 'enwiki-latest-pages-articles.xml.bz2')
if __name__ == '__main__':
    # Check if the required files have been downloaded
    if not os.path.isfile(WIKI_DUMP_FILEPATH):
        print('Wikipedia articles dump could not be found..')
        print('Please see README.md for instructions!')
        sys.exit()

    # Get number of available cpus
    cores = multiprocessing.cpu_count()

    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)

    # Initialize logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if not os.path.isfile(DICTIONARY_FILEPATH):
        logging.info('Dictionary has not been created yet..')
        logging.info('Creating dictionary (takes about 9h)..')

        # Construct corpus
        wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH)

        # Remove words occurring less than 20 times, and words occurring in more
        # than 10% of the documents. (keep_n is the vocabulary size)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

        # Save dictionary to file
        wiki.dictionary.save_as_text(DICTIONARY_FILEPATH)
        del wiki

    # Load dictionary from file
    dictionary = gensim.corpora.Dictionary.load_from_text(DICTIONARY_FILEPATH)

    # Construct corpus using dictionary
    wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH, dictionary=dictionary)

    class SentencesIterator:
        def __init__(self, wiki):
            self.wiki = wiki

        def __iter__(self):
            for sentence in self.wiki.get_texts():
                yield list(map(lambda x: x.decode('utf-8'), sentence))

    # Initialize simple sentence iterator required for the Word2Vec model
    sentences = SentencesIterator(wiki)

    logging.info('Training word2vec model..')
    model = gensim.models.Word2Vec(sentences=sentences, size=300, min_count=1, window=5, workers=cores)

    # Save model
    logging.info('Saving model..')
    model.save(os.path.join(MODEL_PATH, 'word2vec.model'))
    logging.info('Done training word2vec model!')
But I am getting the following error:

  File "C:/Users/elli/.spyder-py3/temp.py", line 60, in <lambda>
    yield list(map(lambda x: x.decode('utf-8'), sentence))
AttributeError: 'str' object has no attribute 'decode'

This code is from GitHub, from this link:
https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py
I suspect this should be something simple to sort out. Could you please advise?

It's a Unicode issue in your SentencesIterator class: the sample code you copied is written for Python 2. In Python 3, wiki.get_texts() already yields str tokens, so you can remove the decode part. For example, for Doc2Vec you can make it as follows:

class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument(content, [title])
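For the plain Word2Vec pipeline in the question, the same fix applied to SentencesIterator directly would be a minimal sketch like this (assuming Python 3, where get_texts() yields lists of str):

class SentencesIterator:
    def __init__(self, wiki):
        self.wiki = wiki

    def __iter__(self):
        # get_texts() already yields lists of str under Python 3,
        # so no .decode('utf-8') is needed
        for sentence in self.wiki.get_texts():
            yield sentence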

Related

SpaCy: how do you add custom NER labels to a pre-trained model?

I am new to SpaCy and NLP. I am using SpaCy v 3.1 and Python 3.9.7 64-bit.
My objective: to use a pre-trained SpaCy model (en_core_web_sm) and add a set of custom labels to the existing NER labels (GPE, PERSON, MONEY, etc.) so that the model can recognize both the default AND the custom entities.
I've looked at the SpaCy documentation, and what I need seems to be an EntityRecognizer, specifically a new pipe.
However, it is not really clear to me at what point in my workflow I should add this new pipe, since in SpaCy 3 the training happens via the CLI, and from the docs it's not even clear to me where the pre-trained model is called.
Any tutorials or pointers you might have are highly appreciated.
This is what I think should be done, but I am not sure how:
import spacy
from spacy import displacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
from spacy.pipeline import EntityRecognizer

# Load model
nlp = spacy.load("en_core_web_sm")

# Register custom component and turn a simple function into a pipeline component
@Language.factory('new-ner')
def create_bespoke_ner(nlp, name):
    # Train the new pipeline with custom labels here??
    return LanguageDetector()

# Add custom pipe
custom = nlp.add_pipe("new-ner")
This is what my config file looks like so far. I suspect my new pipe needs to go next to "tok2vec" and "ner".
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
update_with_oracle_cut_size = 100
For Spacy 3.2 I did it this way:
import spacy
import random
from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language

def print_doc_entities(_doc: Doc):
    if _doc.ents:
        for _ent in _doc.ents:
            print(f"  {_ent.text} {_ent.label_}")
    else:
        print("  NONE")

def customizing_pipeline_component(nlp: Language):
    # NOTE: Starting from Spacy 3.0, training via Python API was changed.
    # For information see - https://spacy.io/usage/v3#migrating-training-python
    train_data = [
        ('We need to deliver it to Festy.', [(25, 30, 'DISTRICT')]),
        ('I like red oranges', [])
    ]

    # Result before training
    print(f"\nResult BEFORE training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)

    # Disable all pipe components except 'ner'
    disabled_pipes = []
    for pipe_name in nlp.pipe_names:
        if pipe_name != 'ner':
            nlp.disable_pipes(pipe_name)
            disabled_pipes.append(pipe_name)

    print("  Training ...")
    optimizer = nlp.create_optimizer()
    for _ in range(25):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            example = Example.from_dict(doc, {"entities": entity_offsets})
            nlp.update([example], sgd=optimizer)

    # Enable all previously disabled pipe components
    for pipe_name in disabled_pipes:
        nlp.enable_pipe(pipe_name)

    # Result after training
    print(f"Result AFTER training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)

def main():
    nlp = spacy.load('en_core_web_sm')
    customizing_pipeline_component(nlp)

if __name__ == '__main__':
    main()
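One detail the snippet above leaves implicit: when a label such as 'DISTRICT' is brand new to the pretrained model, it is often registered on the ner component before the update loop. A minimal sketch of that step, offered as an assumption rather than something the code above strictly required:

# Register the custom label with the existing NER component before
# calling nlp.update(); EntityRecognizer.add_label is part of spaCy's API.
ner = nlp.get_pipe('ner')
ner.add_label('DISTRICT')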

Python: PyCUDA ERROR: The context stack was not empty upon module cleanup

I have created a Streamlit app as a demo of a project on Multilingual Text Classification using mBERT in PyTorch. When I run the app with the command python app.py it works fine, but when I try to use Streamlit with the command streamlit run app.py it throws a PyCUDA error.
Following is the code present in app.py:
import torch
from typing import Text
import streamlit as st
import pandas as pd
from textblob import TextBlob
from inference.inference_onnx import run_onnx_inference
from inference.inference_tensorRT import run_trt_inference
from googletrans import Translator

st.title("LinClass: Multilingual Text Classifier")

input_text = st.text_input('Text:')

####################
# Google Translate API
####################
translator = Translator()
input_text = translator.translate(
    input_text,
    dest="en"
)
input_text = input_text.text

####################
# Select Precision and Inference Method
####################
df = pd.DataFrame()
df["lang"] = ["en"]

precision = st.sidebar.selectbox("Select Precision:",
    ("16 Bit", "32 Bit")
)
inference = st.sidebar.selectbox("Inference Method:",
    ("ONNX", "TensorRT")
)

if st.button('Show Selected Configuration'):
    st.subheader("Selected Configuration:")
    st.write("Precision: ", precision)
    st.write("Inference: ", inference)

st.subheader("Results")

def result(x):
    """
    Function to classify the comment toxicity based on the probability and given threshold

    params: x(float) - Probability of Toxicity
    """
    if x >= 0.4:
        st.write("Toxic")
    else:
        st.write("Non Toxic")

####################
# Implement Selected Configuration
####################
if precision == "16 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_2GPU.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_bs16.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

if precision == "32 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

####################
# Take Feedback
####################
st.subheader("Feedback:")
feedback = st.radio(
    "Are you satisfied with the results?",
    ('Yes', 'No'))
st.write("Thanks for the Feedback!")
Error
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
Aborted (core dumped)
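The last lines of the error point at the likely cause: Streamlit re-runs app.py on every interaction, so a CUDA context pushed during TensorRT inference can still be on the context stack when the module is cleaned up. A minimal sketch of the explicit context handling the message suggests, assuming the context is created via PyCUDA inside the TensorRT path (run_trt_inference is the question's own helper, not a public API):

import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).make_context()  # pushes a context onto the stack
try:
    # ... run the TensorRT inference here ...
    pass
finally:
    ctx.pop()  # always pop, so module cleanup finds an empty context stack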

Error when using a custom dataset with fastai

I am getting an error when trying to use my custom fastai dataset.
The error:
Exception: Can't infer the type of your targets.
It's either because your data source is empty or because your labeling function raised an error.
The code:
from fastai import *
from fastai.vision import *

class URL:
    MURDERHORNETS = f"https://superdata.quinniboi10.repl.co/MurderHornetImages"

path = untar_data(URL.MURDERHORNETS)

'''
path = untar_data(URLs.PETS)
files = get_image_files(path)
import PIL
img = PIL.Image.open(files[0])
img
'''

fnames = get_image_files(path)
fnames[:5]

np.random.seed(2)
pat = r'/([^/]+)_\d+\.(png|jpg|jpeg)$'

data = ImageDataBunch.from_folder(path, train=path, test=None, valid_pct=0.2,
                                  ds_tfms=get_transforms(),
                                  size=160)
data.normalize(imagenet_stats)

data.show_batch(rows=3, figsize=(7,6))

print(data.classes)
len(data.classes), data.c

learn = cnn_learner(data, models.resnet50, metrics=error_rate)
learn.fit_one_cycle(5)

learn.save('stage-1')
The dataset is here; don't comment on the name, I don't know why that is what I chose :/
Get the zip file of the dataset here
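One thing that stands out in the code above: pat is defined but never used, and ImageDataBunch.from_folder(path, train=path, ...) can only infer labels from per-class subfolders. If the labels are actually encoded in the filenames (which is what the regex suggests), a sketch using fastai v1's regex-based constructor might look like this; whether it applies depends on the real dataset layout:

# Hypothetical alternative: label each image from its filename via the
# regex, instead of from the folder structure
data = ImageDataBunch.from_name_re(
    path, fnames, pat,
    valid_pct=0.2,
    ds_tfms=get_transforms(),
    size=160)
data.normalize(imagenet_stats)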

How to initialize a pool of python multiprocessing workers with a shared state?

I am trying to execute some machine learning algorithms in parallel.
When I use multiprocessing, it's slower than without. My wild guess is that the pickle serialization of the models I use is slowing down the whole process. So the question is: how can I initialize the pool's workers with an initial state so that I don't need to serialize/deserialize the models for every single call?
Here is my current code:
import pickle
from pathlib import Path
from collections import Counter
from multiprocessing import Pool

from gensim.models.doc2vec import Doc2Vec

from wikimark import html2paragraph
from wikimark import tokenize

def process(args):
    doc2vec, regressions, filepath = args
    with filepath.open('r') as f:
        string = f.read()
    subcategories = Counter()
    for index, paragraph in enumerate(html2paragraph(string)):
        tokens = tokenize(paragraph)
        vector = doc2vec.infer_vector(tokens)
        for subcategory, model in regressions.items():
            prediction = model.predict([vector])[0]
            subcategories[subcategory] += prediction
    # compute the mean score for each subcategory
    for subcategory, prediction in subcategories.items():
        subcategories[subcategory] = prediction / (index + 1)
    # keep only the main category
    subcategory = subcategories.most_common(1)[0]
    return (filepath, subcategory)

def main():
    input = Path('./build')

    doc2vec = Doc2Vec.load(str(input / 'model.doc2vec.gz'))

    regressions = dict()
    for filepath in input.glob('./*/*/*.model'):
        with filepath.open('rb') as f:
            model = pickle.load(f)
        regressions[filepath.parent] = model

    examples = list(input.glob('../data/wikipedia/english/*'))

    with Pool() as pool:
        iterable = zip(
            [doc2vec] * len(examples),  # XXX!
            [regressions] * len(examples),  # XXX!
            examples
        )
        for filepath, subcategory in pool.imap_unordered(process, iterable):
            print('* {} -> {}'.format(filepath, subcategory))

if __name__ == '__main__':
    main()
The lines marked with XXX! point to the data that is serialized when I call pool.imap_unordered. There is at least 200MB of data being serialized.
How can I avoid the serialization?
The solution is as simple as using a global for both doc2vec and regressions.
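A minimal sketch of that idea, using Pool's initializer hook so each worker fills module-level globals exactly once instead of unpickling the models for every task (names and paths are taken from the question's code):

from multiprocessing import Pool
from gensim.models.doc2vec import Doc2Vec

# Module-level globals: each worker process fills these in exactly once.
doc2vec = None
regressions = None

def init_worker(doc2vec_path, regression_models):
    # Runs once per worker; afterwards process() reads the globals
    # instead of receiving the models in every task's arguments.
    global doc2vec, regressions
    doc2vec = Doc2Vec.load(doc2vec_path)   # loaded once per worker
    regressions = regression_models        # pickled once per worker, not per task

def process(filepath):
    # Same body as the question's process(), minus the args unpacking:
    # it reads doc2vec and regressions from the globals above.
    ...

# In main(), after building the regressions dict and the examples list:
with Pool(initializer=init_worker,
          initargs=(str(input / 'model.doc2vec.gz'), regressions)) as pool:
    for filepath, subcategory in pool.imap_unordered(process, examples):
        print('* {} -> {}'.format(filepath, subcategory))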

Sklearn classifier and flask issues

I have been trying to self-host, with Apache, an sklearn classifier that I put together, and I ended up using joblib to serialize the saved model, then load it in a Flask app. This app worked perfectly when running Flask's built-in development server, but when I set it up on a Debian 9 Apache server, I get a 500 error. Delving into Apache's error.log, I get:
AttributeError: module '__main__' has no attribute 'tokenize'
Now, this is funny to me, because while I did write my own tokenizer, the web app gave me no problems when I was running it locally. Furthermore, the saved model that I used was trained on the web server, so slightly different library versions should not be a problem.
My code for the web app is:
import re
import sys

from flask import Flask, request, render_template
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.externals import joblib

app = Flask(__name__)

def tokenize(text):
    # text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas

@app.route('/')
def home():
    return render_template('home.html')

@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    if request.method == 'POST':
        result = request.form
        input_text = result['input_text']
        clf = joblib.load("model.pkl.z")
        parameters = clf.named_steps['clf'].get_params()
        predicted = clf.predict([input_text])
        # print(predicted)
        certainty = clf.decision_function([input_text])

        # Is it bonkers?
        if predicted[0]:
            verdict = "Not too nuts!"
        else:
            verdict = "Bonkers!"

        return render_template('result.html', prediction=[input_text, verdict, float(certainty), parameters])

if __name__ == '__main__':
    # app.debug = True
    app.run()
With the .wsgi file being:
import sys
sys.path.append('/var/www/mysite')
from conspiracydetector import app as application
Furthermore, I trained the model with this code:
import logging
import pprint  # Pretty stuff
import re
import sys  # For command line arguments
from time import time  # to show progress

import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.externals import joblib  # In order to save
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Tokenizer that does stemming and strips punctuation
def tokenize(text):
    # text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas

if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # Display progress logs on stdout
    print("Initializing...")

    # Command line arguments
    save = sys.argv[1]
    training_directory = sys.argv[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    print("Splitting the dataset in training and test set...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    # Also remove stop words
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]
    print("Stop words list loaded...")

    print("Setting up pipeline...")
    pipeline = Pipeline(
        [
            # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1,1))),
            ('vect',
             TfidfVectorizer(tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('clf', LinearSVC(C=5000)),
        ])
    print("Pipeline:", [name for name, _ in pipeline.steps])

    # Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    print("Initializing grid search...")

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:")
    pprint.pprint(parameters)
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        verbose=True)
    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))

    # Print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))

    # Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # Save model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model Saved! \nPrepare for some awesome stats!")
I must confess that I am pretty stumped, and after tinkering around, searching, and making sure that my server is configured correctly, I felt that perhaps someone here might be able to help.
Any help is appreciated, and if there is any more information that I need to provide, please let me know and I will be happy to.
Also, I am running Python 3.5.3 with nltk and sklearn.
I solved this problem, although imperfectly, by removing my custom tokenizer and falling back on one of sklearn's.
However, I am still in the dark on how to integrate my own tokenizer.
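For reference, the usual cause of AttributeError: module '__main__' has no attribute 'tokenize' is that pickle stores functions by their defining module: since tokenize was defined at the top level of the training script, the pickle records __main__.tokenize, and under mod_wsgi __main__ is not that script. A sketch of the standard workaround, under the assumption that a small shared module (the hypothetical textutils.py here) is importable by both the training script and the Flask app:

# textutils.py -- a shared, importable home for the custom tokenizer
import re
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

def tokenize(text):
    # strip punctuation, then lemmatize each token
    text = re.sub(r'\W+', ' ', text)
    return [WordNetLemmatizer().lemmatize(t) for t in word_tokenize(text)]

If both scripts then do from textutils import tokenize, the pickle records textutils.tokenize instead of __main__.tokenize, and joblib.load can resolve it when the model is loaded under Apache.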
