Spark NLP pretrained model not loading in Windows - Python

I am trying to install pretrained pipelines for Spark NLP on Windows 10 with Python.
The following is the code I have tried so far in a Jupyter notebook on my local system:
! java -version
# should be Java 8 (Oracle or OpenJDK)
! conda create -n sparknlp python=3.7 -y
! conda activate sparknlp
! pip install --user spark-nlp==2.6.4 pyspark==2.4.5
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
# Start Spark Session with Spark NLP
# The start() function has two parameters: gpu and spark23
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is for when you have Apache Spark 2.3.x installed
spark = sparknlp.start()
# Download a pre-trained pipeline
pipeline = PretrainedPipeline('explain_document_ml', lang='en')
I am getting the following error:
explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
~\Anaconda3\envs\py37\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline.
: java.lang.IllegalArgumentException: requirement failed: Was not found appropriate resource to download for request: ResourceRequest(explain_document_ml,Some(en),public/models,2.6.4,2.4.4) with downloader: com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader@2570f26e
at scala.Predef$.require(Predef.scala:224)
at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadResource(ResourceDownloader.scala:345)
at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:376)
at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:371)
at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadPipeline(ResourceDownloader.scala:474)
at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline(ResourceDownloader.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-2-d18238e76d9f> in <module>
11
12 # Download a pre-trained pipeline
---> 13 pipeline = PretrainedPipeline('explain_document_ml', lang='en')
~\Anaconda3\envs\py37\lib\site-packages\sparknlp\pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
~\Anaconda3\envs\py37\lib\site-packages\sparknlp\pretrained.py in downloadPipeline(name, language, remote_loc)
58 t1.start()
59 try:
---> 60 j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
61 jmodel = PipelineModel._from_java(j_obj)
62 finally:
~\Anaconda3\envs\py37\lib\site-packages\sparknlp\internal.py in __init__(self, name, language, remote_loc)
179 class _DownloadPipeline(ExtendedJavaWrapper):
180 def __init__(self, name, language, remote_loc):
--> 181 super(_DownloadPipeline, self).__init__("com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline", name, language, remote_loc)
182
183
~\Anaconda3\envs\py37\lib\site-packages\sparknlp\internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
~\Anaconda3\envs\py37\lib\site-packages\sparknlp\internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
~\AppData\Roaming\Python\Python37\site-packages\pyspark\ml\wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 @staticmethod
~\Anaconda3\envs\py37\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~\AppData\Roaming\Python\Python37\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: 'requirement failed: Was not found appropriate resource to download for request: ResourceRequest(explain_document_ml,Some(en),public/models,2.6.4,2.4.4) with downloader: com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader@2570f26e'

This is one of the common issues with Apache Spark & Spark NLP when Java/Spark/Hadoop are not correctly set up on Windows.
You need to follow these steps correctly to avoid the common issues, including failed pretrained() downloads:
Download OpenJDK from here: https://adoptopenjdk.net/?variant=openjdk8&jvmVariant=hotspot
Make sure it is 64-bit.
Make sure you install it in the root, e.g. C:\java, since Windows doesn't like spaces in the path.
During installation, after changing the install path, select the option to set the Path.
Download winutils and put it in C:\hadoop\bin: https://github.com/cdarlint/winutils/blob/master/hadoop-2.7.3/bin/winutils.exe
Download an Anaconda build with Python 3.6/3.7 from the archive rather than the new 3.8, since Apache Spark 2.4.x only works with Python 3.6 and 3.7: https://repo.anaconda.com/archive/Anaconda3-2020.02-Windows-x86_64.exe
Download Apache Spark 2.4.6 and extract it to C:\spark\
Set the HADOOP_HOME environment variable to C:\hadoop and SPARK_HOME to C:\spark
Add %HADOOP_HOME%\bin and %SPARK_HOME%\bin to Path
Install the Visual C++ redistributable (again, the 64-bit version): https://www.microsoft.com/en-us/download/confirmation.aspx?id=14632
Create C:\temp and C:\temp\hive
Fix permissions:
C:\Users\maz>%HADOOP_HOME%\bin\winutils.exe chmod 777 /tmp/hive
C:\Users\maz>%HADOOP_HOME%\bin\winutils.exe chmod 777 /tmp/
Either create a conda env for Python 3.6, install pyspark==2.4.6 spark-nlp numpy, and use Jupyter/the Python console; or, in the same conda env, go to the Spark bin directory and run pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.5.
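Once those environment variables are in place, a minimal sanity check from a fresh notebook might look like the sketch below (the paths are the ones assumed in the steps above; adjust them to your actual install locations):
import os
# Paths assumed from the steps above; change them if you installed elsewhere.
os.environ["JAVA_HOME"] = r"C:\java"
os.environ["HADOOP_HOME"] = r"C:\hadoop"
os.environ["SPARK_HOME"] = r"C:\spark"

import sparknlp
from sparknlp.pretrained import PretrainedPipeline

spark = sparknlp.start()
print("Spark NLP version:", sparknlp.version())
print("Apache Spark version:", spark.version)

# If the setup is correct, the pretrained download should now succeed.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')
print(pipeline.annotate("Spark NLP pipelines can run on Windows too."))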

Related

PySpark on Windows - RuntimeError: Java gateway process exited before sending its port number

I have been trying to install PySpark on Windows since yesterday, but I keep getting this error. It's been more than 48 hours; I have tried everything to resolve the problem and reinstalled PySpark from scratch numerous times, but I still could not get it to work.
Whenever I run:
spark = SparkSession.builder.getOrCreate()
I get this error:
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_20592/2335384691.py in <module>
1 # create a spark session
----> 2 spark = SparkSession.builder.getOrCreate()
c:\users\bhola\appdata\local\programs\python\python38\lib\site-packages\pyspark\sql\session.py in getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
c:\users\bhola\appdata\local\programs\python\python38\lib\site-packages\pyspark\context.py in getOrCreate(cls, conf)
390 with SparkContext._lock:
391 if SparkContext._active_spark_context is None:
--> 392 SparkContext(conf=conf or SparkConf())
393 return SparkContext._active_spark_context
394
c:\users\bhola\appdata\local\programs\python\python38\lib\site-packages\pyspark\context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
142 " is not allowed as it is a security risk.")
143
--> 144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
c:\users\bhola\appdata\local\programs\python\python38\lib\site-packages\pyspark\context.py in _ensure_initialized(cls, instance, gateway, conf)
337 with SparkContext._lock:
338 if not SparkContext._gateway:
--> 339 SparkContext._gateway = gateway or launch_gateway(conf)
340 SparkContext._jvm = SparkContext._gateway.jvm
341
c:\users\bhola\appdata\local\programs\python\python38\lib\site-packages\pyspark\java_gateway.py in launch_gateway(conf, popen_kwargs)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise RuntimeError("Java gateway process exited before sending its port number")
109
110 with open(conn_info_file, "rb") as info:
RuntimeError: Java gateway process exited before sending its port number
I tried the solution given in this Stack Overflow post and in this second Stack Overflow post:
export PYSPARK_SUBMIT_ARGS="--master local[2] pyspark-shell"
On my Windows system I used variable name = PYSPARK_SUBMIT_ARGS and variable value = "--master local[2] pyspark-shell".
But it's not working.
Other system variables that were set on my machine during installation are:
SPARK_HOME = D:\spark\spark-3.2.0-bin-hadoop3.2
HADOOP_HOME = D:\spark\spark-3.2.0-bin-hadoop3.2
Path = D:\spark\spark-3.2.0-bin-hadoop3.2\bin
PYSPARK_DRIVER_PYTHON = jupyter
PYSPARK_DRIVER_PYTHON_OPTS = jupyter
JAVA_HOME = C:\Program Files\Java\jdk1.8.0_301
Can anyone help me with this?
Did you download winutils.exe from https://github.com/kontext-tech/winutils? You'll need to put it in \Hadoop\bin and add the paths, etc.
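As a quick sanity check before building the session, you can verify from the notebook that the variables point where you expect and that winutils.exe is actually reachable (a sketch; the variable names are the ones listed in the question):
import os
from pyspark.sql import SparkSession

# Print the variables the gateway depends on; they should not be empty or wrong.
for var in ("JAVA_HOME", "HADOOP_HOME", "SPARK_HOME"):
    print(var, "=", os.environ.get(var))

# On Windows, winutils.exe must sit under %HADOOP_HOME%\bin.
winutils = os.path.join(os.environ.get("HADOOP_HOME", ""), "bin", "winutils.exe")
print("winutils.exe found:", os.path.isfile(winutils))

# If the paths check out, the session should come up without the gateway error.
spark = SparkSession.builder.master("local[2]").appName("gateway-check").getOrCreate()
print(spark.version)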

Can't load spaCy en_core_web_trf

As the install guide says, I've installed it with (in a conda environment):
conda install -c conda-forge spacy
python -m spacy download en_core_web_trf
I have spacy-transformers already installed. But when I simply do:
import spacy
spacy.load("en_core_web_trf")
It shows me this error:
ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).
Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, parser, beam_parser, entity_linker, ner, beam_ner, entity_ruler, lemmatizer, tagger, morphologizer, senter, sentencizer, textcat, spancat, textcat_multilabel, en.lemmatizer
More info about the error:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11108/2648447056.py in <module>
----> 1 nlp_en = spacy.load("en_core_web_trf")
~\Anaconda3\envs\rl\lib\site-packages\spacy\__init__.py in load(name, vocab, disable, exclude, config)
49 RETURNS (Language): The loaded nlp object.
50 """
---> 51 return util.load_model(
52 name, vocab=vocab, disable=disable, exclude=exclude, config=config
53 )
~\Anaconda3\envs\rl\lib\site-packages\spacy\util.py in load_model(name, vocab, disable, exclude, config)
345 return get_lang_class(name.replace("blank:", ""))()
346 if is_package(name): # installed as package
--> 347 return load_model_from_package(name, **kwargs)
348 if Path(name).exists(): # path to model data directory
349 return load_model_from_path(Path(name), **kwargs)
~\Anaconda3\envs\rl\lib\site-packages\spacy\util.py in load_model_from_package(name, vocab, disable, exclude, config)
378 """
379 cls = importlib.import_module(name)
--> 380 return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)
381
382
~\Anaconda3\envs\rl\lib\site-packages\en_core_web_trf\__init__.py in load(**overrides)
8
9 def load(**overrides):
---> 10 return load_model_from_init_py(__file__, **overrides)
~\Anaconda3\envs\rl\lib\site-packages\spacy\util.py in load_model_from_init_py(init_file, vocab, disable, exclude, config)
538 if not model_path.exists():
539 raise IOError(Errors.E052.format(path=data_path))
--> 540 return load_model_from_path(
541 data_path,
542 vocab=vocab,
~\Anaconda3\envs\rl\lib\site-packages\spacy\util.py in load_model_from_path(model_path, meta, vocab, disable, exclude, config)
413 overrides = dict_to_dot(config)
414 config = load_config(config_path, overrides=overrides)
--> 415 nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
416 return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
417
~\Anaconda3\envs\rl\lib\site-packages\spacy\util.py in load_model_from_config(config, vocab, disable, exclude, auto_fill, validate)
450 # registry, including custom subclasses provided via entry points
451 lang_cls = get_lang_class(nlp_config["lang"])
--> 452 nlp = lang_cls.from_config(
453 config,
454 vocab=vocab,
~\Anaconda3\envs\rl\lib\site-packages\spacy\language.py in from_config(cls, config, vocab, disable, exclude, meta, auto_fill, validate)
1712 # The pipe name (key in the config) here is the unique name
1713 # of the component, not necessarily the factory
-> 1714 nlp.add_pipe(
1715 factory,
1716 name=pipe_name,
~\Anaconda3\envs\rl\lib\site-packages\spacy\language.py in add_pipe(self, factory_name, name, before, after, first, last, source, config, raw_config, validate)
774 lang_code=self.lang,
775 )
--> 776 pipe_component = self.create_pipe(
777 factory_name,
778 name=name,
~\Anaconda3\envs\rl\lib\site-packages\spacy\language.py in create_pipe(self, factory_name, name, config, raw_config, validate)
639 lang_code=self.lang,
640 )
--> 641 raise ValueError(err)
642 pipe_meta = self.get_factory_meta(factory_name)
643 # This is unideal, but the alternative would mean you always need to
Are you sure you installed spacy-transformers, after installing spaCy?
I am using pip:
pip install spacy-transformers
and I have no problems loading en_core_web_trf.
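To confirm the install worked, a minimal check in the same environment is just loading the model and running a sentence through it:
import spacy

# Loading en_core_web_trf requires spacy-transformers to be installed.
nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")
print([(ent.text, ent.label_) for ent in doc.ents])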
For anyone who tried this solution but still did not get it to work: something that was not mentioned (because it is trivial) but had me staring at the code for ages is that, after the
!pip install spacy-transformers
you still need to place
import spacy_transformers
at the top of your code.
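In other words, a working cell in that situation looks roughly like this; the explicit import registers the 'transformer' factory before the model is loaded:
import spacy_transformers  # registers the 'transformer' pipeline factory
import spacy

nlp = spacy.load("en_core_web_trf")
print(nlp.pipe_names)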
It's working fine with:
!pip install spacy
!pip install spacy-transformers
!python3 -m spacy download en_core_web_trf
If you are working in Google Colab, also pin spaCy to its latest version (e.g. !pip install spacy==3.2.4), since Colab installs an older version by default. The remaining steps are the same. Hope this helps!

VS Code: "Open Folder" make my PySpark not work?

I am using Anaconda on Windows 10 x64. I'm using VS Code. Recently I successfully ran the following:
(I)
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.master("local[1]").appName("local").getOrCreate()
rdd=spark.sparkContext.parallelize([1,2,3,4,56])
print(rdd.count())
Then I clicked File -> Open Folder and opened a folder, so now it appears in a pane on the left-hand side of my screen. Question #1: What does this do? I used to think that it was just a quick way to see some frequently-used files.
Now that my folder is in my left-hand pane, the above code errors out (see below) with an error that includes the phrase Python worker failed to connect back. Question #2: Why does this happen?
Question #3: If I want to be able to avoid the above error while also having a folder open in VS Code, what should I do? Any ideas about what settings I should look at?
If I close the folder, my code works again.
-----------------------------------------------------------------------------------------------------------------------------------------
The Error: Every time it gives me a slightly different error, but it always begins with something like this:
Py4JJavaError Traceback (most recent call last)
\\pathtomyfile\temp.py in
----> 240 spark.createDataFrame(pandas_df).toDF(*columns).show()
C:\Spark\spark-3.1.1-bin-hadoop2.7\python\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
482 """
483 if isinstance(truncate, bool) and truncate:
--> 484 print(self._jdf.showString(n, 20, vertical))
485 else:
486 print(self._jdf.showString(n, int(truncate), vertical))
C:\Spark\spark-3.1.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
C:\Spark\spark-3.1.1-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
C:\Spark\spark-3.1.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o77.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 3) (blablabla executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
<a bunch more stuff after that>
(It's always "Task 0 in stage x failed y times", but x isn't always 3 and y isn't always 1.)
-----------------------------------------------------------------------------------------------------------------------------------------
Other Results: I get a similar error if I run the following:
(II)
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
spark=SparkSession.builder.master("local[1]").appName("local").getOrCreate()
pandas_df = pd.DataFrame(np.random.randint(0,100, size=(5,5)))
columns = ["col"+str(i) for i in range(5)]
print(columns)
display(pandas_df.head())
spark.createDataFrame(pandas_df).toDF(*columns).show()
I don't get an error if I run the following:
(III)
from pyspark.sql import SparkSession
spark=SparkSession.builder.master("local[1]").appName("local").getOrCreate()
rocks = spark.read.format("csv").option("header", "true").load("C:\\rocksamples.csv")
rocks.show(10)
Also, none of the above three code blocks (I, II, III) gives an error if I run it from an .ipynb file inside the folder I opened.
-----------------------------------------------------------------------------------------------------------------------------------------
Background: I have the following files and folders on my machine:
Anaconda directory: C:\users\me\Anaconda3
Spark directory: C:\Spark\spark-3.1.1-bin-hadoop2.7
Java directory: C:\Spark\java\jre1.8.0_231
Nothing called pyspark in C:\users\me\Anaconda3\Lib\site-packages
a .csv file at C:\rocksamples.csv
Winutils file in C:\Hadoop\bin\winutils.exe
My current environment variables (which I would like to clean up but now I'm afraid to) include the following:
HADOOP_HOME=C:\Hadoop
JAVA_HOME=C:\Spark\java\jre1.8.0_231
Path=(other stuff);C:\Spark\spark-3.1.1-bin-hadoop2.7\bin;C:\Spark\java\jre1.8.0_231\bin
PYSPARK_DRIVER_PYTHON=jupyter
PYSPARK_DRIVER_PYTHON_OPTS=notebook
PYSPARK_PYTHON=python
PYTHONPATH=C:\Spark\spark-3.1.1-bin-hadoop2.7\python
SPARK_HOME=C:\Spark\spark-3.1.1-bin-hadoop2.7
I think Python is finding the correct PySpark, because if I try from pyspark import this_does_not_exist I get ImportError: cannot import name 'this_does_not_exist' from 'pyspark' (C:\Spark\spark-3.1.1-bin-hadoop2.7\python\pyspark\__init__.py) .
The folder I opened using "open folder" in VS Code is on a UNC path containing a space (i.e. \\blablabla\bla\my folder).

spark-nlp 'JavaPackage' object is not callable

I am using JupyterLab to run spark-nlp text analysis. At the moment I am just running the sample code:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
#create or get Spark Session
#spark = sparknlp.start()
spark = SparkSession.builder \
.appName("ner")\
.master("local[4]")\
.config("spark.driver.memory","8G")\
.config("spark.driver.maxResultSize", "2G") \
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.5")\
.config("spark.kryoserializer.buffer.max", "500m")\
.getOrCreate()
print("sparknlp version", sparknlp.version(), "sparkversion", spark.version)
#download, load, and annotate a text by pre-trained pipeline
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
result = pipeline.annotate('Harry Potter is a great movie')
I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfd6884be04c> in <module>
15
16 #download, load, and annotate a text by pre-trained pipeline
---> 17 pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
18 result = pipeline.annotate('Harry Potter is a great movie')
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
49 def downloadPipeline(name, language, remote_loc=None):
50 print(name + " download started this may take some time.")
---> 51 file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
52 if file_size == "-1":
53 print("Can not find the model to download please check the name!")
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
190 def __init__(self, name, language, remote_loc):
191 super(_GetResourceSize, self).__init__(
--> 192 "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
193
194
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
~/.pyenv/versions/3.7.9/lib/python3.7/site-packages/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
67 java_obj = getattr(java_obj, name)
68 java_args = [_py2java(sc, arg) for arg in args]
---> 69 return java_obj(*java_args)
70
71 @staticmethod
TypeError: 'JavaPackage' object is not callable
I read a few of the GitHub issues developers have raised in the spark-nlp repo, but the fixes are not working for me. I am wondering if the use of pyenv is causing problems, but it works for everything else.
My jupyter lab is launched like so:
/home/myuser/.pyenv/shims/jupyter lab --no-browser --allow-root --notebook-dir /home/myuser/workdir/notebooks
My env configuration:
ubuntu: 20.10
Apache Spark: 3.0.1
pyspark: 2.4.4
spark-nlp: 2.6.5
pyenv: 1.2.21
Java:
openjdk 11.0.9 2020-10-20
OpenJDK Runtime Environment (build 11.0.9+10-post-Ubuntu-0ubuntu1)
OpenJDK 64-Bit Server VM (build 11.0.9+10-post-Ubuntu-0ubuntu1, mixed mode, sharing)
jupyter:
jupyter core : 4.7.0
jupyter-notebook : 6.1.5
qtconsole : 5.0.1
ipython : 7.19.0
ipykernel : 5.4.2
jupyter client : 6.1.7
jupyter lab : 2.2.9
nbconvert : 6.0.7
ipywidgets : 7.5.1
nbformat : 5.0.8
traitlets : 5.0.5
I appreciate your help. Thank you.
Remove Spark 3.0.1 and leave just PySpark 2.4.x, as Spark NLP still doesn't support Spark 3.x. Use Java 8 instead of Java 11, because Java 11 is not supported by Spark 2.4.
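A quick way to confirm the versions line up before downloading anything is a sketch like this (the version numbers are the ones used in this thread):
import pyspark
import sparknlp

print("PySpark:", pyspark.__version__)   # should be 2.4.x for spark-nlp 2.6.x
print("Spark NLP:", sparknlp.version())  # 2.6.5 in this setup

# With matching versions and Java 8 on the PATH, start() should work again:
spark = sparknlp.start()
print("Spark:", spark.version)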

AttributeError when running unittest sample in iPy Notebook

I am new to IPython and trying to help another developer get started, and we are both hitting the same issues.
We are attempting to run the Python unittest sample in IPython from https://docs.python.org/2/library/unittest.html#basic-example
The code runs just fine from the command line on Windows and Ubuntu without ANY modifications.
The exact same code run from an IPython notebook generates the following exception:
AttributeError: 'module' object has no attribute '/home/myuser/'
The filename is: /home/myuser/example_unittest.ipynb
I have dug through the IPython docs and Google with no luck so far. Any debugging tips or clues to solving this issue are appreciated.
(full stack):
AttributeError Traceback (most recent call last)
<ipython-input-2-39bc0ec16f11> in <module>()
28
29 if __name__ == '__main__':
---> 30 unittest.main()
31
/usr/lib/python2.7/unittest/main.pyc in __init__(self, module, defaultTest, argv, testRunner, testLoader, exit, verbosity, failfast, catchbreak, buffer)
92 self.testLoader = testLoader
93 self.progName = os.path.basename(argv[0])
---> 94 self.parseArgs(argv)
95 self.runTests()
96
/usr/lib/python2.7/unittest/main.pyc in parseArgs(self, argv)
147 else:
148 self.testNames = (self.defaultTest,)
--> 149 self.createTests()
150 except getopt.error, msg:
151 self.usageExit(msg)
/usr/lib/python2.7/unittest/main.pyc in createTests(self)
156 else:
157 self.test = self.testLoader.loadTestsFromNames(self.testNames,
--> 158 self.module)
159
160 def _do_discovery(self, argv, Loader=loader.TestLoader):
/usr/lib/python2.7/unittest/loader.pyc in loadTestsFromNames(self, names, module)
126 of string specifiers. See 'loadTestsFromName()'.
127 """
--> 128 suites = [self.loadTestsFromName(name, module) for name in names]
129 return self.suiteClass(suites)
130
/usr/lib/python2.7/unittest/loader.pyc in loadTestsFromName(self, name, module)
98 obj = module
99 for part in parts:
--> 100 parent, obj = obj, getattr(obj, part)
101
102 if isinstance(obj, types.ModuleType):
AttributeError: 'module' object has no attribute '/home/myuser/'
unittest.main() is primarily for command-line execution.
In order to run a unittest in the IPython notebook, remove the if __name__ == '__main__' part of the code and, in a new cell, create a test suite and then run it using TextTestRunner:
suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceFunctions)
unittest.TextTestRunner().run(suite)
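Put together, a self-contained notebook cell following that approach could look like this (TestSequenceFunctions here is a minimal stand-in for the test case from the docs example):
import unittest

class TestSequenceFunctions(unittest.TestCase):
    # Minimal stand-in for the docs example's test case.
    def test_upper(self):
        self.assertEqual('foo'.upper(), 'FOO')

suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceFunctions)
unittest.TextTestRunner(verbosity=2).run(suite)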
The reason you are getting that error is that unittest.main looks at the command-line arguments (sys.argv), and the first argument is whatever started IPython or Jupyter.
Change your code to:
if __name__ == '__main__':
unittest.main(argv=['first-arg-is-ignored'], exit=False)
In the notebook, you will also want to include exit=False to prevent unittest.main from trying to shut down the kernel process.
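So a full cell that runs cleanly in the notebook would look something like this (again with a hypothetical minimal test case):
import unittest

class TestSequenceFunctions(unittest.TestCase):
    def test_upper(self):
        self.assertEqual('foo'.upper(), 'FOO')

if __name__ == '__main__':
    # argv keeps unittest from parsing the kernel's own command line;
    # exit=False keeps it from calling sys.exit() and killing the kernel.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)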
