I used Python and boto3 to process some S3 files on the spark, and when I downloaded the files, it was unusual: The 's3'resource does not exist.
Because boto3 is not installed on each cluster node, I packaged the dependency packages used by boto3 as zip and used the spark cluster submitted by -- py-files, and this exception occurred.
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-3-8147865bf49c> in <module>()
2
3
----> 4 extractor.extract(paths)
/usr/local/lib/python2.7/site-packages/extract-1.0-py2.7.egg/extract.pyc in extract(self, target_files_path)
52 try:
53 sc = self.get_spark_context()
---> 54 self._extract_file(sc, target_files_path)
55 finally:
56 if sc:
/usr/local/lib/python2.7/site-packages/extract-1.0-py2.7.egg/extract.pyc in _extract_file(self, sc, target_files_path)
109 def _extract_file(self, sc, target_files_path):
110 file_rdd = sc.parallelize(target_files_path, len(target_files_path))
--> 111 result_rdd = file_rdd.map(lambda file_path: self.process(file_path, self.func)).collect()
112 result_rdd.saveAsTextFile(self.result_path)
113
/usr/lib/spark/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 0.0 failed 4 times, most recent failure: Lost task 2.3 in stage 0.0 (TID 15, ip-172-20-219-210.corp.hpicloud.net): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/local/lib/python2.7/site-packages/extract-1.0-py2.7.egg/extract.py", line 111, in <lambda>
File "./lib.zip/extract.py", line 115, in process
local_path = download_file_from_s3(self.app_name, file_path)
File "./lib.zip/extract.py", line 22, in download_file_from_s3
s3 = boto3.resource('s3')
File "./lib.zip/boto3/__init__.py", line 100, in resource
return _get_default_session().resource(*args, **kwargs)
File "./lib.zip/boto3/session.py", line 347, in resource
has_low_level_client)
ResourceNotExistsError: The 's3' resource does not exist.
The available resources are:
-
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/mnt/yarn/usercache/chqiang/appcache/application_1521024688288_67008/container_1521024688288_67008_01_000002/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/local/lib/python2.7/site-packages/extract-1.0-py2.7.egg/extract.py", line 111, in <lambda>
File "./lib.zip/extract.py", line 115, in process
local_path = download_file_from_s3(self.app_name, file_path)
File "./lib.zip/extract.py", line 22, in download_file_from_s3
s3 = boto3.resource('s3')
File "./lib.zip/boto3/__init__.py", line 100, in resource
return _get_default_session().resource(*args, **kwargs)
File "./lib.zip/boto3/session.py", line 347, in resource
has_low_level_client)
ResourceNotExistsError: The 's3' resource does not exist.
The available resources are:
-
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
... 1 more
Could you please help me? thank you!
I ran into the same error today. I tried fixing it using client instead of resource, however that gave out another error.
botocore.exceptions.DataNotFoundError: Unable to load data for: endpoints
I googled a bit and came to the conclusion that it is not possible to zip up the boto3 package since packaging it results in loss of a few .json files. The endpoints.json is one of them. It exists in your botocore/data directory however when you zip it, it doesn't have it.
The solution is to install boto3 during bootstrap. You can create a bootstrap file and provide it during the cluster build-up process (there is an option in the AWS EMR console for it). This will install boto3 on the master node and all of the slave nodes.
You can refer to the following answer as well :
boto3 cannot create client on pyspark worker?
I packaged the dependency packages as whl files instead of zip packages, then added them all to the --py-files parameter(eg. a.whl,b.whl,c.whl), changed s3=boto3.resource('s3') in the code to client=boto3.client('s3') and tried to succeed
Related
I am using hyperopt. I run following code in python:
from hyperopt import hp, fmin, tpe, rand, SparkTrials, STATUS_OK, STATUS_FAIL, space_eval
trials = SparkTrials()
best = fmin(fn=self.objective, space=space, algo=tpe.suggest, trials=trials, max_evals=iteration_number)
but I got following error:
File "~/classes/Algorithm.py", line 507, in random_hyperopt
trials = SparkTrials()
File "~/.local/lib/python3.8/site-packages/hyperopt/spark.py", line 93, in __init__
max_num_concurrent_tasks = self._spark_context._jsc.sc().maxNumConcurrentTasks()
File "/opt/spark-3.1.1/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/opt/spark-3.1.1/python/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/opt/spark-3.1.1/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 330, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling o25.maxNumConcurrentTasks. Trace:
py4j.Py4JException: Method maxNumConcurrentTasks([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Anybody knows how can I fix it?
If you install hyperopt using the below command, you will include all updates that are merged on github but not yet merged on pypi.
%pip install -I git+https://github.com/hyperopt/hyperopt.git
It is the first time I am using Pyspark. I would like to create an ETL which extract from the API and put the data into a database in my local environment. But I have an error to call the API as shown below. Any help would be appreciated.
t
t
The error:
Traceback (most recent call last):
File "etl.py", line 9, in <module>
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 184, in load
return self._df(self._jreader.load())
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o32.load.
: java.lang.ClassNotFoundException: Failed to find data source: org.apache.dsext.spark.datasource.rest.RestDataSource. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:679)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:733)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:248)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:221)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: org.apache.dsext.spark.datasource.rest.RestDataSource.DefaultSource
at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:653)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:653)
at scala.util.Failure.orElse(Try.scala:224)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:653)
... 14 more
My code:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("{your app name here}").getOrCreate()
uri = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=JPY&limit=30&aggregate=1&e=CCCAGG"
options = { 'url' : uri, 'method' : 'GET', 'readTimeout' : '10000', 'connectionTimeout' : '2000', 'partitions' : '10'}
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
df.printSchema()
JAVA version:
openjdk 11.0.9.1 2020-11-04
OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
I believe that this issue was raised due to a missing dependency.
In the code, you mentioned org.apache.dsext.spark.datasource.rest.RestDataSource as your format, this particular functionality is not inbuild in spark but depends on third party package called REST Data Source
you need to create a jar file by building the codebase and add it to your spark as follows:
$SPARK_HOME/bin/spark-shell --jars spark-datasource-rest_2.11-2.1.0-SNAPSHOT.jar --packages org.scalaj:scalaj-http_2.10:2.3.0
I have set up a spark cluster and all the nodes have access to network shared storage where they can access a file to read. I am running this in a python jupyter notebook. It was working a few days ago, and now it stopped working but I'm not sure why, or what I have changed.
I have tried restarting the nodes and master.
I have also tried copying the csv file to a new directory and pointing the spark.read there, but it still gives the same error.
When I delete the csv file, it gives a much shorter error saying 'File not found'
Any help would be greatly appreciated.
This is my code:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
spark = SparkSession.builder \
.master("spark://IP:PORT") \
.appName("app_1") \
.config(conf=SparkConf()) \
.getOrCreate()
df = spark.read.csv("/nas/file123.csv")
string1 = df.rdd.map(lambda x: x.column1).collect()
However, I get this error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-2-12bd938122cd> in <module>()
29
30
---> 31 string1 = df.rdd.map(lambda x: x.column1).collect()
32
33
/home/hjk/Downloads/spark-2.1.0-bin-hadoop2.7/python/pyspark/rdd.pyc in collect(self)
807 """
808 with SCCallSiteSync(self.context) as css:
--> 809 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
810 return list(_load_from_socket(port, self._jrdd_deserializer))
811
/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/home/hjk/Downloads/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/utils.pyc in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/lib/python2.7/dist-packages/py4j/protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 3.0 failed 4 times, most recent failure: Lost task 4.3 in stage 3.0 (TID 37, executor 2): java.io.FileNotFoundException: File file:/nas/file123.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:157)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:102)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.hasNext(SerDeUtil.scala:117)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:112)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:504)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:328)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: File file:/nas/file123.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:157)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:102)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.hasNext(SerDeUtil.scala:117)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:112)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:504)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:328)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
From the error it looks like it is checking the file on your local system. Just make sure that you have file present on specified Path. Also try below suggestions.
try with file URI : file:///nas/file123.csv
Upload the file on HDFS and try to read the file from HDFS URI like hdfs:///...
Hope this helps.
Regards,
Neeraj
If you are loading the data from local directory, remember to make sure file exists in all of your worker nodes.
I'm running a pyspark script and encountered an error below. It seems saying "RuntimeError: Set changed size during iteration" due to my code "if len(rdd.take(1)) > 0:". I'm not sure if that's the real reason and wonder what exactly went wrong. Any help will be greatly appreciated.
thanks!
17/03/23 21:54:17 INFO DStreamGraph: Updated checkpoint data for time 1490320070000 ms
17/03/23 21:54:17 INFO JobScheduler: Finished job streaming job 1490320072000 ms.0 from job set of time 1490320072000 ms
17/03/23 21:54:17 INFO JobScheduler: Starting job streaming job 1490320072000 ms.1 from job set of time 1490320072000 ms
17/03/23 21:54:17 ERROR JobScheduler: Error running job streaming job 1490320072000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/util.py",
line 65, in call
r = self.func(t, *rdds)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/dstream.py",
line 159, in
func = lambda t, rdd: old_func(rdd)
File "/home/richard/Documents/spark_code/with_kafka/./mongo_kafka_spark_script.py",
line 96, in _compute_glb_max
if len(rdd.take(1)) > 0:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1343, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 965, in runJob
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2439, in _jrdd
self._jrdd_deserializer, profiler)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2372, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2363, in _prepare_for_python_RDD
broadcast_vars = [x._jbroadcast for x in sc._pickled_broadcast_vars]
RuntimeError: Set changed size during iteration
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:253)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Traceback (most recent call last):
File "/home/richard/Documents/spark_code/with_kafka/./mongo_kafka_spark_script.py",
line 224, in
ssc.awaitTermination();
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/context.py",
line 206, in awaitTermination
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py",
line 1133, in call
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63,
in deco
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line
319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o38.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/util.py",
line 65, in call
r = self.func(t, *rdds)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/dstream.py",
line 159, in
func = lambda t, rdd: old_func(rdd)
File "/home/richard/Documents/spark_code/with_kafka/./mongo_kafka_spark_script.py",
line 96, in _compute_glb_max
if len(rdd.take(1)) > 0:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1343, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 965, in runJob
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2439, in _jrdd
self._jrdd_deserializer, profiler)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2372, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2363, in _prepare_for_python_RDD
broadcast_vars = [x._jbroadcast for x in sc._pickled_broadcast_vars]
RuntimeError: Set changed size during iteration
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:253)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
It seems not to be the best practice creating broadcast variables among iterations. Always use updateStateByKey if possible when stateful data is required.
try
if rdd.count() <1 :
take() can give exceptions, but, if more details were available we could have pinpointed the error.
I installed Spark 1.4.1 in my Hadoop 2.6.0 and I tried to run the following PySpark command to count the number of lines. It throwing the following error. I am new for Spark and could not able to find the error.
Can anyone provide the solution.
>>> distFile = sc.textFile("/home/hduser2/spark-1.4.1-bin-hadoop2.6/README.md")
15/12/31 09:31:50 INFO storage.MemoryStore: ensureFreeSpace(213560) called with curMem=695185, maxMem=278019440
15/12/31 09:31:50 INFO storage.MemoryStore: Block broadcast_10 stored as values in memory (estimated size 208.6 KB, free 264.3 MB)
15/12/31 09:31:50 INFO storage.MemoryStore: ensureFreeSpace(19929) called with curMem=908745, maxMem=278019440
15/12/31 09:31:50 INFO storage.MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 19.5 KB, free 264.3 MB)
15/12/31 09:31:50 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on localhost:60765 (size: 19.5 KB, free: 265.1 MB)
15/12/31 09:31:50 INFO spark.SparkContext: Created broadcast 10 from textFile at NativeMethodAccessorImpl.java:-2
>>> distFile.count()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/spark/python/pyspark/rdd.py", line 984, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/local/spark/python/pyspark/rdd.py", line 975, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "/usr/local/spark/python/pyspark/rdd.py", line 852, in fold
vals = self.mapPartitions(func).collect()
File "/usr/local/spark/python/pyspark/rdd.py", line 757, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/local/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/usr/local/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://localhost:9000/home/hduser2/spark-1.4.1-bin-hadoop2.6/README.md
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:58)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1781)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:885)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:286)
at org.apache.spark.rdd.RDD.collect(RDD.scala:884)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:378)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
You say the file is in your local filesystem, but the error states it is looking for the file on HDFS.
Input path does not exist: hdfs://localhost:9000/home/hduser2/spark-1.4.1-bin-hadoop2.6/README.md.
Spark executes lazily, which means it won't actually read the file until it needs to, e.g. calling count(). That explains why the previous line doesn't error.
You can either move your file to that path in HDFS, or setup your SparkContext in local mode.