Pyspark read REST API - python

This is the first time I am using PySpark. I would like to build an ETL job that extracts data from an API and loads it into a database in my local environment, but I get an error when calling the API, as shown below. Any help would be appreciated.
The error:
Traceback (most recent call last):
File "etl.py", line 9, in <module>
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 184, in load
return self._df(self._jreader.load())
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o32.load.
: java.lang.ClassNotFoundException: Failed to find data source: org.apache.dsext.spark.datasource.rest.RestDataSource. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:679)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:733)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:248)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:221)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: org.apache.dsext.spark.datasource.rest.RestDataSource.DefaultSource
at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:653)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:653)
at scala.util.Failure.orElse(Try.scala:224)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:653)
... 14 more
My code:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("{your app name here}").getOrCreate()
uri = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=JPY&limit=30&aggregate=1&e=CCCAGG"
options = { 'url' : uri, 'method' : 'GET', 'readTimeout' : '10000', 'connectionTimeout' : '2000', 'partitions' : '10'}
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
df.printSchema()
Java version:
openjdk 11.0.9.1 2020-11-04
OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)

I believe this issue is caused by a missing dependency.
In your code you use org.apache.dsext.spark.datasource.rest.RestDataSource as the format. This functionality is not built into Spark; it comes from a third-party package called REST Data Source.
You need to build that codebase into a jar file and add it to your Spark application as follows:
$SPARK_HOME/bin/spark-shell --jars spark-datasource-rest_2.11-2.1.0-SNAPSHOT.jar --packages org.scalaj:scalaj-http_2.10:2.3.0
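If you would rather keep everything inside the PySpark script, here is a minimal sketch of the same idea. The jar path and package coordinates are assumptions taken from the spark-shell command above; use whatever artifact you actually built.
from pyspark.sql import SparkSession

# Sketch only: point Spark at the locally built REST data source jar and its
# scalaj-http dependency when the session is created. Adjust the jar path and
# coordinates to match the artifact you built.
spark = (
    SparkSession.builder
    .appName("rest-datasource-example")
    .config("spark.jars", "/path/to/spark-datasource-rest_2.11-2.1.0-SNAPSHOT.jar")
    .config("spark.jars.packages", "org.scalaj:scalaj-http_2.10:2.3.0")
    .getOrCreate()
)

uri = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=JPY&limit=30&aggregate=1&e=CCCAGG"
options = {"url": uri, "method": "GET", "readTimeout": "10000",
           "connectionTimeout": "2000", "partitions": "10"}

df = (spark.read
      .format("org.apache.dsext.spark.datasource.rest.RestDataSource")
      .options(**options)
      .load())
df.printSchema()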

Related

HDP 2.6.5 : java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource

I'm using HDP 2.6.5 (the sandbox), and when I try to save my data (read from a JSON file) to a MongoDB database I get this problem.
My code:
#HDFS-Mongo Used to write the json file from HDFS to Mongo DB
from pyspark.sql import SparkSession
my_spark = SparkSession \
.builder \
.appName("testdb") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/testdb.test1") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/testdb.test1") \
.getOrCreate()
df = my_spark.read.option("multiline", "true").json("hdfs://sandbox-hdp.hortonworks.com:8020/user/root/output2.json")
df.count()
df.printSchema()
df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").option("database","testdb").option("collection", "test1").save()
The errors:
Traceback (most recent call last):
File "hdfs_mongo.py", line 19, in <module>
df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").option("database","testdb").option("collection", "test1").save()
File "/usr/local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 738, in save
self._jwrite.save()
File "/usr/local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1322, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/usr/local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o44.save.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:443)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:670)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:720)
at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:852)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:656)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:656)
at scala.util.Failure.orElse(Try.scala:224)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:656)
... 16 more
I already tried uninstalling Python and installing it again (Python 3.6). I also use the following code and run it, but it doesn't work:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://xxx.xxx.xxx.xxx:27017/sample1.zips") \
.config("spark.mongodb.output.uri", "mongodb://xxx.xxx.xxx.xxx:27017/sample1.zips") \
.config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
df.printSchema()

hyperopt SparkTrials maxNumConcurrentTasks([]) does not exist

I am using hyperopt. I run the following code in Python:
from hyperopt import hp, fmin, tpe, rand, SparkTrials, STATUS_OK, STATUS_FAIL, space_eval
trials = SparkTrials()
best = fmin(fn=self.objective, space=space, algo=tpe.suggest, trials=trials, max_evals=iteration_number)
but I got the following error:
File "~/classes/Algorithm.py", line 507, in random_hyperopt
trials = SparkTrials()
File "~/.local/lib/python3.8/site-packages/hyperopt/spark.py", line 93, in __init__
max_num_concurrent_tasks = self._spark_context._jsc.sc().maxNumConcurrentTasks()
File "/opt/spark-3.1.1/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/opt/spark-3.1.1/python/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/opt/spark-3.1.1/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 330, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling o25.maxNumConcurrentTasks. Trace:
py4j.Py4JException: Method maxNumConcurrentTasks([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Does anybody know how I can fix it?
If you install hyperopt using the command below, you will get all updates that have been merged on GitHub but not yet released on PyPI.
%pip install -I git+https://github.com/hyperopt/hyperopt.git
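As a quick sanity check after reinstalling (a minimal sketch, not from the original answer; the parallelism value is just an example), you can confirm that the freshly installed hyperopt now constructs SparkTrials against your Spark 3.1 environment without the maxNumConcurrentTasks error:
import hyperopt
from pyspark.sql import SparkSession
from hyperopt import SparkTrials

# Should print the version string of the build installed from GitHub.
print(hyperopt.__version__)

# SparkTrials needs a Spark environment; with a compatible hyperopt build this
# no longer fails with "Method maxNumConcurrentTasks([]) does not exist".
spark = SparkSession.builder.appName("hyperopt-check").getOrCreate()
trials = SparkTrials(parallelism=2)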

How can I run a PySpark job on k8s?

I'm trying to run a hello world Spark application on a k8s cluster. I've built my own Docker image with the script on top of a standard PySpark Docker image, and now I'm trying to run this image on the k8s cluster, but I get the following error. The DNS pods' logs are okay.
My current Dockerfile:
FROM semenchukou/spark-py:v2.4.1
COPY . /app
WORKDIR /app
The command I'm using to deploy the job on k8s:
bin/spark-submit \
--master k8s://https://172.20.234.174:6443 \
--deploy-mode cluster \
--conf spark.executor.instances=2 \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--conf spark.kubernetes.container.image=semenchukou/pyspark-k8s-example:likeEx3 \
--name spark_k8s_hello_world_0 \
--conf spark.kubernetes.pyspark.pythonVersion=3 \
local:///app/HelloWorldSpark.py
Error:
Traceback (most recent call last):
File "/app/HelloWorldSpark.py", line 10, in <module>
.appName("PythonPi")\
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 173, in getOrCreate
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 367, in getOrCreate
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 136, in __init__
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 198, in _do_init
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 306, in _initialize_context
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1525, in __call__
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: org.apache.spark.SparkException: External scheduler cannot be instantiated
at org.apache.spark.SparkContext$.org$apache$spark$SparkContext$$createTaskScheduler(SparkContext.scala:2794)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:493)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: io.fabric8.kubernetes.client.KubernetesClientException: Operation: [get] for kind: [Pod] with name: [sparkk8shelloworld0-1583151334880-driver] in namespace: [default] failed.
at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:64)
at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:72)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.getMandatory(BaseOperation.java:229)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.get(BaseOperation.java:162)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator$$anonfun$1.apply(ExecutorPodsAllocator.scala:57)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator$$anonfun$1.apply(ExecutorPodsAllocator.scala:55)
at scala.Option.map(Option.scala:146)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.<init>(ExecutorPodsAllocator.scala:55)
at org.apache.spark.scheduler.cluster.k8s.KubernetesClusterManager.createSchedulerBackend(KubernetesClusterManager.scala:89)
at org.apache.spark.SparkContext$.org$apache$spark$SparkContext$$createTaskScheduler(SparkContext.scala:2788)
... 13 more
Caused by: java.net.UnknownHostException: kubernetes.default.svc: Try again
at java.net.Inet4AddressImpl.lookupAllHostAddr(Native Method)
at java.net.InetAddress$2.lookupAllHostAddr(InetAddress.java:929)
at java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1324)
at java.net.InetAddress.getAllByName0(InetAddress.java:1277)
at java.net.InetAddress.getAllByName(InetAddress.java:1193)
at java.net.InetAddress.getAllByName(InetAddress.java:1127)
at okhttp3.Dns$1.lookup(Dns.java:39)
at okhttp3.internal.connection.RouteSelector.resetNextInetSocketAddress(RouteSelector.java:171)
at okhttp3.internal.connection.RouteSelector.nextProxy(RouteSelector.java:137)
at okhttp3.internal.connection.RouteSelector.next(RouteSelector.java:82)
at okhttp3.internal.connection.StreamAllocation.findConnection(StreamAllocation.java:171)
at okhttp3.internal.connection.StreamAllocation.findHealthyConnection(StreamAllocation.java:121)
at okhttp3.internal.connection.StreamAllocation.newStream(StreamAllocation.java:100)
at okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:42)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:93)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:120)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at io.fabric8.kubernetes.client.utils.BackwardsCompatibilityInterceptor.intercept(BackwardsCompatibilityInterceptor.java:119)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at io.fabric8.kubernetes.client.utils.ImpersonatorInterceptor.intercept(ImpersonatorInterceptor.java:68)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at io.fabric8.kubernetes.client.utils.HttpClientUtils.lambda$createHttpClient$3(HttpClientUtils.java:110)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:185)
at okhttp3.RealCall.execute(RealCall.java:69)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:404)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:365)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleGet(OperationSupport.java:330)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleGet(OperationSupport.java:311)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleGet(BaseOperation.java:810)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.getMandatory(BaseOperation.java:218)
... 20 more
What am I doing wrong?
The hello world script:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
if __name__ == '__main__':
    spark = SparkSession\
        .builder\
        .appName("PythonEx")\
        .getOrCreate()
    txt = spark.sparkContext.textFile('hdfs://172.20.234.174:1515/testing/testFile.txt')
    first = txt.first()
    spark.sparkContext.parallelize(first).saveAsTextFile('hdfs://172.20.234.174:9000/testing/result.txt')
    spark.stop()
Your error stack trace says:
Caused by: java.net.UnknownHostException: kubernetes.default.svc: Try again
which means that for some reason you either do not have the kubernetes Service in the default namespace, or you have DNS-related problems in your cluster.
This issue has also already been discussed with you here.
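A quick way to check the DNS side of this (a minimal sketch, not part of the original answer) is to resolve the API server's in-cluster name from a pod running your driver image:
import socket

# If in-cluster DNS is healthy this prints the ClusterIP of the kubernetes
# Service in the default namespace; otherwise it fails the same way the
# driver did ("kubernetes.default.svc: Try again").
try:
    print(socket.gethostbyname("kubernetes.default.svc"))
except socket.gaierror as exc:
    print("DNS lookup failed:", exc)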

How to get data from cassandra via pyspark?

I'm trying to get data from Cassandra via PySpark. I got the connector from GitHub, but I failed to make it work.
The following is the code:
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf
#from pyspark.sql import SQLContext
conf = SparkConf() \
.setAppName("PySpark Cassandra Test") \
.setMaster("spark://192.192.141.21:7077") \
.set("spark.cassandra.connection.host", "192.192.141.26:9042")
sc = CassandraSparkContext(conf=conf)
sc.cassandraTable("oltpdb", "XiangWan") \
.select("dt", "wid") \
.where("wid='XiangWan001'", "daybucket in ('20190326')","dt >= '2019-03-26 13:18:03'") \
.collect()
So, with the following command:
spark-submit /root/model/connect_cannandra_via_spark.py
I got the error:
Traceback (most recent call last):
File "/root/model/connect_cannandra_via_spark.py", line 25, in <module>
df = (SQLContext
AttributeError: 'property' object has no attribute 'format'
[root@CDH21 python]# spark-submit /root/model/connect_cannandra_via_spark.py
19/04/11 14:06:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19/04/11 14:06:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
19/04/11 14:06:39 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
Traceback (most recent call last):
File "/root/model/connect_cannandra_via_spark.py", line 12, in <module>
sc.cassandraTable("oltpdb", "XiangWan") \
File "/root/anaconda3/lib/python3.6/site-packages/pyspark_cassandra-0.9.0-py3.6.egg/pyspark_cassandra/context.py", line 33, in cassandraTable
File "/root/anaconda3/lib/python3.6/site-packages/pyspark_cassandra-0.9.0-py3.6.egg/pyspark_cassandra/rdd.py", line 324, in __init__
File "/root/anaconda3/lib/python3.6/site-packages/pyspark_cassandra-0.9.0-py3.6.egg/pyspark_cassandra/rdd.py", line 213, in _helper
File "/root/anaconda3/lib/python3.6/site-packages/pyspark_cassandra-0.9.0-py3.6.egg/pyspark_cassandra/util.py", line 99, in helper
File "/root/anaconda3/lib/python3.6/site-packages/pyspark_cassandra-0.9.0-py3.6.egg/pyspark_cassandra/util.py", line 88, in load_class
File "/root/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/root/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o24.loadClass.
: java.lang.ClassNotFoundException: pyspark_cassandra.PythonHelper
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
What should I do?

Reading google bucket files in spark

I am trying to read a Google bucket file in Spark. I have done the necessary setup as described in https://cloud.google.com/dataproc/docs/connectors/install-storage-connector. Also, the result of
hadoop fs -ls gs://directory-name/
is as expected. But when I read the same directory from a Python/Spark script as
rdd = sc.textFile("gs://directory-name/")
I get an error with the following stack trace:
File "/home/hadoopuser/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1312, in take
File "/home/hadoopuser/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 384, in getNumPartitions
File "/home/hadoopuser/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/home/hadoopuser/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o21.partitions.
: java.io.IOException: Error getting access token from metadata server at: http://metadata/computeMetadata/v1/instance/service-accounts/default/token
at com.google.cloud.hadoop.util.CredentialFactory.getCredentialFromMetadataServiceAccount(CredentialFactory.java:207)
at com.google.cloud.hadoop.util.CredentialConfiguration.getCredential(CredentialConfiguration.java:70)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.configure(GoogleHadoopFileSystemBase.java:1816)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:1003)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:966)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:258)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:61)
at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: metadata
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at sun.net.NetworkClient.doConnect(NetworkClient.java:175)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:463)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:558)
at sun.net.www.http.HttpClient.<init>(HttpClient.java:242)
at sun.net.www.http.HttpClient.New(HttpClient.java:339)
at sun.net.www.http.HttpClient.New(HttpClient.java:357)
at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient(HttpURLConnection.java:1202)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1138)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1032)
at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:966)
at com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:93)
at com.google.api.client.http.HttpRequest.execute(HttpRequest.java:972)
at com.google.cloud.hadoop.util.CredentialFactory$ComputeCredentialWithRetry.executeRefreshToken(CredentialFactory.java:158)
at com.google.api.client.auth.oauth2.Credential.refreshToken(Credential.java:489)
at com.google.cloud.hadoop.util.CredentialFactory.getCredentialFromMetadataServiceAccount(CredentialFactory.java:205)
... 36 more
I have tried possibly all the links I have found but could not solve it. Any suggestions would be appreciated.
It looks like Spark is not picking up your core-site.xml. You can
Copy it to $SPARK_HOME/conf or
Set HADOOP_CONF_DIR (for example in $SPARK_HOME/conf/spark-env.sh).
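Alternatively, if you prefer not to depend on core-site.xml at all, here is a minimal sketch of passing the GCS connector settings straight from the PySpark script. The key file path and bucket name are placeholders, it assumes you authenticate with a service account key file rather than the metadata server (which is what fails outside GCE), and the exact property names can vary with the connector version:
from pyspark.sql import SparkSession

# Sketch only: route GCS connector settings through Spark's Hadoop configuration
# instead of core-site.xml. Paths, bucket name and property names are assumptions.
spark = (
    SparkSession.builder
    .appName("gcs-read-example")
    .config("spark.hadoop.fs.gs.impl",
            "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile",
            "/path/to/service-account-key.json")
    .getOrCreate()
)

rdd = spark.sparkContext.textFile("gs://directory-name/")
print(rdd.take(5))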
