Pyspark: Error Connecting to Snowflake using Private Key - python

I am looking to create an ETL process that reads query results from Snowflake. Most of the examples online show how to set up a connection using a regular string password, but my company has set up authentication via a private key. Unfortunately, when I try to pass the private key in as a parameter, it returns the following error:
Traceback (most recent call last):
File "/Users/rihun/PycharmProjects/snowflake_gcp_etl/loader.py", line 61, in <module>
.option("query", query) \
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/readwriter.py", line 172, in load
return self._df(self._jreader.load())
File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/utils.py", line 79, in deco
raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: 'Input PEM private key is invalid'
Code Example:
import findspark
findspark.init()
import pyspark
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages net.snowflake:snowflake-jdbc:3.6.24,net.snowflake:spark-snowflake_2.11:2.4.12-spark_2.3 pyspark-shell'
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from snowflake_connector import get_keeper_token, get_snowflake_credentials
spark = SparkSession.builder.master('local').appName('Snowflake Loader').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config('spark.executor.cores', '4')
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
sf_creds = get_snowflake_credentials(keeper_token=get_keeper_token())
sfOptions = {
    "sfURL": sf_creds['sfURL'],
    "sfAccount": sf_creds['sfAccount'],
    "sfUser": sf_creds['sfUser'],
    "pem_private_key": sf_creds['sfPrivateKey'],
    "sfDatabase": sf_creds['sfDatabase'],
    "sfSchema": sf_creds['sfSchema'],
    "sfWarehouse": sf_creds['sfWarehouse'],
}
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", query) \
    .load()
df.count()
How I am getting the credentials
def get_snowflake_credentials(keeper_token: str,
                              keeper_url='<keeper_url>',
                              keeper_namespace='cloudDB',
                              keeper_secret_path='<path_to_key>',
                              sf_account='<sf_account>',
                              sf_svc_user='<user>',
                              sf_wh='<warehouse>',
                              sf_role='<role>',
                              sf_db='<db>',
                              sf_schema='<schema>'):
    # Connect to Keeper to collect secrets
    client = hvac.Client(
        url=keeper_url,
        namespace=keeper_namespace,
        token=keeper_token
    )
    # Secrets are stored within the key entitled 'data'
    keeper_secrets = client.read(keeper_secret_path)['data']
    passphrase = keeper_secrets['SNOWSQL_PRIVATE_KEY_PASSPHRASE']
    private_key = keeper_secrets['private_key']
    # PEM key must be byte encoded
    key = bytes(private_key, 'utf-8')
    p_key = serialization.load_pem_private_key(
        key,
        password=passphrase.encode(),
        backend=default_backend()
    )
    pkb = p_key.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption()
    )
    sf_client = snowflake.connector.connect(
        user=sf_svc_user,
        account=sf_account,
        warehouse=sf_wh,
        role=sf_role,
        database=sf_db,
        schema=sf_schema,
        private_key=pkb
    )
    return {
        "sfURL": "<url>",
        "sfAccount": sf_account,
        "sfUser": sf_svc_user,
        "sfPrivateKey": pkb,
        "sfDatabase": sf_db,
        "sfSchema": sf_schema,
        "sfWarehouse": sf_wh
    }

Can you try with this code? It re-serializes the private key as an unencrypted PKCS8 PEM, strips the BEGIN/END lines and newlines, and passes the remaining base64 string as pem_private_key.
---------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import subprocess
from pyspark.sql import SparkSession
import os
import logging
from logging import getLogger
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend
import re
from cryptography.hazmat.primitives.serialization import load_pem_private_key

v_log = '<path>/spark.log'

spark = SparkSession \
    .builder \
    .config("spark.jars", "<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
    .config("spark.repl.local.jars",
            "<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
    .config("spark.sql.catalogImplementation", "in-memory") \
    .getOrCreate()

spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(
    spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())

logging.basicConfig(
    filename=v_log,
    level=logging.DEBUG)
logger = getLogger(__name__)

# Load the encrypted PEM key and decrypt it with the passphrase
with open("<path-to>/rsa_key.p8", "rb") as key_file:
    p_key = serialization.load_pem_private_key(
        key_file.read(),
        password=os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        backend=default_backend()
    )

# Re-serialize as an unencrypted PKCS8 PEM (not DER)
pkb = p_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)

# The connector expects only the base64 body of the key, without the
# BEGIN/END lines or newlines
pkb = pkb.decode("UTF-8")
pkb = re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pkb).replace("\n", "")

sfOptions = {
    "sfURL": "<URL>",
    "sfAccount": "sfcsupport",
    "sfUser": "",
    "sfDatabase": "",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "",
    "sfRole": "",
    "pem_private_key": pkb
}

SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", "Select * from <TableName>") \
    .load()

df.show()
----------------------------------------------------------------------------
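To use this approach with the question's get_snowflake_credentials, the value returned in sfPrivateKey needs to be that stripped PEM string rather than the DER bytes: snowflake.connector.connect accepts DER bytes, but the Spark connector's pem_private_key option wants the base64 body of an unencrypted PKCS8 PEM key as a plain string. A minimal helper along those lines (the name pem_key_for_spark is illustrative, not part of either code sample):

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
import re

def pem_key_for_spark(private_key_pem: str, passphrase: str) -> str:
    # Decrypt the PEM key, re-serialize it unencrypted, and keep only the base64 body.
    p_key = serialization.load_pem_private_key(
        private_key_pem.encode("utf-8"),
        password=passphrase.encode(),
        backend=default_backend()
    )
    pem = p_key.private_bytes(
        encoding=serialization.Encoding.PEM,  # PEM here, not DER
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption()
    ).decode("utf-8")
    # Drop the BEGIN/END lines and newlines, leaving only the base64 payload
    return re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pem).replace("\n", "")

The returned string can then be set as sfOptions["pem_private_key"], while the DER bytes (pkb) can still be passed to snowflake.connector.connect as before.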

Related

ERROR MicroBatchExecution: When integrating Kafka with pySpark

I am trying to send data to pySpark but it is giving me this error:
ERROR MicroBatchExecution: Query [id = d3a1ed30-d223-4da4-9052-189b103afca8, runId = 70bfaa84-15c9-4c8b-9058-0f9a04ee4dd0] terminated with error
java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
Producer
import tweepy
import json
import time
from kafka import KafkaProducer
import twitterauth as auth
import utils
producer = KafkaProducer(bootstrap_servers=["localhost:9092"], value_serializer=utils.json_serializer)
class twitterStream(tweepy.StreamingClient):
    def on_connect(self):
        print("Twitter Client Connected")

    def on_tweet(self, raw_data):
        if raw_data.referenced_tweets == None:
            producer.send(topic="registered_user", value=raw_data.text)
            print("Producer Running")

    def on_error(self):
        self.disconnect()

    def adding_rules(self, keywords):
        for terms in keywords:
            self.add_rules(tweepy.StreamRule(terms))

if __name__ == "__main__":
    stream = twitterStream(bearer_token=auth.bearer_token)
    stream_terms = ['bitcoin', 'luna', 'etherum']
    stream.adding_rules(stream_terms)
    stream.filter(tweet_fields=['referenced_tweets'])
pySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import findspark
import json
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'
if __name__ == "__main__":
    findspark.init()
    # spark = SparkSession.builder.appName("Kafka Pyspark Streaming Learning").master("local[*]").getOrCreate()
    sc = SparkSession.builder.master("local[*]") \
        .appName('SparkByExamples.com') \
        .getOrCreate()
    df = sc \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("startingoffsets", "latest") \
        .option("subscribe", "registered_user") \
        .load()
    query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    query = df.writeStream.format("console").start()
    import time
    time.sleep(10)  # sleep 10 seconds
    query.stop()
Hadoop Version: 3.3.0
Spark and PySpark version: 3.3.0
Scala Version: 2.12.15
I have tried everything for the last 8 hours with no luck. Could anyone help?
StackTrace:
When I run df.printSchema() there is no error and this is the output:
You need to use a different name to hold the converted fields, because DataFrames are immutable. Your initial DataFrame is named df and has a certain schema; when you cast the fields to string, bind the result to a separate name:
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = castDf.writeStream...
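Putting that suggestion into the question's streaming section, a sketch might look like this (the Kafka settings are copied from the question):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("SparkByExamples.com").getOrCreate()

df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("startingOffsets", "latest") \
    .option("subscribe", "registered_user") \
    .load()

# Bind the cast result to its own name instead of rebinding df
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

query = castDf.writeStream.format("console").start()
query.awaitTermination(10)  # run for roughly 10 seconds
query.stop()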

PicklingError: Could not serialize object: TypeError: can't pickle SSLContext objects

I'm trying to use the AWS Encryption SDK to encrypt PySpark columns but am running into this error. Here's the code:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lit
from pyspark.context import SparkContext
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
client = aws_encryption_sdk.EncryptionSDKClient(
commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
)
kms_kwargs = dict(key_ids=[key_arn])
global master_key_provider
master_key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(**kms_kwargs)
df = spark.read.csv('test.csv', inferSchema = True, header = True)
def encrypt_string(text):
    encrypted_text, encryptor_header = client.encrypt(
        source=text, key_provider=master_key_provider
    )
    return encrypted_text

udf_encrypt = udf(lambda text: encrypt_string(text))

def spark_encrypt(df, colmn):
    return df.withColumn("segment_encrypt", udf_encrypt(col(colmn)))

df_out = spark_encrypt(df, "segment")
Is there a way to resolve this?
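The traceback typically means the UDF closure captures the client (and the SSLContext it holds), which Spark then tries to pickle and ship to the executors. One hedged workaround sketch, assuming it is acceptable to construct the client inside the UDF on the executors (this is not from the question):

from pyspark.sql.functions import udf, col

def encrypt_string(text):
    # Build the SDK client and key provider inside the function so the driver
    # never has to pickle the SSLContext they hold.
    import aws_encryption_sdk
    from aws_encryption_sdk import CommitmentPolicy
    client = aws_encryption_sdk.EncryptionSDKClient(
        commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
    )
    provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(key_ids=[key_arn])  # key_arn as defined in the question
    encrypted_text, _header = client.encrypt(source=text, key_provider=provider)
    return encrypted_text

udf_encrypt = udf(encrypt_string)
df_out = df.withColumn("segment_encrypt", udf_encrypt(col("segment")))

In practice the client could be cached per executor process rather than rebuilt per row; the sketch only illustrates the serialization boundary.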

How to Access variable from different file

My first File "sk_read_write.py" is as follows:
from spark_conn import *
from Table_structure import *
class read_write1:
    def sk_read_write1(self, schema, spark):
        df3 = spark.read.option("multiline", "true").option("mode", "PERMISSIVE").schema(schema).json(
            "C:\\Users\\komu0\\Desktop\\Read\\*.json")
        print(Spark_connect_1.connection())
        df3.write.format('jdbc').options(url=Spark_connect_1.connection + str(connection._port),
                                         driver='com.mysql.cj.jdbc.Driver',
                                         dbtable='sparktable',
                                         user=connection._username,
                                         password=Spark_connect_1.connection._pass).\
            mode('append').save()
My other file is spark_conn.py:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DoubleType
from aws_config import *
from Table_structure import *

class Spark_connect_1:
    dict1 = get_secret()

    def connection(self):
        dict1 = get_secret()
        _username = dict1['username']
        _pass = dict1['password']
        _host = dict1['host']
        _port = dict1['port']
        _dbname = dict1['dbname']
        spark = SparkSession.builder \
            .master("local[1]") \
            .appName("JSON_MYSQL") \
            .getOrCreate()
        return spark
I want to use the variable _port in the sk_read_write file.
I have tried importing spark_conn into the sk_read_write file and using
"Spark_connect_1.connection._port" (to get the port), but it is not working. Please suggest how to proceed.
You can access the port name by using a class variable for _port
Example
base_file.py
class ABC:
    _port = "Some value"

    def sample(self):
        ABC._port = "another value"
        print("Value of port is {}".format(ABC._port))
test_file.py
from base_file import ABC
#before changing value:
before = ABC()
before.sample()
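Applied to the classes in the question, that pattern might look like the sketch below (names are taken from the question; treat it as an illustration rather than a drop-in fix):

# spark_conn.py
from pyspark.sql import SparkSession
from aws_config import get_secret  # get_secret comes from aws_config, as in the question

class Spark_connect_1:
    _port = None  # class variable, readable from other modules after connection() runs

    def connection(self):
        dict1 = get_secret()
        Spark_connect_1._port = dict1['port']  # store on the class, not in a local variable
        spark = SparkSession.builder \
            .master("local[1]") \
            .appName("JSON_MYSQL") \
            .getOrCreate()
        return spark

# sk_read_write.py
from spark_conn import Spark_connect_1

conn = Spark_connect_1()
spark = conn.connection()
print(Spark_connect_1._port)  # the port is now accessible here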

Data loss in Spark Streaming writing in cassandra

I'm writing data to Cassandra and to a text file.
After a few minutes I stop the process. Then, for example, I have 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the txt file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
.setAppName("Streaming test") \
.setMaster("local[2]") \
.set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
count =0
def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write\
            .format("org.apache.spark.sql.cassandra")\
            .mode('append')\
            .options(table="puntos", keyspace="test_rtk")\
            .save()

def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0]+"|"+x[2]+"|"+x[3]+"|"+x[5]+"\n"))
ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows= data.map(lambda x:Row(long=x[5],lat=x[3],date=time.strftime("%Y-%m-%d"),time=x[2],user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is the creation of the table in cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};
CREATE TABLE test_rtk.puntos(
long text,
lat text,
time text,
date text,
user TEXT,
PRIMARY KEY (time,long,lat)
);
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
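The guide's usual recommendation for receiver-based streams like this one is to enable checkpointing and the receiver write-ahead log so that records acknowledged from Kafka survive driver or receiver failures. A hedged sketch of how the question's setup could enable them (the checkpoint directory is an assumed example path):

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1") \
    .set("spark.streaming.receiver.writeAheadLog.enable", "true")  # persist received data before processing

sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)
ssc.checkpoint("file:///tmp/spark_checkpoints")  # assumed path; checkpointing is required for the WAL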

no module found when calling lambda function

I am trying to run a Python program on PySpark 1.6. The script below uses a module called "dateutil" to convert time from one timezone to another. I've checked that the dateutil module is installed on all worker nodes and on the system I am using to submit the job.
Exec command:
spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py
Script:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = dateutil.tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = dateutil.tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
Error:
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
File "/home/xxxx/test.py", line 50, in <lambda>
File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Change these lines:
utc_tz = tz.gettz('UTC')
and
aest_time = tz.gettz('AEST')
When you import a specific name like this: from dateutil import tz, the name dateutil itself is never bound, so you cannot call dateutil.tz.gettz(...). You have to refer to tz directly, e.g. tz.gettz(...).
Your code should be as follows:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    # attach the tz object (utc_tz), not the not-yet-defined utc_time
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_tz)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
