no module found when calling lambda function - python

I am trying to run a Python program on PySpark 1.6. The script below uses a module called "dateutil" to convert times from one timezone to another. I've checked that the dateutil module is installed on all worker nodes and on the machine I am using to submit the job.
Exec command:
spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py
Script:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = dateutil.tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = dateutil.tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
    .load("/user/xxx/conviva/*")\
    .filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
    .rdd\
    .map(lambda y: utcToAESTDateString(y)))\
    .registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
Error:
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
File "/home/xxxx/test.py", line 50, in <lambda>
File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more

Change these lines to:
utc_tz = tz.gettz('UTC')
aest_time = tz.gettz('AEST')
When you import a submodule like this: from dateutil import tz, only the name tz is bound in that scope; the name dateutil itself is never defined, which is why dateutil.tz.gettz(...) raises NameError. You have to call tz.gettz(...) directly.
Your code should be as follows:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_tz)  # attach the tz object defined above
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
    .load("/user/xxx/conviva/*")\
    .filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
    .rdd\
    .map(lambda y: utcToAESTDateString(y)))\
    .registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
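Equivalently, if you prefer to keep the fully qualified dateutil.tz.gettz(...) calls, import the package so the top-level name is bound. A minimal standalone sketch of both working import styles (run outside Spark; 'Australia/Sydney' is used here because the bare abbreviation 'AEST' is not guaranteed to resolve with gettz on every system):
import datetime

# Style 1: bind only the submodule name, then call tz.gettz(...)
from dateutil import tz
utc_tz = tz.gettz('UTC')
aest_tz = tz.gettz('Australia/Sydney')

# Style 2: bind the package name, then qualify fully
import dateutil.tz
utc_tz = dateutil.tz.gettz('UTC')

# Convert a Unix timestamp from UTC to the AEST zone
ts = 1480000000
utc_time = datetime.datetime.fromtimestamp(ts, tz=utc_tz)
print(utc_time.astimezone(aest_tz).strftime('%Y-%m-%d %H:%M:%S'))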

Related

PicklingError: Could not serialize object: TypeError: can't pickle SSLContext objects

I'm trying to use the AWS Encryption SDK to encrypt PySpark columns but am running into this error. Here's the code:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lit
from pyspark.context import SparkContext
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
client = aws_encryption_sdk.EncryptionSDKClient(
    commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
)
kms_kwargs = dict(key_ids=[key_arn])
global master_key_provider
master_key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(**kms_kwargs)
df = spark.read.csv('test.csv', inferSchema = True, header = True)
def encrypt_string(text):
    encrypted_text, encryptor_header = client.encrypt(
        source=text, key_provider=master_key_provider
    )
    return encrypted_text
udf_encrypt = udf(lambda text: encrypt_string(text))
def spark_encrypt(df, colmn):
    return df.withColumn("segment_encrypt", udf_encrypt(col(colmn)))
df_out = spark_encrypt(df, "segment")
Is there a way to resolve this?
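The error happens because the driver-side client and master_key_provider (which hold SSL/network resources) are captured in the UDF closure and cannot be pickled to the executors. A common workaround, sketched below under the assumption that key_arn is a plain string and df/spark come from the code above, is to build those objects lazily on each executor so that only the string is captured in the closure:
import base64
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
from pyspark.sql.functions import udf, col

_client = None
_key_provider = None

def _get_client_and_provider(key_arn):
    # Created once per executor process and cached in module globals.
    global _client, _key_provider
    if _client is None:
        _client = aws_encryption_sdk.EncryptionSDKClient(
            commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
        )
        _key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(key_ids=[key_arn])
    return _client, _key_provider

def encrypt_string(text, key_arn):
    client, provider = _get_client_and_provider(key_arn)
    ciphertext, _header = client.encrypt(source=text, key_provider=provider)
    # Base64-encode so the default StringType UDF can return it safely.
    return base64.b64encode(ciphertext).decode("utf-8")

# Only the string key_arn is captured in the closure, not the SSL objects.
udf_encrypt = udf(lambda text: encrypt_string(text, key_arn))
df_out = df.withColumn("segment_encrypt", udf_encrypt(col("segment")))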

Issue in Kafka to Spark Streaming Data pipeline with python pyspark

I am using the program below, running in Anaconda (Spyder), to create a data pipeline from Kafka to Spark Streaming in Python.
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from uuid import uuid1
import os
##Step 1: Initialize sparkcontext
spark_context = SparkContext(appName="Transformation Application")
###Step 2: Initialize streaming context
ssc = StreamingContext(spark_context, 5)
def utf8_decoder(s):
    """ Decode the unicode as UTF-8 """
    if s is None:
        return None
    return s.decode('utf-8')
message = KafkaUtils.createDirectStream(
    ssc,
    topics=['testtopic'],
    kafkaParams={"metadata.broker.list": "localhost:9092",
                 "key.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer",
                 "value.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer"},
    fromOffsets=None,
    messageHandler=None,
    keyDecoder=utf8_decoder,
    valueDecoder=utf8_decoder)
message
words = message.map(lambda x: x[1]).flatMap(lambda x: x.split(" "))
wordcount = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
wordcount.pprint()
When I print message, words, and wordcount I get no proper results, only hexadecimal-looking object references.
message
Out[16]: <pyspark.streaming.kafka.KafkaDStream at 0x23f8b1f8248>
wordcount
Out[18]: <pyspark.streaming.dstream.TransformedDStream at 0x23f8b2324c8>
In my topic (testtopic) I produced the string "Hi Hi Hi how are you doing", so wordcount should give a count for each word, but instead it shows those encoded hexadecimal values.
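What is printed there is just the repr of the DStream objects; a DStream produces no output until the streaming context is started. A minimal sketch of the missing final step, assuming the pipeline above is otherwise correct:
# pprint() only registers an output operation; nothing runs until the context starts.
wordcount.pprint()

ssc.start()              # begin consuming from Kafka and processing batches
ssc.awaitTermination()   # block here; word counts are printed for every 5-second batch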

Pyspark: Error Connecting to Snowflake using Private Key

I am looking to create an ETL process that reads queries from Snowflake. Most of the examples online show how to set up the connection using a regular string password, but my company has set up authentication via a private key. Unfortunately, when I try to pass in the private key as a parameter, it returns the following error:
Traceback (most recent call last):
File "/Users/rihun/PycharmProjects/snowflake_gcp_etl/loader.py", line 61, in <module>
.option("query", query) \
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/readwriter.py", line 172, in load
return self._df(self._jreader.load())
File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/utils.py", line 79, in deco
raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: 'Input PEM private key is invalid'
Code Example:
import findspark
findspark.init()
import pyspark
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages net.snowflake:snowflake-jdbc:3.6.24,net.snowflake:spark-snowflake_2.11:2.4.12-spark_2.3 pyspark-shell'
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from snowflake_connector import get_keeper_token, get_snowflake_credentials
spark = SparkSession.builder.master('local').appName('Snowflake Loader').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config('spark.executor.cores', '4')
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
sf_creds = get_snowflake_credentials(keeper_token=get_keeper_token())
sfOptions = {
    "sfURL": sf_creds['sfURL'],
    "sfAccount": sf_creds['sfAccount'],
    "sfUser": sf_creds['sfUser'],
    "pem_private_key": sf_creds['sfPrivateKey'],
    "sfDatabase": sf_creds['sfDatabase'],
    "sfSchema": sf_creds['sfSchema'],
    "sfWarehouse": sf_creds['sfWarehouse'],
}
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", query) \
    .load()
df.count()
How I am getting the credentials
def get_snowflake_credentials(keeper_token: str,
                              keeper_url='<keeper_url>',
                              keeper_namespace='cloudDB',
                              keeper_secret_path='<path_to_key>',
                              sf_account='<sf_account>',
                              sf_svc_user='<user>',
                              sf_wh='<warehouse>',
                              sf_role='<role>',
                              sf_db='<db>',
                              sf_schema='<schema>'):
    # Connect to Keeper to collect secrets
    client = hvac.Client(
        url=keeper_url,
        namespace=keeper_namespace,
        token=keeper_token
    )
    # Secrets are stored within the key entitled 'data'
    keeper_secrets = client.read(keeper_secret_path)['data']
    passphrase = keeper_secrets['SNOWSQL_PRIVATE_KEY_PASSPHRASE']
    private_key = keeper_secrets['private_key']
    # PEM key must be byte encoded
    key = bytes(private_key, 'utf-8')
    p_key = serialization.load_pem_private_key(
        key
        , password=passphrase.encode()
        , backend=default_backend()
    )
    pkb = p_key.private_bytes(
        encoding=serialization.Encoding.DER
        , format=serialization.PrivateFormat.PKCS8
        , encryption_algorithm=serialization.NoEncryption())
    sf_client = snowflake.connector.connect(
        user=sf_svc_user
        , account=sf_account
        , warehouse=sf_wh
        , role=sf_role
        , database=sf_db
        , schema=sf_schema
        , private_key=pkb)
    return {
        "sfURL": "<url>",
        "sfAccount": sf_account,
        "sfUser": sf_svc_user,
        "sfPrivateKey": pkb,
        "sfDatabase": sf_db,
        "sfSchema": sf_schema,
        "sfWarehouse": sf_wh
    }
Can you try with this code? The Spark connector's pem_private_key option expects the base64 key body as text (a PEM-serialized key with the BEGIN/END lines and newlines stripped), which appears to be why the DER bytes produced in your get_snowflake_credentials are rejected as 'Input PEM private key is invalid'.
---------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import subprocess
import os
import logging
from logging import getLogger
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.serialization import load_pem_private_key
from cryptography.hazmat.backends import default_backend
import re
v_log = '<path>/spark.log'
spark = SparkSession \
    .builder \
    .config("spark.jars", "<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
    .config("spark.repl.local.jars",
            "<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
    .config("spark.sql.catalogImplementation", "in-memory") \
    .getOrCreate()
spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(
    spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
logging.basicConfig(
    filename=v_log,
    level=logging.DEBUG)
logger = getLogger(__name__)
with open("<path-to>/rsa_key.p8", "rb") as key_file:
    p_key = serialization.load_pem_private_key(
        key_file.read(),
        password=os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        backend=default_backend()
    )
pkb = p_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)
pkb = pkb.decode("UTF-8")
pkb = re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pkb).replace("\n", "")
sfOptions = {
    "sfURL": "<URL>",
    "sfAccount": "sfcsupport",
    "sfUser": "",
    "sfDatabase": "",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "",
    "sfRole": "",
    "pem_private_key": pkb
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", "Select * from <TableName>") \
    .load()
df.show()
----------------------------------------------------------------------------
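If you want to keep fetching the key from Keeper as in the question, the same conversion can be applied to the in-memory PEM string instead of a key file. A small sketch, assuming private_key and passphrase are the values already read from keeper_secrets (the helper name is made up for illustration):
import re
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

def pem_to_connector_key(private_key_pem, passphrase):
    """Return the bare base64 key body that the spark-snowflake
    'pem_private_key' option accepts."""
    p_key = serialization.load_pem_private_key(
        private_key_pem.encode("utf-8"),
        password=passphrase.encode(),
        backend=default_backend()
    )
    pem_text = p_key.private_bytes(
        encoding=serialization.Encoding.PEM,           # PEM, not DER
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption()
    ).decode("utf-8")
    # Drop the BEGIN/END lines and newlines, leaving only the base64 body.
    return re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pem_text).replace("\n", "")

# e.g. inside get_snowflake_credentials():
#   "sfPrivateKey": pem_to_connector_key(private_key, passphrase),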

Data loss in Spark Streaming writing in cassandra

I'm writing data to Cassandra and to a text file.
After a few minutes I stop the process. Then, for example, I have 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the txt file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
count = 0
def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write\
            .format("org.apache.spark.sql.cassandra")\
            .mode('append')\
            .options(table="puntos", keyspace="test_rtk")\
            .save()
def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0]+"|"+x[2]+"|"+x[3]+"|"+x[5]+"\n"))
ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows = data.map(lambda x: Row(long=x[5], lat=x[3], date=time.strftime("%Y-%m-%d"), time=x[2], user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is the creation of the table in cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};
CREATE TABLE test_rtk.puntos(
    long text,
    lat text,
    time text,
    date text,
    user TEXT,
    PRIMARY KEY (time, long, lat)
);
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
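Two things in the code above are worth checking (both are hypotheses, not something the post confirms): rows that share the same (time, long, lat) primary key silently overwrite each other in Cassandra, which would make the Cassandra count lower than the file count, and the text file is written through open(...) handles that are never closed inside foreach. A sketch of a more robust file sink using foreachPartition and a context-managed handle:
# Sketch: write each partition's records through one properly closed file
# handle instead of opening a new, never-closed handle per record.
def save_coord_partition(records):
    with open("/tmp/p1", "a") as out:
        for x in records:
            out.write(x[0] + "|" + x[2] + "|" + x[3] + "|" + x[5] + "\n")

def saveCoord(rdd):
    rdd.foreachPartition(save_coord_partition)

data.foreachRDD(saveCoord)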

How to use sqoop command in python code for incremental import

I want to do an incremental import from user_location_history and, after the incremental import, save the last value into user_location_updated so that future runs can be automated.
#!/usr/bin/python
import subprocess
import time
import MySQLdb
import datetime
import sys
import pytz
import os
from subprocess import call
def get_mysql_cursor():
    conn_1 = MySQLdb.connect(user='db', passwd='bazookadb', host='10.216.204.20', db='bazooka')
    conn_2 = MySQLdb.connect(user='db', passwd='bazookadb', host='10.216.204.7', db='bazooka')
    #print conn_1,conn_2
    return conn_1.cursor(), conn_2.cursor()
def get_records():
    cur_1, cur_2 = get_mysql_cursor()
    cur_1.execute("select updated from user_location_updated")
    cur_2.execute("select max(moving_date) from user_location_history")
    return cur_1.fetchone(), cur_2.fetchone()
def update_records(update_date):
    cur_1, cur_2 = get_mysql_cursor()
    print update_date
    query = "update user_location_updated set updated = '"+str(update_date)+"' where id='1' "
    print query
    result = cur_1.execute(query)
    print result
result = get_records()
update_result = update_records(result[1][0])
print result[0][0]
print result[1][0]
sqoopcom = "sqoop import --connect jdbc:mysql://10.216.204.7:3306/bazooka --username db --password bazookadb --fields-terminated-by , --escaped-by \\ --enclosed-by '\"' --table user_location_history -m 1 --hive-delims-replacement ' ' --as-textfile --incremental append --check-column moving_date --last-value 2016-08-04 19:00:36 --target-dir hdfs://example:9000/user/bigdata/sqoopip --verbose"
#os.system(sqoopcom)
exec (sqoopcom)
But this code is giving an error.
Wrap --last-value in single quotes.
Use --last-value '2016-08-04 19:00:36'
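Also note that exec() runs the string as Python source, not as a shell command, so the Sqoop call will still fail even with the quoting fixed. A sketch of invoking the same command through subprocess (already imported above), with the quoted --last-value; the quadruple backslash in the source yields the single literal backslash Sqoop should receive after shell parsing:
import subprocess

sqoopcom = ("sqoop import --connect jdbc:mysql://10.216.204.7:3306/bazooka "
            "--username db --password bazookadb "
            "--fields-terminated-by , --escaped-by \\\\ --enclosed-by '\"' "
            "--table user_location_history -m 1 "
            "--hive-delims-replacement ' ' --as-textfile "
            "--incremental append --check-column moving_date "
            "--last-value '2016-08-04 19:00:36' "
            "--target-dir hdfs://example:9000/user/bigdata/sqoopip --verbose")

# shell=True so the quoting in the command line is interpreted by the shell
exit_code = subprocess.call(sqoopcom, shell=True)
print(exit_code)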
