no module found when calling lambda function - python
I am trying to run a Python program on PySpark 1.6. The script below uses a module called "dateutil" to convert a time from one timezone to another. I have checked that the dateutil module is installed on all worker nodes as well as on the machine I am using to submit the job.
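(For reference, a check along these lines — a minimal sketch that assumes the SparkContext sc created in the script below; the partition count of 8 is arbitrary — can confirm the module is importable on both the driver and the executors:)

import dateutil
print(dateutil.__version__)                                   # driver-side import
print(sc.parallelize(range(8), 8)
        .map(lambda _: __import__("dateutil").__version__)
        .distinct()
        .collect())                                           # executor-side imports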
Exec command:
spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py
Script:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = dateutil.tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = dateutil.tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
Error:
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
File "/home/xxxx/test.py", line 50, in <lambda>
File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Change these lines:
utc_tz = dateutil.tz.gettz('UTC')
aest_time = dateutil.tz.gettz('AEST')
to:
utc_tz = tz.gettz('UTC')
aest_time = tz.gettz('AEST')
When you import a submodule like this: from dateutil import tz, only the name tz is bound in that namespace; the package name dateutil itself is never defined there. That is why dateutil.tz.gettz(...) fails with NameError: global name 'dateutil' is not defined. Call tz.gettz(...) directly instead (tz is a module, not a function, so a bare tz() call would not work either).
One more fix, applied in the code below as well: replace(tzinfo=utc_time) must be replace(tzinfo=utc_tz). As written, utc_time is referenced on the right-hand side of its own assignment, so the function would raise an UnboundLocalError once the import problem is gone.
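Here is a minimal, Spark-free illustration of the import behaviour (a sketch that only assumes python-dateutil is installed):

from dateutil import tz

tz.gettz('UTC')             # works: 'tz' is the only name the from-import binds
# dateutil.tz.gettz('UTC')  # NameError: name 'dateutil' is not defined

import dateutil.tz
dateutil.tz.gettz('UTC')    # works, because the name dateutil is now bound as well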
Your code should be as follows:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    # tzinfo must be utc_tz here, not utc_time, which is only being assigned on this line
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_tz)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
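Two caveats worth keeping in mind, both assumptions about your environment rather than anything the traceback proves: tz.gettz('AEST') can return None on machines whose timezone database does not know the bare abbreviation (the full IANA name Australia/Sydney is more portable), and datetime.fromtimestamp() interprets the epoch in the machine's local timezone, so labelling the result UTC with replace() is only correct if the executors run in UTC. A small Spark-free sketch of the conversion with those two points addressed (the epoch value is made up):

import datetime
from dateutil import tz

epoch = 1478563200                                 # hypothetical sample "start time (unix time)" value
utc_time = datetime.datetime.utcfromtimestamp(epoch).replace(tzinfo=tz.gettz('UTC'))
sydney = tz.gettz('Australia/Sydney')              # full IANA name instead of the bare 'AEST'
print(utc_time.astimezone(sydney).strftime('%Y-%m-%d'))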