For example, given the following JSON (in a file named 'json'):
{"myTime": "2016-10-26 18:19:15"}
and the following Python script:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
conf = SparkConf().setAppName('simpleTest')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
print(sc.version)
json_file = 'json'
df = sqlContext.read.json(json_file, timestampFormat='yyyy-MM-dd HH:mm:ss')
df.printSchema()
The output is:
2.0.2
root
|-- myTime: string (nullable = true)
I expected the schema to be defined as timestamp.
What am I missing?
You need to define a schema explicitly:
from pyspark.sql.types import StructType, StructField, TimestampType
schema = StructType([StructField("myTime", TimestampType(), True)])
df = spark.read.json(json_file, schema=schema, timestampFormat="yyyy-MM-dd HH:mm:ss")
This will output:
>>> df.collect()
[Row(myTime=datetime.datetime(2016, 10, 26, 18, 19, 15))]
>>> df.printSchema()
root
|-- myTime: timestamp (nullable = true)
>>>
In addition to Dat Tran's solution, you can also directly apply a cast to the dataframe column after reading the file.
# example
from pyspark.sql import Row
json = [Row(**{"myTime": "2016-10-26 18:19:15"})]
df = spark.sparkContext.parallelize(json).toDF()
# using cast to 'timestamp' format
df_time = df.select(df['myTime'].cast('timestamp'))
df_time.printSchema()
root
|-- myTime: timestamp (nullable = true)
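As a minimal sketch of the same cast applied right after reading the question's file (assuming the file path 'json' and a SparkSession named spark):
df = spark.read.json('json')
df_time = df.withColumn('myTime', df['myTime'].cast('timestamp'))
df_time.printSchema()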
I have a Spark dataframe with the below schema.
root
|-- ME_KE: string (nullable = true)
|-- CSPD_CAT: string (nullable = true)
|-- EFF_DT: string (nullable = true)
|-- TER_DT: string (nullable = true)
|-- CREATE_DTM: string (nullable = true)
|-- ELIG_IND: string (nullable = true)
Basically I am trying to convert Spark SQL code into operations directly on the dataframe.
df = spark.read.format('csv').load(SourceFilesPath + "\\cutdetl.csv", inferSchema=True, header=True)
df.createOrReplaceTempView("cutdetl")
spark.sql(f"""select
me_ke,
eff_dt,
ter_dt,
create_dtm
from
cutdetl
where
(elig_ind = 'Y') and
((to_date({start_dt},'dd-mon-yyyy') between eff_dt and ter_dt) or
(eff_dt between to_date({start_dt},'dd-mon-yyyy') and to_date({end_dt},'dd-mon-yyyy')))
""")
Below is the code I have tried.
df1=df.select("me_ke","eff_dt","ter_dt","elig_ind")
.where(col("elig_ind")=="Y" & (F.to_date('31-SEP-2022', dd-mon-yyyy')
.between(col("mepe_eff_dt"),col("mepe_term_dt"))) |
(F.to_date(col("eff_dt"))
.between(F.to_date('31-DEC-2022'),F.to_date('31-DEC-2022'))))
I am getting below error:
py4j.Py4JException: Method and([class java.lang.String]) does not exist
Could anyone help with converting the above code to DataFrame-level operations?
I'd go like this:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
df = spark.read.format('csv').load(SourceFilesPath + "\\cutdetl.csv", inferSchema=True, header=True)
df.createOrReplaceTempView("cutdetl")
# build the date boundaries as date columns (Spark date patterns use MMM for abbreviated month names)
start = F.to_date(F.lit(start_dt), 'dd-MMM-yyyy')
end = F.to_date(F.lit(end_dt), 'dd-MMM-yyyy')
df1 = df.filter(col("elig_ind") == "Y")
df1 = df1.filter(col("eff_dt").between(start, end) | start.between(col("eff_dt"), col("ter_dt")))
df1 = df1.select("me_ke", "eff_dt", "ter_dt", "create_dtm")
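For reference, the py4j error in the question comes from Python operator precedence rather than from to_date: & binds tighter than ==, so col("elig_ind")=="Y" & (...) is parsed as col("elig_ind") == ("Y" & (...)). Each comparison needs its own parentheses before being combined, for example (the second condition here is only illustrative):
df.filter((col("elig_ind") == "Y") & (col("eff_dt") <= col("ter_dt")))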
I am trying to convert this "17-MAR-15 09.11.39.395000 AM" to datetime format.
I have tried the things below, but it is not working:
df = df.withColumn("created_date", F.from_unixtime(F.unix_timestamp("created_date", 'dd-MMM-yy hh.mm.ss.SSSSSS a'), 'MM-dd-yyyy HH:mm:ss'))
df = df.withColumn("created_date", F.unix_timestamp(F.col("created_date"), 'dd-MMM-yy hh.mm.ss.SSSSSS a').cast("timestamp"))
Thanks for the help.
You can directly use to_timestamp with the date format dd-MMM-yy hh.mm.ss.SSSSSS a.
Data Preparation
from io import StringIO
import pandas as pd
from pyspark.sql import functions as F
s = StringIO("""
CREATED_DATE
17-MAR-15 09.11.39.395000 AM
""")
df = pd.read_csv(s, delimiter=',')
sparkDF = spark.createDataFrame(df)
sparkDF.show(truncate=False)
+----------------------------+
|CREATED_DATE |
+----------------------------+
|17-MAR-15 09.11.39.395000 AM|
+----------------------------+
To Timestamp
sparkDF = sparkDF.withColumn('CREATED_DATE_TIME',F.to_timestamp(F.col('CREATED_DATE'),'dd-MMM-yy hh.mm.ss.SSSSSS a'))
sparkDF.show(truncate=False)
+----------------------------+-----------------------+
|CREATED_DATE |CREATED_DATE_TIME |
+----------------------------+-----------------------+
|17-MAR-15 09.11.39.395000 AM|2015-03-17 09:11:39.395|
+----------------------------+-----------------------+
sparkDF.printSchema()
root
|-- CREATED_DATE: string (nullable = true)
|-- CREATED_DATE_TIME: timestamp (nullable = true)
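If you also need the formatted string the question's attempt was producing (MM-dd-yyyy HH:mm:ss), date_format can be applied to the new timestamp column; a quick sketch with an illustrative column name:
sparkDF = sparkDF.withColumn('CREATED_DATE_STR', F.date_format(F.col('CREATED_DATE_TIME'), 'MM-dd-yyyy HH:mm:ss'))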
In Spark, literal columns, when added, are not nullable:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,)], ['c1'])
df = df.withColumn('c2', F.lit('a'))
df.printSchema()
# root
# |-- c1: long (nullable = true)
# |-- c2: string (nullable = false)
How to create a nullable column?
The shortest method I've found is to use when (the otherwise clause does not seem to be needed):
df = df.withColumn('c2', F.when(F.lit(True), F.lit('a')))
If in Scala: .withColumn("c2", when(lit(true), lit("a")))
Full test result:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,)], ['c1'])
df = df.withColumn('c2', F.when(F.lit(True), F.lit('a')))
df.show()
# +---+---+
# | c1| c2|
# +---+---+
# | 1| a|
# +---+---+
df.printSchema()
# root
# |-- c1: long (nullable = true)
# |-- c2: string (nullable = true)
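Another option that should also give a nullable column (at the cost of a Python UDF call) is wrapping the literal in a udf, since Spark cannot prove a UDF never returns null; a sketch:
from pyspark.sql import functions as F
nullable_lit = F.udf(lambda: 'a')
df = df.withColumn('c2', nullable_lit())
df.printSchema()  # c2 should now be nullable = true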
I looked at spark-rdd to dataframe.
I read my gzipped JSON into an RDD:
rdd1 = sc.textFile('s3://cw-milenko-tests/Json_gzips/ticr_calculated_2_2020-05-27T11-59-06.json.gz')
I want to convert it to a Spark dataframe. The first method from the linked SO question does not work. This is the first row from the file:
{"code_event": "1092406", "code_event_system": "LOTTO", "company_id": "2", "date_event": "2020-05-27 12:00:00.000", "date_event_real": "0001-01-01 00:00:00.000", "ecode_class": "", "ecode_event": "183", "eperiod_event": "", "etl_date": "2020-05-27", "event_no": 1, "group_no": 0, "name_event": "Ungaria Putto - 8/20", "name_event_short": "Ungaria Putto - 8/20", "odd_coefficient": 1, "odd_coefficient_entry": 1, "odd_coefficient_user": 1, "odd_ekey": "11", "odd_name": "11", "odd_status": "", "odd_type": "11", "odd_voidfactor": 0, "odd_win_types": "", "special_bet_value": "", "ticket_id": "899M-E2X93P", "id_update": 8000001036823656, "topic_group": "cwg5", "kafka_key": "899M-E2X93P", "kafka_epoch": 1590580609424, "kafka_partition": 0, "kafka_topic": "tickets-calculated_2"}
How to infer the schema?
The SO answer says:
schema = StructType([StructField(str(i), StringType(), True) for i in range(32)])
Why range(32)?
To answer your question, range(32) just indicates the number of columns to which the StructField class is applied to build the required schema. In your case there are 30 columns.
Based on your data, I was able to create a dataframe using the below logic:
from pyspark.sql.functions import *
from pyspark.sql.types import *
data_json = {"code_event": "1092406", "code_event_system": "LOTTO", "company_id": "2", "date_event": "2020-05-27 12:00:00.000",
"date_event_real": "0001-01-01 00:00:00.000", "ecode_class": "", "ecode_event": "183", "eperiod_event": "",
"etl_date": "2020-05-27", "event_no": 1, "group_no": 0, "name_event": "Ungaria Putto - 8/20", "name_event_short": "Ungaria Putto - 8/20",
"odd_coefficient": 1, "odd_coefficient_entry": 1, "odd_coefficient_user": 1, "odd_ekey": "11", "odd_name": "11", "odd_status": "",
"odd_type": "11", "odd_voidfactor": 0, "odd_win_types": "", "special_bet_value": "", "ticket_id": "899M-E2X93P", "id_update": 8000001036823656,
"topic_group": "cwg5", "kafka_key": "899M-E2X93P", "kafka_epoch": 1590580609424, "kafka_partition": 0, "kafka_topic": "tickets-calculated_2"}
column_names = [x for x in data_json.keys()]
row_data = [([x for x in data_json.values()])]
fields = []
for i in column_names:
    if str(type(data_json[i])).__contains__('str'):
        fields.append(StructField(str(i), StringType(), True))
    elif str(type(data_json[i])).__contains__('int') and len(str(data_json[i])) <= 8:
        fields.append(StructField(str(i), IntegerType(), True))
    else:
        fields.append(StructField(str(i), LongType(), True))
schema = StructType(fields)
data = spark.createDataFrame(row_data, schema)
data.show()
Output
# +----------+-----------------+----------+--------------------+--------------------+-----------+-----------+-------------+----------+--------+--------+--------------------+--------------------+---------------+---------------------+--------------------+--------+--------+----------+--------+--------------+-------------+-----------------+-----------+----------------+-----------+-----------+-------------+---------------+--------------------+
# |code_event|code_event_system|company_id| date_event| date_event_real|ecode_class|ecode_event|eperiod_event| etl_date|event_no|group_no| name_event| name_event_short|odd_coefficient|odd_coefficient_entry|odd_coefficient_user|odd_ekey|odd_name|odd_status|odd_type|odd_voidfactor|odd_win_types|special_bet_value| ticket_id| id_update|topic_group| kafka_key| kafka_epoch|kafka_partition| kafka_topic|
# +----------+-----------------+----------+--------------------+--------------------+-----------+-----------+-------------+----------+--------+--------+--------------------+--------------------+---------------+---------------------+--------------------+--------+--------+----------+--------+--------------+-------------+-----------------+-----------+----------------+-----------+-----------+-------------+---------------+--------------------+
# | 1092406| LOTTO| 2|2020-05-27 12:00:...|0001-01-01 00:00:...| | 183| |2020-05-27| 1| 0|Ungaria Putto - 8/20|Ungaria Putto - 8/20| 1| 1| 1| 11| 11| | 11| 0| | |899M-E2X93P|8000001036823656| cwg5|899M-E2X93P|1590580609424| 0|tickets-calculated_2|
# +----------+-----------------+----------+--------------------+--------------------+-----------+-----------+-------------+----------+--------+--------+--------------------+--------------------+---------------+---------------------+--------------------+--------+--------+----------+--------+--------------+-------------+-----------------+-----------+----------------+-----------+-----------+-------------+---------------+--------------------+
range(32) in that example is just an example - it generates a schema with 32 columns, each of them having its index number as its name. If you really want to define the schema, then you need to define every column explicitly:
from pyspark.sql.types import *
schema = StructType([
StructField('code_event', IntegerType(), True),
StructField('code_event_system', StringType(), True),
...
])
But a better way would be to avoid the RDD API and directly read the file into a dataframe with the following code (see the documentation):
>>> data = spark.read.json('s3://cw-milenko-tests/Json_gzips/ticr_calculated_2_2020-05-27T11-59-06.json.gz')
>>> data.printSchema()
root
|-- code_event: string (nullable = true)
|-- code_event_system: string (nullable = true)
|-- company_id: string (nullable = true)
|-- date_event: string (nullable = true)
|-- date_event_real: string (nullable = true)
|-- ecode_class: string (nullable = true)
|-- ecode_event: string (nullable = true)
|-- eperiod_event: string (nullable = true)
|-- etl_date: string (nullable = true)
....
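Since everything is read here as strings, timestamp-like columns can be converted after the fact; a minimal sketch (assuming the default to_timestamp parsing handles the 'yyyy-MM-dd HH:mm:ss.SSS' values shown above):
from pyspark.sql import functions as F
data = data.withColumn('date_event', F.to_timestamp('date_event'))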
My Schema:
|-- Canonical_URL: string (nullable = true)
|-- Certifications: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- Certification_Authority: string (nullable = true)
| | |-- End: string (nullable = true)
| | |-- License: string (nullable = true)
| | |-- Start: string (nullable = true)
| | |-- Title: string (nullable = true)
|-- CompanyId: string (nullable = true)
|-- Country: string (nullable = true)
|-- vendorTags: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- score: double (nullable = true)
| | |-- vendor: string (nullable = true)
I tried the below query to select nested fields from vendorTags:
df3 = sqlContext.sql("select vendorTags.vendor from globalcontacts")
How can I query the nested fields in a where clause in PySpark, like below?
df3 = sqlContext.sql("select vendorTags.vendor from globalcontacts where vendorTags.vendor = 'alpha'")
or
df3 = sqlContext.sql("select vendorTags.vendor from globalcontacts where vendorTags.score > 123.123456")
Something like this.
I tried the above queries, only to get the below error:
df3 = sqlContext.sql("select vendorTags.vendor from globalcontacts where vendorTags.vendor = 'alpha'")
16/03/15 13:16:02 INFO ParseDriver: Parsing command: select vendorTags.vendor from globalcontacts where vendorTags.vendor = 'alpha'
16/03/15 13:16:03 INFO ParseDriver: Parse Completed
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/sql/context.py", line 583, in sql
return DataFrame(self._ssql_ctx.sql(sqlQuery), self)
File "/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 51, in deco
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u"cannot resolve '(vendorTags.vendor = cast(alpha as double))' due to data type mismatch: differing types in '(vendorTags.vendor = cast(alpha as double))' (array<string> and double).; line 1 pos 71"
For equality-based queries you can use array_contains:
df = sc.parallelize([(1, [1, 2, 3]), (2, [4, 5, 6])]).toDF(["k", "v"])
df.createOrReplaceTempView("df")
# With SQL
sqlContext.sql("SELECT * FROM df WHERE array_contains(v, 1)")
# With DSL
from pyspark.sql.functions import array_contains
df.where(array_contains("v", 1))
If you want to use more complex predicates you'll have to either explode or use a UDF, for example something like this:
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
def exists(f):
    return udf(lambda xs: any(f(x) for x in xs), BooleanType())
df.where(exists(lambda x: x > 3)("v"))
In Spark 2.4 or later it is also possible to use higher-order functions:
from pyspark.sql.functions import expr
df.where(expr("""aggregate(
transform(v, x -> x > 3),
false,
(x, y) -> x or y
)"""))
or
df.where(expr("""
exists(v, x -> x > 3)
"""))
Python wrappers should be available in 3.1 (SPARK-30681).
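For example, on Spark 3.1+ the same predicate should be expressible with the Python wrapper (a sketch, assuming pyspark.sql.functions.exists):
from pyspark.sql import functions as F
df.where(F.exists("v", lambda x: x > 3))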
In Spark 2.4 you can filter array values using the filter function in the SQL API.
https://spark.apache.org/docs/2.4.0/api/sql/index.html#filter
Here's an example in PySpark. In the example we filter out all array values that are empty strings:
from pyspark.sql.functions import expr
df = df.withColumn("ArrayColumn", expr("filter(ArrayColumn, x -> x != '')"))
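On Spark 3.1+ the same filter should also be available as a Python wrapper (a sketch, assuming pyspark.sql.functions.filter):
from pyspark.sql import functions as F
df = df.withColumn("ArrayColumn", F.filter("ArrayColumn", lambda x: x != ""))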