Can't perform 2 successive groupBy in Spark - Python

I am working with Spark in Python.
My problem is: I have a .csv file which contains some data (int1, int2, int3, date). I did a groupByKey on int1. Now I want to perform another groupBy on my date, using the RDD created by the first groupBy.
The problem is that I can't get it to work. Any ideas?
Regards
EDIT2:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
csvById12Rdd=file.map(lambda (id1,id2,value): ((id1,id2),value)).groupByKey()
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
def printit(one):
    """Print one record of the doubly-grouped data.

    `one` is unpacked as ``(id1, twos)``, where `twos` is an iterable of
    ``(id2, values)`` pairs and `values` is itself iterable. One line is
    printed for id1, one for each (id1, id2) pair, and one for each value.
    """
    id1, twos = one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values = two
        print("Id1:{} Id2:{}".format(id1, id2))
        for value in values:
            print("Id1:{} Id2:{} Value:{}".format(id1, id2, value))
csvById12Rdd.first().foreach(printit)
the csv is like
31705,48,2,2014-10-28T18:14:09.000Z
EDIT 3:
I can print my iterator data with this code:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
def go_in_rdd2(x):
    """Print the key of a grouped record, then each of its grouped values.

    `x` is indexed as ``(key, values)``: ``x[0]`` is printed first, followed
    by every element of the iterable ``x[1]`` on its own line.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(x[0])
    for i in x[1]:
        print(i)
counts = file.map(lambda line: (line.split(",")[0],line.split(",")[1:]))
counts = counts.groupByKey()
counts.foreach(go_in_rdd2)
But I still can't do the groupBy.

Group by returns an RDD of (Key, Iterable[Value]). Can you do it the other way round?
Group by id1 and id2 to get an RDD of ((Id1,Id2), Iterable[Value]).
Then group by id1 alone to get an RDD of (Id1, Iterable[(Id2,Iterable[Value])]).
Something like:
csv=[(1,1,"One","Un"),(1,2,"Two","Deux"),(2,1,"Three","Trois"),(2,1,"Four","Quatre")]
csvRdd=sc.parallelize(csv)
# Step 1
csvById12Rdd=csvRdd.map(lambda (id1,id2,value1,value2): ((id1,id2),(value1,value2))).groupByKey()
# Step 2
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
# Print
def printit(one):
    """Print one record of the doubly-grouped data.

    `one` is unpacked as ``(id1, twos)``, where `twos` is an iterable of
    ``(id2, values)`` pairs and every element of `values` is a
    ``(value1, value2)`` pair. One line is printed for id1, one for each
    (id1, id2) pair, and one for each value pair.
    """
    id1, twos = one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values = two
        print("Id1:{} Id2:{}".format(id1, id2))
        for value1, value2 in values:
            print("Id1:{} Id2:{} Values:{} {}".format(id1, id2, value1, value2))
csvById1Rdd.foreach(printit)

Related

How to generate Pyspark dynamic frame name dynamically

I have a table which has data as shown in the diagram. I want to store results in data frames whose names are generated dynamically.
For example, in the example below I want to create two different data frames named
dnb_df and es_df, store the read results in these two frames, and print the structure of each data frame.
When I run the code below I get the error:
SyntaxError: can't assign to operator (TestGlue2.py, line 66)
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
# Split the connection URL on "/" and rebuild an Oracle thin-style JDBC URL.
# NOTE(review): assumes the second-to-last segment looks like "@<host>:<port>"
# with a 4-digit port, and the last segment is the database/service name —
# confirm against the actual JDBC_CONNECTION_URL value.
url_list = URL.split("/")
# Drop the trailing ":<port>" (5 characters) to leave "@<host>".
host = "{}".format(url_list[-2][:-5])
# Strip the leading "@" marker; the host name is everything after it.
new_host=host.split('@',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
# Oracle thin-style URLs use "@//" before the host: jdbc:oracle:thin:@//host:port/service
Oracle_jdbc_url="jdbc:oracle:thin:@//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "(select * from schema.table order by VENDOR_EXECUTION_ORDER) ").option("user", Oracle_Username).option("password", Oracle_Password).load()
vendor_data=source_df.collect()
for row in vendor_data :
vendor_query=row.SRC_QUERY
row.VENDOR_NAME+'_df'= spark.read.format("jdbc").option("url",
Oracle_jdbc_url).option("dbtable", vendor_query).option("user",
Oracle_Username).option("password", Oracle_Password).load()
print(row.VENDOR_NAME+'_df')
Added use case in picture
Update: As discussed in the comments, your requirement is to further join all with another dataframe
for row in vendor_data:
rowAsDict=row.asDict()
# Here you can use any variable as rowAsDict is not going to be used anywhere else anyway
rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
main_dataframe=main_dataframe.join(rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"], "acc_id")
Input main_dataframe:
source_df :
View1 and View2:
Output main_dataframe
If I understood correctly, you need to generate the VENDOR_NAME_DF dynamically.
You won't be able to assign to the Row object, nor would it be useful to assign a DataFrame to a Row, as you can't create a DataFrame with a column of type DataFrame.
You can, however, convert a Row to a dict using asDict and use that instead.
This would work:
vendor_data=source_df.collect()
for row in vendor_data:
rowAsDict=row.asDict()
# Replace this with spark.read() or any way to create a Dataframe
rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"].show()
Input Source_DF:
Result of SOURCE_QUERY:
Output (of rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"].show()):
Final rowAsDict:
{'VENDOR_NAME': 'Name1', 'SOURCE_QUERY': 'select * from view1', 'Name1_df': DataFrame[id: string, date: string, Code: string]}
Add the last two lines in your for loop, you should be able to get the results.
First one is creating a temp table using the dynamic df name
Second is to show the data in that temp table.
for row in vendor_data :
vendor_query=row.SRC_QUERY
spark.read.format("jdbc").option("url",
Oracle_jdbc_url).option("dbtable", vendor_query).option("user",
Oracle_Username).option("password", Oracle_Password).load().createOrReplaceTempView(row.VENDOR_NAME+'_df')
spark.sql("select * from "+row.VENDOR_NAME+"_df").show()

PySpark Error when Using Jellyfish Functions: str argument expected

I am working on a task to compute similarity scores for name-related data. I am using Spark and the jellyfish library in Python. Below is my code, which lives inside a class:
import jellyfish
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkContext
df = self.jaro_winkler_func(df, 'df1.first_name', 'df2.first_name')
def jaro_winkler_score(self, s1, s2):
    """Return the jellyfish Jaro-Winkler score of s1 and s2.

    Returns 0 when either argument is None, so missing values are scored as
    "no similarity" instead of being passed to jellyfish.
    """
    if s1 is None or s2 is None:
        return 0
    return jellyfish.jaro_winkler(s1, s2)
def jaro_winkler_func(self, df, column_left, column_right):
# Adds a 'test' column to df, computed by calling self.jaro_winkler_score on
# df[column_left] and df[column_right], and returns the resulting DataFrame.
# NOTE(review): df[column_left] / df[column_right] look like Column expressions
# rather than Python str values, which would explain the reported
# "TypeError: str argument expected" — presumably the scorer must be wrapped
# as a UDF (e.g. F.udf) before being used in withColumn; confirm.
df = df.withColumn('test', self.jaro_winkler_score(df[column_left], df[column_right]))
return df
Below is the error I got:
out = jellyfish.jaro_winkler(s1, s2)
TypeError: str argument expected
I have seen the related posts below about the same issue, but the functions above already follow the answers from those posts.
Creating score column in Pyspark data frame using jellyfish package
Pyspark: How to deal with null values in python user defined functions
I am using Spark 2.3.
Please suggest and thanks in advance.

Window aggregation on many columns in Spark

Having trouble doing an aggregation across many columns in Pyspark. There are hundreds of boolean columns showing the current state of a system, with a row added every second. The goal is to transform this data to show the number of state changes for every 10 second window.
I planned to do this in two steps, first XOR the boolean value with the previous row's value, then second sum over a 10 second window. Here's the rough code I came up with:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql import types as T, functions as F
from datetime import datetime, timedelta
from random import random
import time
sc = pyspark.SparkContext(conf=pyspark.SparkConf().setMaster('local[*]'))
spark = SparkSession(sc)
# create dataframe
num_of_cols = 50
df = spark.createDataFrame(
[(datetime.now() + timedelta(0, i), *[round(random()) for _ in range(num_of_cols)]) for i in range(10000)],
['Time', *[f"M{m+1}" for m in range(num_of_cols)]])
cols = set(df.columns) - set(['Time'])
# Generate changes
data_window = Window.partitionBy(F.minute('Time')).orderBy('Time')
# data_window = Window.orderBy('Time')
df = df.select('Time', *[F.col(m).bitwiseXOR(F.lag(m, 1).over(data_window)).alias(m) for m in cols])
df = df.groupBy(F.window('Time', '10 seconds')) \
.agg(*[F.sum(m).alias(m) for m in cols]) \
.withColumn('start_time', F.col('window')['start']) \
.drop('window')
df.orderBy('start_time').show(20, False)
# Keep UI open
time.sleep(60*60)
With the data_window partitioned by minute, Spark generates 52 stages, each dependent on the last. Increasing the num_of_cols increases the number of stages as well. It seems to me this should be an embarrassingly parallelizable problem. Compare each row to the last, and then aggregate by 10 seconds. Removing the data_window partitionBy allows it to run in a single stage, but it forces all the data on a single partition to achieve it.
Why are the stages dependent on each other? Is there a better way to write this to improve parallelization? I'd think it'd be possible to do multiple aggregations over the same window at the same time. Eventually this would need to scale to hundreds of columns; are there any tricks to improve performance at that point?
Based off the helpful response from Georg, I came up with the following:
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import types as T, functions as F
from datetime import datetime, timedelta
from random import random
import time
import pprint
sc = pyspark.SparkContext(conf=pyspark.SparkConf().setMaster('local[*]'))
spark = SparkSession(sc)
@F.pandas_udf(T.ArrayType(T.IntegerType()), F.PandasUDFType.GROUPED_AGG)
def pandas_xor(v):
    """XOR the (at most two) values in a window frame.

    Registered as a grouped-aggregate pandas UDF so it can be applied with
    ``.over(window)``. With a single value the result is that value
    multiplied by False; with two values it is their XOR. More than two
    values raises RuntimeError.

    NOTE(review): presumably `v` is a pandas Series whose cells are the
    'data' arrays (NumPy arrays) of the rows in the frame — confirm.
    """
    values = v.values
    if len(values) == 1:
        # Only one row in the frame: nothing to compare against.
        return values[0] * False
    elif len(values) == 2:
        return values[0] ^ values[1]
    else:
        raise RuntimeError('Too many values given to pandas_xor: {}'.format(values))
# create dataframe
num_of_cols = 50
df = spark.createDataFrame(
[(datetime.now() + timedelta(0, i), *[round(random()) for _ in range(num_of_cols)]) for i in range(100000)],
['Time', *[f"M{m+1}" for m in range(num_of_cols)]])
cols = set(df.columns) - set(['Time'])
df = df.select('Time', F.array(*cols).alias('data'))
# XOR
data_window = Window.partitionBy(F.minute('Time')).orderBy('Time').rowsBetween(Window.currentRow, 1)
# data_window = Window.orderBy('Time')
df = df.select('Time', pandas_xor(df.data).over(data_window).alias('data'))
df = df.groupBy(F.window('Time', '10 seconds')) \
.agg(*[F.sum(F.element_at('data', i + 1)).alias(m) for i, m in enumerate(cols)]) \
.withColumn('start_time', F.col('window')['start']) \
.drop('window')
df.orderBy('start_time').show(20, False)
# Keep UI open
time.sleep(60*60)
With the following instructions to run it with Spark 3.0.0preview2
Download Spark 3.0.0
mkdir contrib
wget -O contrib/spark-3.0.0-preview2.tgz 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz'
tar -C contrib -xf contrib/spark-3.0.0-preview2.tgz
rm contrib/spark-3.0.0-preview2.tgz
In first shell, configure environment to use Pyspark 3.0.0
export SPARK_HOME="$(pwd)/contrib/spark-3.0.0-preview2-bin-hadoop2.7"
export PYTHONPATH="$SPARK_HOME/python/lib/pyspark.zip:$SPARK_HOME/python/lib/py4j-0.10.8.1-src.zip"
Kick off pyspark job
time python3 so-example.py
View local Spark run's Web UI at http://localhost:4040

Convert Column value in Dataframe to list

I have the following source file. It contains the name "john", which I want to split into the list ['j','o','h','n']. The person file is as follows.
Source File:
id,name,class,start_data,end_date
1,john,xii,20170909,20210909
Code:
from pyspark.sql import SparkSession
def main():
    """Read person.txt as a CSV with a header row, print the collected 'name' values, and show the DataFrame."""
    spark = SparkSession.builder.appName("PersonProcessing").getOrCreate()
    df = spark.read.csv('person.txt', header=True)
    # Collects every row and keeps only its 'name' field.
    nameList = [x['name'] for x in df.rdd.collect()]
    print(list(nameList))
    df.show()


if __name__ == '__main__':
    main()
Actual Output:
[u'john']
Desired Output:
['j','o','h','n']
If you want to do it in Python:
nameList = [c for x in df.rdd.collect() for c in x['name']]
Or, if you want to do it in Spark:
from pyspark.sql import functions as F
df.withColumn('name', F.split(F.col('name'), '')).show()
Result:
+---+--------------+-----+----------+--------+
| id| name|class|start_data|end_date|
+---+--------------+-----+----------+--------+
| 1|[j, o, h, n, ]| xii| 20170909|20210909|
+---+--------------+-----+----------+--------+
nameList = [x for x in 'john']
.tolist() turns a pandas series into a python list, so you should create a list first from the data and loop over the list created.
namelist=df['name'].tolist()
for x in namelist:
print(x)
If you are doing this in Spark with Scala (Spark 2.3.1 & Scala 2.11.8),
the code below works.
We get an extra record with a blank name, hence the filter.
import spark.implicits._
val classDF = spark.sparkContext.parallelize(Seq((1, "John", "Xii", "20170909", "20210909")))
.toDF("ID", "Name", "Class", "Start_Date", "End_Date")
classDF.withColumn("Name", explode((split(trim(col("Name")), ""))))
.withColumn("Start_Date", to_date(col("Start_Date"), "yyyyMMdd"))
.withColumn("End_Date", to_date(col("End_Date"), "yyyyMMdd")).filter(col("Name").=!=("")).show

How to map structured data to schemaRDD in Spark?

I've asked this question differently before, but there have been some changes, so I thought I'd ask it again as a new question.
I have structured data of which only part is in JSON format, but I need to map the entire data to a SchemaRDD. The data looks like this:
03052015 04:13:20
{"recordType":"NEW","data":{"keycol":"val1","col2":"val2","col3":"val3"}}
Each line starts with date followed by time and a json formatted text.
I need to map not only the text in json but also the date and time into the same structure.
I tried it in Python but obviously it doesn't work because Row does not take an RDD (jsonRDD in this case).
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
orderFile = sc.textFile(myfile)
orderLine = orderFile.map(lambda line: line.split(" ", 2))
anotherOrderLine = orderLine.map(lambda p: Row(date=p[0], time=p[1], content=sqlContext.jsonRDD(p[3])))
schemaOrder = sqlContext.inferSchema(anotherOrderLine)
schemaOrder.printSchema()
for x in schemaOrder.collect():
print x
The goal is to be able to run a query like this against the schemaRDD:
select date, time, data.keycol, data.val1, data.val2, data.val3 from myOrder
How can I map the entire line to a schemaRDD?
Any help is appreciated.
The simplest option would be to add this field to JSON and use jsonRDD
My data:
03052015 04:13:20 {"recordType":"NEW","data":{"keycol":"val1","col1":"val5","col2":"val3"}}
03062015 04:13:20 {"recordType":"NEW1","data":{"keycol":"val2","col1":"val6","col2":"val3"}}
03072015 04:13:20 {"recordType":"NEW2","data":{"keycol":"val3","col1":"val7","col2":"val3"}}
03082015 04:13:20 {"recordType":"NEW3","data":{"keycol":"val4","col1":"val8","col2":"val3"}}
Code:
import json
def transform(data):
    """Fold the leading timestamp of a line into its JSON payload.

    The first 18 characters of `data` (stripped) are taken as the timestamp
    and the remainder (stripped) is parsed as JSON. The timestamp is stored
    under the 'ts' key and the object is returned re-serialized as a JSON
    string.
    """
    ts = data[:18].strip()
    jss = data[18:].strip()
    jsj = json.loads(jss)
    jsj['ts'] = ts
    return json.dumps(jsj)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
rdd = sc.textFile('/sparkdemo/sample.data')
tbl = sqlContext.jsonRDD(rdd.map(transform))
tbl.registerTempTable("myOrder")
sqlContext.sql("select ts, recordType, data.keycol, data.col1, data.col2 data from myOrder").collect()
Result:
[Row(ts=u'03052015 04:13:20', recordType=u'NEW', keycol=u'val1', col1=u'val5', data=u'val3'), Row(ts=u'03062015 04:13:20', recordType=u'NEW1', keycol=u'val2', col1=u'val6', data=u'val3'), Row(ts=u'03072015 04:13:20', recordType=u'NEW2', keycol=u'val3', col1=u'val7', data=u'val3'), Row(ts=u'03082015 04:13:20', recordType=u'NEW3', keycol=u'val4', col1=u'val8', data=u'val3')]
The problem in your code is that you are calling jsonRDD for each of the rows. This is not correct: it accepts an RDD and returns a SchemaRDD.
The sqlContext.jsonRDD creates a schema rdd from an RDD containing strings where each string contains a JSON representation. This code sample is from the SparkSQL documentation (https://spark.apache.org/docs/1.2.0/sql-programming-guide.html):
val anotherPeopleRDD = sc.parallelize("""{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
val anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
One of the cool things about jsonRDD is that you can provide an additional parameter stating the JSON's schema, which should improve performance. This can be done by creating a SchemaRDD (just load a sample document) and then calling the schemaRDD.schema method to get the schema.

Categories