I'm having trouble doing an aggregation across many columns in PySpark. There are hundreds of boolean columns showing the current state of a system, with a row added every second. The goal is to transform this data to show the number of state changes for every 10 second window.
I planned to do this in two steps: first, XOR each boolean value with the previous row's value; second, sum over a 10 second window. Here's the rough code I came up with:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql import types as T, functions as F
from datetime import datetime, timedelta
from random import random
import time
sc = pyspark.SparkContext(conf=pyspark.SparkConf().setMaster('local[*]'))
spark = SparkSession(sc)
# create dataframe
num_of_cols = 50
df = spark.createDataFrame(
    [(datetime.now() + timedelta(0, i), *[round(random()) for _ in range(num_of_cols)]) for i in range(10000)],
    ['Time', *[f"M{m+1}" for m in range(num_of_cols)]])
cols = set(df.columns) - set(['Time'])
# Generate changes
data_window = Window.partitionBy(F.minute('Time')).orderBy('Time')
# data_window = Window.orderBy('Time')
df = df.select('Time', *[F.col(m).bitwiseXOR(F.lag(m, 1).over(data_window)).alias(m) for m in cols])
df = df.groupBy(F.window('Time', '10 seconds')) \
    .agg(*[F.sum(m).alias(m) for m in cols]) \
    .withColumn('start_time', F.col('window')['start']) \
    .drop('window')
df.orderBy('start_time').show(20, False)
# Keep UI open
time.sleep(60*60)
With the data_window partitioned by minute, Spark generates 52 stages, each dependent on the last, and increasing num_of_cols increases the number of stages as well. It seems to me this should be an embarrassingly parallelizable problem: compare each row to the previous one, then aggregate by 10 seconds. Removing the partitionBy from data_window lets it run in a single stage, but it forces all the data onto a single partition to achieve that.
Why are the stages dependent on each other, and is there a better way to write this to improve parallelization? I'd think it would be possible to do multiple aggregations over the same window at the same time. Eventually this will need to scale to hundreds of columns; are there any tricks to improve performance at that point?
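One way to see how the per-column lag expressions are actually scheduled (a minimal check against the df built above, not part of the original post) is to print the extended plan before triggering the job:

# Chained Window/Exchange nodes in the physical plan indicate the per-column
# lag computations are being planned one after another rather than in a single pass.
df.explain(True)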
Based on the helpful response from Georg, I came up with the following:
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import types as T, functions as F
from datetime import datetime, timedelta
from random import random
import time
import pprint
sc = pyspark.SparkContext(conf=pyspark.SparkConf().setMaster('local[*]'))
spark = SparkSession(sc)
@F.pandas_udf(T.ArrayType(T.IntegerType()), F.PandasUDFType.GROUPED_AGG)
def pandas_xor(v):
    values = v.values
    if len(values) == 1:
        return values[0] * False
    elif len(values) == 2:
        return values[0] ^ values[1]
    else:
        raise RuntimeError('Too many values given to pandas_xor: {}'.format(values))
# create dataframe
num_of_cols = 50
df = spark.createDataFrame(
    [(datetime.now() + timedelta(0, i), *[round(random()) for _ in range(num_of_cols)]) for i in range(100000)],
    ['Time', *[f"M{m+1}" for m in range(num_of_cols)]])
cols = set(df.columns) - set(['Time'])
df = df.select('Time', F.array(*cols).alias('data'))
# XOR
data_window = Window.partitionBy(F.minute('Time')).orderBy('Time').rowsBetween(Window.currentRow, 1)
# data_window = Window.orderBy('Time')
df = df.select('Time', pandas_xor(df.data).over(data_window).alias('data'))
df = df.groupBy(F.window('Time', '10 seconds')) \
    .agg(*[F.sum(F.element_at('data', i + 1)).alias(m) for i, m in enumerate(cols)]) \
    .withColumn('start_time', F.col('window')['start']) \
    .drop('window')
df.orderBy('start_time').show(20, False)
# Keep UI open
time.sleep(60*60)
Along with the following instructions to run it with Spark 3.0.0-preview2.
Download Spark 3.0.0-preview2:
mkdir contrib
wget -O contrib/spark-3.0.0-preview2.tgz 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz'
tar -C contrib -xf contrib/spark-3.0.0-preview2.tgz
rm contrib/spark-3.0.0-preview2.tgz
In the first shell, configure the environment to use PySpark 3.0.0:
export SPARK_HOME="$(pwd)/contrib/spark-3.0.0-preview2-bin-hadoop2.7"
export PYTHONPATH="$SPARK_HOME/python/lib/pyspark.zip:$SPARK_HOME/python/lib/py4j-0.10.8.1-src.zip"
Kick off the PySpark job:
time python3 so-example.py
View local Spark run's Web UI at http://localhost:4040
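Optionally (not part of the original steps), a quick sanity check inside the job that the preview build is the one actually being picked up:

import pyspark
print(pyspark.__version__)  # should report 3.0.0-preview2 if SPARK_HOME/PYTHONPATH are set as above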
Related
I have a table with data as shown in the diagram. I want to store the read results in dynamically generated data frame names.
For example, here I want to create two different data frames named dnb_df and es_df, store the read result in each, and print the structure of each data frame.
When I run the code below I get the error:
SyntaxError: can't assign to operator (TestGlue2.py, line 66)
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
url_list = URL.split("/")
host = "{}".format(url_list[-2][:-5])
new_host=host.split('@',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
Oracle_jdbc_url="jdbc:oracle:thin:@//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "(select * from schema.table order by VENDOR_EXECUTION_ORDER) ").option("user", Oracle_Username).option("password", Oracle_Password).load()
vendor_data=source_df.collect()
for row in vendor_data :
    vendor_query=row.SRC_QUERY
    row.VENDOR_NAME+'_df'= spark.read.format("jdbc").option("url",
        Oracle_jdbc_url).option("dbtable", vendor_query).option("user",
        Oracle_Username).option("password", Oracle_Password).load()
    print(row.VENDOR_NAME+'_df')
Added the use case as a picture.
Update: As discussed in the comments, your requirement is to further join all of these with another dataframe:
for row in vendor_data:
    rowAsDict=row.asDict()
    # Here you can use any variable, as rowAsDict is not going to be used anywhere else anyway
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
    main_dataframe=main_dataframe.join(rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"], "acc_id")
(Screenshots in the original post show the input main_dataframe, source_df, View1 and View2, and the output main_dataframe.)
If I understood correctly, you need to generate the VENDOR_NAME_DF dynamically.
You can't assign to a Row object, and it wouldn't be useful to assign a DataFrame to a Row anyway, since you can't create a DataFrame with a column of type DataFrame.
However, you can convert the row to a dict using asDict() and use that instead.
This would work:
vendor_data=source_df.collect()
for row in vendor_data:
    rowAsDict=row.asDict()
    # Replace this with spark.read() or any way to create a Dataframe
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"].show()
(Screenshots show the input Source_DF, the result of SOURCE_QUERY, and the output of the .show() call.)
Final rowAsDict:
{'VENDOR_NAME': 'Name1', 'SOURCE_QUERY': 'select * from view1', 'Name1_df': DataFrame[id: string, date: string, Code: string]}
Add the two lines below inside your for loop and you should be able to get the results.
The first creates a temp view using the dynamic df name.
The second shows the data in that temp view.
for row in vendor_data :
    vendor_query=row.SRC_QUERY
    spark.read.format("jdbc").option("url",
        Oracle_jdbc_url).option("dbtable", vendor_query).option("user",
        Oracle_Username).option("password", Oracle_Password).load().createOrReplaceTempView(row.VENDOR_NAME+'_df')
    spark.sql("select * from "+row.VENDOR_NAME+"_df").show()
After split-apply-combine in PySpark, the code does not show the final result (from the book "Data Analysis with Python and PySpark" by Jonathan Rioux). The code seems to be working, but it fails to "show" the results.
NOTE: This is not a machine learning exercise; I am just using scikit-learn's plumbing to create a feature.
We are creating a grouped aggregate UDF.
Then a group map UDF to scale temperature values.
Then the split-apply-combine in PySpark.
Finally, moving one station, one month's worth of data into a local pandas DataFrame.
> The code seems to be working, but fails to "show" results:
import pyspark
import os
import sys
import pandas as pd
from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, explode
from functools import reduce
import pyspark.sql.types as T
import pyspark.sql.functions as F
from sklearn.linear_model import LinearRegression
spark = pyspark.sql.SparkSession.builder.appName("MyApp").getOrCreate()
spark.sparkContext.setLogLevel("WARN")
os.environ['PYSPARK_PYTHON'] = 'C:/bigdatasetup/anaconda3/envs/pyspark-env/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/bigdatasetup/anaconda3/envs/pyspark-env/python.exe'
#os.environ['PYTHONPATH'] = ":".join(sys.path)
gsod = (
    reduce(
        lambda x, y: x.unionByName(y, allowMissingColumns=True),
        [
            spark.read.parquet(
                f"C:/bigdatasetup/spark/data/gsod_noaa/gsod{year}.parquet")
            for year in range(2010, 2020)
        ],
    )
    .dropna(subset=["year", "mo", "da", "temp"])
    .where(F.col("temp") != 9999.9)
    .drop("date")
)
> Listing 9.8 Creating a grouped aggregate UDF
@F.pandas_udf(T.DoubleType())
def rate_of_change_temperature(day: pd.Series, temp: pd.Series) -> float:
    return (
        LinearRegression()
        .fit(X=day.astype(int).values.reshape(-1, 1), y=temp)
        .coef_[0]
    )
> #Listing 9.10 A group map UDF to scale temperature values
def scale_temperature(temp_by_day: pd.DataFrame) -> pd.DataFrame:
    temp = temp_by_day.temp
    answer = temp_by_day[["stn", "year", "mo", "da", "temp"]]
    if temp.min() == temp.max():
        return answer.assign(temp_norm=0.5)
    return answer.assign(
        temp_norm=(temp - temp.min()) / (temp.max() - temp.min())
    )
> # Listing 9.11 Split-apply-combine in PySpark
gsod_map = gsod.groupby("stn", "year", "mo").applyInPandas(
    scale_temperature,
    schema=(
        "stn string, year string, mo string, "
        "da string, temp double, temp_norm double"),
)
gsod_map.show(5, False)
> # Here the show() call does not work
> # Listing 9.12 Moving one station, one month’s worth of data into a local pandas DataFrame
gsod_local = gsod.where(
    "year = '2018' and mo = '08' and stn = '710920'"
).toPandas()
print(
    rate_of_change_temperature.func(
        gsod_local["da"], gsod_local["temp_norm"]
    )
)
> # It gives this error:
> # File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages
> # \pandas\core\indexes\base.py", line 3623, in get_loc
> # raise KeyError(key) from err
> # KeyError: 'temp_norm'
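A likely cause, judging from the listings above: gsod_local is built from gsod, which never gets a temp_norm column; that column only exists on gsod_map, the output of applyInPandas. A minimal sketch of a possible fix, assuming the intent was to test the UDF on the normalized data:

# Filter the mapped (normalized) frame instead of the raw one, so the local
# pandas DataFrame actually contains the 'temp_norm' column.
gsod_local = gsod_map.where(
    "year = '2018' and mo = '08' and stn = '710920'"
).toPandas()
print(
    rate_of_change_temperature.func(
        gsod_local["da"], gsod_local["temp_norm"]
    )
)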
I am not seeing significantly faster read times using PySpark read.parquet vs. Pandas read_parquet. I am trying to read 4 parquet files each 2-3 MB using a for loop and do some basic aggregation.
Here is my PySpark code:
ts_dfs = []
# For loop to collect building time series and append to empty dataframe
start = time.time()
for id in ids[0:4]:
    # Make the path to the data
    timeseries_path = f'{dataset_path}/timeseries_individual_buildings/by_county/upgrade=0/county={id[0]}'
    # Read the data and select columns of interest
    ts_data_df = spark.read.parquet(timeseries_path).select('`bldg_id`', '`out.electricity.heating.energy_consumption`', '`timestamp`')
    # Aggregate by month
    ts_data_df = ts_data_df \
        .groupBy(f.month('timestamp').alias('month'),'bldg_id') \
        .agg(f.sum('`out.electricity.heating.energy_consumption`').alias('kWh'))
    # Append to empty list
    ts_dfs.append(ts_data_df)
# Combine all dfs
ts = reduce(DataFrame.unionAll, ts_dfs)
end = time.time()
print ("Time elapsed:", end - start)
Time elapsed: 5.127371788024902
Here is my Pandas code:
# Download time series of all buildings with the ids we want:
cols = {'timestamp','out.electricity.heating.energy_consumption'}
ts_dfs = []
start = time.time()
for id in ids[0:4]:
    # Make the path to the data
    timeseries_path = f'{dataset_path}/timeseries_individual_buildings/by_county/upgrade=0/county={id[0]}'
    # Read the data and columns of interest
    ts_data_df = pd.read_parquet(timeseries_path, columns = cols)
    # Some date processing
    ts_data_df['date'] = pd.to_datetime(ts_data_df['timestamp'])
    ts_data_df['month'] = ts_data_df['date'].dt.month
    # Aggregate by month
    ts_data_df = ts_data_df.groupby(['month','bldg_id'], as_index = False).agg(sum = ('out.electricity.heating.energy_consumption','sum'))
    # Append to empty list
    ts_dfs.append(ts_data_df)
# Concatenate all the dfs:
all_ts_df = pd.concat(ts_dfs)
end = time.time()
print ("Time elapsed:", end - start)
Time elapsed: 40.325382232666016
Frankly I would expect Spark to finish this significantly faster given that the files are so small. I'm running this in a Jupyter Notebook and my spark configuration is as follows:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Configure prior to creating context
conf = pyspark.SparkConf() \
    .setAppName('appName') \
    .setMaster('local[*]') \
    .setAll([
        ("spark.sql.execution.arrow.pyspark.enabled", "true"),
        ("spark.sql.execution.arrow.enabled", "true")
    ])
sc = SparkContext(conf=conf)
My machine has 16 GB of RAM and 8 cores, so there should be some parallelization going on given this configuration, correct?
Additionally, when I try to convert the above PySpark DataFrame to pandas, it doubles the amount of time even though the resulting dataframe is only 48 rows and 3 columns. Is there any .persist() or .cache() I can do to speed things up? Any changes to my PySpark configuration that would better leverage my computing power?
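One thing worth trying (a sketch under the same assumptions as the code above, i.e. the same dataset_path and ids): spark.read.parquet accepts multiple paths, so the four county directories can be read in a single call and aggregated once, which lets Spark parallelize across files instead of planning one small job per loop iteration; .cache() then keeps the tiny aggregate in memory for the later toPandas():

import pyspark.sql.functions as f

# Build all four paths up front (same pattern as the loop above)
paths = [f'{dataset_path}/timeseries_individual_buildings/by_county/upgrade=0/county={id[0]}'
         for id in ids[0:4]]

# One read over all paths, one aggregation, then cache the small result
ts = spark.read.parquet(*paths) \
    .select('`bldg_id`', '`out.electricity.heating.energy_consumption`', '`timestamp`') \
    .groupBy(f.month('timestamp').alias('month'), 'bldg_id') \
    .agg(f.sum('`out.electricity.heating.energy_consumption`').alias('kWh')) \
    .cache()
ts.count()               # materializes the cache
ts_local = ts.toPandas() # now reuses the cached aggregate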
I am running a Levenshtein comparison on 50k records. I need to compare each record with every other record. Is there a way to optimize the following code to run faster? The data is stored in a pandas dataframe.
import pandas as pd
import numpy as np
import Levenshtein
df_s_sorted = df.sort_values(['nonascii_2', 'birth_date'])
df_similarity = pd.DataFrame()
q=0
for index, p in df_s_sorted.iterrows():
    q = q + 1
    print(q)
    for index, p1 in df_s_sorted.iterrows():
        if ((p["birth_date"] == p1["birth_date"]) & (p["name"] != p1["name"] )):
            if (Levenshtein.distance(p["name"],p1["name"]) == 1):
                df_similarity = df_similarity.append(p)
                print(p)
    df_s_sorted.drop(index, inplace=True)
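Since the filter above only ever matches rows that share a birth_date, one common speedup is to group on birth_date first and run the pairwise Levenshtein check only within each group, instead of over the full 50k x 50k cross product (a rough sketch, assuming the same df with 'name' and 'birth_date' columns and the python-Levenshtein package as above):

import pandas as pd
import Levenshtein

similar_rows = []
# Rows with different birth dates can never match, so only compare names within a birth_date group
for birth_date, group in df.groupby('birth_date'):
    names = group['name'].tolist()
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            if names[i] != names[j] and Levenshtein.distance(names[i], names[j]) == 1:
                similar_rows.append(group.iloc[i])
                similar_rows.append(group.iloc[j])
df_similarity = pd.DataFrame(similar_rows).drop_duplicates()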
I am working with Spark in Python.
My problem is: I have a .csv file which contains some data (int1, int2, int3, date). I did a groupByKey on int1. Now I want to perform another groupBy on my date with the RDD created by the first groupBy.
The problem is I can't perform it. Any ideas?
Regards
EDIT2:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
csvById12Rdd=file.map(lambda (id1,id2,value): ((id1,id2),value)).groupByKey()
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
def printit(one):
    id1, twos=one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values=two
        print("Id1:{} Id2:{}".format(id1,id2))
        for value in values:
            print("Id1:{} Id2:{} Value:{}".format(id1,id2,value))
csvById12Rdd.first().foreach(printit)
The csv looks like this:
31705,48,2,2014-10-28T18:14:09.000Z
EDIT 3:
I can print my iterator data with this code:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
def go_in_rdd2(x):
    print x[0]
    for i in x[1]:
        print i
counts = file.map(lambda line: (line.split(",")[0],line.split(",")[1:]))
counts = counts.groupByKey()
counts.foreach(go_in_rdd2)
but I still can't do the second groupBy.
groupByKey returns an RDD of (Key, Iterable[Value]); can you do it the other way round?
Group by id1 and id2 to get an RDD of ((Id1, Id2), Iterable[Value]).
Then group by id1 alone to get an RDD of (Id1, Iterable[(Id2, Iterable[Value])]).
Something like:
csv=[(1,1,"One","Un"),(1,2,"Two","Deux"),(2,1,"Three","Trois"),(2,1,"Four","Quatre")]
csvRdd=sc.parallelize(csv)
# Step 1
csvById12Rdd=csvRdd.map(lambda (id1,id2,value1,value2): ((id1,id2),(value1,value2))).groupByKey()
# Step 2
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
# Print
def printit(one):
    id1, twos=one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values=two
        print("Id1:{} Id2:{}".format(id1,id2))
        for value1,value2 in values:
            print("Id1:{} Id2:{} Values:{} {}".format(id1,id2,value1,value2))
csvById1Rdd.foreach(printit)
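Note that the lambda (id1, id2, ...): ... style above relies on Python 2 tuple parameter unpacking, which was removed in Python 3. A roughly equivalent Python 3 sketch of the same two-step grouping:

csv = [(1, 1, "One", "Un"), (1, 2, "Two", "Deux"), (2, 1, "Three", "Trois"), (2, 1, "Four", "Quatre")]
csvRdd = sc.parallelize(csv)
# Step 1: key by (id1, id2)
csvById12Rdd = csvRdd.map(lambda row: ((row[0], row[1]), (row[2], row[3]))).groupByKey()
# Step 2: re-key by id1 alone
csvById1Rdd = csvById12Rdd.map(lambda kv: (kv[0][0], (kv[0][1], kv[1]))).groupByKey()

def printit(one):
    id1, twos = one
    print("Id1:{}".format(id1))
    for id2, values in twos:
        print("Id1:{} Id2:{}".format(id1, id2))
        for value1, value2 in values:
            print("Id1:{} Id2:{} Values:{} {}".format(id1, id2, value1, value2))

csvById1Rdd.foreach(printit)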