PySpark: Generating Data in Pandas Very Slow

I need to generate some data in PySpark, and I am currently using pandas-on-Spark (pyspark.pandas) to do it. What I have found is that when I use .repeat() to scale up the data-generating process, it is very, very slow (tens of minutes).
Are there any alternatives I can use to generate a dataframe like the following?
import pyspark.pandas as ps

# params
start_time = '2022-04-01'
end_time = '2022-07-01'
IDs = [1, 2, 3, 4, 5, 6, 7, 8, ...]
dStates = ['A', 'B', 'C', 'D', ...]

# delta time (number of month steps between start and end)
delta_time = ps.to_datetime(end_time).month - ps.to_datetime(start_time).month

# create DF
timeSet = ps.date_range(start=start_time, end=end_time, freq='MS').repeat(len(dStates) * len(IDs))
stateSet = ps.Series(dStates * (delta_time + 1) * len(IDs))
nodeSet = ps.Series(IDs).repeat(len(dStates) * (delta_time + 1)).reset_index(drop=True)

# combine
tseries = ps.DataFrame({'monthlyTrend': timeSet.astype(str),
                        'FromState': stateSet,
                        'ID': nodeSet})

NumPy functions are usually better optimized, so you could try numpy.repeat(). I have tweaked the code below to generate dates day by day in the range, and to adjust IDs and dStates to match the length of timeList:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# params
start_time = '2022-04-01'
end_time = '2022-07-01'
IDs = [1, 2, 3, 4, 5, 6, 7, 8]
dStates = ['A', 'B', 'C', 'D']

# Generate data based on params: one row per day in [start_time, end_time)
timeList = np.arange(datetime.fromisoformat(start_time),
                     datetime.fromisoformat(end_time),
                     timedelta(days=1)).astype(datetime)

# Repeat the states/IDs so their lengths match timeList
stateList = np.repeat(dStates, len(timeList) // len(dStates))
stateList = np.append(stateList, dStates[:len(timeList) % len(dStates)])  # this ensures the lengths remain the same
nodeList = np.repeat(IDs, len(timeList) // len(IDs))
nodeList = np.append(nodeList, IDs[:len(timeList) % len(IDs)])

# combine
tseries = pd.DataFrame({
    'monthlyTrend': timeList.astype(str),
    'FromState': stateList,
    'ID': nodeList
})
df = spark.createDataFrame(tseries)
Update
Here is another approach that uses explode() and array_repeat() to achieve the above using only PySpark functions. We first create a dataframe that is as long as your longest list of params (IDs in the example), then use PySpark functions to expand it:
from pyspark.sql import functions as F
import pyspark.pandas as ps

# params
start_time = '2022-04-01'
end_time = '2022-07-01'
delta_time = ps.to_datetime(end_time).month - ps.to_datetime(start_time).month
timeSet = ps.date_range(start=start_time, end=end_time, freq='MS').tolist()
IDs = [1, 2, 3, 4, 5, 6, 7, 8]
dStates = ['A', 'B', 'C', 'D']

# create a minimum-length DF aligned to the longest list of params
longest_list = IDs
timeSet = ps.concat([ps.Series(timeSet * (len(longest_list) // len(timeSet))),
                     ps.Series(timeSet[:len(longest_list) % len(timeSet)])], ignore_index=True)
stateSet = ps.concat([ps.Series(dStates * (len(longest_list) // len(dStates))),
                      ps.Series(dStates[:len(longest_list) % len(dStates)])], ignore_index=True)
nodeSet = ps.Series(IDs)

# combine
df_tseries = ps.DataFrame({
    'monthlyTrend': timeSet,
    'FromState': stateSet,
    'ID': nodeSet}).to_spark()

# expand the df with explode and array_repeat
no_of_repeats = 10
df_tseries = df_tseries.withColumn("ID", F.explode(F.array_repeat("ID", no_of_repeats)))
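If what you ultimately need is the full cross product of months × states × IDs, a fully Spark-native variant is also possible. The sketch below is an assumption on my part (it presumes an active spark session, and the names months_df/states_df/ids_df are just illustrative): build three small dataframes and cross-join them, so all of the repetition happens inside Spark.

# one row per month between start and end (inclusive), generated by Spark SQL
months_df = spark.sql(
    "SELECT explode(sequence(to_date('2022-04-01'), to_date('2022-07-01'), interval 1 month)) AS monthlyTrend"
)
states_df = spark.createDataFrame([(s,) for s in ['A', 'B', 'C', 'D']], ['FromState'])
ids_df = spark.createDataFrame([(i,) for i in [1, 2, 3, 4, 5, 6, 7, 8]], ['ID'])

# every (month, state, ID) combination, built lazily inside Spark
df_tseries = months_df.crossJoin(states_df).crossJoin(ids_df)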

Related

How to speed up nested loop and add condition?

I am trying to speed up my nested loop; it currently takes 15 minutes for 100k customers.
I am also having trouble adding an additional condition that multiplies by the lookup2 value only for states (A, B, C), and by 1 otherwise.
import pandas as pd
import numpy as np

customer_data = pd.DataFrame({"cust_id": [1, 2, 3, 4, 5, 6, 7, 8],
                              "state": ['B', 'E', 'D', 'A', 'B', 'E', 'C', 'A'],
                              "cust_amt": [1000, 300, 500, 200, 400, 600, 200, 300],
                              "year": [3, 3, 4, 3, 4, 2, 2, 4],
                              "group": [10, 25, 30, 40, 55, 60, 70, 85]})
state_list = ['A', 'B', 'C', 'D', 'E']

# All lookups should be dataframes with the year and/or group and the value, like these:
lookup1 = pd.DataFrame({'year': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'lim %': 0.1})
lookup2 = pd.concat([pd.DataFrame({'group': g, 'lookup_val': 0.1, 'year': range(1, 11)})
                     for g in customer_data['group'].unique()])
multi_data = np.arange(250).reshape(10, 5, 5)
lookups = [lookup1, lookup2]
# Preprocessing.
# Transform the state to a categorical code to use it as an array index.
customer_data['state'] = pd.Categorical(customer_data['state'],
                                        categories=state_list,
                                        ordered=True).codes
# Set index on lookups.
for i in range(len(lookups)):
    if 'group' in lookups[i].columns:
        lookups[i] = lookups[i].set_index(['year', 'group'])
    else:
        lookups[i] = lookups[i].set_index(['year'])
Calculation:
results = {}
for customer, state, amount, start, group in customer_data.itertuples(name=None, index=False):
    for year in range(start, len(multi_data) + 1):
        if year == start:
            results[customer] = [[amount * multi_data[year - 1, state, :]]]
        else:
            results[customer].append([results[customer][-1][-1] @ multi_data[year - 1]])
        for lookup in lookups:
            if isinstance(lookup.index, pd.MultiIndex):
                value = lookup.loc[(year, group)].iat[0]
            else:
                value = lookup.loc[year].iat[0]
            results[customer][-1].append(value * results[customer][-1][-1])
Example of expected output:
{1: [[array([55000, 56000, 57000, 58000, 59000]),
      array([5500., 5600., 5700., 5800., 5900.]),
      array([550., 560., 570., 580., 590.])], ...
You could use multiprocessing if you have more than one CPU.
from multiprocessing import Pool
import multiprocessing as mp

def get_customer_data(data_tuple) -> dict:
    results = {}
    customer, state, amount, start, group = data_tuple
    for year in range(start, len(multi_data) + 1):
        if year == start:
            results[customer] = [[amount * multi_data[year - 1, state, :]]]
        else:
            results[customer].append([results[customer][-1][-1] @ multi_data[year - 1]])
        for lookup in lookups:
            if isinstance(lookup.index, pd.MultiIndex):
                value = lookup.loc[(year, group)].iat[0]
            else:
                value = lookup.loc[year].iat[0]
            results[customer][-1].append(value * results[customer][-1][-1])
    return results

p = Pool(mp.cpu_count())
# Pool.map() takes a function and an iterable like a list or generator
results_list = p.map(get_customer_data, [data_tuple for data_tuple in customer_data.itertuples(name=None, index=False)])
# results_list is a list of dicts
results_dict = {k: v for x in results_list for k, v in x.items()}
p.close()
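One caveat worth adding: on platforms that use the spawn start method (Windows, and macOS by default on recent Python versions), the pool has to be created under an if __name__ == "__main__": guard, roughly like this minimal sketch (same get_customer_data and customer_data as above):

if __name__ == "__main__":
    # the guard keeps child processes from re-executing the pool setup on import
    with Pool(mp.cpu_count()) as p:
        results_list = p.map(get_customer_data,
                             list(customer_data.itertuples(name=None, index=False)))
    results_dict = {k: v for x in results_list for k, v in x.items()}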
Glad to see you posting this! As promised, my thoughts:
Pandas works with columns very well. What you need to do is remove the need for loops as much as possible (in your case I would say get rid of the main customer loop and keep the year and lookups loops).
To do this, forget about the results{} variable for now. You want to do the calculations directly on the DataFrame. For example, your first calculation would become something like:
customer_data['meaningful_column_name'] = [[amount * multi_data[customer_data['year']-1, customer_data['state'], :]]]
For your lookups loop, you just have to be aware that the if statement will be looking at entire columns.
Finally, since it seems you want your data as a list of arrays, you will need to do some formatting to extract it from the DataFrame structure.
I hope that makes some sense
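To make the column-wise idea a bit more concrete, here is a minimal sketch (my own illustration, assuming the customer_data with state already converted to categorical codes and the multi_data array from the question; first_year is just an illustrative name). It computes the first-year vector for every customer in one vectorized step instead of inside the outer loop:

import numpy as np

# fancy indexing picks multi_data[year-1, state, :] for every customer at once,
# giving an array of shape (n_customers, 5)
first_year = (
    customer_data['cust_amt'].to_numpy()[:, None]
    * multi_data[customer_data['year'].to_numpy() - 1,
                 customer_data['state'].to_numpy(), :]
)
# first_year[k] corresponds to results[cust_id][0][0] for the k-th customer row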

What's the fastest way to vectorise creation of combinatorial DataFrame?

How can I create the master DataFrame through some vectorised process? If that's not possible, what is the most time-efficient (I'm not concerned about memory) way to execute this operation?
Can the for-loop be replaced by something more efficient?
As you can see, combinations() very quickly produces a very large number of pairs, so I need a fast way to build this DataFrame.
Please see below a minimum reproducible example:
%%time
import pandas as pd
import string
import numpy as np
from itertools import combinations

# create dummy data
cols = list(string.ascii_uppercase)
dummy = pd.DataFrame()
for col in cols:
    dummy = dummy.append([[col, 0] + np.random.randint(2, 100, size=(1, 10)).tolist()[0]])
    dummy = dummy.append([[col, 1] + np.random.randint(2, 100, size=(1, 10)).tolist()[0]])
    dummy = dummy.append([[col, 2] + np.random.randint(2, 100, size=(1, 10)).tolist()[0]])
dummy.columns = ['name', 'id', 'v1', 'v2', 'v3', 'v4', 'v5', 'v1', 'v6', 'v7', 'v8', 'v9']

# create all possible unique combinations
combos = list(combinations(cols, 2))

# generate DataFrame with all combinations
master = pd.DataFrame()
for i, combo in enumerate(combos):
    A = dummy[dummy.name == combo[0]]
    B = dummy[dummy.name == combo[1]]
    joined = pd.merge(A, B, on=["id"], suffixes=('_A', '_B'))
    joined = joined.sort_values("id")
    joined['pair_id'] = i
    master = pd.concat([master, joined])
Output:
CPU times: total: 1.8 s
Wall time: 1.8 s
Thanks!
Since your data is structural, you can drop down to numpy to take advantage of vectorized operations.
names = list(string.ascii_uppercase)
ids = [0, 1, 2]
columns = pd.Series(["v1", "v2", "v3", "v4", "v5", "v1", "v6", "v7", "v8", "v9"])

# Generate the random data
data = np.random.randint(2, 100, (len(names), len(ids), len(columns)))

# Pair data for every 2-combination of names
arr = [np.hstack([data[i], data[j]]) for i, j in combinations(range(len(names)), 2)]

# Assemble the data into the final dataframe
idx = pd.MultiIndex.from_tuples([
    (p, a, b, i) for p, (a, b) in enumerate(combinations(names, 2)) for i in ids
], names=["pair_id", "name_A", "name_B", "id"])
cols = pd.concat([columns + "_A", columns + "_B"])
master = pd.DataFrame(np.vstack(arr), index=idx, columns=cols)
Original code: 4s. New code: 7ms
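If you prefer the flat column layout of the original master frame, the index levels can simply be moved back into ordinary columns (a small follow-up note, not part of the original answer):

# turn pair_id / name_A / name_B / id back into regular columns
master_flat = master.reset_index()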

How to merge Dynamic Dataframes

I'm looking for help adding two dynamically generated dataframes.
Both DataFrames have a column computed from the input of an IntSlider ipywidget.
The third DataFrame should update dynamically on changes to either of the above DataFrames.
import pandas as pd
from ipywidgets import interact

@interact(x=(0, 1000, 10))
def df_draw_one(x):
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    print(df_one)

@interact(x=(0, 1000, 10))
def df_draw_two(x):
    data = {"A": [6, 7, 8, 9, 10]}
    df_two = pd.DataFrame(data)
    df_two['B'] = df_two['A'] * x
    print(df_two)

df_res = df_one + df_two
I understand that with the current code df_one and df_two are local, and hence result in:
NameError: name 'df_one' is not defined
but I'm at a loss as to how to make them accessible.
Any pointer would be appreciated.
You can have your functions return the two dataframes by adding a return statement.
import pandas as pd
from ipywidgets import interact

@interact(x=(0, 1000, 10))
def df_draw_one(x):
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    print(df_one)
    return df_one

@interact(x=(0, 1000, 10))
def df_draw_two(x):
    data = {"A": [6, 7, 8, 9, 10]}
    df_two = pd.DataFrame(data)
    df_two['B'] = df_two['A'] * x
    print(df_two)
    return df_two

df_one = df_draw_one(1)
df_two = df_draw_two(1)
df_res = df_one + df_two
print(df_res)
Another way is to have df_one and df_two as global variables, but it's dirty and not really necessary.
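For completeness, a minimal sketch of that (discouraged) global-variable variant, assuming the same imports as above, could look like:

df_one = None  # module-level placeholder

@interact(x=(0, 1000, 10))
def df_draw_one(x):
    global df_one  # write back to the module-level name
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    print(df_one)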
Update
One idea could be to have both widgets generated in the same function; then everything becomes much easier to handle.
import pandas as pd
from ipywidgets import interact

@interact()
def df_draw_one(x=(0, 1000, 10), y=(0, 1000, 10)):
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    data2 = {"A": [6, 7, 8, 9, 10]}
    df_two = pd.DataFrame(data2)
    df_two['B'] = df_two['A'] * y
    display(df_one)
    display(df_two)
    df_res = df_one + df_two
    display(df_res)
Here is my result:

Quicker way to iterate pandas dataframe and apply a conditional function

Summary
I am trying to iterate over a large dataframe, identify unique groups based on several columns, and apply the mean to another column based on how many rows are in each group. My current approach is very slow when iterating over a large dataset and applying the average function across many columns. Is there a way I can do this more efficiently?
Example
Here's an example of the problem. I want to find unique combinations of ['A', 'B', 'C']. For each unique combination, I want the value of column 'D' divided by the number of rows in the group.
Edit:
The resulting dataframe should preserve the duplicated groups, but with the edited column 'D'.
import pandas as pd
import numpy as np
import datetime

def time_mean_rows():
    # Generate some random data
    A = np.random.randint(0, 5, 1000)
    B = np.random.randint(0, 5, 1000)
    C = np.random.randint(0, 5, 1000)
    D = np.random.randint(0, 10, 1000)

    # init dataframe
    df = pd.DataFrame(data=[A, B, C, D]).T
    df.columns = ['A', 'B', 'C', 'D']

    tstart = datetime.datetime.now()

    # Get unique combinations of A, B, C
    unique_groups = df[['A', 'B', 'C']].drop_duplicates().reset_index()

    # Iterate unique groups
    normalised_solutions = []
    for idx, row in unique_groups.iterrows():
        # Subset dataframe to the unique group
        sub_df = df[
            (df['A'] == row['A']) &
            (df['B'] == row['B']) &
            (df['C'] == row['C'])
        ]

        # If more than one solution, get mean of column D
        num_solutions = len(sub_df)
        if num_solutions > 1:
            sub_df.loc[:, 'D'] = sub_df.loc[:, 'D'].values.sum(axis=0) / num_solutions
        normalised_solutions.append(sub_df)

    # Concatenate results
    res = pd.concat(normalised_solutions)

    tend = datetime.datetime.now()
    time_elapsed = (tend - tstart).seconds
    print(time_elapsed)
I know the section causing the slowdown is when num_solutions > 1. How can I do this more efficiently?
Hm, why don't you use groupby?
df_res = df.groupby(['A', 'B', 'C'])['D'].mean().reset_index()
This is a complement to AT_asks's answer, which only gave the first part of the solution.
Once we have df.groupby(['A', 'B', 'C'])['D'].mean(), we can use it to change the value of column 'D' in a copy of the original dataframe, provided we use a dataframe sharing the same index. The global solution is then:
res = df.set_index(['A', 'B', 'C']).assign(
    D=df.groupby(['A', 'B', 'C'])['D'].mean()).reset_index()
This will contain the same rows (even if in a different order) as the res dataframe from the OP's question.
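For reference, the same result (every row kept, 'D' replaced with its group mean) can also be obtained with groupby().transform(), which broadcasts each group's mean back onto its rows:

# transform() returns a Series aligned to the original index
res = df.copy()
res['D'] = res.groupby(['A', 'B', 'C'])['D'].transform('mean')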
Here's a solution I found: using groupby as suggested by AT, then merging back to the original df and dropping the original ['D', 'E'] columns. Nice speedup!
from timeit import default_timer as timer  # imports assumed for the timer()/timedelta calls below
from datetime import timedelta

def time_mean_rows():
    # Generate some random data
    np.random.seed(seed=42)
    A = np.random.randint(0, 10, 10000)
    B = np.random.randint(0, 10, 10000)
    C = np.random.randint(0, 10, 10000)
    D = np.random.randint(0, 10, 10000)
    E = np.random.randint(0, 10, 10000)

    # init dataframe
    df = pd.DataFrame(data=[A, B, C, D, E]).T
    df.columns = ['A', 'B', 'C', 'D', 'E']

    tstart_grpby = timer()
    cols = ['D', 'E']

    group_df = df.groupby(['A', 'B', 'C'])[cols].mean().reset_index()

    # Merge df
    df = pd.merge(df, group_df, how='left', on=['A', 'B', 'C'], suffixes=('_left', ''))

    # Get left columns (which have not been normalised) and drop them
    drop_cols = [x for x in df.columns if x.endswith('_left')]
    df.drop(drop_cols, inplace=True, axis='columns')

    tend_grpby = timer()
    time_elapsed_grpby = timedelta(seconds=tend_grpby - tstart_grpby).total_seconds()
    print(time_elapsed_grpby)

Pandas assignment using nested loops leading to memory error

I am using pandas and trying to do an assignment using nested loops. I iterate over a dataframe and then run a distance function if a row meets a certain criterion. I am faced with two problems:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
Memory error: it doesn't work on large datasets, and I end up having to terminate the process.
How should I change my solution to ensure it can scale to a larger dataset of 60,000 rows?
for i, row in df.iterrows():
    listy = 0
    school = []
    if row['LS_Type'] == 'Primary (1-4)':
        a = row['Northing']
        b = row['Easting']
        LS_ID = row['LS_ID']
        for j, row2 in df.iterrows():
            if row2['LS_Type'] == 'Primary (1-8)':
                dist_km = distance(a, b, df.Northing[j], df.Easting[j])
                if listy == 0:
                    listy = dist_km
                    school.append([df.LS_Name[j], df.LS_ID[j]])
                else:
                    if dist_km < listy:
                        listy = dist_km
                        school[0] = [df.LS_Name[j], int(df.LS_ID[j])]
        df['dist_up_prim'][i] = listy
        df["closest_up_prim"][i] = school[0]
    else:
        df['dist_up_prim'][i] = 0
The double for-loop is what's killing you here. See if you can break it up into two separate apply steps.
Here is a toy example of using df.apply() and functools.partial to express a nested loop:
import math
import pandas as pd
from functools import partial

df = pd.DataFrame.from_dict({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                             'B': [1, 2, 3, 4, 5, 6, 7, 8]})

def myOtherFunc(row):
    if row['A'] <= 4:
        return row['B'] * row['A']

def myFunc(the_df, row):
    if row['A'] <= 2:
        other_B = the_df.apply(myOtherFunc, axis=1)
        return other_B.mean()
    return math.nan

apply_myFunc_on_df = partial(myFunc, df)
df.apply(apply_myFunc_on_df, axis=1)
You can rewrite your code in this form, which will be much faster.
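Applied to your actual problem, a vectorized sketch could look like the following. This is an assumption-heavy illustration: it presumes Northing/Easting are planar coordinates, so a Euclidean distance can stand in for your distance() helper, it stores only the nearest school's name rather than the [name, id] pair, and the variable names (p14, p18, d2, nearest) are just illustrative.

import numpy as np

# split the frame into the two school types once, instead of inside nested loops
p14 = df[df['LS_Type'] == 'Primary (1-4)']
p18 = df[df['LS_Type'] == 'Primary (1-8)']

# pairwise squared distances via broadcasting: shape (len(p14), len(p18))
d2 = ((p14['Northing'].to_numpy()[:, None] - p18['Northing'].to_numpy()) ** 2
      + (p14['Easting'].to_numpy()[:, None] - p18['Easting'].to_numpy()) ** 2)

nearest = d2.argmin(axis=1)  # index of the closest (1-8) school for each (1-4) school
df.loc[p14.index, 'dist_up_prim'] = np.sqrt(d2[np.arange(len(p14)), nearest])
df.loc[p14.index, 'closest_up_prim'] = p18['LS_Name'].to_numpy()[nearest]
df['dist_up_prim'] = df['dist_up_prim'].fillna(0)  # 0 for all other rows, as in the loop

If the pairwise matrix itself becomes too large for memory, the usual next step is to process p14 in chunks or to query a scipy.spatial.cKDTree built on the (Easting, Northing) pairs.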
