How to merge Dynamic Dataframes - python

I'm looking for help to add two dynamically generated dataframes.
Both DataFrames have a column computed on input from an intslider ipywidget.
the third Dataframe should update dynamically on changes of any of above Dataframes
import pandas as pd
from ipywidgets import interact
#interact(x=(0,1000,10))
def df_draw_one(x):
data = {"A":[1,2,3,4,5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A']*x
print(df_one)
#interact(x=(0,1000,10))
def df_draw_two(x):
data = {"A":[6,7,8,9,10]}
df_two = pd.DataFrame(data)
df_two['B'] = df_two['A']*x
print(df_two)
df_res = df_one+df_two
I understand with the current code, df_one and two are local and hence result in:
NameError: name 'df_one' is not defined
but I'm at loss on how to make them accessible.
Any pointer would be appreciated

You can have your functions return the two dataframe adding a return statement.
import pandas as pd
from ipywidgets import interact
#interact(x=(0, 1000, 10))
def df_draw_one(x):
data = {"A": [1, 2, 3, 4, 5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A'] * x
print(df_one)
return df_one
#interact(x=(0, 1000, 10))
def df_draw_two(x):
data = {"A": [6, 7, 8, 9, 10]}
df_two = pd.DataFrame(data)
df_two['B'] = df_two['A'] * x
print(df_two)
return df_two
df_one = df_draw_one(1)
df_two = df_draw_two(1)
df_res = df_one + df_two
print(df_res)
Another way is to have df_one and df_two as global variables, but it's dirty and not really necessary.
Update
One idea could be to have both widget generated in the same function, then everything becomes more easy to handle.
import pandas as pd
from ipywidgets import interact
#interact()
def df_draw_one(x=(0, 1000, 10), y=(0, 1000, 10)):
data = {"A": [1, 2, 3, 4, 5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A'] * x
data2 = {"A": [6, 7, 8, 9, 10]}
df_two = pd.DataFrame(data2)
df_two['B'] = df_two['A'] * y
display(df_one)
display(df_two)
df_res = df_one + df_two
display(df_res)
Here my result:

Related

PySpark: Generating Data in Pandas Very Slow

I need to generate some data in PySpark and I am currently using PySpark pandas to do it. What I have found is that when I want to use .repeat() to scale my data generating process, it is very, very slow (tens of minutes).
Are there any other alternatives that I can use to generate a dataframe of sorts like as follows?
import pyspark.pandas as ps
# params
start_time = '2022-04-01'
end_time = '2022-07-01'
IDs = [1, 2, 3, 4, 5, 6, 7, 8, ...]
dStates = ['A', 'B', 'C', 'D', ....]
# delta time
delta_time = (ps.to_datetime(end_time).month - ps.to_datetime(start_time).month)
# create DF
timeSet = ps.date_range(start=start_time, end=end_time, freq='MS').repeat( len(dStates) * len(IDs) )
stateSet = ps.Series( dStates * ( delta_time + 1 ) * len(IDs) )
nodeSet = ps.Series(IDs).repeat( len(dStates) * ( delta_time + 1 ) ).reset_index(drop=True)
# combine
tseries = ps.DataFrame({'monthlyTrend': timeSet.astype(str),
'FromState': stateSet,
'ID': nodeSet})
Usually numpy functions are more optimized, so you could try using numpy.repeat(). I have tweaked the below code to generate dates day by day in a range and adjust IDs and dStates according to the timeList's length:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# params
start_time = '2022-04-01'
end_time = '2022-07-01'
IDs = [1, 2, 3, 4, 5, 6, 7, 8]
dStates = ['A', 'B', 'C', 'D']
# Generate data based on params
timeList = np.arange(datetime(2022, 4, 1), datetime(2022, 7, 1), timedelta(days=1)).astype(datetime)
stateList = np.repeat(dStates, len(timeList)//len(dStates))
stateList = np.append(stateList, dStates[:len(timeList)%len(dStates)]) # this ensures the lengths remain the same
nodeList = np.repeat(IDs, len(timeList)//len(IDs))
nodeList = np.append(nodeList, IDs[:len(timeList)%len(IDs)])
# combine
tseries = pd.DataFrame({
'monthlyTrend': timeList.astype(str),
'FromState': stateList,
'ID': nodeList
})
df = spark.createDataFrame(tseries)
Update
Here is another approach that uses explode() and array_repeat to achieve the above using only pyspark functions. We first create a dataframe that is as long as your longest list of params (in the example it's IDs). Then use pyspark functions to expand it.
from pyspark.sql import functions as F
import pyspark.pandas as ps
# params
start_time = '2022-04-01'
end_time = '2022-07-01'
delta_time = (ps.to_datetime(end_time).month - ps.to_datetime(start_time).month)
timeSet = ps.date_range(start=start_time, end=end_time, freq='MS').tolist()
IDs = [1, 2, 3, 4, 5, 6, 7, 8]
dStates = ['A', 'B', 'C', 'D']
# create a minimum length DF aligned to the longest list of params
longest_list = IDs
timeSet = ps.concat([ps.Series(timeSet * (len(longest_list)//len(timeSet))), ps.Series(timeSet[:len(longest_list)%len(timeSet)])], ignore_index=True)
stateSet = ps.concat([ps.Series(dStates * (len(longest_list)//len(dStates))), ps.Series(dStates[:len(longest_list)%len(dStates)])], ignore_index=True)
nodeSet = ps.Series(IDs)
# combine
df_tseries = ps.DataFrame({
'monthlyTrend': timeSet,
'FromState': stateSet,
'ID': nodeSet}).to_spark()
# expand the df with explode and array_repeat
no_of_repeats = 10
df_tseries = df_tseries.withColumn("ID", F.explode(F.array_repeat("ID", no_of_repeats)))

How to speed up nested loop and add condition?

I am trying to speed up my nested loop it currently takes 15 mins for 100k customers.
I am also having trouble adding an additional condition that only multiplies states (A,B,C) by lookup2 val, else multiplies by 1.
customer_data = pd.DataFrame({"cust_id": [1, 2, 3, 4, 5, 6, 7, 8],
"state": ['B', 'E', 'D', 'A', 'B', 'E', 'C', 'A'],
"cust_amt": [1000,300, 500, 200, 400, 600, 200, 300],
"year":[3, 3, 4, 3, 4, 2, 2, 4],
"group":[10, 25, 30, 40, 55, 60, 70, 85]})
state_list = ['A','B','C','D','E']
# All lookups should be dataframes with the year and/or group and the value like these.
lookup1 = pd.DataFrame({'year': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'lim %': 0.1})
lookup2 = pd.concat([pd.DataFrame({'group':g, 'lookup_val': 0.1, 'year':range(1, 11)}
for g in customer_data['group'].unique())]).explode('year')
multi_data = np.arange(250).reshape(10,5,5)
lookups = [lookup1, lookup2]
# Preprocessing.
# Transform the state to categorical code to use it as array index.
customer_data['state'] = pd.Categorical(customer_data['state'],
categories=state_list,
ordered=True).codes
# Set index on lookups.
for i in range(len(lookups)):
if 'group' in lookups[i].columns:
lookups[i] = lookups[i].set_index(['year', 'group'])
else:
lookups[i] = lookups[i].set_index(['year'])
calculation:
results = {}
for customer, state, amount, start, group in customer_data.itertuples(name=None, index=False):
for year in range(start, len(multi_data)+1):
if year == start:
results[customer] = [[amount * multi_data[year-1, state, :]]]
else:
results[customer].append([results[customer][-1][-1] # multi_data[year-1]])
for lookup in lookups:
if isinstance(lookup.index, pd.MultiIndex):
value = lookup.loc[(year, group)].iat[0]
else:
value = lookup.loc[year].iat[0]
results[customer][-1].append(value * results[customer][-1][-1])
example of expected output:
{1: [[array([55000, 56000, 57000, 58000, 59000]),
array([5500., 5600., 5700., 5800., 5900.]),
array([550., 560., 570., 5800., 5900.])],...
You could use multiprocessing if you have more than one CPU.
from multiprocessing import Pool
def get_customer_data(data_tuple) -> dict:
results = {}
customer, state, amount, start, group = data_tuple
for year in range(start, len(multi_data)+1):
if year == start:
results[customer] = [[amount * multi_data[year-1, state, :]]]
else:
results[customer].append([results[customer][-1][-1] # multi_data[year-1]])
for lookup in lookups:
if isinstance(lookup.index, pd.MultiIndex):
value = lookup.loc[(year, group)].iat[0]
else:
value = lookup.loc[year].iat[0]
results[customer][-1].append(value * results[customer][-1][-1])
return results
p = Pool(mp.cpu_count())
# Pool.map() takes a function and an iterable like a list or generator
results_list = p.map(get_customer_data, [data_tuple for data_tuple in customer_data.itertuples(name=None, index=False)] )
# results is a list of dict()
results_dict = {k:v for x in results_list for k,v in x.items()}
p.close()
Glad to see you posting this! As promised, my thoughts:
With Pandas works with columns very well. What you need to look to do is remove the need for loops as much as possible (In your case I would say get rid of the main loop you have then keep the year and lookups loop).
To do this, forget about the results{} variable for now. You want to do the calculations directly on the DataFrame. For example your first calculation would become something like:
customer_data['meaningful_column_name'] = [[amount * multi_data[customer_data['year']-1, customer_data['state'], :]]]
For your lookups loop you just have to be aware that the if statement will be looking at entire columns.
Finally, as it seems you want to have your data in a list of arrays you will need to do some formatting to extract the data from a DataFrame structure.
I hope that makes some sense

How to run variable of one function to another in python?

I am a beginner. I have two function:
1st creating dataframes and some print statement
2nd is downloading the dataframes to csv in colab.
I want to download all dataframes by the df_name.
code:
def fun1():
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
print('info', df.info())
print('info', df2.info())
return df, df2
def fun2(df):
from google.colab import files
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
files.download(name1)
files.download(name2)
fun2(df) #looking something like this that download my df, func2 should read my df and df2 from fun1()
I tried:
class tom:
def fun1(self):
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
print('info', df.info())
print('info', df2.info())
self.df= df
self.df2 = df2
return df, df2
def fun2(self):
df,df2 = fun1()
from google.colab import files
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
return files.download(name1) ,files.download(name2)
tom().fun2() #it download files but shows print of fun1 as well which I don't want.
looking for something like
tom().fun2(dataframe_name) #it just download the files nothing else
set permanent variables directly in the class if its not gonna change and
define fun just for actions.
class s:
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
def f():
print('info', df.info())
print('info', df2.info())
def fun(x):
from google.colab import files
return files.download(x)
run
s.f() --it will print value only
s.fun(s.name1) --it will just download the file
Maybe you can save the data you need in a class variable or create another function, that keeps the data from the first function you need the value (lets call it A) and then pass A to the second function as an argument.

Pandas assignment using nested loops leading to memory error

I am using pandas and trying to do an assignment using a nested loops. I iterate over a dataframe and then run a distance function if it meets a certain criteria. I am faced with two problems:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
Memory Error. It doesn't work on large datasets. I end up having to terminate the process.
How should I change my solution to ensure it can scale with a larger dataset of 60,000 rows?
for i, row in df.iterrows():
listy = 0
school = []
if row['LS_Type'] == 'Primary (1-4)':
a = row['Northing']
b = row['Easting']
LS_ID = row['LS_ID']
for j, row2 in df.iterrows():
if row2['LS_Type'] == 'Primary (1-8)':
dist_km = distance(a,b, df.Northing[j], df.Easting[j])
if (listy == 0):
listy = dist_km
school.append([df.LS_Name[j], df.LS_ID[j]])
else:
if dist_km < listy:
listy = dist_km
school[0] = [df.LS_Name[j], int(df.LS_ID[j])]
df['dist_up_prim'][i] = listy
df["closest_up_prim"][i] = school[0]
else:
df['dist_up_prim'][i] = 0
The double for loop is what's killing you here. See if you can break it up into two separate apply steps.
Here is a toy example of using df.apply() and partial to do a nested for loop:
import math
import pandas as pd
from functools import partial
df = pd.DataFrame.from_dict({'A': [1, 2, 3, 4, 5, 6, 7, 8],
'B': [1, 2, 3, 4, 5, 6, 7, 8]})
def myOtherFunc(row):
if row['A'] <= 4:
return row['B']*row['A']
def myFunc(the_df, row):
if row['A'] <= 2:
other_B = the_df.apply(myOtherFunc, axis=1)
return other_B.mean()
return pd.np.NaN
apply_myFunc_on_df = partial(myFunc, df)
df.apply(apply_myFunc_on_df, axis=1)
You can rewrite your code in this form, which will be much faster.

Transforming pandas Dataframe into dictionary via function taking column inputs

I have the following pandas Dataframe:
dict1 = {'file': ['filename2', 'filename2', 'filename3', 'filename4', 'filename4', 'filename3'], 'amount': [3, 4, 5, 1, 2, 1], 'front':[21889611, 36357723, 196312, 11, 42, 1992], 'back':[21973805, 36403870, 277500, 19, 120, 3210]}
df1 = pd.DataFrame(dict1)
print(df1)
file amount front back
0 filename2 3 21889611 21973805
1 filename2 4 36357723 36403870
2 filename3 5 196312 277500
3 filename4 1 11 19
4 filename4 2 42 120
5 filename3 1 1992 3210
My task is to take N random draws between front and back, whereby N is equal to the value in amount. Parse this into a dictionary.
To do this on an row-by-row basis is easy for me to understand:
e.g. row 1
import numpy as np
random_draws = np.random.choice(np.arange(21889611, 21973805+1), 3)
e.g. row 2
random_draws = np.random.choice(np.arange(36357723, 36403870+1), 4)
Normally with pandas, users could define this as a function and use something like
def func(front, back, amount):
return np.random.choice(np.arange(front, back+1), amount)
df["new_column"].apply(func)
but the result of my function is an array of varying size.
My second problem is that I would like the output to be a dictionary, of the format
{file: [random_draw_results], file: [random_draw_results], file: [random_draw_results], ...}
For the above example df1, the function should output this dictionary (given the draws):
final_dict = {"filename2": [21927457, 21966814, 21898538, 36392840, 36375560, 36384078, 36366833],
"filename3": 212143, 239725, 240959, 197359, 276948, 3199],
"filename4": [100, 83, 15]}
We can pass axis=1 to operate over rows when using apply.
We then need to tell what columns to use and we return a list.
We then either perform some form of groupby or we could use defaultdict as shown below:
dict1 = {'file': ['filename2', 'filename2', 'filename3', 'filename4', 'filename4', 'filename3'], 'amount': [3, 4, 5, 1, 2, 1], 'front':[21889611, 36357723, 196312, 11, 42, 1992], 'back':[21973805, 36403870, 277500, 19, 120, 3210]}
import numpy as np
import pandas as pd
def func(x):
return np.random.choice(np.arange(x.front, x.back+1), x.amount).tolist()
df1 = pd.DataFrame(dict1)
df1["new_column"] = df1.apply(func, axis=1)
df1.groupby('file')['new_column'].apply(sum).to_dict()
Returns:
{'filename2': [21891765,
21904680,
21914414,
36398355,
36358161,
36387670,
36369443],
'filename3': [240766, 217580, 217581, 274396, 241413, 2488],
'filename4': [18, 96, 107]}
Alt2 would be to use (and by some small timings I ran it looks like it runs as fast):
from collections import defaultdict
d = defaultdict(list)
for k,v in df1.set_index('file')['new_column'].items():
d[k].extend(v)

Categories