I found a behavior in pandas that I'm not able to explain to myself.
I am studying a database of audio features with N+2 columns: an ID, the time t, and N audio features related to time t. For various reasons, I would like every row to also contain the features of the next T time steps (yes, the same data will be repeated up to T times). I have therefore written a function that creates additional feature columns containing data from the successive time steps. I have implemented it in three ways, as you can see in the attached code, and one of them is not working, which is surprising to me since it works if the underlying data structures are numpy arrays. Can anybody explain to me why?
def create_datapoints_for_dnn(df, T):
    """
    Here we take the data frame with chroma features at time t and create all features at times t+1, t+2, ..., t+T-1.
    :param df: initial data frame of chroma features
    :param T: number of time steps to keep
    :return: expanded data frame of chroma features
    """
    res = df.copy()
    original_labels = df.columns.values
    n_steps = df.shape[0]  # the number of time steps in this song
    nans = pd.Series(np.full(n_steps, np.NaN)).values  # a column of nans of the correct length
    for n in range(1, T):
        new_labels = [ol + '+' + str(n) for ol in original_labels[2:]]
        for nl, ol in zip(new_labels, original_labels[2:]):
            # df.assign would use the name "nl" instead of what nl contains, so we build and unpack a dictionary
            res = res.assign(**{nl: nans})  # create a new column
            # CORRECT BUT EXTREMELY SLOW
            # for i in range(n_steps - (T - 1)):
            #     res.iloc[i, res.columns.get_loc(nl)] = df.iloc[n+i, df.columns.get_loc(ol)]
            # CORRECT AND FAST
            res.iloc[:-n, res.columns.get_loc(nl)] = df.iloc[:, df.columns.get_loc(ol)].shift(-n)
            # NOT WORKING
            # res.iloc[:-n, res.columns.get_loc(nl)] = df.iloc[n:, df.columns.get_loc(ol)]
    return res[: - (T - 1)]  # drop the last T-1 rows because time t+T-1 is not defined for them
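For context, the likely reason the NOT WORKING line misbehaves (this is an editorial reading, not something stated in the answers below): when the right-hand side of an .iloc assignment is a pandas Series, pandas aligns it on its index labels rather than assigning by position, so the labels n..len-1 of df.iloc[n:, ...] line up with the same labels in res instead of being moved up, and the first rows become NaN. A plain numpy array carries no index, which is why the same pattern works there. A minimal sketch with toy names:

import numpy as np
import pandas as pd

s = pd.Series([10.0, 20.0, 30.0, 40.0])
res = pd.DataFrame({'x': np.zeros(4)})

# Series on the right: aligned on labels 1..3, so row 0 becomes NaN and nothing actually shifts
res.iloc[:-1, 0] = s.iloc[1:]
print(res['x'].tolist())  # expected: [nan, 20.0, 30.0, 0.0] because of label alignment

# ndarray on the right: purely positional, the behavior seen with numpy arrays
res.iloc[:-1, 0] = s.iloc[1:].values
print(res['x'].tolist())  # expected: [20.0, 30.0, 40.0, 0.0]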
Data example (put it in a csv):
songID,time,A_t,A#_t
CrossEra-0850,0.0,0.0,0.0
CrossEra-0850,0.1,0.0,0.0
CrossEra-0850,0.2,0.0,0.0
CrossEra-0850,0.3,0.31621,0.760299
CrossEra-0850,0.4,0.0,0.00107539
CrossEra-0850,0.5,0.0,0.142832
CrossEra-0850,0.6,0.8506459999999999,0.12481600000000001
CrossEra-0850,0.7,0.0,0.21206399999999997
CrossEra-0850,0.8,0.0796207,0.28227399999999997
CrossEra-0850,0.9,2.55144,0.169434
CrossEra-0850,1.0,3.4581699999999995,0.08014550000000001
CrossEra-0850,1.1,3.1061400000000003,0.030419599999999998
Code to run it
import pandas as pd
import numpy as np
T = 4 # how many successive steps we want to put in a single row
df = pd.read_csv('path_to_csv')
res = create_datapoints_for_dnn(df, T)
res.to_csv('path_to_output', index=False)
Results:
Use pd.DataFrame.shift and concat.
Note: the f-string requires Python 3.6+; otherwise use '+{}'.format(i).
cols = ['songID', 'time']
d = df.drop(['songID', 'time'], axis=1)

df[cols].join(
    pd.concat(
        [d.shift(-i).add_suffix(f'+{i}') for i in range(4)],
        axis=1
    )
)
songID time A_t+0 A#_t+0 A_t+1 A#_t+1 A_t+2 A#_t+2 A_t+3 A#_t+3
0 CrossEra-0850 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.316210 0.760299
1 CrossEra-0850 0.1 0.000000 0.000000 0.000000 0.000000 0.316210 0.760299 0.000000 0.001075
2 CrossEra-0850 0.2 0.000000 0.000000 0.316210 0.760299 0.000000 0.001075 0.000000 0.142832
3 CrossEra-0850 0.3 0.316210 0.760299 0.000000 0.001075 0.000000 0.142832 0.850646 0.124816
4 CrossEra-0850 0.4 0.000000 0.001075 0.000000 0.142832 0.850646 0.124816 0.000000 0.212064
5 CrossEra-0850 0.5 0.000000 0.142832 0.850646 0.124816 0.000000 0.212064 0.079621 0.282274
6 CrossEra-0850 0.6 0.850646 0.124816 0.000000 0.212064 0.079621 0.282274 2.551440 0.169434
7 CrossEra-0850 0.7 0.000000 0.212064 0.079621 0.282274 2.551440 0.169434 3.458170 0.080146
8 CrossEra-0850 0.8 0.079621 0.282274 2.551440 0.169434 3.458170 0.080146 3.106140 0.030420
9 CrossEra-0850 0.9 2.551440 0.169434 3.458170 0.080146 3.106140 0.030420 NaN NaN
10 CrossEra-0850 1.0 3.458170 0.080146 3.106140 0.030420 NaN NaN NaN NaN
11 CrossEra-0850 1.1 3.106140 0.030420 NaN NaN NaN NaN NaN NaN
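One follow-up, purely an addition to the answer above: if the CSV ever contains more than one songID, the shift can be applied per group so that rows from one song do not pick up data from the next. A hedged sketch reusing the names above:

cols = ['songID', 'time']
feature_cols = [c for c in df.columns if c not in cols]

# shift within each song, so the last rows of a song get NaN rather than the next song's data
shifted = pd.concat(
    [df.groupby('songID')[feature_cols].shift(-i).add_suffix(f'+{i}') for i in range(4)],
    axis=1
)
res = df[cols].join(shifted)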
Related
I have the following dataframe:
df1 = pd.DataFrame({'id' : [1, 2, 1,2], 'plat' : ['and','and','ios','ios'], 'd30_real' : [1.2,1.4,1.5,1.9], 'd3_d30':[1.1,1.5,1.5,1.8], 'd7_d30':[1.4,1.5,1.6,1.9], 'd14_d30':[1.2,1.3,1.5,2.0]})
I want to calculate the MAPE with the sklearn function, comparing the column in lista_target as the real value and the columns in lista_com as the predictions. This last list has to be A LIST OF LISTS. This cannot be changed.
lista_target = ['d30_real']
lista_com = [['d3_d30','d7_d30','d14_d30']]
mape_hm = pd.DataFrame()
This is the loop to generate the MAPE results:
for i in range(len(lista_target)):
    for e in range(len(lista_com[i])):
        mape_hm[i][e] = df1.groupby(by=['id', 'plat']).apply(lambda x: mean_absolute_percentage_error(x[lista_target[i]], x[lista_com[i][e]]))
But it is giving me this error:
KeyError: 0
I understand that this is because it is not finding the position '0' in lista_target. I would like to know what I am doing wrong, as I need to read the string inside the list, not the position.
This would be the output (fake numbers):
result = pd.DataFrame({'id' : [1, 2, 1,2], 'plat' : ['and','and','ios','ios'], 'd30_real' : [1.2,1.4,1.5,1.9], 'MAPE_d3_d30':[0.02,0.03,0.4,0.0], 'MAPE_d7_d30':[0.03,0.04,0.06,0], 'MAPE_d14_d30':[0.0,0.02,0,0.09]})
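A quick aside on the traceback (an editorial reading, not stated in the answers below): the KeyError: 0 most likely comes from mape_hm[i][e] on the left-hand side, because selecting a column that does not yet exist in the empty mape_hm DataFrame raises KeyError with the missing key, which is 0 on the first iteration. A minimal sketch of that failure mode:

import pandas as pd

mape_hm = pd.DataFrame()
try:
    mape_hm[0][0] = 1.23  # column 0 does not exist yet, so the lookup itself fails
except KeyError as exc:
    print('KeyError:', exc)  # KeyError: 0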
I see in your question that you use groupby on the ['id','plat'] columns, so I wrote the answer with groupby, used apply on it, and built a dataframe from sklearn.metrics.mean_absolute_percentage_error for the columns that you want.
from sklearn.metrics import mean_absolute_percentage_error
cols = [['d3_d30'], ['d7_d30', 'd14_d30']]
lst = []
def f_mape(x):
    dct = {}
    for col in cols:
        for c in col:
            dct[f'real_{c}'] = mean_absolute_percentage_error(x['d30_real'], x[c])
    lst.append(dct)
df1.groupby(['id', 'plat']).apply(lambda x: f_mape(x))
print(pd.DataFrame(lst))
Output:
real_d3_d30 real_d7_d30 real_d14_d30
0 0.083333 0.166667 0.000000
1 0.000000 0.066667 0.000000
2 0.071429 0.071429 0.071429
3 0.052632 0.000000 0.052632
Assuming you want to compute the MAPE per column, per group:
from sklearn.metrics import mean_absolute_percentage_error as mape
(df1
 .groupby(['id', 'plat'])[lista_com[0]]
 .transform(lambda g: mape(df1.loc[g.index, lista_target[0]], g))
 .add_prefix('MAPE_')
)
output:
MAPE_d3_d30 MAPE_d7_d30 MAPE_d14_d30
0 0.083333 0.166667 0.000000
1 0.071429 0.071429 0.071429
2 0.000000 0.066667 0.000000
3 0.052632 0.000000 0.052632
full output:
out = df1.drop(columns=lista_com[0]).join(
    df1.groupby(['id', 'plat'])[lista_com[0]]
       .transform(lambda g: mape(df1.loc[g.index, lista_target[0]], g))
       .add_prefix('MAPE_')
)
output:
id plat d30_real MAPE_d3_d30 MAPE_d7_d30 MAPE_d14_d30
0 1 and 1.2 0.083333 0.166667 0.000000
1 2 and 1.4 0.071429 0.071429 0.071429
2 1 ios 1.5 0.000000 0.066667 0.000000
3 2 ios 1.9 0.052632 0.000000 0.052632
I have 2 pivot tables in pandas pre_pt and sd_pt which may not have the same columns or index values. For example,
pre_pt=
risk_type R1 R2
qualifier
A 1.958512e+06 -10718.288787
B -1.008596e+04 1.933457
C 0.000000e+00 0.329764
D 2.390952e+03 5726.806464
E 1.002147e+04 -4.661991
F -2.016144e+06 12807.479302
sd_pt=
risk_type R1 R3 R4
qualifier
A 1.936494e+06 0.000000 -10425.198385
B -1.010489e+04 0.000000 1.107070
C 0.000000e+00 0.000000 0.568966
D 2.648684e+03 0.000000 5640.661105
E 1.001735e+04 0.000000 -3.839769
F -2.006834e+06 0.000000 12633.668916
G 0.000000e+00 11589.966215 0.000000
I want to be able to get the difference between the two in a new data frame so that I can then concatenate the 3 data frames into a report. I need to fill in the elements that are missing from either data frame with zero so that they can be subtracted. I do this with the following code, which works, but I wanted to know if there is a built-in solution in pandas.
# create a set of the columns and indices
my_cols = set()
my_qs = set()
my_pts = [pre_pt, sd_pt]
for pts in my_pts:
    my_cols.update(pts.columns.tolist())
    my_qs.update(pts.index.tolist())

# now add the cols, indices that don't exist
for pts in my_pts:
    pts_cols = pts.columns.tolist()
    pts_qs = pts.index.tolist()
    for c in my_cols:
        if c not in pts_cols:
            pts[c] = 0.0
    for q in my_qs:
        if q not in pts_qs:
            pts.loc[q] = 0.0
    pts = pts.sort_index(axis=1)
    pts = pts.sort_index()

diff = sd_pt - pre_pt

# concatenate all of the pivot tables
risk_sd = pd.concat([pre_pt, sd_pt, diff], axis=1, keys=[pre_start_date, start_date, "Difference"], sort=True)
You can use sub:
diff = sd_pt.sub(pre_pt, fill_value=0)
risk_sd = pd.concat([pre_pt, sd_pt, diff], axis=1, sort=True,
                    keys=['pre_start_date', 'start_date', 'Difference'])
print(risk_sd)
# Output
pre_start_date start_date Difference
risk_type R1 R2 R1 R3 R4 R1 R2 R3 R4
qualifier
A 1958512.000 -10718.288787 1936494.000 0.000000 -10425.198385 -22018.000 10718.288787 0.000000 -10425.198385
B -10085.960 1.933457 -10104.890 0.000000 1.107070 -18.930 -1.933457 0.000000 1.107070
C 0.000 0.329764 0.000 0.000000 0.568966 0.000 -0.329764 0.000000 0.568966
D 2390.952 5726.806464 2648.684 0.000000 5640.661105 257.732 -5726.806464 0.000000 5640.661105
E 10021.470 -4.661991 10017.350 0.000000 -3.839769 -4.120 4.661991 0.000000 -3.839769
F -2016144.000 12807.479302 -2006834.000 0.000000 12633.668916 9310.000 -12807.479302 0.000000 12633.668916
G NaN NaN 0.000 11589.966215 0.000000 0.000 NaN 11589.966215 0.000000
Append .fillna(0) after pd.concat(...) to fill NaN with 0 if needed.
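Spelled out, that last remark looks like this, reusing the names from above:

risk_sd = pd.concat([pre_pt, sd_pt, diff], axis=1, sort=True,
                    keys=['pre_start_date', 'start_date', 'Difference']).fillna(0)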
I have 3 dataframes in Pandas:
1) user_interests:
With 'user' as an id, and 'interest' as an interest:
2) similarity_score:
With 'user' as a unique id matching ids in user_interests:
3) similarity_total:
With 'interest' being a list of all the unique interests in user_interests:
What I need to do:
Step 1: Look up the interest from similarity_total in user_interests
Step 2: Take the corresponding user from user_interests and match it to the user in similarity_score
Step 3: Take the corresponding similarity_score from similarity_score and add it to the corresponding interest in similarity_total
The ultimate objective is to total the similarity scores of all users interested in the subjects in similarity_total. A diagram may help:
I know this can be done in Pandas in one line, however I am not there yet. If anyone can point me in the right direction, that would be amazing. Thanks!
IIUC, I think you need:
user_interests['similarity_score'] = user_interests['user'].map(similarity_score.set_index('user')['similarity_score'])
similarity_total = user_interests.groupby('interest', as_index=False)['similarity_score'].sum()
Output:
interest similarity_score
0 Big Data 1.000000
1 Cassandra 1.338062
2 HBase 0.338062
3 Hbase 1.000000
4 Java 1.154303
5 MongoDB 0.338062
6 NoSQL 0.338062
7 Postgres 0.338062
8 Python 0.154303
9 R 0.154303
10 Spark 1.000000
11 Storm 1.000000
12 decision tree 0.000000
13 libsvm 0.000000
14 machine learning 0.000000
15 numpy 0.000000
16 pandas 0.000000
17 probability 0.000000
18 regression 0.000000
19 scikit-learn 0.000000
20 scipy 0.000000
21 statistics 0.000000
22 statsmodels 0.000000
I'm not sure what code you have already written but have you tried something similar to this for the merging? It's not one line though.
# Merge user_interest with similarity_total dataframe
ui_st_df = user_interests.merge(similarity_total, on='interest',how='left').copy()
# Merge ui_st_df with similarity_score dataframe
ui_ss_df = ui_st_df.merge(similarity_score, on='user',how='left').copy()
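The two merges only attach the scores to each row; if the end goal is the per-interest total from the question, one possible way to finish (an addition here, assuming the merged frame ends up with 'interest' and 'similarity_score' columns) would be:

# sum the similarity scores of all users per interest
similarity_total = ui_ss_df.groupby('interest', as_index=False)['similarity_score'].sum()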
I have a pandas dataframe of variable number of columns. I'd like to numerically integrate each column of the dataframe so that I can evaluate the definite integral from row 0 to row 'n'. I have a function that works on an 1D array, but is there a better way to do this in a pandas dataframe so that I don't have to iterate over columns and cells? I was thinking of some way of using applymap, but I can't see how to make it work.
This is the function that works on a 1D array:
def findB(x, y):
    y_int = np.zeros(y.size)
    y_int_min = np.zeros(y.size)
    y_int_max = np.zeros(y.size)
    end = y.size - 1
    y_int[0] = (y[1] + y[0]) / 2 * (x[1] - x[0])
    for i in range(1, end, 1):
        j = i + 1
        y_int[i] = (y[j] + y[i]) / 2 * (x[j] - x[i]) + y_int[i-1]
    return y_int
I'd like to replace it with something that calculates multiple columns of a dataframe all at once, something like this:
B_df = y_df.applymap(integrator)
EDIT:
Starting dataframe dB_df:
Sample1 1 dB Sample1 2 dB Sample1 3 dB Sample1 4 dB Sample1 5 dB Sample1 6 dB
0 2.472389 6.524537 0.306852 -6.209527 -6.531123 -4.901795
1 6.982619 -0.534953 -7.537024 8.301643 7.744730 7.962163
2 -8.038405 -8.888681 6.856490 -0.052084 0.018511 -4.117407
3 0.040788 5.622489 3.522841 -8.170495 -7.707704 -6.313693
4 8.512173 1.896649 -8.831261 6.889746 6.960343 8.236696
5 -6.234313 -9.908385 4.934738 1.595130 3.116842 -2.078000
6 -1.998620 3.818398 5.444592 -7.503763 -8.727408 -8.117782
7 7.884663 3.818398 -8.046873 6.223019 4.646397 6.667921
8 -5.332267 -9.163214 1.993285 2.144201 4.646397 0.000627
9 -2.783008 2.288842 5.836786 -8.013618 -7.825365 -8.470759
Ending dataframe B_df:
Sample1 1 B Sample1 2 B Sample1 3 B Sample1 4 B Sample1 5 B Sample1 6 B
0 0.000038 0.000024 -0.000029 0.000008 0.000005 0.000012
1 0.000034 -0.000014 -0.000032 0.000041 0.000036 0.000028
2 0.000002 -0.000027 0.000010 0.000008 0.000005 -0.000014
3 0.000036 0.000003 -0.000011 0.000003 0.000002 -0.000006
4 0.000045 -0.000029 -0.000027 0.000037 0.000042 0.000018
5 0.000012 -0.000053 0.000015 0.000014 0.000020 -0.000023
6 0.000036 -0.000023 0.000004 0.000009 0.000004 -0.000028
7 0.000046 -0.000044 -0.000020 0.000042 0.000041 -0.000002
8 0.000013 -0.000071 0.000011 0.000019 0.000028 -0.000036
9 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
In the above example,
(x[j]-x[i]) = 0.000008
First of all, you can achieve a similar result using vectorized operations. Each element of the integration is just the mean of the current and next y value scaled by the corresponding difference in x. The final integral is just the cumulative sum of these elements. You can achieve the same result by doing something like
def findB(x, y):
    """
    x : pandas.Series
    y : pandas.DataFrame
    """
    mean_y = (y[:-1] + y.shift(-1)[:-1]) / 2
    delta_x = x.shift(-1)[:-1] - x[:-1]
    # multiply row-wise (axis='index') so delta_x broadcasts down the rows, not across the columns
    scaled_int = mean_y.multiply(delta_x, axis='index')
    cumulative_int = scaled_int.cumsum(axis='index')
    return cumulative_int.shift(1).fillna(0)
Here DataFrame.shift and Series.shift are used to match the indices of the "next" elements to the current. You have to use DataFrame.multiply rather than the * operator to ensure that the proper axis is used ('index' vs 'column'). Finally, DataFrame.cumsum provides the final integration step. DataFrame.fillna ensures that you have a first row of zeros as you did in the original solution. The advantage of using all the native pandas functions is that you can pass in a dataframe with any number of columns and have it operate on all of them simultaneously.
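A quick usage sketch for the function above, assuming dB_df is the starting dataframe shown in the question and x is an evenly spaced series with the 0.000008 step mentioned in the edit (the construction of x is a guess, not part of the original answer):

import numpy as np
import pandas as pd

# x shares dB_df's index so the shifted differences align row by row
x = pd.Series(np.arange(len(dB_df)) * 0.000008, index=dB_df.index)
B_df = findB(x, dB_df)  # integrates every column of dB_df at once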
Do you really need the numeric values of the integral? Maybe you just need a picture? Then it is easier with pyplot.
import matplotlib.pyplot as plt
# Introduce a column *bin* holding left limits of our bins.
df['bin'] = pd.cut(df['volume2'], 50).apply(lambda bin: bin.left)
# Group by bins and calculate *f*.
g = df[['bin', 'universe']].groupby('bin').sum()
# Plot the function using cumulative=True.
plt.hist(list(g.index), bins=50, weights=list(g['universe']), cumulative=True)
plt.show()
I am attempting to divide one column by another inside of a function:
lcontrib=lcontrib_lev.div(lcontrib_lev['base'],axis='index')
As can be seen, I am dividing by a column within the DataFrame, but I am getting a rather strange error:
ValueError: putmask: mask and data must be the same size
I must confess, this is the first time I have seen this error. It seems to suggest that the DF and the column are of different lengths, but clearly (since the column comes from the DataFrame) they are not.
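For reference, on a small well-behaved frame this pattern does what you would expect; a minimal sketch with made-up column names, just to show the intended semantics of div with axis='index' (every column divided element-wise by the base column):

import pandas as pd

toy = pd.DataFrame({'base': [10.0, 20.0], 'fed_wage': [1.0, 5.0], 'st_wage': [2.0, 4.0]})
print(toy.div(toy['base'], axis='index'))
#    base  fed_wage  st_wage
# 0   1.0      0.10      0.2
# 1   1.0      0.25      0.2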
A further twist is that I am using this function to loop a data management procedure over year-specific sets (the data are from the Quarterly Census of Employment and Wages 'singlefiles' in the beta series). The sets associated with the 1990-2000 time period go off without a hitch, but 2001 throws this error. I am afraid I have not been able to identify a difference in structure across years, and even if I could, how would it explain the length mismatch?
Any thoughts would be greatly appreciated.
EDIT (2/1/2014): Thanks for taking a look Tom. As requested, the pandas version is 0.13.0, and the data file in question is located here on the BLS FTP site. Just to clarify what I meant by consistent structure, every year has the same variable set and dtype (in addition to a consistent data code structure).
EDIT (2/1/2014): Perhaps it would be useful to share the entire function:
def qcew(f, m_dict):
    '''Function reads in file and captures county level aggregations with government contributions'''
    #Read in file
    cew=pd.read_csv(f)
    #Create string version of area fips
    cew['fips']=cew['area_fips'].astype(str)
    #Generate description variables
    cew['area']=cew['fips'].map(m_dict['area'])
    cew['industry']=cew['industry_code'].map(m_dict['industry'])
    cew['agglvl']=cew['agglvl_code'].map(m_dict['agglvl'])
    cew['own']=cew['own_code'].map(m_dict['ownership'])
    cew['size']=cew['size_code'].map(m_dict['size'])
    #Generate boolean masks
    lagg_mask=cew['agglvl_code']==73
    lsize_mask=cew['size_code']==0
    #Subset data to above specifications
    cew_super=cew[lagg_mask & lsize_mask]
    #Define column subset
    lsub_cols=['year','fips','area','industry_code','industry','own','annual_avg_estabs_count','annual_avg_emplvl',
               'total_annual_wages','own_code']
    #Subset to desired columns
    cew_sub=cew_super[lsub_cols]
    #Rename columns
    cew_sub.columns=['year','fips','cty','ind_code','industry','own','estabs','emp','tot_wages','own_code']
    #Set index
    cew_sub.set_index(['year','fips','cty'],inplace=True)
    #Capture total wage base and the contributions of Federal, State, and Local
    cew_base=cew_sub['tot_wages'].groupby(level=['year','fips','cty']).sum()
    cew_fed=cew_sub[cew_sub['own_code']==1]['tot_wages'].groupby(level=['year','fips','cty']).sum()
    cew_st=cew_sub[cew_sub['own_code']==2]['tot_wages'].groupby(level=['year','fips','cty']).sum()
    cew_loc=cew_sub[cew_sub['own_code']==3]['tot_wages'].groupby(level=['year','fips','cty']).sum()
    #Convert to DFs for join
    lbase=DataFrame(cew_base).rename(columns={0:'base'})
    lfed=DataFrame(cew_fed).rename(columns={0:'fed_wage'})
    lstate=DataFrame(cew_st).rename(columns={0:'st_wage'})
    llocal=DataFrame(cew_loc).rename(columns={0:'loc_wage'})
    #Join these series
    lcontrib_lev=pd.concat([lbase,lfed,lstate,llocal],axis='index').fillna(0)
    #Diag prints
    print f
    print lcontrib_lev.head()
    print lcontrib_lev.describe()
    print '*****************************\n'
    #Calculate proportional contributions (failure point)
    lcontrib=lcontrib_lev.div(lcontrib_lev['base'],axis='index')
    #Group base data by year, county, and industry
    cew_g=cew_sub.reset_index().groupby(['year','fips','cty','ind_code','industry']).sum().reset_index()
    #Join contributions to joined data
    cew_contr=cew_g.set_index(['year','fips','cty']).join(lcontrib[['fed_wage','st_wage','loc_wage']])
    return cew_contr[[x for x in cew_contr.columns if x != 'own_code']]
Works ok for me (this is on 0.13.1, but IIRC I don't think anything in this particular area changed; it's possible it was a bug that has since been fixed).
In [48]: lcontrib_lev.div(lcontrib_lev['base'],axis='index').head()
Out[48]:
base fed_wage st_wage loc_wage
year fips cty
2001 1000 1000 NaN NaN NaN NaN
1000 NaN NaN NaN NaN
10000 10000 NaN NaN NaN NaN
10000 NaN NaN NaN NaN
10001 10001 NaN NaN NaN NaN
[5 rows x 4 columns]
In [49]: lcontrib_lev.div(lcontrib_lev['base'],axis='index').tail()
Out[49]:
base fed_wage st_wage loc_wage
year fips cty
2001 CS566 CS566 1 0.000000 0.000000 0.000000
US000 US000 1 0.022673 0.027978 0.073828
USCMS USCMS 1 0.000000 0.000000 0.000000
USMSA USMSA 1 0.000000 0.000000 0.000000
USNMS USNMS 1 0.000000 0.000000 0.000000
[5 rows x 4 columns]