Improvement of stddev function - python

I just wrote a function to calculate the standard deviation of one specific column of a pandas DataFrame. I wanted to ask whether there is a way to do this more efficiently than I did here.
import numpy as np
import pandas as pd
import astropy
from astropy.table import QTable, Table, Column
from astropy import units as u
import random

def std_dev(Dataframe, column, nb_rows, av):
    DF = Dataframe
    calc = []
    for x in DF[column].tolist():
        r = (x - av)**2
        calc.append(r)
    # calculating the sum over calc
    R = sum(calc)
    return (R / (nb_rows - 1))**(1 / 2)

if __name__ == "__main__":
    sample = Table({
        'row': np.array([round(random.uniform(3300, 3700), 2)
                         for i in range(20)])
    })
    df = sample.to_pandas()
    a = std_dev(df, 'row', 20, 3500)
Thanks for the help.
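For comparison, here is a minimal vectorized sketch (my own addition, not part of the original question) that computes the same sample standard deviation around a supplied average without the Python-level loop:

import numpy as np

def std_dev_vectorized(df, column, nb_rows, av):
    # squared deviations from the supplied average, summed, then divided by (n - 1)
    values = df[column].to_numpy()
    return np.sqrt(((values - av) ** 2).sum() / (nb_rows - 1))

Note that pandas already provides df[column].std(ddof=1), which computes the sample standard deviation around the column's own mean rather than a user-supplied value.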

Related

Pandas saving and loading to CSV without introducing rounding error

This script:
import numpy as np
import pandas as pd
#
x = 10000 * np.pi
df = pd.DataFrame({"test": [x]})
df.to_csv("pd_test.csv")
other_df = pd.read_csv("pd_test.csv")
print(df["test"][0], other_df["test"][0])
print(df["test"][0] - other_df["test"][0])
Gives:
31415.926535897932 31415.92653589793
3.637978807091713e-12
I would like to avoid introducing a change when saving and loading to CSV, if possible. For example, is there a datatype I can use for the dataframe that would accomplish this?
I don't mind losing a small amount of accuracy if necessary; I would just like to avoid the change during the save-and-load process.
There are two alternatives.
You can round your float with round():
x = 10000 * np.pi
print(round(x, 2))
# output: 31415.93
or use .format():
print("{:.2e}".format(x))
# output: 3.14e+04
print("{:.2f}".format(x))
# output: 31415.93
I ended up casting the dataframes to float32 before saving and again on load:
import numpy as np
import pandas as pd
#
x = 10000 * np.pi
df = pd.DataFrame({"test": [x]})
df = df.astype('float32')
df.to_csv("pd_test.csv")
other_df = pd.read_csv("pd_test.csv").astype('float32')
print(df["test"][0], other_df["test"][0])
print(df["test"][0] - other_df["test"][0])

Computing a chi square statistic from scratch using numpy/pandas, matrix computations

I was just looking at https://en.wikipedia.org/wiki/Chi-squared_test and wanted to recreate the example "Example chi-squared test for categorical data".
I feel that the approach I've taken might have room for improvement, so I was wondering how that might be done.
Here's the code:
csv = """\
,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
observed_workers = pd.read_csv(io.StringIO(csv), index_col=0)
col_sums = dt.apply(sum)
row_sums = dt.apply(sum, axis=1)
l = list(x[1] * (x[0] / col_sums.sum()) for x in itertools.product(row_sums, col_sums))
expected_workers = pd.DataFrame(
np.array(l).reshape((3, 4)),
columns=observed_workers.columns,
index=observed_workers.index,
)
chi_squared_stat = (
((observed_workers - expected_workers) ** 2).div(expected_workers).sum().sum()
)
This returns the correct value, but it probably overlooks a nicer approach using particular numpy/pandas methods.
With numpy/scipy:
csv = """\
,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
import io
from numpy import genfromtxt, outer
from scipy.stats.contingency import margins
observed = genfromtxt(io.StringIO(csv), delimiter=',', skip_header=1, usecols=range(1, 5))
row_sums, col_sums = margins(observed)
expected = outer(row_sums, col_sums) / observed.sum()
chi_squared_stat = ((observed - expected)**2 / expected).sum()
print(chi_squared_stat)
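If the intermediate expected table is not needed, scipy also exposes this calculation directly (a side note of mine, not part of the original answer): scipy.stats.chi2_contingency returns the statistic, p-value, degrees of freedom, and expected frequencies in one call.

from scipy.stats import chi2_contingency

chi2, p_value, dof, expected = chi2_contingency(observed)
print(chi2, p_value, dof)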
With pandas:
import io

import numpy as np
import pandas as pd

csv = """\
work_group,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
df = pd.read_csv(io.StringIO(csv))
df_melt = df.melt(id_vars='work_group', var_name='group', value_name='observed')
df_melt['col_sum'] = df_melt.groupby('group')['observed'].transform(np.sum)
df_melt['row_sum'] = df_melt.groupby('work_group')['observed'].transform(np.sum)
total = df_melt['observed'].sum()
df_melt['expected'] = df_melt.apply(lambda row: row['col_sum'] * row['row_sum'] / total, axis=1)
chi_squared_stat = df_melt.apply(lambda row: ((row['observed'] - row['expected'])**2) / row['expected'], axis=1).sum()
print(chi_squared_stat)
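As a small follow-up of my own (not part of the original answer), the two row-wise apply calls can be replaced with plain column arithmetic, which pandas evaluates in a vectorized way and is usually faster:

# vectorized equivalents of the two apply(..., axis=1) calls above
df_melt['expected'] = df_melt['col_sum'] * df_melt['row_sum'] / total
chi_squared_stat = ((df_melt['observed'] - df_melt['expected'])**2 / df_melt['expected']).sum()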

Creating a vector of values based off a test using a for loop

This feels like it should be a simple problem, but I am newer to Python; in R I would use a foreach loop that gave me the option to combine the results.
I have tried a for loop that lets me print out all the values I need, but I want them collected into a vector of values that I can use later.
from scipy.stats import gamma
import scipy.stats as stats
import numpy as np
import random

data2 = np.random.gamma(1, 2, size=500)
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean / sd)**2
b = (sd**2) / mean
for i in range(1, 100):
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    print(s)
So I am able to print all the values I want, but I want them all gathered together in a vector of values. I have tried to append and make lists but am not able to get them together.
from scipy.stats import gamma
import scipy.stats as stats
import numpy as np
import random

# reusing the sample data generated in the question
data2 = np.random.gamma(1, 2, size=500)
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean / sd)**2
b = (sd**2) / mean
# initialize an empty list to collect the significance levels
result = []
for i in range(100):
    # removed (1, 100): you only need range(100) for 100 elements
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    # append the calculation to the list
    result.append(s)
    print(s)
print(result)
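If the collected values are needed as a numeric vector rather than a Python list (my addition, not from the original answer), they can be wrapped in a numpy array after the loop:

import numpy as np

result_arr = np.array(result)  # 1-D numpy vector of the 100 significance levels
print(result_arr.mean(), result_arr.shape)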

How to discretize large dataframe by columns with variable bins in Pandas/Dask

I am able to discretize a Pandas dataframe by columns with this code:
import numpy as np
import pandas as pd

def discretize(X, n_scale=1):
    for c in X.columns:
        loc = X[c].median()
        # median absolute deviation of the column
        scale = mad(X[c])
        bins = [-np.inf, loc - (scale * n_scale),
                loc + (scale * n_scale), np.inf]
        X[c] = pd.cut(X[c], bins, labels=[-1, 0, 1])
    return X
I want to discretize each column using two parameters: loc (the median of the column) and scale (the median absolute deviation of the column).
With small dataframes the time required is acceptable (since it is a single-threaded solution).
However, with larger dataframes I want to exploit more threads (or processes) to speed up the computation.
I am no expert on Dask, which should provide the solution to this problem.
However, in my case the discretization should be feasible with the code:
import dask.dataframe as dd
import numpy as np
import pandas as pd

def discretize(X, n_scale=1):
    # I'm using only 2 partitions for this example
    X_dask = dd.from_pandas(X, npartitions=2)
    # FIXME:
    # how can I define bins to compute loc and scale
    # for each column?
    bins = [-np.inf, loc - (scale * n_scale),
            loc + (scale * n_scale), np.inf]
    X = X_dask.apply(pd.cut, axis=1, args=(bins,), labels=[-1, 0, 1]).compute()
    return X
The problem here is that loc and scale depend on the column values, so they should be computed for each column, either before or during the apply.
How can it be done?
I've never used dask, but I guess you can define a new function to be used in apply.
import dask.dataframe as dd
import multiprocessing as mp
import numpy as np
import pandas as pd

def discretize(X, n_scale=1):
    X_dask = dd.from_pandas(X.T, npartitions=mp.cpu_count() + 1)
    X = X_dask.apply(_discretize_series,
                     axis=1, args=(n_scale,),
                     columns=X.columns).compute().T
    return X

def _discretize_series(x, n_scale=1):
    loc = x.median()
    scale = mad(x)
    bins = [-np.inf, loc - (scale * n_scale),
            loc + (scale * n_scale), np.inf]
    x = pd.cut(x, bins, labels=[-1, 0, 1])
    return x
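Both snippets call a mad() helper that is never defined; a minimal sketch of what was presumably intended (based on the "median absolute deviation of the column" comment, so this is my assumption, not code from the original post) is:

import numpy as np

def mad(x):
    # median absolute deviation: the median of absolute deviations from the median
    x = np.asarray(x, dtype=float)
    return np.median(np.abs(x - np.median(x)))

scipy.stats.median_abs_deviation offers an equivalent, optionally scaled, implementation.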

How to more efficiently calculate a rolling ratio

I have data whose length is over 3000 rows.
Below is the code for computing a 20-day value (the Volume Ratio used in stock markets).
It took more than 2 minutes.
Is there any good way to reduce the running time?
import pandas as pd
import numpy as np
# pandas.io.data has been moved to the separate pandas-datareader package
from pandas_datareader.data import DataReader
import matplotlib.pylab as plt

data = DataReader('047040.KS', 'yahoo', start='2010')
data['vr'] = 0
data['Volume Ratio'] = 0
data['acend'] = 0
data['vr'] = np.sign(data['Close'] - data['Open'])
data['vr'] = np.where(data['vr'] == 0, 0.5, data['vr'])
data['vr'] = np.where(data['vr'] < 0, 0, data['vr'])
data['acend'] = np.multiply(data['Volume'], data['vr'])
for i in range(len(data['Open'])):
    if i < 19:
        data['Volume Ratio'][i] = 0
    else:
        data['Volume Ratio'][i] = (sum(data['acend'][i-19:i]) / (sum(data['Volume'][i-19:i]) - sum(data['acend'][i-19:i]))) * 100
Consider using conditional row selection and rolling.sum():
data.loc[data.index[:20], 'Volume Ratio'] = 0
data.loc[data.index[20:], 'Volume Ratio'] = (
    data['acend'].rolling(window=20).sum()
    / (data['Volume'].rolling(window=20).sum() - data['acend'].rolling(window=20).sum())
    * 100
)
or, simplified: .rolling(...).sum() produces np.nan for the first 19 values (with window=20), so just use .fillna(0):
data['new_col'] = (
    data['acend'].rolling(window=20).sum()
    .div(data['Volume'].rolling(window=20).sum().subtract(data['acend'].rolling(window=20).sum()))
    .mul(100)
    .fillna(0)
)
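Because the yahoo feed is not always reachable, here is a self-contained sketch of mine (using randomly generated price and volume data instead of the original ticker) showing the rolling version of the computation end to end:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 3000
data = pd.DataFrame({
    'Open': rng.uniform(90, 110, n),
    'Close': rng.uniform(90, 110, n),
    'Volume': rng.integers(1_000, 10_000, n).astype(float),
})

# up-volume: full volume on up days, half on unchanged days, none on down days
vr = np.sign(data['Close'] - data['Open'])
vr = np.where(vr == 0, 0.5, vr)
vr = np.where(vr < 0, 0, vr)
data['acend'] = data['Volume'] * vr

acend_sum = data['acend'].rolling(window=20).sum()
volume_sum = data['Volume'].rolling(window=20).sum()
# Volume Ratio: up-volume over down-volume within the 20-day window, in percent
data['Volume Ratio'] = (acend_sum / (volume_sum - acend_sum) * 100).fillna(0)
print(data['Volume Ratio'].tail())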
