This script:
import numpy as np
import pandas as pd
#
x = 10000 * np.pi
df = pd.DataFrame({"test": [x]})
df.to_csv("pd_test.csv")
other_df = pd.read_csv("pd_test.csv")
print(df["test"][0], other_df["test"][0])
print(df["test"][0] - other_df["test"][0])
Gives:
31415.926535897932 31415.92653589793
3.637978807091713e-12
I would like not to introduce a change when saving to and loading from CSV, if possible. For example, is there a datatype I can use for the dataframe that would accomplish this?
I don't mind losing a small amount of accuracy if necessary; I would just like to avoid the change during the save-and-load round trip.
There are two alternatives.
You can round your float with round():
x = 10000 * np.pi
print(round(x, 2))
# output: 31415.93
or use .format():
print("{:.2e}".format(x))
# output: 3.14e+04
print("{:.2f}".format(x))
# output: 31415.93
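Applied to the dataframe from the question rather than a single float, a minimal sketch (assuming two decimal places are acceptable for the data): round the column before saving, so the CSV text only carries the digits you care about:
df["test"] = df["test"].round(2)
df.to_csv("pd_test.csv")
# a short decimal like 31415.93 round-trips cleanly in practice
print(pd.read_csv("pd_test.csv")["test"][0])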
I ended up casting the dataframes to float32 before saving and after loading:
import numpy as np
import pandas as pd
#
x = 10000 * np.pi
df = pd.DataFrame({"test": [x]})
df = df.astype('float32')
df.to_csv("pd_test.csv")
other_df = pd.read_csv("pd_test.csv").astype('float32')
print(df["test"][0], other_df["test"][0])
print(df["test"][0] - other_df["test"][0])
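For completeness, a hedged alternative that keeps full float64 precision: recent pandas versions accept a float_precision='round_trip' option on read_csv, which parses each float back to exactly the value whose repr was written:
import numpy as np
import pandas as pd

x = 10000 * np.pi
df = pd.DataFrame({"test": [x]})
df.to_csv("pd_test.csv")
other_df = pd.read_csv("pd_test.csv", float_precision='round_trip')
# with the round-trip parser the difference comes back as exactly 0.0
print(df["test"][0] - other_df["test"][0])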
My code runs without errors, but it does not produce the output it should. I am not sure where the issue is occurring. Could someone help me correct it? Do you need the CSV too?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("/content/drive/MyDrive/replicates/Replicate 3 Gilts just measures.csv")
df.info()
df.head()
# removing the irrelevant columns
cols_to_drop = ["animal"]
df = df.drop(columns=cols_to_drop,axis=1)
# first five rows of data frame after removing columns
df.head()
deep_df = df.copy(deep = True)
numerical_columns = [col for col in df.columns if (df[col].dtype=='int64' or
df[col].dtype=='float64')]
df[numerical_columns].describe().loc[['min','max', 'mean','50%'],:]
df[df['i1000.0'] == df['i1000.0'].min()]
This is where the issue occurs
i1000_bucket = df.groupby(pd.cut(df["i1000.0"],bins=[10,20,30,40,50,60,70,80,90,100]))
number_bucket = df.groupby(pd.cut(df["i1000.0"],bins=[10,20,30,40,50,60,70,80,90,100]))
i1000_bucket = ((i1000_bucket.sum()["i1000.0"] / i1000_bucket.size())*100 , 2)
number_bucket = round((number_bucket.sum()["i1000.0"] / number_bucket.size())*100 , 2)
The graph window appears, but nothing actually gets plotted:
x = [str(i)+"-"+str(i+10) for i in range(10,91,10)]
plt.plot(x,number_bucket.values)
plt.xlabel("i1000.0")
plt.ylabel("p1000.0")
plt.title("1000.0 comparisons")
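As a hedged aside on the snippet above (inferred from the code alone, without the CSV): the i1000_bucket assignment is missing its round( call, so it builds a tuple rather than a Series, and if the i1000.0 values fall outside the bin edges 10-100, every bucket is empty and the resulting NaNs would draw nothing. A minimal sketch of the bucket step with the call restored, plus a sanity check on the data's range:
# restore the round() call so the result is a Series, not a tuple
i1000_bucket = df.groupby(pd.cut(df["i1000.0"], bins=[10,20,30,40,50,60,70,80,90,100]))
i1000_pct = round((i1000_bucket.sum()["i1000.0"] / i1000_bucket.size()) * 100, 2)
# if these statistics lie outside 10-100, the bins never match the data
print(df["i1000.0"].describe())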
I just wrote a function to calculate the standard deviation of one specific column of a pandas dataframe. I wanted to ask whether there is a way to do this more efficiently than the way I did it here.
import numpy as np
import pandas as pd
import astropy
from astropy.table import QTable, Table, Column
from astropy import units as u
import random
def std_dev(Dataframe, column, nb_rows, av):
    DF = Dataframe
    calc = []
    for x in DF[column].tolist():
        r = (x - av)**2
        calc.append(r)
    # calculating sum over calc
    R = sum(calc)
    return (R / (nb_rows - 1))**(1 / 2)

if __name__ == "__main__":
    sample = Table({
        'row':
        np.array([round(random.uniform(3300, 3700), 2) for i in
                  range(20)])
    })
    df = sample.to_pandas()
    a = std_dev(df, 'row', 20, 3500)
Thanks for the help.
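For reference, a hedged sketch of a vectorized equivalent (same formula: squared deviations from the supplied average av, not from the column's own mean):
import numpy as np

def std_dev_vectorized(df, column, nb_rows, av):
    # sum of squared deviations from av, divided by (n - 1), square-rooted
    return np.sqrt(((df[column] - av) ** 2).sum() / (nb_rows - 1))
If the deviations should instead be taken from the column's own mean, df[column].std() already does that, with ddof=1 by default.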
I was just looking at https://en.wikipedia.org/wiki/Chi-squared_test and wanted to recreate the example "Example chi-squared test for categorical data".
I feel that the approach I've taken might have room for improvement, so was wondering how that might be done.
Here's the code:
csv = """\
,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
observed_workers = pd.read_csv(io.StringIO(csv), index_col=0)
col_sums = observed_workers.apply(sum)
row_sums = observed_workers.apply(sum, axis=1)
l = list(x[1] * (x[0] / col_sums.sum()) for x in itertools.product(row_sums, col_sums))
expected_workers = pd.DataFrame(
np.array(l).reshape((3, 4)),
columns=observed_workers.columns,
index=observed_workers.index,
)
chi_squared_stat = (
((observed_workers - expected_workers) ** 2).div(expected_workers).sum().sum()
)
This returns the correct value, but it probably overlooks a nicer approach using some particular numpy / pandas methods.
With numpy/scipy:
csv = """\
,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
import io
from numpy import genfromtxt, outer
from scipy.stats.contingency import margins
observed = genfromtxt(io.StringIO(csv), delimiter=',', skip_header=1, usecols=range(1, 5))
row_sums, col_sums = margins(observed)
expected = outer(row_sums, col_sums) / observed.sum()
chi_squared_stat = ((observed - expected)**2 / expected).sum()
print(chi_squared_stat)
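If only the statistic itself is needed, scipy bundles the whole computation; a minimal sketch (for tables larger than 2x2, chi2_contingency applies no continuity correction, so it should agree with the manual figure):
from scipy.stats import chi2_contingency

chi2, p, dof, expected = chi2_contingency(observed)
print(chi2, p)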
With pandas:
import io
import numpy as np
import pandas as pd
csv = """\
work_group,A,B,C,D
White collar,90,60,104,95
Blue collar,30,50,51,20
No collar,30,40,45,35
"""
df = pd.read_csv(io.StringIO(csv))
df_melt = df.melt(id_vars ='work_group', var_name='group', value_name='observed')
df_melt['col_sum'] = df_melt.groupby('group')['observed'].transform(np.sum)
df_melt['row_sum'] = df_melt.groupby('work_group')['observed'].transform(np.sum)
total = df_melt['observed'].sum()
df_melt['expected'] = df_melt.apply(lambda row: row['col_sum']*row['row_sum']/total, axis=1)
chi_squared_stat = df_melt.apply(lambda row: ((row['observed'] - row['expected'])**2) / row['expected'], axis=1).sum()
print(chi_squared_stat)
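The two apply calls above go row by row; the same arithmetic vectorizes directly over whole columns, which avoids the Python-level loop:
df_melt['expected'] = df_melt['col_sum'] * df_melt['row_sum'] / total
chi_squared_stat = ((df_melt['observed'] - df_melt['expected']) ** 2 / df_melt['expected']).sum()
print(chi_squared_stat)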
How is one intended to use the output of the pandas ewm.cov function? I would presume there are functions that let you use it directly in the returned form for multiplication, but nothing I try seems to work.
For example, take a minimal use case: stock X and Y return timeseries in DF1, from which we estimate an EWMA covariance matrix. Then, to get the variance estimate for a portfolio with positions A and B (given in DF2), I need to compute $x^T C x$ at each date, but I can't find a command to do this without writing a for loop.
# Python 3.6, pandas 0.20
import pandas as pd
import numpy as np
np.random.seed(100)
DF1 = pd.DataFrame(dict(X = np.random.normal(size = 100), Y = np.random.normal(size = 100)))
DF2 = pd.DataFrame(dict(A = np.random.normal(size = 100), B = np.random.normal(size = 100)))
COV = DF1.ewm(10).cov()
print(DF1)
print(COV)
# All of the following are invalid
print(COV.dot(DF2))
print(DF2.dot(COV))
print(COV.multiply(DF2))
The best I can figure out is this ugly piece of code:
COV = COV.reset_index().rename(columns=dict(level_0="index", level_1="variable"))
DF2m = pd.melt(DF2.reset_index(), id_vars = "index").sort_values("index")
MDF = pd.merge(COV, DF2m, on=["index", "variable"])
VAR = MDF.groupby("index").apply(lambda x: np.dot(np.dot(x["value"], np.matrix([x["X"], x["Y"]])), x["value"])[0,0])
I hold out hope that there is a nice way to do this...
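For what it's worth, a hedged sketch of one loop-free route, assuming a pandas version in which DataFrame.ewm(...).cov() returns a frame with a (date, variable) MultiIndex whose variable level follows DF1's column order:
import numpy as np
import pandas as pd

np.random.seed(100)
DF1 = pd.DataFrame(dict(X=np.random.normal(size=100), Y=np.random.normal(size=100)))
DF2 = pd.DataFrame(dict(A=np.random.normal(size=100), B=np.random.normal(size=100)))

COV = DF1.ewm(10).cov()
C = COV.to_numpy().reshape(len(DF1), 2, 2)  # one 2x2 covariance matrix per date
x = DF2.to_numpy()
# per-date quadratic form x^T C x in a single einsum call
VAR = pd.Series(np.einsum('ij,ijk,ik->i', x, C, x), index=DF1.index)
print(VAR.tail())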
I have data with over 3000 rows.
Below is the code for computing a 20-day value (the volume ratio in a stock market).
It takes more than 2 minutes to run.
Is there a good way to reduce the running time?
import pandas as pd
import numpy as np
from pandas.io.data import DataReader
import matplotlib.pylab as plt
data = DataReader('047040.KS','yahoo',start='2010')
data['vr']=0
data['Volume Ratio']=0
data['acend']=0
data['vr'] = np.sign(data['Close']-data['Open'])
data['vr'] = np.where(data['vr']==0,0.5,data['vr'])
data['vr'] = np.where(data['vr']<0,0,data['vr'])
data['acend'] = np.multiply(data['Volume'],data['vr'])
for i in range(len(data['Open'])):
    if i < 19:
        data['Volume Ratio'][i] = 0
    else:
        data['Volume Ratio'][i] = ((sum(data['acend'][i-19:i])) / (sum(data['Volume'][i-19:i]) - sum(data['acend'][i-19:i]))) * 100
Consider using conditional row selection and rolling.sum():
data.loc[data.index[:20], 'Volume Ratio'] = 0
data.loc[data.index[20:], 'Volume Ratio'] = (
    data['acend'].rolling(window=20).sum()
    / (data['Volume'].rolling(window=20).sum() - data['acend'].rolling(window=20).sum())
) * 100
or, simplified: .rolling(window=20).sum() produces np.nan for the first 19 rows, so just use .fillna(0):
data['new_col'] = (
    data['acend'].rolling(window=20).sum()
    .div(data['Volume'].rolling(window=20).sum()
         .subtract(data['acend'].rolling(window=20).sum()))
    .mul(100)
    .fillna(0)
)