I have a dataframe as follows:
from uncertainties import ufloat # pip3 uncertainties.py
import random
random.seed(0)
values = [[round(random.random(),2) for i in range(3)] for j in range(4)]
df = pd.DataFrame(values, index=['name1','sd', 'name2', 'sd'], columns=['A','B','C'])
and I want to rearrange the data, where I combine the mean and sd into one row, as ufloat, of which the desired output looks as follows:
new_values = [[ufloat(0.91,0.90), ufloat(0.98,0.31), ufloat(0.81,0.73)],
[ufloat(0.90,0.10), ufloat(0.68,0.43), ufloat(0.47, 0.61)]]
df = pd.DataFrame(new_values, index=['name1', 'name2'], columns=['A','B','C'])
I think it might be easiest to create two dataframes and combine them somehow
mean = df.iloc[::2].reset_index()
std = df.iloc[1::2].reset_index()
where now I need to merge the two and apply ufloat
This is my current solution:
mean = df.iloc[::2]
std = df.iloc[1::2]
tmp = np.array([ufloat(x[0], x[1]) for x
in zip(mean.values.ravel(), std.values.ravel())])
df = pd.DataFrame(tmp.reshape(mean.shape), columns=mean.columns, index=mean.index)
Related
I have a dataframe of patients and their gene expressions. I has this format:
Patient_ID | gene1 | gene2 | ... | gene10000
p1 0.142 0.233 ... bla
p2 0.243 0.243 ... -0.364
...
p4000 1.423 bla ... -1.222
As you see, that dataframe contains noise, with cells that are values other then a float value.
I want to remove every row that has a any column with non numeric values.
I've managed to do this using apply and pd.to_numeric like this:
cols = df.columns[1:]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df = df.dropna()
The problem is that it's taking for ever to run, and I need a better and more efficient way of achieving this
EDIT: To reproduce something like my data:
arr = np.random.random_sample((3000,10000))
df = pd.DataFrame(arr, columns=['gene' + str(i) for i in range(10000)])
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(10000)], columns=['Patient_ID']),df],axis = 1)
df['gene0'][2] = 'bla'
df['gene9998'][4] = 'bla'
Was right it is worth trying numpy :)
I got 30-60x times faster version (bigger array, larger improvement)
Convert to numpy array (.values)
Iterate through all rows
Try to convert each row to row of floats
If it fails (some NaN present), note this in boolean array
Create array based on the results
Code:
import pandas as pd
import numpy as np
from line_profiler_pycharm import profile
def op_version(df):
cols = df.columns[1:]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
return df.dropna()
def np_version(df):
keep = np.full(len(df), True)
for idx, row in enumerate(df.values[:, 1:]):
try:
row.astype(np.float)
except:
keep[idx] = False
pass # maybe its better to store to_remove list, depends on data
return df[keep]
#profile
def main():
arr = np.random.random_sample((3000, 5000))
df = pd.DataFrame(arr, columns=['gene' + str(i) for i in range(5000)])
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(3000)],
columns=['Patient_ID']), df], axis=1)
df['gene0'][2] = 'bla'
df['gene998'][4] = 'bla'
df2 = df.copy()
df = op_version(df)
df2 = np_version(df2)
Note I decreased number of columns so it is more feasible for tests.
Also, fixed small bug in your example, instead of:
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(10000)], columns=['Patient_ID']),df],axis = 1)
I think should be
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(3000)], columns=['Patient_ID']),df],axis = 1)
I'm trying to achieve this kind of transformation with Pandas.
I made this code but unfortunately it doesn't give the result I'm searching for.
CODE :
import pandas as pd
df = pd.read_csv('file.csv', delimiter=';')
df = df.count().reset_index().T.reset_index()
df.columns = df.iloc[0]
df = df[1:]
df
RESULT :
Do you have any proposition ? Any help will be appreciated.
First create columns for test nonOK and then use named aggregatoin for count, sum column Values and for count Trues values use sum again, last sum both columns:
df = (df.assign(NumberOfTest1 = df['Test one'].eq('nonOK'),
NumberOfTest2 = df['Test two'].eq('nonOK'))
.groupby('Category', as_index=False)
.agg(NumberOfID = ('ID','size'),
Values = ('Values','sum'),
NumberOfTest1 = ('NumberOfTest1','sum'),
NumberOfTest2 = ('NumberOfTest2','sum'))
.assign(TotalTest = lambda x: x['NumberOfTest1'] + x['NumberOfTest2']))
I need to save matrix results in one dataFrame.
to do that:
i split matrix and i create a new dataFrameor each iteration
and i append it to Target dataFrame.
i don't know if is The good way or not
what about perFormance?
import pandas as pd
import numpy as np
def generate_Matrix_as_dataframe( productname,variableName,results):
# df_results = pd.DataFrame({'Values': result})
df= pd.DataFrame(results)
dimension = len(results[0])
df['Values'] = pd.Series(df.fillna('').values.tolist())
# convert to Array
df['Values'] = df['Values'].apply(lambda x: np.array(x))
df_results =df[df.columns.drop([i for i in range(dimension)])]
df_results = df_results.reset_index()
df_results= df_results.rename(columns={"index":"Generation"})
df_results['Depth'] = df_results.index + 1
df_results['ProductName'] = productname
df_results['VariableName'] = variableName
return df_results[['ProductName','VariableName' ,'Depth', 'Values']]
df_results_ifrs17 = pd.DataFrame(columns=['ProductName', 'VariableName','Depth', 'Values'])
products =['P1','P2']
variables =['V1','V2']
nbrproduct=1
nbvariables=1
for p in products:
for v in variables:
value= np.ones( (nbrproduct, nbvariables), dtype=np.int32 )
df_results = generate_Matrix_as_dataframe(p, v,value)
df_results_ifrs17 = df_results_ifrs17.append(df_results, ignore_index=True)
nbvariables=nbvariables+1
print(value)
nbrproduct=nbrproduct+1
print(df_results_ifrs17)
I have the following dataframe:
df = pd.DataFrame({'A':range(10), 'B':range(10), 'C':range(10), 'D':range(10)})
I would like to shuffle the data using the below function:
import pandas as pd
import numpy as np
def shuffle(df, n=1, axis=0):
df = df.copy()
for _ in range(n):
df.apply(np.random.shuffle, axis=axis)
return df
However I do not want to shuffle columns A and D, only columns B and C. Is there a way to do this by amending the function? I want to say if column == 'A' or 'D' then don't shuffle.
Thanks
You could shuffle the required columns as below:
import numpy as np
import pandas as pd
# the data
df = pd.DataFrame({'A':range(10), 'B':range(10),
'C':range(10), 'D':range(10)})
# shuffle
df.B = np.random.permutation(df.B)
df.C = np.random.permutation(df.C)
# or shuffle this way (in place)
np.random.shuffle(df.B)
np.random.shuffle(df.C)
If you need to shuffle using your shuffle function:
def shuffle(df, n=1):
for _ in range(n):
# shuffle B
np.random.shuffle(df.B)
# shuffle C
np.random.shuffle(df.C)
print(df.B,df.C) # comment this out as needed
return df
You do not need to disturb columns A and D.
I'm preparing a big multivariate time series data set for a supervised learning task and I would like to create time shifted versions of my input features so my model also infers from past values. In pandas there's the shift(n) command that lets you shift a column by n rows. Is there something similar in vaex?
I could not find anything comparable in the vaex documentation.
No, we do not support that yet (https://github.com/vaexio/vaex/issues/660). Because vaex is extensible (see http://docs.vaex.io/en/latest/tutorial.html#Adding-DataFrame-accessors) I thought I would give you the solution in the form of that:
import vaex
import numpy as np
#vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
def __init__(self, df):
self.df = df
def shift(self, column, n, inplace=False):
# make a copy without column
df = self.df.copy().drop(column)
# make a copy with just the colum
df_column = self.df[[column]]
# slice off the head and tail
df_head = df_column[-n:]
df_tail = df_column[:-n]
# stitch them together
df_shifted = df_head.concat(df_tail)
# and join (based on row number)
return df.join(df_shifted, inplace=inplace)
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df['shifted_y'] = df.y
df2 = df.mytool.shift('shifted_y', 2)
df2
It generates a single column datagram, slices that up, concatenates and joins it back. All without a single memory copy.
I am assuming here a cyclic shift/rotate.
The function needs to be modified slightly in order to work in the latest release (vaex 4.0.0ax), see this thread.
Code by Maarten should be updated as follows:
import vaex
import numpy as np
#vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
def __init__(self, df):
self.df = df
# mytool.shift is the analog of pandas.shift() but add the shifted column with specified name to the end of initial df
def shift(self, column, new_column, n, cyclic=True):
df = self.df.copy().drop(column)
df_column = self.df[[column]]
if cyclic:
df_head = df_column[-n:]
else:
df_head = vaex.from_dict({column: np.ma.filled(np.ma.masked_all(n, dtype=float), 0)})
df_tail = df_column[:-n]
df_shifted = df_head.concat(df_tail)
df_shifted.rename(column, new_column)
return df_shifted
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df2 = df.join(df.mytool.shift('y', 'shifted_y', 2))
df2