Shuffling a pandas dataframe - python

I have the following dataframe:
df = pd.DataFrame({'A':range(10), 'B':range(10), 'C':range(10), 'D':range(10)})
I would like to shuffle the data using the below function:
import pandas as pd
import numpy as np
def shuffle(df, n=1, axis=0):
    df = df.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df
However, I do not want to shuffle columns A and D, only columns B and C. Is there a way to do this by amending the function? Essentially, I want to say: if the column is 'A' or 'D', don't shuffle it.
Thanks

You could shuffle the required columns as below:
import numpy as np
import pandas as pd
# the data
df = pd.DataFrame({'A':range(10), 'B':range(10),
'C':range(10), 'D':range(10)})
# shuffle
df.B = np.random.permutation(df.B)
df.C = np.random.permutation(df.C)
# or shuffle this way (in place)
np.random.shuffle(df.B)
np.random.shuffle(df.C)
If you need to shuffle using your shuffle function:
def shuffle(df, n=1):
    for _ in range(n):
        # shuffle B
        np.random.shuffle(df.B)
        # shuffle C
        np.random.shuffle(df.C)
        print(df.B, df.C)  # comment this out as needed
    return df
You do not need to disturb columns A and D.
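If you would rather keep a single shuffle function like your original one, here is a minimal sketch (the skip_cols parameter is my own addition, not part of the answer above) that shuffles every column except the ones you name:
import numpy as np
import pandas as pd

def shuffle(df, n=1, skip_cols=('A', 'D')):
    # shuffle every column except those listed in skip_cols
    df = df.copy()
    for _ in range(n):
        for col in df.columns:
            if col not in skip_cols:
                df[col] = np.random.permutation(df[col].values)
    return df

df = pd.DataFrame({'A': range(10), 'B': range(10), 'C': range(10), 'D': range(10)})
df = shuffle(df)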

Slow codes from functions in Python Pandas

I am trying to run a specific function (stats.boxcox) faster in Python, but both versions of my code run very slowly. Is there a more efficient way to do this in Python?
First code:
import numpy as np
import pandas as pd
from scipy import stats
df = pd.DataFrame(np.random.randint(1,100,size=(100, 4)), columns=list('ABCD'))
df_new = pd.DataFrame()
for column in list(df):
    df_new[column], lam = stats.boxcox(df[column])
Second code:
import numpy as np
import pandas as pd
from scipy import stats
df = pd.DataFrame(np.random.randint(1,100,size=(100, 4)), columns=list('ABCD'))
df2_a = df.apply(lambda x: stats.boxcox(x))
df2_al = list(zip(*df2_a))
df2 = pd.DataFrame(df2_al[0]).T
We can do it in a parallel way using ProcessPoolExecutor like this:
from concurrent.futures import ProcessPoolExecutor

# utility function
def cpu_tasks(func, *args):
    # chunksize controls how many items each worker receives at a time
    with ProcessPoolExecutor(max_workers=None) as tp:
        result = tp.map(func, *args, chunksize=10)
    return list(result)

def get_box(s):
    return stats.boxcox(s)[0]

# make a list of columns
obj_lst = [df[x].tolist() for x in df.columns]
ddf = pd.DataFrame(cpu_tasks(get_box, obj_lst)).T
ddf.columns = list('ABCD')
print(ddf.head())
A B C D
0 8.906146 23.867250 23.682790 16.669473
1 16.948358 11.752727 18.120534 15.678950
2 23.892165 7.159087 1.693517 17.055528
3 11.210063 5.720299 29.496806 13.348581
4 6.588403 29.940734 35.874053 4.101704
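One practical caveat, not part of the answer above: code that submits work to a ProcessPoolExecutor should sit under an if __name__ == '__main__': guard on platforms that spawn worker processes (Windows, and macOS on recent Python versions), otherwise the workers re-import the script and fail. A minimal sketch reusing cpu_tasks and get_box from above:
if __name__ == '__main__':
    # build the column list and run boxcox in the worker pool
    obj_lst = [df[x].tolist() for x in df.columns]
    ddf = pd.DataFrame(cpu_tasks(get_box, obj_lst)).T
    ddf.columns = list('ABCD')
    print(ddf.head())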

vaex: shift column by n steps

I'm preparing a big multivariate time series data set for a supervised learning task and I would like to create time shifted versions of my input features so my model also infers from past values. In pandas there's the shift(n) command that lets you shift a column by n rows. Is there something similar in vaex?
I could not find anything comparable in the vaex documentation.
No, we do not support that yet (https://github.com/vaexio/vaex/issues/660). Because vaex is extensible (see http://docs.vaex.io/en/latest/tutorial.html#Adding-DataFrame-accessors), I thought I would give you the solution in that form:
import vaex
import numpy as np
@vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
    def __init__(self, df):
        self.df = df

    def shift(self, column, n, inplace=False):
        # make a copy without the column
        df = self.df.copy().drop(column)
        # make a copy with just the column
        df_column = self.df[[column]]
        # slice off the head and tail
        df_head = df_column[-n:]
        df_tail = df_column[:-n]
        # stitch them together
        df_shifted = df_head.concat(df_tail)
        # and join (based on row number)
        return df.join(df_shifted, inplace=inplace)
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df['shifted_y'] = df.y
df2 = df.mytool.shift('shifted_y', 2)
df2
It generates a single-column dataframe, slices it up, concatenates the pieces, and joins the result back, all without a single memory copy.
I am assuming here a cyclic shift/rotate.
The accessor needs to be modified slightly in order to work in more recent releases (vaex 4.0); see this thread.
Maarten's code can be updated as follows:
import vaex
import numpy as np
@vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
    def __init__(self, df):
        self.df = df

    # mytool.shift is the analogue of pandas shift(); it returns the shifted
    # column under the specified name, which can then be joined back to the original df
    def shift(self, column, new_column, n, cyclic=True):
        df = self.df.copy().drop(column)
        df_column = self.df[[column]]
        if cyclic:
            df_head = df_column[-n:]
        else:
            # pad the first n rows with zeros instead of wrapping around
            df_head = vaex.from_dict({column: np.ma.filled(np.ma.masked_all(n, dtype=float), 0)})
        df_tail = df_column[:-n]
        df_shifted = df_head.concat(df_tail)
        df_shifted.rename(column, new_column)
        return df_shifted
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df2 = df.join(df.mytool.shift('y', 'shifted_y', 2))
df2
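For a non-cyclic shift you can pass cyclic=False, in which case the first n rows of the shifted column are padded with zeros instead of wrapping around. A small usage sketch based on the code above:
df3 = df.join(df.mytool.shift('y', 'shifted_y', 2, cyclic=False))
df3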

Add a vector/column of random numbers in Pandas

I've been trying to create a table with randomly generated data using Pandas and Numpy. I've looked at the Pandas cheat sheet but still can't get this to work:
import names
import pandas as pd
import random
import numpy as np
random.seed(100)
currency_numbers = random.sample(range(100000, 1000000), 100)
s = pd.Series(np.random.randn(100))
raw_data = {
    "Names": ["".join(names.get_full_name()) for i in range(100)],
    "Names2": ["".join(names.get_full_name()) for i in range(100)],
    "Currency": []
}
df = pd.DataFrame(raw_data, columns=["Names", "Names2", "Currency"])
df.head()
How do I create a column of 100 random numbers for the Currency column?
Just use np.random.randint().
For example, np.random.randint(1000, size=100)
draws integers from [0, 1000) (so the largest possible value is 999) and returns an array of length 100.
Therefore in your case,
s = np.random.randint(1000,size=100)
then set Currency to s,
"Currency":s
and the resulting DataFrame will have a Currency column containing 100 random numbers.
JUST FYI, with this function you can also set a low and a high range...
So in your case it would be something like this:
s = np.random.randint(100000, 1000000,size=100)
Please check whether this helps.
import names
import pandas as pd
import random
import numpy as np
random.seed(100)
currency_numbers = np.random.randint(100000,1000000,size=(1,100))
s = pd.Series(np.random.randn(100))
raw_data = {
    "Names": ["".join(names.get_full_name()) for i in range(100)],
    "Names2": ["".join(names.get_full_name()) for i in range(100)],
    "Currency": currency_numbers[0]
}
df = pd.DataFrame(raw_data, columns=["Names", "Names2", "Currency"])
df.head()
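One small note: random.seed(100) only seeds Python's built-in random module, while np.random.randint and np.random.randn use numpy's own generator. If you want the Currency column to be reproducible, seed numpy as well, for example:
np.random.seed(100)
currency_numbers = np.random.randint(100000, 1000000, size=(1, 100))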

pandas operation combining each two rows

I have a dataframe as follows:
import pandas as pd
from uncertainties import ufloat  # pip3 install uncertainties
import random

random.seed(0)
values = [[round(random.random(), 2) for i in range(3)] for j in range(4)]
df = pd.DataFrame(values, index=['name1', 'sd', 'name2', 'sd'], columns=['A', 'B', 'C'])
and I want to rearrange the data so that each mean and its sd are combined into one row as a ufloat. The desired output looks as follows:
new_values = [[ufloat(0.91,0.90), ufloat(0.98,0.31), ufloat(0.81,0.73)],
[ufloat(0.90,0.10), ufloat(0.68,0.43), ufloat(0.47, 0.61)]]
df = pd.DataFrame(new_values, index=['name1', 'name2'], columns=['A','B','C'])
I think it might be easiest to create two dataframes and combine them somehow
mean = df.iloc[::2].reset_index()
std = df.iloc[1::2].reset_index()
where now I need to merge the two and apply ufloat
This is my current solution:
import numpy as np

mean = df.iloc[::2]
std = df.iloc[1::2]
tmp = np.array([ufloat(m, s) for m, s in zip(mean.values.ravel(), std.values.ravel())])
df = pd.DataFrame(tmp.reshape(mean.shape), columns=mean.columns, index=mean.index)
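A slightly more compact variant of the same idea (just a sketch, reusing the mean/std split above) is to let np.vectorize apply ufloat elementwise, which avoids the explicit ravel/reshape round trip:
# mean and std as defined above; ufloat is applied to each (mean, sd) pair
combined = np.vectorize(ufloat)(mean.values, std.values)
df_combined = pd.DataFrame(combined, columns=mean.columns, index=mean.index)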

Creating a Pandas DataFrame from a Numpy array: How do I specify the index column and column headers?

I have a Numpy array consisting of a list of lists, representing a two-dimensional array with row labels and column names as shown below:
data = array([['','Col1','Col2'],['Row1',1,2],['Row2',3,4]])
I'd like the resulting DataFrame to have Row1 and Row2 as index values, and Col1, Col2 as header values.
I can specify the index as follows:
df = pd.DataFrame(data,index=data[:,0]),
however I am unsure how to best assign column headers.
You need to pass data, index, and columns to the DataFrame constructor, as in:
>>> pd.DataFrame(data=data[1:,1:], # values
... index=data[1:,0], # 1st column as index
... columns=data[0,1:]) # 1st row as the column names
Edit: as noted in @joris's comment, you may need to change the above to np.int_(data[1:,1:]) to get the correct data type.
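An equivalent way to handle the dtype (a small sketch, not part of the original answer) is to build the frame from the string values and then cast it:
df = pd.DataFrame(data=data[1:, 1:],
                  index=data[1:, 0],
                  columns=data[0, 1:]).astype(int)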
Here is an easy-to-understand solution:
import numpy as np
import pandas as pd
# Creating a 2-dimensional numpy array
>>> data = np.array([[5.8, 2.8], [6.0, 2.2]])
>>> data
array([[5.8, 2.8],
       [6. , 2.2]])
# Creating a pandas DataFrame from the numpy array
>>> dataset = pd.DataFrame({'Column1': data[:, 0], 'Column2': data[:, 1]})
>>> print(dataset)
   Column1  Column2
0      5.8      2.8
1      6.0      2.2
I agree with Joris; it seems like you should be doing this differently, like with numpy record arrays. Modifying "option 2" from this great answer, you could do it like this:
import pandas
import numpy
dtype = [('Col1','int32'), ('Col2','float32'), ('Col3','float32')]
values = numpy.zeros(20, dtype=dtype)
index = ['Row'+str(i) for i in range(1, len(values)+1)]
df = pandas.DataFrame(values, index=index)
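To see what this produces, you can fill in a record before building the frame (the values here are hypothetical):
values[0] = (1, 2.0, 3.0)  # fill the first record
df = pandas.DataFrame(values, index=index)
df.head(1)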
This can be done simply by using pandas DataFrame.from_records:
import numpy as np
import pandas as pd
# Creating a numpy array
x = np.arange(1,10,1).reshape(-1,1)
dataframe = pd.DataFrame.from_records(x)
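Applied to the array from the question, from_records can also take the header row as column names (this part is my own sketch, not from the answer above), and the index can then be set from the first column:
data = np.array([['', 'Col1', 'Col2'], ['Row1', 1, 2], ['Row2', 3, 4]])
df = pd.DataFrame.from_records(data[1:, 1:], columns=data[0, 1:])
df.index = data[1:, 0]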
>>> import pandas as pd
>>> import numpy as np
>>> data.shape
(480, 193)
>>> type(data)
numpy.ndarray
>>> df = pd.DataFrame(data=data[0:, 0:],
...                   index=[i for i in range(data.shape[0])],
...                   columns=['f' + str(i) for i in range(data.shape[1])])
>>> df.head()
Here is a simple example of creating a pandas DataFrame from a numpy array.
import numpy as np
import pandas as pd
# create an array
var1 = np.arange(start=1, stop=21, step=1).reshape(-1)
var2 = np.random.rand(20,1).reshape(-1)
print(var1.shape)
print(var2.shape)
dataset = pd.DataFrame()
dataset['col1'] = var1
dataset['col2'] = var2
dataset.head()
Adding to @behzad.nouri's answer, we can create a helper routine to handle this common scenario:
def csvDf(dat, **kwargs):
    from numpy import array
    data = array(dat)
    if data is None or len(data) == 0 or len(data[0]) == 0:
        return None
    else:
        return pd.DataFrame(data[1:, 1:], index=data[1:, 0], columns=data[0, 1:], **kwargs)
Let's try it out:
data = [['','a','b','c'],['row1','row1cola','row1colb','row1colc'],
['row2','row2cola','row2colb','row2colc'],['row3','row3cola','row3colb','row3colc']]
csvDf(data)
In [61]: csvDf(data)
Out[61]:
a b c
row1 row1cola row1colb row1colc
row2 row2cola row2colb row2colc
row3 row3cola row3colb row3colc
I think this is a simple and intuitive method:
data = np.array([[0, 0], [0, 1] , [1, 0] , [1, 1]])
reward = np.array([1,0,1,0])
dataset = pd.DataFrame()
dataset['StateAttributes'] = data.tolist()
dataset['reward'] = reward.tolist()
dataset
This returns a DataFrame whose StateAttributes column holds Python lists (the raw state vectors). But there are performance implications of list-valued columns, detailed here:
How to set the value of a pandas column as list
It's not the shortest approach, but maybe it can help you.
Creating Array
import numpy as np
import pandas as pd
data = np.array([['col1', 'col2'], [4.8, 2.8], [7.0, 1.2]])
>>> data
array([['col1', 'col2'],
['4.8', '2.8'],
['7.0', '1.2']], dtype='<U4')
Creating data frame
df = pd.DataFrame(i for i in data).transpose()
df.drop(0, axis=1, inplace=True)
df.columns = data[0]
df
>>> df
col1 col2
0 4.8 7.0
1 2.8 1.2
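Note that with this approach the values in df are still strings, because the original array has dtype '<U4'. If you need numeric columns, cast them afterwards, for example:
df = df.astype(float)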
