Normalizing pandas DataFrame rows by their sums - python

What is the most idiomatic way to normalize each row of a pandas DataFrame? Normalizing the columns is easy, so one (very ugly!) option is:
(df.T / df.T.sum()).T
Pandas broadcasting rules prevent df / df.sum(axis=1) from doing this

To overcome the broadcasting issue, you can use the div method:
df.div(df.sum(axis=1), axis=0)
See pandas User Guide: Matching / broadcasting behavior

I would suggest to use Scikit preprocessing libraries and transpose your dataframe as required:
'''
Created on 05/11/2015
#author: rafaelcastillo
'''
import matplotlib.pyplot as plt
import pandas
import random
import numpy as np
from sklearn import preprocessing
def create_cos(number_graphs,length,amp):
# This function is used to generate cos-kind graphs for testing
# number_graphs: to plot
# length: number of points included in the x axis
# amp: Y domain modifications to draw different shapes
x = np.arange(length)
amp = np.pi*amp
xx = np.linspace(np.pi*0.3*amp, -np.pi*0.3*amp, length)
for i in range(number_graphs):
iterable = (2*np.cos(x) + random.random()*0.1 for x in xx)
y = np.fromiter(iterable, np.float)
if i == 0:
yfinal = y
continue
yfinal = np.vstack((yfinal,y))
return x,yfinal
x,y = create_cos(70,24,3)
data = pandas.DataFrame(y)
x_values = data.columns.values
num_rows = data.shape[0]
fig, ax = plt.subplots()
for i in range(num_rows):
ax.plot(x_values, data.iloc[i])
ax.set_title('Raw data')
plt.show()
std_scale = preprocessing.MinMaxScaler().fit(data.transpose())
df_std = std_scale.transform(data.transpose())
data = pandas.DataFrame(np.transpose(df_std))
fig, ax = plt.subplots()
for i in range(num_rows):
ax.plot(x_values, data.iloc[i])
ax.set_title('Data Normalized')
plt.show()

Related

Plot CDF of columns from a CSV file using pandas

I want to plot CDF value of columns from a CSV file using pandas as follows:
I have tried some codes, but they are not reporting the correct plot. Can you help with an easy way?
df = pd.read_csv('pathfile.csv')
def compute_distrib(df, col):
stats_df = df.groupby(col)[col].agg('count')\
.pipe(pd.DataFrame).rename(columns={col: 'frequency'})
# PDF
stats_df['pdf'] = stats_df['frequency'] / sum(stats_df['frequency'])
# CDF
stats_df['CDF'] = stats_df['pdf'].cumsum()
# modifications
stats_df = stats_df.reset_index()\
.rename(columns={col:"X"})
stats_df[" "] = col
return stats_df
cdf = []
for col in ['1','2','3','4']:
cdf.append(compute_distrib(df, col))
cdf = pd.concat(cdf, ignore_index=True)
import seaborn as sns
sns.lineplot(x=cdf["X"],
y=cdf["CDF"],
hue=cdf[" "]);
Due to the lack of runnable code on your post, I created my own code for plotting the CDF of the columns of a dataframe df:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itertools import accumulate
# GENERATE EXAMPLE DATA
df = pd.DataFrame()
df['x1'] = np.random.uniform(-1,1, size=1000)
df['x2'] = df['x1'] + np.random.uniform(-1,1, size=1000)
df['x3'] = df['x2'] + np.random.uniform(-1,1, size=1000)
df['x4'] = df['x3'] + np.random.uniform(-1, 1, size=1000)
# START A PLOT
fig,ax = plt.subplots()
for col in df.columns:
# SKIP IF IT HAS ANY INFINITE VALUES
if not all(np.isfinite(df[col].values)):
continue
# USE numpy's HISTOGRAM FUNCTION TO COMPUTE BINS
xh, xb = np.histogram(df[col], bins=60, normed=True)
# COMPUTE THE CUMULATIVE SUM WITH accumulate
xh = list(accumulate(xh))
# NORMALIZE THE RESULT
xh = np.array(xh) / max(xh)
# PLOT WITH LABEL
ax.plot(xb[1:], xh, label=f"$CDF$({col})")
ax.legend()
plt.title("CDFs of Columns")
plt.show()
The resulting plot from this code is below:
To put in your own data, just replace the # GENERATE EXAMPLE DATA section with df = pd.read_csv('path/to/sheet.csv')
Let me know if anything in the example is unclear to you or if it needs more explanation.

How to use pandas with matplotlib to create 3D plots

I am struggling a bit with the pandas transformations needed to make data render in 3D on matplot lib. The data I have is usually in columns of numbers (usually time and some value). So lets create some test data to illustrate.
import pandas as pd
pattern = ("....1...."
"....1...."
"..11111.."
".1133311."
"111393111"
".1133311."
"..11111.."
"....1...."
"....1....")
# create the data and coords
Zdata = list(map(lambda d:0 if d == '.' else int(d), pattern))
Zinverse = list(map(lambda d:1 if d == '.' else -int(d), pattern))
Xdata = [x for y in range(1,10) for x in range(1,10)]
Ydata = [y for y in range(1,10) for x in range(1,10)]
# pivot the data into columns
data = [d for d in zip(Xdata,Ydata,Zdata,Zinverse)]
# create the data frame
df = pd.DataFrame(data, columns=['X','Y','Z',"Zi"], index=zip(Xdata,Ydata))
df.head(5)
Edit: This block of data is demo data that would normally come from a query on a
database that may need more cleaning and transforms before plotting. In this case data is already aligned and there are no problems aside having one more column we don't need (Zi).
So the numbers in pattern are transferred into height data in the Z column of df ('Zi' being the inverse image) and with that as the data frame I've struggled to come up with this pivot method which is 3 separate operations. I wonder if that can be better.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
Xs = df.pivot(index='X', columns='Y', values='X').values
Ys = df.pivot(index='X', columns='Y', values='Y').values
Zs = df.pivot(index='X', columns='Y', values='Z').values
ax.plot_surface(Xs,Ys,Zs, cmap=cm.RdYlGn)
plt.show()
Although I have something working I feel there must be a better way than what I'm doing. On a big data set I would imagine doing 3 pivots is an expensive way to plot something. Is there a more efficient way to transform this data ?
I guess you can avoid some steps during the preparation of the data by not using pandas (but only numpy arrays) and by using some convenience fonctions provided by numpy such as linespace and meshgrid.
I rewrote your code to do so, trying to keep the same logic and the same variable names :
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
pattern = ("....1...."
"....1...."
"..11111.."
".1133311."
"111393111"
".1133311."
"..11111.."
"....1...."
"....1....")
# Extract the value according to your logic
Zdata = list(map(lambda d:0 if d == '.' else int(d), pattern))
# Assuming the pattern is always a square
size = int(len(Zdata) ** 0.5)
# Create a mesh grid for plotting the surface
Xdata = np.linspace(1, size, size)
Ydata = np.linspace(1, size, size)
Xs, Ys = np.meshgrid(Xdata, Ydata)
# Convert the Zdata to a numpy array with the appropriate shape
Zs = np.array(Zdata).reshape((size, size))
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Plot the surface
ax.plot_surface(Xs, Ys, Zs, cmap=cm.RdYlGn)
plt.show()

how to rotate a seaborn lineplot

How can I rotate a seaborn.lineplot so that the result will be as a function of y and not a function of x.
For example, this code:
import pandas as pd
import seaborn as sns
df = pd.DataFrame([[0,1],[0,2],[0,1.5],[1,1],[1,5]], columns=['group','val'])
sns.lineplot(x='group',y='val',data=df)
Create this figure:
But is there a way to rotate the figure in 90° ? so that in the X we will have "val" and in Y we will have "group" and the std will go from left to right and not from bottom to up.
Thanks
EDIT: I've opened a ticket in seaborn to ask for this feature: https://github.com/mwaskom/seaborn/issues/1661
Per the seaborn docs on lineplot, the dataframe passed to data must be
Tidy (“long-form”) dataframe where each column is a variable and each row is an observation.
Which seems to imply there is no way to force the axes to switch, even by manipulating the data. If there is a way to do that I haven't found it - I'm sure there is a more elegant way to do this, but one way you could go about it is to do it by hand so to speak. Something like this would do the trick
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
df = pd.DataFrame([[0,1],[0,2],[0,1.5],[1,1],[1,5]], columns=['group','val'])
group = df['group'].tolist()
val = df['val'].tolist()
yl = list()
yu = list()
avg = list()
ii = 0
while ii < len(group): #Loop through all the groups
g = group[ii]
y0 = val[ii]
y1 = val[ii]
s = 0
jj = ii
while (jj < len(group) and group[jj] == g):
s += val[jj]
#This takes the min and max, but could easily take the standard deviation
if val[jj] > y1:
y1 = val[jj]
if val[jj] < y0:
y0 = val[jj]
jj += 1
avg.append(s/(jj - ii))
ii = jj
yl.append(y0)
yu.append(y1)
x = np.linspace(min(group), max(group), len(yl))
plt.ylabel(df.columns[0])
plt.xlabel(df.columns[1])
plt.plot(avg, x, color="#5a9edd", linestyle="-", linewidth=1.5)
plt.fill_betweenx(x, yl, yu, alpha=0.3)
This will give you the following plot:
For brevity this uses the minimum and maximum from each group to give the error band, but that can be easily changed to standard error or standard deviation as needed.
Consider what you'd do if not using seaborn. You would calculate the mean and standard deviation and plot those as a function of the group. Now it is quite straight forward to exchange x and y for a plot(x,y): plot(y,x). For the filled region, you can use fill_betweenx instead of fill_between.
Below the two cases for comparisson.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame([[0,1],[0,2],[0,1.5],[1,1],[1,5]], columns=['group','val'])
mean = df.groupby("group").mean()
std = df.groupby("group").std()
fig, (ax, ax2) = plt.subplots(ncols=2)
ax.plot(mean.index, mean["val"].values)
ax.fill_between(mean.index, (mean-std)["val"].values, (mean+std)["val"].values, alpha=.5)
ax.set(xlabel="group", ylabel="val")
ax2.plot(mean["val"].values, mean.index)
ax2.fill_betweenx(mean.index, (mean-std)["val"].values, (mean+std)["val"].values, alpha=.5)
ax2.set(ylabel="group", xlabel="val")
fig.tight_layout()
plt.show()

Suppressing all labeling in pandas scatter_matrix

I'm producing a scatterplot matrix using the scatter_matrix function in pandas.tools.plotting and since I have a lot of variables the labels end up looking very messy. Is there a way to suppress all the labels and perhaps even the tick marks? Here is some code that shows essentially what I mean:
import numpy as np
from pandas import DataFrame, scatter_matrix
n = 50
p = 15
cols = ['var_' + str(k) for k in range(p)]
data = DataFrame(np.random.randn(n, p), columns = cols)
scatter_matrix(data, diagonal = 'kde')
This works for me:
sm = scatter_matrix(data, diagonal = 'kde')
for subaxis in sm:
for ax in subaxis:
ax.xaxis.set_ticks([])
ax.yaxis.set_ticks([])
ax.set_ylabel("")
ax.set_xlabel("")
pic = sm[0][0].get_figure()
pic.savefig("MyScatter.png")

Create heatmap using pandas TimeSeries

I need to create MatplotLib heatmap (pcolormesh) using Pandas DataFrame TimeSeries column (df_all.ts) as my X-axis.
How to convert Pandas TimeSeries column to something which can be used as X-axis in np.meshgrid(x, y) function to create heatmap? The workaround is to create Matplotlib drange using same parameters as in pandas column, but is there a simple way?
x = pd.date_range(df_all.ts.min(),df_all.ts.max(),freq='H')
xt = mdates.drange(df_all.ts.min(), df_all.ts.max(), dt.timedelta(hours=1))
y = arange(ylen)
X,Y = np.meshgrid(xt, y)
I do not know what you mean by heat map for a time series, but for a dataframe you may do as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
from string import ascii_uppercase
from matplotlib import patheffects
m, n = 4, 7 # 4 rows, 7 columns
df = pd.DataFrame(np.random.randn(m, n),
columns=list(ascii_uppercase[:n]),
index=list(ascii_uppercase[-m:]))
ax = plt.imshow(df, interpolation='nearest', cmap='Oranges').axes
_ = ax.set_xticks(np.linspace(0, n-1, n))
_ = ax.set_xticklabels(df.columns)
_ = ax.set_yticks(np.linspace(0, m-1, m))
_ = ax.set_yticklabels(df.index)
ax.grid('off')
ax.xaxis.tick_top()
optionally, to print actual values in the middle of each square, with some shadows for readability, you may do:
path_effects = [patheffects.withSimplePatchShadow(shadow_rgbFace=(1,1,1))]
for i, j in product(range(m), range(n)):
_ = ax.text(j, i, '{0:.2f}'.format(df.iloc[i, j]),
size='medium', ha='center', va='center',
path_effects=path_effects)

Categories