I want to plot the CDF of the columns of a CSV file using pandas.
I have tried the following code, but it does not produce the correct plot. Can you help with an easy way?
import pandas as pd
import seaborn as sns

df = pd.read_csv('pathfile.csv')

def compute_distrib(df, col):
    # count how often each value occurs in the column
    stats_df = df.groupby(col)[col].agg('count')\
        .pipe(pd.DataFrame).rename(columns={col: 'frequency'})
    # PDF
    stats_df['pdf'] = stats_df['frequency'] / stats_df['frequency'].sum()
    # CDF
    stats_df['CDF'] = stats_df['pdf'].cumsum()
    # modifications for plotting
    stats_df = stats_df.reset_index().rename(columns={col: "X"})
    stats_df[" "] = col
    return stats_df

cdf = []
for col in ['1', '2', '3', '4']:
    cdf.append(compute_distrib(df, col))
cdf = pd.concat(cdf, ignore_index=True)

sns.lineplot(x=cdf["X"], y=cdf["CDF"], hue=cdf[" "])
Since your post doesn't include runnable code, I wrote my own example that plots the CDF of each column of a dataframe df:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itertools import accumulate

# GENERATE EXAMPLE DATA
df = pd.DataFrame()
df['x1'] = np.random.uniform(-1, 1, size=1000)
df['x2'] = df['x1'] + np.random.uniform(-1, 1, size=1000)
df['x3'] = df['x2'] + np.random.uniform(-1, 1, size=1000)
df['x4'] = df['x3'] + np.random.uniform(-1, 1, size=1000)

# START A PLOT
fig, ax = plt.subplots()
for col in df.columns:
    # SKIP IF IT HAS ANY INFINITE VALUES
    if not all(np.isfinite(df[col].values)):
        continue
    # USE numpy's HISTOGRAM FUNCTION TO COMPUTE BINS
    xh, xb = np.histogram(df[col], bins=60, density=True)
    # COMPUTE THE CUMULATIVE SUM WITH accumulate
    xh = list(accumulate(xh))
    # NORMALIZE THE RESULT
    xh = np.array(xh) / max(xh)
    # PLOT WITH LABEL
    ax.plot(xb[1:], xh, label=f"$CDF$({col})")
ax.legend()
plt.title("CDFs of Columns")
plt.show()
The resulting plot from this code is below:
To plot your own data, just replace the # GENERATE EXAMPLE DATA section with df = pd.read_csv('path/to/sheet.csv').
Let me know if anything in the example is unclear to you or if it needs more explanation.
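If you'd rather avoid binning, you can also draw the exact empirical CDF by sorting each column; this is a minimal sketch assuming the same example df as above:

import numpy as np
import matplotlib.pyplot as plt

# EXACT EMPIRICAL CDF: SORT EACH COLUMN AND STEP FROM 1/n TO 1
fig, ax = plt.subplots()
for col in df.columns:
    vals = np.sort(df[col].dropna().values)
    ecdf = np.arange(1, len(vals) + 1) / len(vals)
    ax.plot(vals, ecdf, label=f"$CDF$({col})")
ax.legend()
plt.title("CDFs of Columns")
plt.show()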
The dataframe I created is as follows:
import pandas as pd
import numpy as np
import seaborn as sns

date = pd.date_range('2003-01-01', '2022-11-01', freq='MS').strftime('%Y-%m-%d').tolist()
mom = [np.nan] + list(np.repeat([0.01], 238))
cpi = [100] + list(np.repeat([np.nan], 238))
df = pd.DataFrame(list(zip(date, mom, cpi)), columns=['date', 'mom', 'cpi'])
df['date'] = pd.to_datetime(df['date'])
for i in range(1, len(df)):
    # build each CPI level from the previous level and the month-over-month change
    df.loc[i, 'cpi'] = df.loc[i - 1, 'cpi'] * (1 + df.loc[i, 'mom'])
df['yoy'] = df['cpi'].pct_change(periods=12)
The y-axis values are not displayed correctly, as can be seen in the plot produced by:
sns.lineplot(
    x='date',
    y='yoy',
    data=df
)
I think the percentage changes I calculated for the yoy column are the cause of the issue, because there are no issues if I manually fill in the yoy column.
Thanks in advance.
You can use matplotlib to set the axis scaling, as the difference is really subtle in your data:
import matplotlib.pyplot as plt

ax = plt.gca()
ax.set_ylim([df['yoy'].min(), df['yoy'].max()])
sns.lineplot(
    x='date',
    y='yoy',
    data=df,
    ax=ax
)
With this, the result should look more like a step function.
To set the limits a little more cleanly, you can pad them, e.g. using the maximum deviation from the mean multiplied by 1.01, but this is the idea. You can set the axis ticks using ax.set_yticks(ticks=<list of ticks>) (documentation).
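A minimal sketch of that padding idea, assuming the df from the question and the ax from above (the factor 1.01 and the choice of five ticks are arbitrary):

import numpy as np

# pad the y-limits around the mean by the largest deviation, scaled slightly
center = df['yoy'].mean()
spread = (df['yoy'] - center).abs().max() * 1.01
ax.set_ylim(center - spread, center + spread)
# five evenly spaced ticks across the padded range
ax.set_yticks(np.linspace(center - spread, center + spread, 5))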
For the data in test.csv, I'd like to do the following, with the values in the Time column as the x-axis and the values in the A_x columns (x=1,2,3) as the data on the y-axis.
・Draw an approximate curve for each of the three data series.
・Draw the values of the A_xsd columns (x=1,2,3) as the standard deviation, not as error bars but as a shaded band.
However, due to my lack of knowledge, I've only gotten halfway. I'd be grateful if someone could point me to a correct approach. Thank you very much.
(This is a simplified version of the original data, which has more than 1000 lines.)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# test.csv has a header row
df = pd.read_csv('test.csv')

x1 = df['Time']
y1 = df['A_1']
x2 = df['Time']
y2 = df['A_2']
x3 = df['Time']
y3 = df['A_3']
sd1 = df['A_1sd']
sd2 = df['A_2sd']
sd3 = df['A_3sd']

fig, ax = plt.subplots()
ax.set_xlim(0, 5)
ax.set_ylim(0, 150)
ax.set_xlabel("Time", fontsize=10)
ax.set_ylabel("OD600", fontsize=10)
ax.grid()
ax.tick_params(labelsize=10)
test.csv
Time,A_1,A_1sd,A_2,A_2sd,A_3,A_3sd
1,6,76,23159,125,23239,40
2,20,85,22709,99,22809,50
3,46,20,22629,89,22749,62
4,12,81,22729,85,22859,86
5,1,75,23029,90,23219,112
One of the three data columns has a range very different from the other two, so we should use subplots. You could use a loop to create the same kind of subplot for each of the three data columns.
The shading can be done with matplotlib's fill_between() method:
x = df.Time
fig, ax = plt.subplots(3, 1, sharex=True)
for n in (1, 2, 3):
    y = df['A_' + str(n)]
    sd = df['A_' + str(n) + 'sd']
    ax[n-1].plot(x, y)
    ax[n-1].fill_between(x, y - sd, y + sd, alpha=0.3)
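For the "approximate curve" part of the question, one option is a simple polynomial fit; this is a sketch assuming a quadratic fit is acceptable (the degree is an arbitrary choice):

import numpy as np

# fit and draw a quadratic approximation on each subplot
for n in (1, 2, 3):
    y = df['A_' + str(n)]
    coeffs = np.polyfit(x, y, deg=2)
    ax[n-1].plot(x, np.polyval(coeffs, x), linestyle='--')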
I'm using pandas and matplotlib to try to replicate this graph from Tableau:
So far, I have this code:
group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
total_price.nlargest(5).plot(kind="bar")
Which produces this graph:
It correctly groups the data, but is it possible to get it grouped similar to how Tableau shows it?
You can create some lines and labels using the respective matplotlib methods (ax.text and ax.axvline).
import pandas as pd
import numpy as np; np.random.seed(5)
import matplotlib.pyplot as plt

a = ["West"]*25 + ["Central"]*10 + ["East"]*10
b = ["Mattz","McDon","Jeffs","Warf","Utter"]*5 + ["Susanne","Lokomop"]*5 + ["Richie","Florence"]*5
c = np.random.randint(5, 55, size=len(a))
df = pd.DataFrame({"Region": a, "Rep": b, "Total Price": c})

group = df.groupby(["Region", "Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
gtp = total_price.nlargest(5)
ax = gtp.plot(kind="bar")

# draw lines and titles
count = gtp.groupby("Region").count()
cum = np.cumsum(count)
for i in range(len(count)):
    title = count.index.values[i]
    ax.axvline(cum.iloc[i] - .5, lw=0.8, color="k")
    ax.text(cum.iloc[i] - (count.iloc[i] + 1) / 2., 1.02, title, ha="center",
            transform=ax.get_xaxis_transform())

# shorten xticklabels
ax.set_xticklabels([l.get_text().split(", ")[1][:-1] for l in ax.get_xticklabels()])
plt.show()
I'm producing a scatterplot matrix using the scatter_matrix function in pandas.plotting, and since I have a lot of variables the labels end up looking very messy. Is there a way to suppress all the labels, and perhaps even the tick marks? Here is some code that shows essentially what I mean:
import numpy as np
from pandas import DataFrame
from pandas.plotting import scatter_matrix

n = 50
p = 15
cols = ['var_' + str(k) for k in range(p)]
data = DataFrame(np.random.randn(n, p), columns=cols)
scatter_matrix(data, diagonal='kde')
This works for me:
sm = scatter_matrix(data, diagonal='kde')
for subaxis in sm:
    for ax in subaxis:
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_ylabel("")
        ax.set_xlabel("")
pic = sm[0][0].get_figure()
pic.savefig("MyScatter.png")
What is the most idiomatic way to normalize each row of a pandas DataFrame? Normalizing the columns is easy, so one (very ugly!) option is:
(df.T / df.T.sum()).T
Pandas' broadcasting rules prevent df / df.sum(axis=1) from doing this.
To overcome the broadcasting issue, you can use the div method:
df.div(df.sum(axis=1), axis=0)
See pandas User Guide: Matching / broadcasting behavior
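For example, with a small made-up frame (the values here are arbitrary), each row of the result sums to 1:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 2]})
normalized = df.div(df.sum(axis=1), axis=0)
print(normalized)
#       a     b
# 0  0.25  0.75
# 1  0.50  0.50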
I would suggest using scikit-learn's preprocessing module and transposing your dataframe as required:
'''
Created on 05/11/2015
#author: rafaelcastillo
'''

import matplotlib.pyplot as plt
import pandas
import random
import numpy as np
from sklearn import preprocessing

def create_cos(number_graphs, length, amp):
    # This function is used to generate cos-like curves for testing
    # number_graphs: number of curves to generate
    # length: number of points included in the x axis
    # amp: Y domain modification to draw different shapes
    x = np.arange(length)
    amp = np.pi * amp
    xx = np.linspace(np.pi * 0.3 * amp, -np.pi * 0.3 * amp, length)
    for i in range(number_graphs):
        iterable = (2 * np.cos(x) + random.random() * 0.1 for x in xx)
        y = np.fromiter(iterable, float)
        if i == 0:
            yfinal = y
            continue
        yfinal = np.vstack((yfinal, y))
    return x, yfinal

x, y = create_cos(70, 24, 3)
data = pandas.DataFrame(y)

x_values = data.columns.values
num_rows = data.shape[0]

fig, ax = plt.subplots()
for i in range(num_rows):
    ax.plot(x_values, data.iloc[i])
ax.set_title('Raw data')
plt.show()

std_scale = preprocessing.MinMaxScaler().fit(data.transpose())
df_std = std_scale.transform(data.transpose())
data = pandas.DataFrame(np.transpose(df_std))

fig, ax = plt.subplots()
for i in range(num_rows):
    ax.plot(x_values, data.iloc[i])
ax.set_title('Data Normalized')
plt.show()
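The same row-wise min-max scaling can also be written in plain pandas with sub/div along axis=0; a minimal sketch, assuming raw_data is a DataFrame of the raw rows (like pandas.DataFrame(y) above, before the scaler overwrites data) and no row is constant, which would give a zero range:

# row-wise min-max normalization, equivalent to fitting MinMaxScaler on the transpose
raw_data = pandas.DataFrame(y)  # raw_data is just an illustrative name for the unscaled frame
row_min = raw_data.min(axis=1)
row_range = raw_data.max(axis=1) - row_min
data_rownorm = raw_data.sub(row_min, axis=0).div(row_range, axis=0)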