I need to plot histograms for numeric variables, in order to determine if their distributions are skewed. Below is the function definition, and the function being called.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
def variable_type(df, nominal_level = 3):
categorical, numeric, nominal = [],[],[]
for variable in df.columns.values:
if np.issubdtype(np.array(df[variable]).dtype, int) or np.issubdtype(np.array(df[variable]).dtype, float): #if srray variable is of type int or float
if len(np.unique(np.array(df[variable]))) <= nominal_level:
nominal.append(variable)
else:
numeric.append(variable)
else:
categorical.append(variable)
return numeric,categorical,nominal
def draw_histograms(df, variables, n_rows, n_cols):
fig = plt.figure()
import math
for i in range(min(n_rows * n_cols, len(variables))):
index = n_rows * 100 + n_cols * 10 + i + 1
ax = fig.add_subplot(index)
df[variables[i]].hist(bins = 20, ax = ax)
plt.title(variables[i]+' distribution')
#plt.xlabel(variables[i])
#plt.ylabel('Count')
plt.show()
def main():
df = read_data()
col_names = df.columns.tolist()
numeric,categorical,nominal = variable_type(df)
util.draw_histograms(df, numeric, 3, 3)
if __name__ == "__main__":
main()
My program only works when I use 3, 3 for n_rows and n_cols in the calling function, and this is a problem because it only plots 9 of the 20 variables. If I try any other numbers, I get a ValueError: num must be 1 <= num <= 18, not 0 or some other range depending on my chosen n_rows and n_cols. What can I do to plot all 20 numeric variables as subplots on one figure? or should I break it into different figures? This is a sample of my data frame.
TARGET_B ID GiftCnt36 GiftCntAll GiftCntCard36 GiftCntCardAll \
0 0 14974 2 4 1 3
1 0 6294 1 8 0 3
2 1 46110 6 41 3 20
3 1 185937 3 12 3 8
4 0 29637 1 1 1 1
GiftAvgLast GiftAvg36 GiftAvgAll GiftAvgCard36 ... \
0 17 13.50 9.25 17.00 ...
1 20 20.00 15.88 NaN ...
2 6 5.17 3.73 5.00 ...
3 10 8.67 8.50 8.67 ...
4 20 20.00 20.00 20.00 ...
PromCntCardAll StatusCat96NK StatusCatStarAll DemCluster DemAge \
0 13 A 0 0 NaN
1 24 A 0 23 67
2 22 S 1 0 NaN
3 16 E 1 0 NaN
4 6 F 0 35 53
DemGender DemHomeOwner DemMedHomeValue DemPctVeterans DemMedIncome
0 F U $0 0 $0
1 F U $186,800 85 $0
2 M U $87,600 36 $38,750
3 M U $139,200 27 $38,942
4 M U $168,100 37 $71,509
There is a NaN in your 10th attribute. Can your code handle this?
An you plot the 10th attribute?
Related
It might sound trivial but I am surprised by the output. Basically, I have am calculating y = m*x + b for given a, b & x. With below code I am able to get the desired result of y which a list of 20 values.
But when I am checking the length of the list, I am getting 1 in return. And the range is (0,1) which is weird as I was expecting it to be 20.
Am I making any mistake here?
a = 10
b = 0
x = df['x']
print(x)
0 0.000000
1 0.052632
2 0.105263
3 0.157895
4 0.210526
5 0.263158
6 0.315789
7 0.368421
8 0.421053
9 0.473684
10 0.526316
11 0.578947
12 0.631579
13 0.684211
14 0.736842
15 0.789474
16 0.842105
17 0.894737
18 0.947368
19 1.000000
y_new = []
for i in x:
y = a*x +b
y_new.append(y)
len(y_new)
Output: 1
print(y_new)
[0 0.000000
1 0.526316
2 1.052632
3 1.578947
4 2.105263
5 2.631579
6 3.157895
7 3.684211
8 4.210526
9 4.736842
10 5.263158
11 5.789474
12 6.315789
13 6.842105
14 7.368421
15 7.894737
16 8.421053
17 8.947368
18 9.473684
19 10.000000
Name: x, dtype: float64]
I would propose two solutions:
The first solution is : you convert your columnn df['x'] into a list by doing df['x'].tolist() and you re-run your code and also you should replace ax+b by ai+b
The second solution is (which I would do): You convert your df['x'] into an array by doing x = np.array(df['x']). By doing this you can do some array broadcasting.
So, your code will simply be :
x = np.array(df['x'])
y = a*x + b
This should give you the desired output.
I hope this would be helpful
With the code below, I have a length of 20 for the array y_new. Are you sure to print the right value? According to this post, df['x'] returns a panda Series so df['x'] is equivalent to pd.Series(...).
df['x'] — index a column named 'x'. Returns pd.Series
import pandas as pd
a = 10
b = 0
x = pd.Series(data=[0.000000,0.052632,0.105263,0.157895,0.210526, 0.263158, 0.315789, 0.368421, 0.421053,0.473684,0.526316,0.578947,0.631579
,0.684211,0.736842,0.789474,0.842105,0.894737,0.947368,1.000000])
y_new = []
for i in x:
y = a*x +b
y_new.append(y)
print("y_new length: " + str(len(y_new)) )
Output:
y_new length: 20
I understand how to create simple quantiles in Pandas using pd.qcut. But after searching around, I don't see anything to create weighted quantiles. Specifically, I wish to create a variable which bins the values of a variable of interest (from smallest to largest) such that each bin contains an equal weight. So far this is what I have:
def wtdQuantile(dataframe, var, weight = None, n = 10):
if weight == None:
return pd.qcut(dataframe[var], n, labels = False)
else:
dataframe.sort_values(var, ascending = True, inplace = True)
cum_sum = dataframe[weight].cumsum()
cutoff = max(cum_sum)/n
quantile = cum_sum/cutoff
quantile[-1:] -= 1
return quantile.map(int)
Is there an easier way, or something prebuilt from Pandas that I'm missing?
Edit: As requested, I'm providing some sample data. In the following, I'm trying to bin the "Var" variable using "Weight" as the weight. Using pd.qcut, we get an equal number of observations in each bin. Instead, I want an equal weight in each bin, or in this case, as close to equal as possible.
Weight Var pd.qcut(n=5) Desired_Rslt
10 1 0 0
14 2 0 0
18 3 1 0
15 4 1 1
30 5 2 1
12 6 2 2
20 7 3 2
25 8 3 3
29 9 4 3
45 10 4 4
I don't think this is built-in to Pandas, but here is a function that does what you want in a few lines:
import numpy as np
import pandas as pd
from pandas._libs.lib import is_integer
def weighted_qcut(values, weights, q, **kwargs):
'Return weighted quantile cuts from a given series, values.'
if is_integer(q):
quantiles = np.linspace(0, 1, q + 1)
else:
quantiles = q
order = weights.iloc[values.argsort()].cumsum()
bins = pd.cut(order / order.iloc[-1], quantiles, **kwargs)
return bins.sort_index()
We can test it on your data this way:
data = pd.DataFrame({
'var': range(1, 11),
'weight': [10, 14, 18, 15, 30, 12, 20, 25, 29, 45]
})
data['qcut'] = pd.qcut(data['var'], 5, labels=False)
data['weighted_qcut'] = weighted_qcut(data['var'], data['weight'], 5, labels=False)
print(data)
The output matches your desired result from above:
var weight qcut weighted_qcut
0 1 10 0 0
1 2 14 0 0
2 3 18 1 0
3 4 15 1 1
4 5 30 2 1
5 6 12 2 2
6 7 20 3 2
7 8 25 3 3
8 9 29 4 3
9 10 45 4 4
I have a large set (thousands) of smooth lines (series of x,y pairs) with different sampling of x and y and different length for each line, i.e.
x_0 = {x_00, x_01, ..., } # length n_0
x_1 = {x_10, x_11, ..., } # length n_1
...
x_m = {x_m0, x_m1, ..., } # length n_m
y_0 = {y_00, y_01, ..., } # length n_0
y_1 = {y_10, y_11, ..., } # length n_1
...
y_m = {y_m0, y_m1, ..., } # length n_m
I want to find cumulative properties of each line interpolated to a regular set of x points, i.e. x = {x_0, x_1 ..., x_n-1}
Currently I'm for-looping over each line, creating an interpolant, resampling, and then taking the sum/median/whatever of that result. It works, but it's really slow. Is there any way to vectorize / matrisize this operation?
I was thinking, since linear interpolation can be a matrix operation, perhaps it's possible. At the same time, since each row can have a different length... it might be complicated. Edit: but zero padding the shorter arrays would be easy...
What I'm doing now looks something like,
import numpy as np
import scipy as sp
import scipy.interpolate
...
# `xx` and `yy` are lists of lists with the x and y points respectively
# `xref` are the reference x values at which I want interpolants
yref = np.zeros([len(xx), len(xref)])
for ii, (xi, yi) in enumerate(zip(xx, yy)):
yref[ii] = sp.interp(xref, xi, yi)
y_med = np.median(yref, axis=-1)
y_sum = np.sum(yref, axis=-1)
...
Hopefully, you can adjust the following for your purposes.
I included pandas because it has an interpolation feature to fill in missing values.
Setup
import pandas as pd
import numpy as np
x = np.arange(19)
x_0 = x[::2]
x_1 = x[::3]
np.random.seed([3,1415])
y_0 = x_0 + np.random.randn(len(x_0)) * 2
y_1 = x_1 + np.random.randn(len(x_1)) * 2
xy_0 = pd.DataFrame(y_0, index=x_0)
xy_1 = pd.DataFrame(y_1, index=x_1)
Note:
x is length 19
x_0 is length 10
x_1 is length 7
xy_0 looks like:
0
0 -4.259448
2 -0.536932
4 0.059001
6 1.481890
8 7.301427
10 9.946090
12 12.632472
14 14.697564
16 17.430729
18 19.541526
xy_0 can be aligned with x via reindex
xy_0.reindex(x)
0
0 -4.259448
1 NaN
2 -0.536932
3 NaN
4 0.059001
5 NaN
6 1.481890
7 NaN
8 7.301427
9 NaN
10 9.946090
11 NaN
12 12.632472
13 NaN
14 14.697564
15 NaN
16 17.430729
17 NaN
18 19.541526
we can then fill in missing with interpolate
xy_0.reindex(x).interpolate()
0
0 -4.259448
1 -2.398190
2 -0.536932
3 -0.238966
4 0.059001
5 0.770445
6 1.481890
7 4.391659
8 7.301427
9 8.623759
10 9.946090
11 11.289281
12 12.632472
13 13.665018
14 14.697564
15 16.064147
16 17.430729
17 18.486128
18 19.541526
What about xy_1
xy_1.reindex(x)
0
0 -1.216416
1 NaN
2 NaN
3 3.704781
4 NaN
5 NaN
6 5.294958
7 NaN
8 NaN
9 8.168262
10 NaN
11 NaN
12 10.176849
13 NaN
14 NaN
15 14.714924
16 NaN
17 NaN
18 19.493678
Interpolated
xy_0.reindex(x).interpolate()
0
0 -1.216416
1 0.423983
2 2.064382
3 3.704781
4 4.234840
5 4.764899
6 5.294958
7 6.252726
8 7.210494
9 8.168262
10 8.837791
11 9.507320
12 10.176849
13 11.689541
14 13.202233
15 14.714924
16 16.307842
17 17.900760
18 19.493678
I am having trouble getting this python code to work right. it is a code to display pascal's triangle using binomials. I do not know what is wrong. The code looks like this
from math import factorial
def binomial (n,k):
if k==0:
return 1
else:
return int((factorial(n)//factorial(k))*factorial(n-k))
def pascals_triangle(rows):
rows=20
for n in range (0,rows):
for k in range (0,n+1):
print(binomial(n,k))
print '\n'
This is what it keeps printing
1
1 1
1
2
1
1
12
3
1
1
144
24
4
1
1
2880
360
40
5
1
1
86400
8640
720
60
6
1
1
3628800
302400
20160
1260
and on and on. any help would be welcomed.!!
from math import factorial
def binomial (n,k):
if k==0:
return 1
else:
return int((factorial(n)//factorial(k))*factorial(n-k))
def pascals_triangle(rows):
for n in range (rows):
l = [binomial(n, k) for k in range (0,n+1)]
print l
pascals_triangle(5)
output:
[1]
[1, 1]
[1, 2, 1]
[1, 12, 3, 1]
[1, 144, 24, 4, 1]
there are many wrong things.
The first one is the way you compute the values : if building a pascal triangle, you want to use the previous line to compute the current one, and not use the binomial computation (which is expensive due to the number of multiplications).
then by default, print appends a "\n"
Correct implementation:
def print_line(x):
print (" ".join(map(str,x)))
def pascals_triangle(rows):
cur_line=[1,1]
for x in range (2,rows):
new_line=[1]
for n in range (0,len(cur_line)-1):
new_line.append(cur_line[n]+cur_line[n+1])
new_line.append(1)
print_line (new_line)
cur_line=new_line
this provides the following output
$ python pascal.py
1 2 1
1 3 3 1
1 4 6 4 1
1 5 10 10 5 1
1 6 15 20 15 6 1
1 7 21 35 35 21 7 1
1 8 28 56 70 56 28 8 1
1 9 36 84 126 126 84 36 9 1
Your binomial function had a small bracketing mistake in it, which was giving you incorrect output:
from math import factorial
def binomial(n, k):
if k==0:
return 1
else:
return int((factorial(n)/(factorial(k)*factorial(n-k))))
def pascals_triangle(rows, max_width):
for n in range (0,rows):
indent = (rows - n - 1) * max_width
print(' ' * indent, end='')
for k in range(0, n+1):
print("{:^{w}}".format(binomial(n, k), w = max_width*2), end='')
print()
pascals_triangle(7, 2)
With the addition of a padding parameter, the output can be made to look like this:
1
1 1
1 2 1
1 3 3 1
1 4 6 4 1
1 5 10 10 5 1
1 6 15 20 15 6 1
I'm trying to bin a sample of observations into n discrete groups, then combine these groups until each subgroup has a mimimum of 6 members. So far, I've generated bins, and grouped my DataFrame into them:
# df is a DataFrame containing 135 measurments
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
grp.size()
1 4
2 1
3 2
4 3
5 2
6 8
7 7
8 6
9 19
10 12
11 13
12 12
13 7
14 12
15 12
16 2
17 3
18 6
19 3
21 1
So I can see that I need to combine groups 1 - 3, 3 - 5, and 16 - 21, while leaving the others intact, but I don't know how to do this programmatically.
You can do this:
df = pd.DataFrame(np.random.random_integers(1,200,135), columns=['heights'])
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
sizes = grp.size()
def f(vals, max):
sum = 0
group = 1
for v in vals:
sum += v
if sum <= max:
yield group
else:
group +=1
sum = v
yield group
#I've changed 6 by 30 for the example cause I don't have your original dataset
grp.size().groupby([g for g in f(sizes, 30)])
And if you do print grp.size().groupby([g for g in f(sizes, 30)]).cumsum() you will see that the cumulative sums is grouped as expected.
Also if you want to group the original values you can do something like:
dat = np.random.random_integers(0,200,135)
dat = np.array([78,116,146,111,147,78,14,91,196,92,163,144,107,182,58,89,77,134,
83,126,94,70,121,175,174,88,90,42,93,131,91,175,135,8,142,166,
1,112,25,34,119,13,95,182,178,200,97,8,60,189,49,94,191,81,
56,131,30,107,16,48,58,65,78,8,0,11,45,179,151,130,35,64,
143,33,49,25,139,20,53,55,20,3,63,119,153,14,81,93,62,162,
46,29,84,4,186,66,90,174,55,48,172,83,173,167,66,4,197,175,
184,20,23,161,70,153,173,127,51,186,114,27,177,96,93,105,169,158,
83,155,161,29,197,143,122,72,60])
df = pd.DataFrame({'heights':dat})
bins = np.digitize(dat,np.linspace(0,200,21))
grp = df.heights.groupby(bins)
m = 15 #you should put 6 here, the minimun
s = 0
c = 1
def f(x):
global c,s
res = pd.Series([c]*x.size,index=x.index)
s += x.size
if s>m:
s = 0
c += 1
return res
g = grp.apply(f)
print df.groupby(g).size()
#another way of doing the same, just a matter of taste
m = 15 #you should put 6 here, the minimun
s = 0
c = 1
def f2(x):
global c,s
res = [c]*x.size #here is the main difference with f
s += x.size
if s>m:
s = 0
c += 1
return res
g = grp.transform(f2) #call it this way
print df.groupby(g).size()