Partitioning data in a pandas dataframe - python

class Question:
    def __init__(self, column, value):
        self.column = column  # storing a column number
        self.value = value    # storing a column value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

def partition(df, question):
    true_rows, false_rows = [], []
    for row in df:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows
TypeError Traceback (most recent call last)
<ipython-input-53-386e8df97e85> in <module>
----> 2 true_rows, false_rows = partition(training_data, Question(0,1))
3 true_rows
<ipython-input-52-9ff7f19eff20> in partition(df, question)
20 true_rows, false_rows = [],[]
21 for row in df:
---> 22 if question.match(row):
23 true_rows.append(row)
24 else:
<ipython-input-12-928374ee6f4e> in match(self, example)
14 #feature value in the question
15
---> 16 val = example[self.column]
17 if is_numeric(val):
18 return val >= self.value
TypeError: 'int' object is not subscriptable
I'm trying to adapt this code from a decision tree so that it can be applied to a pandas DataFrame instead of a list, but I get the error "TypeError: 'int' object is not subscriptable". How do I return a count or a list of the rows in the DataFrame that were true or false without getting the error? I know that I'm not iterating through the DataFrame properly. All help is really appreciated!

The error is likely in your iterator. for row in df does not iterate over the rows. It is equivalent to for column_name in df.columns. What you probably want is the following to get all the rows.
for loc, row in df.iterrows():
    ...
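For example, here is a minimal sketch of your partition function adapted to iterrows (assuming the Question class above is unchanged). Note that each row comes back as a Series indexed by column labels, so a positional column number like Question(0, 1) should be read with .iloc inside match:
def partition(df, question):
    true_rows, false_rows = [], []
    for loc, row in df.iterrows():
        # row is a Series; with a positional column number, match should use
        # row.iloc[self.column] rather than row[self.column]
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows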
That said, you will get better performance if you instead create a column for this.
def partition(df, question_column, true_value):
    view_of_values = df[question_column]
    example = view_of_values.iloc[0]  # .iloc is positional; plain [0] only works with a default RangeIndex
    if is_numeric(example):
        is_true = view_of_values >= true_value
    else:
        is_true = view_of_values == true_value
    # is_true is a Series of True/False values, one per row.
    # You could store it with df['question_A_is_true'] = is_true
    # for partitioning later.
    return is_true
df['question_A_is_true'] = partition(df, 'question_A', 'the_truth_is_out_there')
Then you can do things like df.query('question_A_is_true') or df.loc[df['question_A_is_true']] to get just the true rows. And if you have multiple questions you can then do combinations of them that are very fast, because they use numpy under the hood.
df.loc[
    (df['question_A_is_true'])
    | (df['question_B_is_true'] & ~df['question_C_is_true'])
]
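Python's or/and/not try to collapse a whole Series into a single truth value and raise "ValueError: The truth value of a Series is ambiguous", which is why the bitwise |, & and ~ (with each operand parenthesized) are needed here. A minimal runnable sketch, assuming the three boolean columns exist:
import pandas as pd

df = pd.DataFrame({'question_A_is_true': [True, False, False],
                   'question_B_is_true': [False, True, False],
                   'question_C_is_true': [False, False, False]})
mask = df['question_A_is_true'] | (df['question_B_is_true'] & ~df['question_C_is_true'])
print(df.loc[mask])  # the first two rows match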

Related

Remove following rows that are above or under by X amount from the current row['x']

I am calculating correlations, and the data frame I have needs to be filtered.
I am looking to remove any subsequent rows whose value is within X of the current row's value, starting with the first row and looping through the dataframe all the way to the last row.
example:
df['y'] has the values 50,51,52,53,54,55,70,71,72,73,74,75
if X = 10 it would start at 50 and see 51,52,53,54,55 as within that ±10 range and delete those rows. 70 would stay, as it is not within that range, and the same test would start again at 70, where 71,72,73,74,75 and their respective rows would be deleted.
The filter with X=10 would thus leave us with the rows containing 50 and 70.
It would leave me with a clean dataframe that drops the instances linked to the first instance of what is essentially the same observed period. I tried coding a loop to do that, but I am left with the wrong result and am desperate at this point. Hopefully someone can correct the mistake or point me in the right direction.
df6['index'] = df6.index
df6.sort_values('index')
boom = len(dataframe1.index)/3
#Taking initial comparison values from first row
c = df6.iloc[0]['index']
#Including first row in result
filters = [True]
#Skipping first row in comparisons
for index, row in df6.iloc[1:].iterrows():
    if c-boom <= row['index'] <= c+boom:
        filters.append(False)
    else:
        filters.append(True)
        # Updating values to compare based on latest accepted row
        c = row['index']
df2 = df6.loc[filters].sort_values('correlation').drop('index', 1)
df2
(Screenshots of the output before and after the filter were attached to the original post.)
IIUC, your main issue is to filter consecutive values within a threshold.
You can use a custom function for that, one that acts on a Series (= a column) and returns the list of valid indices:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = []
    for i, val in s.items():  # Series.iteritems() was removed in pandas 2.0; items() works everywhere
        if val - prev > threshold:
            idx.append(i)
            prev = val
    return idx
Example of use:
import pandas as pd
df = pd.DataFrame({'y': [50,51,52,53,54,55,70,71,72,73,74,75]})
df2 = df.loc[consecutive(df['y'])]
Output:
y
0 50
6 70
variant
If you prefer the function to return a boolean indexer, here is a variant:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = [False] * len(s)
    for i, val in s.items():  # assumes a default RangeIndex, so i is also a position
        if val - prev > threshold:
            idx[i] = True
            prev = val
    return idx
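Usage mirrors the first version, except the boolean list is used directly as a mask (same df as above):
mask = consecutive(df['y'])
df2 = df[mask]  # boolean indexing keeps only the rows flagged True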

How to iterate over rows of each column in a dataframe

My current code functions and produces a graph if there is only 1 sensor, i.e. if col2 and col3 are deleted in the example data provided below, leaving one column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise=True)
index = df.index  #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows)     #initialize baseline value
baseline = pd.DataFrame(baseline)          # was DataFrame(baseline); needs the pd. prefix
baseline_num = pd.DataFrame(baseline_num)  # likewise
v = [None]*(rows) #Initialize an empty array v[] equal to the number of rows in the .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8   #how many offs until baseline is updated
sensitivity = 1000

for i in range(0, rows): #iterates over each index value, i.e. each row, and sums the values into a list
    v[i] = dfn.loc[i].to_numpy().sum() - sensors

for colname, colitems in df.iteritems(): # note: iteritems() became items() in pandas 2.0
    for rownum, rowitem in colitems.iteritems():
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, off_require):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #just an optimization: one calculation per loop once the first window is established
        baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points

dfx = pd.DataFrame(v, columns=['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns=['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) #plots the dfx dataframe, which contains the correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
Basically, instead of the single graph this produces, I would like to iterate over each column, but only for this loop:
for colname, colitems in df.iteritems():
    for rownum, rowitem in colitems.iteritems():
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, off_require):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #just an optimization: one calculation per loop once the first window is established
I've tried some other solutions from other questions but none of them seem to solve this case.
As of now, I've tried multiple conversions to things like lists and tuples, and then calling them something like this:
baseline_num[i,column] += d[i - x,column]
as well as
baseline_num[i][column] += d[i - x][column]
while iterating over the loop using
for column in columns
However, no matter how I arrange the solution, there is always some KeyError, or an error about expecting integer or slice indices, among other errors.
See pictures for expected/possible outputs of one column on actual data, with varying input parameters (the sensitivity value and off_require are varied in different cases).
One such solution which didn't work was the looping method from this link:
https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
I've also tried creating a loop using iteritems as the outer loop. That did not work either.
Below are links to possible graph outputs for various sensitivity values and windows in my actual dataset, with only one column (i.e. I manually deleted the other columns and plotted just the one using the current program):
sensitivity 1000, window 8
sensitivity 800, window 5
sensitivity 1500, window 5
If there's anything I've left out that would be helpful to solving this, please let me know so I can rectify it immediately.
See this picture for my original df.head:
df.head
Did you try,
for colname, colitems in df.iteritems():
    for rownum, rowitem in colitems.iteritems():
        print(df[colname][rownum])
The first loop iterates over all the columns, and the second loop iterates over all the rows for that column.
edit:
From our conversation below, I think that your baseline and df dataframes don't have the same column names because of how you created them and how you are accessing the elements.
My suggestion is that you create the baseline dataframe to be a copy of your df dataframe and edit the information within it from there.
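For example, a minimal sketch of that suggestion, using pandas' copy() so the new frames share df's column names but not its data:
baseline = df.copy()      # same shape, index and column names as df
baseline_num = df.copy()  # an independent copy, so edits don't touch df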
Edit:
I have managed to make your code work for one loop, but I run into an index error. I am not sure what your optimisation step does, but I think that is what is causing it; take a look.
It is this part: baseline_num[colname][rownum - 1]. In the second loop, because you compute rownum (0) - 1, you get index -1. You need to change it so that the first loop starts at rownum 1, or handle that row differently; I am not sure what you are trying to do there.
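One possible guard, as a sketch rather than a full fix (it simply skips the look-back when there is no previous row, i.e. at each column's first row):
elif off_count > off_require and rownum > 0:
    # rownum - 1 would be -1 on a column's first row and raise KeyError
    baseline_num[colname][rownum] += (baseline_num[colname][rownum - 1]
                                      + df[colname][rownum]
                                      - df[colname][rownum - off_require])
The full code I ran is below: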
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

d = {'col1': [-2587.944231, -1897.324231,-2510.304231,-2203.814231,-2105.734231,-2446.964231,-2963.904231,-2177.254231, 2796.354231,-2085.304231], 'col2': [-3764.468462,-3723.608462,-3750.168462,-3694.998462,-3991.268462,-3972.878462,3676.608462,-3827.808462,-3629.618462,-1841.758462,], 'col3': [-166.1357692,-35.36576923, 321.4157692,108.9257692,-123.2257692, -10.84576923, -100.7457692, 89.27423077, -211.0857692, 101.5342308]}
df = pd.DataFrame(data=d)
sensors = 3
window_size = 5
dfn = df.rolling(window_size).corr(pairwise=True)
index = df.index  #index of values in the data frame.
rows = len(index) #len(index) returns number of rows in the data.
sensors = 3
baseline_num = [0]*(rows) #baseline numerator, by default zero
baseline = [0]*(rows)     #initialize baseline value
baseline = pd.DataFrame(df)
baseline_num = pd.DataFrame(df)
#print(baseline_num)
v = [None]*(rows) #Initialize an empty array v[] equal to the number of rows in the .csv file
s = [None]*(rows) #Initialize another empty array for the slope values for detecting when there is an exposure
d = [0]*(rows)
sensors_on = True #Is the sensor detecting something (True) or not (False).
off_count = 0
off_require = 8   #how many offs until baseline is updated
sensitivity = 1000

for i in range(0, rows): #iterates over each index value, i.e. each row, and sums the values into a list
    v[i] = dfn.loc[i].to_numpy().sum() - sensors

for colname, colitems in df.iteritems():
    #print(colname)
    for rownum, rowitem in colitems.iteritems():
        #print(rownum)
        #display(baseline[colname][rownum])
        #d[rownum] = dfone.loc[rownum].to_numpy()
        #d[colname][rownum] = df.loc[colname][rownum]
        if v[rownum] >= sensitivity:
            sensors_on = True
            off_count = 0
            baseline_num[rownum] = 0
        else:
            sensors_on = False
            off_count += 1
            if off_count == off_require:
                for x in range(0, off_require):
                    baseline_num[colname][rownum] += df[colname][rownum - x]
            elif off_count > off_require:
                baseline_num[colname][rownum] += baseline_num[colname][rownum - 1] + df[colname][rownum] - (df[colname][rownum - off_require]) #just an optimization: one calculation per loop once the first window is established
        baseline[colname][rownum] = ((baseline_num[colname][rownum])//(off_require)) #mean of the last "off_require" points
        print(baseline[colname][rownum])

dfx = pd.DataFrame(v, columns=['Sensor Correlation']) #converts the summed correlation tables back from list format to a DataFrame, with the sole column name 'Sensor Correlation'
dft = pd.DataFrame(baseline, columns=['baseline'])
dft = dft.astype(float)
dfx.plot(figsize=(50,25), linewidth=5, fontsize=40) #plots the dfx dataframe, which contains the correlated and summed data
dft.plot(figsize=(50,25), linewidth=5, fontsize=40)
My output looks like this,
-324.0
-238.0
-314.0
-276.0
-264.0
-306.0
-371.0
-806.0
638.0
-412.0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
354 try:
--> 355 return self._range.index(new_key)
356 except ValueError as err:
ValueError: -1 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
355 return self._range.index(new_key)
356 except ValueError as err:
--> 357 raise KeyError(key) from err
358 raise KeyError(key)
359 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: -1
I don't have enough rep to comment, but below is what I was able to work out. Hope it helps!
I tried to use the to_list() function while working out an answer, and it threw me an error:
AttributeError: 'DataFrame' object has no attribute 'to_list'
So, I decided to circumvent that method and came up with this:
indexes = [x for x in df.index]
row_vals = []
for index in indexes:
    for val in df.loc[index].values:  # the original had df.iloc[i], but i is undefined; the loop variable is index
        row_vals.append(val)
The object row_vals will contain all values in row order.
If you only want to get the row values for a particular row or set of rows, you would need to do this:
indx_subset = [1, 2, 5, 6]  # your list of row indices
row_vals = []
for indx in indx_subset:
    for val in df.loc[indx].values:
        row_vals.append(val)
row_vals will then have all the row values from the specified indices.
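As an aside, the AttributeError above arises because to_list() is a Series method, not a DataFrame method. A sketch of two alternatives, assuming the df from the question:
col_vals = df['col1'].to_list()  # per column: Series.to_list() exists
all_rows = df.values.tolist()    # whole frame as a list of row lists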

while-loop: list index out of range filtering dataframe

Essentially I have a list full of numerical identifiers -- I use these numerical identifiers as a condition to filter down a dataframe, then once the df is filtered down, I am attempting to store the length of the filtered down dataframe as values in a new, separate dataframe.
I am using the last value in my list of numerical identifiers (ex. list[-1]) as the stopping point for my loop -- I did this so the loop would run through all the identifiers and finish after it has gone through the last one -- I assume this is potentially where the issue is.
My code is spitting out all the correct lengths as it goes through all the unique numerical identifiers in the list -- however, it is still giving me an index out of range error (shown below).
def get_frames(U_id):
    k = sorted(df.trackId.unique())
    #k is the sorted list of unique numerical identifiers
    i = 0
    maximum = k[-1] #using the final value in the list as the stopping point for the loop
    while i <= maximum:
        condition = df.trackId == k[i]
        df2 = df[condition]
        values = print(len(df2))
        df2 = pd.DataFrame({U_id: values}, index=[i])
        i += 1
    return df2

get_frames('1CCM0701')
get_frames('1CCM0701')
36
18
37
4
33
25
27
49
46
12
45
24
4
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-120-3252dfb603ae> in <module>
13 return df2
14
---> 15 get_frames('1CCM0701')
<ipython-input-120-3252dfb603ae> in get_frames(U_id)
6 maximum = k[-1]
7 while i <= maximum:
----> 8 condition = df.trackId == k[i]
9 df2 = df[condition]
10 values = print(len(df2))
IndexError: list index out of range
The issue here is that you are using k[-1] as the stopping point while your iterating variable accesses the list through an index. k[-1] is 36, which is obviously unrelated to the length of your list. Instead, you should use a for loop, or compare i to the length of the list instead.
Pythonic for loop:
for i, val in enumerate(k):
    condition = df.trackId == val
    df2 = df[condition]
    values = len(df2)  # note: values = print(len(df2)) would store None, since print() returns None
    df2 = pd.DataFrame({U_id: values}, index=[i])
Traditional for loop:
for i in range(len(k)):
    condition = df.trackId == k[i]
    df2 = df[condition]
    values = len(df2)
    df2 = pd.DataFrame({U_id: values}, index=[i])
While loop:
arrLen = len(k)
while i < arrLen:
    condition = df.trackId == k[i]
    df2 = df[condition]
    values = len(df2)
    df2 = pd.DataFrame({U_id: values}, index=[i])
    i += 1
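Separately, note that df2 is rebuilt on every pass, so only the last identifier's count survives the loop. If the goal is one row per identifier, a sketch of a simpler way, assuming df and U_id as in the question:
# value_counts() counts rows per unique trackId in one shot
df2 = df['trackId'].value_counts().sort_index().to_frame(name=U_id)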

Iterating through a dataframe and adding a new row

I want to add a new column to my existing dataframe.
I am doing this,
def test(self, sess, df):
    for index, frame in df.iterrows():
        medical_plan = sess.query(MedicalPlan.id).filter(
            MedicalPlan.issuer_id == frame['issuer_id'],
            MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
            MedicalPlan.plan_year == frame['plan_year'],
            MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type']).first()
        sess.commit()
        frame['medical_plan_id'] = list(medical_plan)[0]
        df = df.append(frame)
    print(df)
The df before the loop is:
wellthie_issuer_identifier ... service_area_id
0 UHC99806 ... 1
[1 rows x 106 columns]
Normally, the column and its data should just be added to this one row, but I am getting 2 rows instead, with only the last loop value inserted.
In the df after the loop, the column is getting created but the data is wrong:
wellthie_issuer_identifier ... medical_plan_id
0 UHC99806 ... NaN
0 UHC99806 ... 879519.0
[2 rows x 107 columns]
How do I achieve this? The output I should get is as below:
wellthie_issuer_identifier ... service_area_id medical_plan_id
0 UHC99806 ... 1 879519.0
[1 rows x 107 columns]
try 1:
I called the get_id method like below:
def test(self, sess, df):
    print("in test", df)
    for index, frame in df.iterrows():
        id = self.get_id(sess, frame)
        df['medical_plan_id'] = df.apply(id, axis=1)
    print(df)
def test(self, sess, df):
    def get_id(frame):
        medical_plan = sess.query(MedicalPlan.id).filter(
            MedicalPlan.issuer_id == frame['issuer_id'],
            MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
            MedicalPlan.plan_year == frame['plan_year'],
            MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type']).first()
        sess.commit()
        return list(medical_plan)[0]
    df['medical_plan_id'] = df.apply(get_id, axis=1)
    print(df)
If you want medical_plan_id to be an int, you can change the last line of get_id to return int(list(medical_plan)[0]). Also, you probably could do
medical_plan = sess.query(MedicalPlan.id).filter(
    all([MedicalPlan.attribute == frame.attribute for attribute in
         ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']])).first()
or
attributes = ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']
medical_plan = sess.query(MedicalPlan.id).filter(all(MedicalPlan[attributes] == frame[attributes])).first()
(I can't say for certain whether that will work without knowing what kind of object MedicalPlan is.)
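For what it's worth, the built-in all() won't build a SQL expression. With SQLAlchemy, the usual pattern is to build a list of comparisons and unpack it into filter(), which ANDs them together. A sketch, assuming MedicalPlan is a SQLAlchemy mapped class as the question implies:
attributes = ['issuer_id', 'hios_plan_identifier', 'plan_year',
              'group_or_individual_plan_type']
conditions = [getattr(MedicalPlan, attr) == frame[attr] for attr in attributes]
medical_plan = sess.query(MedicalPlan.id).filter(*conditions).first()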

Loop through a list for dataframe column plot in matplotlib

I have been trying to loop through this list of 19,000 tuples and have matplotlib plot them according to the key values of columns in the data frame, but I could not plot it out.
import os
import pandas as pd
os.chdir('/home/xyzcsv')
%matplotlib inline
from pylab import *

with open('list_tuple.txt','rb') as file:
    a = file.readlines()
df = pd.read_csv('20130831_000000.csv')

def createtuple(cola, colb):
    names = df.cola
    names1 = df.colb
    X = []
    y = []
    for i in range(len(names)):
        if names[i] <= float(0) or names1[i] <= float(0):
            pass
        else:
            X.append([names1[i], names[i]])
            y.append(i+1)
    X = np.array(X)
    y = np.array(y)
    return (X, y)

def plotgraph():
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:,0], X[:,1], c=y.astype(np.float), alpha=.5)
    plt.show()

for i in range(len(a)):
    b = a[i].split("'")
    (X, y) = createtuple(b[1], b[3])
    plotgraph()
The error I got is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-12-11551e9515fa> in <module>()
31 b = a[i].split("'")
32 print b
---> 33 (X,y) = createtuple(b[1],b[3])
34 plotgraph()
<ipython-input-12-11551e9515fa> in createtuple(cola, colb)
9
10 def createtuple(cola,colb):
---> 11 names = df.cola
12 names1 = df.colb
13 X = []
/home/bigdata/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
2667 if name in self._info_axis:
2668 return self[name]
-> 2669 return object.__getattribute__(self, name)
2670
2671 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'cola'
How do I call a dataframe column using a string variable?
Use subscripting to return the column by string name:
names = df[cola]
names1 = df[colb]
Trying to access the columns as attributes fails, because cola is not an attribute; if you pass the column name to the subscript operator instead, it will return that column.
Also, I strongly advise getting into the habit of never accessing columns as attributes, as it will yield strange behaviour, especially if a column name matches an existing attribute or method. For instance, if you have a column named sum, then calling df.sum returns the bound method sum() rather than the column.
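A quick demonstration of that pitfall (a minimal sketch):
import pandas as pd

df = pd.DataFrame({'sum': [1, 2, 3]})
print(df['sum'])  # the column, as expected
print(df.sum)     # the bound DataFrame.sum method, not the column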
