Iterating through a dataframe and adding a new row - python

I want to add a new column to my existing dataframe.
I am doing this:
def test(self, sess, df):
    for index, frame in df.iterrows():
        medical_plan = sess.query(MedicalPlan.id).filter(
            MedicalPlan.issuer_id == frame['issuer_id'],
            MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
            MedicalPlan.plan_year == frame['plan_year'],
            MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type']).first()
        sess.commit()
        frame['medical_plan_id'] = list(medical_plan)[0]
        df = df.append(frame)
    print(df)
The df before the loop is:
  wellthie_issuer_identifier  ... service_area_id
0                   UHC99806  ...               1

[1 rows x 106 columns]
Normally the column and data should be added to this single row, but I am getting 2 rows instead, with only the last loop value inserted.
df after the loop (the column is getting created but the data is wrong):
  wellthie_issuer_identifier  ... medical_plan_id
0                   UHC99806  ...             NaN
0                   UHC99806  ...        879519.0

[2 rows x 107 columns]
How do I achieve this? The output I should get is below:
  wellthie_issuer_identifier  ... service_area_id  medical_plan_id
0                   UHC99806  ...               1         879519.0

[1 rows x 107 columns]
Try 1:
I called the get_id method like below:
def test(self, sess, df):
    print("in test", df)
    for index, frame in df.iterrows():
        id = self.get_id(sess, frame)
        df['medical_plan_id'] = df.apply(id, axis=1)
    print(df)

def test(self, sess, df):
    def get_id(frame):
        medical_plan = sess.query(MedicalPlan.id).filter(
            MedicalPlan.issuer_id == frame['issuer_id'],
            MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
            MedicalPlan.plan_year == frame['plan_year'],
            MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type']).first()
        sess.commit()
        return list(medical_plan)[0]
    df['medical_plan_id'] = df.apply(get_id, axis=1)
    print(df)
If you want medical_plan_id to be an int, you can change the last line of get_id to return int(list(medical_plan)[0]). Also, you probably could do:
medical_plan = sess.query(MedicalPlan.id).filter(
    all([getattr(MedicalPlan, attribute) == frame[attribute] for attribute in
         ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']])).first()
or
attributes = ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']
medical_plan = sess.query(MedicalPlan.id).filter(all(MedicalPlan[attributes] == frame[attributes])).first()
(I can't say for certain whether that will work without knowing what kind of object MedicalPlan is.)
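If MedicalPlan is a SQLAlchemy model (which the sess.query(...).filter(...) calls suggest), the reliable way to build this kind of dynamic filter is to unpack a list of conditions into filter() rather than wrapping them in Python's all(), which does not compose SQL expressions. A sketch under that assumption:
attributes = ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']
# one SQLAlchemy condition per attribute, ANDed together by filter(*...)
conditions = [getattr(MedicalPlan, attr) == frame[attr] for attr in attributes]
medical_plan = sess.query(MedicalPlan.id).filter(*conditions).first()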


Creating a Pandas dataframe column which is conditional on a function

Say I have some dataframe like below and I create a new column (track_len) which gives the length of each track_no string.
import pandas as pd
df = pd.DataFrame({'item_id': [1,2,3], 'track_no': ['qwerty23', 'poiu2', 'poiuyt5']})
df['track_len'] = df['track_no'].str.len()
df.head()
My Question is:
How do I now create a new column (new_col) which selects a specific subset of the track_no string and outputs that depending on the length of the track number (track_len)?
I have tried creating a function which outputs the specific string slice of track_no given the various track_len conditions, and then using an apply method to create the column, but it doesn't work. The code is below:
Tried:
def f(row):
    if row['track_len'] == 8:
        val = row['track_no'].str[0:3]
    elif row['track_len'] == 5:
        val = row['track_no'].str[0:1]
    elif row['track_len'] == 7:
        val = row['track_no'].str[0:2]
    return val

df['new_col'] = df.apply(f, axis=1)
df.head()
Thus the desired output should be (based on string slicing output of f):
Output
{new_col: ['qwe', 'p', 'po']}
If there are alternative better solutions to this problem those would also be appreciated.
Your function works well; you just need to remove the .str part in your if blocks, since the values are already strings:
def f(row):
    if row['track_len'] == 8:
        val = row['track_no'][:3]
    elif row['track_len'] == 5:
        val = row['track_no'][:1]
    elif row['track_len'] == 7:
        val = row['track_no'][:2]
    return val

df['new_col'] = df.apply(f, axis=1)
df.head()
#Output:
   item_id  track_no  track_len new_col
0        1  qwerty23          8     qwe
1        2     poiu2          5       p
2        3   poiuyt5          7      po
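Since the question also asks for alternatives: each expected track length maps to a fixed slice size, so the branching can be replaced by a lookup table. A sketch (slice_map is an illustrative name; unmatched lengths fall back to an empty string here, whereas f would raise an error for them):
# map each expected track length to the number of leading characters to keep
slice_map = {8: 3, 5: 1, 7: 2}
df['new_col'] = [track[:slice_map.get(n, 0)] for track, n in zip(df['track_no'], df['track_len'])]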

Partitioning data in a pandas dataframe

class Question:
    def __init__(self, column, value):
        self.column = column  # storing a column number
        self.value = value    # storing a column value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

def partition(df, question):
    true_rows, false_rows = [], []
    for row in df:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows
TypeError                                 Traceback (most recent call last)
<ipython-input-53-386e8df97e85> in <module>
----> 2 true_rows, false_rows = partition(training_data, Question(0,1))
      3 true_rows

<ipython-input-52-9ff7f19eff20> in partition(df, question)
     20     true_rows, false_rows = [],[]
     21     for row in df:
---> 22         if question.match(row):
     23             true_rows.append(row)
     24         else:

<ipython-input-12-928374ee6f4e> in match(self, example)
     14         #feature value in the question
     15
---> 16         val = example[self.column]
     17         if is_numeric(val):
     18             return val >= self.value

TypeError: 'int' object is not subscriptable
I'm trying to adapt this code from a Decision Tree so that it can be applied to a pandas dataframe instead of a list, and I get the error "TypeError: 'int' object is not subscriptable". How do I return a count or a list of the rows in the dataframe that were true or false without getting the error? I know that I'm not iterating through the dataframe properly. All help is really appreciated!
The error is likely in your iterator. for row in df does not iterate over the rows; it is equivalent to for column_name in df.columns. What you probably want is the following to get all the rows:
for loc, row in df.iterrows():
...
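Applied to the partition function above, that change alone makes the row-wise version work. A sketch, keeping the original Question/match interface (each row from iterrows is a Series indexed by column name, so Question(0, 1) relies on positional access; example.iloc[self.column] would make that explicit):
def partition(df, question):
    true_rows, false_rows = [], []
    for _, row in df.iterrows():
        # row is a pandas Series holding one row's values
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows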
That said, you will get better performance if you instead create a column for this.
def partition(df, question_column, true_value):
    view_of_values = df[question_column]
    example = view_of_values.iloc[0]  # first value, accessed positionally
    if is_numeric(example):
        is_true = view_of_values >= true_value
    else:
        is_true = view_of_values == true_value
    # is_true is a boolean Series (True/False per row); you could store it,
    # e.g. df['question_A_is_true'] = is_true, for partitioning later
    return is_true

df['question_A_is_true'] = partition(df, 'question_A', 'the_truth_is_out_there')
Then you can do things like df.query('question_A_is_true') or df.loc[df['question_A_is_true']] to get just the true rows. And if you have multiple questions you can then do combinations of them that are very fast because they use numpy under the hood (note that boolean Series are combined with |, & and ~ rather than or, and, not):
df.loc[
    (df['question_A_is_true'])
    | (df['question_B_is_true'] & ~df['question_C_is_true'])
]

Using function on entire pandas dataframe to generate comments

I have multiple dataframes which are similar to the one below:
df:
Name  Value1  Value2
A         98      57
B        267     962
C         43     423
D        612      34
I need to use a function on the above dataframe which will perform some calculations and output some variables.
def my_func():
    c001 = []
    for _, value in df.iterrows():
        var1 = value['Value1']
        var2 = value['Value1%']
        seg1 = value['Name']
        flag1 = 'over' if var1 > 0 else 'under'
        kpi = 'YYT'
        c001.append(f"{seg1} {kpi} {flag1} Plan by {human(var1)}({abs(var2)}%) ")
    c001[1]
How do I use this function on the input dataframe to print the value in variable c001[1]?
I hope I understood you correctly:
def my_func():
    c001 = []
    for _, value in df.iterrows():
        var1 = value['Value1']
        var2 = value['Value1%']
        seg1 = value['Name']
        flag1 = 'over' if var1 > 0 else 'under'
        kpi = 'YYT'
        c001.append(f"{seg1} {kpi} {flag1} Plan by {human(var1)}({abs(var2)}%) ")
    return c001[1]

print(my_func())
You can try to create "c001" as a column and then print it.
def my_func(value):
    var1 = value['Value1']
    var2 = value['Value1%']
    seg1 = value['Name']
    flag1 = 'over' if var1 > 0 else 'under'
    kpi = 'YYT'
    return f"{seg1} {kpi} {flag1} Plan by {human(var1)}({abs(var2)}%) "

df["c001"] = df.apply(my_func, axis=1)
print(df["c001"])
The result will look like:
0 A YYT over Plan by 98(57%)
1 B YYT over Plan by 267(962%)
2 C YYT over Plan by 43(423%)
3 D YYT over Plan by 612(34%)
Name: c001, dtype: object
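Note that both snippets assume a helper human() that turns a number into a display string; it is not defined in the question. A minimal hypothetical stand-in so the examples run:
def human(n):
    # hypothetical stand-in for the undefined human() helper:
    # format with thousands separators for readability
    return f"{n:,}"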

How to compare rows of two different dataframes

I have 2 dataframes (df and df_flagMax) that are not the same size. I need help with the structure of comparing two dataframes of different sizes. I want to compare the rows of both dataframes.
df = pd.read_excel('df.xlsx')
df_flagMax = df.groupby(['Name'], as_index=False)['Max'].max()
df['flagMax'] = 0
num = len(df)
for i in range(num):
    colMax = df.at[i, 'Name']
    df['flagMax'][(df['Max'] == colMax)] = 1
print(df)
df_flagMax data:
  Name    Max
0   Sf  39.91
1   Th -25.74
df data: (screenshot not included)
For example: I want to compare 'Sf' from both df and df_flagMax and then perform this line:
df['flag'][(df['Max'] == colMax)] = 1
if and only if 'Sf' is in both dataframes at the same row index. The same goes for the next Name value ... 'Th'.
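One way to express this comparison without the row loop, assuming df has Name and Max columns and the intent is to flag rows whose Max equals the per-Name maximum, is groupby().transform (a sketch):
# per-row maximum of 'Max' within each 'Name' group, aligned to df's index
group_max = df.groupby('Name')['Max'].transform('max')
df['flagMax'] = (df['Max'] == group_max).astype(int)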

How to append dataframes inside a for loop in Python

I have been trying to append DataFrames in the for loop. The for loop works fine, however it is not appending the dataframes. Any help would be much appreciated.
import json
import urllib.request
import pandas as pd

symbols = ['MSFT', 'GOOGL', 'AAPL']
apikey = 'CR*****YDA'
for s in symbols:
    print(s)
    url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=%s&apikey=%s" % (s, apikey)
    stockdata = urllib.request.urlopen(url)
    data = stockdata.read().decode()
    js = json.loads(data)
    a = pd.DataFrame(js['Time Series (Daily)']).T
    b = pd.DataFrame()
    print(b)
    b = b.append(a, ignore_index=True)
    print(b)
    print("loop successful")
print("run successful")
Outputs:
MSFT
Empty DataFrame
Columns: []
Index: []
1. open 2. high 3. low 4. close 5. volume
0 107.4600 107.9000 105.9100 107.7100 37427587
1 105.0000 106.6250 104.7600 106.1200 28393015
.. ... ... ... ... ...
99 109.2700 109.6400 108.5100 109.6000 19662331
[100 rows x 5 columns]
loop successful
GOOGL
Empty DataFrame
Columns: []
Index: []
1. open 2. high 3. low 4. close 5. volume
0 1108.5900 1118.0000 1099.2800 1107.3000 2244569
1 1087.9900 1100.7000 1083.2600 1099.1200 1244801
.. ... ... ... ... ...
99 1244.1400 1257.8700 1240.6800 1256.2700 1428992
[100 rows x 5 columns]
loop successful
AAPL
Empty DataFrame
Columns: []
Index: []
1. open 2. high 3. low 4. close 5. volume
0 157.5000 157.8800 155.9806 156.8200 33751023
1 154.2000 157.6600 153.2600 155.8600 29821160
.. ... ... ... ... ...
99 217.1500 218.7400 216.3300 217.9400 20525117
[100 rows x 5 columns]
loop successful
run successful
The immediate problem is you define b as an empty dataframe within each iteration of your for loop. Instead, define it once before your for loop begins:
b = pd.DataFrame()
for s in symbols:
    # some code
    a = pd.DataFrame(js['Time Series (Daily)']).T
    b = b.append(a, ignore_index=True)
But appending dataframes in a loop is not recommended. It requires unnecessary copy operations and is inefficient. The docs recommend using pd.concat on an iterable of dataframes:
list_of_dfs = []
for s in symbols:
    # some code
    list_of_dfs.append(pd.DataFrame(js['Time Series (Daily)']).T)
b = pd.concat(list_of_dfs, ignore_index=True)
The problem is that you kept erasing the value of b with an empty DataFrame. So you have to define b as a DataFrame before the for loop.
symbols = ['MSFT', 'GOOGL', 'AAPL']
apikey = 'CR*****YDA'
b = pd.DataFrame()
for s in symbols:
    print(s)
    url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=%s&apikey=%s" % (s, apikey)
    stockdata = urllib.request.urlopen(url)
    data = stockdata.read().decode()
    js = json.loads(data)
    a = pd.DataFrame(js['Time Series (Daily)']).T
    print(b)
    b = b.append(a, ignore_index=True)
    print(b)
    print("loop successful")
print("run successful")
Moving the following code
b = pd.DataFrame()
outside of the loop would fix your problem. Right now, 'b' is re-initialized as an empty dataframe on every iteration.
