Checking Padded data in Pandas Dataframe on specific columns - python

I have a DataFrame that looks like this:
import numpy as np
import pandas as pd

raw_data = {'Series_Date':['2017-03-10','2017-03-13','2017-03-14','2017-03-15'],'SP':[35.6,56.7,41,41],'1M':[-7.8,56,56,-3.4],'3M':[24,-31,53,5]}
df = pd.DataFrame(raw_data, columns=['Series_Date','SP','1M','3M'])
print(df)
I would like to run a test only on certain columns of this DataFrame, whose names are given in this set:
check = {'1M','SP'}
print(check)
For these columns, I would like to know when the value in either of them is the same as the value on the previous day. So the output DataFrame should return the Series_Date and a Comment, such as (for this example):
output_data = {'Series_Date':['2017-03-14','2017-03-15'],'Comment':["Value for 1M data is same as previous day","Value for SP data is same as previous day"]}
output_data_df = pd.DataFrame(output_data, columns=['Series_Date','Comment'])
print(output_data_df)
Could you please provide some assistance on how to deal with this?

The following does more or less what you want.
Columns named item_ok are added to the original DataFrame, specifying whether each value is the same as on the previous day:
from datetime import timedelta

df['Date_diff'] = pd.to_datetime(df['Series_Date']).diff()
for item in check:
    # flag rows where the value repeats and the previous row is exactly one calendar day earlier
    df[item + '_ok'] = (df[item].diff() == 0) & (df['Date_diff'] == timedelta(1))
df_output = df.loc[df[[item + '_ok' for item in check]].any(axis=1)]
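If you then want the exact Series_Date/Comment layout from the question, one way to build it from the _ok flags is sketched below (this part is my addition, not from the answer above; the comment text simply mirrors the question):
frames = []
for item in check:
    # keep the dates where this column was flagged, and attach the matching comment
    hit = df.loc[df[item + '_ok'], ['Series_Date']].copy()
    hit['Comment'] = 'Value for %s data is same as previous day' % item
    frames.append(hit)
comment_df = pd.concat(frames).sort_values('Series_Date').reset_index(drop=True)
print(comment_df)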

I'm not sure it is the cleanest way to do it, but it works:
check = {'1M', 'SP'}
prev_dict = {c: None for c in check}

def check_prev_value(row):
    global prev_dict
    msg = ""
    # MAYBE add clause to check if both are equal
    for column in check:
        if row[column] == prev_dict[column]:
            msg = 'Value for %s data is same as previous day' % column
        prev_dict[column] = row[column]
    return msg

df['comment'] = df.apply(check_prev_value, axis=1)
output_data_df = df[df['comment'] != ""]
output_data_df = output_data_df[["Series_Date", "comment"]].reset_index(drop=True)
For your input:
Series_Date SP 1M 3M
0 2017-03-10 35.6 -7.8 24
1 2017-03-13 56.7 56.0 -31
2 2017-03-14 41.0 56.0 53
3 2017-03-15 41.0 -3.4 5
The output is:
Series_Date comment
0 2017-03-14 Value for 1M data is same as previous day
1 2017-03-15 Value for SP data is same as previous day
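A vectorized alternative (my own sketch, not part of the answer above) is to compare each checked column against its shift. It assumes the rows are already sorted by Series_Date and, unlike the first answer, it does not look at calendar gaps between rows:
cols = list(check)
same_as_prev = df[cols].eq(df[cols].shift())   # True where a value repeats the previous row
repeat_rows = df.loc[same_as_prev.any(axis=1), 'Series_Date']
print(repeat_rows)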

Reference: this answer
cols = ['1M','SP']
for col in cols:
    df[col + '_dup'] = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount()
The output column will contain an integer greater than zero when a duplicate is found.
df:
Series_Date SP 1M 3M 1M_dup SP_dup
0 2017-03-10 35.6 -7.8 24 0 0
1 2017-03-13 56.7 56.0 -31 0 0
2 2017-03-14 41.0 56.0 53 1 0
3 2017-03-15 41.0 -3.4 5 0 1
Slice to find dups:
col = 'SP'
dup_df = df[df[col + '_dup'] > 0][['Series_Date', col + '_dup']]
dup_df:
Series_Date SP_dup
3 2017-03-15 1
Here is a function version of the above (with the added feature of handling multiple columns):
import pandas as pd
import numpy as np

def find_repeats(df, col_list, date_col='Series_Date'):
    dummy_df = df[[date_col, *col_list]].copy()
    dates = dummy_df[date_col]
    date_series = []
    code_series = []
    if len(col_list) > 1:
        for col in col_list:
            these_repeats = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount().values
            repeat_idx = list(np.where(these_repeats > 0)[0])
            date_arr = dates.iloc[repeat_idx]
            code_arr = [col] * len(date_arr)
            date_series.extend(list(date_arr))
            code_series.extend(code_arr)
        return pd.DataFrame({date_col: date_series, 'col_dup': code_series}).sort_values(date_col).reset_index(drop=True)
    else:
        col = col_list[0]
        dummy_df[col + '_dup'] = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount()
        return dummy_df[dummy_df[col + '_dup'] > 0].reset_index(drop=True)
find_repeats(df, ['1M'])
Series_Date 1M 1M_dup
0 2017-03-14 56.0 1
find_repeats(df, ['1M', 'SP'])
Series_Date col_dup
0 2017-03-14 1M
1 2017-03-15 SP
And here is another way using pandas diff:
def find_repeats(df, col_list, date_col='Series_Date'):
    code_list = []
    dates = list()
    for col in col_list:
        these_dates = df[date_col].iloc[np.where(df[col].diff().values == 0)[0]].values
        code_arr = [col] * len(these_dates)
        dates.extend(list(these_dates))
        code_list.extend(code_arr)
    return pd.DataFrame({date_col: dates, 'val_repeat': code_list}).sort_values(date_col).reset_index(drop=True)
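Usage is the same as before; on the sample data this version should give something like the following (worked out by hand from the sample values, not copied from a run):
find_repeats(df, ['1M', 'SP'])
  Series_Date val_repeat
0  2017-03-14         1M
1  2017-03-15         SP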

Related

converting the contents of txt file to columns of pandas dataframe

I have a .txt file of this sort
12
21
23
1
23
42
12
0
In which <12,21,23> are features and <1> is a label.
Again <23,42,12> are features and <0> is the label and so on.
I want to convert the above text file, which contains only a single column, into a pandas dataframe with multiple columns.
The format of the dataframe is {column1, column2, column3, column4}, and there are no column names in it.
Can someone please help me out in this?
Thanks
import pandas as pd

df = dict()
features = list()
label = ''
filename = '.txt'
with open(filename) as fd:
    i = 0
    for line in fd:
        if i != 3:
            features.append(line.strip())
            i += 1
        else:
            label = line.strip()
            i = 0
            df[label] = features
            features = list()
df = pd.DataFrame(df)
df
import pandas as pd

with open(<FILEPATH>, "r") as f:
    lines = f.readlines()

formatted = [int(line[:-1]) for line in lines]  # Remove \n and convert to int
labels = formatted[3::4]
features = list(zip(formatted[::4], formatted[1::4], formatted[2::4]))  # You can modify this if there are more than three rows
data = {}
for i, label in enumerate(labels):
    data[label] = list(features[i])
df = pd.DataFrame(data)
Comment if you have any questions or found any errors, and I will make amendments.
You can use numpy for this; you just need to ensure that the number of values is a multiple of 4.
Each record as a column, with the label as the header:
a = np.loadtxt('file.txt').reshape((4,-1), order='F')
df = pd.DataFrame(a[:-1], columns=a[-1])
Output:
1.0 0.0
0 12.0 23.0
1 21.0 42.0
2 23.0 12.0
Each record as a new row:
a = np.loadtxt('file.txt').reshape((-1,4))
df = pd.DataFrame(a)
Output:
0 1 2 3
0 12.0 21.0 23.0 1.0
1 23.0 42.0 12.0 0.0
import pandas as pd

row = []
i = 0
data = []
with open('a.txt') as f:
    for line in f:
        i += 1
        row.append(int(line.strip()))
        if i % 4 == 0:
            data.append(row)
            row = []
df = pd.DataFrame(data)
which results in df being:
0 1 2 3
0 12 21 23 1
1 23 42 12 0
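Another possibility (my own sketch, not one of the answers above) is to let pandas read the single column and then reshape it, which also gives one record per row; it assumes the number of lines is a multiple of 4:
import pandas as pd

vals = pd.read_csv('file.txt', header=None)[0].to_numpy()  # one value per line
df = pd.DataFrame(vals.reshape(-1, 4))                     # every four values become one row
print(df)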

Python : Remove all data in a column of a dataframe and keep the last value in the first row

Let's say that I have a simple DataFrame.
import pandas as pd
data1 = [12,34,'fsdf',678,'','','dfs','','']
df1 = pd.DataFrame(data1, columns= ['Data'])
print(df1)
Data
0 12
1 34
2 fsdf
3 678
4
5
6 dfs
7
8
I want to delete all the data in the column except the last non-empty value, which I want to keep in the first row. The column could have thousands of rows. So I would like this result:
Data
0 dfs
1
2
3
4
5
6
7
8
And I have to keep the shape of this dataframe, so rows must not be removed.
What are the simplest functions to do that efficiently?
Thank you
Get the index of the last non-empty string value and assign that value to the first row of the column:
s = df1.loc[df1['Data'].iloc[::-1].ne('').idxmax(), 'Data']
print (s)
dfs
df1['Data'] = ''
df1.loc[0, 'Data'] = s
print (df1)
Data
0 dfs
1
2
3
4
5
6
7
8
If empty strings are missing values:
import numpy as np

data1 = [12,34,'fsdf',678,np.nan,np.nan,'dfs',np.nan,np.nan]
df1 = pd.DataFrame(data1, columns=['Data'])
print(df1)
Data
0 12
1 34
2 fsdf
3 678
4 NaN
5 NaN
6 dfs
7 NaN
8 NaN
s = df1.loc[df1['Data'].iloc[::-1].notna().idxmax(), 'Data']
print (s)
dfs
df1['Data'] = ''
df1.loc[0, 'Data'] = s
print (df1)
Data
0 dfs
1
2
3
4
5
6
7
8
A simple pandas condition check like this can help; it builds a list whose first element is the last non-empty value, followed by empty strings for the rest:
df1['Data'] = [df1.loc[df1['Data'].ne(""), "Data"].iloc[-1]] + [''] * (len(df1) - 1)
You can replace '' with NaN using df.replace, then use df.last_valid_index:
val = df1.loc[df1.replace('', np.nan).last_valid_index(), 'Data']
# Below two lines taken from #jezrael's answer
df1.loc[0, 'Data'] = val
df1.loc[1:, 'Data'] = ''
Or
You can use np.full with fill_value set to np.nan here.
val = df1.loc[df1.replace("", np.nan).last_valid_index(), "Data"]
df1 = pd.DataFrame(np.full(df1.shape, np.nan),
                   index=df1.index,
                   columns=df1.columns)
df1.loc[0, "Data"] = val

Python/ Pandas: calculate 1. minimum, 2. max of columns to left of minimum and 3. max of columns to right of minimum

This is a continuation of Python/ Pandas: Finding a left and right max
I have a dataframe, with timelines of data. Here is an example:
idx Q12000 Q22000 Q32000 Q42000 Q12001 Q22001 Q32001 Q42001 Q12002 Q22002 Q32002 Q42002
0 4085280.0 4114911.0 4108089.0 4111713.0 4055699.0 4076430.0 4043219.0 4039370.0 4201158.0 4243119.0 4231823.0 4254681.0
1 21226.0 21566.0 21804.0 22072.0 21924.0 23232.0 22748.0 22258.0 22614.0 22204.0 22500.0 22660.0
2 96400.0 102000.0 98604.0 97086.0 96354.0 103054.0 97824.0 95958.0 115938.0 123064.0 120406.0 120648.0
3 23820.0 24116.0 24186.0 23726.0 23504.0 23574.0 23162.0 23078.0 22306.0 22334.0 22152.0 22080.0
4 7838.0 7906.0 7714.0 7676.0 7480.0 7520.0 7102.0 6722.0 8324.0 8166.0 8208.0 8326.0
To do my analysis I need to calculate the following values for each row:
nadir: the lowest point (min)
nadir_qtr: the quarter at which the nadir happens
pre-peak: the highest point before the nadir
pre-peak_qtr: the quarter at which the pre-peak happens
post-peak: the highest point after the nadir
post-peak_qtr: the quarter at which the post-peak happens
With help from my last post, I used the helper functions below:
from io import StringIO
import pandas as pd

def calc_nadir(s):
    assert isinstance(s, pd.Series)
    return s.min()

def calc_nadir_qtr(s):
    return s.argmin()

def calc_pre_peak(s):
    return s[ : s.argmin()].max()

def calc_pre_peak_quarter(s):
    try:
        qtr = s[ : s.argmin()].argmax()
    except:
        qtr = None
    return qtr

def calc_post_peak(s):
    return s[s.argmin() : ].max()

def calc_post_peak_qtr(s):
    return s[s.argmin() : ].argmax() + s.argmin()
nadir = df.apply(lambda x: calc_nadir(x), axis=1).rename('nadir')
nadir_qtr = df.apply(lambda x: calc_nadir_qtr(x), axis=1).rename('nadir_qtr')
pre_peak = df.apply(lambda x: calc_pre_peak(x), axis=1).rename('pre_peak')
pre_peak_qtr = df.apply(lambda x: calc_pre_peak_quarter(x), axis=1).rename('pre_peak_qtr')
post_peak = df.apply(lambda x: calc_post_peak(x), axis=1).rename('post_peak')
post_peak_qtr = df.apply(lambda x: calc_post_peak_qtr(x), axis=1).rename('post_peak_qtr')
results = pd.concat([nadir, nadir_qtr, pre_peak, pre_peak_qtr,
                     post_peak, post_peak_qtr], axis=1)
print(results)
nadir nadir_qtr pre_peak pre_peak_qtr post_peak post_peak_qtr
0 4039370.0 7 4114911.0 1.0 4254681.0 11
1 21226.0 0 NaN NaN 23232.0 5
2 95958.0 7 103054.0 5.0 123064.0 9
3 22080.0 11 24186.0 2.0 22080.0 11
4 6722.0 7 7906.0 1.0 8326.0 11
The trouble I'm having is with the second row. Having the nadir in the first column is not meaningful, so I altered the above code to only look for the nadir after the first few columns.
nadir = df.iloc[:,6:].apply(lambda x: calc_nadir(x), axis=1).rename('nadir')
nadir_qtr = df.iloc[:,6:].apply(lambda x: calc_nadir_qtr(x), axis=1).rename('nadir_qtr')
That seems to work well enough. But I'm stuck on how to get the pre-peak to replace the NaNs.
I've tried iterating through the rows, but no luck. I'm still getting NaNs in the exact same spots.
for index, row in df.iterrows():
    if not row['pre_peak']:
        slice = row['nadir_qtr'][index]
        row['pre_peak'] = row.iloc[1:slice].max(axis=0)
Any advice appreciated
You can use .iloc[:,1:] to select everything after the first column, and use a bunch of pandas methods like .min, .max, .idxmin, .idxmax and others:
df['nadir'] = df.iloc[:,1:].min(axis=1)
df['nadir_qtr'] = df.iloc[:,1:].idxmin(axis=1).apply(lambda x: df.columns.get_loc(x))
df['new'] = [df.iloc[i].values for i in df.index]
df['pre_peak'] = df.apply(lambda x: max(x['new'][0:x['nadir_qtr']]), axis=1)
df['post_peak'] = df.apply(lambda x: max(x['new'][x['nadir_qtr']:]), axis=1)
df['pre_peak_qtr'] = pd.Series([s[i] for i, s in zip(df.index, df['pre_peak'].apply(
    lambda x: [i for i in (df.iloc[:, 0:-6] == x).idxmax(axis=1)]))]).apply(
        lambda x: df.columns.get_loc(x))
df['post_peak_qtr'] = pd.Series([s[i] for i, s in zip(df.index, df['post_peak'].apply(
    lambda x: [i for i in (df.iloc[:, 0:-6] == x).idxmax(axis=1)]))]).apply(
        lambda x: df.columns.get_loc(x))
df_new = df[['nadir', 'nadir_qtr', 'pre_peak', 'pre_peak_qtr', 'post_peak', 'post_peak_qtr']]
df_new
Out[1]:
nadir nadir_qtr pre_peak pre_peak_qtr post_peak post_peak_qtr
idx
0 4039370.0 7 4114911.0 1 4254681.0 11
1 21566.0 1 21226.0 0 23232.0 5
2 95958.0 7 103054.0 5 123064.0 9
3 22080.0 11 24186.0 2 22080.0 11
4 6722.0 7 7906.0 1 8326.0 11

How to combine two pandas dataframes value by value

I have 2 dataframes - players (only has playerid) and dates (only has date). I want a new dataframe which will contain each date for each player. In my case, the players df contains about 2600 rows and the dates df has 1100 rows. I used 2 for loops to do this, but it is really slow; is there a way to do it faster via some function? Thanks
My loop:
player_elo = pd.DataFrame(columns=['PlayerID', 'Date'])
for row in players.itertuples():
    idx = row.Index
    pl = players.at[idx, 'PlayerID']
    for i in dates.itertuples():
        idd = i.Index
        dt = dates.at[idd, 0]
        new = {'PlayerID': [pl], 'Date': [dt]}
        new = pd.DataFrame(new)
        player_elo = player_elo.append(new)
If you add a key that is the same for every row of both dataframes, you can come up with the cartesian product you are looking for using pd.merge().
import pandas as pd
players = pd.DataFrame([['A'], ['B'], ['C']], columns=['PlayerID'])
dates = pd.DataFrame([['12/12/2012'],['12/13/2012'],['12/14/2012']], columns=['Date'])
dates['Date'] = pd.to_datetime(dates['Date'])
players['key'] = 1
dates['key'] = 1
print(pd.merge(players, dates,on='key')[['PlayerID', 'Date']])
Output
PlayerID Date
0 A 2012-12-12
1 A 2012-12-13
2 A 2012-12-14
3 B 2012-12-12
4 B 2012-12-13
5 B 2012-12-14
6 C 2012-12-12
7 C 2012-12-13
8 C 2012-12-14
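If you are on pandas 1.2 or newer, the dummy key column is not needed at all; merge supports a cross join directly and gives the same cartesian product:
# pandas >= 1.2: cartesian product without a helper key column
print(players.merge(dates, how='cross')[['PlayerID', 'Date']])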

Vectorizing a multiplication and dict mapping on a Pandas DataFrame without iterating?

I have a Pandas DataFrame, df:
import pandas as pd
import numpy as np
import math
df = pd.DataFrame({'A':[1,2,2,4,np.nan],'B':[1,2,3,4,5]})
and a dict, mask:
mask = {1:32,2:64,3:100,4:200}
I want my end result to be a DataFrame like this:
A B C
1 1 32
2 2 64
2 3 96
4 4 400
nan nan nan
Right now I am doing this, which seems inefficient:
for idx, row in df.iterrows():
    if not math.isnan(row['A']):
        if row['A'] != 1:
            df.loc[idx, 'C'] = row['B'] * mask[row['A'] - 1]
        else:
            df.loc[idx, 'C'] = row['B'] * mask[row['A']]
Is there an easy way to vectorize this?
This should work:
df['C'] = df.B * (df.A - (df.A != 1)).map(mask)
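Worked out step by step on the sample frame (these intermediate values are my own by-hand trace, not output taken from the answer):
step1 = (df.A != 1)       # [False, True, True, True, True]; note NaN != 1 is True
step2 = df.A - step1      # [1.0, 1.0, 1.0, 3.0, NaN]; booleans subtract as 0/1
step3 = step2.map(mask)   # [32, 32, 32, 100, NaN]
df['C'] = df.B * step3    # [32, 64, 96, 400, NaN]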
Timing
10,000 rows
# Initialize each run with
df = pd.DataFrame({'A':[1,2,2,4,np.nan],'B':[1,2,3,4,5]})
df = pd.concat([df for _ in range(2000)])
100,000 rows
# Initialize each run with
df = pd.DataFrame({'A':[1,2,2,4,np.nan],'B':[1,2,3,4,5]})
df = pd.concat([df for _ in range(20000)])
Here is an option using apply and the dictionary get method, which returns None if the key is not in the dictionary:
df['C'] = df.apply(lambda r: mask.get(r.A) if r.A == 1 else mask.get(r.A - 1), axis = 1) * df.B
df
# A B C
#0 1 1 32
#1 2 2 64
#2 2 3 96
#3 4 4 400
#4 NaN 5 NaN
