get the full row data from the found extrema - python

I am new to using pandas and I can't find a way to get the full row of the found extrema
df = pd.read_csv('test.csv')
df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
order=10)[0]]['Close']
df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
order=10)[0]]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
print(min_values_list, max_values_list)
It print only the minima and extrema values, but I need the full row data of found minima / maxima
Example of data:
Datetime,Date,Open,High,Low,Close
2021-01-11 00:00:00+00:00,18638.0,1.2189176082611084,1.2199585437774658,1.2186205387115479,1.2192147970199585

If the list is required, then I would suggest:
def df_to_list_rowwise(df: pd.DataFrame) -> list:
return [df.iloc[_, :].tolist() for _ in range(df.shape[0])]
df_min_values = df.iloc[argrelextrema(np.array(df.Close), np.less_equal)[0], :]
df_max_values = df.iloc[argrelextrema(np.array(df.Close), np.greater_equal)[0], :]
print(df_to_list_rowwise(df_min_values))
print(df_to_list_rowwise(df_max_values))
Would that help?

try to use df.dropna().index.tolist() instead of specifying the column because adding the column name returns just the value of a specific row and the specified column not the whole row

Related

get idxmax rolling for each group and each row?

data: https://github.com/zero-jack/data/blob/main/hy_data.csv#L7
Goal
get the idxmax from last n rows for each group.
Try
df=df.assign(
l6d_highest_date=lambda x: x.groupby('hy_code')['high'].transform(lambda x: x.rolling(6).idxmax())
AttributeError: 'Rolling' object has no attribute 'idxmax'
notice: week_date is the index.
My solution is based on the conversion of the argmax computed on each sliding-window. For each date, thanks to this information, you can infer the date the argmax refers to.
df = pd.read_csv(
"https://raw.githubusercontent.com/zero-jack/data/main/hy_data.csv",
sep=",", index_col="week_date"
)
def rolling_idmax(series, n):
#fist compute the index in the sliding window
ids = series.rolling(n).apply(np.argmax)
#0 <= ids <= n-1
#how many rows have past from the sliding window maximum?
ids = n-1-ids
#0 <= ids <= n-1
#subtract `ids` from the actual positions
ids = np.arange(len(series))-ids
#0 <= ids <= len(series)-1
#convert the positions stored in `ids` with the corrisponding dates (series.index)
ids.loc[~ids.isna()] = series.index[ids.dropna().astype(int)]
#"2005-06-10" <= ids <= "2022-03-04"
return ids
df["l6d_highest_date"] = df.groupby("hy_code").high.apply(rolling_idmax, 6)
Based on this answer, I get the following workaround. Note that the linked answer can only handle series with the default index, I add x.index[global_index] to deal with non-default index.
window_size = 6
def get_idxmax_in_rolling(x: pd.Series):
local_index = x.rolling(window_size).apply(np.argmax)[window_size-1:].astype(int) # local index, removed nan before astype()
global_index = local_index + np.arange(len(x)-window_size+1)
# return list(x.index[global_index]) + [np.nan]*(window_size-1)
return [np.nan]*(window_size-1) + list(x.index[global_index]) # add nan back
df = df.assign(l6d_highest_date=lambda x: x.groupby('hy_code')['high'].transform(get_idxmax_in_rolling))
You can apply idxmax (for older versions of pandas before 1.0.0 you need to pass raw=False). The only caveat is that rolling must return a float (see linked docs), not a Timestamp. That's why you need to temporaryly reset the index, get the idxmax values and the corresponding week_dates and reset the index:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/zero-jack/data/main/hy_data.csv', index_col='week_date', parse_dates=True)
df = df.reset_index()
df['l6d_highest_date'] = df.groupby('hy_code')['high'].transform(lambda x: x.rolling(6).apply(pd.Series.idxmax))
df.loc[df.l6d_highest_date.notna(), 'l6d_highest_date'] = df.loc[df.loc[df.l6d_highest_date.notna(), 'l6d_highest_date'].values, 'week_date'].values
df = df.set_index('week_date')

Adding empty rows in Pandas dataframe

I'd like to append consistently empty rows in my dataframe.
I have following code what does what I want but I'm struggling in adjusting it to my needs:
s = pd.Series('', data_only_trades.columns)
f = lambda d: d.append(s, ignore_index=True)
set_rows = np.arange(len(data_only_trades)) // 4
empty_rows = data_only_trades.groupby(set_rows, group_keys=False).apply(f).reset_index(drop=True)
How can I adjust the code so I add two or more rows instead of one?
How can I set a starting point (e.g. it should start with row 5 -- Do I have to use .loc then in arange?)
Also tried this code but I was struggling in setting the starting row and the values to blank (I got NaN):
df_new = pd.DataFrame()
for i, row in data_only_trades.iterrows():
df_new = df_new.append(row)
for _ in range(2):
df_new = df_new.append(pd.Series(), ignore_index=True)
Thank you!
import numpy as np
v = np.ndarray(shape=(numberOfRowsYouWant,df.values.shape[1]), dtype=object)
v[:] = ""
pd.DataFrame(np.vstack((df.values, v)))
I think you can use NumPy
but, if you want to use your manner, simply convert NaN to "":
df.fillna("")

Inserting Values into a Multilevel Index Dataframe

I have created a dataframe as shown:
idx = pd.MultiIndex.from_product([['batch1', 'batch2','batch3', 'batch4', 'batch5'], ['quiz1', 'quiz2']])
cols=['noofpresent', 'lesserthan50', 'between50and60', 'between60and70', 'between70and80', 'greaterthan80']
statdf = pd.DataFrame('-', idx, cols)
statdf
statdf.loc['quiz1', 'noofpresent'] = qdf1.b4ispresent.count()
statdf.loc['quiz2', 'noofpresent'] = qdf2.b4ispresent.count()
statdf.noopresent = qdf1.b4ispresent.count()
statdf.noopresent = qdf2.b4ispresent.count()
statdf
Then I made some calculations. I now want to append that specific calculation of the figures '50' and '53' in column 'noofpresent' in 'batch4', 'quiz1' and 'quiz2' respectively. But instead this happened...
How can I insert my data into the right place?
you can index it like this.
statdf.loc['batch4','quiz1']['noofpresent'] = qdf1.b4ispresent.count()
statdf.loc['batch4','quiz2']['noofpresent'] =qdf2.b4ispresent.count()

Pandas: Trouble setting value for each column

I have an empty Pandas dataframe and I'm trying to add a row to it. Here's what I mean:
text_img_count = len(BeautifulSoup(html, "lxml").find_all('img'))
print 'img count: ', text_img_count
keys = ['text_img_count', 'text_vid_count', 'text_link_count', 'text_par_count', 'text_h1_count',
'text_h2_count', 'text_h3_count', 'text_h4_count', 'text_h5_count', 'text_h6_count',
'text_bold_count', 'text_italic_count', 'text_table_count', 'text_word_length', 'text_char_length',
'text_capitals_count', 'text_sentences_count', 'text_middles_count', 'text_rows_count',
'text_nb_digits', 'title_char_length', 'title_word_length', 'title_nb_digits']
values = [text_img_count, text_vid_count, text_link_count, text_par_count, text_h1_count,
text_h2_count, text_h3_count, text_h4_count, text_h5_count, text_h6_count,
text_bold_count, text_italic_count, text_table_count, text_word_length,
text_char_length, text_capitals_count, text_sentences_count, text_middles_count,
text_rows_count, text_nb_digits, title_char_length, title_word_length, title_nb_digits]
numeric_df = pd.DataFrame()
for key, value in zip(keys, values):
numeric_df[key] = value
print numeric_df.head()
However, the output is this:
img count: 2
Empty DataFrame
Columns: [text_img_count, text_vid_count, text_link_count, text_par_count, text_h1_count, text_h2_count, text_h3_count, text_h4_count, text_h5_count, text_h6_count, text_bold_count, text_italic_count, text_table_count, text_word_length, text_char_length, text_capitals_count, text_sentences_count, text_middles_count, text_rows_count, text_nb_digits, title_char_length, title_word_length, title_nb_digits]
Index: []
[0 rows x 23 columns]
This makes it seem like numeric_df is empty after I just assigned values for each of its columns.
What's going on?
Thanks for the help!
What I usually do to add a column to the empty data frame is to append the information into a list and then give it a data frame structure. For example:
df=pd.DataFrame()
L=['a','b']
df['SomeName']=pd.DataFrame(L)
And you have to use pd.Series() if the list is make of numbers.

Pandas Columns Operations with List

I have a pandas dataframe with two columns, the first one with just a single date ('action_date') and the second one with a list of dates ('verification_date'). I am trying to calculate the time difference between the date in 'action_date' and each of the dates in the list in the corresponding 'verification_date' column, and then fill the df new columns with the number of dates in verification_date that have a difference of either over or under 360 days.
Here is my code:
df = pd.DataFrame()
df['action_date'] = ['2017-01-01', '2017-01-01', '2017-01-03']
df['action_date'] = pd.to_datetime(df['action_date'], format="%Y-%m-%d")
df['verification_date'] = ['2016-01-01', '2015-01-08', '2017-01-01']
df['verification_date'] = pd.to_datetime(df['verification_date'], format="%Y-%m-%d")
df['user_name'] = ['abc', 'wdt', 'sdf']
df.index = df.action_date
df = df.groupby(pd.TimeGrouper(freq='2D'))['verification_date'].apply(list).reset_index()
def make_columns(df):
df = df
for i in range(len(df)):
over_360 = []
under_360 = []
for w in [(df['action_date'][i]-x).days for x in df['verification_date'][i]]:
if w > 360:
over_360.append(w)
else:
under_360.append(w)
df['over_360'] = len(over_360)
df['under_360'] = len(under_360)
return df
make_columns(df)
This kinda works EXCEPT the df has the same values for each row, which is not true as the dates are different. For example, in the first row of the dataframe, there IS a difference of over 360 days between the action_date and both of the items in the list in the verification_date column, so the over_360 column should be populated with 2. However, it is empty and instead the under_360 column is populated with 1, which is accurate only for the second row in 'action_date'.
I have a feeling I'm just messing up the looping but am really stuck. Thanks for all help!
Your problem was that you were always updating the whole column with the value of the last calculation with these lines:
df['over_360'] = len(over_360)
df['under_360'] = len(under_360)
what you want to do instead is set the value for each line calculation accordingly, you can do this by replacing the above lines with these:
df.set_value(i,'over_360',len(over_360))
df.set_value(i,'under_360',len(under_360))
what it does is, it sets a value in line i and column over_360 or under_360.
you can learn more about it here.
If you don't like using set_values you can also use this:
df.ix[i,'over_360'] = len(over_360)
df.ix[i,'under_360'] = len(under_360)
you can check dataframe.ix here.
you might want to try this:
df['over_360'] = df.apply(lambda x: sum([((x['action_date'] - i).days >360) for i in x['verification_date']]) , axis=1)
df['under_360'] = df.apply(lambda x: sum([((x['action_date'] - i).days <360) for i in x['verification_date']]) , axis=1)
I believe it should be a bit faster.
You didn't specify what to do if == 360, so you can just change > or < into >= or <=.

Categories