Find minima and maxima of a DataFrame in chronological order - python

I have a pandas data frame from which I extract minima and maxima values. It works well so far, but the problem is: how can I put them into a single list ordered by Date (chronological order)? Right now they are separated into two lists, and I only want one list of price values, in chronological order.
import pandas as pd
import numpy as np
import yfinance
from scipy.signal import argrelextrema
import matplotlib.dates as mpl_dates

def extract_data():
    ticker = 'GBPJPY=X'
    ticker = yfinance.Ticker(ticker)
    start_date = '2022-09-25'
    end_date = '2022-10-08'
    df = ticker.history(interval='1h', start=start_date, end=end_date)
    df['Date'] = pd.to_datetime(df.index)
    df['Date'] = df['Date'].apply(mpl_dates.date2num)
    df = df.loc[:, ['Date', 'Open', 'High', 'Low', 'Close']]
    # Call function to find Min-Max Extrema
    find_extrema(df)

def find_extrema(df):
    n = 10  # number of points to be checked before and after
    # Find local peaks
    df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
                                      order=n)[0]]['Close']
    df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
                                      order=n)[0]]['Close']
    min_values_list = []
    max_values_list = []
    # Add min values to list
    for item in df['min']:
        check_NaN = np.isnan(item)  # check if value is NaN
        if check_NaN == True:
            pass
        else:
            min_values_list.append(item)
    # Add max values to list
    for item in df['max']:
        check_NaN = np.isnan(item)  # check if value is NaN
        if check_NaN == True:
            pass
        else:
            max_values_list.append(item)
    print(f"Min: {min_values_list}")
    print(f"Max: {max_values_list}")

extract_data()

Option 1
First, use df.to_numpy to convert the columns min and max to a np.array.
Get rid of all the NaN values by selecting from the array with np.logical_not applied to a boolean mask (created with np.isnan). Since boolean indexing flattens the array row by row, the surviving values come out in chronological order.
arr = df[['min','max']].to_numpy()
value_list = arr[np.logical_not(np.isnan(arr))].tolist()
print(value_list)
[159.7030029296875,
154.8979949951172,
160.7830047607422,
165.43800354003906,
149.55799865722656,
162.80499267578125,
156.6529998779297,
164.31900024414062,
156.125,
153.13499450683594,
161.3520050048828,
156.9340057373047,
162.52200317382812,
155.7740020751953,
160.98500061035156,
161.83700561523438]
Option 2
Rather more cumbersome:
n = 10
# get the indices for `min` and `max` in two arrays
_min = argrelextrema(df.Close.values, np.less_equal, order=n)[0]
_max = argrelextrema(df.Close.values, np.greater_equal, order=n)[0]
# create columns (assuming you need this for other purposes as well)
df['min'] = df.iloc[_min]['Close']
df['max'] = df.iloc[_max]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
# join the lists
value_list2 = min_values_list + max_values_list
value_idxs = _min.tolist() + _max.tolist()
# finally, sort `value_list2` based on `value_idxs`
value_list2 = [x for _, x in sorted(zip(value_idxs, value_list2))]
# check if result is the same:
value_list2 == value_list
# True

Assuming that you have max and min columns, what about something like this?
df['max_or_min'] = np.where(df['max'].notna(), df['max'], df['min'])
min_max_values = df['max_or_min'].dropna().values.tolist()
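If you also want to keep the dates next to the prices (assuming the Date column from the question, i.e. matplotlib date numbers, is still present in df), the same mask can select both columns:
min_max_with_dates = df.loc[df['max_or_min'].notna(), ['Date', 'max_or_min']]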

Related

get idxmax rolling for each group and each row?

data: https://github.com/zero-jack/data/blob/main/hy_data.csv#L7
Goal
get the idxmax from last n rows for each group.
Try
df = df.assign(
    l6d_highest_date=lambda x: x.groupby('hy_code')['high'].transform(lambda x: x.rolling(6).idxmax())
)
This raises:
AttributeError: 'Rolling' object has no attribute 'idxmax'
notice: week_date is the index.
My solution is based on converting the argmax computed on each sliding window: for each date, that positional information lets you infer which date the argmax refers to.
import numpy as np
import pandas as pd

df = pd.read_csv(
    "https://raw.githubusercontent.com/zero-jack/data/main/hy_data.csv",
    sep=",", index_col="week_date"
)

def rolling_idmax(series, n):
    # first compute the index within the sliding window
    ids = series.rolling(n).apply(np.argmax)
    # 0 <= ids <= n-1
    # how many rows have passed since the sliding-window maximum?
    ids = n - 1 - ids
    # 0 <= ids <= n-1
    # subtract `ids` from the actual positions
    ids = np.arange(len(series)) - ids
    # 0 <= ids <= len(series)-1
    # convert the positions stored in `ids` into the corresponding dates (series.index)
    ids.loc[~ids.isna()] = series.index[ids.dropna().astype(int)]
    # "2005-06-10" <= ids <= "2022-03-04"
    return ids

df["l6d_highest_date"] = df.groupby("hy_code").high.apply(rolling_idmax, 6)
Based on this answer, I get the following workaround. Note that the linked answer can only handle a series with the default index; I add x.index[global_index] to deal with a non-default index.
window_size = 6

def get_idxmax_in_rolling(x: pd.Series):
    local_index = x.rolling(window_size).apply(np.argmax)[window_size-1:].astype(int)  # local index, removed nan before astype()
    global_index = local_index + np.arange(len(x)-window_size+1)
    # return list(x.index[global_index]) + [np.nan]*(window_size-1)
    return [np.nan]*(window_size-1) + list(x.index[global_index])  # add nan back

df = df.assign(l6d_highest_date=lambda x: x.groupby('hy_code')['high'].transform(get_idxmax_in_rolling))
You can apply idxmax (for versions of pandas older than 1.0.0 you need to pass raw=False). The only caveat is that rolling must return a float (see linked docs), not a Timestamp. That's why you need to temporarily reset the index, get the idxmax values and the corresponding week_dates, and then restore the index:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/zero-jack/data/main/hy_data.csv', index_col='week_date', parse_dates=True)
df = df.reset_index()
df['l6d_highest_date'] = df.groupby('hy_code')['high'].transform(lambda x: x.rolling(6).apply(pd.Series.idxmax))
df.loc[df.l6d_highest_date.notna(), 'l6d_highest_date'] = df.loc[df.loc[df.l6d_highest_date.notna(), 'l6d_highest_date'].values, 'week_date'].values
df = df.set_index('week_date')

Get index value from pandas groupby first/last

I am trying to recover the original date or index after grouping a time series with a datetime index by year. Is there a faster way, without a loop and an extra column, to obtain first_day_indices?
import pandas as pd
import numpy as np
import datetime as dt
# Data
T = 1000
base = dt.date.today()
date_list = [base - dt.timedelta(weeks=x) for x in range(T)]
date_list.reverse()
test_data = pd.DataFrame(np.random.randn(T)/100, columns=['Col1'])
test_data.index = pd.to_datetime(date_list)
test_data['date'] = test_data.index
first_days = test_data['date'].groupby(test_data.index.year).first()
first_day_indices= []
for i in first_days:
    first_day_indices.append(np.where(test_data.index == i)[0][0])
print(first_day_indices)
You can use pandas.Series.isin to check whether the elements of a Series are contained in a list of values.
test_data.reset_index()[test_data.index.isin(first_days)].index.tolist()
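Alternatively (my own sketch, not part of the original answer), Index.get_indexer returns the positional indices directly, without the reset_index step:
first_day_indices = test_data.index.get_indexer(first_days).tolist()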

Different results from interpolation if (same data) is done with a time index

I get different results from interpolation when the same data is interpolated with a time index. How can that be?
On pandas docs it says:
The ‘krogh’, ‘piecewise_polynomial’, ‘spline’, ‘pchip’ and ‘akima’ methods
are wrappers around the respective SciPy implementations of similar names.
These use the actual numerical values of the index. For more information
on their behavior, see the SciPy documentation and SciPy tutorial.
The sub-methods of interpolate(method=...) where I noticed this strange behavior are (among others):
['krogh', 'spline', 'pchip', 'akima', 'cubicspline']
Reproducible sample (with comparison):
import numpy as np , pandas as pd
from math import isclose
# inputs:
no_timeindex = False # reset both dataframes indices to numerical indices # for comparison.
no_timeindex_for_B = True # reset only dataframe indices of the first approach to numerical indices, the other one stays datetime, for comparison.
holes = True # create date-timeindex that skips the timestamps, that would normally be at location 6,7,12, 14, 17, instead of a perfectly frequent one.
o_ = 2 # order parameter for interpolation.
method_ = 'cubicspline'
#------------------+
n = np.nan
arr = [n,n,10000000000 ,10,10,10000,10,10, 10,40,4,4,9,4,4,n,n,n,4,4,4,4,4,4,18,400000000,4,4,4,n,n,n,n,n,n,n,4,4,4,5,6000000000,4,5,4,5,4,3,n,n,n,n,n,n,n,n,n,n,n,n,n,4,n,n,n,n,n,n,n,n,n,n,n,n,n,n,2,n,n,n,10,1000000000,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,1,n,n,n,n,n,n,n,n,n]
#--------------------------------------------------------------------------------+
df = pd.DataFrame(arr) # create dataframe from array.
if holes: # create a date-timeindex that skips the timestamps that would normally be at locations 6, 7, 12, 14, 17.
    ix = pd.date_range("01.01.2000", periods=len(df)+(2+5), freq="T")[2:]
    to_drop = [ix[6], ix[7], ix[12], ix[14], ix[17]]
    ix = ix.drop(to_drop)
    df.index = ix
else: # create a perfectly frequent datetime-index without any holes.
    ix = pd.date_range("01.01.2000", periods=len(df)+2, freq="T")[2:]
    df.index = ix
# if wanted, drop timeindex and set it to integer indices later
if no_timeindex == True:
    df.reset_index(inplace=True, drop=True)
df = df.interpolate(method=method_, order=o_, limit_area='inside') # interpolate.
df.index = ix # set index equal to the second approach, for comparing later.
A = df.copy(deep=True) # create a copy, to compare result with second approach later.
#------------------------------+
# second approach with numerical index instead of index-wise
df = pd.DataFrame(arr) # create dataframe from array.
if holes: # create a date-timeindex that skips the timestamps that would normally be at locations 6, 7, 12, 14, 17.
    ix = pd.date_range("01.01.2000", periods=len(df)+(2+5), freq="T")[2:]
    to_drop = [ix[6], ix[7], ix[12], ix[14], ix[17]]
    ix = ix.drop(to_drop)
    df.index = ix
else: # create a perfectly frequent datetime-index without any holes.
    ix = pd.date_range("01.01.2000", periods=len(df)+2, freq="T")[2:]
    df.index = ix
# if wanted, drop timeindex and set it to integer indices later
if no_timeindex == True or no_timeindex_for_B == True:
    df.reset_index(inplace=True, drop=True)
df = df.interpolate(method=method_, order=o_, limit_area='inside') # interpolate.
df.index = ix # set index equal to the first approach, for comparing later.
B = df.copy(deep=True) # create a copy, to compare result with first approach later.
#--------------------------------------------------------------------------------+
# compare:
if A.equals(B) == False:
    # if values aren't equal, count the ones that aren't.
    i = 0
    for x, y in zip(A[A.columns[0]], B[B.columns[0]]):
        if x != y and not (np.isnan(x) and np.isnan(y)):
            print(x, " ?= ", y, " ", (x == y), abs(x-y))
            i += 1
    # if there are no different values, ...
    if i == 0:
        print(" both are the same. ")
    else: # if there are different values, ...
        # count those different values that are NOT almost the same.
        not_almost = 0
        for x, y in zip(A[A.columns[0]], B[B.columns[0]]):
            if not (np.isnan(x) and np.isnan(y)):
                if isclose(x, y, abs_tol=0.000001) == False:
                    not_almost += 1
        # if all values are almost the same, ...
        if not_almost == 0:
            print(" both are not, but almost the same. ")
        else:
            print(" both are definitely not the same. ")
else:
    print(" both are the same. ")
This shouldn't be the case, since the pandas docs state otherwise. Why does it happen anyway?
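For context on the docs excerpt quoted above: those SciPy-backed methods use the actual numerical values of the index as x-coordinates, so a DatetimeIndex (large nanosecond values, possibly irregularly spaced when there are holes) and a 0..N-1 integer index describe different x-grids and can therefore produce different curves. A minimal sketch of that effect (my own toy example, not taken from the question):
import numpy as np
import pandas as pd
s = pd.Series([1.0, np.nan, np.nan, 10.0])
print(s.interpolate(method='pchip'))       # x-coordinates are 0, 1, 2, 3
s_time = s.copy()
s_time.index = pd.to_datetime(['2000-01-01 00:00', '2000-01-01 00:01',
                               '2000-01-01 00:05', '2000-01-01 00:06'])
print(s_time.interpolate(method='pchip'))  # x-coordinates are the timestamps' numeric values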

How to fill missing dates in a time series

Here's what my data looks like:
There are daily records, except for a gap from 2017-06-12 to 2017-06-16.
df2['timestamp'] = pd.to_datetime(df['timestamp'])
df2['timestamp'] = df2['timestamp'].map(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d'))
df2 = df2.convert_objects(convert_numeric = True)
df2 = df2.groupby('timestamp', as_index = False).sum()
I need to fill this missing gap and others with values for all fields (e.g. timestamp, temperature, humidity, light, pressure, speed, battery_voltage, etc...).
How can I accomplish this with Pandas?
This is what I have done before
weektime = pd.date_range(start = '06/04/2017', end = '12/05/2017', freq = 'W-SUN')
df['week'] = 'nan'
df['weektemp'] = 'nan'
df['weekhumidity'] = 'nan'
df['weeklight'] = 'nan'
df['weekpressure'] = 'nan'
df['weekspeed'] = 'nan'
df['weekbattery_voltage'] = 'nan'
for i in range(0, len(weektime)):
    df['week'][i+1] = weektime[i]
    df['weektemp'][i+1] = df['temperature'].iloc[7*i+1:7*i+7].sum()
    df['weekhumidity'][i+1] = df['humidity'].iloc[7*i+1:7*i+7].sum()
    df['weeklight'][i+1] = df['light'].iloc[7*i+1:7*i+7].sum()
    df['weekpressure'][i+1] = df['pressure'].iloc[7*i+1:7*i+7].sum()
    df['weekspeed'][i+1] = df['speed'].iloc[7*i+1:7*i+7].sum()
    df['weekbattery_voltage'][i+1] = df['battery_voltage'].iloc[7*i+1:7*i+7].sum()
    i = i + 1
The summed values are not correct, because the value for 2017-06-17 becomes a sum over 2017-06-12 to 2017-06-16, and I do not want to add those again. And this is not the only gap in the period; I want to fill all of them.
Here is a function I wrote that might be helpful to you. It looks for inconsistent jumps in time and fills them in. After using this function, try using a linear interpolation function (pandas has a good one) to fill in your null data values. Note: Numpy arrays are much faster to iterate over and manipulate than Pandas dataframes, which is why I switch between the two.
import numpy as np
import pandas as pd

data_arr = np.array(your_df)
periodicity = 'daily'

def fill_gaps(data_arr, periodicity):
    rows = data_arr.shape[0]
    data_no_gaps = np.copy(data_arr)  # avoid altering the thing you're iterating over
    data_no_gaps_idx = 0
    for row_idx in np.arange(1, rows):  # iterate once for each row (except the first record; nothing to compare)
        oldtimestamp_str = str(data_arr[row_idx-1, 0])
        oldtimestamp = np.datetime64(oldtimestamp_str)
        currenttimestamp_str = str(data_arr[row_idx, 0])
        currenttimestamp = np.datetime64(currenttimestamp_str)
        period = currenttimestamp - oldtimestamp
        if period != np.timedelta64(900, 's') and period != np.timedelta64(3600, 's') and period != np.timedelta64(86400, 's'):
            if periodicity == 'quarterly':
                desired_period = 900
            elif periodicity == 'hourly':
                desired_period = 3600
            elif periodicity == 'daily':
                desired_period = 86400
            periods_missing = int(period / np.timedelta64(desired_period, 's'))
            for missing in np.arange(1, periods_missing):
                new_time_orig = str(oldtimestamp + missing*(np.timedelta64(desired_period, 's')))
                new_time = new_time_orig.replace('T', ' ')
                data_no_gaps = np.insert(data_no_gaps, (data_no_gaps_idx + missing),
                                         np.array((new_time, np.nan, np.nan, np.nan, np.nan, np.nan)), 0)  # INSERT VALUES YOU WANT IN THE NEW ROW
            data_no_gaps_idx += (periods_missing-1)  # increment the index (zero-based => -1) in accordance with added rows
        data_no_gaps_idx += 1  # allow index to change as we iterate over original data array (main for loop)
    # create a dataframe:
    data_arr_no_gaps = pd.DataFrame(data=data_no_gaps, index=None, columns=['Time', 'temp', 'humidity', 'light', 'pressure', 'speed'])
    return data_arr_no_gaps
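The interpolation step suggested above could then look roughly like this (a sketch: np.insert stores everything as strings, so the value columns may need to be cast back to numeric first; the column names are the ones assumed by the answer):
filled = fill_gaps(data_arr, periodicity)
value_cols = ['temp', 'humidity', 'light', 'pressure', 'speed']
filled[value_cols] = filled[value_cols].apply(pd.to_numeric, errors='coerce')
filled[value_cols] = filled[value_cols].interpolate(method='linear')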
Fill time gaps and nulls
Use the function below to ensure the expected date sequence exists, then use forward fill to fill in nulls.
import pandas as pd
import os

def fill_gaps_and_nulls(df, freq='1D'):
    '''
    General steps:
    A) check for extra dates (out of expected frequency/sequence)
    B) check for missing dates (based on expected frequency/sequence)
    C) use forwardfill to fill nulls
    D) use backwardfill to fill remaining nulls
    E) append to file
    '''
    #rename the timestamp to 'date'
    df = df.rename(columns={"timestamp": "date"})
    #sort to make indexing faster
    df = df.sort_values(by=['date'], inplace=False)
    #create an artificial index of dates at frequency = freq, with the same beginning and ending as the original data
    all_dates = pd.date_range(start=df.date.min(), end=df.date.max(), freq=freq)
    #record column names
    df_cols = df.columns
    #delete ffill_df.csv so we can begin anew
    try:
        os.remove('ffill_df.csv')
    except FileNotFoundError:
        pass
    #check for extra dates and/or dates out of order. print warning statement for log
    extra_dates = set(df.date).difference(all_dates)
    #if there are extra dates (outside of expected sequence/frequency), deal with them
    if len(extra_dates) > 0:
        #############################
        #INSERT DESIRED BEHAVIOR HERE
        print('WARNING: Extra date(s):\n\t{}\n\t Shifting highlighted date(s) back by 1 day'.format(extra_dates))
        for date in extra_dates:
            #shift extra dates back one day
            df.date[df.date == date] = date - pd.Timedelta(days=1)
        #############################
    #check the artificial date index against df to identify missing gaps in time and fill them with nulls
    gaps = all_dates.difference(set(df.date))
    print('\n-------\nWARNING: Missing dates: {}\n-------\n'.format(gaps))
    #if there are time gaps, deal with them
    if len(gaps) > 0:
        #initialize df of correct size, filled with nulls
        gaps_df = pd.DataFrame(index=gaps, columns=df_cols.drop('date')) #len(index) sets number of rows
        #give the index a name
        gaps_df.index.name = 'date'
        #add the region and type (r and t are assumed to exist in the enclosing scope)
        gaps_df.region = r
        gaps_df.type = t
        #remove that index so gaps_df and df are compatible
        gaps_df.reset_index(inplace=True)
        #append gaps_df to df
        new_df = pd.concat([df, gaps_df])
        #sort on date
        new_df.sort_values(by='date', inplace=True)
        #fill nulls
        new_df.fillna(method='ffill', inplace=True)
        new_df.fillna(method='bfill', inplace=True)
        #append to file
        new_df.to_csv('ffill_df.csv', mode='a', header=False, index=False)
    return df_cols, regions, types, all_dates  # regions and types are likewise assumed to exist in the enclosing scope
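For comparison, the core idea (build a complete date index, then forward/backward fill) can also be sketched in a few lines. This is a simplification under the assumption that df has a 'date' column with one row per date, not a drop-in replacement for the function above:
def fill_gaps_minimal(df, freq='1D'):
    df = df.set_index('date').sort_index()
    full_range = pd.date_range(df.index.min(), df.index.max(), freq=freq)
    df = df.reindex(full_range)  # missing dates become all-null rows
    df = df.ffill().bfill()      # forward fill, then backward fill any leading nulls
    return df.rename_axis('date').reset_index()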

Create internal python loop based on index and groupings for all combinations

I have a script which looks at the row and column headers belonging to a group (REG_ID) and sums the values. The code runs on a matrix (small subset) as follows:
Outputs
My code works well for calculating the sum for all the IDs based on the rows and columns belonging to each internal group (REG_ID). For example, all row and column IDs which belong to REG_ID 1 are summed, so the total flow between region 1 and region 1 (internal flows) is calculated, and so on for each region.
I wish to extend this code by calculating (summing) the flows between regions, for example region 1 to regions 2, 3, 4, 5, ...
I figure that I need to include another loop within the existing while loop but would really appreciate some help to figure out where it should be and how to construct it.
My code which currently runs on the internal flow sum (1-1, 2-2, 3-3 etc) is as follows:
global index
index = 1
x = index
while index < len(idgroups):
    ward_list = idgroups[index] #select list of ward ids for each region from list of lists
    df6 = mergedcsv.loc[ward_list] #select rows with values in the list
    dfcols = mergedcsv.loc[ward_list, :] #select columns with values in list
    ward_liststr = map(str, ward_list) #convert ward_list to strings so that they can be used to select columns, won't work as integers.
    ward_listint = map(int, ward_list)
    #dfrowscols = mergedcsv.loc[ward_list, ward_listint]
    df7 = df6.loc[:, ward_liststr]
    print df7
    regflowsum = df7.values.sum() #sum all values in dataframe
    intflow = [regflowsum]
    print intflow
    dfintflow = pd.DataFrame(intflow)
    dfintflow.reset_index(level=0, inplace=True)
    dfintflow.columns = ["RegID", "regflowsum"]
    dfflows.set_value(index, 'RegID', index)
    dfflows.set_value(index, 'RegID2', index)
    dfflows.set_value(index, 'regflow', regflowsum)
    mergedcsv.set_value(ward_list, 'TotRegFlows', regflowsum)
    index += 1 #increment index number
print dfflows
new_df = pd.merge(pairlist, dfflows, how='left', left_on=['origID','destID'], right_on=['RegID', 'RegID2'])
print new_df #useful for checking dataframe merges
regionflows = r"C:\Temp\AllNI\regionflows.csv"
header = ["WardID","LABEL","REG_ID","Total","TotRegFlows"]
mergedcsv.to_csv(regionflows, columns=header, index=False)
regregflows = r"C:\Temp\AllNI\reg_regflows.csv"
headerreg = ["REG_ID_ORIG", "REG_ID_DEST", "FLOW"]
pairlistCSV = r"C:\Temp\AllNI\pairlist_regions.csv"
new_df.to_csv(pairlistCSV)
The output is as follows:
idgroups dataframe: (see image 1 - second part of image 1)
df7 and intflows for each region Reg_ID:(third part of image 1 - on the right)
ddflows dataframe:(fourth part of image 2)
and the final output is new_df:(fifth part of image 2)
I wish to populate the sums for all possible combinations of flows between the regions, not just the internal ones.
I figure I need to add another loop inside the while loop, possibly using enumerate, like:
while index < len(idgroups):
    #add line(s) to calculate flows between regions
    for index, item in enumerate(idgroups):
        ward_list = idgroups[index]
        print ward_list
        df6 = mergedcsv.loc[ward_list] #select rows with values in the list
        dfcols = mergedcsv.loc[ward_list, :] #select columns with values in list
        ward_liststr = map(str, ward_list) #convert ward_list to strings so that they can be used to select columns, won't work as integers.
        ward_listint = map(int, ward_list)
        #dfrowscols = mergedcsv.loc[ward_list, ward_listint]
        df7 = df6.loc[:, ward_liststr]
        print df7
        regflowsum = df7.values.sum() #sum all values in dataframe
        intflow = [regflowsum]
        print intflow
        dfintflow = pd.DataFrame(intflow)
        dfintflow.reset_index(level=0, inplace=True)
        dfintflow.columns = ["RegID", "regflowsum"]
        dfflows.set_value(index, 'RegID', index)
        dfflows.set_value(index, 'RegID2', index)
        dfflows.set_value(index, 'regflow', regflowsum)
        mergedcsv.set_value(ward_list, 'TotRegFlows', regflowsum)
    index += 1 #increment index number
I'm unsure how to integrate the item, so I'm struggling to extend the code to all combinations. Any advice appreciated.
Update based on flows function:
w=pysal.rook_from_shapefile("C:/Temp/AllNI/NIW01_sort.shp",idVariable='LABEL')
Simil = pysal.open("C:/Temp/AllNI/simNI.csv")
Similarity = np.array(Simil)
db = pysal.open('C:\Temp\SQLite\MatrixCSV2.csv', 'r')
dbf = pysal.open(r'C:\Temp\AllNI\NIW01_sortC.dbf', 'r')
ids = np.array((dbf.by_col['LABEL']))
commuters = np.array((dbf.by_col['Total'],dbf.by_col['IDNO']))
commutersint = commuters.astype(int)
comm = commutersint[0]
floor = int(MIN_COM_CT + 100)
solution = pysal.region.Maxp(w=w,z=Similarity,floor=floor,floor_variable=comm)
regions = solution.regions
#print regions
writecsv = r"C:\Temp\AllNI\reg_output.csv"
csv = open(writecsv,'w')
csv.write('"LABEL","REG_ID"\n')
for i in range(len(regions)):
    for lines in regions[i]:
        csv.write('"' + lines + '","' + str(i+1) + '"\n')
csv.close()
flows = r"C:\Temp\SQLite\MatrixCSV2.csv"
regs = r"C:\Temp\AllNI\reg_output.csv"
wardflows = pd.read_csv(flows)
regoutput = pd.read_csv(regs)
merged = pd.merge(wardflows, regoutput)
#duplicate REG_ID column as the index to be used later
merged['REG_ID2'] = merged['REG_ID']
merged.to_csv("C:\Temp\AllNI\merged.csv", index=False)
mergedcsv = pd.read_csv("C:\Temp\AllNI\merged.csv",index_col='WardID_1') #index this dataframe using the WardID_1 column
flabelList = pd.read_csv("C:\Temp\AllNI\merged.csv", usecols = ["WardID", "REG_ID"]) #create list of all FLabel values
reg_id = "REG_ID"
ward_flows = "RegIntFlows"
flds = [reg_id, ward_flows] #create list of fields to be use in search
dict_ref = {} # create a dictionary with for each REG_ID a list of corresponding FLABEL fields
#group the dataframe by the REG_ID column
idgroups = flabelList.groupby('REG_ID')['WardID'].apply(lambda x: x.tolist())
print idgroups
idgrp_df = pd.DataFrame(idgroups)
csvcols = mergedcsv.columns
#create a list of column names to pass as an index to select columns
columnlist = list(mergedcsv.columns.values)
mergedcsvgroup = mergedcsv.groupby('REG_ID').sum()
mergedcsvgroup.describe()
idList = idgroups[2]
df4 = pd.DataFrame()
df5 = pd.DataFrame()
col_ids = idList #ward id no
regiddf = idgroups.index.get_values()
print regiddf
#total number of region ids
#print regiddf
#create pairlist combinations from region ids
#combinations with replacement allows for repeated items
#pairs = list(itertools.combinations_with_replacement(regiddf, 2))
pairs = list(itertools.product(regiddf, repeat=2))
#print len(pairs)
#create a new dataframe with pairlists and summed data
pairlist = pd.DataFrame(pairs,columns=['origID','destID'])
print pairlist.tail()
header_pairlist = ["origID","destID","flow"]
header_intflow = ["RegID", "RegID2", "regflow"]
dfflows = pd.DataFrame(columns=header_intflow)
print mergedcsv.index
print mergedcsv.dtypes
#mergedcsv = mergedcsv.select_dtypes(include=['int64'])
#print mergedcsv.columns
#mergedcsv.rename(columns = lambda x: int(x), inplace=True)
def flows():
    pass

#def flows(mergedcsv, region_a, region_b):
def flows(mergedcsv, ward_lista, ward_listb):
    """Return the sum of all the cells in the row/column intersections
    of ward_lista and ward_listb."""
    mergedcsv = mergedcsv.loc[:, mergedcsv.dtypes == 'int64']
    regionflows = mergedcsv.loc[ward_lista, ward_listb]
    regionflowsum = regionflows.values.sum()
    #grid = [ax, bx, regflowsuma, regflowsumb]
    gridoutput = [ax, bx, regionflowsum]
    print gridoutput
    return regflowsuma
    return regflowsumb
#print mergedcsv.index
#mergedcsv.columns = mergedcsv.columns.str.strip()
for ax, group_a in enumerate(idgroups):
    ward_lista = map(int, group_a)
    print ward_lista
    for bx, group_b in enumerate(idgroups[ax:], start=ax):
        ward_listb = map(int, group_b)
        #print ward_listb
        flow_ab = flows(mergedcsv, ward_lista, ward_listb)
        #flow_ab = flows(mergedcsv, group_a, group_b)
This results in KeyError: 'None of [[189, 197, 198, 201]] are in the [columns]'
I have also tried ward_lista = map(str, group_a) and map(int, group_a), but the list objects are not found by dataframe.loc.
The columns are mixed datatypes but all the columns containing the labels which should be sliced are of type int64.
I have tried many solutions around the datatypes but to no avail. Any suggestions?
I can't speak to the computations you're doing, but it seems like you just want to arrange combinations of groups. The question is whether they are directed or undirected; that is, do you need to compute both flows(A,B) and flows(B,A), or just one?
If just one, you could do this:
for i, ward_list in enumerate(idgroups):
    for j, ward_list2 in enumerate(idgroups[i:], start=i):
This would iterate over i,j pairs like:
0,0 0,1 0,2 ... 0,n
1,1 1,2 ... 1,n
2,2 ... 2,n
which would serve in the undirected case.
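The same pair sequence can also be produced with itertools.combinations_with_replacement (which your update already has in a commented-out line); a sketch:
import itertools
# undirected pairs with i <= j, equivalent to the nested enumerate above
for i, j in itertools.combinations_with_replacement(range(len(idgroups)), 2):
    ward_list = idgroups[i]
    ward_list2 = idgroups[j]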
If you need to compute both flows(A,B) and flows(B,A), then you can simply push your code into a function called flows, and call it with reversed args, as shown. ;-)
Update
Let's define a function called flows:
def flows():
    pass
Now, what are the parameters?
Well, looking at your code, it gets data from a DataFrame. And you want two different wards, so let's start with those. The result seems to be a sum of the resulting grid.
def flows(df, ward_a, ward_b):
    """Return the sum of all the cells in the row/column intersections
    of ward_a and ward_b."""
    return 0
Now I'm going to copy lines of your code:
ward_list = idgroups[index]
print ward_list
df6 = mergedcsv.loc[ward_list] #select rows with values in the list
dfcols = mergedcsv.loc[ward_list, :] #select columns with values in list
ward_liststr = map(str, ward_list) #convert ward_list to strings so that they can be used to select columns, won't work as integers.
ward_listint = map(int, ward_list)
#dfrowscols = mergedcsv.loc[ward_list, ward_listint]
df7 = df6.loc[:, ward_liststr]
print df7
regflowsum = df7.values.sum() #sum all values in dataframe
intflow = [regflowsum]
print intflow
I think this is most of the flow function right here. Let's look.
The ward_list will obviously be either the ward_a or ward_b parameters.
I'm not sure what df6 is, because you sort of recompute it in df7. So that needs to be clarified.
regflowsum is our desired output, I think.
Rewriting this into the function:
def flows(df, ward_a, ward_b):
    """Return the sum of all the cells in the row/column intersections
    of ward_a and ward_b."""
    print "Computing flows from:"
    print " ", ward_a
    print ""
    print "flows into:"
    print " ", ward_b
    # Filter rows by ward_a, cols by ward_b:
    grid = df.loc[ward_a, ward_b]
    print "Grid:"
    print grid
    flowsum = grid.values.sum()
    print "Flows:", flowsum
    return flowsum
Now, I have assumed that the ward_a and ward_b values are already in the correct format. So we'll have to str-ify them or whatever outside the function. Let's do that:
for ax, group_a in enumerate(idgroups):
    ward_a = map(str, group_a)
    for bx, group_b in enumerate(idgroups[ax:], start=ax):
        ward_b = map(str, group_b)
        flow_ab = flows(mergedcsv, ward_a, ward_b)
        if ax != bx:
            flow_ba = flows(mergedcsv, ward_b, ward_a)
        else:
            flow_ba = flow_ab
        # Now what?
At this point you have two numbers. They will be equal when the wards are the same (internal flow?). Here your original code stops being helpful, because it only deals with internal flows and not A->B flows, so I don't know what to do next. But the values are in the variables, so ...
