I am having trouble dynamically binning my dataset for further calculation. My goal is to generate specific bins/labels for each individual row in my dataframe, based on a function, and have the corresponding label assigned to the column 'action'.
My dataset is:
id  value1  value2  type  length  amount
1   0.9     1.0     X     10      ['A', 'B']
2   2.0     1.6     Y     80      ['A']
3   0.3     0.5     X     29      ['A', 'C']
The function is as follows:
import numpy as np

def bin_label_generator(amount):
    if amount < 2:
        amount = 2
    lower_bound = 1.0 - (1.0 / amount)
    mid_bound = 1.0
    upper_bound = 1.0 + (1.0 / amount)
    thresholds = {
        'bins': [-np.inf, lower_bound, mid_bound, upper_bound, np.inf],
        'labels': [0, 1.0, 2.0, 3.0]
    }
    return thresholds
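For illustration, a row whose 'amount' list holds two items would get:

bin_label_generator(2)
# {'bins': [-inf, 0.5, 1.0, 1.5, inf], 'labels': [0, 1.0, 2.0, 3.0]}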
This is my current code, but it requires me to specify a row index in order to cut. I want this to happen automatically, using the bins/labels dictionary generated for each row itself.
# filter on type
filter_type_series = df['type'].str.contains('X')
# get the number of items in each 'amount' list
amount_series = df[filter_type_series]['amount'].str.len()
# generate bins for each row in the series
bins_series = amount_series.apply(bin_label_generator)
# get the max values to use for binning
max_values = df[filter_type_series].loc[:, ['value1', 'value2']].abs().max(1)
# the following line requires a row index, which is what I do not want
df['action'] = pd.cut(max_values, bins=bins_series[0]['bins'], labels=bins_series[0]['labels'])
I found a fix myself by iterating over every single row in the filtered series and then writing the result into the 'action' column of the actual df:
type_to_filter = 'X'
first_df = df.copy()
type_series = df['type'].str.contains(type_to_filter)

# loop over every row to dynamically use pd.cut with bins/labels from the specific row
for index, row in df[type_series].iterrows():
    # get the max value from the two value columns
    max_val = row[['value1', 'value2']].abs().max()
    # get the number of items (cables) in the 'amount' list
    amount = len(row['amount'])
    # get bins and labels for this specific row
    bins_label_dict = bin_label_generator(amount)
    bins = bins_label_dict['bins']
    labels = bins_label_dict['labels']
    # assign the label belonging to the max value to this row
    first_df.loc[index, 'action'] = pd.cut([max_val], bins=bins, labels=labels)[0]
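A slightly more compact variant of the same per-row idea (just a sketch using the example columns above; label_row is a hypothetical helper, not part of the original code) wraps the logic in a function and lets DataFrame.apply do the iteration:

def label_row(row):
    # build this row's bins/labels from the length of its 'amount' list
    thresholds = bin_label_generator(len(row['amount']))
    max_val = row[['value1', 'value2']].abs().max()
    # cut the single max value and return the resulting label
    return pd.cut([max_val], bins=thresholds['bins'], labels=thresholds['labels'])[0]

mask = df['type'].str.contains('X')
df.loc[mask, 'action'] = df[mask].apply(label_row, axis=1)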
I need to insert rows based on the column 'week', grouped by 'type'. In some cases there are missing weeks in the middle of the dataframe at different positions, and I want to fill them in with copies of the last existing row: in this case, copies of week 7 to fill in weeks 8 and 9, and copies of week 11 to fill in weeks 12, 13 and 14 (the table jumps from week 7 to 10 and from 11 to 15).
The perfect output would be the final table with incremental values in the 'week' column, filled in the correct way.
Below is the code I have; it inserts only one row and I'm confused why:
def middle_values(final: DataFrame) -> DataFrame:
    finaltemp = pd.DataFrame()
    out = pd.DataFrame()
    for i in range(0, len(final)):
        for f in range(1, 52, 1):
            if final.iat[i, 8] == f and final.iat[i-1, 8] != f-1:
                if final.iat[i, 8] > final.iat[i-1, 8] and final.iat[i, 8] != (final.iat[i-1, 8] - 1):
                    line = final.iloc[i-1]
                    c1 = final[0:i]
                    c2 = final[i:]
                    c1.loc[i] = line
                    concatinated = pd.concat([c1, c2])
                    concatinated.reset_index(inplace=True)
                    concatinated.iat[i, 11] = concatinated.iat[i-1, 11]
                    concatinated.iat[i, 9] = f-1
                    finaltemp = finaltemp.append(concatinated)
    if 'type' in finaltemp.columns:
        for name, groups in finaltemp.groupby(["type"]):
            weeks = range(groups['week'].min(), groups['week'].max()+1)
            out = out.append(pd.merge(finaltemp, pd.Series(weeks, name='week'), how='right').ffill())
        out.drop_duplicates(subset=['project', 'week'], keep='first', inplace=True)
        out.drop_duplicates(inplace=True)
        out.sort_values(["Budget: Budget Name", "Budget Week"], ascending=(False, True), inplace=True)
        out.drop(['level_0'], axis=1, inplace=True)
        out.reset_index(inplace=True)
        out.drop(['level_0'], axis=1, inplace=True)
        return out
    else:
        return final
For the first part of your question, suppose we have a dataframe like the following:
df = pd.DataFrame({"project": [1, 1, 1, 2, 2, 2], "week": [1, 3, 4, 1, 2, 4], "value": [12, 22, 18, 17, 18, 23]})
We can create a new MultiIndex to get the additional rows that we need:
new_index = pd.MultiIndex.from_arrays(
    [sorted(list(df['project'].unique()) * 52),
     list(np.arange(1, 53, 1)) * df['project'].unique().shape[0]],
    names=['project', 'week'])
We can then apply this index to get the new dataframe that you need, with blanks (NaN) in the new rows:
df = df.set_index(['project', 'week']).reindex(new_index).reset_index().sort_values(['project', 'week'])
You would then need to apply a forward fill (using ffill) or a back fill (using bfill) with groupby and transform to get the required values in the rows that you need.
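For instance, a minimal sketch of that fill step on the toy frame above, carrying the last known 'value' forward within each project:

# forward-fill each project's 'value' within its own group
df['value'] = df.groupby('project')['value'].transform(lambda s: s.ffill())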
I have a dataframe as follows, only with more rows:
import pandas as pd

data = {'First': ['First value', 'Second value', 'Third value'],
        'Second': [['old', 'new', 'gold', 'door'], ['old', 'view', 'bold', 'door'], ['new', 'view', 'world', 'window']]}
df = pd.DataFrame(data, columns=['First', 'Second'])
To calculate the Jaccard similarity, I found this piece online (not my solution):
def lexical_overlap(doc1, doc2):
    words_doc1 = set(doc1)
    words_doc2 = set(doc2)
    intersection = words_doc1.intersection(words_doc2)
    union = words_doc1.union(words_doc2)
    return float(len(intersection)) / len(union) * 100
What I would like to get as a result is for the measure to take each row of the 'Second' column as a doc, compare each pair iteratively, and output a measure together with the row names from the 'First' column, something like this:
First value and Second value = 80
First value and Third value = 95
Second value and Third value = 90
Well, I'd do it somewhat like this:
from itertools import combinations

for val in combinations(range(len(df)), 2):
    firstlist = df.iloc[val[0], 1]
    secondlist = df.iloc[val[1], 1]
    value = round(lexical_overlap(firstlist, secondlist), 2)
    print(f"{df.iloc[val[0], 0]} and {df.iloc[val[1], 0]}'s value is: {value}")
Output:
First value and Second value's value is: 33.33
First value and Third value's value is: 14.29
Second value and Third value's value is: 14.29
Since your data is not big, you can try broadcasting with a slightly different approach:
# dummies for each row
s = pd.get_dummies(df.Second.explode()).sum(level=0).values
# pair-wise Jaccard: intersection counts over union counts
(s @ s.T) / (s | s[:, None, :]).sum(-1) * 100
Output:
array([[100.        ,  33.33333333,  14.28571429],
       [ 33.33333333, 100.        ,  14.28571429],
       [ 14.28571429,  14.28571429, 100.        ]])
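If you want the row names from 'First' attached to that matrix, a small follow-up sketch reusing s from above:

jac = (s @ s.T) / (s | s[:, None, :]).sum(-1) * 100
pd.DataFrame(jac, index=df['First'], columns=df['First'])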
Here's an example of how I managed to extract the numerical values from dimensions and multiply them to return the volume:
import pandas as pd
# create a dict
d = {'model': ['merc','ford'], 'dimensions': ['4.31 m x 2 m x 3.222 m', '2 m']}
# create data frame from dict
df = pd.DataFrame(data=d)
# this extracts all instances of numbers but creates a new data frame with each number in a new row
x = df['dimensions'].str.extractall(r'(\d*\.?\d+)')
# converts all numeric strings to float
x[0] = x[0].astype(float)
# multiplies the dimensions of the first vehicle
y = x.loc[0].prod(axis=0)
print(y)
Here's my attempted function to repeat the example from the code above, but returning the result to a new column in the data frame.
def my_function(col, row):
    out = 0
    if col.str.extractall(r'(\d*\.?\d+)') == True:
        out = col.str.extractall(r'(\d*\.?\d+)')
        col[0] = col[0].astype(float)
    z = col.loc[row].prod(axis=0)
    return z

# logic to create a new column based on the function and existing data
df['volume'] = df.apply(lambda x: my_function(df['dimensions'], df.index))
Could somebody please help me get this volume data back into the original dataframe as a new column?
IIUC, you could try:
df['volume'] = df['dimensions'].str.extractall(r'(\d*\.?\d+)').astype(float).unstack().prod(axis=1)
Output:
model dimensions volume
0 merc 4.31 m x 2 m x 3.222 m 27.77364
1 ford 2 m 2.00000
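(Note that .prod(axis=1) skips NaN by default, which is why the 'ford' row, with only one number extracted, still comes out as 2.0 rather than NaN.)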
I have several hundred pandas dataframes, and the number of rows is not exactly the same in all of them: some have 600 rows, others only 540.
I have two samples with exactly the same number of dataframes, and I want to read all the dataframes (around 2000) from both samples. This is how the data looks, and this is how I read the files:
5113.440 1 0.25846 0.10166 27.96867 0.94852 -0.25846 268.29305 5113.434129
5074.760 3 0.68155 0.16566 120.18771 3.02654 -0.68155 101.02457 5074.745627
5083.340 2 0.74771 0.13267 105.59355 2.15700 -0.74771 157.52406 5083.337081
5088.150 1 0.28689 0.12986 39.65747 2.43339 -0.28689 164.40787 5088.141849
5090.780 1 0.61464 0.14479 94.72901 2.78712 -0.61464 132.25865 5090.773443
import os
import pandas as pd

# first sample
path_to_files = '/home/Desktop/computed_2d_blaze/'
lst = []
for filen in [x for x in os.listdir(path_to_files) if '.ares' in x]:
    df = pd.read_table(path_to_files+filen, skiprows=0, usecols=(0,1,2,3,4,8),
                       names=['wave','num','stlines','fwhm','EWs','MeasredWave'], delimiter=r'\s+')
    df = df.sort_values('stlines', ascending=False)
    df = df.drop_duplicates('wave')
    df = df.reset_index(drop=True)
    lst.append(df)

# second sample
path_to_files1 = '/home/Desktop/computed_1d/'
lst1 = []
for filen in [x for x in os.listdir(path_to_files1) if '.ares' in x]:
    df1 = pd.read_table(path_to_files1+filen, skiprows=0, usecols=(0,1,2,3,4,8),
                        names=['wave','num','stlines','fwhm','EWs','MeasredWave'], delimiter=r'\s+')
    df1 = df1.sort_values('stlines', ascending=False)
    df1 = df1.drop_duplicates('wave')
    df1 = df1.reset_index(drop=True)
    lst1.append(df1)
Now the data is stored in lists, and since the number of rows is not the same in all the dataframes I can't subtract them directly.
So how can I subtract them correctly? And after that I want to take the average (mean) of the residuals to make a dataframe.
You shouldn't use apply. Just use Boolean masking:
mask = df['waves'].between(lower_outlier, upper_outlier)
df[mask].plot(x='waves', y='stlines')
One solution that comes to mind is writing a function that finds outliers based on upper and lower bounds, and then slicing the data frames based on the outlier index, e.g.:
df1 = pd.DataFrame({'wave': [1, 2, 3, 4, 5]})
df2 = pd.DataFrame({'stlines': [0.1, 0.2, 0.3, 0.4, 0.5]})

def outlier(value, upper, lower):
    """
    Find outliers based on upper and lower bound
    """
    # Check if input value is within bounds
    in_bounds = (value <= upper) and (value >= lower)
    return in_bounds

# Find outliers in the wave column of df1
outlier_index = df1.wave.apply(lambda x: outlier(x, 4, 1))
# Return df2 without values at the outlier index
df2[outlier_index]
# Return df1 without values at the outlier index
df1[outlier_index]
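As for the subtraction itself, one possible sketch (not taken from the answers above) is to align each pair of dataframes on their shared 'wave' column, so the differing row counts no longer matter, and then average the residuals:

residual_means = []
for df_2d, df_1d in zip(lst, lst1):
    # keep only the rows whose 'wave' appears in both frames
    merged = pd.merge(df_2d, df_1d, on='wave', suffixes=('_2d', '_1d'))
    residual = merged['stlines_2d'] - merged['stlines_1d']
    residual_means.append(residual.mean())

# one mean residual per pair of files
residual_means = pd.Series(residual_means)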
I have a large dataset stored as a pandas Panel. I would like to count the occurrence of values < 1.0 on the minor_axis for each item in the panel. What I have so far:
import numpy as np
import pandas as pd

#%% Creating the first DataFrame
dates1 = pd.date_range('2014-10-19', '2014-10-20', freq='H')
df1 = pd.DataFrame(index=dates1)
n1 = len(dates1)
df1.loc[:, 'a'] = np.random.uniform(3, 10, n1)
df1.loc[:, 'b'] = np.random.uniform(0.9, 1.2, n1)

#%% Creating the second DataFrame
dates2 = pd.date_range('2014-10-18', '2014-10-20', freq='H')
df2 = pd.DataFrame(index=dates2)
n2 = len(dates2)
df2.loc[:, 'a'] = np.random.uniform(3, 10, n2)
df2.loc[:, 'b'] = np.random.uniform(0.9, 1.2, n2)

#%% Creating the panel from both DataFrames
dictionary = {}
dictionary['First_dataset'] = df1
dictionary['Second dataset'] = df2
P = pd.Panel.from_dict(dictionary)

#%% I want to count the number of values < 1.0 for all datasets in the panel,
## only for minor axis 'b' (not minor axis 'a'), stored separately for each dataset
for dataset in P:
    P.loc[dataset, :, 'b']  # I need to count the number of values < 1.0 in this Series
To count all the "b" values < 1.0, I would first isolate b in its own DataFrame by swapping the minor axis and the items.
In [43]: b = P.swapaxes("minor","items").b
In [44]: b.where(b<1.0).stack().count()
Out[44]: 30
Thanks for thinking with me guys, but I managed to figure out a surprisingly easy solution after many hours of attempting. I thought I should share it in case someone else is looking for a similar solution.
for dataset in P:
    abc = P.loc[dataset, :, 'b']
    abc_low = sum(i < 1.0 for i in abc)
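Since pd.Panel has since been deprecated and removed from pandas, the same per-dataset count can also be taken straight from the dict of DataFrames built above (a small sketch):

# count values < 1.0 in column 'b' for each dataset, without using a Panel
counts = {name: (frame['b'] < 1.0).sum() for name, frame in dictionary.items()}
print(counts)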