without any imports
# given
deps = {'W': ['R', 'S'], 'C': [], 'S': ['C'], 'R': ['C'], 'F': ['W']}
prob = {'C': [0.5], 'R': [0.2, 0.8], 'S': [0.5, 0.1], 'W': [0.01, 0.9, 0.9, 0.99], 'F' : [0.4, 0.3]}
k = 'F'
# want to return: L = [[0.2, 0.8], [0.5, 0.1], [0.01, 0.9, 0.9, 0.99], [0.4, 0.3]]
# attempt
# Asker's attempt: collect probability tables by following the dependencies
# of k. Relies on deps, prob and k defined above.
L = []
# Visit each direct dependency of the chosen key.
for i in deps[k]:
    s = i
    # Keep going while the current node still lists dependencies.
    while(deps[s] != []):
        L.append(prob[s])
        # NOTE: deps[s] is a list of keys, not a single key, so s becomes a
        # list here and the next deps[s] lookup raises TypeError (a list is
        # unhashable). This is why the attempt cannot follow dependencies.
        s = deps[s]
print(L)
I'm having trouble figuring this out. Given two dictionaries, dependencies (deps) and probabilities (prob), I wish to start from a selected key and collect the probability values along its dependencies; for the above example I chose 'F'.
It would first go into the deps of 'F' and find 'W', then check the deps of 'W', which are ['R', 'S']. It then checks 'R': the dependency of 'R' is 'C', and 'C' has no dependencies, so we stop at 'R' and append its probability to L.
[[0.2, 0.8]]
then we go into S and do the same thing
[[0.2, 0.8], [0.5, 0.1]]
then we're done with that and we're back at W
[[0.2, 0.8], [0.5, 0.1], [0.01, 0.9, 0.9, 0.99]]
and finally since we're done with W we get the prob dict of F
[[0.2, 0.8], [0.5, 0.1], [0.01, 0.9, 0.9, 0.99], [0.4, 0.3]]
My code fails when there's more than one dependent value, and I'm not sure how to wrap my head around that. I'm trying to write a function that will do this given deps, prob and the value of k.
I would solve the problem with a while loop that keeps looking to see if you've used all the values you've recursively found. You can use a structure like:
deps = {'W': ['R', 'S'], 'C': [], 'S': ['C'], 'R': ['C'], 'F': ['W']}
# out = ['F', 'W', 'R', 'S']
prob = {'C': [0.5], 'R': [0.2, 0.8], 'S': [0.5, 0.1], 'W': [0.01, 0.9, 0.9, 0.99], 'F': [0.4, 0.3]}
k = 'F'
L = []
my_list = []
found_all = False
def get_values(dep_dictionary, prob_dict, start_key):
    """Collect the probability tables of the keys linked to ``start_key``.

    Dependencies listed in ``dep_dictionary`` are followed transitively,
    starting at ``start_key``. Keys that have no dependencies of their own
    are dropped, so only keys with at least one dependency contribute a table.

    Args:
        dep_dictionary: Maps each key to the list of keys it depends on.
        prob_dict: Maps each key to its probability table.
        start_key: Key at which the traversal starts.

    Returns:
        The probability tables in discovery order (``start_key`` first).
        The list is also printed, together with one progress line pair per
        pass over the pending keys.

    Raises:
        KeyError: If a visited key is missing from ``dep_dictionary`` or a
            collected key is missing from ``prob_dict``.
    """
    used_keys = []
    keys_to_use = [start_key]
    # build a list of linked values from deps dictionary
    while used_keys != keys_to_use:
        print('used: {}'.format(used_keys))
        print('to use: {}'.format(keys_to_use))
        # Iterate over a snapshot: the body both appends to and removes from
        # keys_to_use. Deleting by index while looping over
        # range(len(keys_to_use)) raised IndexError whenever a key without
        # dependencies was not the last element of the list.
        for key in list(keys_to_use):
            if key in used_keys:
                continue
            new_keys = dep_dictionary[key]
            if new_keys:
                for sub_key in new_keys:
                    if sub_key not in keys_to_use:
                        keys_to_use.append(sub_key)
                used_keys.append(key)
            else:
                # No dependencies: this key contributes nothing, drop it.
                keys_to_use.remove(key)
    # For the example data, used_keys is now ['F', 'W', 'R', 'S'].
    probability = [prob_dict[key] for key in used_keys]
    print(probability)
    return probability
get_values(deps, prob, k)
Which outputs:
used: []
to use: ['F']
used: ['F']
to use: ['F', 'W']
used: ['F', 'W']
to use: ['F', 'W', 'R', 'S']
used: ['F', 'W', 'R', 'S']
to use: ['F', 'W', 'R', 'S', 'C']
[[0.4, 0.3], [0.01, 0.9, 0.9, 0.99], [0.2, 0.8], [0.5, 0.1]]
Where you can see the output is correct ([[0.4, 0.3], [0.01, 0.9, 0.9, 0.99], [0.2, 0.8], [0.5, 0.1]]), however it is not in the exact same order, but it doesn't sound like that should be a huge issue. If it is, you can always re-splice it into a dictionary by adjusting the
for key in used_keys:
probability.append(prob_dict[key])
bit such that probability is a dictionary also. You can also take the print() statements out, they were just there to debug and show visually what is going on within the loop. You also would probably have the function return probability instead of printing it, but I'll leave that to your discretion!
Here is a solution that uses a stack-based depth-first search to traverse the dependency tree. It adds probabilities at each step iff. the node has dependencies, and then simply reverses the list at the end.
def prob_list(root):
    """Return the probability tables found while walking down from ``root``.

    Uses an explicit stack for a depth-first walk over the module-level
    ``deps`` mapping. A node's table (from the module-level ``prob`` mapping)
    is recorded only when the node has dependencies. The recorded tables are
    returned in the reverse of the order in which they were visited.
    """
    pending = [root]
    collected = []
    while len(pending) > 0:
        node = pending.pop()
        print(f"Visiting {node}")
        children = deps[node]
        if children:
            collected.append(prob[node])
        # Pushing an empty dependency list is a no-op, so no guard is needed.
        pending.extend(children)
    return collected[::-1]
print(prob_list("F")) # [[0.2, 0.8], [0.5, 0.1], [0.01, 0.9, 0.9, 0.99], [0.4, 0.3]]
Related
I have the DataFrame:
df =
sample_type observed_data
A [0.2, 0.5, 0.17, 0.1]
A [0.9, 0.3, 0.24, 0.5]
A [0.9, 0.5, 0.6, 0.39]
B [0.01, 0.07, 0.15, 0.26]
B [0.08, 0.14, 0.32, 0.58]
B [0.01, 0.16, 0.42, 0.41]
where the data type in the observed_data column is np.array. What's the easiest and most efficient way of plotting each of the numpy arrays overlayed on the same plot using matplotlib and/or plotly and showing A and B as separate colors or line types (eg. dashed, dotted, etc.)?
You can use this...
# Overlay every observed_data array on one plot, styled by sample_type.
# Relies on pd, np and plt being imported by the surrounding code.
df = pd.DataFrame({
    'sample_type': ['A', 'A', 'A', 'B', 'B', 'B'],
    'observed_data': [
        [0.2, 0.5, 0.17, 0.1], [0.9, 0.3, 0.24, 0.5], [0.9, 0.5, 0.6, 0.39],
        [0.01, 0.07, 0.15, 0.26], [0.08, 0.14, 0.32, 0.58], [0.01, 0.16, 0.42, 0.41],
    ],
})
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for ind, cell in df['observed_data'].items():
    if len(cell) > 0:
        # Spread each array's points evenly over [0, 1] on the x axis.
        x_values = np.linspace(0, 1, len(cell))
        if df.loc[ind, 'sample_type'] == 'A':
            plt.plot(x_values, cell, color='blue', marker='o', linestyle='-.')
        else:
            plt.plot(x_values, cell, color='red', marker='*', linestyle=':')
plt.show()
Using plotnine in python, I'd like to add dashed horizontal lines to my plot (a scatterplot, but preferably an answer compatible with other plot types) representing the mean for every color separately. I'd like to do so without manually computing the mean values myself or adapting other parts of the data (e.g. adding columns for color values etc).
Additionally, the original plot is generated via a function (make_plot below) and the mean lines are to be added afterwards, yet need to have the same color as the points from which they are derived.
Consider the following as a minimal example;
import pandas as pd
import numpy as np
from plotnine import *
df = pd.DataFrame( { 'MSE': [0.1, 0.7, 0.5, 0.2, 0.3, 0.4, 0.8, 0.9 ,1.0, 0.4, 0.7, 0.9 ],
'Size': ['S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL'],
'Number': [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] } )
def make_plot(df, var_x, var_y, var_fill):
    """Return a plotnine scatter plot of 'MSE' against 'Number', filled by 'Size'.

    NOTE: var_x, var_y and var_fill are accepted but not used; the column
    names are fixed in the aesthetic mapping below.
    """
    mapping = aes(x='Number', y='MSE', fill='Size')
    scatter = ggplot(df) + mapping
    scatter = scatter + geom_point()
    return scatter
plot = make_plot(df, 'Number', 'MSE', 'Size')
I'd like to add 4 lines, one for each Size. The exact same can be done in R using ggplot, as shown by this question. Adding geom_line(stat="hline", yintercept="mean", linetype="dashed") to plot however results in an error PlotnineError: "'stat_hline' Not in Registry. Make sure the module in which it is defined has been imported." that I am unable to resolve.
Answers that can resolve the aforementioned issue, or propose another working solution entirely, are greatly appreciated.
You can do it by first defining the means as a vector and then passing it to your function:
import pandas as pd
import numpy as np
from plotnine import *
from random import randint
df = pd.DataFrame( { 'MSE': [0.1, 0.7, 0.5, 0.2, 0.3, 0.4, 0.8, 0.9 ,1.0, 0.4, 0.7, 0.9 ],
'Size': ['S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL'],
'Number': [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] } )
# Mean MSE for each Size, as a plain list (one entry per Size group).
a = list(df.groupby(['Size'])['MSE'].mean())


def make_plot(df, var_x, var_y, var_fill):
    """Return the scatter plot with one dashed horizontal line per mean in ``a``.

    NOTE: var_x, var_y and var_fill are accepted but not used; the column
    names are fixed, and the line positions come from the module-level ``a``.
    """
    scatter = ggplot(df) + aes(x='Number', y='MSE', fill='Size') + geom_point()
    mean_lines = geom_hline(yintercept=a, linetype="dashed")
    return scatter + mean_lines
plot = make_plot(df, 'Number', 'MSE', 'Size')
which gives:
Note that two of the lines coincide:
a = [0.6666666666666666, 0.5, 0.4666666666666666, 0.6666666666666666]
To add different colors to each dashed line, you can do this:
from random import randint

import numpy as np
import pandas as pd
from plotnine import *
df = pd.DataFrame( { 'MSE': [0.1, 0.7, 0.5, 0.2, 0.3, 0.4, 0.8, 0.9 ,1.0, 0.4, 0.7, 0.9 ],
'Size': ['S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL', 'S', 'M', 'L', 'XL'],
'Number': [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] } )
### Generate a list of random hex colors, one per category (Size)
n = len(set(df.Size))
color = ['#%06X' % randint(0, 0xFFFFFF) for _ in range(n)]
######################################################


def make_plot(df, var_x, var_y, var_fill):
    """Return the scatter plot with a dashed, colored mean line per Size.

    The line positions are the mean MSE of each Size group; the line colors
    come from the module-level ``color`` list built above.

    NOTE: var_x, var_y and var_fill are accepted but not used; the column
    names are fixed.
    """
    means = list(df.groupby(['Size'])['MSE'].mean())
    scatter = ggplot(df) + aes(x='Number', y='MSE', fill='Size') + geom_point()
    # Pass the color list defined above; the previous code passed an
    # undefined name `b`, which raised NameError when the function was called.
    return scatter + geom_hline(yintercept=means, linetype="dashed", color=color)
plot = make_plot(df, 'Number', 'MSE', 'Size')
which returns:
Here's an example of my dataframe:
d = {'group': ['a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'c', 'd', 'd'], \
'round': [3, 3, 2, 1, 3, 1, 3, 3, 3, 2, 1], \
'score': [0.3, 0.1, 0.6, 0.8, 0.2, 0.5, 0.5, 0.6, 0.4, 0.9, 0.1]}
df = pd.DataFrame(d)
df
group round score
0 a 3 0.3
1 a 3 0.1
2 a 2 0.6
3 b 1 0.8
4 b 3 0.2
5 b 1 0.5
6 b 3 0.5
7 b 3 0.6
8 c 3 0.4
9 d 2 0.9
10 d 1 0.1
My actual dataframe has 6 columns and > 1,000,000 rows. I'm trying to figure out the fastest way to do the following:
For each group find the average of scores and perform some calculation with it for each of 3 rounds. If there are no scores, write 'NA'.
I'm not sure if it would be faster to make a list of lists and then convert it into a dataframe or make a new dataframe and populate that, so i went with the list first:
def test_df(data):
value_counts = data['group'].value_counts().to_dict()
avgs = []
for key, val in value_counts.items():
row = data[data['group'] == key]
x = [key]
if val < 2:
x.extend([10 * row['score'].values[0] + 1 if i == row['round'].values[0] else 'NA' for i in range (1,4)])
else:
x.extend([(10 * row[row['round'] == i]['score'].mean() + 1) if len(row[row['round'] == i]) > 0 else 'NA' for i in range(1, 4)])
avgs.append(x)
return avgs
Here I created a separate case because about 80% of groups in my data only have one row, so I figured it might speed things up maybe?
this returns the correct results in format [group, round 1, round 2, round 3]
[['b', 7.5, 'NA', 5.333333333333333],
['a', 'NA', 7.0, 3.0],
['d', 2.0, 10.0, 'NA'],
['c', 'NA', 'NA', 5.0]]
but it's looking like it's going to take a really really long time on the actual dataframe...
Does anyone have any better ideas?
It looks to me like you're basically doing a groupby/mean and a pivot.
import pandas as pd
d = {
    'group': ['a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'c', 'd', 'd'],
    'round': [3, 3, 2, 1, 3, 1, 3, 3, 3, 2, 1],
    'score': [0.3, 0.1, 0.6, 0.8, 0.2, 0.5, 0.5, 0.6, 0.4, 0.9, 0.1],
}
df = pd.DataFrame(d)
# Mean score per (group, round), scaled as 10 * mean + 1, back to flat columns.
df = df.groupby(['group', 'round'])['score'].mean().mul(10).add(1).reset_index()
# One row per group, one column per round; missing combinations become 'NA'.
df.pivot_table(index='group', columns='round', values='score', fill_value='NA').reset_index().values
Output
array([['a', 'NA', 7.0, 3.0],
['b', 7.5, 'NA', 5.333333333333333],
['c', 'NA', 'NA', 5.0],
['d', 2.0, 10.0, 'NA']], dtype=object)
An imbalanced dataset may show different results, but I tested with the script below and found that even with a pandas DataFrame the performance is okay. However, you can always compare it with a native Python data structure.
import random
import datetime
import pandas as pd
def generate_data(size=10 ** 6):  # augmentation
    """Generate random benchmark records in column-oriented form.

    Args:
        size: Number of records to generate. Defaults to one million, the
            sample size that was previously hard-coded.

    Returns:
        A dict with the keys 'group', 'round' and 'score', each holding a
        list of ``size`` values: a group letter from 'a'-'d', a round from
        1 to 3, and a random score rounded to one decimal place.
    """
    data = {'group': [], 'round': [], 'score': []}
    for _ in range(size):
        data['group'].append(random.choice(['a', 'b', 'c', 'd']))
        data['round'].append(random.randrange(1, 4))
        data['score'].append(round(random.random(), 1))
    return data
def calc_with_native_ds(data):  # native python data structure
    """Compute the mean score per (group, round) with built-in containers only.

    Pure-Python counterpart of the pandas groupby/mean, for comparing timings.

    Args:
        data: Column-oriented dict with equally long 'group', 'round' and
            'score' lists, as produced by generate_data.

    Returns:
        A dict mapping each (group, round) pair that occurs in ``data`` to
        the mean of its scores. Empty input gives an empty dict.
    """
    sums = {}
    counts = {}
    for group, round_, score in zip(data['group'], data['round'], data['score']):
        key = (group, round_)
        sums[key] = sums.get(key, 0.0) + score
        counts[key] = counts.get(key, 0) + 1
    return {key: sums[key] / counts[key] for key in sums}
def calc_with_pandas_df(df):  # pandas dataframe
    """Return the per-(group, round) mean of the remaining columns of ``df``."""
    grouped = df.groupby(by=['group', 'round'])
    return grouped.mean()
if __name__ == '__main__':
    import time  # local import: only this timing code needs it

    data = generate_data()
    df = pd.DataFrame(data)
    print(df.shape)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; datetime.now() is wall-clock time and can jump if the system
    # clock is adjusted during the run.
    start = time.perf_counter()
    # calc_with_native_ds(data)
    calc_with_pandas_df(df)
    elapsed_time = round(time.perf_counter() - start, 5)
    print(f"elapsed_time: {elapsed_time}")
Say I have an array of values array = [0.0, 0.2, 0.5, 0.8, 1.0], and I want to pair adjacent values into a secondary list paired_array = [[0.0, 0.2], [0.2, 0.5], [0.5, 0.8], [0.8, 1.0]], is there an easy way of doing that in numpy?
For context, the pairs represent probability ranges which I will be using to randomise the values in a numpy array of type string. For example string_array = ['Fe', 'Pt', 'Fe', 'Pt', 'Fe', 'Pt', 'Fe', 'Pt'] may become something like randomised_array = ['Pt', 'Fe', 'Pt', 'Pt', 'Pt', 'Pt', 'Fe', 'Fe']. The ranges represent the probability a value is 'Pt' or 'Fe' in this case.
TRY:
from numpy.lib.stride_tricks import sliding_window_view
array = [0.0, 0.2, 0.5, 0.8, 1.0]
# Each row of the result is one pair of adjacent values (a window of length 2).
result = sliding_window_view(array, window_shape=2)
OUTPUT:
array([[0. , 0.2],
[0.2, 0.5],
[0.5, 0.8],
[0.8, 1. ]])
My code looks like:
{'Bug Out Bag': ['q', 0.25, 100], 'XM': ['q', 0.25, 10]}
print('{}'.format(float(addition1)+float(addition2)) )
I'm getting an error of :
addition1 = coins_in_the_bag[0]
KeyError: 0
Thanks!
coins_in_the_bag = {'Bug Out Bag': ['q', 0.25, 100], 'XM': ['q', 0.25, 10]}
coins_in_the_bag is a dict. Its contents are accessed using keys such as
>>> coins_in_the_bag['Bug Out Bag']
# ['q', 0.25, 100]
>>> coins_in_the_bag['XM']
# ['q', 0.25, 10]
Also, coins_in_the_bag['Bug Out Bag'] returns a list. Be careful to call float on a number inside it (for example coins_in_the_bag['Bug Out Bag'][1]), not on the whole list.