Why is entropy more than 1 in Python?

I have a graph degree list degrees of size 10000 (e.g. [1, 14, 4, 14, 6, 1, ...]). I am trying to calculate the entropy of this list as follows:
Firstly, I am finding probability of each unique value in list:
uniqueDegreeList = list(set(degrees))
a = 0
for i in uniqueDegreeList:
    p = degrees.count(i) / len(degrees)
    print(p)
    a += p
Output:
0.5054494550544946
0.24577542245775422
0.12188781121887811
0.06379362063793621
0.031596840315968405
0.0150984901509849
0.007799220077992201
0.0034996500349965005
0.0024997500249975004
0.0010998900109989002
0.0008999100089991
0.00039996000399960006
0.00019998000199980003
And:
print(a)
>> 1.0
This part is working. Then I am trying to find entropy of list:
import math

S = 0
for i in uniqueDegreeList:
    p = degrees.count(i) / len(degrees)
    S -= p * math.log(p, 2)
And when I print S I get 1.99. Entropy should not be more than 1, so why do I get 1.99?
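For reference, the same calculation can be written in a single pass with collections.Counter, which avoids calling degrees.count() once per unique value (this is just an equivalent formulation of the loops above, not a fix for the value itself):

import math
from collections import Counter

def shannon_entropy(values):
    n = len(values)
    counts = Counter(values)  # frequency of each unique value
    return -sum((c / n) * math.log(c / n, 2) for c in counts.values())

# S = shannon_entropy(degrees)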

Related

Spacing points per decade for logarithmic plot

I'm trying to space out a number of points in between a start and end frequency.
In a way you can see here down below:
Startfreq = 1 Hz ( variable )
Stopfreq = 5402 Hz ( also variable )
stepsperdecade
How I want it to look:
1 - 2 - 3 - 4.. 10 - 20 - 30..100 - 200 - 300.. 1000 - 2000 - 3000 - 4000 - 5000 - 5402
OR
1 - steps based on the stepsperdecade - 10 - steps based on the stepsperdecade - 100 .. 1000 - steps based on the stepsperdecade 5402.
So I want the spacing to be the same until it reaches the end frequency.
I tried to do it the following way in Python.
from math import log10
import numpy as np
startfreq = 1
endfreq = 10000
points_per_decade = 10
numberdecades = log10(endfreq) - log10(startfreq)
print(numberdecades)
points = int(numberdecades) * points_per_decade
points = np.logspace(log10(startfreq), log10(endfreq), num=points, endpoint=True, base=10)
print(points)
But this way doesn't give me the 10 - 100 - 1000 I want in between the steps.
Would anyone know, or could someone hint me in the right direction?
I don't know if this works for you, but using some basic maths I created this while loop snippet:
from math import log10

startfreq = 1
endfreq = 5402
points_per_decade = 10

points = [startfreq]
ndig = int(log10(startfreq))
point = startfreq - startfreq % 10 ** ndig + 10 ** ndig
while point < endfreq:
    points.append(point)
    ndig = int(log10(point))
    point = round(point + 10 ** ndig, ndigits=-ndig)
points.append(endfreq)
print(points)
I edited the answer to fix certain values: for example, startfreq = 175 should produce 200 as the next value and then continue in steps of +100: [175, 200, 300, ...]
You could do this comfortably with numpy arrays, by taking an outer product:
import numpy as np

exponents = np.arange(0, 4)    # -> [0, 1, 2, 3]
prefactors = np.arange(1, 10)  # -> [1, 2, ..., 9]
factor_matrix = np.outer(10**exponents, prefactors)
This will give you what you want in matrix form:
[[1, 2, ..., 9],
 [10, 20, ..., 90],
 ...,
 [1000, 2000, ..., 9000]]
Of course, you want a flat array that stops before the end frequency (endpoint = 5402 here), with the end point appended manually:
endpoint = 5402
flattened_array = factor_matrix.flatten()
flattened_array = flattened_array[flattened_array < endpoint]
flattened_array = np.append(flattened_array, endpoint)
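Putting both steps together, a small sketch (not part of the original answer) that assumes the start frequency is a positive power of ten:

import numpy as np

def decade_points(startfreq, endfreq):
    # One point per prefactor 1..9 in every decade covering the range,
    # then clip to the end frequency and append it explicitly.
    exponents = np.arange(int(np.log10(startfreq)), int(np.ceil(np.log10(endfreq))))
    prefactors = np.arange(1, 10)
    points = np.outer(10.0 ** exponents, prefactors).flatten()
    points = points[points < endfreq]
    return np.append(points, endfreq)

print(decade_points(1, 5402))  # 1, 2, ..., 9, 10, 20, ..., 5000, 5402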

Pymoo generating candidates with nan parameters

I'm running a multi-objective optimisation with Pymoo (0.5.0) using NSGA-III and within my population of new candidates some of the generated candidates have nan parameters. This results in my evaluate function (which is a call to a neural network) returning nan. The optimisation is running and producing desired results but I'd like to know why some of the candidate parameters are nan. Here is the code for the problem.
Problem setup:
opt_name = "sopt00001KaRxD60fLn2"
pop_size = 165
n_gen = 350
cross_over_pb = 0.9
mutation_pb = 0.1
# Fixed params
band ="KaRx"
arc = "RevF"
source_spec = "sweep99p1"
lens_diameter = 60.0
source_z = 5.0
r_lam = 0.1
use_original_transformation = 0 # false
source_x0 = 0.0
target_scans = [0, 70, 50]
# Optimisation param ranges
lens_material_delta_n = [1.5, 3.6]
lens_thick = [5, 35]
lens_radii_back = [39, 22500]
lens_radii_front = [39, 22500]
source_width = [2, 20]
source_x = [12, 20]
params_lower_lim = [lens_thick[0], lens_radii_front[0], lens_radii_back[0], source_width[0], source_x[0], source_x[0],
                    lens_material_delta_n[0], -1, -1, -1, 0, -1, -1, -1, 0]
params_upper_lim = [lens_thick[1], lens_radii_front[1], lens_radii_back[1], source_width[1], source_x[1], source_x[1],
                    lens_material_delta_n[1], 1, 1, 1, 1, -1, -1, -1, 1]
n_var = len(params_lower_lim)
assert n_var == len(params_upper_lim), "Upper and lower parameter limits are not equal length!"
# Other required params
if band == "KaRx":
    freq_center = 19.45
    freq_width = 3.5
Evaluate function:
class ProblemWrapper(Problem):

    def _evaluate(self, params, out, *args, **kwargs):
        res = []
        for param in params:
            source_x70 = source_x_f(param[4], param[5], source_x, 50, r_lam, target_scans, freq_center, freq_width)
            source_x50 = source_x_f(param[4], param[5], source_x, 70, r_lam, target_scans, freq_center, freq_width)
            res.append(smeep(band,
                             lens_diameter, param[0],
                             param[1], param[2],
                             param[3],
                             source_x0, source_x70, source_x50,
                             source_z,
                             param[6], param[7], param[8], param[9], param[10], param[11], param[12], param[13], param[14],
                             r_lam, use_original_transformation,
                             arc,
                             source_spec,
                             target_scans))
        out['F'] = np.array(res)
Algorithm settings:
ref_dirs = get_reference_directions("das-dennis", 3, n_partitions=12)

problem = ProblemWrapper(n_var=n_var,
                         n_obj=len(target_scans),
                         xl=params_lower_lim,
                         xu=params_upper_lim)

algorithm = NSGA3(
    pop_size=pop_size,
    ref_dirs=ref_dirs,
    sampling=get_sampling("real_random"),
    cross_over=get_crossover("real_sbx", prob=cross_over_pb),
    mutation=get_mutation("real_pm", prob=mutation_pb)
)
Execution:
res = minimize(problem=problem,
               algorithm=algorithm,
               termination=("n_gen", n_gen),
               save_history=True,
               verbose=True
               )
It looks like the only affected parameters are the poly6 (param[11]), poly7 (param[12]) and poly8 (param[13]) terms, and it differs from candidate to candidate. I confess I have not tried any different crossover or mutation schemes, but these seemed the best from the documentation.
Thanks in advance!
The nan values arise because the limits for your parameters 11, 12 and 13 are equal (-1 and -1 in all cases).
If you look at the code for the polynomial mutation (real_pm), you have the following lines:
delta1 = (X - xl) / (xu - xl)
delta2 = (xu - X) / (xu - xl)
where xu and xl are the upper and lower bounds of the parameters. In your case, that would cause a divide-by-0.
Since the limits are the same (if that is indeed intended), those parameters are effectively not part of the optimisation and you should remove them from the list.
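To illustrate, a hypothetical minimal reproduction (not the pymoo source itself): when the upper and lower bound coincide, the current value is necessarily equal to both, so the expression above becomes 0/0, which NumPy evaluates to nan, and the nan then propagates into the mutated candidate:

import numpy as np

xl = np.array([-1.0])  # lower bound of the fixed parameter
xu = np.array([-1.0])  # upper bound equal to the lower bound
X = np.array([-1.0])   # current value, necessarily equal to the bounds

with np.errstate(invalid="ignore"):
    delta1 = (X - xl) / (xu - xl)
print(delta1)  # [nan]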

How can we extract the maximum value from the neighborhood of a specified coordinate?

How can I extract the maximum value from four points in the neighborhood of a specified coordinate?
import xarray as xr
import numpy as np
lat = [0, 10, 20]
lon = [50, 60, 70, 80]
#sample data
test_data = np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12]])
#to xarray
data_xarray = xr.DataArray(test_data, dims=("lat","lon"), coords={"lat":lat, "lon":lon})
#<xarray.DataArray (lat: 3, lon: 4)>
#array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
#Coordinates:
# * lat (lat) int64 0 10 20
# * lon (lon) int64 50 60 70 80
data_xarray.plot()
What I want to implement
When 5.5 and 52 are specified for lat and lon respectively, extract 10, the maximum value of the four surrounding points.
Labeled indexing using sel supports nearest neighbour lookup.
You can use that to look up the four values of interest, reconcatenate them and then compute the max:
lat_search = 5.5
lon_search = 52

# Select the four nearest values
llat_llon = data_xarray.sel(lat=lat_search, lon=lon_search, method="pad")
ulat_ulon = data_xarray.sel(lat=lat_search, lon=lon_search, method="backfill")
ulat_llon = data_xarray.sel(lat=lat_search, method="backfill").sel(
    lon=lon_search, method="pad"
)
llat_ulon = data_xarray.sel(lat=lat_search, method="pad").sel(
    lon=lon_search, method="backfill"
)

# Combine the four values providing them in the correct order
ds_grid = [[llat_llon, ulat_llon], [llat_ulon, ulat_ulon]]
neighbours = xr.combine_nested(ds_grid, concat_dim=("lon", "lat"))

# Alternatively, combine them automatically
neighbours = xr.combine_by_coords(
    [
        x.to_dataset(name="foo").expand_dims(["lat", "lon"])
        for x in [llat_llon, ulat_llon, llat_ulon, ulat_ulon]
    ]
)

# Compute the maximum value
neighbours.max()
I admit that selecting the four values manually and recombining them is not very elegant (particularly if you would like to scale that to more than two dimensions).
I don't see a general way to retrieve both neighbours at the same time using sel.
If you have a regularly spaced grid of coordinates, you can select all neighbours at the same time by passing a slice to sel:
delta_lat = 10
delta_lon = 10
neighbours = data_xarray.sel(
    lat=slice(lat_search - delta_lat, lat_search + delta_lat),
    lon=slice(lon_search - delta_lon, lon_search + delta_lon),
)
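As before, the maximum over the selected window is then just neighbours.max(). Wrapped up as a small helper (a sketch assuming the regular grid described above; the function name and the delta defaults are made up for illustration):

def local_max(da, lat_search, lon_search, delta_lat=10, delta_lon=10):
    # Window max around (lat_search, lon_search) on a regular grid
    window = da.sel(
        lat=slice(lat_search - delta_lat, lat_search + delta_lat),
        lon=slice(lon_search - delta_lon, lon_search + delta_lon),
    )
    return window.max()

# local_max(data_xarray, 5.5, 52)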

Which programming structure for clustering algorithm

I am trying to implement the following (divisive) clustering algorithm (a short form of the algorithm is presented below; the full description is available here):
Start with a sample x_i, i = 1, ..., n, regarded as a single cluster of n data points, and a dissimilarity matrix D defined for all pairs of points. Fix a threshold T for deciding whether or not to split a cluster.
First determine the distance between all pairs of data points and choose the pair with the largest distance (Dmax) between them.
Compare Dmax to T. If Dmax > T, divide the single cluster in two by using the selected pair as the first elements of two new clusters. The remaining n - 2 data points are put into one of the two new clusters: x_l is added to the new cluster containing x_i if D(x_i, x_l) < D(x_j, x_l); otherwise it is added to the new cluster containing x_j.
At the second stage, the values D(x_i, x_j) are found within one of the two new clusters to find the pair in the cluster with the largest distance Dmax between them. If Dmax < T, the division of the cluster stops and the other cluster is considered. The procedure then repeats on the clusters generated from this iteration.
The output is a hierarchy of clustered data records. I kindly ask for advice on how to implement this clustering algorithm.
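As a starting point, the split step described above could look roughly like the following sketch. It assumes a precomputed dissimilarity dict D keyed by index pairs (i, j) with i < j, as built in the snippets further down; the helper names are made up:

def split_cluster(cluster, D):
    # Symmetric lookup into the upper-triangle distance dict
    def d(a, b):
        return D[(a, b)] if a < b else D[(b, a)]
    # Seed pair with the largest distance (Dmax)
    x_i, x_j = max(((a, b) for a in cluster for b in cluster if a < b),
                   key=lambda pair: d(*pair))
    c1, c2 = [x_i], [x_j]
    # Assign every remaining point to the nearer seed
    for x_l in cluster:
        if x_l in (x_i, x_j):
            continue
        (c1 if d(x_i, x_l) < d(x_j, x_l) else c2).append(x_l)
    return c1, c2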
EDIT 1: I attach a Python function which defines the distance (correlation coefficient) and a snippet which finds the maximal distance in the data matrix.
# Read data from GitHub
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/nico/collectiveintelligence-book/master/blogdata.txt', sep = '\t', index_col = 0)
data = df.values.tolist()
data = data[1:10]
# Define correlation coefficient as distance of choice
from math import sqrt

def pearson(v1, v2):
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
    # Calculate r (Pearson score)
    num = pSum - (sum1 * sum2 / len(v1))
    den = sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
    if den == 0: return 0
    return num / den
# Find largest distance
dist = {}
max_dist = pearson(data[0], data[0])
# Loop over upper triangle of data matrix
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        # Compute distance for each pair
        dist_curr = pearson(data[i], data[j])
        # Store distance in dict
        dist[(i, j)] = dist_curr
        # Store max distance
        if dist_curr > max_dist:
            max_dist = dist_curr
EDIT 2: Pasted below are functions from Dschoni's answer.
import numpy

# Euclidean distance
def euclidean(x, y):
    x = numpy.array(x)
    y = numpy.array(y)
    return numpy.sqrt(numpy.sum((x - y)**2))

# Create matrix
def dist_mat(data):
    dist = {}
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            dist[(i, j)] = euclidean(data[i], data[j])
    return dist

# Returns i & k for max distance
def my_max(dict):
    return max(dict)

# Sort function
list1 = []
list2 = []
def sort(rcd, i, k):
    list1.append(i)
    list2.append(k)
    for j in range(len(rcd)):
        if euclidean(rcd[j], rcd[i]) < euclidean(rcd[j], rcd[k]):
            list1.append(j)
        else:
            list2.append(j)
EDIT 3:
When I run the code provided by Dschoni, the algorithm works as expected. Then I modified the create_distance_list function so we can compute the distance between multivariate data points. I use the Euclidean distance. For a toy example I load the iris data and cluster only the first 50 instances of the dataset.
import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header = None, sep = ',')
df = df.drop(4, 1)
df = df[1:50]
data = df.values.tolist()
idl=range(len(data))
dist = create_distance_list(data)
print sort(dist, idl)
The result is as follows:
[[24], [17], [4], [7], [40], [13], [14], [15], [26, 27, 38], [3, 16, 39],
 [25], [42], [18, 20, 45], [43], [1, 2, 11, 46], [12, 37, 41], [5], [21],
 [22], [10, 23, 28, 29], [6, 34, 48], [0, 8, 33, 36, 44], [31], [32],
 [19], [30], [35], [9, 47]]
Some data points are still clustered together. I worked around this by adding a small amount of noise to the actual dictionary in the sort function:
# Add small random noise
for key in actual:
    actual[key] += np.random.normal(0, 0.005)
Any idea how to solve this problem properly?
A properly working example for the Euclidean distance:
import numpy as np
# For random number generation

def create_distance_list(l):
    '''Create a distance list for every
    unique tuple of pairs'''
    dist = {}
    for i in range(len(l)):
        for k in range(i + 1, len(l)):
            dist[(i, k)] = abs(l[i] - l[k])
    return dist

def maximum(distance_dict):
    '''Returns the key of the maximum value if unique
    or a random key with the maximum value.'''
    maximum = max(distance_dict.values())
    max_key = [key for key, value in distance_dict.items() if value == maximum]
    if len(max_key) > 1:
        random_key = np.random.random_integers(0, len(max_key) - 1)
        return (max_key[random_key],)
    else:
        return max_key

def construct_new_dict(distance_dict, index_list):
    '''Helper function to create a distance map for a subset
    of data points.'''
    new = {}
    for i in range(len(index_list)):
        for k in range(i + 1, len(index_list)):
            m = index_list[i]
            n = index_list[k]
            new[(m, n)] = distance_dict[(m, n)]
    return new

def sort(distance_dict, idl, threshold=4):
    result = [idl]
    i = 0
    try:
        while True:
            if len(result[i]) >= 2:
                actual = construct_new_dict(dist, result[i])
                act_max = maximum(actual)
                if distance_dict[act_max[0]] > threshold:
                    j = act_max[0][0]
                    k = act_max[0][1]
                    result[i].remove(j)
                    result[i].remove(k)
                    l1 = [j]
                    l2 = [k]
                    for iterr in range(len(result[i])):
                        s = result[i][iterr]
                        if s > j:
                            c1 = (j, s)
                        else:
                            c1 = (s, j)
                        if s > k:
                            c2 = (k, s)
                        else:
                            c2 = (s, k)
                        if actual[c1] < actual[c2]:
                            l1.append(s)
                        else:
                            l2.append(s)
                    result.remove(result[i])
                    # What to do if distance is equal?
                    l1.sort()
                    l2.sort()
                    result.append(l1)
                    result.append(l2)
                else:
                    i += 1
            else:
                i += 1
    except:
        return result

# This is the dataset
a = [1, 2, 2.5, 5]
# Giving each entry a unique ID
idl = range(len(a))
dist = create_distance_list(a)
print sort(dist, idl)
I wrote the code for readability; there is a lot of stuff that can be made faster, more reliable and prettier. This is just to give you an idea of how it can be done.
Some data points are still clustered together. I solve this problem by
adding small amount of data noise to actual dictionary in the sort
function.
If Dmax > T then divide single cluster in two
Your description doesn't necessarily create n clusters.
If a cluster has two records which have a distance less than T,
they will be clustered together (am I missing something?)

python: adjust imported coordinates

I have a Python script where I import coordinates of triangular elements and element definitions from two separate text files. I first define unique nodes for each triangular element and then define a new rectangular element in between the triangular elements (this Python script writes an input file for an FEM calculation).
The coordinate file looks like the following:
id,x,y,
1, 0, 0
2, 0, 1
3, 0, 2
4, 1, 0
5, 1, 1
6, 1, 2
7, 2, 0
8, 2, 1
9, 2, 2
The element file looks like this:
id, n1, n2, n3
1, 1, 2, 4
2, 1, 2, 5
3, 2, 3, 5
4, 3, 5, 6
5, 5, 6, 8
6, 6, 8, 9
7, 5, 7, 8
8, 4, 5, 7
What I want to happen is that the coordinates of the nodes of each triangular element are moved towards the centre of mass of that triangular element; this way the rectangular element in between the triangular elements gets a physical thickness.
However, I do something wrong in my Python script (see below).
The part of the script between the horizontal lines should change the coordinates.
But in my script extra coordinates are added, instead of keeping the (for this example) 9 initial coordinates.
And the coordinates are also not moved in the correct direction.
Why does this happen and how can I solve this?
#!/usr/bin/env python

# Read in coordinates
open("D://Documents//SkyDrive//afstuderen//99 EEM - Abaqus 6.11.2//scripting//_COORDINATEN.txt", "r")
import csv
import itertools

with open("_COORDINATEN.txt") as file:
    data = csv.reader(file)
    next(data)
    coords = []
    coords = ([[float(x) for x in line[1:]] for line in data])

# Read in elements
open("D://Documents//SkyDrive//afstuderen//99 EEM - Abaqus 6.11.2//scripting//_ELEMENTEN.txt", "r")
import csv
import itertools

with open("_ELEMENTEN.txt") as file:
    data2 = csv.reader(file)
    next(data2)
    elems = []
    elems = ([[int(x) - 1 for x in line[1:]] for line in data2])

# Flip the original elements if required
for i, elem in enumerate(elems):
    ecoords = [coords[e] for e in elem]
    a = [x2 - x1 for x1, x2 in zip(ecoords[0], ecoords[1])]
    b = [x2 - x1 for x1, x2 in zip(ecoords[1], ecoords[2])]
    n = a[0] * b[1] - a[1] * b[0]
    if n < 0:
        elems[i] = [elem[0], elem[2], elem[1]]

# Process elements
newcoords = []
newelems = []
for elem in elems:
    ecoords = [coords[e] for e in elem]
    newelem = range(len(newcoords), len(newcoords) + len(ecoords))
    newcoords += ecoords
    newelems.append(newelem)

cohelems = []
for e, elem in enumerate(elems):
    for edge in [[0, 1], [1, 2], [2, 0]]:
        eedge = [elem[i] for i in edge]
        for e2, elem2 in enumerate(elems[e + 1:]):
            e2 += e + 1
            for edge2 in [[0, 1], [1, 2], [2, 0]]:
                eedge2 = [elem2[i] for i in edge2]
                if all([i in eedge2 for i in eedge]):
                    newedge = [newelems[e][i] for i in edge]
                    newedge += [newelems[e2][i] for i in edge2]
                    cohelems.append(newedge[-1::-1])

#---------------------------------------------------------------------
def add_vectors(*points):
    new_x = 0.0
    new_y = 0.0
    for point in points:
        new_x += point[0]
        new_y += point[1]
    return [new_x, new_y]

def subtract_vectors(a, b):
    new_x = a[0] - b[0]
    new_y = a[1] - b[1]
    return [new_x, new_y]

def mul_by_scalar(vector, scalar):
    new_x = vector[0] * scalar
    new_y = vector[1] * scalar
    return [new_x, new_y]

new_triangles = []
for elem in elems:
    new_triangles += [coords[e] for e in elem]
print 'new_triangles =', new_triangles

CM = mul_by_scalar(add_vectors(*new_triangles), 1.0/3)
point_to_CM_vectors = []
for point in new_triangles:
    point_to_CM_vectors.append(subtract_vectors(CM, point))

new_triangle2 = []
#for e,elem in enumerate(elems):
for elem in elems:
    for point, motion in zip(new_triangles, point_to_CM_vectors):
        new_triangle2.append(add_vectors(point, mul_by_scalar(motion, 0.01)))
#        new_triangle2 += [add_vectors(point, mul_by_scalar(motion, 0.01))]
print 'new_triangle2 =', new_triangle2
#---------------------------------------------------------------------
Thank you all in advance for the help!
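For comparison with the section between the horizontal lines, here is a minimal sketch (not from the original thread) of what a per-element shift towards each triangle's own centre of mass could look like. It assumes coords and elems as read above; the factor argument (the fraction of the node-to-centroid vector) is made up for illustration:

def shrink_triangles(coords, elems, factor=0.01):
    new_coords = []
    new_elems = []
    for elem in elems:
        ecoords = [coords[e] for e in elem]
        # Centre of mass of this triangle only
        cx = sum(p[0] for p in ecoords) / 3.0
        cy = sum(p[1] for p in ecoords) / 3.0
        new_elem = []
        for x, y in ecoords:
            new_elem.append(len(new_coords))
            new_coords.append([x + factor * (cx - x),
                               y + factor * (cy - y)])
        new_elems.append(new_elem)
    return new_coords, new_elems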
