Hello, I need help creating a genetic algorithm that converges to a maximum or minimum value.
I wrote a program to find the sentence with the maximum ASCII sum, but my code does not converge to the maximum; the value oscillates ("yo-yos"), like in this picture:
[matplotlib output]
Here is my code:
import random
import statistics

EVOLUTION = []

words = [
    ["Un", "Des", "Une", "On", "Elle"],
    ["a", "eu", "avait", "est", "était", "fut"],
    ["soif", "rouge"]
]

def individual(data):
    return tuple(random.choice(range(len(feature))) for feature in data)

def population(data, initial=100):
    return [individual(data) for i in range(initial)]

def fitness(individual, data):
    chaine = sentence(individual, words)
    somme = 0
    for caractere in chaine:
        somme = somme + ord(caractere)
    print(chaine)
    print(somme)
    EVOLUTION.append(somme)
    return somme
    #return sum(data[i][individual[i]] for i in range(len(individual)))

def grade(population, data):
    fit = [fitness(ind, data) for ind in population]
    return statistics.mean(fit)

def mutate(ind, data):
    gene = random.randrange(0, len(ind))
    clone = list(ind)
    clone[gene] = random.randrange(0, len(data[gene]))
    #print(sentence(tuple(clone), words))
    return tuple(clone)

def cross(mother, father):
    return tuple(round(statistics.mean(genes)) for genes in zip(mother, father))

def sentence(individual, words):
    return ' '.join([words[i][individual[i]] for i in range(len(words))])

def evolve(population, data, retain=0.0, random_select=0.00, mutation_rate=0.00):
    def cmp_ind(ind):
        return fitness(ind, data)
    sorted_population = sorted(population, key=cmp_ind, reverse=True)
    len_retained = round(len(population) * retain)
    retained = sorted_population[:len_retained]
    random_selected = [
        ind
        for ind in sorted_population[len_retained:]
        if random.random() <= random_select
    ]
    mutated = [
        mutate(ind, data)
        for ind in sorted_population[len_retained:]
        if random.random() <= mutation_rate
    ]
    children = [
        cross(random.choice(sorted_population),
              random.choice(sorted_population))
        for i in range(len(population) - len(random_selected) - len(mutated))
    ]
    return random_selected + mutated + children

if __name__ == '__main__':
    data = [[len(w) for w in ws] for ws in words]
    initial_population = population(data, 30)
    next_population = initial_population
    max_iter = 3
    for i in range(max_iter):
        next_population = evolve(next_population, data)
    sorted_population = sorted(next_population, key=lambda x: fitness(x, data))
    best_individual = sorted_population[0]
    print("best solution :")
    chaine = sentence(best_individual, words)
    somme = 0
    for caractere in chaine:
        somme = somme + ord(caractere)
    print(chaine)
    print(somme)

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.plot(EVOLUTION)
    plt.savefig('myfig')
I want to find a higher solution with my fitness function.
Thanks in advance for your help.
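Note that with the defaults retain=0.0, random_select=0.0 and mutation_rate=0.0, evolve() rebuilds every generation purely from crossovers of uniformly chosen parents, and the elite it computes in retained is never included in the returned list, so there is no selection pressure and the logged fitness just oscillates. A possible rewrite of evolve() with actual selection pressure, as a sketch built on the helper functions above (the rates are illustrative assumptions, not tuned values):

def evolve(population, data, retain=0.2, random_select=0.05, mutation_rate=0.1):
    sorted_population = sorted(population, key=lambda ind: fitness(ind, data),
                               reverse=True)
    len_retained = round(len(population) * retain)
    # Elites survive unchanged into the next generation
    parents = sorted_population[:len_retained]
    # Occasionally keep a weaker individual for genetic diversity
    parents += [ind for ind in sorted_population[len_retained:]
                if random.random() <= random_select]
    # Mutate a few of the selected individuals
    parents = [mutate(ind, data) if random.random() <= mutation_rate else ind
               for ind in parents]
    # Fill the rest of the generation by crossing selected parents only
    children = [cross(random.choice(parents), random.choice(parents))
                for _ in range(len(population) - len(parents))]
    return parents + children

Appending to EVOLUTION once per generation (for example the best or mean fitness) instead of on every fitness() call would also make the convergence curve much easier to read.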
I'm having a hard time finding the BLEU score for my seq2seq model for the task of question generation. My questions are the following:
If I use sentence BLEU to find the score between each reference and the output, and then divide the total of these sentence-BLEU scores by the length of the test data, will it be the same as the corpus BLEU?
And the same as the corpus BLEU implemented in code such as the NLTK corpus BLEU?
import ntpath
import sys
import codecs
import os
import math
import operator
import functools

def fetch_data(cand, ref):
    references = []
    if '.eng' in ref:
        reference_file = codecs.open(ref, 'r', 'utf-8')
        references.append(reference_file.readlines())
    else:
        for root, dirs, files in os.walk(ref):
            for f in files:
                reference_file = codecs.open(os.path.join(root, f), 'r', 'utf-8')
                references.append(reference_file.readlines())
    candidate_file = codecs.open(cand, 'r', 'utf-8')
    candidate = candidate_file.readlines()
    return candidate, references

def count_ngram(candidate, references, n):
    clipped_count = 0
    count = 0
    r = 0
    c = 0
    for si in range(len(candidate)):
        # Calculate precision for each sentence
        ref_counts = []
        ref_lengths = []
        # Build dictionary of ngram counts
        for reference in references:
            ref_sentence = reference[si]
            ngram_d = {}
            words = ref_sentence.strip().split()
            ref_lengths.append(len(words))
            limits = len(words) - n + 1
            # loop through the sentence considering the ngram length
            for i in range(limits):
                ngram = ' '.join(words[i:i+n]).lower()
                if ngram in ngram_d.keys():
                    ngram_d[ngram] += 1
                else:
                    ngram_d[ngram] = 1
            ref_counts.append(ngram_d)
        # candidate
        cand_sentence = candidate[si]
        cand_dict = {}
        words = cand_sentence.strip().split()
        limits = len(words) - n + 1
        for i in range(0, limits):
            ngram = ' '.join(words[i:i + n]).lower()
            if ngram in cand_dict:
                cand_dict[ngram] += 1
            else:
                cand_dict[ngram] = 1
        clipped_count += clip_count(cand_dict, ref_counts)
        count += limits
        r += best_length_match(ref_lengths, len(words))
        c += len(words)
    if clipped_count == 0:
        pr = 0
    else:
        pr = float(clipped_count) / count
    bp = brevity_penalty(c, r)
    return pr, bp

def clip_count(cand_d, ref_ds):
    """Count the clip count for each ngram considering all references"""
    count = 0
    for m in cand_d.keys():
        m_w = cand_d[m]
        m_max = 0
        for ref in ref_ds:
            if m in ref:
                m_max = max(m_max, ref[m])
        m_w = min(m_w, m_max)
        count += m_w
    return count

def best_length_match(ref_l, cand_l):
    """Find the closest length of reference to that of candidate"""
    least_diff = abs(cand_l - ref_l[0])
    best = ref_l[0]
    for ref in ref_l:
        if abs(cand_l - ref) < least_diff:
            least_diff = abs(cand_l - ref)
            best = ref
    return best

def brevity_penalty(c, r):
    if c > r:
        bp = 1
    else:
        bp = math.exp(1 - (float(r) / c))
    return bp

def geometric_mean(precisions):
    return (functools.reduce(operator.mul, precisions)) ** (1.0 / len(precisions))

def BLEU(candidate, references):
    precisions = []
    for i in range(4):
        pr, bp = count_ngram(candidate, references, i + 1)
        precisions.append(pr)
    bleu = geometric_mean(precisions) * bp
    return bleu

if __name__ == "__main__":
    candidate, references = fetch_data(sys.argv[1], sys.argv[2])
    bleu = BLEU(candidate, references)
    print(bleu)
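For reference, the script reads the candidate file and the reference file (or a directory of reference files) from the command line, so a run might look like this (hypothetical file names): python bleu.py candidate.txt references/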
I'm not sure about the implementation you show, but for implementations strictly following the original paper, such as NLTK's, it would not be the same: https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py#L123.
Using sentence-BLEU basically means calling corpus-BLEU with a one-sentence corpus, but the other way around doesn't work. The scores should not be drastically different, but they do differ, because of macro-average vs. micro-average.
I used BLEU for Seq2Seq evaluation before, just used sentence-BLEU, and it worked just fine.
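To see the macro- vs. micro-average difference concretely, here is a minimal sketch with NLTK on made-up toy sentences (not the asker's data):

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],
        [['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
        ['a', 'cat', 'is', 'on', 'the', 'mat']]

# Macro-average: score each sentence separately, then average the scores
macro = sum(sentence_bleu(r, h) for r, h in zip(refs, hyps)) / len(hyps)

# Micro-average: pool n-gram counts over the whole corpus first
micro = corpus_bleu(refs, hyps)

print(macro, micro)  # the two differ (macro vs. micro averaging)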
I am doing DBSCAN clustering in Python. I want to achieve an adaptive way to return the number of clusters by self-calculating the eps and MinPts parameters. Below is my code.
import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

def loadDataSet(fileName, splitChar='\t'):
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet

def dist(a, b):
    # Euclidean distance on the first two coordinates only
    return math.sqrt(math.pow(a[0]-b[0], 2) + math.pow(a[1]-b[1], 2))

def returnDk(matrix, k):
    Dk = []
    for i in range(len(matrix)):
        Dk.append(matrix[i][k])
    return Dk

def returnDkAverage(Dk):
    sum = 0
    for i in range(len(Dk)):
        sum = sum + Dk[i]
    return sum / len(Dk)

def CalculateDistMatrix(dataset):
    DistMatrix = [[0 for j in range(len(dataset))] for i in range(len(dataset))]
    for i in range(len(dataset)):
        for j in range(len(dataset)):
            DistMatrix[i][j] = dist(dataset[i], dataset[j])
    return DistMatrix

def returnEpsCandidate(dataSet):
    DistMatrix = CalculateDistMatrix(dataSet)
    tmp_matrix = copy.deepcopy(DistMatrix)
    for i in range(len(tmp_matrix)):
        tmp_matrix[i].sort()
    EpsCandidate = []
    for k in range(1, len(dataSet)):
        Dk = returnDk(tmp_matrix, k)
        DkAverage = returnDkAverage(Dk)
        EpsCandidate.append(DkAverage)
    return EpsCandidate

def returnMinptsCandidate(DistMatrix, EpsCandidate):
    MinptsCandidate = []
    for k in range(len(EpsCandidate)):
        tmp_eps = EpsCandidate[k]
        tmp_count = 0
        for i in range(len(DistMatrix)):
            for j in range(len(DistMatrix[i])):
                if DistMatrix[i][j] <= tmp_eps:
                    tmp_count = tmp_count + 1
        # was len(dataSet), which relied on a global; same value, local name
        MinptsCandidate.append(tmp_count / len(DistMatrix))
    return MinptsCandidate

def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for i in range(len(EpsCandidate)):
        clustering = DBSCAN(eps=EpsCandidate[i], min_samples=MinptsCandidate[i]).fit(np_dataset)
        num_clustering = max(clustering.labels_)
        ClusterNumberList.append(num_clustering)
    return ClusterNumberList

if __name__ == '__main__':
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    dataSet = data.iloc[:, 0:13].values
    EpsCandidate = returnEpsCandidate(dataSet)
    DistMatrix = CalculateDistMatrix(dataSet)
    MinptsCandidate = returnMinptsCandidate(DistMatrix, EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet, EpsCandidate, MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)
However, the output with the loaded data set is all [-1]s. I am wondering where the mistake is. Is the general direction right? If not, how can I achieve adaptive DBSCAN clustering?
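Two things worth noting. First, dist() above only compares the first two coordinates, while the loaded MFCC data has 13 columns, so the eps candidates are computed in a different space than the one DBSCAN clusters on. Second, a more common way to self-select the parameters is the k-distance "elbow" heuristic; here is a minimal sketch of that approach (not the asker's averaging scheme; it assumes scikit-learn and a numeric array X):

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

def adaptive_dbscan(X, min_pts=4):
    # Distance from each point to its min_pts-th nearest neighbor
    nbrs = NearestNeighbors(n_neighbors=min_pts).fit(X)
    dists, _ = nbrs.kneighbors(X)
    k_dist = np.sort(dists[:, -1])
    # Crude elbow pick: index of maximum curvature via second differences
    elbow = np.argmax(np.diff(k_dist, 2)) + 1
    eps = k_dist[elbow]
    labels = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(X)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return eps, n_clusters, labels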
I use the nearest neighbors method to predict the price of a stock. I have the raw data in an example.txt file and use the close column (the price at the end of each one-minute period). Linear regression predicts well (shown in green), but the nearest neighbors method only works at the beginning and then turns into a straight line. Please tell me how to fix this. Here is the code I wrote:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

class Reader:
    def __init__(self, filename='example.txt'):
        self.filename = filename

    def read(self):
        try:
            with open(self.filename) as file:
                return file.read()
        except IOError:
            return "File not found"

class Regression:
    def __init__(self, window, P0, Ptest, i):
        self.window = window
        self.P0 = P0
        self.Ptest = Ptest
        self.i = i
        self.data_train = self.get_data_train()
        self.x_train = self.get_x_train()
        self.y_train = self.get_y_train()
        self.data_test = self.get_data_test()
        self.x_test = self.get_x_test()
        self.y_test = self.get_y_test()

    def get_data_train(self):
        """Return the training slice of the close prices."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        relative_price = result[:int(len(result) * self.P0)]
        return relative_price

    def get_data_test(self):
        """Return the i-th test slice of the close prices."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        len_x_test = int(len(result) * self.Ptest)
        len_x_train = int(len(result) * self.P0)
        relative_price = result[(len_x_train + (len_x_test * self.i)):
                                len_x_train + len_x_test * (self.i + 1)]
        return relative_price

    def get_x_train(self):
        x = []
        for i in range(len(self.data_train)):
            if i + self.window < len(self.data_train):
                x.append(self.data_train[i: i + self.window])
        return x

    def get_y_train(self):
        y = []
        for i in self.data_train[self.window:]:
            y += [i]
        return y

    def get_x_test(self):
        x = []
        for i in range(len(self.data_test)):
            if i + self.window < len(self.data_test):
                x.append(self.data_test[i: i + self.window])
        return x

    def get_y_test(self):
        y = []
        for i in self.data_test[self.window:]:
            y += [i]
        return y

class Linear_regression(Regression):
    def calculate(self):
        reg_linear = LinearRegression().fit(self.x_train, self.y_train)
        y_pred = reg_linear.predict(self.x_test)
        return y_pred

class Nearest_neighbor(Regression):
    def calculate(self):
        reg_neighbor = KNeighborsRegressor(n_neighbors=self.window, weights='distance')
        reg_neighbor.fit(self.x_train, self.y_train)
        y_pred = reg_neighbor.predict(self.x_test)
        return y_pred

window = 10
Pk = 1
P0 = 0.1
Ptest = 0.01
k = (Pk - P0) / Ptest
i = 0
y_real = []
y_neigh = []
y_lin = []
while i < k:
    lin_price = list(Linear_regression(window, P0, Ptest, i).calculate())
    neighbor = list(Nearest_neighbor(window, P0, Ptest, i).calculate())
    y_neigh.extend(neighbor)
    y_lin.extend(lin_price)
    y_real.extend(list(Linear_regression(window, P0, Ptest, i).y_test))
    i += 1

# Output the received data to graphs
fig, ax = plt.subplots()
ax.plot(y_real, label='Initial data')
ax.plot(y_neigh, label='Nearest Neighbor Data')
ax.plot(y_lin, label='Linear Regression Data')
ax.set_xlabel('Time (min)')
ax.set_ylabel('Price ($)')
ax.legend()
plt.show()
"Linear regression predicts well"
No, it never predicted well. You just looked at the graph and thought it looked kind of similar. But if you look more closely, your 'model' simply takes the price of a bit ago as the prediction of the price now. That means, it's not predicting anything! It's a history device, not a prediction device.
That's why if you feed back this sort of 'model' into itself you get a straight line: it always predicts the next price is going to be equal to the last one.
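A toy sketch of that point (made-up random-walk data, not the asker's file): a one-step "forecast" that just echoes the previous price hugs the chart, but feeding predictions back in for multi-step forecasting collapses into a flat line.

import numpy as np

prices = 100 + np.cumsum(np.random.randn(200))  # toy random-walk "prices"

# One-step-ahead "forecast": yesterday's price; looks accurate when
# plotted against prices[1:], but it is pure history
one_step = prices[:-1]

# Recursive multi-step forecast: each prediction feeds the next input,
# so every step just repeats the last observed price, a straight line
horizon = 20
recursive = np.full(horizon, prices[-1])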
After a few training iterations, some numbers get too many digits and overflow float precision.
I'm new to Machine Learning and Python (my college teacher recommended using Python), and this is my first Stack Overflow question.
I googled first, of course, but I didn't find anything that helped me.
This http://jrusev.github.io/post/hacking-mnist/ looks interesting, but I cannot compare it to my code because of my lack of experience. (I've always worked as a front-end developer.)
import pandas as pd
import numpy as np

# Global variables
outputDictionary = {'0': [1,0,0,0,0,0,0,0,0,0], '1': [0,1,0,0,0,0,0,0,0,0],
    '2': [0,0,1,0,0,0,0,0,0,0], '3': [0,0,0,1,0,0,0,0,0,0], '4': [0,0,0,0,1,0,0,0,0,0],
    '5': [0,0,0,0,0,1,0,0,0,0], '6': [0,0,0,0,0,0,1,0,0,0], '7': [0,0,0,0,0,0,0,1,0,0],
    '8': [0,0,0,0,0,0,0,0,1,0], '9': [0,0,0,0,0,0,0,0,0,1]}
learningRate = 0.2
middleLayerSize = 100
outputSize = 10
inputSize = 11
v = np.random.uniform(-1.00, 1.00, (inputSize, middleLayerSize))   # [inputSize, middleLayerSize]
w = np.random.uniform(-1.00, 1.00, (middleLayerSize, outputSize))  # [middleLayerSize, outputSize]
errors = []
inputCsv = pd.read_csv('a.csv')
inputData = []

# Functions
def prepareData():
    for row in inputCsv.itertuples(index=False):
        arrRow = list(row)
        for i in range(len(arrRow)):
            if i != 0:
                arrRow[i] = float(arrRow[i]) / 255
        inputData.append(arrRow)

def train(maxEpochs):
    global errors
    global inputData
    for epoch in range(maxEpochs):
        errorCount = 0
        print('Period ' + str(epoch))
        for row in inputData:
            expectedNumber = row.pop(0)
            expectedNumberObj = outputDictionary[str(expectedNumber)]
            zIn = calcZIn(row)
            zOutput = calcDelta(zIn, middleLayerSize)
            yIn = calcYIn(zOutput)
            yOutput = calcDelta(yIn, outputSize)
            validate = validateOutput(expectedNumberObj, yOutput)
            if validate == False:
                errorCount += 1
                propagateError(expectedNumberObj, row, yOutput, zOutput, zIn, yIn)
        errors.append(errorCount)
        print(errorCount)

def calcZIn(row):
    result = []
    for j in range(middleLayerSize):
        result.append(0)
        for i in range(inputSize):
            result[j] += v[i, j] * row[i]
    return result

def calcYIn(zOutput):
    result = []
    for j in range(outputSize):
        result.append(0)
        for i in range(middleLayerSize):
            result[j] += w[i, j] * zOutput[i]
    return result

def calcDelta(arr, arrSize):
    deltas = []
    for i in range(arrSize):
        deltas.append(activationFunction(arr[i]))
    return deltas

def activationFunction(x):
    return 1.0 / (1.0 + np.exp(-x))

def validateOutput(targetObj, yOutput):
    for i in range(len(yOutput)):
        if targetObj[i] != yOutput[i]:
            return False
    return True

def propagateError(expectedArr, row, yOutput, zOutput, zIn, yIn):
    errorY = calcError(expectedArr, yOutput, yIn, outputSize)
    errorW = calcWeightCorrection(errorY, zOutput, middleLayerSize, outputSize)
    sumError = sumDelta(errorY, w, middleLayerSize, outputSize)
    errorZ = calcError(sumError, zOutput, zIn, middleLayerSize)
    errorV = calcWeightCorrection(errorZ, row, inputSize, middleLayerSize)
    updateWeight(w, errorW, middleLayerSize, outputSize)
    updateWeight(v, errorV, inputSize, middleLayerSize)

def calcError(expectedArr, outputArr, inArr, size):
    error = []
    for i in range(size):
        error.append((expectedArr[i] - outputArr[i]) * inArr[i] * (1.0 - inArr[i]))
    return error

def calcWeightCorrection(error, output, length1, length2):
    delta = []
    for i in range(length1):
        delta.append([])
        for j in range(length2):
            delta[i].append(learningRate * error[j] * output[i])
    return delta

def sumDelta(error, weights, length1, length2):
    delta = []
    for i in range(length1):
        deltaValue = 0.0
        for j in range(length2):
            deltaValue += error[j] * weights[i][j]
        delta.append(deltaValue)
    return delta

def updateWeight(weight, delta, length1, length2):
    for i in range(length1):
        for j in range(length2):
            weight[i][j] += delta[i][j]

# Execution
prepareData()
train(1)
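The float overflow mentioned at the top is most likely np.exp(-x) in activationFunction overflowing once the weighted sums zIn/yIn grow large in magnitude (an assumption from reading the code, not a confirmed diagnosis). A minimal sketch of a numerically stable sigmoid that could stand in for it:

import numpy as np

def stable_sigmoid(x):
    # Split by sign so np.exp() only ever sees a non-positive argument
    # and therefore cannot overflow
    if x >= 0:
        return 1.0 / (1.0 + np.exp(-x))
    z = np.exp(x)
    return z / (1.0 + z)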
I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator

def entropy(data):
    entries = len(data)
    labels = {}
    for feat in data:
        label = feat[-1]
        if label not in labels.keys():
            labels[label] = 0
        labels[label] += 1
    entropy = 0.0
    for key in labels:
        probability = float(labels[key]) / entries
        entropy -= probability * log(probability, 2)
    return entropy

def split(data, axis, val):
    newData = []
    for feat in data:
        if feat[axis] == val:
            reducedFeat = feat[:axis]
            reducedFeat.extend(feat[axis+1:])
            newData.append(reducedFeat)
    return newData

def choose(data):
    features = len(data[0]) - 1
    baseEntropy = entropy(data)
    bestInfoGain = 0.0
    bestFeat = -1
    for i in range(features):
        featList = [ex[i] for ex in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            newData = split(data, i, value)
            probability = len(newData) / float(len(data))
            newEntropy += probability * entropy(newData)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeat = i
    return bestFeat

def majority(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # items() instead of Python 2's iteritems()
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def tree(data, labels):
    classList = [ex[-1] for ex in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(data[0]) == 1:
        return majority(classList)
    bestFeat = choose(data)
    bestFeatLabel = labels[bestFeat]
    theTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [ex[bestFeat] for ex in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        # a stray "/" after split in the original made this a syntax error
        theTree[bestFeatLabel][value] = tree(split(data, bestFeat, value), subLabels)
    return theTree
So what I did after this is the following:

infile = open("SData.csv", "r")
data = infile.read()
tree(data)

The error I got is "1 argument is missing", which is the labels argument I have to define, and this is where I don't know what to pass. I tried the variable for which I want to build the decision tree, but it doesn't work:

tree(data, MinTemp)

Here I get the error "MinTemp is not defined". Please help me out and let me know what I should do to get a look at the tree.
The following is part of the data; I want to generate a tree for MinTemp (a usage sketch follows the sample data below):
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak
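Since tree() treats the last field of each row as the class label, one way to apply it to this data is to parse the CSV into lists of strings and move the MinTemp column to the end; the second argument is then the list of remaining column names. A minimal sketch under that assumption:

import csv

with open("SData.csv") as infile:
    rows = list(csv.reader(infile))

header, data = rows[0], rows[1:]
target = header.index("MinTemp")
# Move the target column to the last position of every row
data = [row[:target] + row[target+1:] + [row[target]] for row in data]
# Feature names for the remaining columns, in order
labels = header[:target] + header[target+1:]

print(tree(data, labels))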