I am getting this error for the part marked in bold (between the ** markers) below:
Traceback (most recent call last):
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 238, in
solution, value, exec_time = tabu_search("five_d.txt")
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 175, in tabu_search
graph, max_weight = read_data(input_file_path)
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 64, in read_data
link.append(float(tmp[0]))
ValueError: could not convert string to float:
Process finished with exit code 1
Can you help, please?
import math
from random import randint
import time
from random import shuffle
#import numpy as np
### Data Format is dict:
# data[node_name] = gives you a list of link info
# data[link_index][0] = name of node that edge goes to
# data[link_index][1] = weight of that edge
def read_data(path):
linkset = []
links = {}
max_weight = 0
'''
with open(path, "r") as f:
for line in f:
print (line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp=line.strip().split(' ')
arr=np.array(tmp)
print(arr)
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])
link.append(int(tmp[0]))
link.append(int(tmp[1]))
link.append(int(tmp[2]))
linkset.append(link)
if int(tmp[2]) > max_weight:
max_weight = int(tmp[2])
'''
**with open(path,'r') as f:
for line in f:
#print(line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp = line.strip().split(' ')
#tmp = np.array()
print(tmp)
'''
for i in tmp:
link.append([i])
'''
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
#print(link)
'''
link.append(list(map(float,tmp[0])))
link.append(list(map(float,tmp[1])))
link.append(list(map(float,tmp[2])))
linkset.append(link)
'''
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])**
for link in linkset:
try:
linklist = links[str(link[0])]
linklist.append(link[1:])
links[str(link[0])] = linklist
except:
links[str(link[0])] = [link[1:]]
return links, max_weight
def getNeighbors(state):
# return hill_climbing(state)
return two_opt_swap(state)
def hill_climbing(state):
node = randint(1, len(state) - 1)
neighbors = []
for i in range(len(state)):
if i != node and i != 0:
tmp_state = state.copy()
tmp = tmp_state[i]
tmp_state[i] = tmp_state[node]
tmp_state[node] = tmp
neighbors.append(tmp_state)
return neighbors
def two_opt_swap(state):
global neighborhood_size
neighbors = []
for i in range(neighborhood_size):
node1 = 0
node2 = 0
while node1 == node2:
node1 = randint(1, len(state) - 1)
node2 = randint(1, len(state) - 1)
if node1 > node2:
swap = node1
node1 = node2
node2 = swap
tmp = state[node1:node2]
tmp_state = state[:node1] + tmp[::-1] + state[node2:]
neighbors.append(tmp_state)
return neighbors
def fitness(route, graph):
path_length = 0
for i in range(len(route)):
if (i + 1 != len(route)):
dist = weight_distance(route[i], route[i + 1], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
else:
dist = weight_distance(route[i], route[0], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
return path_length
# not used in this code, but some datasets have 2-or-more-dimensional data points; in that case it is usable
def euclidean_distance(city1, city2):
return math.sqrt((city1[0] - city2[0]) ** 2 + ((city1[1] - city2[1]) ** 2))
def weight_distance(city1, city2, graph):
global max_fitness
neighbors = graph[str(city1)]
for neighbor in neighbors:
if neighbor[0] == int(city2):
return neighbor[1]
return -1 # there can't be a negative distance, so -1 means the city was not found in the graph or there is no such edge
def tabu_search(input_file_path):
global max_fitness, start_node
graph, max_weight = read_data(input_file_path)
## Below, get the keys (node names) and shuffle them, and make start_node as start
s0 = list(graph.keys())
shuffle(s0)
if int(s0[0]) != start_node:
for i in range(len(s0)):
if int(s0[i]) == start_node:
swap = s0[0]
s0[0] = s0[i]
s0[i] = swap
break;
# max_fitness will act like infinite fitness
max_fitness = ((max_weight) * (len(s0))) + 1
sBest = s0
vBest = fitness(s0, graph)
bestCandidate = s0
tabuList = []
tabuList.append(s0)
stop = False
best_keep_turn = 0
start_time = time.time()
while not stop:
sNeighborhood = getNeighbors(bestCandidate)
bestCandidate = sNeighborhood[0]
for sCandidate in sNeighborhood:
if (sCandidate not in tabuList) and ((fitness(sCandidate, graph) < fitness(bestCandidate, graph))):
bestCandidate = sCandidate
if (fitness(bestCandidate, graph) < fitness(sBest, graph)):
sBest = bestCandidate
vBest = fitness(sBest, graph)
best_keep_turn = 0
tabuList.append(bestCandidate)
if (len(tabuList) > maxTabuSize):
tabuList.pop(0)
if best_keep_turn == stoppingTurn:
stop = True
best_keep_turn += 1
exec_time = time.time() - start_time
return sBest, vBest, exec_time
## Tabu Search Takes edge-list in a given format:
# nodefrom nodeto weight
# 0 1 5
# 3 2 4
# 1 0 3
# Undirected edges should be written twice, once for each node.
# maxTabuSize = 10000
maxTabuSize = 500
neighborhood_size = 500
stoppingTurn = 500
max_fitness = 0
start_node = 0
# solution, value, exec_time = tabu_search("test.txt")
solution, value, exec_time = tabu_search("five_d.txt")
print(solution)
print(value)
print(exec_time)
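For what it's worth, that ValueError usually means float() received an empty string: splitting on a single space with line.strip().split(' ') produces empty tokens whenever a line contains consecutive spaces, and a blank line yields [''] as well. A minimal, more defensive sketch of read_data (assuming the intended "node_from node_to weight" edge-list format) could look like this:
def read_data(path):
    # Same output shape as the original (links dict keyed by node name, plus
    # max_weight), but tolerant of blank lines and runs of whitespace:
    # split() with no argument never produces empty tokens, unlike split(' ').
    linkset = []
    links = {}
    max_weight = 0.0
    with open(path, 'r') as f:
        for line in f:
            tmp = line.split()
            if len(tmp) < 3:           # skip blank or malformed lines
                continue
            src = int(float(tmp[0]))   # node ids are compared as ints elsewhere in the code
            dst = int(float(tmp[1]))
            weight = float(tmp[2])
            linkset.append([src, dst, weight])
            if weight > max_weight:
                max_weight = weight
    for link in linkset:
        links.setdefault(str(link[0]), []).append(link[1:])
    return links, max_weight
If five_d.txt is actually a distance matrix rather than a three-column edge list, the file format itself is the problem, so printing the offending tmp (as the code already does) is the quickest way to confirm which case applies.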
I am trying to implement the Flajolet-Martin algorithm. I have a dataset with over 6000 records, but the output of the following code is 4096. Please help me understand the mistake I am making.
import xxhash
import math
def return_trailing_zeroes(s):
    s = str(s)
    rev = s[::-1]
    count = 0
    for i in rev:
        if i == '0':
            count = count + 1
        else:
            break
    return count

def gethash(line):
    num = abs(xxhash.xxh32(line).intdigest())
    return num

fp = open("/content/drive/MyDrive/Data.txt", "r")
h_max = 0
for line in fp:
    hash_value_1 = gethash(line)
    binary_1 = format(hash_value_1, '032b')
    t1 = return_trailing_zeroes(binary_1)
    if t1 > h_max:
        h_max = t1
fp.close()
print(2**h_max)
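Note that 4096 is 2**12, and a single Flajolet-Martin estimator can only ever output a power of two, so an estimate this coarse is expected rather than a bug. The usual remedy is to run many estimators with differently seeded hashes and combine them (mean within groups, median across groups). A rough sketch, assuming xxhash's seed parameter, might be:
import statistics
import xxhash

def trailing_zero_bits(n):
    # Count trailing zero bits of a 32-bit hash value.
    if n == 0:
        return 32
    count = 0
    while n & 1 == 0:
        count += 1
        n >>= 1
    return count

def fm_estimate(lines, num_hashes=64, group_size=8):
    maxima = [0] * num_hashes
    for line in lines:
        for seed in range(num_hashes):
            h = xxhash.xxh32(line, seed=seed).intdigest()
            maxima[seed] = max(maxima[seed], trailing_zero_bits(h))
    # Mean inside each group of estimators, median across the groups.
    estimates = [2 ** m for m in maxima]
    groups = [estimates[i:i + group_size] for i in range(0, num_hashes, group_size)]
    return statistics.median(sum(g) / len(g) for g in groups)

with open("/content/drive/MyDrive/Data.txt") as fp:
    print(fm_estimate(list(fp)))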
I also tried this implementation of the HyperLogLog algorithm, and the output of the following code is 2560.
import math
import statistics

import jhashcode
import mmh3
import xxhash
# (hash32 must be imported from whichever hashing library provides it)

def return_trailing_zeroes(s):
    s = str(s)
    rev = s[::-1]
    count = 0
    for i in rev:
        if i == '0':
            count = count + 1
        else:
            break
    return count

h1_m = 0
h2_m = 0
h3_m = 0
h4_m = 0
fp = open("/content/drive/MyDrive/Data.txt", "r")
for line in fp:
    hash_value_1 = abs(xxhash.xxh32(line).intdigest())
    hash_value_2 = abs(hash32(line))
    hash_value_3 = abs(jhashcode.hashcode(line))
    hash_value_4 = abs(mmh3.hash(line))
    binary_1 = format(hash_value_1, '032b')
    binary_2 = format(hash_value_2, '032b')
    binary_3 = format(hash_value_3, '032b')
    binary_4 = format(hash_value_4, '032b')
    t1 = return_trailing_zeroes(binary_1)
    t2 = return_trailing_zeroes(binary_2)
    t3 = return_trailing_zeroes(binary_3)
    t4 = return_trailing_zeroes(binary_4)
    if t1 > h1_m: h1_m = t1
    if t2 > h2_m: h2_m = t2
    if t3 > h3_m: h3_m = t3
    if t4 > h4_m: h4_m = t4
fp.close()
avg_hash12 = (2**(h1_m) + 2**(h2_m)) / float(2)
avg_hash34 = (2**(h3_m) + 2**(h4_m)) / float(2)
distinct_elements = math.ceil(statistics.median([avg_hash12, avg_hash34]))
print(distinct_elements)
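As a point of comparison, this is closer to Flajolet-Martin with averaging than to HyperLogLog: HLL uses a single hash per item, takes a few bits as a bucket index, stores per bucket the rank of the first 1-bit of the remaining bits (counted here from the low-order side, matching the trailing-zero counting above), and combines the buckets with a harmonic mean. A minimal sketch, without HLL's small- and large-range bias corrections, could be:
import xxhash

def hll_estimate(lines, b=10):
    # m = 2**b buckets; the low b bits pick the bucket, the remaining bits give the rank.
    m = 1 << b
    registers = [0] * m
    for line in lines:
        h = xxhash.xxh32(line).intdigest()
        bucket = h & (m - 1)
        rest = h >> b
        # rank = 1 + number of trailing zero bits of the remaining 32 - b bits
        rank = 1
        while rest & 1 == 0 and rank <= 32 - b:
            rank += 1
            rest >>= 1
        registers[bucket] = max(registers[bucket], rank)
    alpha = 0.7213 / (1 + 1.079 / m)          # standard constant for m >= 128
    harmonic = 1.0 / sum(2.0 ** -r for r in registers)
    return alpha * m * m * harmonic

with open("/content/drive/MyDrive/Data.txt") as fp:
    print(round(hll_estimate(fp)))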
import requests
from bs4 import BeautifulSoup
import csv
import time
def fin_car(url):
x = {}
y = []
page = ''
while page == '':
try:
page = requests.get(url)
except:
print("Connection refused by the server..")
print("Let me sleep for 5 seconds")
print("ZZzzzz...")
time.sleep(5)
print("Was a nice sleep, now let me continue...")
continue
#page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
Precio = []
price = soup.find('div' , 'price').text
Precio = (str(price).strip())
print (Precio)
#-------------------------------------------------------------------------------------------------------------#
# #Tipo Propiedad #Tipo de oferta #
#-------------------------------------------------------------------------------------------------------------#
Tipo_Propiedad = []
Tipo_de_oferta = []
T_1 = soup.find('div' , 'box').h1.text
text = (str(T_1).strip())
l = text.find(' ')
m = text.find(' ', l+1)
n = text.find(' ', m+1)
Tipo_Propiedad = text[0 : l]
Tipo_de_oferta = text[m+1 : n]
print (Tipo_Propiedad)
print (Tipo_de_oferta)
#-------------------------------------------------------------------------------------------------------------#
# #Departamento #Ciudad #Zona #Barrio #
#-------------------------------------------------------------------------------------------------------------#
Departamento = []
Ciudad = []
Zona = []
Barrio = []
first = soup.find('div' , 'breadcrumb left')
link = first.find('div')
a_link = link.findAll('a')
box1 = []
for row in a_link:
box1.append(row.text)
Departamento = (box1[1:2].pop())
Ciudad = (box1[2:3].pop())
Zona = (box1[3:4].pop())
Barrio = (box1[4:5])
print (Departamento)
print (Ciudad)
print (Zona)
print (Barrio)
#-------------------------------------------------------------------------------------------------------------#
# #Área #Habitaciones #Baños #Parqueaderos #
#-------------------------------------------------------------------------------------------------------------#
box_2 = soup.find('div' ,'features clearfix')
box_2_1 = box_2.findAll('span')
box2 = []
Área=[]
Habitaciones = []
Baños = []
Parqueaderos = []
for row2 in box_2_1:
box2.append(str(row2.text).strip())
for i in box_2_1:
a = box2[0:1].pop()
b = box2[1:2].pop()
c = box2[2:3].pop()
d = box2[3:4].pop()
a1 = a[0 : a.find(' ')]
Área = (a1)
Habitaciones = (b.rstrip()[-1])
Baños = (c.rstrip()[-1])
Parqueaderos =(d)
print (Área)
print (Habitaciones)
print (Baños)
print (Parqueaderos)
#-------------------------------------------------------------------------------------------------------------#
# #Área_Privada #Área_Const #Antigüedad #Admón #Estrato #Estado #Piso_No #
#-------------------------------------------------------------------------------------------------------------#
box_3 = soup.find('div' ,'row features_2 ')
box_3_1 = box_3.findAll('li') #
Área_Privada = []
Área_Const = []
Antigüedad = []
Admón = []
Estrato = []
Estado = []
Piso_No = []
for li in box_3_1:
heading_words = li.b.text.split() #
target_content = str(li.br.next_sibling).strip() #
if "privada:" in heading_words:
Área_Privada = (target_content) #
elif "Const.:" in heading_words:
Área_Const = (target_content)
elif "Antigüedad:" in heading_words:
Antigüedad = (target_content)
elif "Admón:" in heading_words:
Admón = (target_content)
elif "Estrato:" in heading_words:
Estrato = (target_content)
elif "Estado:" in heading_words:
Estado = (target_content)
elif "Piso" in heading_words:
Piso_No = (target_content)
print (Área_Privada) #
print (Área_Const)
print (Antigüedad)
print (Admón)
print (Estrato) #
print (Estado)
print (Piso_No[0:1])
#-------------------------------------------------------------------------------------------------------------#
# #Actualizado #Visitas #Código_FincaRaiz # #
#-------------------------------------------------------------------------------------------------------------#
box4 = soup.find('div' , 'box_content row')
box4_1 = box4.findAll('span')
vis = []
Actualizado = []
Visitas = []
Código_FincaRaiz = []
for i in box4_1:
vis.append((str(i.text).strip()))
for j in box4_1:
e = vis[0:1].pop()
f = vis[2:3].pop()
Actualizado = e
Código_FincaRaiz = f
url="https://www.fincaraiz.com.co/WebServices/Statistics.asmx/GetAdvertVisits?idAdvert={}&idASource=40&idType=1001".format(Código_FincaRaiz) #
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content , 'lxml')
visit1 = soup1.find('double').text
Visitas = (visit1)
print (Actualizado)
print (Visitas)
print (Código_FincaRaiz)
#-------------------------------------------------------------------------------------------------------------#
x['Código FincaRaiz'] = Código_FincaRaiz
x['Departamento'] = Departamento
x['Ciudad'] = Ciudad
x['Zona'] = Zona
x['Barrio'] = Barrio
x['Tipo Propiedad'] = Tipo_Propiedad
x['Tipo de oferta'] = Tipo_de_oferta
x['Precio'] = Precio
x['Área'] = Área
x['Área Privada'] = Área_Privada
x['Área Const.'] = Área_Const
x['Antigüedad'] = Antigüedad
x['Baños'] = Baños
x['Habitaciones'] = Habitaciones
x['Parqueaderos'] = Parqueaderos
x['Admón'] = Admón
x['Estrato'] = Estrato
x['Estado'] = Estado
x['Piso No.'] = Piso_No
x['Actualizado'] = Actualizado
x['Visitas'] = Visitas
y.append(x)
x = {}
y = []
filename = 'Fincar.csv'
with open(filename, 'w', newline='') as f:
w = csv.DictWriter(f,['Código FincaRaiz','Departamento','Ciudad','Zona','Barrio', 'Tipo Propiedad', 'Tipo de oferta',
'Precio' , 'Área' , 'Área Privada' , 'Área Const.', 'Antigüedad', 'Baños' , 'Habitaciones',
'Parqueaderos' , 'Admón', 'Estrato' , 'Estado' , 'Piso No.' , 'Actualizado', 'Visitas'])
w.writeheader()
for x in y:
w.writerow(x)
tab = []
xen = []
key_value = 'https://www.fincaraiz.com.co'
for i in range(2,6):
tab.append('https://www.fincaraiz.com.co/finca-raiz/?ad=30|{}||||1||||||||||||||||||||||1|||1||||||'.format(i))
for j in tab:
page = requests.get(j)
soup = BeautifulSoup(page.content , 'lxml')
index = soup.findAll('div' , 'span-title')
for i in index:
xen.append(i.find('a').get('href'))
for j in xen:
url = (key_value + j)
fin_car(url)
I've tried to fetch values from a list of pages and save them to a CSV file, but the CSV only stores the last value.
I have tried multiple approaches, but it always gives the same output.
Also, the columns with blank values need to be filled with a nil value, but it only writes the [] symbol.
I am new to Python and find it difficult to save the data. I need your support to achieve this task.
How should I proceed?
def fin_car(url):
x = {}
y = []
...
x = {}
y = []
These values are in different scopes. Assigning to x inside fin_car doesn't affect the x outside. You could change that using a global statement, but it is much better to return the value from the function.
Even if you were changing the outside values of x and y, you only call fin_car long after writing to the CSV. The order of events in your code matters.
I suggest:
def fin_car(url):
x = {}
...
return x
with open...:
w = csv.DictWriter(...)
...
for j in tab:
...
for j in xen:
url = ...
w.writerow(fin_car(url))
You don't need y at all.
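Concretely, the suggested restructuring could look something like the sketch below (keeping the existing field names; pre-filling the dict with None also avoids writing the [] placeholders for fields that were never found):
import csv

fieldnames = ['Código FincaRaiz', 'Departamento', 'Ciudad', 'Zona', 'Barrio',
              'Tipo Propiedad', 'Tipo de oferta', 'Precio', 'Área', 'Área Privada',
              'Área Const.', 'Antigüedad', 'Baños', 'Habitaciones', 'Parqueaderos',
              'Admón', 'Estrato', 'Estado', 'Piso No.', 'Actualizado', 'Visitas']

def fin_car(url):
    x = dict.fromkeys(fieldnames)      # every column starts as None, not []
    # ... scrape the page and fill the entries of x as before ...
    return x

with open('Fincar.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames)
    w.writeheader()
    for j in xen:                      # xen is built from the index pages as before
        w.writerow(fin_car(key_value + j))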
My Viterbi program runs in exponential time. Can you help me find the place I can change to make it a dynamic program? I only need to remember and use the two previous tags of the words.
Thanks a lot.
from collections import defaultdict
import sys
import re
import feature_maker as fm
bla = ''
all_states = set()
#distribution over all of the corpus
POS_probability = fm.load_obj('probas')
POS_probability['START'] = 1.0
def cpd_tagwords(words, tag):
pattern = re.compile("\W")# to check for .,: etc.
if pattern.match(words) and tag == words:
return 1
elif pattern.match(tag):
return 0
for word in emle.split("\n"):
if word.__contains__(words) and word.__contains__(tag):
return word[word.index(":") + 2:]
#if we don't have data about the word with the tag, just return the probability
#of the tag over all of the words in the corpus.
return POS_probability[tag]
def cpd_tags(early, prev, current):
lambda1 = 0
lambda3 = 0
lambda6 = 0
for word in qmle.split("\n"):
word1 = word.split()
if len(word1) > 0:
if word1[0].__contains__(current): #for tuple of 1
if len(word1) == 2:
lambda1 = word[word.index("]:") + 3:]
if len(word1) > 2 and word1[1].__contains__(prev): #for tuple of 2
if len(word1) == 3:
lambda3 = word[word.index("]:") + 3:]
if len(word1) > 3 and word1[2].__contains__(early): #for tuple of 3
if len(word1) == 4:
lambda6 = word[word.index("]:") + 3:]
return (0.6*float(lambda6)) + (0.3*float(lambda3)) + (0.1*float(lambda1))
#map: popular_copuler['POS'] = list of all pos that can come before it.
popular_copules = fm.load_obj('popular_copules')
# Viterbi Algo
def viterbi(sentence, tags1):
def findSet(index,tag):
if tag == 'ALL':
return tags1
if index in range(1, len(sentence) + 1):
possible_tags = set(popular_copules[tag])
if possible_tags == set([]):
return tags1
return set(popular_copules[tag])
elif index == 0 or index == -1:
return {'START'}
# stores (word:tag) in this whole sentence
sentence_with_tag = defaultdict(str)
# inner function to compute pi values--start
def pi_viterbi(k, u, v, sentence):#here is the start of the bad sequence
prob = defaultdict(float)
# initialization
if k == 0 and u == 'START' and v == 'START':
return (1., 'START')
else:
for w in findSet(k - 2,u):
prev = pi_viterbi(k - 1, w, u, sentence)[0]
# tuple((w,u,v))
q = cpd_tags(w, u, v)
e = cpd_tagwords(sentence[k - 1].lower(), v)
probability = float(prev) * q * float(e)
prob[tuple((w, u))] = probability
#here is the end of the bad sequence
max_tuple = max(prob.items(), key=lambda x: x[1])
# print (max_tuple[1],max_tuple[0][0])
return max_tuple[1], max_tuple[0][0]
# inner function to compute pi values--end
sentence_with_tag = list()
backpointer = defaultdict(str)
tags = defaultdict(str)
k = len(sentence)
u_glob = ''
v_glob = ''
glob = 0.
for i in range(1, k + 1):
prob = defaultdict(float)
#for current word we check all the tags
""" changed from for u in findSet(i - 1):"""
for u in findSet(i ,'ALL'):
#going backwards we call findset with u so it gives us only
# tags v that go together a lot with u (this is pruning)
""" changed from for v in findSet(i)"""
for v in findSet(i-1,u_glob):
#switched u and v
value, w = pi_viterbi(i, v, u, sentence)#the v recursion in the algorithm
prob[tuple((i, u, v))] = value
backpointer[tuple((i, u, v))] = w #bp from the algorithm
max_tuple = max(prob.items(), key=lambda x: x[1])
backpointer[tuple((i, max_tuple[0][1], max_tuple[0][-1]))] = max_tuple[0][1] # bp (k,u,v)= tag w
# sentence_with_tag.append(max_tuple[0][-1])
u_glob = max_tuple[0][-2]
v_glob = max_tuple[0][-1]
glob = max_tuple[1]
print ('Max', max_tuple)
tags[k - 1] = u_glob
tags[k] = v_glob
for i in range((k - 2), 0, -1):
tag = backpointer[tuple(((i + 2), tags[i + 1], tags[i + 2]))]
tags[i] = tag
tag_list = list()
for i in range(1, len(tags) + 1):
tag_list.append(tags[i])
file = open(sys.argv[4], 'w')
file.truncate()
for word in tag_list:
file.write(word)
# tag list as results
return tag_list
file=open(sys.argv[1],"r+")
fQ = open(sys.argv[2], 'r')
qmle = fQ.read()
fQ.close()
f = open("tags.txt",'r+')
tags = f.read()
f.close()
fe = open(sys.argv[3], 'r')
emle = fe.read()
distinct_tags = set()
# what is the list of all tags?
for word in tags.split():
distinct_tags.add(word)
sentence = []
sentence1 = []
sentence1 = file.read()
sentence = sentence1.split()
file.close()
file = open(sys.argv[4], 'w')
file.truncate()
viterbi(sentence, distinct_tags)
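For reference, the exponential blow-up in the code above comes from pi_viterbi calling itself recursively for every candidate triple, so the same (k, u, v) subproblems are recomputed over and over. The standard fix is to fill a table pi[k][(u, v)] iteratively from left to right, so each cell is computed exactly once from the cells at position k - 1; that keeps exactly the two previous tags, as required. A rough sketch that reuses the existing cpd_tags and cpd_tagwords (pruning with popular_copules is left out) might look like:
def viterbi_dp(sentence, tags1):
    # pi[k][(u, v)]: best probability of any tag sequence for words 1..k
    # ending in tags (u, v); bp[k][(u, v)]: the tag before u on that path.
    pi = [{('START', 'START'): 1.0}]
    bp = [{}]
    for k in range(1, len(sentence) + 1):
        pi.append({})
        bp.append({})
        word = sentence[k - 1].lower()
        u_candidates = tags1 if k > 1 else {'START'}
        for u in u_candidates:
            for v in tags1:
                best_prob, best_w = 0.0, None
                for (w, u_prev), prev in pi[k - 1].items():
                    if u_prev != u:
                        continue
                    p = float(prev) * float(cpd_tags(w, u, v)) * float(cpd_tagwords(word, v))
                    if p > best_prob:
                        best_prob, best_w = p, w
                if best_w is not None:
                    pi[k][(u, v)] = best_prob
                    bp[k][(u, v)] = best_w
    # pick the best final pair and backtrack
    (u, v), _ = max(pi[-1].items(), key=lambda item: item[1])
    tags_out = [u, v]
    for k in range(len(sentence), 2, -1):
        tags_out.insert(0, bp[k][(tags_out[0], tags_out[1])])
    return tags_out[-len(sentence):]
This fills O(n * |tags|^2) cells with O(|tags|) work each instead of recursing exponentially; the existing popular_copules pruning can then be layered back on by restricting the u and v loops.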
How can I reduce the time complexity?
# imports needed by this fragment
import datetime
import gzip
import multiprocessing as mp
from collections import defaultdict

output_rdpartition = mp.Queue()

def read_partition_zipfile(infile, stop_words, startline, endline):
    # endline = startline + 100
    chunk_user_d = defaultdict(lambda: defaultdict(list))
    chunk_user_withoutstamp_d = defaultdict(list)
    with gzip.open(in_file, "rb") as f:  # note: uses the global in_file, not the infile parameter
        for j, line in enumerate(f):
            if j >= startline and j < endline:
                if j % 10000 == 0: print "processed", j, "lines"
                line = line[:-1].split("|:|")
                time_stamp = int(line[0])
                user_id = line[-1]
                keywords = line[1].split(',')
                keywords = [item.lower() for item in keywords if len(item) >= 2]
                keywords = [item for item in keywords if item not in stop_words]
                # print 'user_id', user_id
                # print 'time_stamp', time_stamp
                # print 'keywords', keywords
                chunk_user_d[user_id][time_stamp] += keywords
                chunk_user_withoutstamp_d[user_id] += keywords
    # print chunk_user_withoutstamp_d, 'chunk_user_withoutstamp_d'
    # return chunk_user_d, chunk_user_withoutstamp_d
    output_rdpartition.put((chunk_user_d, chunk_user_withoutstamp_d))

def main():
    start_time = datetime.datetime.now()
    print("at the start of main")
    user_id = '1ss7fef4'
    lenth = 0
    tf_idf = defaultdict(int)
    key_dic = defaultdict(float)
    time_latest = 0
    processes_rd = [mp.Process(target=read_partition_zipfile,
                               args=(in_file, stop_words, p_index[j], p_index[j + 1]))
                    for j in range(0, 3)]
    for p in processes_rd:
        p.start()
    results_rd = [output_rdpartition.get() for p in processes_rd]
    # results_rd[0] is the chunkuser, results_rd[1] is the chunkuser_without stamp
    print results_rd

if __name__ == '__main__':
    stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    stop_words = stop_words.split(",")
    in_file = 'uniq.txt.gz'
    p_index = range(0, 28000000, 2800000)
    main()
It seems to be a queue issue: I can print inside the function, but I cannot get the function's output back.
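One thing worth checking: chunk_user_d is a defaultdict built with a lambda, and lambdas cannot be pickled, so the child process may fail on output_rdpartition.put() before anything reaches the parent; a module-level Queue also has to be drained carefully. A simpler pattern is to let a multiprocessing Pool return the results. A rough Python 3 sketch with the same parsing logic (and picklable plain dicts) might be:
import gzip
import multiprocessing as mp

def read_partition_zipfile(args):
    in_file, stop_words, startline, endline = args
    chunk_user_d = {}
    chunk_user_withoutstamp_d = {}
    with gzip.open(in_file, "rt") as f:          # "rt" yields str lines in Python 3
        for j, line in enumerate(f):
            if startline <= j < endline:
                fields = line.rstrip("\n").split("|:|")
                time_stamp = int(fields[0])
                user_id = fields[-1]
                keywords = [w.lower() for w in fields[1].split(',') if len(w) >= 2]
                keywords = [w for w in keywords if w not in stop_words]
                chunk_user_d.setdefault(user_id, {}).setdefault(time_stamp, []).extend(keywords)
                chunk_user_withoutstamp_d.setdefault(user_id, []).extend(keywords)
    # plain dicts pickle cleanly when returned to the parent
    return chunk_user_d, chunk_user_withoutstamp_d

if __name__ == '__main__':
    in_file = 'uniq.txt.gz'
    stop_words = set("a,able,about".split(","))   # use the full stop-word list from the original code
    p_index = range(0, 28000000, 2800000)
    tasks = [(in_file, stop_words, p_index[j], p_index[j + 1]) for j in range(3)]
    pool = mp.Pool(3)
    results_rd = pool.map(read_partition_zipfile, tasks)   # one tuple of dicts per partition
    pool.close()
    pool.join()
    print(results_rd[0][0])   # chunk_user_d from the first partition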