Pandas + Python: More efficient code

This is my code:
import pandas as pd
import os
import glob as g

archivos = g.glob('C:\Users\Desktop\*.csv')
for archiv in archivos:
    nombre = os.path.splitext(archiv)[0]
    df = pd.read_csv(archiv, sep=",")
    d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
    df['FECHA_LECTURA'] = d.dt.date
    del df['DATA_LEITURA']
    df['CONSUMO'] = ""
    df['DIAS'] = ""
    df["SUMDIAS"] = ""
    df["SUMCONS"] = ""
    df["CONSANUAL"] = ""
    ordenado = df.sort_values(['NR_CPE', 'FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)
    ## Group by CPE
    agrupado = ordenado.groupby('NR_CPE')
    for name, group in agrupado:  # Iterate over the groups
        indice = group.index.values
        inicio = indice[0]
        fin = indice[-1]
        # Fill the first reading of each CPE (because there is no previous reading)
        ordenado.CONSUMO.loc[inicio] = 0
        ordenado.DIAS.loc[inicio] = 0
        cont = 0
        for i in indice:  # Iterate over what is inside each group, i.e. the readings of each CPE
            if i > inicio and i <= fin:
                cont = cont + 1
                consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont-1]]
                dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont-1]]).days
                ordenado.CONSUMO.loc[i] = consumo
                ordenado.DIAS.loc[i] = dias
    # Compute the sums; the result is a DataFrame object
    dias = agrupado['DIAS'].sum()
    consu = agrupado['CONSUMO'].sum()
    canu = (consu / dias) * 365
    # Counters with the number of occurrences of groups A, B and C
    conta = 0
    contb = 0
    contc = 0
    # Since it is a DataFrame, I have to iterate over it to do the comparison
    print "Grupos:"
    for ind, sumdias in dias.iteritems():
        if sumdias <= 180:
            grupo = "A"
            conta = conta + 1
        elif sumdias > 180 and sumdias <= 365:
            grupo = "B"
            contb = contb + 1
        elif sumdias > 365:
            grupo = "C"
            contc = contc + 1
    print "grupo A: ", conta
    print "grupo B: ", contb
    print "grupo C: ", contc
    # Format the fields so that not all decimals are shown
    Fdias = dias.map('{:.0f}'.format)
    Fcanu = canu.map('{:.2f}'.format)
    frames = [Fdias, consu, Fcanu]
    concat = pd.concat(frames, axis=1).replace(['inf', 'nan'], [0, 0])
    with open('C:\Users\Documents\RPE_PORTUGAL\Datos.csv', 'a') as f:
        concat.to_csv(f, header=False, columns=['CPE', 'DIAS', 'CONSUMO', 'CONSUMO_ANUAL'])
    try:
        ordenado.to_excel(nombre + '.xls', columns=["NOME_DISTRITO",
            "NR_CPE", "MARCA_EQUIPAMENTO", "NR_EQUIPAMENTO", "VALOR_LEITURA", "REGISTADOR", "TIPO_REGISTADOR",
            "TIPO_DADOS_RECOLHIDOS", "FACTOR_MULTIPLICATIVO_FINAL", "NR_DIGITOS_INTEIRO", "UNIDADE_MEDIDA",
            "TIPO_LEITURA", "MOTIVO_LEITURA", "ESTADO_LEITURA", "HORA_LEITURA", "FECHA_LECTURA", "CONSUMO", "DIAS"],
            index=False)
        print (archiv)
        print ("===============================================")
        print ("*****Se ha creado el archivo correctamente*****")
        print ("===============================================")
    except IOError:
        print ("===================================================")
        print ("¡¡¡¡¡Hubo un error en la escritura del archivo!!!!!")
        print ("===================================================")
This takes a file where I have energy consumption readings from different dates for every light meter ('NR_CPE') and does some calculations:
Calculate the energy consumption for every 'NR_CPE' by subtracting each reading from the next one and put the result in a new column named 'CONSUMO'.
Calculate the number of days between readings and sum them up.
Add up the consumption for every 'NR_CPE' and calculate the annual consumption.
Finally, classify every light meter ('NR_CPE') by the number of days it has readings for: A if it has less than 180 days, B between 180 days and 1 year, and C more than a year.
And finally write the result to two different files.
Any idea of how I should re-code this to get the same output but faster?
Thank you all.
BTW this is my dataset:
,NOME_DISTRITO,NR_CPE,MARCA_EQUIPAMENTO,NR_EQUIPAMENTO,VALOR_LEITURA,REGISTADOR,TIPO_REGISTADOR,TIPO_DADOS_RECOLHIDOS,FACTOR_MULTIPLICATIVO_FINAL,NR_DIGITOS_INTEIRO,UNIDADE_MEDIDA,TIPO_LEITURA,MOTIVO_LEITURA,ESTADO_LEITURA,DATA_LEITURA,HORA_LEITURA
0,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,1,1,A,20150629,205600
1,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,2,2,A,20160218,123300
2,GUARDA,A002000642VJ,122,204534,25083,001,S,1,1,5,kWh,1,1,A,20150629,205700
3,GUARDA,A002000642VJ,122,204534,27536,001,S,1,1,5,kWh,2,2,A,20160218,123200
4,GUARDA,A002000642HR,101,1383899,11734,001,S,1,1,5,kWh,1,1,A,20150629,205600
5,GUARDA,A002000642HR,101,1383899,11800,001,S,1,1,5,kWh,2,2,A,20160218,123000
6,GUARDA,A002000995VM,101,97706436,12158,001,S,1,1,5,kWh,1,3,A,20150713,155300
7,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160129,162300
8,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160202,195800
9,GUARDA,A2000995VM,101,97706436,12163,001,S,1,1,5,kWh,1,3,A,20160404,145200
10,GUARDA,A002000996LV,168,5011703276,3567,001,V,1,1,6,kWh,1,1,A,20150528,205900
11,GUARDA,A02000996LV,168,5011703276,3697,001,V,1,1,6,kWh,2,2,A,20150929,163500
12,GUARDA,A02000996LV,168,5011703276,1287,002,P,1,1,6,kWh,1,1,A,20150528,205900

Generally you want to avoid for loops in pandas.
For example, the first loop, where you calculate total consumption and days, could be rewritten as a groupby apply, something like:
def last_minus_first(df):
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    diff = columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
    return diff

df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.datetime(1970,1,1)).dt.days  # create days column
df.groupby('NR_CPE').apply(last_minus_first)
(btw I don't understand why you are subtracting each entry from the previous, surely for meter readings this is the same as last-first?)
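(To see why: for successive readings r1, r2, r3, r4 the per-interval differences telescope, (r2 - r1) + (r3 - r2) + (r4 - r3) = r4 - r1, so summing the interval consumptions per meter gives exactly last minus first.)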
Then given the result of the above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:
pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()
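Putting the two ideas together, a minimal sketch of the whole per-meter pipeline might look like the following. The column names come from the dataset above, the 365-day annualisation and the A/B/C cut-offs mirror the original script, and the input file name 'input.csv' is just a placeholder:

import numpy as np
import pandas as pd

df = pd.read_csv('input.csv')  # placeholder for one of the glob'd CSV files
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df = df.sort_values(['NR_CPE', 'date', 'HORA_LEITURA'])
grouped = df.groupby('NR_CPE')

# total consumption and covered days per meter: last reading minus first reading
summary = pd.DataFrame({
    'CONSUMO': grouped['VALOR_LEITURA'].last() - grouped['VALOR_LEITURA'].first(),
    'DIAS': (grouped['date'].last() - grouped['date'].first()).dt.days,
})
summary['CONSUMO_ANUAL'] = summary['CONSUMO'] / summary['DIAS'] * 365

# classify each meter by the number of covered days and count each class
summary['GRUPO'] = pd.cut(summary['DIAS'], [-1, 180, 365, np.inf], labels=['A', 'B', 'C'])
print(summary['GRUPO'].value_counts())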

Related

More efficient way to manipulate large dataframe

It's my first real Python script, so feel free to make comments in order to improve my code.
The purpose of this script is to extract 2 Oracle tables with Python, store them in dataframes and then join them with pandas.
But for queries returning more than 500k rows it feels slow. Do you know why?
import pandas as pd
from datetime import date
from sqlalchemy import create_engine
import cx_Oracle, time
import config

## Timer variable
start = time.time()

## User input on the command line
year = input('Saisir une annee : ')
month = input('Saisir le mois, au fomat MM : ')
societe_var = input('SA (APPLE,PEACH,BANANA,ANANAS,ALL) : ')

## SAs and the BU lists corresponding to each SA
sa_list = ['APPLE','PEACH','BANANA','ANANAS']
bu_list_MERE = ['006111','1311402','1311403','1311404','1340115','13411106','1311407','1111','6115910','1166157','6811207','8311345','1111','1188100','8118101','8811102','8810113','8811104','8118105','8811106','8811107','8118108','1111']
bu_list_GARE = ['131400','310254']
bu_list_VOYA = ['0151100','1110073','1007115','1311335','1113340','1311341','1113342','1331143']
bu_list_RESO = ['1211345','13111345','11113395','73111345']

# Points to the right BU list depending on the SA that was entered
# (mapping assumed by position: the four lists above correspond to the four SAs in order)
bu_list_map = {
    'APPLE': bu_list_MERE,
    'PEACH': bu_list_GARE,
    'BANANA': bu_list_VOYA,
    'ANANAS': bu_list_RESO,
}

if societe_var == 'ALL':
    print('non codé pour le moment')
elif societe_var in sa_list:
    bu_list = bu_list_map.get(societe_var)
    sa_var = societe_var
    i = 1
    for bu in bu_list:
        start_bu = time.time()
        ## Load the SQL query with the right variables for gla_va_parametre -- EPOST
        query1 = open('gla_va_parametre - VAR.sql',"r").read()
        query1 = query1.replace('#ANNEE',"'" + year + "'").replace('%MOIS%',"'" + month + "'").replace('%SA%',"'" + societe_var + "'").replace('%BUGL%',"'" + bu + "'").replace('%DIVISION%','"C__en__PS_S1_D_OP_UNIT13".OPERATING_UNIT')
        ## Load the SQL query with the right variables for cle-gla_tva -- FPOST
        query2 = open('cle-gla_tva - VAR.sql',"r").read()
        query2 = query2.replace('#ANNEE',"'" + year + "'").replace('%MOIS%',"'" + month + "'").replace('%SA%',"'" + societe_var + "'").replace('%BUGL%',"'" + bu + "'").replace('%DIVISION%','OPERATING_UNIT')
        # Connection parameters
        connection_EPOST = cx_Oracle.connect(user=config.user_EPOST, password=config.password_EPOST, dsn=config.host_EPOST, )
        connection_FPOST = cx_Oracle.connect(user=config.user_FPOST, password=config.password_FPOST, dsn=config.host_FPOST, )
        ## Fetch the EPOST part
        with connection_EPOST:
            # Declare an empty list
            dfl = []
            # Declare an empty DataFrame
            dfs = pd.DataFrame()
            z = 1
            # Start Chunking
            for chunk in pd.read_sql(query1, con=connection_EPOST, chunksize=25000):
                # Start Appending Data Chunks from SQL Result set into List
                dfl.append(chunk)
                print('chunk num : ' + str(z))
                z = z + 1
            # Start appending data from list to dataframe
            dfs = pd.concat(dfl, ignore_index=True)
            print('param récupéré')
        ## Fetch the FPOST part
        with connection_FPOST:
            # Declare an empty list
            df2 = []
            # Declare an empty DataFrame
            dfs2 = pd.DataFrame()
            # Start Chunking
            for chunk in pd.read_sql(query2, con=connection_FPOST, chunksize=10000):
                # Start Appending Data Chunks from SQL Result set into List
                df2.append(chunk)
            # Start appending data from list to dataframe
            dfs2 = pd.concat(df2, ignore_index=True)
            print('clé récupéré')
        print('Début de la jointure')
        jointure = pd.merge(dfs,dfs2,how='left',left_on=['Code_BU_GL','Code_division','Code_ecriture','Date_comptable','Code_ligne_ecriture','UNPOST_SEQ'],right_on=['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE','UNPOST_SEQ']).drop(columns= ['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE'])
        jointure.to_csv('out\gla_va_'+year+month+"_"+societe_var+"_"+bu+"_"+date.today().strftime("%Y%m%d")+'.csv', index=False, sep='|')
        print('Fichier ' + str(i) + "/" + str(len(bu_list)) + ' généré en : '+ str(time.time() - start_bu)+' secondes')
        i = i + 1
    print("L'extraction du périmètre de la SA " + societe_var + " s'est effectué en :" + str((time.time() - start)/60) + " min" )

CNN in python. Array of images

I searched thoroughly for a solution online before posting here, but I couldn't find one. My problem arises while reading images for training a convolutional neural network. Basically, I decided to write a function that creates the input and output values from a series of images. I want to read all the images of the set, but not all at the same time to avoid running out of memory, so I created the following function:
# (assumes os, random, numpy as np, cv2 and matplotlib.pyplot as plt are imported earlier in the notebook)
def readImages(strSet = 'Train', nIni = 1, nFin = 20):
    if strSet not in ('Train','Test'):
        return None
    #
    # Initialise the output arrays: the images and the labels.
    arrImages = []
    arrLabels = []
    #
    # Go through every single directory inside the chosen set
    for strDir in os.listdir(data_dir+'/' + strSet + '/'):
        # Name of the class we are processing.
        strClass = strDir[strDir.find('-')+1:]
        # Number and names of the files, in case there are fewer than the requested number n.
        arrNameFiles = os.listdir(data_dir+'/' + strSet + '/'+strDir)
        nFiles = len(os.listdir(data_dir+'/' + strSet + '/'+strDir))
        #
        # Take the files from nIni to nFin. This way we make sure we take them all in each directory.
        #print('nImagesClase(',strSet,',',strClass,'):',nImagesClase(strSet, strClass))
        if (nIni == -1):
            # If the value is -1, take every image in the directory.
            listChosenFiles = arrNameFiles
            #print('Todos: ', len(listChosenFiles))
        else:
            if (nImagesClase(strSet, strClass) < nFin):
                # If we have already gone round all the files in the group, pick them at random.
                listChosenFiles = random.sample(arrNameFiles, min(nFiles, nFin-nIni))
                #print('Fin del directorio ',nFin,'>',nImagesClase(strSet,strClass),': ', len(listChosenFiles))
            else:
                # Otherwise, keep going.
                listChosenFiles = arrNameFiles[nIni-1:min(nFin,nImagesClase(strSet, strClass))-1]
                #print('Seguimos ',nIni,'-',nFin,': ', len(listChosenFiles))
        #
        for file in listChosenFiles:
            # Read the file.
            image = plt.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file)
            #print('Original Shape: ',image.shape)
            #plt.imshow(image)
            image = cv2.resize(image, (crop_width, crop_height), interpolation=cv2.INTER_NEAREST)
            #image = image.reshape((image_height,image_width,num_channels))
            #print('Al array de imágenes: ',image.shape)
            arrImages.append(image)
            # Add the label.
            arrLabel = np.zeros(n_classes)
            arrLabel[array_classes.index(strClass)] = 1
            arrLabels.append(arrLabel)
    #
    # Collect the input and output values in arrays.
    y = np.array(arrLabels)
    X = np.array(arrImages, dtype=np.uint8)
    # Once all the images have been processed, shuffle the indices so that images do not come in runs of the same class.
    arrIndexes = np.arange(X.shape[0])
    np.random.shuffle(arrIndexes)
    X = X[arrIndexes]
    y = y[arrIndexes]
    #
    return X, y
To test the behavior of this function I just execute the following line.
X, y = readImages(strSet = 'Train', nIni = 1, nFin = 5)
This works fine until nIni and nFin reach certain values (101-105, for example). At that point, I get the following error:
ValueError Traceback (most recent call last)
<ipython-input-125-8a690256a1fc> in <module>
----> 1 X, y = readImages(strSet = 'Train', nIni = 101, nFin = 105)
<ipython-input-123-9e9ebc660c33> in readImages(strSet, nIni, nFin)
50 # Recogemos los valores de entrada y salida en arrays.
51 y = np.array(arrLabels)
---> 52 X = np.array(arrImages, dtype=np.uint8)
53 # Una vez terminado el recorrido por todas las imágenes, reordenamos los índices para que no vayan las imágenes en secuendias de la misma clase.
54 arrIndexes = np.arange(X.shape[0])
ValueError: could not broadcast input array from shape (28,28,3) into shape (28,28)
I put some print traces in the image-reading code, and every image that is read has a shape of (28,28,3), so I don't really understand where the (28,28) shape pointed out in the error trace comes from.
Do you know what the problem could be? Have you faced this problem before?
Thanks in advance.
Some of your images have a single channel. Use cv2.imread instead of plt.imread:
image = cv2.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file)
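A minimal sketch of that change inside your loop: cv2.IMREAD_COLOR (the default flag) forces every file to be decoded as a 3-channel BGR array, so grayscale files no longer yield (28,28) arrays that break the np.array(arrImages, dtype=np.uint8) call:

image = cv2.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file, cv2.IMREAD_COLOR)  # always 3 channels
image = cv2.resize(image, (crop_width, crop_height), interpolation=cv2.INTER_NEAREST)
arrImages.append(image)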

Error: float() argument must be a string or a number, not 'dict_keys'

Hello, I always get the error "float() argument must be a string or a number, not 'dict_keys'" when running this code. I thought I had run it before without any problem, but it seems like it does not work anymore. Is that possible? And if so, what can I do to get it working again? My problem is in the very last part, where I want to graph the time the calculations took. But I guess maybe one of the earlier variables isn't defined correctly or something? I appreciate your help!
Thank you!
from sklearn.linear_model import LinearRegression
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt  # plotting library for lists and arrays
import time
import threading  # for running subthreads
import gc  # gc.collect frees memory

# Definition of rows and columns (top section)
np.random.seed(42)
n_rows = 100         # fixed number of rows
cols_min = 100       # initial number of columns
cols_max = 12_800    # maximum number of columns, to exit the loop
timeout = 90         # maximum seconds per iteration; can be adjusted for testing

# Functions we will call later
def mult_range(start, end):  # its purpose is to keep doubling the number of columns
    val = start
    while val <= end:
        yield val
        val *= 2

# gen_dataset generates a dataset, starting from the 100 columns fixed above;
# the point is to keep x and y separate so the plots can be made in one line later
def gen_dataset(n_rows, n_cols):
    y = np.random.rand(n_rows)          # creates the array and fills it with n_rows values (the n rows)
    x = np.random.rand(n_rows, n_cols)  # the matrix with n rows and m columns
    return x, y

# times the calculation
def timeit(timeout, n_cols, result, fn, *args, **kwargs):
    try:  # function that may raise an exception
        # link for info: https://uniwebsidad.com/libros/algoritmos-python/capitulo-12/excepciones
        t1 = datetime.now()  # current time
        fn(*args, **kwargs)
        t2 = datetime.now()  # new time
        delta = t2 - t1      # difference
        delta_microsecs = delta.seconds * 1_000_000 + delta.microseconds  # execution time
        if (delta_microsecs / 1_000_000) < (timeout + 500_000):  # condition for printing how long it took
            # print immediately so we know how long each size takes
            print(f"for {n_cols} columns it took {delta_microsecs / 1_000_000} seconds")
            # prints: for n columns it took delta_microsecs
            result[n_cols] = delta_microsecs
        #else:
        #    print(f"for {n_cols} columns it took {delta_microsecs / 1_000_000} seconds")
    except MemoryError:  # when the exception is raised
        # in case it uses too many PC resources
        print(f"data doesn't fit in memory for {n_cols} columns")

# regression with sklearn for x and y
def sklearn_reg(x, y):
    reg = LinearRegression()
    reg.fit(x, y)  # takes the already-defined x and y values

# regression with numpy for x and y
def np_reg(x, y):  # takes the already-defined x and y values
    # add a column of ones
    x_ = np.hstack((np.ones((x.shape[0], 1)), x))
    # estimator (x'x)^-1 * (x'y)
    return np.matmul(np.linalg.inv(np.matmul(x_.T, x_)), np.matmul(x_.T, y))

# regression with timeout seconds, starting at cols_min columns, doubling up to cols_max columns
def time_reg(timeout, cols_min, cols_max, reg_fn, square=False):  # square is used when x should be squared
    timing = {}
    threads = []
    for n_cols in mult_range(cols_min, cols_max):
        try:
            gc.collect()
            x, y = gen_dataset(n_rows, n_cols)
            if square:
                x = np.square(x)
            thread = threading.Thread(target=timeit, args=(timeout, n_cols, timing, reg_fn, x, y))
            thread.start()
            for i in range(timeout):
                time.sleep(1)
                if not thread.is_alive():
                    break
            else:
                print(f"for {n_cols} columns it took more than {timeout} seconds")
        except MemoryError:
            print(f"data doesn't fit in memory for {n_cols} columns")
    return timing

def plot_time(timing):
    fig = plt.figure(figsize=(10, 10))
    plt.plot(timing.keys(), timing.values())
    #plt.axis((0, 1_000_000, 0, 1_000_000))
    plt.xlabel('time (μs)')
    plt.ylabel('columns')
    plt.show()

plot_time(time_reg(timeout, cols_min, cols_max, sklearn_reg))
# generate the plot of the time taken per number of columns
plot_time(time_reg(timeout, cols_min, cols_max, np_reg))
# generate the plot of the time taken per number of columns
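The traceback points at plot_time: in recent Matplotlib versions, plt.plot tries to convert its arguments to float arrays, and a dict_keys view is not accepted the way a list is. A minimal sketch of the usual fix, assuming nothing else about the data, is to turn the dict views into plain lists before plotting (the sketch also puts the column count on the x axis and labels the axes to match what is actually plotted):

def plot_time(timing):
    # materialise the dict views as plain lists before handing them to Matplotlib
    cols = sorted(timing.keys())
    micros = [timing[c] for c in cols]
    plt.figure(figsize=(10, 10))
    plt.plot(cols, micros)
    plt.xlabel('columns')
    plt.ylabel('time (μs)')
    plt.show()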

What am I doing wrong here? Error "index out of range" when trying to fill a list with lists

I want to build a list with lists inside it, using a for loop, and I get "index out of range".
I tried with empleados.append() but it doesn't work.
def main():
    empleados=[]
    for i in range(1):
        empleados[i][0](input("Ingrese el Nombre: "))
        empleados[i][1](input("Ingrese el Apellido: "))
        empleados[i][2](int(input("Ingrese el Sueldo Base: ")))
        empleados[i][3](int(input("Ingrese el AFP 1 o 2: ")))
        empleados[i][4](datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))))
        empleados[i][5](int(input("Ingrese la cantidad de hijos que tiene: ")))
Welcome to SO!
There's no list at empleados[0] to insert new values into. I find something like this a little easier to read:
def main():
    empleados = []
    for i in range(1):
        empleado_nueva = []
        empleado_nueva.append(input("Ingrese el Nombre: "))
        empleado_nueva.append(input("Ingrese el Apellido: "))
        empleado_nueva.append(int(input("Ingrese el Sueldo Base: ")))
        empleado_nueva.append(int(input("Ingrese el AFP 1 o 2: ")))
        empleado_nueva.append(datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))))
        empleado_nueva.append(int(input("Ingrese la cantidad de hijos que tiene: ")))
        empleados.append(empleado_nueva)
    return empleados
It's worth mentioning that the index-access pattern you're attempting (empleados[i][0] = ...) only works if there's something already at that index, for instance:
>>> x = []
>>> x[0] = 1
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
IndexError: list assignment index out of range
>>> x = ['a', 'b', 'c']
>>> x[0] = 'd'
>>> x
['d', 'b', 'c']
So the appends are probably the best way to go.
The problem is that you're trying to use empleados[i] as a list with an existing index you can assign into, when at the moment it isn't one.
You need to set up your values as a separate list and then append it. E.g.
def main():
    empleados = []
    vars = [
        input("Ingrese el Nombre: "),
        input("Ingrese el Apellido: "),
        int(input("Ingrese el Sueldo Base: ")),
        int(input("Ingrese el AFP 1 o 2: ")),
        datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))),
        int(input("Ingrese la cantidad de hijos que tiene: ")),
    ]
    empleados.append(vars)
    return empleados
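Both versions assume from datetime import datetime has already been done at the top of the script. Calling the function then gives you one inner list per employee, roughly like this (the values shown are purely illustrative):

empleados = main()
print(empleados)
# e.g. [['Ana', 'Perez', 500000, 1, datetime.datetime(2000, 12, 31, 0, 0), 2]]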

Error when I try to iterate more than once

I've got this program which calculates k-means for AI
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from random import sample
from itertools import repeat
from math import sqrt

# Parameters
k = 6
maxit = 2

def leeValoracionesFiltradas(nomFichero = "valoracionesFiltradas.data"):
    lineas = [(l.strip()).split("\t") for l in (open(nomFichero).readlines())]
    diccio = {}
    for l in lineas:
        diccio[int(l[0])] = {}
    for l in lineas:
        diccio[int(l[0])][int(l[1])] = (float(l[2]), float(l[3]))
    return diccio

def distEuclidea(dic1, dic2):
    # Sum of squares of the elements common to the two dictionaries
    sum2 = sum([pow(dic1[elem]-dic2[elem], 2)
                for elem in dic1 if elem in dic2])
    return sqrt(sum2)

def similitudEuclidea(dic1, dic2):
    return 1/(1+distEuclidea(dic1, dic2))

def coefPearson(dic1, dic2):
    # Get the elements common to both dictionaries
    comunes = [x for x in dic1 if x in dic2]
    nComunes = float(len(comunes))
    # If there are no common elements -> zero
    if nComunes == 0:
        return 0
    # Compute the mean of each dictionary
    media1 = sum([dic1[x][1] for x in comunes]) / nComunes
    media2 = sum([dic2[x][1] for x in comunes]) / nComunes
    # Numerator and denominator
    num = sum([(dic1[x][1] - media1) * (dic2[x][1] - media2) for x in comunes])
    den1 = sqrt(sum([pow(dic1[x][1] - media1, 2) for x in comunes]))
    den2 = sqrt(sum([pow(dic2[x][1] - media2, 2) for x in comunes]))
    den = den1 * den2
    # Compute the coefficient
    if den == 0:
        return 0
    return num/den

# Given a dictionary {key1 : {key2 : value}}, compute the k-means clustering
# with k clusters, running maxit iterations, using the given similarity function.
# Returns a tuple:
# - {key1: cluster number} with the cluster assignments (which cluster each element belongs to)
# - [{key2: values}] a list with the k centroids (mean of the values for each cluster)
def kmeans(diccionario, k, maxit, similitud = coefPearson):
    # K random points are initially chosen as centroids
    # Each centroid is {key2 : value}
    centroides = [diccionario[x] for x in sample(diccionario.keys(), k)]
    # Each key1 is assigned a cluster number
    previo = None
    asignacion = {}
    # On each iteration points are assigned to centroids and new centroids are computed
    for it in range(maxit):
        # Assign points to the closest centroids
        for key1 in diccionario:
            similitudes = map(similitud, repeat(diccionario[key1], k), centroides)
            asignacion[key1] = similitudes.index(max(similitudes))
        # If the assignment did not change, stop
        if previo == asignacion: break
        previo = asignacion
        # Recompute the centroids (accumulate the values of each key for each centroid)
        valores = {x : {} for x in range(k)}
        contadores = {x : {} for x in range(k)}
        for key1 in diccionario:
            grupo = asignacion[key1]
            for key2 in diccionario[key1]:
                if not valores[grupo].has_key(key2):
                    valores[grupo][key2] = 0
                    contadores[grupo][key2] = 0
                valores[grupo][key2] += diccionario[key1][key2][1]
                contadores[grupo][key2] += 1
        # Compute the means (new centroids)
        centroides = []
        for grupo in valores:
            centro = {}
            for key2 in valores[grupo]:
                centro[key2] = round((valores[grupo][key2] / contadores[grupo][key2]), 2)
            centroides.append(centro)
        if None in centroides: break
    return (asignacion, centroides)

# Read the ratings dictionary (the ratings have already been filtered)
diccionario = leeValoracionesFiltradas()
# Get the assignments and the centroids using the Pearson correlation
tupla = kmeans(diccionario, k, maxit)
asignaciones = tupla[0]
centroids = tupla[1]
print asignaciones
print centroids
And when I execute this, for example with maxit = 2, it throws:
File "kmeans_dictio.py", line 46, in coefPearson
media2 = sum([dic2[x][1] for x in comunes]) / nComunes
TypeError: 'float' object has no attribute '__getitem__'
How can I fix this?
It looks like dic1 is a dictionary whose values are tuples of floats, while dic2 (a recalculated centroid) is a dictionary whose values are plain floats. You collect the keys common to both with this line:
comunes = [x for x in dic1 if x in dic2]
Then you try to index into one of those floats here:
media2 = sum([dic2[x][1] for x in comunes]) / nComunes
To fix this, look at how dic1 and dic2 are defined and make sure their values have the same shape.
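A minimal sketch of one possible fix, assuming you keep coefPearson as the similarity function (which only ever reads index [1] of each value): store the recomputed centroid values in the same tuple shape the input dictionary uses, so the second iteration no longer indexes into a bare float. Only the centroid-recalculation step changes:

# Compute the means (new centroids), keeping the (x, y) tuple shape
# so that coefPearson's dic[x][1] lookups keep working on later iterations
centroides = []
for grupo in valores:
    centro = {}
    for key2 in valores[grupo]:
        media = round(valores[grupo][key2] / contadores[grupo][key2], 2)
        centro[key2] = (0.0, media)  # first element is unused by coefPearson
    centroides.append(centro)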
