Pandas + Python: More efficient code
This is my code:
import pandas as pd
import os
import glob as g

archivos = g.glob('C:\Users\Desktop\*.csv')

for archiv in archivos:
    nombre = os.path.splitext(archiv)[0]
    df = pd.read_csv(archiv, sep=",")
    d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
    df['FECHA_LECTURA'] = d.dt.date
    del df['DATA_LEITURA']
    df['CONSUMO'] = ""
    df['DIAS'] = ""
    df["SUMDIAS"] = ""
    df["SUMCONS"] = ""
    df["CONSANUAL"] = ""
    ordenado = df.sort_values(['NR_CPE', 'FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)

    ## Group by CPE
    agrupado = ordenado.groupby('NR_CPE')
    for name, group in agrupado:  # Iterate over the groups
        indice = group.index.values
        inicio = indice[0]
        fin = indice[-1]
        # Fill the first reading of each CPE with zero (because there is no previous reading)
        ordenado.CONSUMO.loc[inicio] = 0
        ordenado.DIAS.loc[inicio] = 0
        cont = 0
        for i in indice:  # Iterate over the readings inside each CPE group
            if i > inicio and i <= fin:
                cont = cont + 1
                consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont - 1]]
                dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont - 1]]).days
                ordenado.CONSUMO.loc[i] = consumo
                ordenado.DIAS.loc[i] = dias

    # Compute the sums; the result is a DataFrame
    dias = agrupado['DIAS'].sum()
    consu = agrupado['CONSUMO'].sum()
    canu = (consu / dias) * 365

    # Counters with the number of occurrences of groups A, B and C
    conta = 0
    contb = 0
    contc = 0

    # Since these are DataFrames, iterate over them to make the comparison
    print "Grupos:"
    for ind, sumdias in dias.iteritems():
        if sumdias <= 180:
            grupo = "A"
            conta = conta + 1
        elif sumdias > 180 and sumdias <= 365:
            grupo = "B"
            contb = contb + 1
        elif sumdias > 365:
            grupo = "C"
            contc = contc + 1
    print "grupo A: ", conta
    print "grupo B: ", contb
    print "grupo C: ", contc

    # Format the fields so we do not show all the decimals
    Fdias = dias.map('{:.0f}'.format)
    Fcanu = canu.map('{:.2f}'.format)
    frames = [Fdias, consu, Fcanu]
    concat = pd.concat(frames, axis=1).replace(['inf', 'nan'], [0, 0])

    with open('C:\Users\Documents\RPE_PORTUGAL\Datos.csv', 'a') as f:
        concat.to_csv(f, header=False, columns=['CPE', 'DIAS', 'CONSUMO', 'CONSUMO_ANUAL'])

    try:
        ordenado.to_excel(nombre + '.xls', columns=["NOME_DISTRITO",
            "NR_CPE", "MARCA_EQUIPAMENTO", "NR_EQUIPAMENTO", "VALOR_LEITURA", "REGISTADOR", "TIPO_REGISTADOR",
            "TIPO_DADOS_RECOLHIDOS", "FACTOR_MULTIPLICATIVO_FINAL", "NR_DIGITOS_INTEIRO", "UNIDADE_MEDIDA",
            "TIPO_LEITURA", "MOTIVO_LEITURA", "ESTADO_LEITURA", "HORA_LEITURA", "FECHA_LECTURA", "CONSUMO", "DIAS"],
            index=False)
        print (archiv)
        print ("===============================================")
        print ("*****Se ha creado el archivo correctamente*****")
        print ("===============================================")
    except IOError:
        print ("===================================================")
        print ("¡¡¡¡¡Hubo un error en la escritura del archivo!!!!!")
        print ("===================================================")
This takes a file where I have readings of energy consumption from different dates for every light meter ('NR_CPE') and does some calculations:
Calculate the energy consumption for every 'NR_CPE' by subtracting the previous reading from the next one, and put the result in a new column named 'CONSUMO'.
Calculate the number of days where I've got a reading and sum up the number of days.
Add up the consumption for every 'NR_CPE' and calculate the annual consumption.
Finally I want to classify every light meter ('NR_CPE') by the number of days it has readings for: A if it has less than 180 days, B between 180 days and 1 year, and C more than a year.
And finally write this result to two different files.
Any idea of how I should re-code this to get the same output and run faster?
Thank you all.
BTW this is my dataset:
,NOME_DISTRITO,NR_CPE,MARCA_EQUIPAMENTO,NR_EQUIPAMENTO,VALOR_LEITURA,REGISTADOR,TIPO_REGISTADOR,TIPO_DADOS_RECOLHIDOS,FACTOR_MULTIPLICATIVO_FINAL,NR_DIGITOS_INTEIRO,UNIDADE_MEDIDA,TIPO_LEITURA,MOTIVO_LEITURA,ESTADO_LEITURA,DATA_LEITURA,HORA_LEITURA
0,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,1,1,A,20150629,205600
1,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,2,2,A,20160218,123300
2,GUARDA,A002000642VJ,122,204534,25083,001,S,1,1,5,kWh,1,1,A,20150629,205700
3,GUARDA,A002000642VJ,122,204534,27536,001,S,1,1,5,kWh,2,2,A,20160218,123200
4,GUARDA,A002000642HR,101,1383899,11734,001,S,1,1,5,kWh,1,1,A,20150629,205600
5,GUARDA,A002000642HR,101,1383899,11800,001,S,1,1,5,kWh,2,2,A,20160218,123000
6,GUARDA,A002000995VM,101,97706436,12158,001,S,1,1,5,kWh,1,3,A,20150713,155300
7,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160129,162300
8,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160202,195800
9,GUARDA,A2000995VM,101,97706436,12163,001,S,1,1,5,kWh,1,3,A,20160404,145200
10,GUARDA,A002000996LV,168,5011703276,3567,001,V,1,1,6,kWh,1,1,A,20150528,205900
11,GUARDA,A02000996LV,168,5011703276,3697,001,V,1,1,6,kWh,2,2,A,20150929,163500
12,GUARDA,A02000996LV,168,5011703276,1287,002,P,1,1,6,kWh,1,1,A,20150528,205900
Generally you want to avoid for loops in pandas.
For example, the first loop, where you calculate total consumption and days, could be rewritten as a groupby apply, something like:
def last_minus_first(df):
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    diff = columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
    return diff

df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.Timestamp(1970, 1, 1)).dt.days  # create days column
df.groupby('NR_CPE').apply(last_minus_first)
(btw I don't understand why you are subtracting each entry from the previous, surely for meter readings this is the same as last-first?)
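If you do want the reading-to-reading differences that the inner loop computes, rather than last minus first, that can also be done without a Python loop. A rough sketch, assuming the frame is sorted the same way as ordenado in the question; the names fecha and grupos are helpers I introduce here:

ordenado = df.sort_values(['NR_CPE', 'DATA_LEITURA', 'HORA_LEITURA'])
ordenado['fecha'] = pd.to_datetime(ordenado['DATA_LEITURA'], format="%Y%m%d")
grupos = ordenado.groupby('NR_CPE')
ordenado['CONSUMO'] = grupos['VALOR_LEITURA'].diff().fillna(0)  # reading minus the previous reading, per meter
ordenado['DIAS'] = grupos['fecha'].diff().dt.days.fillna(0)     # days since the previous reading, per meter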
Then, given the result of the groupby apply above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:
pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()
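Putting the two pieces together, here is a rough end-to-end sketch of the per-meter summary the question builds. It follows the last-minus-first idea above rather than the row-by-row differences; the column names come from the dataset in the question, the output path mirrors the one in the question (with forward slashes), archiv is the filename from the question's glob loop, and resumen / resumen_por_cpe are names I made up:

import numpy as np
import pandas as pd

df = pd.read_csv(archiv, sep=",")  # archiv comes from the glob loop in the question
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df = df.sort_values(['NR_CPE', 'date', 'HORA_LEITURA'])

def resumen_por_cpe(g):
    # total consumption and elapsed days between the first and last reading of a meter
    return pd.Series({
        'CONSUMO': g['VALOR_LEITURA'].iloc[-1] - g['VALOR_LEITURA'].iloc[0],
        'DIAS': (g['date'].iloc[-1] - g['date'].iloc[0]).days,
    })

resumen = df.groupby('NR_CPE').apply(resumen_por_cpe)

# annual consumption with the same consumo / dias * 365 formula as the original code
resumen['CONSUMO_ANUAL'] = (resumen['CONSUMO'] / resumen['DIAS'] * 365).replace([np.inf, np.nan], 0).round(2)

# A / B / C buckets by the number of days covered, as in the second loop of the question
resumen['GRUPO'] = pd.cut(resumen['DIAS'], [-1, 180, 365, np.inf], labels=['A', 'B', 'C'])
print(resumen['GRUPO'].value_counts())

# Append the per-meter summary to the CSV used in the question
resumen.to_csv('C:/Users/Documents/RPE_PORTUGAL/Datos.csv', mode='a', header=False)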
Related
More efficient way to manipulate large dataframe
It's my first real Python script, so feel free to make comments in order to improve my code. The purpose of this script is to extract 2 Oracle tables with Python, store them in a dataframe and then join them with pandas. But for queries returning more than 500k lines I feel that it is slow. Do you know why?

import pandas as pd
from datetime import date
from sqlalchemy import create_engine
import cx_Oracle, time
import pandas as pd
import config

## Variable for the timer
start = time.time()

## User input on the command line
year = input('Saisir une annee : ')
month = input('Saisir le mois, au fomat MM : ')
societe_var = input('SA (APPLE,PEACH,BANANA,ANANAS,ALL) : ')

## SA + BUs corresponding to each SA
sa_list = ['APPLE','PEACH','BANANA','ANANAS']
bu_list_MERE = ['006111','1311402','1311403','1311404','1340115','13411106','1311407','1111','6115910','1166157','6811207','8311345','1111','1188100','8118101','8811102','8810113','8811104','8118105','8811106','8811107','8118108','1111']
bu_list_GARE = ['131400','310254']
bu_list_VOYA = ['0151100','1110073','1007115','1311335','1113340','1311341','1113342','1331143']
bu_list_RESO = ['1211345','13111345','11113395','73111345']

# Points to the right BU list depending on the SA entered
bu_list_map = { 'APPLE': bu_list_APPLE, 'PEACH': bu_list_PEACH, 'BANANA': bu_list_BANANA, 'ANANAS' : bu_list_ANANAS }

if societe_var == 'ALL' :
    print('non codé pour le moment')
elif societe_var in sa_list :
    bu_list = bu_list_map.get(societe_var)
    sa_var = societe_var

i=1
for bu in bu_list :
    start_bu = time.time()

    ## Load the SQL query with the right variables for gla_va_parametre -- EPOST
    query1 = open('gla_va_parametre - VAR.sql',"r").read()
    query1 = query1.replace('#ANNEE',"'" + year + "'").replace('%MOIS%',"'" + month + "'").replace('%SA%',"'" + societe_var + "'").replace('%BUGL%',"'" + bu + "'").replace('%DIVISION%','"C__en__PS_S1_D_OP_UNIT13".OPERATING_UNIT')

    ## Load the SQL query with the right variables for cle-gla_tva -- FPOST
    query2 = open('cle-gla_tva - VAR.sql',"r").read()
    query2 = query2.replace('#ANNEE',"'" + year + "'").replace('%MOIS%',"'" + month + "'").replace('%SA%',"'" + societe_var + "'").replace('%BUGL%',"'" + bu + "'").replace('%DIVISION%','OPERATING_UNIT')

    # Connection parameters
    connection_EPOST = cx_Oracle.connect(user=config.user_EPOST, password=config.password_EPOST, dsn=config.host_EPOST, )
    connection_FPOST = cx_Oracle.connect(user=config.user_FPOST, password=config.password_FPOST, dsn=config.host_FPOST, )

    ## Fetch the EPOST part
    with connection_EPOST :
        # Declare an empty list
        dfl = []
        # Declare an empty DataFrame
        dfs = pd.DataFrame()
        z=1
        # Start Chunking
        for chunk in pd.read_sql(query1, con=connection_EPOST,chunksize=25000) :
            # Start Appending Data Chunks from SQL Result set into List
            dfl.append(chunk)
            print('chunk num : ' + str(z))
            z = z + 1
        # Start appending data from list to dataframe
        dfs = pd.concat(dfl, ignore_index=True)
        print('param récupéré')

    ## Fetch the FPOST part
    with connection_FPOST :
        # Declare an empty list
        df2 = []
        # Declare an empty DataFrame
        dfs2 = pd.DataFrame()
        # Start Chunking
        for chunk in pd.read_sql(query2, con=connection_FPOST,chunksize=10000) :
            # Start Appending Data Chunks from SQL Result set into List
            df2.append(chunk)
        # Start appending data from list to dataframe
        dfs2 = pd.concat(df2, ignore_index=True)
        print('clé récupéré')

    print('Début de la jointure')
    jointure = pd.merge(dfs,dfs2,how='left',left_on=['Code_BU_GL','Code_division','Code_ecriture','Date_comptable','Code_ligne_ecriture','UNPOST_SEQ'],right_on=['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE','UNPOST_SEQ']).drop(columns= ['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE'])
    jointure.to_csv('out\gla_va_'+year+month+"_"+societe_var+"_"+bu+"_"+date.today().strftime("%Y%m%d")+'.csv', index=False, sep='|')
    print('Fichier ' + str(i) + "/" + str(len(bu_list)) + ' généré en : '+ str(time.time() - start_bu)+' secondes')
    i = i + 1

print("L'extraction du périmètre de la SA " + societe_var + " s'est effectué en :" + str((time.time() - start)/60) + " min" )
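One hedged observation about the pattern above (not a measured fix): because every chunk is appended to a list and concatenated at the end, the chunked loop does not reduce peak memory, so for a result set that does fit in memory it is equivalent to a single call, something like:

dfs = pd.read_sql(query1, con=connection_EPOST)
dfs2 = pd.read_sql(query2, con=connection_FPOST)

For 500k+ rows, most of the elapsed time is likely spent fetching and transferring rows from Oracle rather than inside pandas itself.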
CNN in python. Array of images
I was trying to deeply search for a solution online before posting here, but I couldn't find it. My problem arises during the reading of images in the training of a convolutional neural network. Basically, I decided to create a function that builds the input and output values out of a series of images. I want to read all the images of the set, but not all at the same time, to avoid running out of memory, so I created the following function:

def readImages(strSet = 'Train', nIni = 1, nFin = 20):
    if strSet not in ('Train','Test'):
        return None
    #
    # Initialize the output arrays: the images and the labels.
    arrImages = []
    arrLabels = []
    #
    # Go through every directory inside the chosen set.
    for strDir in os.listdir(data_dir+'/' + strSet + '/'):
        # Name of the class we are handling.
        strClass = strDir[strDir.find('-')+1:]
        # Number and names of the files, in case it is smaller than the requested number n.
        arrNameFiles = os.listdir(data_dir+'/' + strSet + '/'+strDir)
        nFiles = len(os.listdir(data_dir+'/' + strSet + '/'+strDir))
        #
        # Take the files from nIni to nFin. This way we make sure we take all of them in each directory.
        #print('nImagesClase(',strSet,',',strClass,'):',nImagesClase(strSet, strClass))
        if (nIni == -1):
            # If the value is -1, take every image in the directory.
            listChosenFiles = arrNameFiles
            #print('Todos: ', len(listChosenFiles))
        else:
            if (nImagesClase(strSet, strClass)<nFin):
                # If we have already gone through every file in the group, pick them at random.
                listChosenFiles = random.sample(arrNameFiles, min(nFiles, nFin-nIni))
                #print('Fin del directorio ',nFin,'>',nImagesClase(strSet,strClass),': ', len(listChosenFiles))
            else:
                # Otherwise, keep going.
                listChosenFiles = arrNameFiles[nIni-1:min(nFin,nImagesClase(strSet, strClass))-1]
                #print('Seguimos ',nIni,'-',nFin,': ', len(listChosenFiles))
        #
        for file in listChosenFiles:
            # Read the file.
            image = plt.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file)
            #print('Original Shape: ',image.shape)
            #plt.imshow(image)
            image = cv2.resize(image, (crop_width, crop_height), interpolation=cv2.INTER_NEAREST)
            #image = image.reshape((image_height,image_width,num_channels))
            #print('Al array de imágenes: ',image.shape)
            arrImages.append(image)
            # Add the labels.
            arrLabel = np.zeros(n_classes)
            arrLabel[array_classes.index(strClass)] = 1
            arrLabels.append(arrLabel)
    #
    # Collect the input and output values in arrays.
    y = np.array(arrLabels)
    X = np.array(arrImages, dtype=np.uint8)
    # Once every image has been read, shuffle the indexes so the images do not come in runs of the same class.
    arrIndexes = np.arange(X.shape[0])
    np.random.shuffle(arrIndexes)
    X = X[arrIndexes]
    y = y[arrIndexes]
    #
    return X, y

To test the behavior of this function I just execute the following line:

X, y = readImages(strSet = 'Train', nIni = 1, nFin = 5)

Which is ok, until nIni and nFin reach certain values (101-105, for example). At that moment, I receive the following error:

ValueError                                Traceback (most recent call last)
<ipython-input-125-8a690256a1fc> in <module>
----> 1 X, y = readImages(strSet = 'Train', nIni = 101, nFin = 105)

<ipython-input-123-9e9ebc660c33> in readImages(strSet, nIni, nFin)
     50     # Collect the input and output values in arrays.
     51     y = np.array(arrLabels)
---> 52     X = np.array(arrImages, dtype=np.uint8)
     53     # Once every image has been read, shuffle the indexes so the images do not come in runs of the same class.
     54     arrIndexes = np.arange(X.shape[0])

ValueError: could not broadcast input array from shape (28,28,3) into shape (28,28)

I put some print traces in the reading of the images, and every one of the read images has a shape of (28,28,3), so I don't really understand where the (28,28) shape pointed out in the error trace comes from. Do you know what could be the problem? Did you face this problem earlier? Thanks in advance.
Some of your images have a single channel. Use cv2.imread instead of plt.imread:

image = cv2.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file)
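The reason that works is that cv2.imread decodes to a 3-channel BGR array by default, so every element appended to arrImages has the same shape. If you would rather keep plt.imread, a rough alternative sketch (my own suggestion, not part of the answer above) is to expand grayscale or RGBA images explicitly before resizing:

import numpy as np
import cv2
import matplotlib.pyplot as plt

# Sketch: force every image to exactly 3 channels so np.array can stack them uniformly
# (assumes channel order does not matter for the later training step)
image = plt.imread(data_dir+'/'+strSet+'/'+strDir+'/'+file)  # may come back as (H, W), (H, W, 3) or (H, W, 4)
if image.ndim == 2:            # grayscale: replicate into 3 channels
    image = np.stack([image] * 3, axis=-1)
elif image.shape[2] == 4:      # RGBA: drop the alpha channel
    image = image[:, :, :3]
image = cv2.resize(image, (crop_width, crop_height), interpolation=cv2.INTER_NEAREST)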
Error: float() argument must be a string or a number, not 'dict_keys'
Hello, I always get the error "Error: float() argument must be a string or a number, not 'dict_keys'" when running this code. I thought I ran it before without any problem, but it seems like now it does not work anymore. Is that possible? And if so, what can I do to get it working again? My problem is within the very last part, where I want to graph the time it took to do the calculations. But I guess maybe one of the earlier variables isn't defined correctly or something? Appreciate your help! Thank you!

from sklearn.linear_model import LinearRegression
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt  # plotting library for lists and arrays
import time
import threading  # to run sub-threads
import gc  # gc.collect keeps clearing memory

# Definition of rows and columns (top section)
np.random.seed(42)
n_rows = 100       # fixed number of rows
cols_min = 100     # starting number of columns
cols_max = 12_800  # maximum number of columns so the loop exits
timeout = 90       # maximum seconds to run an iteration, can be tweaked for testing

# Helper functions we will call later
def mult_range(start, end):
    # The purpose is to keep doubling the number of columns
    val = start
    while val <= end:
        yield val
        val *= 2

# gen_dataset generates a dataset, starting from the 100 columns fixed above.
# The point is to keep x and y separate so the plots can be done in one line later.
def gen_dataset(n_rows, n_cols):
    y = np.random.rand(n_rows)          # fills the array with n_rows values (the n rows)
    x = np.random.rand(n_rows, n_cols)  # array of n rows by m columns
    return x, y

# Times the computation
def timeit(timeout, n_cols, result, fn, *args, **kwargs):
    try:  # function that may raise an exception
        # for info: https://uniwebsidad.com/libros/algoritmos-python/capitulo-12/excepciones
        t1 = datetime.now()  # current time
        fn(*args, **kwargs)
        t2 = datetime.now()  # new time
        delta = t2 - t1      # difference
        delta_microsecs = delta.seconds * 1_000_000 + delta.microseconds  # time the execution took
        if (delta_microsecs / 1_000_000) < (timeout + 500_000):  # condition so it prints how long it took
            # prints immediately so we know the time taken per column count
            print(f"for {n_cols} columns it took {delta_microsecs / 1_000_000} seconds")
            result[n_cols] = delta_microsecs
        #else:
        #    print(f"for {n_cols} columns it took {delta_microsecs / 1_000_000} seconds")
    except MemoryError:  # when the exception is raised, in case it uses too many PC resources
        print(f"data doesn't fit in memory for {n_cols} columns")

# Regression with sklearn for the given x and y
def sklearn_reg(x, y):
    reg = LinearRegression()
    reg.fit(x, y)  # takes the already defined x and y values

# Regression with numpy for the given x and y
def np_reg(x, y):  # takes the already defined x and y values
    # add a column of ones
    x_ = np.hstack((np.ones((x.shape[0], 1)), x))
    # estimator (x'x)^-1 * (x'y)
    return np.matmul(np.linalg.inv(np.matmul(x_.T, x_)),np.matmul(x_.T, y))

# Regression with a timeout in seconds, from cols_min starting columns up to cols_max columns
def time_reg(timeout, cols_min, cols_max, reg_fn, square=False):  # square is used when we want X squared
    timing = {}
    threads = []
    for n_cols in mult_range(cols_min, cols_max):
        try:
            gc.collect()
            x, y = gen_dataset(n_rows, n_cols)
            if square:
                x = np.square(x)
            thread = threading.Thread(target=timeit, args=(timeout, n_cols, timing, reg_fn, x, y))
            thread.start()
            for i in range(timeout):
                time.sleep(1)
                if not thread.is_alive():
                    break
            else:
                print(f"for {n_cols} columns it took more than {timeout} seconds")
        except MemoryError:
            print(f"data doesn't fit in memory for {n_cols} columns")
    return timing

def plot_time(timing):
    fig = plt.figure(figsize=(10, 10))
    plt.plot(timing.keys(), timing.values())
    #plt.axis((0, 1_000_000, 0, 1_000_000))
    plt.xlabel('time (μs)')
    plt.ylabel('columns')
    plt.show()

plot_time(time_reg(timeout, cols_min, cols_max, sklearn_reg))  # plot the time taken vs. the number of columns
plot_time(time_reg(timeout, cols_min, cols_max, np_reg))       # plot the time taken vs. the number of columns
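The message usually comes from plt.plot receiving dict view objects (timing.keys() / timing.values()), which some matplotlib versions try to coerce to float directly. A minimal sketch of the usual workaround, an assumption on my part since the full traceback is not shown; the axis labels also look swapped relative to what is being plotted, so they are flipped here:

def plot_time(timing):
    # materialize the dict views as lists so matplotlib can convert them to arrays
    cols = list(timing.keys())
    micros = list(timing.values())
    plt.figure(figsize=(10, 10))
    plt.plot(cols, micros)
    plt.xlabel('columns')    # x axis holds the column counts
    plt.ylabel('time (μs)')  # y axis holds the elapsed microseconds
    plt.show()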
What am I doing wrong here? Error "index out of range" when trying to fill a list with lists
I want to make a list with lists inside, using a for loop, and I get "index out of range". I tried with empleados.append() but it doesn't work.

def main():
    empleados=[]
    for i in range(1):
        empleados[i][0](input("Ingrese el Nombre: "))
        empleados[i][1](input("Ingrese el Apellido: "))
        empleados[i][2](int(input("Ingrese el Sueldo Base: ")))
        empleados[i][3](int(input("Ingrese el AFP 1 o 2: ")))
        empleados[i][4](datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))))
        empleados[i][5](int(input("Ingrese la cantidad de hijos que tiene: ")))
Welcome to SO! There's no list at empleados[0] to insert new values into. I find something like this a little easier to read:

def main():
    empleados=[]
    for i in range(1):
        empleado_nueva = []
        empleado_nueva.append(input("Ingrese el Nombre: "))
        empleado_nueva.append(input("Ingrese el Apellido: "))
        empleado_nueva.append(int(input("Ingrese el Sueldo Base: ")))
        empleado_nueva.append(int(input("Ingrese el AFP 1 o 2: ")))
        empleado_nueva.append(datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))))
        empleado_nueva.append(int(input("Ingrese la cantidad de hijos que tiene: ")))
        empleados.append(empleado_nueva)
    return empleados

It's worth mentioning that the index-access pattern you're attempting (empleados[i][0] = ...) only works if there's something already at that index, for instance:

>>> x = []
>>> x[0] = 1
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: list assignment index out of range
>>> x = ['a', 'b', 'c']
>>> x[0] = 'd'
>>> x
['d', 'b', 'c']

So the appends are probably the best way to go.
The problem is you're trying to use empleados[i] as a list with an existing index you can insert into, when at the moment it's not. You need to set up your variables in a separate list and then append it. E.g.

def main():
    empleados=[]
    vars = [
        input("Ingrese el Nombre: "),
        input("Ingrese el Apellido: "),
        int(input("Ingrese el Sueldo Base: ")),
        int(input("Ingrese el AFP 1 o 2: ")),
        datetime(int(input("Ingrese la Fecha de Ingreso(pulsa intro cada vez 2000 12 31): ")),int(input("/")),int(input("/"))),
        int(input("Ingrese la cantidad de hijos que tiene: "))
    ]
    empleados.append(vars)
Error when I try to iterate more than once
I've got this program which calculates k-means for AI:

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from random import sample
from itertools import repeat
from math import sqrt

# Parameters
k = 6
maxit = 2

def leeValoracionesFiltradas (nomFichero = "valoracionesFiltradas.data"):
    lineas = [(l.strip()).split("\t") for l in (open(nomFichero).readlines())]
    diccio = {}
    for l in lineas:
        diccio[int(l[0])] = {}
    for l in lineas:
        diccio[int(l[0])][int(l[1])] = (float(l[2]),float(l[3]))
    return diccio

def distEuclidea(dic1, dic2):
    # Sum of squares of the elements common to both dictionaries
    sum2 = sum([pow(dic1[elem]-dic2[elem], 2) for elem in dic1 if elem in dic2])
    return sqrt(sum2)

def similitudEuclidea(dic1, dic2):
    return 1/(1+distEuclidea(dic1, dic2))

def coefPearson(dic1, dic2):
    # Get the elements common to both dictionaries
    comunes = [x for x in dic1 if x in dic2]
    nComunes = float(len(comunes))
    # If there are no common elements -> zero
    if nComunes==0:
        return 0
    # Mean of each dictionary
    media1 = sum([dic1[x][1] for x in comunes]) / nComunes
    media2 = sum([dic2[x][1] for x in comunes]) / nComunes
    # Numerator and denominator
    num = sum([(dic1[x][1] - media1) * (dic2[x][1] - media2) for x in comunes])
    den1 = sqrt(sum([pow(dic1[x][1] - media1, 2) for x in comunes]))
    den2 = sqrt(sum([pow(dic2[x][1] - media2, 2) for x in comunes]))
    den = den1 * den2
    # Compute the coefficient
    if den==0:
        return 0
    return num/den

# Given a dictionary {key1 : {key2 : value}}, computes the k-means clustering
# with k clusters, running maxit iterations, with the specified similarity function.
# Returns a tuple:
# - {key1 : cluster number} with the cluster assignments (which cluster each element belongs to)
# - [{key2 : values}] a list with the k centroids (mean of the values for each cluster)
def kmeans (diccionario, k, maxit, similitud = coefPearson):
    # K random points are initially chosen as centroids
    # Each centroid is {key2 : value}
    centroides = [diccionario[x] for x in sample(diccionario.keys(), k)]
    # Each key1 is assigned to a cluster number
    previo = None
    asignacion = {}
    # In each iteration points are assigned to centroids and new centroids are computed
    for it in range(maxit):
        # Assign points to the closest centroids
        for key1 in diccionario:
            similitudes = map(similitud,repeat(diccionario[key1],k), centroides)
            asignacion[key1] = similitudes.index(max(similitudes))
        # If the assignment does not change, stop
        if previo == asignacion:
            break
        previo = asignacion
        # Recompute the centroids (accumulate the values of each key for each centroid)
        valores = {x : {} for x in range(k)}
        contadores = {x : {} for x in range(k)}
        for key1 in diccionario:
            grupo = asignacion[key1]
            for key2 in diccionario[key1]:
                if not valores[grupo].has_key(key2):
                    valores [grupo][key2] = 0
                    contadores [grupo][key2] = 0
                valores [grupo][key2] += diccionario[key1][key2][1]
                contadores[grupo][key2] += 1
        # Compute the means (new centroids)
        centroides = []
        for grupo in valores:
            centro = {}
            for key2 in valores[grupo]:
                centro[key2] = round((valores[grupo][key2] / contadores[grupo][key2]),2)
            centroides.append(centro)
        if None in centroides:
            break
    return (asignacion, centroides)

# Get the ratings dictionary (the ratings have already been filtered)
diccionario = leeValoracionesFiltradas()

# Get the assignments and the centroids with the Pearson correlation
tupla = kmeans (diccionario, k, maxit)
asignaciones = tupla[0]
centroids = tupla[1]
print asignaciones
print centroids

And when I execute this, for example with maxit = 2, it throws:

File "kmeans_dictio.py", line 46, in coefPearson
    media2 = sum([dic2[x][1] for x in comunes]) / nComunes
TypeError: 'float' object has no attribute '__getitem__'

How can I fix this?
It looks like you have a dictionary (dic2) of floats and a dictionary of dictionaries of floats (dic1) that you are pulling an item out of with this line:

comunes = [x for x in dic1 if x in dic2]

Then you are trying to index into this float here:

media2 = sum([dic2[x][1] for x in comunes]) / nComunes

To fix this, look at dic1 and dic2 and how they are defined.
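Concretely, in the kmeans function above the recomputed centroids store plain floats ({key2: mean}) while the original diccionario stores (value, rating) tuples, so on the second iteration dic2[x][1] hits a float. One possible reconciliation, a sketch of my own and not the original author's fix, is to normalize both shapes before computing the coefficient:

from math import sqrt

def _rating(v):
    # diccionario holds (reading, rating) tuples; the recomputed centroids hold plain floats
    return v[1] if isinstance(v, tuple) else v

def coefPearson(dic1, dic2):
    comunes = [x for x in dic1 if x in dic2]
    nComunes = float(len(comunes))
    if nComunes == 0:
        return 0
    media1 = sum(_rating(dic1[x]) for x in comunes) / nComunes
    media2 = sum(_rating(dic2[x]) for x in comunes) / nComunes
    num = sum((_rating(dic1[x]) - media1) * (_rating(dic2[x]) - media2) for x in comunes)
    den = sqrt(sum(pow(_rating(dic1[x]) - media1, 2) for x in comunes)) * \
          sqrt(sum(pow(_rating(dic2[x]) - media2, 2) for x in comunes))
    return num / den if den else 0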