import re

def one_day_or_another_day_relative_to_a_date_func(input_text):
    #print(repr(input_text)) #print what you have captured, and you should replace
    return "aaaaaaaa"

def identify(input_text):
    some_text = r"(?:(?!\.\s*?\n)[^;])*"
    date_capture_pattern = r"([12]\d{3}-[01]\d-[0-3]\d)(\D*?)"
    previous_days = r"(\d+)\s*(?:dias|dia)\s*(?:antes|previos|previo|antes|atrás|atras)\s*"
    after_days = r"(\d+)\s*(?:dias|dia)\s*(?:después|despues|luego)\s*"
    n_patterns = [
        previous_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + previous_days,
        after_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + previous_days,
        previous_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + after_days,
        after_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + after_days]
    # Iterate over the list of search patterns so the program tries them one by one
    for n_pattern in n_patterns:
        # This is my attempt at the replacement, though it has problems with non-greedy modifiers
        input_text = re.sub(n_pattern, one_day_or_another_day_relative_to_a_date_func, input_text, re.IGNORECASE)
input_texts = ["8 dias antes o 9 dias antes del 2022-12-22",
"2 dias despues o 1 dia antes del 2022-12-22, dia en donde ocurrio",
"a tan solo 2 dias despues de 2022-12-22 o a caso eran 6 dias despues, mmm no recuerdo bien",
]
#Testing...
for input_text in input_texts:
#print(input_text)
print(one_day_or_another_day_relative_to_a_date_func(input_text))
This is the incorrect output I am getting; because the substrings are captured incorrectly, the replacements are also incorrect:
"aaaaaaaa"
"aaaaaaaa"
"aaaaaaaa"
The pattern has well-defined limits, so I don't understand why it tries to capture beyond them.
And the output that I need is this:
"aaaaaaaa"
"aaaaaaaa, dia en donde ocurrio"
"a tan solo aaaaaaaa, mmm no recuerdo bien"
There are several errors in your code, among them:
You are printing the result of the one_day_or_another_day_relative_to_a_date_func function. Print the result of identify instead.
In the identify function you are not returning the result text. Add return input_text at the end of it.
You are passing re.IGNORECASE as the fourth positional argument of re.sub, which is count, not flags. Pass it as flags=re.IGNORECASE.
Make the "o..." suffix optional.
Use regex alternation instead of multiple patterns, otherwise you may get unexpected results.
Fixed code (I've also made it more compact):
import re

def one_day_or_another_day_relative_to_a_date_func(input_text):
    #print(repr(input_text)) #print what you have captured, and you should replace
    return "aaaaaaaa"

def identify(input_text):
    some_text = r"(?:(?!\.\s*?\n)[^;])*"
    date_capture_pattern = r"([12]\d{3}-[01]\d-[0-3]\d)(\D*?)"
    previous_days = r"antes|previos|previo|atrás|atras"
    after_days = r"después|despues|luego"
    prev_or_after = r"(\d+)\s*(?:dias|dia)\s*(?:" + previous_days + "|" + after_days + r")\s*"
    preposition = r"(?:del|de\s*el|de|al|a)\s*"
    suffix = "(?:" + r"\s*(?:,\s*o|o)\s*" + some_text + prev_or_after + ")?"
    pattern = prev_or_after + some_text + preposition + date_capture_pattern + suffix
    input_text = re.sub(pattern, one_day_or_another_day_relative_to_a_date_func, input_text, flags=re.IGNORECASE)
    return input_text
input_texts = ["8 dias antes o 9 dias antes del 2022-12-22",
               "2 dias despues o 1 dia antes del 2022-12-22, dia en donde ocurrio",
               "a tan solo 2 dias despues de 2022-12-22 o a caso eran 6 dias despues, mmm no recuerdo bien",
               ]

#Testing...
for input_text in input_texts:
    #print(input_text)
    print(identify(input_text))
Result:
aaaaaaaa
aaaaaaaa, dia en donde ocurrio
a tan solo aaaaaaaa, mmm no recuerdo bien
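One more detail, shown here with a small standalone example: the fourth positional parameter of re.sub is count, not flags, so passing re.IGNORECASE positionally silently limits the number of replacements instead of enabling case-insensitive matching. Pass it by keyword:

import re

text = "DIA Dia dia"
# re.IGNORECASE is the integer 2, so here it is taken as count=2 (matching stays case-sensitive!)
print(re.sub(r"dia", "X", text, re.IGNORECASE))        # DIA Dia X
# Passing it by keyword enables case-insensitive matching as intended
print(re.sub(r"dia", "X", text, flags=re.IGNORECASE))  # X X X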
This is my code:
import pandas as pd
import os
import glob as g

archivos = g.glob(r'C:\Users\Desktop\*.csv')
for archiv in archivos:
    nombre = os.path.splitext(archiv)[0]
    df = pd.read_csv(archiv, sep=",")
    d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
    df['FECHA_LECTURA'] = d.dt.date
    del df['DATA_LEITURA']
    df['CONSUMO'] = ""
    df['DIAS'] = ""
    df["SUMDIAS"] = ""
    df["SUMCONS"] = ""
    df["CONSANUAL"] = ""
    ordenado = df.sort_values(['NR_CPE', 'FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)
    ## Group by CPE
    agrupado = ordenado.groupby('NR_CPE')
    for name, group in agrupado:  # Iterate over the groups
        indice = group.index.values
        inicio = indice[0]
        fin = indice[-1]
        # Fill the first reading of each CPE with zero (because there is no previous reading)
        ordenado.CONSUMO.loc[inicio] = 0
        ordenado.DIAS.loc[inicio] = 0
        cont = 0
        for i in indice:  # Iterate over the readings inside each CPE group
            if i > inicio and i <= fin:
                cont = cont + 1
                consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont - 1]]
                dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont - 1]]).days
                ordenado.CONSUMO.loc[i] = consumo
                ordenado.DIAS.loc[i] = dias
    # Compute the sums; the result is a DataFrame object
    dias = agrupado['DIAS'].sum()
    consu = agrupado['CONSUMO'].sum()
    canu = (consu / dias) * 365
    # Counters with the number of occurrences of groups A, B and C
    conta = 0
    contb = 0
    contc = 0
    # Since it is a DataFrame, I have to iterate over it to do the comparison
    print("Groups:")
    for ind, sumdias in dias.iteritems():
        if sumdias <= 180:
            grupo = "A"
            conta = conta + 1
        elif sumdias > 180 and sumdias <= 365:
            grupo = "B"
            contb = contb + 1
        elif sumdias > 365:
            grupo = "C"
            contc = contc + 1
    print("group A: ", conta)
    print("group B: ", contb)
    print("group C: ", contc)
    # Format the fields so we don't show all the decimals
    Fdias = dias.map('{:.0f}'.format)
    Fcanu = canu.map('{:.2f}'.format)
    frames = [Fdias, consu, Fcanu]
    concat = pd.concat(frames, axis=1).replace(['inf', 'nan'], [0, 0])
    with open(r'C:\Users\Documents\RPE_PORTUGAL\Datos.csv', 'a') as f:
        concat.to_csv(f, header=False, columns=['CPE', 'DIAS', 'CONSUMO', 'CONSUMO_ANUAL'])
    try:
        ordenado.to_excel(nombre + '.xls', columns=["NOME_DISTRITO",
            "NR_CPE", "MARCA_EQUIPAMENTO", "NR_EQUIPAMENTO", "VALOR_LEITURA", "REGISTADOR", "TIPO_REGISTADOR",
            "TIPO_DADOS_RECOLHIDOS", "FACTOR_MULTIPLICATIVO_FINAL", "NR_DIGITOS_INTEIRO", "UNIDADE_MEDIDA",
            "TIPO_LEITURA", "MOTIVO_LEITURA", "ESTADO_LEITURA", "HORA_LEITURA", "FECHA_LECTURA", "CONSUMO", "DIAS"],
            index=False)
        print(archiv)
        print("===============================================")
        print("*****The file was created successfully*****")
        print("===============================================")
    except IOError:
        print("===================================================")
        print("!!!!!There was an error writing the file!!!!!")
        print("===================================================")
This takes a file where I have energy consumption readings from different dates for every light meter ('NR_CPE') and does some calculations:
Calculate the energy consumption for every 'NR_CPE' by subtracting the previous reading from the next one, and put the result in a new column named 'CONSUMO'.
Calculate the number of days between readings and sum up the total number of days.
Add up the consumption for every 'NR_CPE' and calculate the annual consumption.
Finally, I want to classify every light meter ('NR_CPE') by the number of days covered by its readings: A if it has less than 180 days, B between 180 days and 1 year, and C more than a year.
And finally write the results to two different files.
Any idea how I should re-code this to keep the same output but be faster?
Thank you all.
BTW this is my dataset:
,NOME_DISTRITO,NR_CPE,MARCA_EQUIPAMENTO,NR_EQUIPAMENTO,VALOR_LEITURA,REGISTADOR,TIPO_REGISTADOR,TIPO_DADOS_RECOLHIDOS,FACTOR_MULTIPLICATIVO_FINAL,NR_DIGITOS_INTEIRO,UNIDADE_MEDIDA,TIPO_LEITURA,MOTIVO_LEITURA,ESTADO_LEITURA,DATA_LEITURA,HORA_LEITURA
0,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,1,1,A,20150629,205600
1,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,2,2,A,20160218,123300
2,GUARDA,A002000642VJ,122,204534,25083,001,S,1,1,5,kWh,1,1,A,20150629,205700
3,GUARDA,A002000642VJ,122,204534,27536,001,S,1,1,5,kWh,2,2,A,20160218,123200
4,GUARDA,A002000642HR,101,1383899,11734,001,S,1,1,5,kWh,1,1,A,20150629,205600
5,GUARDA,A002000642HR,101,1383899,11800,001,S,1,1,5,kWh,2,2,A,20160218,123000
6,GUARDA,A002000995VM,101,97706436,12158,001,S,1,1,5,kWh,1,3,A,20150713,155300
7,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160129,162300
8,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160202,195800
9,GUARDA,A2000995VM,101,97706436,12163,001,S,1,1,5,kWh,1,3,A,20160404,145200
10,GUARDA,A002000996LV,168,5011703276,3567,001,V,1,1,6,kWh,1,1,A,20150528,205900
11,GUARDA,A02000996LV,168,5011703276,3697,001,V,1,1,6,kWh,2,2,A,20150929,163500
12,GUARDA,A02000996LV,168,5011703276,1287,002,P,1,1,6,kWh,1,1,A,20150528,205900
Generally you want to avoid for loops in pandas.
For example, the first loop, where you calculate total consumption and days, could be rewritten as a groupby apply, something like:
def last_minus_first(df):
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    diff = columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
    return diff

df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.Timestamp(1970, 1, 1)).dt.days  # create days column
df.groupby('NR_CPE').apply(last_minus_first)
(BTW, I don't understand why you are subtracting each entry from the previous; surely for meter readings this is the same as last minus first?)
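As a quick sanity check of that last-minus-first claim (a toy example with made-up readings, not the real data):

import numpy as np

v = np.array([4834, 4900, 5050])  # hypothetical increasing meter readings
# Consecutive differences telescope: their sum equals last - first
print(np.diff(v).sum() == v[-1] - v[0])  # True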
Then given the result of the above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:
pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()
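Putting the two pieces together, a rough end-to-end sketch of the vectorized version might look like this ('data.csv' and the summary/counts names are placeholders, and it assumes the columns shown in the sample dataset):

import numpy as np
import pandas as pd

def last_minus_first(df):
    # Difference between the last and first reading of each meter
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    return columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]

df = pd.read_csv('data.csv')  # placeholder for one of the input files
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.Timestamp(1970, 1, 1)).dt.days
df = df.sort_values(['NR_CPE', 'date', 'HORA_LEITURA'])

# One row per meter: total consumption and days covered
summary = df.groupby('NR_CPE').apply(last_minus_first)
summary = summary.rename(columns={'VALOR_LEITURA': 'CONSUMO', 'days': 'DIAS'})
summary['CONSUMO_ANUAL'] = summary['CONSUMO'] / summary['DIAS'] * 365

# Classify each meter by days covered and count the groups
counts = pd.cut(summary['DIAS'], [-1, 180, 365, np.inf], labels=['A', 'B', 'C']).value_counts()
print(counts)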
I'm getting close to the goal of my project, but I've run into a problem: how can I create a completeness map?
I have lots of data, a field with maybe 500,000 objects, which are represented by dots in my plot at different zoom levels:
I would like to create a mask; I mean, cut my plot into tiny pixels, and if there is an object in a pixel it gets the value 1 (black, for example), and if there is no object in the pixel it gets the value 0 (white, for example).
That way I'll create a mask, and I can divide each field by this mask.
The problem is that I don't know how to proceed in order to do that :/
I created a first script in order to make a selection on my data. This one:
#!/usr/bin/python
# coding: utf-8

from astropy.io import fits
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt

###################################
# File containing the raw field   #
###################################

filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits'

# Open the file with astropy
field = fits.open(filename)
print("Opening file: " + str(filename))

# Read the FITS data
tbdata = field[1].data
print("FITS data read")

###############################
# Apply the selection on PROB #
###############################

mask = np.bitwise_and(tbdata['PROB'] < 1.1, tbdata['PROB'] > -0.1)
new_tbdata = tbdata[mask]
print("Mask created")

#############################################
# Determine the extreme values of the field #
#############################################

# Determine RA_max and RA_min
RA_max = np.max(new_tbdata['RA'])
RA_min = np.min(new_tbdata['RA'])
print("RA_max is: " + str(RA_max))
print("RA_min is: " + str(RA_min))

# Determine DEC_max and DEC_min
DEC_max = np.max(new_tbdata['DEC'])
DEC_min = np.min(new_tbdata['DEC'])
print("DEC_max is: " + str(DEC_max))
print("DEC_min is: " + str(DEC_min))

##########################################
# Compute the central value of the field #
##########################################

# Determine the mean RA and DEC
RA_central = (RA_max + RA_min) / 2.
DEC_central = (DEC_max + DEC_min) / 2.
print("RA_central is: " + str(RA_central))
print("DEC_central is: " + str(DEC_central))

print(" ")
print(" ------------------------------- ")
print(" ")

#####################
# Determine X and Y #
#####################

# Create the arrays (DEC is in degrees, so convert it to radians for the cosine)
new_col_data_X = (new_tbdata['RA'] - RA_central) * np.cos(np.radians(DEC_central))
new_col_data_Y = new_tbdata['DEC'] - DEC_central
print('Arrays created')

# Create the new columns
col_X = fits.Column(name='X', format='D', array=new_col_data_X)
col_Y = fits.Column(name='Y', format='D', array=new_col_data_Y)
print('New X and Y columns created')

# Create the new table
tbdata_final = fits.BinTableHDU.from_columns(new_tbdata.columns + col_X + col_Y)

# Write the .fits output file
tbdata_final.writeto('{}_{}'.format(filename, 'mask'))
print('New mask file written')

field.close()
OK, it's working! But for now the second part looks like this:
###################################################
###################################################
###################################################

filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits_mask'
print('Processing file: ' + str(filename) + '\n')

# Open the file with astropy
field = fits.open(filename)

# Read the FITS data
tbdata = field[1].data

figure = plt.figure(1)
plt.plot(tbdata['X'], tbdata['Y'], '.')
plt.show()
Do you have any idea how to proceed?
How can I cut my plot into tiny bins?
Thank you!
UPDATE:
After the answer from armatita, I updated my script:
###################################################
###################################################
###################################################

filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits_mask'
print('Processing file: ' + str(filename) + '\n')

# Open the file with astropy
field = fits.open(filename)

# Read the FITS data
tbdata = field[1].data

##### BUILDING A GRID FOR THE DATA ########
nodesx, nodesy = 360, 360  # PIXELS IN X, PIXELS IN Y
firstx, firsty = np.min(tbdata['X']), np.min(tbdata['Y'])
sizex = (np.max(tbdata['X']) - np.min(tbdata['X'])) / nodesx
sizey = (np.max(tbdata['Y']) - np.min(tbdata['Y'])) / nodesy
grid = np.zeros((nodesx + 1, nodesy + 1), dtype='bool')  # PLUS 1 TO ENSURE ALL DATA IS INSIDE GRID

# CALCULATING GRID COORDINATES OF DATA
indx = np.int_((tbdata['X'] - firstx) / sizex)
indy = np.int_((tbdata['Y'] - firsty) / sizey)
grid[indx, indy] = True  # WHERE DATA EXISTS SET TRUE

# PLOT MY FINAL IMAGE
plt.imshow(grid.T, origin='lower', cmap='binary', interpolation='nearest')
plt.show()
I get this plot:
So when I play with the bin size, I can see more or fewer blanks, which indicate whether or not there is an object in each pixel :)
This is usually a process of inserting your data into a grid (pixel-wise, or node-wise). The following example builds a grid (a 2D array) and calculates the "grid coordinates" of the sample data. Once it has those grid coordinates (which in truth are nothing but array indexes) you can just set those elements to True. Check the following example:
import numpy as np
import matplotlib.pyplot as plt

x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)

##### BUILDING A GRID FOR THE DATA ########
nodesx, nodesy = 100, 100  # PIXELS IN X, PIXELS IN Y
firstx, firsty = x.min(), y.min()
sizex = (x.max() - x.min()) / nodesx
sizey = (y.max() - y.min()) / nodesy
grid = np.zeros((nodesx + 1, nodesy + 1), dtype='bool')  # PLUS 1 TO ENSURE ALL DATA IS INSIDE GRID

# CALCULATING GRID COORDINATES OF DATA
indx = np.int_((x - firstx) / sizex)
indy = np.int_((y - firsty) / sizey)
grid[indx, indy] = True  # WHERE DATA EXISTS SET TRUE

# PLOT MY FINAL IMAGE
plt.imshow(grid.T, origin='lower', cmap='binary', interpolation='nearest')
plt.show()
which results in:
Notice I'm showing the image with imshow. If I decrease the number of pixels (nodesx, nodesy = 20, 20) I get:
Also for a more automatic plot in matplotlib you can consider hexbin.
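For completeness, a minimal hexbin sketch using the same kind of random sample data as above (the gridsize value here is just an example):

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical sample data standing in for the X/Y columns of the FITS table
x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)

# hexbin bins the points automatically; gridsize controls how many bins span the x-axis
plt.hexbin(x, y, gridsize=50, cmap='binary')
plt.colorbar(label='objects per bin')
plt.show()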