More efficient way to manipulate large dataframe - python

This is my first real Python script, so feel free to comment on anything that would improve the code.
The purpose of this script is to extract two Oracle tables with Python, load each of them into a pandas DataFrame, and then join them with pandas.
However, for queries returning more than 500k rows it feels slow. Do you know why?
import pandas as pd
from datetime import date
from sqlalchemy import create_engine
import cx_Oracle, time
import config
## Timer for the whole run
start = time.time()

## Interactive input (the prompts are runtime strings and stay as they were)
year = input('Saisir une annee : ')
month = input('Saisir le mois, au fomat MM : ')
societe_var = input('SA (APPLE,PEACH,BANANA,ANANAS,ALL) : ')

## Companies (SA) and the business units (BU) attached to each of them
sa_list = ['APPLE', 'PEACH', 'BANANA', 'ANANAS']
# NOTE(review): '1111' appears three times below, so that BU is extracted
# three times -- confirm whether the duplicates are intentional.
bu_list_MERE = ['006111','1311402','1311403','1311404','1340115','13411106','1311407','1111','6115910','1166157','6811207','8311345','1111','1188100','8118101','8811102','8810113','8811104','8118105','8811106','8811107','8118108','1111']
bu_list_GARE = ['131400','310254']
bu_list_VOYA = ['0151100','1110073','1007115','1311335','1113340','1311341','1113342','1331143']
bu_list_RESO = ['1211345','13111345','11113395','73111345']

# Points to the right BU list for the SA that was typed in.
# NOTE(review): the original mapping referenced bu_list_APPLE/PEACH/BANANA/
# ANANAS, which are never defined; the four lists above are assumed to
# correspond to the four SA in the same order -- confirm.
bu_list_map = {
    'APPLE': bu_list_MERE,
    'PEACH': bu_list_GARE,
    'BANANA': bu_list_VOYA,
    'ANANAS': bu_list_RESO,
}

# Join keys: columns of the EPOST frame and the matching FPOST columns.
_LEFT_KEYS = ['Code_BU_GL','Code_division','Code_ecriture','Date_comptable','Code_ligne_ecriture','UNPOST_SEQ']
_RIGHT_KEYS = ['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE','UNPOST_SEQ']


def _render_query(template, bu, division):
    """Fill the placeholders of a SQL template for one business unit."""
    # NOTE(review): the values are spliced into the SQL text as quoted
    # literals and year/month/SA come straight from input(); bind variables
    # would be safer if the .sql templates can be adapted.
    return (template
            .replace('#ANNEE', "'" + year + "'")
            .replace('%MOIS%', "'" + month + "'")
            .replace('%SA%', "'" + societe_var + "'")
            .replace('%BUGL%', "'" + bu + "'")
            .replace('%DIVISION%', division))


def _read_sql_chunked(query, connection, chunksize, verbose=False):
    """Run ``query`` chunk by chunk and return the result as one DataFrame."""
    chunks = []
    reader = pd.read_sql(query, con=connection, chunksize=chunksize)
    for num, chunk in enumerate(reader, start=1):
        if verbose:
            print('chunk num : ' + str(num))
        chunks.append(chunk)
    # pd.concat() raises on an empty list, so an empty result set is
    # represented by an empty frame, as the original initialisation did.
    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)


if societe_var == 'ALL':
    print('non codé pour le moment')
elif societe_var in sa_list:
    bu_list = bu_list_map.get(societe_var)
    sa_var = societe_var

    # The two SQL templates do not change between BUs: read them once.
    with open('gla_va_parametre - VAR.sql', "r") as sql_file:
        template_EPOST = sql_file.read()
    with open('cle-gla_tva - VAR.sql', "r") as sql_file:
        template_FPOST = sql_file.read()

    # Connection parameters. Both connections are opened once and shared by
    # every BU instead of being re-created on each iteration.
    connection_EPOST = cx_Oracle.connect(user=config.user_EPOST, password=config.password_EPOST, dsn=config.host_EPOST)
    connection_FPOST = cx_Oracle.connect(user=config.user_FPOST, password=config.password_FPOST, dsn=config.host_FPOST)

    with connection_EPOST, connection_FPOST:
        for i, bu in enumerate(bu_list, start=1):
            start_bu = time.time()

            ## Query for gla_va_parametre -- EPOST
            query1 = _render_query(template_EPOST, bu, '"C__en__PS_S1_D_OP_UNIT13".OPERATING_UNIT')
            ## Query for cle-gla_tva -- FPOST
            query2 = _render_query(template_FPOST, bu, 'OPERATING_UNIT')

            ## EPOST part
            dfs = _read_sql_chunked(query1, connection_EPOST, 25000, verbose=True)
            print('param récupéré')

            ## FPOST part
            dfs2 = _read_sql_chunked(query2, connection_FPOST, 10000)
            print('clé récupéré')

            print('Début de la jointure')
            # Left join; the right-hand key columns are dropped afterwards,
            # except UNPOST_SEQ, which has the same name on both sides.
            jointure = pd.merge(dfs, dfs2, how='left', left_on=_LEFT_KEYS, right_on=_RIGHT_KEYS).drop(columns=['BUSINESS_UNIT','OPERATING_UNIT','JOURNAL_ID','JOURNAL_DATE','JOURNAL_LINE'])

            # NOTE(review): the date suffix was mangled in the original line;
            # date.today().strftime("%Y%m%d") is assumed from the leftover
            # "%Y%m%d" format and the otherwise unused `date` import.
            out_name = r'out\gla_va_' + year + month + "_" + societe_var + "_" + bu + "_" + date.today().strftime("%Y%m%d") + '.csv'
            jointure.to_csv(out_name, index=False, sep='|')
            print('Fichier ' + str(i) + "/" + str(len(bu_list)) + ' généré en : '+ str(time.time() - start_bu)+' secondes')

    print("L'extraction du périmètre de la SA " + societe_var + " s'est effectué en :" + str((time.time() - start)/60) + " min" )


Capture substring and send it to a function that modifies it and can replace it in this string

import re
def one_day_or_another_day_relative_to_a_date_func(input_text):
    """Return the replacement text for one matched expression.

    ``input_text`` is whatever ``re.sub`` passes to its replacement callback
    (the match object). It is currently ignored and a fixed placeholder is
    returned.
    """
    #print(repr(input_text)) #print what you have captured, and you should replace
    return "aaaaaaaa"


def identify(input_text):
    """Replace "N dias antes/despues de <date> o M dias ..." expressions.

    Each of the four patterns is applied in turn to the whole text, and every
    match is replaced by the value returned by
    ``one_day_or_another_day_relative_to_a_date_func``.

    Returns the text after all substitutions.
    """
    some_text = r"(?:(?!\.\s*?\n)[^;])*"
    date_capture_pattern = r"([12]\d{3}-[01]\d-[0-3]\d)(\D*?)"
    previous_days = r"(\d+)\s*(?:dias|dia)\s*(?:antes|previos|previo|antes|atrás|atras)\s*"
    after_days = r"(\d+)\s*(?:dias|dia)\s*(?:después|despues|luego)\s*"
    n_patterns = [
        previous_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + previous_days,
        after_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + previous_days,
        previous_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + after_days,
        after_days + r"(?:del|de\s*el|de|al|a)\s*" + some_text + date_capture_pattern + some_text + r"\s*(?:,\s*o|o)\s*" + after_days]
    # Try the search patterns one after the other.
    for n_pattern in n_patterns:
        # flags must be passed by keyword: the fourth positional argument of
        # re.sub is `count`, so a positional re.IGNORECASE (== 2) silently
        # limited the call to two replacements and left matching
        # case-sensitive.
        input_text = re.sub(n_pattern, one_day_or_another_day_relative_to_a_date_func, input_text, flags=re.IGNORECASE)
    # The substituted text has to be handed back to the caller.
    return input_text
input_texts = ["8 dias antes o 9 dias antes del 2022-12-22",
"2 dias despues o 1 dia antes del 2022-12-22, dia en donde ocurrio",
"a tan solo 2 dias despues de 2022-12-22 o a caso eran 6 dias despues, mmm no recuerdo bien",
for input_text in input_texts:
This is the incorrect output I am getting: because the substrings are captured incorrectly, the replacements are incorrect as well.
The pattern has well-defined limits, so I don't understand why it tries to capture text beyond them.
The output I need is:
"aaaaaaaa, dia en donde ocurrio"
"a tan solo aaaaaaaa, mmm no recuerdo bien"
There are several errors in your code, among which:
You are printing the result of the one_day_or_another_day_relative_to_a_date_func function. Print the result of identify instead.
In the identify function you are not returning the result text. Add return input_text at the end of it.
Make the "o..." suffix optional.
Use regex alternation instead of multiple patterns, otherwise you may get unexpected results.
Fixed code (I've also made it more compact):
import re
def one_day_or_another_day_relative_to_a_date_func(input_text):
    """Return the replacement text for one matched expression.

    ``input_text`` is whatever ``re.sub`` passes to its replacement callback
    (the match object). It is currently ignored and a fixed placeholder is
    returned.
    """
    #print(repr(input_text)) #print what you have captured, and you should replace
    return "aaaaaaaa"


def identify(input_text):
    """Replace relative-date expressions anchored on a YYYY-MM-DD date.

    The pattern is "N dias antes/despues", optional free text, a preposition,
    the date, and optionally "o ... M dias antes/despues". Every match is
    replaced by the value returned by
    ``one_day_or_another_day_relative_to_a_date_func``.

    Returns the text after substitution.
    """
    some_text = r"(?:(?!\.\s*?\n)[^;])*"
    date_capture_pattern = r"([12]\d{3}-[01]\d-[0-3]\d)(\D*?)"
    previous_days = r"antes|previos|previo|atrás|atras"
    after_days = r"después|despues|luego"
    # Raw strings throughout, so "\s" is never read as a string escape.
    prev_or_after = r"(\d+)\s*(?:dias|dia)\s*(?:" + previous_days + r"|" + after_days + r")\s*"
    preposition = r"(?:del|de\s*el|de|al|a)\s*"
    suffix = r"(?:\s*(?:,\s*o|o)\s*" + some_text + prev_or_after + r")?"
    pattern = prev_or_after + some_text + preposition + date_capture_pattern + suffix
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.IGNORECASE (== 2) capped the call
    # at two replacements and left matching case-sensitive.
    return re.sub(pattern, one_day_or_another_day_relative_to_a_date_func, input_text, flags=re.IGNORECASE)
input_texts = ["8 dias antes o 9 dias antes del 2022-12-22",
"2 dias despues o 1 dia antes del 2022-12-22, dia en donde ocurrio",
"a tan solo 2 dias despues de 2022-12-22 o a caso eran 6 dias despues, mmm no recuerdo bien",
for input_text in input_texts:
aaaaaaaa, dia en donde ocurrio
a tan solo aaaaaaaa, mmm no recuerdo bien

How to sort the websites by their popularity?

I'm currently using the script below and I can't find a way to sort the websites by their popularity. I'm a beginner.
import random
# List of the websites.
Websites = ["A", "B", "C", "D", "E", "F"]

# Hyperlinks: each key is a site name and each value is the list of sites
# that site links to.
Hypertext = {
    "A": ["B", "C", "E"],
    "B": ["F"],
    "C": ["A", "E"],
    "D": ["B", "C"],
    "E": ["A", "B", "C", "D", "F"],
    "F": ["E"],
}

# Number of visits per site, initialised to 0.0 for every site.
Walk_Number = {site: 0.0 for site in Websites}
# Total number of visits over all sites.
Total_Walk = 0

# 1000 walks: each one starts on a random site and, while a random draw
# stays below 0.85, counts a visit on the current site and follows one of
# its links chosen at random.
for _ in range(1000):
    x = random.choice(Websites)
    while random.random() < 0.85:
        Walk_Number[x] += 1
        Total_Walk += 1
        x = random.choice(Hypertext[x])

print(Walk_Number)
I tried using the sort() function, but I can't figure out how to work it into the script.
I assume that by popularity you mean the number of visits stored in your Walk_Number dictionary. To get a copy of the dictionary ordered by value, in descending order, you can do this:
def sort_dict_by_value(d, reverse=False):
    """Return a new dict holding the items of ``d`` ordered by value.

    Args:
        d: mapping whose values can be compared with each other.
        reverse: when true, the largest value comes first.

    Returns:
        A new ``dict``; ``d`` itself is not modified. Items with equal values
        keep their original relative order because ``sorted`` is stable.
    """
    ordered_items = sorted(d.items(), key=lambda item: item[1], reverse=reverse)
    return dict(ordered_items)
print(sort_dict_by_value(Walk_Number, True))

Python slicing output

I have a school project on analyzing logs. Roughly, I need to retrieve the day and time from a log line as a string in the format "MM DD HH:MM:SS", but the result always shows up as a list, like "['MM DD HH:MM:SS']".
def get_complete_date(line):
    """Return the leading "MM DD HH:MM:SS" timestamp of a log line.

    Pre : line is a well-formed log line (str).
    Post: returns the date and the time as a single string, without changing
          the format.
    """
    # The timestamp is made of the first three space-separated fields. Join
    # them back together so the caller gets a string, as the contract above
    # says, rather than the list of fields.
    fields = line.split(" ", 3)
    return " ".join(fields[:3])
A very naive approach: if the date-time part has the same length on every line (the same pattern each time), you can simply slice the line from 0 to the length of the pattern:
def get_complete_date(l, tpl="MM DD HH:MM:SS"):
    """Return the fixed-width timestamp found at the start of a log line.

    Args:
        l: the log line (str).
        tpl: a sample of the timestamp layout; only its length is used.

    Returns:
        The first ``len(tpl)`` characters of ``l``.
    """
    return l[:len(tpl)]
line = "MM DD HH:MM:SS some text"
You are almost there. It's an issue with slicing.
def get_complete_date(line):
    """Return the day and time fields of a log line.

    Pre : line is a well-formed log line (str) starting with
          "MM DD HH:MM:SS".
    Post: returns a list holding the second and third space-separated fields
          (day and time); the first field (month) is skipped.
    """
    fields = line.split(" ")
    # [start:stop:step] -> items 1 and 2.
    return fields[1:3:1]
line = "MM DD HH:MM:SS"
Gives #
['DD', 'HH:MM:SS']
Explanation: your slice should be
[1:3:1] = [start:stop:step]
The 0th element is the month, which you don't need, so start from index 1.

Pandas + Python: More efficient code

This is my code:
import pandas as pd
import os
import glob as g
# Per-meter ('NR_CPE') consumption report, written with Python 2 syntax
# (print statements). For every CSV matched by the glob it sorts and groups
# the readings by meter, fills CONSUMO and DIAS row by row, sums them per
# meter and derives a yearly figure.
# NOTE(review): this listing looks truncated in several places: `cont`,
# `conta`, `contb` and `contc` are read but never assigned here, the
# 'FECHA_LECTURA' column is used but never created (presumably it comes from
# `d`), and the final `with open(...)` / `to_excel(...)` / `except` section
# is incomplete. Confirm against the full script.
archivos = g.glob('C:\Users\Desktop\*.csv')
for archiv in archivos:
# Input path without its extension; reused below to name the .xls output.
nombre = os.path.splitext(archiv)[0]
df = pd.read_csv(archiv, sep=",")
# Parse the YYYYMMDD reading date, then drop the raw column.
d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
del df['DATA_LEITURA']
df["CONSANUAL"] = ""
ordenado = df.sort_values(['NR_CPE','FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)
## Group by CPE
agrupado = ordenado.groupby('NR_CPE')
for name, group in agrupado: # Iterate over the groups
indice = group.index.values
inicio = indice[0]
fin = indice[-1]
# The first reading of each CPE has no previous reading, so its consumption and day count are set to 0
ordenado.CONSUMO.loc[inicio] = 0
ordenado.DIAS.loc[inicio] = 0
for i in indice: # Iterate over the readings inside each CPE group
if i > inicio and i <= fin :
# Difference between the current reading/date and the previous one.
consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont-1]]
dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont-1]]).days
ordenado.CONSUMO.loc[i] = consumo
ordenado.DIAS.loc[i] = dias
# Per-group sums
dias = agrupado['DIAS'].sum()
consu = agrupado['CONSUMO'].sum()
# Scale the average consumption per day to 365 days.
canu = (consu/dias) * 365
# Counter with the number of occurrences of groups A, B and C
# Iterate over the per-CPE day totals to do the comparison
print "Grupos:"
for ind, sumdias in dias.iteritems():
# A: up to 180 days, B: more than 180 and up to 365 days, C: more than 365 days.
if sumdias <= 180:
grupo = "A"
elif sumdias > 180 and sumdias <= 365:
grupo = "B"
elif sumdias > 365:
grupo = "C"
print "grupo A: " , conta
print "grupo B: " , contb
print "grupo C: " , contc
# Format the fields so that not all decimals are shown
Fdias ='{:.0f}'.format)
Fcanu ='{:.2f}'.format)
frames = [Fdias, consu, Fcanu]
concat = pd.concat(frames,axis=1).replace(['inf','nan'],[0,0])
with open('C:\Users\Documents\RPE_PORTUGAL\Datos.csv','a') as f:
ordenado.to_excel(nombre+'.xls', columns=["NOME_DISTRITO",
print (archiv)
print ("===============================================")
print ("*****Se ha creado el archivo correctamente*****")
print ("===============================================")
except IOError:
print ("===================================================")
print ("¡¡¡¡¡Hubo un error en la escritura del archivo!!!!!")
print ("===================================================")
This takes a file containing energy-consumption readings taken on different dates for every light meter ('NR_CPE') and does some calculations:
Calculate the energy consumption for every 'NR_CPE' by subtracting the previous reading from the next one, and put the result in a new column named 'CONSUMO'.
Calculate the number of days between consecutive readings and sum up those days.
Add up the consumption for every 'NR_CPE' and calculate the annual consumption.
Finally, I want to classify every light meter ('NR_CPE') by the number of days covered by its readings: A if it has 180 days or fewer, B if it has between 180 days and one year, and C if it has more than a year.
The result is then written to two different files.
Any idea how I should rewrite this so that it produces the same output but runs faster?
Thank you all.
BTW this is my dataset:
Generally you want to avoid for loops in pandas.
For example, the first loop where you calculate total consumption and days could be rewritten as a groupby apply something like:
def last_minus_first(df):
    """Return last-row minus first-row for the reading and the day count.

    Meant to be applied to one group of rows (e.g. through ``groupby().apply``).

    Args:
        df: DataFrame with at least the 'VALOR_LEITURA' and 'days' columns
            and at least one row.

    Returns:
        A Series indexed by 'VALOR_LEITURA' and 'days' holding the difference
        between the last and the first row.
    """
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    return columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
# Parse the YYYYMMDD reading date into a datetime column.
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
# Create the days column: whole days elapsed since 1970-01-01.
# The pd.datetime alias has been removed from pandas, so pd.Timestamp is used.
df['days'] = (df['date'] - pd.Timestamp(1970, 1, 1)).dt.days
(btw I don't understand why you are subtracting each entry from the previous, surely for meter readings this is the same as last-first?)
Then given the result of the above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:
pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()

Create a binary completeness map

I'm close to finishing my project, but I'm stuck on one problem: how can I create a completeness map?
I have a lot of data: a field with maybe 500,000 objects, which are represented by dots in my plot at different zoom levels:
I would like to create a mask. That is, cut my plot into tiny pixels and, if a pixel contains an object, give it the value 1 (black, for example); if it contains no object, give it the value 0 (white, for example).
Once the mask is created, I could divide each field by it.
The problem is that I don't know how to go about this.
I wrote a first script to apply a selection to my data. Here it is:
# coding: utf-8
from astropy.io import fits
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt

# File holding the raw field #
filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits'

# Open the file with astropy
# NOTE(review): the right-hand side was missing in the original listing;
# fits.open() is assumed because the result is indexed like an HDU list below.
field = fits.open(filename)
print("Ouverture du fichier : " + str(filename))

# Read the FITS table data
tbdata = field[1].data
print("Lecture des données du fits")

# Selection on PROB: keep rows with -0.1 < PROB < 1.1 #
mask = np.bitwise_and(tbdata['PROB'] < 1.1, tbdata['PROB'] > -0.1)
new_tbdata = tbdata[mask]
print("Création du Masque")

# Extreme values of the field #
# RA_max and RA_min
RA_max = np.max(new_tbdata['RA'])
RA_min = np.min(new_tbdata['RA'])
print("RA_max vaut : " + str(RA_max))
print("RA_min vaut : " + str(RA_min))

# DEC_max and DEC_min
DEC_max = np.max(new_tbdata['DEC'])
DEC_min = np.min(new_tbdata['DEC'])
print("DEC_max vaut : " + str(DEC_max))
print("DEC_min vaut : " + str(DEC_min))

# Central value of the field: midpoint of the extremes #
RA_central = (RA_max + RA_min)/2.
DEC_central = (DEC_max + DEC_min)/2.
print("RA_central vaut : " + str(RA_central))
print("DEC_central vaut : " + str(DEC_central))

print(" ")
print(" ------------------------------- ")
print(" ")

# X and Y: offsets from the field centre #
# NOTE(review): np.cos() takes radians; if DEC is stored in degrees this
# needs np.radians(DEC_central) -- confirm the unit of the column.
new_col_data_X = (new_tbdata['RA'] - RA_central) * np.cos(DEC_central)
new_col_data_Y = new_tbdata['DEC'] - DEC_central
print('Création du tableau')

# New columns
col_X = fits.Column(name='X', format='D', array=new_col_data_X)
col_Y = fits.Column(name='Y', format='D', array=new_col_data_Y)
print('Création des nouvelles colonnes X et Y')

# New table: the selected rows plus the X and Y columns
tbdata_final = fits.BinTableHDU.from_columns(new_tbdata.columns + col_X + col_Y)

# Write the output .fits file
print('Ecriture du nouveau fichier mask')
# NOTE(review): the write call itself was missing from the original listing;
# the target name is assumed from the '<input>_mask' file that the follow-up
# script opens -- confirm.
tbdata_final.writeto(filename + '_mask')
OK, that part works! Here is the second part as it stands at the moment:
filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits_mask'
print('Fichier en cours de traitement' + str(filename) + '\n')

# Open the file with astropy
# NOTE(review): the right-hand side was missing in the original listing;
# fits.open() is assumed because the result is indexed like an HDU list below.
field = fits.open(filename)

# Read the FITS table data
tbdata = field[1].data

# Scatter plot of the X/Y positions, one dot per object.
figure = plt.figure(1)
plt.plot(tbdata['X'], tbdata['Y'], '.')
Do you have any idea how to proceed?
How can I cut my plot into tiny bins?
Thank you!
After the answer from armatita, I updated my script:
filename = '/home/valentin/Desktop/Field52_combined_final_roughcal.fits_mask'
print('Fichier en cours de traitement' + str(filename) + '\n')

# Opening file with astropy
# NOTE(review): the right-hand side was missing in the original listing;
# fits.open() is assumed because the result is indexed like an HDU list below.
field = fits.open(filename)

# fits data reading
tbdata = field[1].data

nodesx, nodesy = 360, 360   # PIXELS IN X, PIXELS IN Y

# Grid origin and the data-space extent covered by one pixel.
firstx, firsty = np.min(tbdata['X']), np.min(tbdata['Y'])
sizex = (np.max(tbdata['X']) - np.min(tbdata['X'])) / nodesx
sizey = (np.max(tbdata['Y']) - np.min(tbdata['Y'])) / nodesy

grid = np.zeros((nodesx + 1, nodesy + 1), dtype='bool')   # PLUS 1 TO ENSURE ALL DATA IS INSIDE GRID

# Pixel index of every object along each axis.
indx = np.int_((tbdata['X'] - firstx) / sizex)
indy = np.int_((tbdata['Y'] - firsty) / sizey)
grid[indx, indy] = True   # WHERE DATA EXISTS SET TRUE
I get this plot:
So, when I play with the bin size, I see more or fewer blank pixels, which show whether or not there is an object in each pixel :)
This is usually a matter of inserting your data into a grid (pixel-wise or node-wise). The following example builds a grid (a 2D array) and calculates the "grid coordinates" of the sample data. Once you have those grid coordinates (which are in fact nothing but array indexes), you can simply set the corresponding elements to True. Check the following example:
import numpy as np
import matplotlib.pyplot as plt
# Sample data: 1000 points drawn from a standard normal on each axis.
x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)

# Grid resolution: number of pixels along X and along Y.
nodesx, nodesy = 100, 100

# Grid origin (smallest coordinate on each axis) and the data-space extent
# covered by a single pixel.
firstx = x.min()
firsty = y.min()
sizex = (x.max() - firstx) / nodesx
sizey = (y.max() - firsty) / nodesy

# One extra row and column so that every point falls inside the grid.
grid = np.zeros((nodesx + 1, nodesy + 1), dtype=bool)

# Turn each coordinate into an array index, then flag the pixels that
# contain at least one point.
indx = ((x - firstx) / sizex).astype(np.int_)
indy = ((y - firsty) / sizey).astype(np.int_)
grid[indx, indy] = True
This results in:
Notice that I'm displaying the grid as an image with imshow. If I decrease the number of pixels (nodesx, nodesy = 20, 20), I get:
For a more automatic plot in matplotlib, you can also consider hexbin.
