Delete rows that contains no information on Tweet text on pandas - python

I´m trying to remove rows containing blank texts or in tweet texts column. But I have tried in different ways counting the rows that only contain whitespace or counting the leading spaces and trailing spaces but to get a criterion to eliminate it.
ID tweet WhiteSpaceCount HaveWhiteSpace
0 this is a text 0 False
1 0 False
2 Hello im fine 0 False
I want to delete all the rows that don´t have any information on the tweet column.
Code here:
def extractAndSave(api, name):
# Creamos una lista de tweets:
previous_date = date.today() - timedelta(days=1)
query_date = date.today()
name = name
tweets = API_EXTRACTOR.search(q=name + "-filter:retweets", result_type='recent', timeout=999999, count=200,
end_time=previous_date, tweet_mode='extended')
# Podemos crear un dataframe como sigue:
tweet_list = []
for tweet in tweets:
tweet_list.append(tweet.full_text)
datos = pd.DataFrame(data=tweet_list, columns=['TWEETS'])
# CREANDO COLUMNA DE ID
id_list = []
for id in tweets:
id_list.append(id.id)
id = pd.DataFrame(data=id_list, columns=['ID'])
# CREANDO COLUMNA DE ID
creado_list = []
for creado in tweets:
creado_list.append(creado.created_at)
creado = pd.DataFrame(data=creado_list, columns=['FECHA_CREACION'])
# CREANDO COLUMNA DE nombre de usuario
user_list = []
for usuario in tweets:
user_list.append(usuario.user.screen_name)
usuario = pd.DataFrame(data=user_list, columns=['USUARIO'])
# CREANDO COLUMNA DE FUENTE
fuente_list = []
for fuente in tweets:
fuente_list.append(fuente.source)
fuente = pd.DataFrame(data=fuente_list, columns=['FUENTE'])
# CREANDO COLUMNA DE ME GUSTA
like_list = []
for like in tweets:
like_list.append(like.favorite_count)
like = pd.DataFrame(data=like_list, columns=['ME_GUSTA'])
# CREANDO COLUMNA DE RT
rt_list = []
for rt in tweets:
rt_list.append(rt.retweet_count)
retweet = pd.DataFrame(data=rt_list, columns=['ME_GUSTA'])
# CREANDO COLUMNA DE IDIOMA
idioma_list = []
for idioma in tweets:
idioma_list.append(idioma.lang)
idioma = pd.DataFrame(data=idioma_list, columns=['IDIOMA'])
# CREANDO COLUMNA DE IDIOMA
quote_list = []
for quote in tweets:
quote_list.append(quote.is_quote_status)
quote = pd.DataFrame(data=quote_list, columns=['CITADO'])
# CREANDO COLUMNA DE IDIOMA
location_list = []
for location in tweets:
location_list.append(location.user.location)
location = pd.DataFrame(data=location_list, columns=['LOCACION'])
# CONCATENANDO DATAFRAMES
datos = pd.concat([datos, id, creado, usuario, fuente, like, retweet, quote, idioma, location], axis=1)
# Dropear toda la fila si la columna tweets viene vacia.
datos['pass/fail'] = np.where(datos['TWEETS'].astype(str).str.fullmatch(r"\s*"),'FAIL','PASS')
datos['CONTEO_ESPACIOS']= (datos['TWEETS'].str.startswith(" ") | datos['TWEETS'].str.endswith(" ")).sum()
# Hora de publicación
datos['HORA_PUBLICACION'] = datos['FECHA_CREACION'].dt.hour
datos['DIA_SEMANA'] = datos['FECHA_CREACION'].dt.day_name()
# Extrayendo solo los tweets del día anterior
datos['FECHA_CREACION'] = pd.to_datetime(datos['FECHA_CREACION']).dt.date
datos = datos[datos['FECHA_CREACION'] == previous_date]
print(datos)
# Guardando en dataframe.
return datos

Instead of removing rows that you don't need, keep only the ones you do need:
df = df[df["tweet"].str.strip().str.len()>0]
>>> df
ID tweet WhiteSpaceCount HaveWhiteSpace
0 0 this is a text 0 False
2 2 Hello im fine 0 False

Related

How can i delete all my tweets using twitter api

I'm trying to delete all the tweets/replies from my account, at first it worked, but it got to a point where he stopped deleting, and he can't receive the tweets anymore, giving the error:
File "main.py", line 29, in fetch_tweets
oldest = all_tweets[-1].id - 1
IndexError: list index out of range
and on my account, even if it doesn't appear on the profile (I don't know why) there are still 19.2k tweets to be deleted, does anyone have any idea how to fix this?
code:
'''
Script para apagar todos os meus tweets mais antigos que determinada data
'''
from keep_alive import keep_alive
import tweepy
from config import *
import datetime
import pandas as pd
client = tweepy.Client(bearer_token, api_key, api_secret, access_token, access_token_secret)
auth = tweepy.OAuth1UserHandler(api_key, api_secret, access_token, access_token_secret)
api = tweepy.API(auth)
def fetch_tweets(username):
'''
Baixa todos os tweets do usuário
determinado em 'username'
'''
print("Resgatando Tweets. . .")
all_tweets = []
new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, exclude_replies=False)
all_tweets.extend(new_tweets)
# Salva o id do tweet antigo menos um
oldest = all_tweets[-1].id - 1
while len(new_tweets) > 0: # Continua pegando tweets até a requisição retornar nada
# Todos as requests posteriores usam max_id "para avançar no tempo"
new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, max_id=oldest)
all_tweets.extend(new_tweets)
# Atualiza o id
oldest = all_tweets[-1].id - 1
# Transform the tweepy tweets into a 2D array that will populate the csv
output = [
[ tweet.id,
tweet.created_at,
tweet.created_at.strftime("%d-%m-%Y"),
tweet.retweet_count,
tweet.favorite_count,
username ] for tweet in all_tweets
]
for sublist in output:
sublist.append(username)
return output
def validate_date(date_text):
'''
Verifica se a data entrada pelo usuário
está no foramto YYYY-MM-DD. Se não estiver,
levanta uma exeção com mensagem de erro.
'''
try:
datetime.datetime.strptime(date_text, '%Y-%m-%d')
except ValueError:
raise ValueError("A data não está no formato YYYY-MM-DD. Execute o programa novamente.")
def filter_tweets(start, tweets):
'''
Usa o dataframe com todos os tweets
e a data de corte, depois da qual os
tweets devem ser mantidos, para gerar
uma lista com os ids das publicações
devem ser removidas.
'''
print("Filtrando Tweets. . .")
now = datetime.datetime.now()
start_date = pd.to_datetime(start, format = "%Y-%m-%d")
# Filtra intervalo de tweets que quero manter
keep_dates = pd.date_range(start=start_date, end=now)
keep_dates = [str(date)[:10] for date in keep_dates]
# Cria uma lista de ids cujo tweet deve ser mantido
tweets_to_delete = [ tweet[0] for tweet in tweets if str(pd.to_datetime(tweet[1]))[:10] not in keep_dates ]
return tweets_to_delete
def delete_tweets(tweet_ids):
'''
Deleta os tweets cujos números
identificadores estão na lista
tweet_ids
'''
print("Deletando Tweets. . .")
# Começa a deletar:
delete_count = 0
for tweet_id in tweet_ids:
try:
api.destroy_status(tweet_id)
print(tweet_id, 'deletado!', delete_count)
delete_count += 1
except:
print(tweet_id, 'não pode ser deletado!')
print('Pronto!', delete_count, 'tweets foram deletados, ao todo.')
##########################
### Execução principal ###
##########################
def main():
print("Iniciando. . .")
username = "xxxxxxxxxx"
start = "2022-10-25"
while True:
try:
tweets = fetch_tweets(username)
tweets = filter_tweets(start, tweets)
delete_tweets(tweets)
except tweepy.TweepyException as e
try:
print(e)
except:
print("error")
keep_alive()
main()
I already tried to change the parameters, put a conditional to check if the list is empty, but none of that worked
The API can only go back so far in retrieving older Tweets.
Another option would be to request your Twitter Archive, which would contain the Tweet IDs and content - you can then potentially use the API to delete Tweets by ID.
Note that there are account and API rate limits that will restrict the speed at which you can run this operation.

Problem adding information to Table Widget

I'm trying to add to the Table Widget the information that I have on my SQLite table.
That's it my SQL table:
.
That my Table Widget:
The code of the method:
if self.ui.btn_jugadores.clicked.connect(lambda: self.ui.pila_de_paginas.setCurrentWidget(self.ui.pg_jugadores)): self.mostrar_jugadores()
def mostrar_jugadores(self):
try:
cursor = conectarse_bbdd_jugadores()
consulta = ''' SELECT * FROM jugadores'''
ejecucion = cursor.execute(consulta).fetchall()
if len(ejecucion) > 0:
fila = 0
for e in ejecucion:
columna = 0
for apartado in e:
celda = QTableWidgetItem(apartado)
self.ui.tabla_jgds.setItem(fila, columna, celda)
columna += 1
print(apartado)
fila += 1
else:
QMessageBox.setText('No hay registros en la tabla de jugadores')
QMessageBox.setIcon(QMessageBox.warning)
QMessageBox.exec_()
except Error as error:
QMessageBox.setText('No se ha podido ejecutar la sentencia')
QMessageBox.setIcon(QMessageBox.warning)
QMessageBox.exec_()
And this is the result:
when I executed it
I don't know why the numbers are not appearing in the table widget.

Dealing with special characters in pandas Data Frame´s column Name

I am importing an excel worksheet that has the following columns name:
N° Pedido
1234
6424
4563
The column name ha a special character (°). Because of that, I can´t merge this with another Data Frame or rename the column. I don´t get any error message just the name stays the same. What should I do?
This is the code I am using and the result of the Dataframes:
import pandas as pd
import numpy as np
# Importando Planilhas
CRM = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de
Vendas\relatorio_vendas_CRM.xlsx', encoding= 'utf-8')
protheus = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de
Vendas\relatorio_vendas_protheus.xlsx', encoding= 'utf-8')
#transformando em Data Frame
df_crm = CRM.parse('190_pedido_export (33)')
df_protheus = protheus.parse('Relatorio de Pedido de Venda')]
# Transformando Campos em float o protheus
def turn_to_float(x):
return np.float(x)
df_protheus["TES"] = df_protheus["TES"].apply(turn_to_float)
df_protheus["Qtde"] = df_protheus["Qtde"].apply(turn_to_float)
df_protheus["Valor"] = df_protheus["Valor"].apply(turn_to_float)
#Tirando Tes de não venda do protheus
# tirando valores com código errado 6
df_protheus_1 = df_protheus[df_protheus.TES != 513.0]
df_protheus_2 = df_protheus_1[df_protheus_1.TES != 576.0]
**df_crm.columns = df_crm.columns.str.replace('N° Pedido', 'teste')
df_crm.columns**
Orçamento Origem N° Pedido Nº Pedido ERP Estabelecimento Tipo de
Pedido Classificação(Tipo) Aplicação Conta CNPJ/CPF Contato ...
Aprovação Parcial Antecipa Entrega Desconto da Tabela de Preço
Desconto do Cliente Desconto Informado Observações Observações NF Vl
Total Bruto Vl Total Completo
0 20619.0 23125 NaN Optitex 1 - Venda NaN Industrialização/Revenda
XAVIER E ARAUJO LTDA ME 7970626000170 NaN ... N N 0 0 0
Note that I used other codes for the bold part with the same result:
#renomeando tabela para dar Merge
#df_crm['proc'] = df_crm['N\xc2\xb0 Pedido']
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df_crm.drop('N° Pedido',inplace=True,axis=1)
#df_crm
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df.drop('N° Pedido',inplace=True,axis=1)
#df_crm
#df_crm_1 = df_crm.rename(columns={"N°Pedido": "teste"})
#df_crm_1
Thanks for posting the link to the Google Sheet. I downloaded it and loaded it via pandas:
df = pd.read_excel(r'~\relatorio_vendas_CRM.xlsx', encoding = 'utf-8')
df.columns = df.columns.str.replace('°', '')
df.columns = df.columns.str.replace('º', '')
Note that the two replace statements are replacing different characters, although they look very similar.
Help from: Why do I get a SyntaxError for a Unicode escape in my file path?
I was able to copy the values into another column. You could try that
df['N Pedido'] = df['N° Pedido']
df.drop('N° Pedido',inplace=True,axis=1)

Pandas + Python: More efficient code

This is my code:
import pandas as pd
import os
import glob as g
archivos = g.glob('C:\Users\Desktop\*.csv')
for archiv in archivos:
nombre = os.path.splitext(archiv)[0]
df = pd.read_csv(archiv, sep=",")
d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['FECHA_LECTURA'] = d.dt.date
del df['DATA_LEITURA']
df['CONSUMO']=""
df['DIAS']=""
df["SUMDIAS"]=""
df["SUMCONS"]=""
df["CONSANUAL"] = ""
ordenado = df.sort_values(['NR_CPE','FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)
##Agrupamos por el CPE
agrupado = ordenado.groupby('NR_CPE')
for name, group in agrupado: #Recorremos el grupo
indice = group.index.values
inicio = indice[0]
fin = indice[-1]
#Llenamos la primeras lectura de cada CPE, con esa lectura (porque no hay una lectura anterior)
ordenado.CONSUMO.loc[inicio] = 0
ordenado.DIAS.loc[inicio] = 0
cont=0
for i in indice: #Recorremos lo que hay dentro de los grupos, dentro de los CPES(lecturas)
if i > inicio and i <= fin :
cont=cont+1
consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont-1]]
dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont-1]]).days
ordenado.CONSUMO.loc[i] = consumo
ordenado.DIAS.loc[i] = dias
# Hago las sumatorias, el resultado es un objeto DataFrame
dias = agrupado['DIAS'].sum()
consu = agrupado['CONSUMO'].sum()
canu = (consu/dias) * 365
#Contador con el numero de courrencias de los campos A,B y C
conta=0
contb=0
contc=0
#Como es un DF, para recorrerlo tengo que iterar sobre ellos para hacer la comparacion
print "Grupos:"
for ind, sumdias in dias.iteritems():
if sumdias <= 180:
grupo = "A"
conta=conta+1
elif sumdias > 180 and sumdias <= 365:
grupo = "B"
contb=contb+1
elif sumdias > 365:
grupo = "C"
contc=contc+1
print "grupo A: " , conta
print "grupo B: " , contb
print "grupo C: " , contc
#Formateamos los campos para no mostrar todos los decimales
Fdias = dias.map('{:.0f}'.format)
Fcanu = canu.map('{:.2f}'.format)
frames = [Fdias, consu, Fcanu]
concat = pd.concat(frames,axis=1).replace(['inf','nan'],[0,0])
with open('C:\Users\Documents\RPE_PORTUGAL\Datos.csv','a') as f:
concat.to_csv(f,header=False,columns=['CPE','DIAS','CONSUMO','CONSUMO_ANUAL'])
try:
ordenado.to_excel(nombre+'.xls', columns=["NOME_DISTRITO",
"NR_CPE","MARCA_EQUIPAMENTO","NR_EQUIPAMENTO","VALOR_LEITURA","REGISTADOR","TIPO_REGISTADOR",
"TIPO_DADOS_RECOLHIDOS","FACTOR_MULTIPLICATIVO_FINAL","NR_DIGITOS_INTEIRO","UNIDADE_MEDIDA",
"TIPO_LEITURA","MOTIVO_LEITURA","ESTADO_LEITURA","HORA_LEITURA","FECHA_LECTURA","CONSUMO","DIAS"],
index=False)
print (archiv)
print ("===============================================")
print ("*****Se ha creado el archivo correctamente*****")
print ("===============================================")
except IOError:
print ("===================================================")
print ("¡¡¡¡¡Hubo un error en la escritura del archivo!!!!!")
print ("===================================================")
This takes a file where I have lectures of energy consumption from different dates for every light meter('NR_CPE') and do some calculations:
Calculate the energy consumption for every 'NR_CPE' by substracting the previous reading with the next one and the result put in a new column named 'CONSUMO'.
Calculate the number of days where I'v got a reading and sum up the number of days
Add the consumption for every 'NR_CPE' and calculate the anual consumption.
Finally I want to classify by number of days that every light meter('NR_CPE') has a lecture. A if it has less than 180 days, B between 180 and 1 year and C more than a year.
And finally write this result in two differents files.
Any idea of how should I re-code this to have the same output and be faster?
Thank you all.
BTW this is my dataset:
,NOME_DISTRITO,NR_CPE,MARCA_EQUIPAMENTO,NR_EQUIPAMENTO,VALOR_LEITURA,REGISTADOR,TIPO_REGISTADOR,TIPO_DADOS_RECOLHIDOS,FACTOR_MULTIPLICATIVO_FINAL,NR_DIGITOS_INTEIRO,UNIDADE_MEDIDA,TIPO_LEITURA,MOTIVO_LEITURA,ESTADO_LEITURA,DATA_LEITURA,HORA_LEITURA
0,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,1,1,A,20150629,205600
1,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,2,2,A,20160218,123300
2,GUARDA,A002000642VJ,122,204534,25083,001,S,1,1,5,kWh,1,1,A,20150629,205700
3,GUARDA,A002000642VJ,122,204534,27536,001,S,1,1,5,kWh,2,2,A,20160218,123200
4,GUARDA,A002000642HR,101,1383899,11734,001,S,1,1,5,kWh,1,1,A,20150629,205600
5,GUARDA,A002000642HR,101,1383899,11800,001,S,1,1,5,kWh,2,2,A,20160218,123000
6,GUARDA,A002000995VM,101,97706436,12158,001,S,1,1,5,kWh,1,3,A,20150713,155300
7,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160129,162300
8,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160202,195800
9,GUARDA,A2000995VM,101,97706436,12163,001,S,1,1,5,kWh,1,3,A,20160404,145200
10,GUARDA,A002000996LV,168,5011703276,3567,001,V,1,1,6,kWh,1,1,A,20150528,205900
11,GUARDA,A02000996LV,168,5011703276,3697,001,V,1,1,6,kWh,2,2,A,20150929,163500
12,GUARDA,A02000996LV,168,5011703276,1287,002,P,1,1,6,kWh,1,1,A,20150528,205900
Generally you want to avoid for loops in pandas.
For example, the first loop where you calculate total consumption and days could be rewritten as a groupby apply something like:
def last_minus_first(df):
columns_of_interest = df[['VALOR_LEITURA', 'days']]
diff = columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
return diff
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.datetime(1970,1,1)).dt.days # create days column
df.groupby('NR_CPE').apply(last_minus_first)
(btw I don't understand why you are subtracting each entry from the previous, surely for meter readings this is the same as last-first?)
Then given the result of the above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:
pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()

Error when I try to iterate more than once

I've got this program which calculate k-means for IA
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from random import sample
from itertools import repeat
from math import sqrt
# Parametros
k = 6
maxit = 2
def leeValoracionesFiltradas (nomFichero = "valoracionesFiltradas.data"):
lineas = [(l.strip()).split("\t") for l in (open(nomFichero).readlines())]
diccio = {}
for l in lineas:
diccio[int(l[0])] = {}
for l in lineas:
diccio[int(l[0])][int(l[1])] = (float(l[2]),float(l[3]))
return diccio
def distEuclidea(dic1, dic2):
# Se calcula la suma de cuadrados de los elementos comunes a los dos diccionarios
sum2 = sum([pow(dic1[elem]-dic2[elem], 2)
for elem in dic1 if elem in dic2])
return sqrt(sum2)
def similitudEuclidea(dic1, dic2):
return 1/(1+distEuclidea(dic1, dic2))
def coefPearson(dic1, dic2):
# Se consiguen los elementos comunes en ambos diccionarios
comunes = [x for x in dic1 if x in dic2]
nComunes = float(len(comunes))
# Si no hay comunes -> cero
if nComunes==0:
return 0
# Calculo de las medias de cada diccionario
media1 = sum([dic1[x][1] for x in comunes]) / nComunes
media2 = sum([dic2[x][1] for x in comunes]) / nComunes
# Numerador y denominador
num = sum([(dic1[x][1] - media1) * (dic2[x][1] - media2) for x in comunes])
den1 = sqrt(sum([pow(dic1[x][1] - media1, 2) for x in comunes]))
den2 = sqrt(sum([pow(dic2[x][1] - media2, 2) for x in comunes]))
den = den1 * den2
# Caculo del coeficiente
if den==0:
return 0
return num/den
# Dado un diccionario {key1 : {key2 : valor}} calcula el agrupamiento k-means
# con k clusters (grupo), ejecutando maxit iteraciones, con la funcion de similitud especificada
# Retorna una tupla
# -{key1:numero de clusters} con las asignaciones de clusters (a que clusters pertenece cada elemento)
# -[{key2:valores}] una lista con los k centroides (media de los valores para cada clusters)
def kmeans (diccionario, k, maxit, similitud = coefPearson):
# K puntos aleatorios son elegidos como centroides incialmente
# Cada centroide es {key2 : valor}
centroides = [diccionario[x] for x in sample(diccionario.keys(), k)]
# Se asigna cada key1 a un numero de cluster
previo = None
asignacion = {}
# En cada iteracion se asignan puntos a los centroides y se calculan nuevos centroides
for it in range(maxit):
# Se asignan puntos a los centroides mas cercanos
for key1 in diccionario:
similitudes = map(similitud,repeat(diccionario[key1],k), centroides)
asignacion[key1] = similitudes.index(max(similitudes))
# Si no hay cambios en la asignacion, se termina
if previo == asignacion: break
previo = asignacion
# Se recalculan los centroides (se anotan los valores de cada key a cada centroide)
valores = {x : {} for x in range(k)}
contadores = {x : {} for x in range(k)}
for key1 in diccionario:
grupo = asignacion[key1]
for key2 in diccionario[key1]:
if not valores[grupo].has_key(key2):
valores [grupo][key2] = 0
contadores [grupo][key2] = 0
valores [grupo][key2] += diccionario[key1][key2][1]
contadores[grupo][key2] += 1
# Se calculan las medias (nuevos centroides)
centroides = []
for grupo in valores:
centro = {}
for key2 in valores[grupo]:
centro[key2] = round((valores[grupo][key2] / contadores[grupo][key2]),2)
centroides.append(centro)
if None in centroides: break
return (asignacion, centroides)
# Se obtiene el diccionario de valoraciones (las valoraciones ya han sido filtradas)
diccionario = leeValoracionesFiltradas()
# Se obtienen las asignaciones y los centroides con la correlacion de Pearson
tupla = kmeans (diccionario, k, maxit)
asignaciones = tupla[0]
centroids = tupla[1]
print asignaciones
print centroids
And when I execute this for example for maxit = 2, it throws:
File "kmeans_dictio.py", line 46, in coefPearson
media2 = sum([dic2[x][1] for x in comunes]) / nComunes
TypeError: 'float' object has no attribute '__getitem__'
How can I fix this?
It looks like you have a dictionary (dic2) of floats and a dictionary of dictionaries of floats (dic1) that you are pulling an item out of with this line:
comunes = [x for x in dic1 if x in dic2]
Then you are trying to iterate over this float here:
media2 = sum([dic2[x][1] for x in comunes]) / nComunes
To fix this look at dic1 and dic2 and how they are defined.

Categories