Delete rows that contain no information in the tweet text column in pandas - Python
I'm trying to remove rows containing blank or whitespace-only text in the tweet texts column. I have tried different approaches, such as counting the rows that only contain whitespace, or counting leading and trailing spaces, but I can't get a criterion to eliminate them.
ID  tweet           WhiteSpaceCount  HaveWhiteSpace
0   this is a text  0                False
1                   0                False
2   Hello im fine   0                False
I want to delete all the rows that don't have any information in the tweet column.
Code here:
from datetime import date, timedelta

import numpy as np
import pandas as pd


def extractAndSave(api, name):
    # Build the list of tweets:
    previous_date = date.today() - timedelta(days=1)
    query_date = date.today()
    tweets = api.search(q=name + "-filter:retweets", result_type='recent', timeout=999999, count=200,
                        end_time=previous_date, tweet_mode='extended')
    # We can build a dataframe as follows:
    tweet_list = []
    for tweet in tweets:
        tweet_list.append(tweet.full_text)
    datos = pd.DataFrame(data=tweet_list, columns=['TWEETS'])
    # CREATING THE ID COLUMN
    id_list = []
    for id in tweets:
        id_list.append(id.id)
    id = pd.DataFrame(data=id_list, columns=['ID'])
    # CREATING THE CREATION-DATE COLUMN
    creado_list = []
    for creado in tweets:
        creado_list.append(creado.created_at)
    creado = pd.DataFrame(data=creado_list, columns=['FECHA_CREACION'])
    # CREATING THE USERNAME COLUMN
    user_list = []
    for usuario in tweets:
        user_list.append(usuario.user.screen_name)
    usuario = pd.DataFrame(data=user_list, columns=['USUARIO'])
    # CREATING THE SOURCE COLUMN
    fuente_list = []
    for fuente in tweets:
        fuente_list.append(fuente.source)
    fuente = pd.DataFrame(data=fuente_list, columns=['FUENTE'])
    # CREATING THE LIKES COLUMN
    like_list = []
    for like in tweets:
        like_list.append(like.favorite_count)
    like = pd.DataFrame(data=like_list, columns=['ME_GUSTA'])
    # CREATING THE RETWEETS COLUMN
    rt_list = []
    for rt in tweets:
        rt_list.append(rt.retweet_count)
    retweet = pd.DataFrame(data=rt_list, columns=['RETWEETS'])
    # CREATING THE LANGUAGE COLUMN
    idioma_list = []
    for idioma in tweets:
        idioma_list.append(idioma.lang)
    idioma = pd.DataFrame(data=idioma_list, columns=['IDIOMA'])
    # CREATING THE QUOTED COLUMN
    quote_list = []
    for quote in tweets:
        quote_list.append(quote.is_quote_status)
    quote = pd.DataFrame(data=quote_list, columns=['CITADO'])
    # CREATING THE LOCATION COLUMN
    location_list = []
    for location in tweets:
        location_list.append(location.user.location)
    location = pd.DataFrame(data=location_list, columns=['LOCACION'])
    # CONCATENATING THE DATAFRAMES
    datos = pd.concat([datos, id, creado, usuario, fuente, like, retweet, quote, idioma, location], axis=1)
    # Drop the whole row if the tweets column comes in empty.
    datos['pass/fail'] = np.where(datos['TWEETS'].astype(str).str.fullmatch(r"\s*"), 'FAIL', 'PASS')
    datos['CONTEO_ESPACIOS'] = (datos['TWEETS'].str.startswith(" ") | datos['TWEETS'].str.endswith(" ")).sum()
    # Publication time
    datos['HORA_PUBLICACION'] = datos['FECHA_CREACION'].dt.hour
    datos['DIA_SEMANA'] = datos['FECHA_CREACION'].dt.day_name()
    # Keeping only the previous day's tweets
    datos['FECHA_CREACION'] = pd.to_datetime(datos['FECHA_CREACION']).dt.date
    datos = datos[datos['FECHA_CREACION'] == previous_date]
    print(datos)
    # Returning the dataframe.
    return datos
Instead of removing rows that you don't need, keep only the ones you do need:
df = df[df["tweet"].str.strip().str.len()>0]
>>> df
   ID           tweet  WhiteSpaceCount  HaveWhiteSpace
0   0  this is a text                0           False
2   2   Hello im fine                0           False
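For a self-contained sketch of the same idea (the sample frame below is made up to mirror the table above, and fillna("") is an extra guard in case some tweets are NaN rather than whitespace):

import pandas as pd

df = pd.DataFrame({
    "ID": [0, 1, 2],
    "tweet": ["this is a text", "   ", "Hello im fine"],
})

# Keep rows whose tweet contains at least one non-whitespace character.
df = df[df["tweet"].fillna("").str.strip().str.len() > 0]
print(df)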
Related
How can I delete all my tweets using the Twitter API
I'm trying to delete all the tweets/replies from my account. At first it worked, but it got to a point where it stopped deleting and can't retrieve the tweets anymore, giving the error:

  File "main.py", line 29, in fetch_tweets
    oldest = all_tweets[-1].id - 1
IndexError: list index out of range

And on my account, even if they don't appear on the profile (I don't know why), there are still 19.2k tweets to be deleted. Does anyone have any idea how to fix this? Code:

'''
Script to delete all my tweets older than a given date
'''
from keep_alive import keep_alive
import tweepy
from config import *
import datetime
import pandas as pd

client = tweepy.Client(bearer_token, api_key, api_secret, access_token, access_token_secret)
auth = tweepy.OAuth1UserHandler(api_key, api_secret, access_token, access_token_secret)
api = tweepy.API(auth)

def fetch_tweets(username):
    '''
    Downloads all the tweets of the user given in 'username'
    '''
    print("Fetching tweets. . .")
    all_tweets = []
    new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, exclude_replies=False)
    all_tweets.extend(new_tweets)
    # Save the id of the oldest tweet minus one
    oldest = all_tweets[-1].id - 1
    while len(new_tweets) > 0:
        # Keep fetching tweets until the request returns nothing
        # All later requests use max_id "to move back in time"
        new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, max_id=oldest)
        all_tweets.extend(new_tweets)
        # Update the id
        oldest = all_tweets[-1].id - 1
    # Transform the tweepy tweets into a 2D array that will populate the csv
    output = [
        [
            tweet.id,
            tweet.created_at,
            tweet.created_at.strftime("%d-%m-%Y"),
            tweet.retweet_count,
            tweet.favorite_count,
            username
        ]
        for tweet in all_tweets
    ]
    for sublist in output:
        sublist.append(username)
    return output

def validate_date(date_text):
    '''
    Checks that the date entered by the user is in the YYYY-MM-DD format.
    If it is not, raises an exception with an error message.
    '''
    try:
        datetime.datetime.strptime(date_text, '%Y-%m-%d')
    except ValueError:
        raise ValueError("The date is not in the YYYY-MM-DD format. Run the program again.")

def filter_tweets(start, tweets):
    '''
    Uses the dataframe with all the tweets and the cutoff date, after which
    the tweets should be kept, to build a list with the ids of the posts
    that should be removed.
    '''
    print("Filtering tweets. . .")
    now = datetime.datetime.now()
    start_date = pd.to_datetime(start, format="%Y-%m-%d")
    # Filter the range of tweets I want to keep
    keep_dates = pd.date_range(start=start_date, end=now)
    keep_dates = [str(date)[:10] for date in keep_dates]
    # Build a list of ids whose tweet should be removed
    tweets_to_delete = [
        tweet[0] for tweet in tweets
        if str(pd.to_datetime(tweet[1]))[:10] not in keep_dates
    ]
    return tweets_to_delete

def delete_tweets(tweet_ids):
    '''
    Deletes the tweets whose identifiers are in the tweet_ids list
    '''
    print("Deleting tweets. . .")
    # Start deleting:
    delete_count = 0
    for tweet_id in tweet_ids:
        try:
            api.destroy_status(tweet_id)
            print(tweet_id, 'deleted!', delete_count)
            delete_count += 1
        except:
            print(tweet_id, 'could not be deleted!')
    print('Done!', delete_count, 'tweets were deleted in total.')

##########################
###  Main execution   ###
##########################

def main():
    print("Starting. . .")
    username = "xxxxxxxxxx"
    start = "2022-10-25"
    while True:
        try:
            tweets = fetch_tweets(username)
            tweets = filter_tweets(start, tweets)
            delete_tweets(tweets)
        except tweepy.TweepyException as e:
            try:
                print(e)
            except:
                print("error")

keep_alive()
main()

I already tried to change the parameters and put a conditional to check if the list is empty, but none of that worked.
The API can only go back so far in retrieving older Tweets. Another option would be to request your Twitter Archive, which would contain the Tweet IDs and content - you can then potentially use the API to delete Tweets by ID. Note that there are account and API rate limits that will restrict the speed at which you can run this operation.
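If you go the archive route, the deletion loop itself might look like the hedged sketch below. It assumes you have already parsed the Tweet IDs out of the archive's tweets.js into a plain list, and it reuses the authenticated tweepy API object from the question; destroy_status is the standard-API delete call:

import time
import tweepy

def delete_by_ids(api, tweet_ids, pause=1.0):
    # One delete call per ID, sleeping between calls as a crude
    # cushion against the rate limits mentioned above.
    deleted = 0
    for tweet_id in tweet_ids:
        try:
            api.destroy_status(tweet_id)
            deleted += 1
        except tweepy.TweepyException as e:
            print(tweet_id, "skipped:", e)
        time.sleep(pause)
    print(deleted, "tweets deleted")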
Problem adding information to Table Widget
I'm trying to add the information from my SQLite table to the Table Widget. (Screenshots of the SQL table, the Table Widget, and the result are omitted here.) The code of the method:

if self.ui.btn_jugadores.clicked.connect(lambda: self.ui.pila_de_paginas.setCurrentWidget(self.ui.pg_jugadores)):
    self.mostrar_jugadores()

def mostrar_jugadores(self):
    try:
        cursor = conectarse_bbdd_jugadores()
        consulta = '''SELECT * FROM jugadores'''
        ejecucion = cursor.execute(consulta).fetchall()
        if len(ejecucion) > 0:
            fila = 0
            for e in ejecucion:
                columna = 0
                for apartado in e:
                    celda = QTableWidgetItem(apartado)
                    self.ui.tabla_jgds.setItem(fila, columna, celda)
                    columna += 1
                    print(apartado)
                fila += 1
        else:
            QMessageBox.setText('There are no records in the players table')
            QMessageBox.setIcon(QMessageBox.warning)
            QMessageBox.exec_()
    except Error as error:
        QMessageBox.setText('The statement could not be executed')
        QMessageBox.setIcon(QMessageBox.warning)
        QMessageBox.exec_()

When I executed it, I don't know why, but the numbers are not appearing in the table widget.
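One thing worth checking, as a hedged aside: QTableWidgetItem only displays text, and passing it an integer selects a different constructor overload that creates an empty item, so numeric SQLite values need an explicit str() conversion. A minimal standalone sketch under that assumption (PyQt5 is assumed, and the player rows are hypothetical):

import sys
from PyQt5.QtWidgets import QApplication, QTableWidget, QTableWidgetItem

app = QApplication(sys.argv)
rows = [(1, 'Jugador A', 30), (2, 'Jugador B', 25)]  # hypothetical SELECT * FROM jugadores result
tabla = QTableWidget(len(rows), 3)
for fila, registro in enumerate(rows):
    for columna, apartado in enumerate(registro):
        # str() so the integer columns render instead of producing empty cells.
        tabla.setItem(fila, columna, QTableWidgetItem(str(apartado)))
tabla.show()
sys.exit(app.exec_())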
Dealing with special characters in a pandas DataFrame's column name
I am importing an Excel worksheet that has the following column name:

N° Pedido
1234
6424
4563

The column name has a special character (°). Because of that, I can't merge this with another DataFrame or rename the column. I don't get any error message; the name just stays the same. What should I do? This is the code I am using and the result of the DataFrames:

import pandas as pd
import numpy as np

# Importing the spreadsheets
CRM = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de Vendas\relatorio_vendas_CRM.xlsx', encoding='utf-8')
protheus = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de Vendas\relatorio_vendas_protheus.xlsx', encoding='utf-8')

# Turning them into DataFrames
df_crm = CRM.parse('190_pedido_export (33)')
df_protheus = protheus.parse('Relatorio de Pedido de Venda')

# Casting the Protheus fields to float
def turn_to_float(x):
    return np.float(x)

df_protheus["TES"] = df_protheus["TES"].apply(turn_to_float)
df_protheus["Qtde"] = df_protheus["Qtde"].apply(turn_to_float)
df_protheus["Valor"] = df_protheus["Valor"].apply(turn_to_float)

# Removing the non-sale TES from Protheus
# removing values with the wrong code
df_protheus_1 = df_protheus[df_protheus.TES != 513.0]
df_protheus_2 = df_protheus_1[df_protheus_1.TES != 576.0]

# The rename attempt that does nothing:
df_crm.columns = df_crm.columns.str.replace('N° Pedido', 'teste')
df_crm.columns

Orçamento Origem  N° Pedido  Nº Pedido ERP  Estabelecimento  Tipo de Pedido  Classificação(Tipo)  Aplicação  Conta  CNPJ/CPF  Contato  ...  Aprovação Parcial  Antecipa Entrega  Desconto da Tabela de Preço  Desconto do Cliente  Desconto Informado  Observações  Observações NF  Vl Total Bruto  Vl Total Completo
0  20619.0  23125  NaN  Optitex  1 - Venda  NaN  Industrialização/Revenda  XAVIER E ARAUJO LTDA ME  7970626000170  NaN  ...  N  N  0  0  0

Note that I tried other code for that rename, with the same result:

# Renaming the column to be able to merge
#df_crm['proc'] = df_crm['N\xc2\xb0 Pedido']
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df_crm.drop('N° Pedido', inplace=True, axis=1)
#df_crm
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df.drop('N° Pedido', inplace=True, axis=1)
#df_crm
#df_crm_1 = df_crm.rename(columns={"N°Pedido": "teste"})
#df_crm_1
Thanks for posting the link to the Google Sheet. I downloaded it and loaded it via pandas:

df = pd.read_excel(r'~\relatorio_vendas_CRM.xlsx', encoding='utf-8')
df.columns = df.columns.str.replace('°', '')
df.columns = df.columns.str.replace('º', '')

Note that the two replace statements are replacing different characters, although they look very similar. Help from: Why do I get a SyntaxError for a Unicode escape in my file path?
I was able to copy the values into another column. You could try that:

df['N Pedido'] = df['N° Pedido']
df.drop('N° Pedido', inplace=True, axis=1)
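If several column names carry stray non-ASCII characters, they can also be cleaned in one pass; a sketch (the frame here is a made-up stand-in for the imported sheet):

import pandas as pd

df = pd.DataFrame(columns=['N° Pedido', 'Nº Pedido ERP'])

# Strip every non-ASCII character from the column names, then tidy the whitespace.
df.columns = (df.columns
                .str.replace(r'[^\x00-\x7F]+', '', regex=True)
                .str.replace(r'\s+', ' ', regex=True)
                .str.strip())
print(df.columns.tolist())  # ['N Pedido', 'N Pedido ERP']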
Pandas + Python: More efficient code
This is my code:

import pandas as pd
import os
import glob as g

archivos = g.glob('C:\Users\Desktop\*.csv')
for archiv in archivos:
    nombre = os.path.splitext(archiv)[0]
    df = pd.read_csv(archiv, sep=",")
    d = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
    df['FECHA_LECTURA'] = d.dt.date
    del df['DATA_LEITURA']
    df['CONSUMO'] = ""
    df['DIAS'] = ""
    df["SUMDIAS"] = ""
    df["SUMCONS"] = ""
    df["CONSANUAL"] = ""
    ordenado = df.sort_values(['NR_CPE', 'FECHA_LECTURA', 'HORA_LEITURA'], ascending=True)
    # Group by CPE
    agrupado = ordenado.groupby('NR_CPE')
    for name, group in agrupado:  # walk each group
        indice = group.index.values
        inicio = indice[0]
        fin = indice[-1]
        # Fill the first reading of each CPE with zero (there is no previous reading)
        ordenado.CONSUMO.loc[inicio] = 0
        ordenado.DIAS.loc[inicio] = 0
        cont = 0
        for i in indice:  # walk the readings inside each CPE group
            if i > inicio and i <= fin:
                cont = cont + 1
                consumo = ordenado.VALOR_LEITURA[indice[cont]] - ordenado.VALOR_LEITURA[indice[cont - 1]]
                dias = (ordenado.FECHA_LECTURA[indice[cont]] - ordenado.FECHA_LECTURA[indice[cont - 1]]).days
                ordenado.CONSUMO.loc[i] = consumo
                ordenado.DIAS.loc[i] = dias
    # Do the sums; the result is a DataFrame object
    dias = agrupado['DIAS'].sum()
    consu = agrupado['CONSUMO'].sum()
    canu = (consu / dias) * 365
    # Counters with the number of occurrences of groups A, B and C
    conta = 0
    contb = 0
    contc = 0
    # Since it is a DataFrame, iterate over it to make the comparison
    print "Groups:"
    for ind, sumdias in dias.iteritems():
        if sumdias <= 180:
            grupo = "A"
            conta = conta + 1
        elif sumdias > 180 and sumdias <= 365:
            grupo = "B"
            contb = contb + 1
        elif sumdias > 365:
            grupo = "C"
            contc = contc + 1
    print "group A: ", conta
    print "group B: ", contb
    print "group C: ", contc
    # Format the fields so as not to show all the decimals
    Fdias = dias.map('{:.0f}'.format)
    Fcanu = canu.map('{:.2f}'.format)
    frames = [Fdias, consu, Fcanu]
    concat = pd.concat(frames, axis=1).replace(['inf', 'nan'], [0, 0])
    with open('C:\Users\Documents\RPE_PORTUGAL\Datos.csv', 'a') as f:
        concat.to_csv(f, header=False, columns=['CPE', 'DIAS', 'CONSUMO', 'CONSUMO_ANUAL'])
    try:
        ordenado.to_excel(nombre + '.xls',
                          columns=["NOME_DISTRITO", "NR_CPE", "MARCA_EQUIPAMENTO", "NR_EQUIPAMENTO",
                                   "VALOR_LEITURA", "REGISTADOR", "TIPO_REGISTADOR", "TIPO_DADOS_RECOLHIDOS",
                                   "FACTOR_MULTIPLICATIVO_FINAL", "NR_DIGITOS_INTEIRO", "UNIDADE_MEDIDA",
                                   "TIPO_LEITURA", "MOTIVO_LEITURA", "ESTADO_LEITURA", "HORA_LEITURA",
                                   "FECHA_LECTURA", "CONSUMO", "DIAS"],
                          index=False)
        print (archiv)
        print ("===============================================")
        print ("*****The file was created successfully*****")
        print ("===============================================")
    except IOError:
        print ("===================================================")
        print ("!!!!!There was an error writing the file!!!!!")
        print ("===================================================")

This takes a file with energy-consumption readings from different dates for every light meter ('NR_CPE') and does some calculations:

Calculate the energy consumption for every 'NR_CPE' by subtracting the previous reading from the next one, and put the result in a new column named 'CONSUMO'.

Calculate the number of days on which there is a reading, and sum up the number of days.

Add up the consumption for every 'NR_CPE' and calculate the annual consumption.

Finally, I want to classify every light meter ('NR_CPE') by the number of days it has readings: A if it has fewer than 180 days, B between 180 days and one year, and C more than a year. And finally write this result to two different files.

Any idea how I should re-code this to get the same output but faster? Thank you all. BTW this is my dataset:

,NOME_DISTRITO,NR_CPE,MARCA_EQUIPAMENTO,NR_EQUIPAMENTO,VALOR_LEITURA,REGISTADOR,TIPO_REGISTADOR,TIPO_DADOS_RECOLHIDOS,FACTOR_MULTIPLICATIVO_FINAL,NR_DIGITOS_INTEIRO,UNIDADE_MEDIDA,TIPO_LEITURA,MOTIVO_LEITURA,ESTADO_LEITURA,DATA_LEITURA,HORA_LEITURA
0,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,1,1,A,20150629,205600
1,GUARDA,A002000642VW,101,1865411,4834,001,S,1,1,4,kWh,2,2,A,20160218,123300
2,GUARDA,A002000642VJ,122,204534,25083,001,S,1,1,5,kWh,1,1,A,20150629,205700
3,GUARDA,A002000642VJ,122,204534,27536,001,S,1,1,5,kWh,2,2,A,20160218,123200
4,GUARDA,A002000642HR,101,1383899,11734,001,S,1,1,5,kWh,1,1,A,20150629,205600
5,GUARDA,A002000642HR,101,1383899,11800,001,S,1,1,5,kWh,2,2,A,20160218,123000
6,GUARDA,A002000995VM,101,97706436,12158,001,S,1,1,5,kWh,1,3,A,20150713,155300
7,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160129,162300
8,GUARDA,A002000995VM,101,97706436,12163,001,S,1,1,5,kWh,2,2,A,20160202,195800
9,GUARDA,A2000995VM,101,97706436,12163,001,S,1,1,5,kWh,1,3,A,20160404,145200
10,GUARDA,A002000996LV,168,5011703276,3567,001,V,1,1,6,kWh,1,1,A,20150528,205900
11,GUARDA,A02000996LV,168,5011703276,3697,001,V,1,1,6,kWh,2,2,A,20150929,163500
12,GUARDA,A02000996LV,168,5011703276,1287,002,P,1,1,6,kWh,1,1,A,20150528,205900
Generally you want to avoid for loops in pandas. For example, the first loop, where you calculate total consumption and days, could be rewritten as a groupby apply, something like:

def last_minus_first(df):
    columns_of_interest = df[['VALOR_LEITURA', 'days']]
    diff = columns_of_interest.iloc[-1] - columns_of_interest.iloc[0]
    return diff

df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.datetime(1970, 1, 1)).dt.days  # create days column
df.groupby('NR_CPE').apply(last_minus_first)

(By the way, I don't understand why you are subtracting each entry from the previous one; surely for meter readings this is the same as last minus first?)

Then, given the result of the above as consumption, you can replace your second for loop (for ind, sumdias in dias.iteritems()) with something like:

pd.cut(consumption.days, [-1, 180, 365, np.inf], labels=['a', 'b', 'c']).value_counts()
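As a quick, self-contained illustration of those two building blocks (the frame below is made up: two meters with two readings each, and pd.Timestamp stands in for the now-deprecated pd.datetime):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'NR_CPE': ['A002000642VW', 'A002000642VW', 'A002000642HR', 'A002000642HR'],
    'VALOR_LEITURA': [4834, 4900, 11734, 11800],
    'DATA_LEITURA': [20150629, 20160218, 20150629, 20150715],
})
df['date'] = pd.to_datetime(df['DATA_LEITURA'], format="%Y%m%d")
df['days'] = (df['date'] - pd.Timestamp(1970, 1, 1)).dt.days

def last_minus_first(g):
    cols = g[['VALOR_LEITURA', 'days']]
    return cols.iloc[-1] - cols.iloc[0]

# One row per meter: total consumption and the span of days covered.
consumption = df.sort_values(['NR_CPE', 'date']).groupby('NR_CPE').apply(last_minus_first)

# Bucket the meters by reading span: A up to 180 days, B up to a year, C beyond.
print(pd.cut(consumption['days'], [-1, 180, 365, np.inf],
             labels=['A', 'B', 'C']).value_counts())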
Error when I try to iterate more than once
I've got this program which computes k-means for an AI course:

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from random import sample
from itertools import repeat
from math import sqrt

# Parameters
k = 6
maxit = 2

def leeValoracionesFiltradas(nomFichero="valoracionesFiltradas.data"):
    lineas = [(l.strip()).split("\t") for l in (open(nomFichero).readlines())]
    diccio = {}
    for l in lineas:
        diccio[int(l[0])] = {}
    for l in lineas:
        diccio[int(l[0])][int(l[1])] = (float(l[2]), float(l[3]))
    return diccio

def distEuclidea(dic1, dic2):
    # Sum of the squares of the elements common to both dictionaries
    sum2 = sum([pow(dic1[elem] - dic2[elem], 2) for elem in dic1 if elem in dic2])
    return sqrt(sum2)

def similitudEuclidea(dic1, dic2):
    return 1 / (1 + distEuclidea(dic1, dic2))

def coefPearson(dic1, dic2):
    # Get the elements common to both dictionaries
    comunes = [x for x in dic1 if x in dic2]
    nComunes = float(len(comunes))
    # If there are no common elements -> zero
    if nComunes == 0:
        return 0
    # Compute the mean of each dictionary
    media1 = sum([dic1[x][1] for x in comunes]) / nComunes
    media2 = sum([dic2[x][1] for x in comunes]) / nComunes
    # Numerator and denominator
    num = sum([(dic1[x][1] - media1) * (dic2[x][1] - media2) for x in comunes])
    den1 = sqrt(sum([pow(dic1[x][1] - media1, 2) for x in comunes]))
    den2 = sqrt(sum([pow(dic2[x][1] - media2, 2) for x in comunes]))
    den = den1 * den2
    # Compute the coefficient
    if den == 0:
        return 0
    return num / den

# Given a dictionary {key1 : {key2 : value}}, computes the k-means clustering
# with k clusters, running maxit iterations, with the given similarity function.
# Returns a tuple:
# - {key1 : cluster number} with the cluster assignments (which cluster each element belongs to)
# - [{key2 : values}] a list with the k centroids (mean of the values for each cluster)
def kmeans(diccionario, k, maxit, similitud=coefPearson):
    # K random points are initially chosen as centroids
    # Each centroid is {key2 : value}
    centroides = [diccionario[x] for x in sample(diccionario.keys(), k)]
    # Each key1 is assigned a cluster number
    previo = None
    asignacion = {}
    # On each iteration, points are assigned to centroids and new centroids are computed
    for it in range(maxit):
        # Assign points to the closest centroids
        for key1 in diccionario:
            similitudes = map(similitud, repeat(diccionario[key1], k), centroides)
            asignacion[key1] = similitudes.index(max(similitudes))
        # If there are no changes in the assignment, stop
        if previo == asignacion:
            break
        previo = asignacion
        # Recompute the centroids (accumulate the values of each key for each centroid)
        valores = {x: {} for x in range(k)}
        contadores = {x: {} for x in range(k)}
        for key1 in diccionario:
            grupo = asignacion[key1]
            for key2 in diccionario[key1]:
                if not valores[grupo].has_key(key2):
                    valores[grupo][key2] = 0
                    contadores[grupo][key2] = 0
                valores[grupo][key2] += diccionario[key1][key2][1]
                contadores[grupo][key2] += 1
        # Compute the means (new centroids)
        centroides = []
        for grupo in valores:
            centro = {}
            for key2 in valores[grupo]:
                centro[key2] = round((valores[grupo][key2] / contadores[grupo][key2]), 2)
            centroides.append(centro)
        if None in centroides:
            break
    return (asignacion, centroides)

# Load the ratings dictionary (the ratings have already been filtered)
diccionario = leeValoracionesFiltradas()

# Get the assignments and the centroids with the Pearson correlation
tupla = kmeans(diccionario, k, maxit)
asignaciones = tupla[0]
centroids = tupla[1]
print asignaciones
print centroids

And when I execute this, for example with maxit = 2, it throws:

  File "kmeans_dictio.py", line 46, in coefPearson
    media2 = sum([dic2[x][1] for x in comunes]) / nComunes
TypeError: 'float' object has no attribute '__getitem__'

How can I fix this?
It looks like you have a dictionary of floats (dic2) and a dictionary of dictionaries of floats (dic1) that you are pulling the common keys out of with this line:

comunes = [x for x in dic1 if x in dic2]

Then you are trying to index into one of those floats here:

media2 = sum([dic2[x][1] for x in comunes]) / nComunes

To fix this, look at dic1 and dic2 and how they are defined.
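A minimal sketch that reproduces the mismatch (the layout is inferred from leeValoracionesFiltradas, which stores (value, rating) tuples, versus the recomputed centroids, which store plain floats; the literals here are hypothetical):

# Ratings as loaded from the file: {key2: (value, rating)} tuples.
dic1 = {10: (1.0, 4.5), 20: (2.0, 3.0)}
# A recomputed centroid after the first k-means iteration: {key2: float}.
dic2 = {10: 3.75, 20: 3.0}

comunes = [x for x in dic1 if x in dic2]
media2 = sum([dic2[x][1] for x in comunes]) / float(len(comunes))
# Fails: dic2[x] is a plain float, so dic2[x][1] raises the same
# TypeError shown in the question's traceback.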