Reading XML file in pandas - python

I am having a XML file, I want to load it in pandas, I have tried other XML files, but for this the shape is unclear and every time this throws the error and I am not coming up with the pandas dataframe I need. Any suggestions
-
-<tweet id="591154373696323584">
Diga cuanto nos van a costar las
<sentiment polarity="N" entity="Partido_Popular" aspect="Economia">autovías</sentiment>
de sus amiguetes ¿4500 millones o más ? #EsperanzAguirre #PPopular
</tweet>
-<tweet id="591154532362670080">
#lhermoso_ #sanchezcastejon
<sentiment polarity="N" entity="Partido_Socialista_Obrero_Espanol" aspect="Propio_partido">#DobleMoral</sentiment>
Castilla antes que Aragón...
</tweet>
I am using the code below.
import xml.etree.cElementTree as et
import pandas as pd
def getvalueofnode(node):
""" return node text or None """
return node.text if node is not None else None
def main():
parsed_xml = et.parse("stompol-train-tagged.xml")
dfcols = ['tweet id', 'tweet', 'sentiment_polarity', 'entity', 'aspect', 'sentiment']
df_xml = pd.DataFrame(columns=dfcols)
for node in parsed_xml.getroot():
tweetid = node.attrib.get('tweetid')
tweet = node.find('tweet')
sentiment_polarity = node.find('polarity')
entity = node.find('entity')
aspect = node.find('aspect')
sentiment = node.find('sentiment')
df_xml = df_xml.append(
pd.Series([tweetid, tweet, sentiment_polarity, entity, aspect, sentiment], index=dfcols),
ignore_index=True)
print(df_xml)
main()
I get None and all.

Related

How can i delete all my tweets using twitter api

I'm trying to delete all the tweets/replies from my account, at first it worked, but it got to a point where he stopped deleting, and he can't receive the tweets anymore, giving the error:
File "main.py", line 29, in fetch_tweets
oldest = all_tweets[-1].id - 1
IndexError: list index out of range
and on my account, even if it doesn't appear on the profile (I don't know why) there are still 19.2k tweets to be deleted, does anyone have any idea how to fix this?
code:
'''
Script para apagar todos os meus tweets mais antigos que determinada data
'''
from keep_alive import keep_alive
import tweepy
from config import *
import datetime
import pandas as pd
client = tweepy.Client(bearer_token, api_key, api_secret, access_token, access_token_secret)
auth = tweepy.OAuth1UserHandler(api_key, api_secret, access_token, access_token_secret)
api = tweepy.API(auth)
def fetch_tweets(username):
'''
Baixa todos os tweets do usuário
determinado em 'username'
'''
print("Resgatando Tweets. . .")
all_tweets = []
new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, exclude_replies=False)
all_tweets.extend(new_tweets)
# Salva o id do tweet antigo menos um
oldest = all_tweets[-1].id - 1
while len(new_tweets) > 0: # Continua pegando tweets até a requisição retornar nada
# Todos as requests posteriores usam max_id "para avançar no tempo"
new_tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=200, max_id=oldest)
all_tweets.extend(new_tweets)
# Atualiza o id
oldest = all_tweets[-1].id - 1
# Transform the tweepy tweets into a 2D array that will populate the csv
output = [
[ tweet.id,
tweet.created_at,
tweet.created_at.strftime("%d-%m-%Y"),
tweet.retweet_count,
tweet.favorite_count,
username ] for tweet in all_tweets
]
for sublist in output:
sublist.append(username)
return output
def validate_date(date_text):
'''
Verifica se a data entrada pelo usuário
está no foramto YYYY-MM-DD. Se não estiver,
levanta uma exeção com mensagem de erro.
'''
try:
datetime.datetime.strptime(date_text, '%Y-%m-%d')
except ValueError:
raise ValueError("A data não está no formato YYYY-MM-DD. Execute o programa novamente.")
def filter_tweets(start, tweets):
'''
Usa o dataframe com todos os tweets
e a data de corte, depois da qual os
tweets devem ser mantidos, para gerar
uma lista com os ids das publicações
devem ser removidas.
'''
print("Filtrando Tweets. . .")
now = datetime.datetime.now()
start_date = pd.to_datetime(start, format = "%Y-%m-%d")
# Filtra intervalo de tweets que quero manter
keep_dates = pd.date_range(start=start_date, end=now)
keep_dates = [str(date)[:10] for date in keep_dates]
# Cria uma lista de ids cujo tweet deve ser mantido
tweets_to_delete = [ tweet[0] for tweet in tweets if str(pd.to_datetime(tweet[1]))[:10] not in keep_dates ]
return tweets_to_delete
def delete_tweets(tweet_ids):
'''
Deleta os tweets cujos números
identificadores estão na lista
tweet_ids
'''
print("Deletando Tweets. . .")
# Começa a deletar:
delete_count = 0
for tweet_id in tweet_ids:
try:
api.destroy_status(tweet_id)
print(tweet_id, 'deletado!', delete_count)
delete_count += 1
except:
print(tweet_id, 'não pode ser deletado!')
print('Pronto!', delete_count, 'tweets foram deletados, ao todo.')
##########################
### Execução principal ###
##########################
def main():
print("Iniciando. . .")
username = "xxxxxxxxxx"
start = "2022-10-25"
while True:
try:
tweets = fetch_tweets(username)
tweets = filter_tweets(start, tweets)
delete_tweets(tweets)
except tweepy.TweepyException as e
try:
print(e)
except:
print("error")
keep_alive()
main()
I already tried to change the parameters, put a conditional to check if the list is empty, but none of that worked
The API can only go back so far in retrieving older Tweets.
Another option would be to request your Twitter Archive, which would contain the Tweet IDs and content - you can then potentially use the API to delete Tweets by ID.
Note that there are account and API rate limits that will restrict the speed at which you can run this operation.

Concat fields using apache beam

I have following function. So I would like to take 2 fields and concatenate them but when my pipeline finishes it doesn´t work
the pipeline finishes as correct but when I see in bigquery the fields have not been concatenated
It will be great if u can help me.
it´s the code used in the function:
import apache_beam as beam
from ..tools import ProcessLogger
_logger = ProcessLogger()
class ConcatFieldsFn(beam.DoFn):
"""Concatena los valores de los campos especificados en una pCollection por un valor especificado"""
def __init__(self, process: str, data_name: str, parameters: dict):
# Configuracion del logger
self.logger_data_name = data_name
self.logger_process = process
self.logger_subprocess = "Concatenar valores"
_logger.data_name = self.logger_data_name
_logger.process = self.logger_process
_logger.subprocess = self.logger_subprocess
# Parametros del proceso
self._fields = [field.get("name") for field in parameters.get("fields", None)]
_logger.info(
f"Se aplica regla: {_logger.subprocess} con los parametros: {parameters}"
)
def process(self, element):
# Configuracion del logger
_logger.data_name = self.logger_data_name
_logger.process = self.logger_process
_logger.subprocess = self.logger_subprocess
for field in self._fields:
if element[field] != None:
try:
element[field] = "|".join(element[field])
except Exception as ex:
_logger.error(
f"No se pueden eliminar las letras de los campos seleccionados: {ex}"
)
return [element]

Delete rows that contains no information on Tweet text on pandas

I´m trying to remove rows containing blank texts or in tweet texts column. But I have tried in different ways counting the rows that only contain whitespace or counting the leading spaces and trailing spaces but to get a criterion to eliminate it.
ID tweet WhiteSpaceCount HaveWhiteSpace
0 this is a text 0 False
1 0 False
2 Hello im fine 0 False
I want to delete all the rows that don´t have any information on the tweet column.
Code here:
def extractAndSave(api, name):
# Creamos una lista de tweets:
previous_date = date.today() - timedelta(days=1)
query_date = date.today()
name = name
tweets = API_EXTRACTOR.search(q=name + "-filter:retweets", result_type='recent', timeout=999999, count=200,
end_time=previous_date, tweet_mode='extended')
# Podemos crear un dataframe como sigue:
tweet_list = []
for tweet in tweets:
tweet_list.append(tweet.full_text)
datos = pd.DataFrame(data=tweet_list, columns=['TWEETS'])
# CREANDO COLUMNA DE ID
id_list = []
for id in tweets:
id_list.append(id.id)
id = pd.DataFrame(data=id_list, columns=['ID'])
# CREANDO COLUMNA DE ID
creado_list = []
for creado in tweets:
creado_list.append(creado.created_at)
creado = pd.DataFrame(data=creado_list, columns=['FECHA_CREACION'])
# CREANDO COLUMNA DE nombre de usuario
user_list = []
for usuario in tweets:
user_list.append(usuario.user.screen_name)
usuario = pd.DataFrame(data=user_list, columns=['USUARIO'])
# CREANDO COLUMNA DE FUENTE
fuente_list = []
for fuente in tweets:
fuente_list.append(fuente.source)
fuente = pd.DataFrame(data=fuente_list, columns=['FUENTE'])
# CREANDO COLUMNA DE ME GUSTA
like_list = []
for like in tweets:
like_list.append(like.favorite_count)
like = pd.DataFrame(data=like_list, columns=['ME_GUSTA'])
# CREANDO COLUMNA DE RT
rt_list = []
for rt in tweets:
rt_list.append(rt.retweet_count)
retweet = pd.DataFrame(data=rt_list, columns=['ME_GUSTA'])
# CREANDO COLUMNA DE IDIOMA
idioma_list = []
for idioma in tweets:
idioma_list.append(idioma.lang)
idioma = pd.DataFrame(data=idioma_list, columns=['IDIOMA'])
# CREANDO COLUMNA DE IDIOMA
quote_list = []
for quote in tweets:
quote_list.append(quote.is_quote_status)
quote = pd.DataFrame(data=quote_list, columns=['CITADO'])
# CREANDO COLUMNA DE IDIOMA
location_list = []
for location in tweets:
location_list.append(location.user.location)
location = pd.DataFrame(data=location_list, columns=['LOCACION'])
# CONCATENANDO DATAFRAMES
datos = pd.concat([datos, id, creado, usuario, fuente, like, retweet, quote, idioma, location], axis=1)
# Dropear toda la fila si la columna tweets viene vacia.
datos['pass/fail'] = np.where(datos['TWEETS'].astype(str).str.fullmatch(r"\s*"),'FAIL','PASS')
datos['CONTEO_ESPACIOS']= (datos['TWEETS'].str.startswith(" ") | datos['TWEETS'].str.endswith(" ")).sum()
# Hora de publicación
datos['HORA_PUBLICACION'] = datos['FECHA_CREACION'].dt.hour
datos['DIA_SEMANA'] = datos['FECHA_CREACION'].dt.day_name()
# Extrayendo solo los tweets del día anterior
datos['FECHA_CREACION'] = pd.to_datetime(datos['FECHA_CREACION']).dt.date
datos = datos[datos['FECHA_CREACION'] == previous_date]
print(datos)
# Guardando en dataframe.
return datos
Instead of removing rows that you don't need, keep only the ones you do need:
df = df[df["tweet"].str.strip().str.len()>0]
>>> df
ID tweet WhiteSpaceCount HaveWhiteSpace
0 0 this is a text 0 False
2 2 Hello im fine 0 False

How can I modify an "if condition" in order to apply it to different list at the same time?

I wrote a script to extract sentences in huge set which contains particular pattern. The problem lied in the fact that , for some patterns I checked the value of the attribute at the beginning or ending of the pattern to see if the word is present in a particular list. I have 4 dictionaries with 2 lists of positive and negative word. So far I wrote the script and I am able to use the function I wrote with one dictionary. I am thinking how can I improve the my function so that I can use it at the same time of the 4 dictionaries without duplicating the bloc which loop in the dictionary.
I give an example with two dictionaries (since the script is quite long I make a small example with all the necessary element
import spacy.attrs
from spacy.attrs import POS
import spacy
from spacy import displacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.lemmatizer import Lemmatizer
nlp = spacy.load("fr_core_news_md")
from spacy.matcher import Matcher#LIST
##################### List of lexicon
# Lexique Diko
lexicon = open(os.path.join('/h/Ressources/Diko.txt'), 'r', encoding='utf-8')
data = pd.read_csv(lexicon, sep=";", header=None)
data.columns = ["id", "terme", "pol"]
pol_diko_pos = data.loc[data.pol =='positive', 'terme']
liste_pos_D = list(pol_diko_pos)
print(liste_pos[1])
pol_diko_neg = data.loc[data.pol =='negative', 'terme']
liste_neg_D = list(pol_diko_neg)
#print(type(liste_neg))
# Lexique Polarimots
lexicon_p = open(os.path.join('/h/Ressources/polarimots.txt'), 'r', encoding='utf-8')
data_p = pd.read_csv(lexicon_p, sep="\t", header=None)
#data.columns = ["terme", "pol", "pos", "degre"]
data_p.columns = ["ind", "terme", "cat", "pol", "fiabilité"]
pol_polarimot_pos = data_p.loc[data_p.pol =='POS', 'terme']
liste_pos_P = list(pol_polarimot_pos)
print(liste_pos_P[1])
pol_polarimot_neg = data_p.loc[data_p.pol =='NEG', 'terme']
liste_neg_P = list(pol_polarimot_neg)
#print(type(liste_neg))
# ############################# Lists
sentence_not_extract_lexique_1 =[] #List of all sentences without the specified pattern
sentence_extract_lexique_1 = [] #list of sentences which the pattern[0] is present in the first lexicon
sentence_not_extract_lexique_2 =[] #List of all sentences without the specified pattern
sentence_extract_lexique_2 = [] #list of sentences which the pattern[0] is present in the second lexicon
list_token_pos = [] #list of the token found in the lexique
list_token_neg = [] #list of the token found in the lexique
list_token_not_found = [] #list of the token not found in the lexique
#PATTERN
pattern1 = [{"POS": {"IN": ["VERB", "AUX","ADV","NOUN","ADJ"]}}, {"IS_PUNCT": True, "OP": "*"}, {"LOWER": "mais"} ]
pattern1_tup = (pattern1, 1, True)
pattern3 = [{"LOWER": {"IN": ["très","trop"]}},
{"POS": {"IN": ["ADV","ADJ"]}}]
pattern3_tup = (pattern3, 0, True)
pattern4 = [{"POS": "ADV"}, # adverbe de négation
{"POS": "PRON","OP": "*"},
{"POS": {"IN": ["VERB", "AUX"]}},
{"TEXT": {"IN": ["pas", "plus", "aucun", "aucunement", "point", "jamais", "nullement", "rien"]}},]
pattern4_tup = (pattern4, None, False)
#Tuple of pattern
pattern_list_tup =[pattern1_tup, pattern3_tup, pattern4_tup]
pattern_name = ['first', 'second', 'third', 'fourth']
length_of_list = len(pattern_list_tup)
print('length', length_of_list)
#index of the value of attribute to check in the lexicon
value_of_attribute = [0,-1,-1]
# List of lexicon to use
lexique_1 = [lexique_neg, lexique_pos]
lexique_2 = [lexique_2neg, lexique_2pos]
# text (example of some sentences)
file =b= ["Le film est superbe mais cette édition DVD est nulle !",
"J'allais dire déplorable, mais je serais peut-être un peu trop extrême.",
"Hélas, l'impression de violence, bien que très bien rendue, ne sauve pas cette histoire gothique moderne de la sécheresse scénaristique, le tout couvert d'un adultère dont le propos semble être gratuit, classique mais intéressant...",
"Tout ça ne me donne pas envie d'utiliser un pieu mais plutôt d'aller au pieu (suis-je drôle).",
"Oui biensur, il y a la superbe introduction des parapluies au debut, et puis lorsqu il sent des culs tout neufs et qu il s extase, j ai envie de faire la meme chose apres sur celui de ma voisine de palier (ma voisine de palier elle a un gros cul, mais j admets que je voudrais bien lui foute mon tarin), mais c est tout, apres c est un film tres noir, lent et qui te plonge dans le depression.",
"Et bien hélas ce DVD ne m'a pas appris grand chose par rapport à la doc des agences de voyages et la petite dame qui fait ses dessins est bien gentille mais tout tourne un peu trop autour d'elle.",
"Au final on passe de l'un a l'autre sans subtilité, et on n'arrive qu'à une caricature de plus : si Kechiche avait comme but initial déclaré de fustiger les préjugés, c'est le contraire qui ressort de ce ''film'' truffé de clichés très préjudiciables pour les quelques habitants de banlieue qui ne se reconnaîtront pas dans cette lourde farce.",
"-ci écorche les mots, les notes... mais surtout nos oreilles !"]
# Loop to check each sentence and extract the sentences with the specified pattern from above
for pat in range(0, length_of_list):
matcher = Matcher(nlp.vocab)
matcher.add("matching_2", None, pattern_list_tup[pat][0])
# print(pat)
# print(pattern_list_tup[pat][0])
for sent in file:
doc =nlp(sent)
matches= matcher(doc)
for match_id, start, end in matches:
span = doc[start:end].lemma_.split()
#print(f"{pattern_name[pat]} pattern found: {span}")
This is the part I want ot modify to use it for another dictionary, the goal is to able to retrieve sentences extract by 4 different dictionaries to make a comparison and then check which sentences are present in more than two list.
# Condition to use the lexicon and extract the sentence
if (pattern_list_tup[pat][2]):
if (span[value_of_attribute[pat]] in lexique_1[pattern_list_tup[pat][1]]):
if sent not in sentence_extract:
sentence_extract_lexique_1.append(sent)
if (pattern_list_tup[pat][1] == 1):
list_token_pos.append(span[value_of_attribute[pat]])
if (pattern_list_tup[pat][1] == 0):
list_token_neg.append(span[value_of_attribute[pat]])
else:
list_token_not_found.append(span[value_of_attribute[pat]]) # the text form is not present in the lexicon need the lemma form
sentence_not_extract_lexique_1.append(sent)
else:
if sent not in sentence_extract:
sentence_extract_lexique_1.append(sent)
print(len(sentence_extract))
print(sentence_extract)
One solution I find is to duplicate the code abode and change the name of the list where the sentences are stored but since I have 2 dictionaries duplicating will make the code longer is there a way to combine the looping the 2 dictionaries (actually 4 dictionaries in the original) and append the result to the good list. So, for example, when I use lexique_1 , all the sentences extracted are send to "sentence_extract_lexique_1" and so on for the other.
In my opinion attempt using the if-elif-else chain. If not attempt only using the if-elif block simply because the elif statement catches the specific condition of interest. In which you're trying to catch a specific to compare and check with the sentences. Keep in mind if you try the if-elif-else chain its a good method, but it only works when you need one test to pass. Because Python finds one test to pass and it skips the rest. Its very efficient and allows you to test for one specific condition.

Dealing with special characters in pandas Data Frame´s column Name

I am importing an excel worksheet that has the following columns name:
N° Pedido
1234
6424
4563
The column name ha a special character (°). Because of that, I can´t merge this with another Data Frame or rename the column. I don´t get any error message just the name stays the same. What should I do?
This is the code I am using and the result of the Dataframes:
import pandas as pd
import numpy as np
# Importando Planilhas
CRM = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de
Vendas\relatorio_vendas_CRM.xlsx', encoding= 'utf-8')
protheus = pd.ExcelFile(r'C:\Users\Michel\Desktop\Relatorio de
Vendas\relatorio_vendas_protheus.xlsx', encoding= 'utf-8')
#transformando em Data Frame
df_crm = CRM.parse('190_pedido_export (33)')
df_protheus = protheus.parse('Relatorio de Pedido de Venda')]
# Transformando Campos em float o protheus
def turn_to_float(x):
return np.float(x)
df_protheus["TES"] = df_protheus["TES"].apply(turn_to_float)
df_protheus["Qtde"] = df_protheus["Qtde"].apply(turn_to_float)
df_protheus["Valor"] = df_protheus["Valor"].apply(turn_to_float)
#Tirando Tes de não venda do protheus
# tirando valores com código errado 6
df_protheus_1 = df_protheus[df_protheus.TES != 513.0]
df_protheus_2 = df_protheus_1[df_protheus_1.TES != 576.0]
**df_crm.columns = df_crm.columns.str.replace('N° Pedido', 'teste')
df_crm.columns**
Orçamento Origem N° Pedido Nº Pedido ERP Estabelecimento Tipo de
Pedido Classificação(Tipo) Aplicação Conta CNPJ/CPF Contato ...
Aprovação Parcial Antecipa Entrega Desconto da Tabela de Preço
Desconto do Cliente Desconto Informado Observações Observações NF Vl
Total Bruto Vl Total Completo
0 20619.0 23125 NaN Optitex 1 - Venda NaN Industrialização/Revenda
XAVIER E ARAUJO LTDA ME 7970626000170 NaN ... N N 0 0 0
Note that I used other codes for the bold part with the same result:
#renomeando tabela para dar Merge
#df_crm['proc'] = df_crm['N\xc2\xb0 Pedido']
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df_crm.drop('N° Pedido',inplace=True,axis=1)
#df_crm
#df_crm['N Pedido'] = df_crm['N° Pedido']
#df.drop('N° Pedido',inplace=True,axis=1)
#df_crm
#df_crm_1 = df_crm.rename(columns={"N°Pedido": "teste"})
#df_crm_1
Thanks for posting the link to the Google Sheet. I downloaded it and loaded it via pandas:
df = pd.read_excel(r'~\relatorio_vendas_CRM.xlsx', encoding = 'utf-8')
df.columns = df.columns.str.replace('°', '')
df.columns = df.columns.str.replace('º', '')
Note that the two replace statements are replacing different characters, although they look very similar.
Help from: Why do I get a SyntaxError for a Unicode escape in my file path?
I was able to copy the values into another column. You could try that
df['N Pedido'] = df['N° Pedido']
df.drop('N° Pedido',inplace=True,axis=1)

Categories