I have a school project on analyzing logs. Roughly, I need to retrieve the day and time from a line in the format "MM DD HH:MM:SS", but my function always returns it as a list like ['MM', 'DD', 'HH:MM:SS'] instead of a string.
def get_complete_date(line):
    """
    Pre:  line is a well-formed log line (str).
    Post: Returns the date and time as a string, without changing the format.
    """
    # splt = line.split(sep=" ", maxsplit=5)
    # dt = splt[2]
    complete_date = line.split(" ")
    # print(complete_date)
    return complete_date[0:3]
A very naive approach, but if the length of the datetime part is always the same (the same pattern on each line), you can simply slice from 0 to the pattern length:
def get_complete_date(l):
    tpl = "MM DD HH:MM:SS"
    return l[0:len(tpl)]

line = "MM DD HH:MM:SS some text"
print(get_complete_date(line))  # MM DD HH:MM:SS
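If you also want to validate what the slice contains, datetime.strptime can parse it. A sketch, assuming the fields really are numeric MM DD HH:MM:SS (the timestamp below is made up):

from datetime import datetime

stamp = "03 14 15:09:26"
parsed = datetime.strptime(stamp, "%m %d %H:%M:%S")  # year defaults to 1900
print(parsed.day, parsed.time())  # 14 15:09:26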
You're almost there; it's an issue with slicing.
def get_complete_date(line):
    """
    Pre:  line is a well-formed log line (str).
    Post: Returns the date and time as a string, without changing the format.
    """
    # splt = line.split(sep=" ", maxsplit=5)
    # dt = splt[2]
    complete_date = line.split(" ")
    # print(complete_date)
    return complete_date[1:3:1]
line = "MM DD HH:MM:SS"
print(get_complete_date(line))
This gives:
['DD', 'HH:MM:SS']
Explanation: your slicing follows
[1:3:1] = [start:stop:step]
The 0th element is the month, which you don't need, so start from 1.
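If you want the result back as a single string (as the docstring promises) rather than a list, a small variant of the function, assuming the date is always the first three space-separated fields, would be:

def get_complete_date(line):
    # Split at most three times, keep the first three fields, rejoin with spaces
    return " ".join(line.split(" ", 3)[:3])

print(get_complete_date("MM DD HH:MM:SS some text"))  # MM DD HH:MM:SS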
import re

# example 1
input_text = "((PERSON)María Rosa) ((VERB)pasará) unos dias aqui, hay que ((VERB)mover) sus cosas viejas de aqui, ya que sus cosméticos ((VERB)estorban) si ((VERB)estan) tirados por aquí. ((PERSON)Cyntia) es una buena modelo, su cabello es muy bello, hay que ((VERB)lavar) su cabello"
# example 2
input_text = "Sus útiles escolares ((VERB)estan) aqui, me sorprende que ((PERSON)Juan Carlos) los haya olvidado siendo que suele ((VERB)ser) tan cuidadoso con sus útiles."

# I need to replace "sus" or "su", but only under certain conditions
subject_capture_pattern = r"\(\(PERSON\)((?:\w\s*)+)\)"  # underlined in red in the image
associated_info_capture_pattern = r"(?:sus|su)\s+((?:\w\s*)+)(?:\s+(?:del|de )|\s*(?:\(\(VERB\)|[.,;]))"  # underlined in green in the image

identification_pattern =   # this is what I need to figure out
replacement_sequence =     # this is what I need to figure out

input_text = re.sub(identification_pattern, replacement_sequence, input_text, flags=re.IGNORECASE)
This is the correct output:
#for example 1
"((PERSON)María Rosa) ((VERB)pasará) unos dias aqui, hay que ((VERB)mover) cosas viejas ((CONTEXT) de María Rosa) de aqui, ya que cosméticos ((CONTEXT) de María Rosa) ((VERB)estorban) si ((VERB)estan) tirados por aquí. ((PERSON)Cyntia) es una buena modelo, cabello ((CONTEXT) de Cyntia) ((VERB)es) muy bello, hay que ((VERB)lavar) cabello ((CONTEXT) de Cyntia)"
#for example 2
"útiles escolares ((CONTEXT) NO DATA) ((VERB)estan) aqui, me sorprende que ((PERSON)Juan Carlos) los haya olvidado siendo que suele ((VERB)ser) tan cuidadoso con útiles ((CONTEXT) Juan Carlos)."
Details:
Replace the possessive pronoun "sus" or "su" with "de " plus the content of the last preceding ((PERSON) ...) tag; if there is no ((PERSON) ...) tag before it, replace sus or su with ((PERSON) NO DATA).
Sentences are read from left to right, so the replacement comes from the substring inside the parentheses of the ((PERSON)...) tag that precedes that "sus" or "su", as shown in the example.
In the end, the replaced substrings should end up with this structure:
associated_info_capture_pattern + "((CONTEXT)" + subject_capture_pattern + ")"
This shows a way to do the replacement of su/sus like you asked for (albeit not with just a single re.sub). I didn't move the additional info, but you could modify it to handle that as well.
import re

subject_capture_pattern = r"\(\(PERSON\)((?:\w\s*)+)\)"

def replace_su_and_sus(input_text):
    start = 0
    replacement = "((PERSON) NO DATA)"  # used until the first person tag is seen
    output_text = ""
    for m in re.finditer(subject_capture_pattern, input_text):
        # Replace su/sus in the text up to (and including) this person tag
        output_text += re.sub(r"\b[Ss]us?\b", replacement, input_text[start:m.end()])
        start = m.end()
        # From here on, replace su/sus with the person just captured
        replacement = m.group(0).replace("(PERSON)", "(CONTEXT) de ")
    output_text += re.sub(r"\b[Ss]us?\b", replacement, input_text[start:])
    return output_text
My strategy was:
Up until the first subject capture, replace su/sus with "NO DATA"
Up until the second subject capture, replace su/sus with the name from the first capture
Proceed similarly for each subsequent subject capture
Finally, replace any su/sus between the last subject capture and the end of the string
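For instance, on a made-up line (hypothetical input; note that, as mentioned, the context tag lands where su/sus was rather than after the noun):

print(replace_su_and_sus("hay que lavar su cabello ((PERSON)Cyntia) y sus cosas"))
# hay que lavar ((PERSON) NO DATA) cabello ((PERSON)Cyntia) y ((CONTEXT) de Cyntia) cosas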
I want to count occurrences of each word in a text to spot the key words that come up the most.
This script works quite well, but the problem is that the text is written in French, so important key words would be missed.
For example, the word Europe may appear in the text as l'Europe or en Europe.
In the first case, the code removes the apostrophe, and l'Europe is counted as a single word, leurope, in the final result.
How can I improve the code to split l' from Europe?
import string
# Open the file in read mode
#text = open("debat.txt", "r")
text = ["Monsieur Mitterrand, vous avez parlé une minute et demie de moins que Monsieur Chirac dans cette première partie. Je préfère ne pas avoir parlé une minute et demie de plus pour dire des choses aussi irréelles et aussi injustes que celles qui viennent d'être énoncées. Si vous êtes d'accord, nous arrêtons cette première partie, nous arrêtons les chronomètres et nous repartons maintenant pour une seconde partie en parlant de l'Europe. Pour les téléspectateurs, M. Mitterrand a parlé 18 minutes 36 et M. Chirac, 19 minutes 56. Ce n'est pas un drame !... On vous a, messieurs, probablement jamais vus plus proches à la fois physiquement et peut-être politiquement que sur les affaires européennes... les Français vous ont vus, en effet, à la télévision, participer ensemble à des négociations, au coude à coude... voilà, au moins, un domaine dans lequel, sans aucun doute, vous connaissez fort bien, l'un et l'autre, les opinions de l'un et de l'autre. Nous avons envie de vous demander ce qui, aujourd'hui, au -plan européen, vous sépare et vous rapproche ?... et aussi lequel de vous deux a le plus évolué au cours des quelques années qui viennent de s'écouler ?... #"]
# Create an empty dictionary
d = dict()
# Loop through each line of the file
for line in text:
    # Remove the leading spaces and newline character
    line = line.strip()
    # Convert the characters in line to lowercase to avoid case mismatch
    line = line.lower()
    # Remove the punctuation marks from the line
    line = line.translate(line.maketrans("", "", string.punctuation))
    # Split the line into words
    words = line.split(" ")
    # Iterate over each word in line
    for word in words:
        # Check if the word is already in dictionary
        if word in d:
            # Increment count of word by 1
            d[word] = d[word] + 1
        else:
            # Add the word to dictionary with count 1
            d[word] = 1
sorted_tuples = sorted(d.items(), key=lambda item: item[1], reverse=True)
sorted_dict = {k: v for k, v in sorted_tuples}
# Print the contents of dictionary
for key in list(sorted_dict.keys()):
    print(key, ":", sorted_dict[key])
line = line.translate(line.maketrans("", "", string.punctuation))
removes all punctuation characters (l'Europe becomes lEurope). Instead, you may want to replace them with spaces, using for example:
for p in string.punctuation:
    line = line.replace(p, ' ')
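A quick check of the idea (a hypothetical snippet); note that once punctuation is replaced by spaces, a bare split() is preferable to split(" "), since it collapses the extra spaces this creates:

import string

line = "en parlant de l'Europe."
for p in string.punctuation:
    line = line.replace(p, ' ')
print(line.lower().split())  # ['en', 'parlant', 'de', 'l', 'europe']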
Where you currently have:
line = line.translate(line.maketrans("", "", string.punctuation))
... you can add the following line before it:
line = line.translate(line.maketrans("'", " "))
This will replace the ' character with a space wherever it's found, and the line using string.punctuation will behave exactly as before, except that it will not encounter any ' characters since we have already replaced them.
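For comparison, here is a sketch of the counting itself using re.findall and collections.Counter; \w+ treats the apostrophe as a separator and, in Python 3, keeps accented letters such as é or à inside words:

import re
from collections import Counter

def count_words(text):
    # \w+ matches runs of letters/digits, so l'Europe yields "l" and "europe"
    return Counter(re.findall(r"\w+", text.lower()))

counts = count_words("en parlant de l'Europe, l'un et l'autre")
print(counts.most_common(3))  # [('l', 3), ('en', 1), ('parlant', 1)]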
I wrote a script to extract, from a huge set, the sentences which contain a particular pattern. The problem is that, for some patterns, I check the value of the attribute at the beginning or end of the pattern to see whether the word is present in a particular list. I have 4 dictionaries, each with 2 lists of positive and negative words. So far I am able to use the function I wrote with one dictionary, and I am wondering how to improve it so that I can use all 4 dictionaries at the same time without duplicating the block which loops over the dictionary.
I give an example with two dictionaries (since the script is quite long, this is a small example with all the necessary elements):
import os

import pandas as pd
import spacy
import spacy.attrs
from spacy.attrs import POS
from spacy import displacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.lemmatizer import Lemmatizer
from spacy.matcher import Matcher

nlp = spacy.load("fr_core_news_md")

##################### Lists of lexicons
# Lexique Diko
lexicon = open(os.path.join('/h/Ressources/Diko.txt'), 'r', encoding='utf-8')
data = pd.read_csv(lexicon, sep=";", header=None)
data.columns = ["id", "terme", "pol"]
pol_diko_pos = data.loc[data.pol =='positive', 'terme']
liste_pos_D = list(pol_diko_pos)
print(liste_pos_D[1])
pol_diko_neg = data.loc[data.pol =='negative', 'terme']
liste_neg_D = list(pol_diko_neg)
#print(type(liste_neg))
# Lexique Polarimots
lexicon_p = open(os.path.join('/h/Ressources/polarimots.txt'), 'r', encoding='utf-8')
data_p = pd.read_csv(lexicon_p, sep="\t", header=None)
#data.columns = ["terme", "pol", "pos", "degre"]
data_p.columns = ["ind", "terme", "cat", "pol", "fiabilité"]
pol_polarimot_pos = data_p.loc[data_p.pol =='POS', 'terme']
liste_pos_P = list(pol_polarimot_pos)
print(liste_pos_P[1])
pol_polarimot_neg = data_p.loc[data_p.pol =='NEG', 'terme']
liste_neg_P = list(pol_polarimot_neg)
#print(type(liste_neg))
# ############################# Lists
sentence_not_extract_lexique_1 = []  # sentences without the specified pattern
sentence_extract_lexique_1 = []  # sentences whose pattern token is present in the first lexicon
sentence_not_extract_lexique_2 = []  # sentences without the specified pattern
sentence_extract_lexique_2 = []  # sentences whose pattern token is present in the second lexicon
list_token_pos = []  # tokens found in the lexicon (positive)
list_token_neg = []  # tokens found in the lexicon (negative)
list_token_not_found = []  # tokens not found in the lexicon
#PATTERN
pattern1 = [{"POS": {"IN": ["VERB", "AUX", "ADV", "NOUN", "ADJ"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"LOWER": "mais"}]
pattern1_tup = (pattern1, 1, True)

pattern3 = [{"LOWER": {"IN": ["très", "trop"]}},
            {"POS": {"IN": ["ADV", "ADJ"]}}]
pattern3_tup = (pattern3, 0, True)

pattern4 = [{"POS": "ADV"},  # negation adverb
            {"POS": "PRON", "OP": "*"},
            {"POS": {"IN": ["VERB", "AUX"]}},
            {"TEXT": {"IN": ["pas", "plus", "aucun", "aucunement", "point", "jamais", "nullement", "rien"]}}]
pattern4_tup = (pattern4, None, False)
#Tuple of pattern
pattern_list_tup =[pattern1_tup, pattern3_tup, pattern4_tup]
pattern_name = ['first', 'second', 'third', 'fourth']
length_of_list = len(pattern_list_tup)
print('length', length_of_list)
#index of the value of attribute to check in the lexicon
value_of_attribute = [0,-1,-1]
# Lexicons to use ([negative, positive] pairs)
lexique_1 = [liste_neg_D, liste_pos_D]
lexique_2 = [liste_neg_P, liste_pos_P]
# text (example of some sentences)
file = ["Le film est superbe mais cette édition DVD est nulle !",
"J'allais dire déplorable, mais je serais peut-être un peu trop extrême.",
"Hélas, l'impression de violence, bien que très bien rendue, ne sauve pas cette histoire gothique moderne de la sécheresse scénaristique, le tout couvert d'un adultère dont le propos semble être gratuit, classique mais intéressant...",
"Tout ça ne me donne pas envie d'utiliser un pieu mais plutôt d'aller au pieu (suis-je drôle).",
"Oui biensur, il y a la superbe introduction des parapluies au debut, et puis lorsqu il sent des culs tout neufs et qu il s extase, j ai envie de faire la meme chose apres sur celui de ma voisine de palier (ma voisine de palier elle a un gros cul, mais j admets que je voudrais bien lui foute mon tarin), mais c est tout, apres c est un film tres noir, lent et qui te plonge dans le depression.",
"Et bien hélas ce DVD ne m'a pas appris grand chose par rapport à la doc des agences de voyages et la petite dame qui fait ses dessins est bien gentille mais tout tourne un peu trop autour d'elle.",
"Au final on passe de l'un a l'autre sans subtilité, et on n'arrive qu'à une caricature de plus : si Kechiche avait comme but initial déclaré de fustiger les préjugés, c'est le contraire qui ressort de ce ''film'' truffé de clichés très préjudiciables pour les quelques habitants de banlieue qui ne se reconnaîtront pas dans cette lourde farce.",
"-ci écorche les mots, les notes... mais surtout nos oreilles !"]
# Loop to check each sentence and extract the sentences matching the patterns above
for pat in range(0, length_of_list):
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_2", None, pattern_list_tup[pat][0])
    # print(pat)
    # print(pattern_list_tup[pat][0])
    for sent in file:
        doc = nlp(sent)
        matches = matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end].lemma_.split()
            # print(f"{pattern_name[pat]} pattern found: {span}")
This is the part I want to modify so that I can use it with another dictionary; the goal is to be able to retrieve the sentences extracted by 4 different dictionaries, compare them, and then check which sentences are present in more than two lists.
            # Condition to use the lexicon and extract the sentence
            if pattern_list_tup[pat][2]:
                if span[value_of_attribute[pat]] in lexique_1[pattern_list_tup[pat][1]]:
                    if sent not in sentence_extract_lexique_1:
                        sentence_extract_lexique_1.append(sent)
                    if pattern_list_tup[pat][1] == 1:
                        list_token_pos.append(span[value_of_attribute[pat]])
                    if pattern_list_tup[pat][1] == 0:
                        list_token_neg.append(span[value_of_attribute[pat]])
                else:
                    # the text form is not present in the lexicon; the lemma form is needed
                    list_token_not_found.append(span[value_of_attribute[pat]])
                    sentence_not_extract_lexique_1.append(sent)
            else:
                if sent not in sentence_extract_lexique_1:
                    sentence_extract_lexique_1.append(sent)

print(len(sentence_extract_lexique_1))
print(sentence_extract_lexique_1)
One solution I found is to duplicate the code above and change the name of the list where the sentences are stored, but since I have 2 dictionaries (actually 4 in the original), duplicating would make the code much longer. Is there a way to loop over the dictionaries and append the results to the right list? So that, for example, when I use lexique_1, all the sentences extracted are sent to sentence_extract_lexique_1, and so on for the others.
In my opinion, try an if-elif-else chain, or just an if-elif block, since the elif catches the specific condition of interest: here, the specific comparison you want to check against the sentences. Keep in mind that an if-elif-else chain only works when you need exactly one test to pass, because Python stops at the first test that passes and skips the rest. That makes it efficient when you are testing for one specific condition.
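A more direct way to avoid duplicating the block is to keep the lexicons and their result lists in one structure and loop over it. Below is a minimal sketch (the names lexiques and results are made up); the loop body would replace the lexicon condition inside the match loop of the question:

# Hypothetical sketch: one dict entry per lexicon, extendable to all 4
lexiques = {"lexique_1": lexique_1, "lexique_2": lexique_2}
results = {name: {"extract": [], "not_extract": []} for name in lexiques}

for name, lexique in lexiques.items():
    if pattern_list_tup[pat][2]:
        token = span[value_of_attribute[pat]]
        if token in lexique[pattern_list_tup[pat][1]]:
            if sent not in results[name]["extract"]:
                results[name]["extract"].append(sent)
        else:
            results[name]["not_extract"].append(sent)
    else:
        if sent not in results[name]["extract"]:
            results[name]["extract"].append(sent)

# Sentences extracted by more than two lexicons can then be compared with sets:
# set(results["lexique_1"]["extract"]) & set(results["lexique_2"]["extract"])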
Good evening everyone,
I am currently trying to extract data from this website :
https://classic.sportsbookreview.com/betting-odds/nba-basketball/
The program logic is based on the following loop.
Once the page is open, the first step is to open the calendar and check whether any matches were played this month.
If so, for each of those days the data will be written to an xls file. Then, when there are no more matches to extract, the program clicks to the previous month and performs the same steps.
Otherwise, if there isn't a single day with a match during the month, it clicks to the previous month and performs the same steps as before.
Here is my code (sorry for the French; if anything is unclear, I can translate):
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import xlsxwriter

## Create the Excel file
tableur = xlsxwriter.Workbook("PrédictionMachine.xlsx")
feuille1 = tableur.add_worksheet("10-05-2018")

## Open the browser (for Chrome, replace Firefox with Chrome)
navigateur = webdriver.Firefox()
navigateur.get("https://classic.sportsbookreview.com/betting-odds/nba-basketball/")
time.sleep(3)

## Initialize the variables needed for writing to the spreadsheet
row = 0
col = 0
moistraite = 0
matchtreeltrouve = 0

## TEST PHASE, TO BE REMOVED IN THE FINAL VERSION
passage = 0

try:
    ## Loop: the program stops when it is interrupted from the keyboard (Ctrl-C)
    while True:
        ## Open the calendar
        print("Passage : ", passage)
        Calendrier = navigateur.find_element_by_xpath("//a[@class='dd-go-button']").click()
        time.sleep(3)
        ## Get the month/year currently being processed
        Mois = navigateur.find_element_by_xpath("//h2[@class='tbl-cal-top-middle']")
        ## If no match was played this month, click on the previous month
        if not navigateur.find_elements_by_xpath("//a[contains(@onclick, '20')]"):
            print("------------------------------------------------")
            print("Aucun match trouvé pour le mois de :", Mois.text)
            print("------------------------------------------------")
            Moisprecedent = navigateur.find_element_by_xpath("//img[@alt='Left Arrow']").click()
            moistraite += 1
            print("------------------------------------------------")
            print("Nombre de mois traité :", moistraite)
            print("------------------------------------------------")
            time.sleep(3)
        else:
            ## Get every day of the month on which a match was supposedly played
            JMtheorique = navigateur.find_elements_by_xpath("//a[contains(@onclick, '20')]")
            print("------------------------------------------------")
            print("Matchs supposés trouvés pour le mois de :", Mois.text)
            print("------------------------------------------------")
            for jmtheorique in JMtheorique:
                print("ON Y CROIT", jmtheorique.text)
                navigateur.find_element_by_xpath("//a[contains(@onclick, 'OddsEvent.GetLinkDate')]").click()
                time.sleep(3)
                passage += 1
                Calendrier = navigateur.find_element_by_xpath("//a[@class='dd-go-button']").click()
                time.sleep(2)
            Moisprecedent = navigateur.find_element_by_xpath("//img[@alt='Left Arrow']").click()
            moistraite += 1
## Manual stop of the program
except KeyboardInterrupt:
    print("!/_\/_\/_\! Interruption manuelle du programme !/_\/_\/_\! ")
    print("Le programme a été stoppé au mois de :", Mois.text)
    print("Nombre de mois traité :", moistraite)
    print("Nombre de match écrit :", matchtreeltrouve)
    ## navigateur.quit()
For now, I just want to navigate through the different months/days.
For August, the code works fine.
But when I arrive at July, the program loops. It seems that the 'if not' statement isn't written correctly, since the program finds a match even though not a single one was played during that month.
Okay, I think I understood my problem. When I open the calendar, some days of the previous/next month are also present. When the program reaches July, August 4 (a day on which a game was played) is also shown on the calendar, which explains why the program loops between July and August. I need to extract only the a href elements contained in a plain td, and not the ones contained in a td with a class, as shown below:
The calendar
The kind of a href I need
I will investigate further to learn how to do that. Thanks for the help.
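One way to do that with Selenium, as a sketch (I haven't verified the page's exact markup, so treat the selector as an assumption): restrict the XPath to anchors whose parent td has no class attribute, which should exclude the spill-over days from adjacent months:

# Hypothetical selector: keep only day links whose enclosing <td> has no class
JMtheorique = navigateur.find_elements_by_xpath("//td[not(@class)]/a[contains(@onclick, '20')]")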
I have a .txt file with CSV data like this:
1529074866;29.89;29.41;321;70;1;60;1003.05
1529074868;29.87;29.82;140;79;1;60;1003.52
I made this function to extract the data from the file:
def __init__(self, file):
    self.data_ = {"Temps": [], "Temperature": [], "Humidite": [], "Luminosite": [], "Niveau sonore": [], "Radiations EM": [], "Rythme cardiaque": [], "Pression": []}
    # data_ = {date: [t1, t2, ...], temp: [temp1, temp2, ...], ...}. Grouping the data
    # this way (by date, luminosity, ...) makes plotting the graphs easier later on.
    try:
        for line in file:  # Iterate over every line of the file.
            line = line.rstrip()  # Remove the invisible carriage-return character at the end of the line.
            line = line.rsplit(";")  # Turn the line into a list by splitting on ";".
            self.data_["Temps"].append(int(line[0]))  # Assumes the date is the 1st field (index 0) on the line.
            self.data_["Temperature"].append(float(line[1]))  # Assumes the temperature is the 2nd field (index 1).
            self.data_["Humidite"].append(float(line[2]))  # etc.
            self.data_["Luminosite"].append(float(line[3]))
            self.data_["Niveau sonore"].append(float(line[4]))
            self.data_["Radiations EM"].append(float(line[5]))
            self.data_["Rythme cardiaque"].append(float(line[6]))
            self.data_["Pression"].append(float(line[7]))
    except Exception as expt:  # Executed if the try block above failed.
        print("\n!!! Echec de l'importation - exception relevee !!!")
        print(expt)
I would like to create a function which extracts the first field of the line (the Unix time) and, if the time is not within [1514761200; 1546297200], deletes the line.
How can I do this?
To delete a line from the file, you actually have to read the file completely and then rewrite the filtered version.
One approach:
data_typo = "1529074866;29.89;29.41;321;70;1;60;1003.05\n"

with open("file.txt", "r") as f:
    lines = f.readlines()  # Extract all the lines

data = [line.rstrip().split(";") for line in lines]
# data elements: ['1529074866', '29.89', '29.41', '321', '70', '1', '60', '1003.05']
At that point, a simple approach is to filter the list data.
def criterion(elt):
    # int() is safer than eval() for parsing the timestamp field.
    return 1514761200 <= int(elt[0]) <= 1546297200

data_to_rewrite = list(filter(criterion, data))  # Keeps the elements where criterion returns True.
with open("new_file.txt", "w") as f:
for elt in data_to_rewrite:
line = ";".join(elt) + "\n"
f.write(line)
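Alternatively, if you would rather skip out-of-range lines while loading instead of rewriting the file, the same bounds can be checked at the top of the reading loop in __init__ (a sketch reusing the interval from the question):

for line in file:
    fields = line.rstrip().split(";")
    timestamp = int(fields[0])
    # Skip lines whose Unix time falls outside [1514761200; 1546297200]
    if not (1514761200 <= timestamp <= 1546297200):
        continue
    # ... append fields[0:8] to self.data_ as before ...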