I have created a class which, by the end of the task, has to hold two lists: one for tweets and the other for the tweets' labels.
After initialization, I want to load the tweets from one file and their labels from another file. After loading, I want to check that each tweet is a valid JSON object; if it is not, I want to remove it, along with its associated label if labels were provided. Labels are either 'pos' or 'neg'.
class flu_tweets:
    """Hold raw tweet lines and their labels, loaded from text files.

    ``tweets`` and ``labels`` are plain lists. When a labels file is given to
    :meth:`load`, the n-th non-blank label line is taken to belong to the
    n-th non-blank tweet line.
    """

    def __init__(self):
        self.tweets = []  # raw tweet lines exactly as read, one JSON document each
        self.labels = []  # label strings such as 'pos' / 'neg'

    def load(self, tweets_filename, labels_filename=''):
        """Append the valid tweets of a file, and their labels, to this object.

        Each non-blank line of ``tweets_filename`` is one candidate tweet. A
        candidate is kept only if it parses as JSON and its top-level value is
        an object; otherwise it is dropped together with the label at the same
        position, if there is one.

        Args:
            tweets_filename: Path of the file holding one tweet per line.
            labels_filename: Optional path of the file holding one label per
                line. The default ``''`` means "no labels".

        Raises:
            IOError / OSError: If either file cannot be opened.
        """
        # A line counts as blank only if it holds nothing but line terminators,
        # so a whitespace-only line is still a (rejected) tweet and keeps the
        # tweet/label positions in step.
        with open(tweets_filename, 'r') as tweet_file:
            new_tweets = [line for line in tweet_file if line.strip('\r\n')]

        new_labels = []
        if labels_filename != '':
            with open(labels_filename, 'r') as label_file:
                for line in label_file:
                    if line.strip('\r\n'):
                        # Store the bare label regardless of the line ending.
                        new_labels.append(line.strip())

        # Decide validity first and build the kept lists afterwards; removing
        # items from a list while iterating over it skips elements.
        keep = [self._is_json_object(tweet) for tweet in new_tweets]
        self.tweets.extend(
            tweet for tweet, ok in zip(new_tweets, keep) if ok
        )
        # Labels beyond the number of tweets have no tweet to be judged by and
        # are kept unchanged.
        self.labels.extend(
            label for position, label in enumerate(new_labels)
            if position >= len(keep) or keep[position]
        )

    @staticmethod
    def _is_json_object(text):
        """Return True if *text* parses as JSON with an object at the top level."""
        import json  # imported here so the class works without a module-level import

        try:
            parsed = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            return False
        return isinstance(parsed, dict)
Right now the method doesn't do that: when I check the list afterwards, it still contains entries that are not valid JSON.
Below is a copy of the text file I used, which contains the tweets:
{"created_at":"Fri Oct 20 14:35:19 +0000 2017","id":921384339421745153,"id_str":"921384339421745153","text":"RT #alvindchipmunk: Dont let the DNC slide with no handcuffs. https://t.co/h72q7lGAHF","source":"\u003ca href=\"http://twitter.com/download/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3435638633,"id_str":"3435638633","name":"alwaystrump","screen_name":"rodilosso_patty","location":"New Jersey, USA","url":null,"description":"Let's not give the media the Race war they want. POTUS we have your back! MAGA","translator_type":"none","protected":false,"verified":false,"followers_count":2770,"friends_count":2048,"listed_count":183,"favourites_count":85745,"statuses_count":152520,"created_at":"Sat Aug 22 16:27:18 +0000 2015","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/773719268026257410/AuXU_l-D_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/773719268026257410/AuXU_l-D_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/3435638633/1461499638","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Fri Oct 20 12:27:04 +0000 
2017","id":921352064160034816,"id_str":"921352064160034816","text":"Dont let the DNC slide with no handcuffs. https://t.co/h72q7lGAHF","display_text_range":[0,41],"source":"\u003ca href=\"http://twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":35962023,"id_str":"35962023","name":"alvin maldonado","screen_name":"alvindchipmunk","location":"Bunnell, FL","url":"http://alvindchipmunk-theconservativecomet.blogspot.com/","description":"Artist-musician-Medical Professional-Patriot-Guns-God-Country & 2Unite with others of like mind 2 re-elect Trump, redecorate DC making d USA gr8 as it still is","translator_type":"none","protected":false,"verified":false,"followers_count":1660,"friends_count":2051,"listed_count":43,"favourites_count":2001,"statuses_count":16294,"created_at":"Tue Apr 28 02:43:18 +0000 2009","utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"9818A1","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/627772771741732864/MHLgViA4.jpg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/627772771741732864/MHLgViA4.jpg","profile_background_tile":true,"profile_link_color":"981CEB","profile_sidebar_border_color":"DE3C88","profile_sidebar_fill_color":"E887E8","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/542852260422639616/75bqMWY3_normal.jpeg","profile_image_url_https":"https://pbs.twimg.com/profile_images/542852260422639616/75bqMWY3_normal.jpeg","profile_banner_url":"https://pbs.twimg.com/profile_banners/35962023/1481464103","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":920847040132792320,"quoted_status_id_str":"920847040132792320","quoted_status":{"created_at":"Thu Oct 19 03:00:17 +0000 2017","id":920847040132792320,"id_str":"920847040132792320","text":"I'm sick of all the evidence against the Democrats and no handcuffs. Retweet-\nif you agree!\n\n#realDonaldTrump \ud83c\uddfa\ud83c\uddf8","source":"\u003ca href=\"http://twitter.com/download/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":889982846428782592,"id_str":"889982846428782592","name":"c\u2113\u03b9\u03b7\u0442\u03c3\u03b7 \u043c\u03b9c\u043d\u03b1\u03b5\u2113","screen_name":"crusher614","location":"*not of this world","url":"https://www.youtube.com/channel/UCthNh_qChVqAh_zamo0IQxQ","description":"FMR U.S. 
Border Patrol | New Mexico SWAT Operator | Legend Who Lives Rent Free In The Minds of Liberals World Wide. #MAGA \u03bc\u03bf\u03bb\u1f7c\u03bd \u03bb\u03b1\u03b2\u03ad III","translator_type":"none","protected":false,"verified":false,"followers_count":19821,"friends_count":135,"listed_count":73,"favourites_count":16032,"statuses_count":7708,"created_at":"Tue Jul 25 22:57:00 +0000 2017","utc_offset":-25200,"time_zone":"America/Phoenix","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"19CF86","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/916414292882219008/fvSIJCC6_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/916414292882219008/fvSIJCC6_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/889982846428782592/1506312325","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":551,"reply_count":703,"retweet_count":15439,"favorite_count":13676,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"realDonaldTrump","name":"Donald J. 
Trump","id":25073877,"id_str":"25073877","indices":[93,109]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"quote_count":0,"reply_count":0,"retweet_count":1,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/h72q7lGAHF","expanded_url":"https://twitter.com/crusher614/status/920847040132792320","display_url":"twitter.com/crusher614/sta\u2026","indices":[42,65]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":920847040132792320,"quoted_status_id_str":"920847040132792320","quoted_status":{"created_at":"Thu Oct 19 03:00:17 +0000 2017","id":920847040132792320,"id_str":"920847040132792320","text":"I'm sick of all the evidence against the Democrats and no handcuffs. Retweet-\nif you agree!\n\n#realDonaldTrump \ud83c\uddfa\ud83c\uddf8","source":"\u003ca href=\"http://twitter.com/download/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":889982846428782592,"id_str":"889982846428782592","name":"c\u2113\u03b9\u03b7\u0442\u03c3\u03b7 \u043c\u03b9c\u043d\u03b1\u03b5\u2113","screen_name":"crusher614","location":"*not of this world","url":"https://www.youtube.com/channel/UCthNh_qChVqAh_zamo0IQxQ","description":"FMR U.S. Border Patrol | New Mexico SWAT Operator | Legend Who Lives Rent Free In The Minds of Liberals World Wide. 
#MAGA \u03bc\u03bf\u03bb\u1f7c\u03bd \u03bb\u03b1\u03b2\u03ad III","translator_type":"none","protected":false,"verified":false,"followers_count":19821,"friends_count":135,"listed_count":73,"favourites_count":16032,"statuses_count":7708,"created_at":"Tue Jul 25 22:57:00 +0000 2017","utc_offset":-25200,"time_zone":"America/Phoenix","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"19CF86","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/916414292882219008/fvSIJCC6_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/916414292882219008/fvSIJCC6_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/889982846428782592/1506312325","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":551,"reply_count":703,"retweet_count":15439,"favorite_count":13676,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"realDonaldTrump","name":"Donald J. 
Trump","id":25073877,"id_str":"25073877","indices":[93,109]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/h72q7lGAHF","expanded_url":"https://twitter.com/crusher614/status/920847040132792320","display_url":"twitter.com/crusher614/sta\u2026","indices":[62,85]}],"user_mentions":[{"screen_name":"alvindchipmunk","name":"alvin maldonado","id":35962023,"id_str":"35962023","indices":[3,18]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1508510119668"}
{"created_at":"Fri Oct 20 14:35:19 +0000 2017","id":921384340113670149,"id_str":"921384340113670149","text":"RT #mitchelmusso: My girl is so fine! but gaw}
The last tweet is incomplete, so I expect my function to hit a parsing error on it and remove it.
I have several log files, and they are structured like this:
Sep 9 12:42:15 apollo sshd[25203]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=189.26.255.11
Sep 9 12:42:15 apollo sshd[25203]: pam_succeed_if(sshd:auth): error retrieving information about user ftpuser
Sep 9 12:42:17 apollo sshd[25203]: Failed password for invalid user ftpuser from 189.26.255.11 port 44061 ssh2
Sep 9 12:42:17 apollo sshd[25204]: Received disconnect from 189.26.255.11: 11: Bye Bye
Sep 9 19:12:46 apollo sshd[30349]: Did not receive identification string from 199.19.112.130
Sep 10 03:29:48 apollo unix_chkpwd[4549]: password check failed for user (root)
Sep 10 03:29:48 apollo sshd[4546]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=221.12.29.170 user=root
Sep 10 03:29:51 apollo sshd[4546]: Failed password for root from 221.12.29.170 port 56907 ssh2
There are more dates and times, but this is an example. I was wondering how I would calculate the total time that the file covers. I've tried a few things, and have spent about 5 hours on it without success.
I tried this first, and it was close, but it didn't work the way I wanted: it kept repeating dates:
# Asker's first attempt at measuring the time covered by a log file.
# Python 2 syntax (print statement). The indentation was lost in this paste, so
# the exact nesting of the statements below cannot be recovered with certainty.
# filename, date1, time1, ltime1, dates, ttimes and FMT are used here but are
# initialised somewhere outside this snippet.
with open(filename, 'r') as file1:
lines = file1.readlines()
for line in lines:
# Whitespace-separated fields of a log line such as
# "Sep 9 12:42:15 apollo ...": [0] month name, [1] day of month, [2] HH:MM:SS.
linelist = line.split()
date2 = int(linelist[1])
time2 = linelist[2]
print linelist[0], linelist[1], linelist[2]
# NOTE(review): date1 == 0 presumably means "no day recorded yet" - confirm
# against the initialisation, which is not shown.
if date1 == 0:
date1 = date2
dates.append(linelist[0] + ' ' + str(linelist[1]))
# Only the day-of-month numbers are compared, so a change of month where the
# day number goes down (e.g. 30 -> 1) does not satisfy this condition.
if date1 < date2:
date1 = date2
# Difference between the two tracked times, parsed with FMT, is appended to ttimes.
ttimes.append(datetime.strptime(str(ltime1), FMT) - datetime.strptime(str(time1), FMT))
# Reset the trackers to the extreme values so the comparisons below can replace them.
time1 = '23:59:59'
ltime1 = '00:00:00'
dates.append(linelist[0] + ' ' + str(linelist[1]))
# The times are compared as strings; time1 keeps the smaller value and ltime1
# the larger one.
if time2 < time1:
time1 = time2
if time2 > ltime1:
ltime1 = time2
If the entries are in chronological order, you can just look at the first and the last entry:
# Assumes `lines` holds the whole log as one string with its entries in
# chronological order; the timestamp is whatever precedes the host name "apollo".
# Blank entries are dropped first: a log ending in "\n" otherwise leaves an
# empty final element, which would make last_date an empty string.
entries = [entry for entry in lines.split("\n") if entry.strip()]
# With no entries at all, both dates are "" instead of raising IndexError.
first_date = entries[0].split("apollo")[0] if entries else ""
last_date = entries[-1].split("apollo")[0] if entries else ""
We don't have the year, so I took the current year. Read all the lines, convert each month name to its month index, and parse each date.
Then sort the list (so it works even if the log entries are out of order) and take the first and last items. Subtract. Enjoy.
from datetime import datetime
# Month abbreviations positioned by month number; slot 0 is a placeholder so
# that months.index("Jan") == 1.
months = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# The log lines carry no year, so every entry is assumed to be from the current one.
current_year = datetime.now().year
dates = []
with open(filename, 'r') as file1:
    for line in file1:
        linelist = line.split()
        if not linelist:  # filter out possible empty lines
            continue
        # Convert the 3-letter month to its number; an unknown month name
        # raises ValueError here.
        linelist[0] = str(months.index(linelist[0]))
        # Compose "<month> <day> <HH:MM:SS> <year>" and parse it.
        stamp = " ".join(linelist[0:3]) + " " + str(current_year)
        dates.append(datetime.strptime(stamp, "%m %d %H:%M:%S %Y"))

if dates:
    # Sorting makes the result independent of the order of the log lines.
    dates.sort()
    first_date = dates[0]
    last_date = dates[-1]
    # print report & compute time span
    print("start {}, end {}, time span {}".format(first_date, last_date, last_date - first_date))
else:
    # A log without any entry has no span; report that instead of failing on dates[0].
    first_date = last_date = None
    print("no log entries found in {}".format(filename))
result:
start 2016-09-09 12:42:15, end 2016-09-10 03:29:51, time span 14:47:36
Note that it won't work properly for a log that spans December 31st and January 1st, because of the missing year information. I suppose we could make a guess: if we find both January and December in the log, assume that the January entries belong to the following year. This is not supported yet.