Trying to parse Twitter JSON from a text file - Python
I am new to Python and am trying to parse "tweets" from a text file for analysis.
My test file has a number of tweets; here is an example of one:
{"created_at":"Mon May 06 17:39:59 +0000 2013","id":331463367074148352,"id_str":"331463367074148352","text":"Extra\u00f1o el trabajo en las aulas !! * se jala los cabellos","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":276765971,"id_str":"276765971","name":"Shiro","screen_name":"_Shira3mmanueL_","location":"","url":null,"description":null,"protected":false,"followers_count":826,"friends_count":1080,"listed_count":5,"created_at":"Mon Apr 04 01:36:52 +0000 2011","favourites_count":1043,"utc_offset":-21600,"time_zone":"Mexico City","geo_enabled":true,"verified":false,"statuses_count":28727,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3608152674\/45133759fb72090ebbe880145d8966a6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3608152674\/45133759fb72090ebbe880145d8966a6_normal.jpeg","profile_banner_url":"https:\/\/si0.twimg.com\/profile_banners\/276765971\/1367525440","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[19.30303082,-99.54709768]},"coordinates":{"type":"Point","coordinates":[-99.54709768,19.30303082]},"place":{"id":"1d23a12800a574a8","url":"http:\/\/api.twitter.com\/1\/geo\/id\/1d23a12800a574a8.json","place_type":"city","name":"Lerma","full_name":"Lerma, M\u00e9xico","country_code":"MX","country":"M\u00e9xico","bounding_box":{"type":"Polygon","coordinates":[[[-99.552193,19.223171],[-99.552193,19.4343],[-99.379483,19.4343],[-99.379483,19.223171]]]},"attributes":{}},"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"}
My code is:
import re
import json
import sys

pattern_split = re.compile(r"\W+")

def sentment_tbl(sent_file):
    # Read in AFINN-111.txt
    tbl = dict(map(lambda (w, s): (w, int(s)), [
        ws.strip().split('\t') for ws in open(sent_file)]))
    return tbl
def sentiment(text, afinn):
    # Word splitter pattern
    words = pattern_split.split(text.lower())
    sentiments = map(lambda word: afinn.get(word, 0), words)
    if sentiments:
        sentiment = float(sum(sentiments))
    else:
        sentiment = 0
    return sentiment
def main():
    sent_file = sys.argv[1]
    afinn = sentment_tbl(sent_file)
    tweet_file = sys.argv[2]
    with open(tweet_file) as f:
        for line_str in f:
            print type(line_str)
            print line_str
            tweet = json.loads(line_str.read())
            print("%6.2f %s" % (sentiment(line_str, afinn)))
            # Test: text = "Finn is stupid and idiotic"
            # print("%6.2f %s" % (sentiment(text, afinn), text))

if __name__ == '__main__':
    main()
I get an error about trying to call read() on a string when json.loads is reached (line_str is already a str, so line_str.read() raises an AttributeError). I get the feeling I am mixing apples and oranges and would like some experienced assistance.

thanks, Chris
If you've written multiple tweets to a file, e.g.:

o.write(tweet1)
o.write(tweet2)

you will have to read them back in line by line as well, because the json module can't decode a file that contains multiple JSON objects in one call.
tweets = []
for line in open('test.txt', 'r'):
    tweets.append(json.loads(line))
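For the writing side, a minimal sketch of producing such a line-per-object file (tweet1 and tweet2 here are made-up placeholder dicts; real ones would come from the Twitter API):

import json

tweet1 = {"id": 1, "text": "first tweet"}
tweet2 = {"id": 2, "text": "second tweet"}

with open('test.txt', 'w') as o:
    for tweet in (tweet1, tweet2):
        # one JSON object per line, so json.loads(line) works on read-back
        o.write(json.dumps(tweet) + '\n')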
Why don't you use the built-in json library to read the whole file at once, instead of looping over it and parsing every line yourself? Note that this only works if the file contains a single JSON document, such as one array of tweet objects:

import json

jsonObj = json.load(open(tweet_file, 'r'))
# Now jsonObj is an array of dictionaries corresponding to the JSON
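Assuming the file really is one JSON array, a short usage sketch (the 'text' field matches the sample tweet above):

for tweet in jsonObj:
    # each element is one tweet dictionary
    print(tweet['text'])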
You need to pass a string to json.loads, and line_str already is one, so drop the .read() call:

tweet = json.loads(line_str)
After that, make sure you pass tweet (or the relevant fields of tweet) on to sentiment() for further processing. Note that you're currently calling sentiment() with line_str, so tweet isn't used (yet).
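Putting the pieces together, a minimal sketch of the loop in main() after that fix (assuming the score should be computed on the tweet's text field, as in the sample tweet at the top):

with open(tweet_file) as f:
    for line_str in f:
        tweet = json.loads(line_str)  # parse one tweet per line
        text = tweet.get('text', '')  # the tweet body, if present
        print("%6.2f %s" % (sentiment(text, afinn), text))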
Related
This is what Python returns while I'm trying to tokenize tweets: TypeError: list indices must be integers or slices, not str
I'm trying to tokenize all the tweets I previously saved in a JSON file, following this example: https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/

import re
import json

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

When I add this at the end, everything works:

tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))

I want to tokenize the tweets I saved in a JSON file, and the website suggests doing it this way:

with open('mytweets.json', 'r') as f:
    for line in f:
        tweet = json.loads(line)
        tokens = preprocess(tweet['text'])
        do_something_else(tokens)

This is how I'm trying to open my JSON file:

>>> with open('data/digitalhealth.json', 'r') as f:
...     for line in f:
...         tweet = json.loads(line)
...         tokens = preprocess(tweet['text'])
...         do_something_else(tokens)

And this is what Python returns:

Traceback (most recent call last):
  File "<stdin>", line 4, in <module>
TypeError: list indices must be integers, not str

Does anyone know how to sort this out? I'm new to all this and I really don't have any idea what to do.

This is my code for collecting data from Twitter's API:

import tweepy
import json

API_KEY = 'xxx'
API_SECRET = 'xxx'
TOKEN_KEY = 'xxx'
TOKEN_SECRET = 'xxx'

auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(TOKEN_KEY, TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

query = '#digitalhealth'
cursor = tweepy.Cursor(api.search, q=query, lang="en")

for page in cursor.pages():
    tweets = []
    for item in page:
        tweets.append(item._json)
    with open('Twitter_project/digitalhealth.json', 'wb') as outfile:
        json.dump(tweets, outfile)

How do I change it now so I will have only dictionaries?

Thanks all of you for your help! I really appreciate it
For some reason you're storing your JSON dictionaries inside lists: json.dump(tweets, outfile) writes the whole tweets list as one line, so json.loads(line) gives you a list, not a dictionary. You should store them as dictionaries, since that would be much easier for you, but if you want to access the data as it is now, index the list first: tweet[0] gets the dictionary, and from there tweet[0]['text'] gets the text. Still, look into reformatting the JSON properly.
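A minimal sketch of that reformatting, reusing the collection loop from the question (append mode and the one-object-per-line layout are choices made here, not part of the original code):

for page in cursor.pages():
    with open('Twitter_project/digitalhealth.json', 'a') as outfile:
        for item in page:
            # one tweet dictionary per line, so json.loads(line) yields a dict
            outfile.write(json.dumps(item._json) + '\n')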
How to open a large Twitter file (30GB+) in Python?
I'm wondering what the proper script is to open large Twitter files streamed with tweepy on Python 3. I've used the following with smaller files, but now that my data collection is above 30GB I'm getting memory errors:

with open('data.txt') as f:
    tweetStream = f.read().splitlines()

tweet = json.loads(tweetStream[0])
print(tweet['text'])
print(tweet['user']['screen_name'])

I've been unable to find what I need online so far, so any help would be much appreciated.
Don't try to create an object that contains the entire file. Instead, since each line contains one tweet, work on the file one line at a time:

with open('data.txt') as f:
    for line in f:
        tweet = json.loads(line)
        print(tweet['text'])
        print(tweet['user']['screen_name'])

Perhaps store relevant tweets to another file or database, or produce a statistical summary. For example:

total = 0
about_badgers = 0

with open('data.txt') as f:
    for line in f:
        tweet = json.loads(line)
        total += 1
        if "badger" in tweet['text'].lower():
            about_badgers += 1

print("Of " + str(total) + ", " + str(about_badgers) + " were about badgers.")

Catch errors relating to unparseable lines like this:

with open('data.txt') as f:
    for line in f:
        try:
            tweet = json.loads(line)
            print(tweet['text'])
            print(tweet['user']['screen_name'])
        except json.decoder.JSONDecodeError:
            # Do something useful, like write the failing line to an error log
            pass
Can't get any data in Python
I collected a lot of tweets, and I want to output only the English ones. I can get all of the tweets, including the non-English ones. But when I add this code to filter for English tweets:

for i in range(0, 1000):
    if tweet['statuses'][i][u'lang'] == u'en':

nothing gets collected, and there are no errors:

In [1]: runfile('C:/Users/Desktop/tweets.py', wdir='C:/Users/Desktop')

It just runs, and there is no data in "C:/Users/Desktop/A.txt". My code is as follows. What should I do with it?

try:
    import json
except ImportError:
    import simplejson as json

tweets_filename = 'C:/Users/Desktop/tweet.txt' # Original tweets
tweets_file = open(tweets_filename, "r")

for line in tweets_file:
    try:
        tweet = json.loads(line.strip())
        for i in range(0, 1000): # This is the part for filtering English tweets
            if tweet['statuses'][i][u'lang'] == u'en': # Same part
                if 'text' in tweet:
                    print(tweet['created_at'])
                    print(tweet['text'])
                    hashtags = []
                    for hashtag in tweet['entities']['hashtags']:
                        hashtags.append(hashtag['text'])
                    print(hashtags)
                    output = "C:/Users/Desktop/A.txt" # Only-English-tweets path
                    out_file = open(output, 'a')
                    out_file.write(tweet['user']['name'] + "," + tweet['text'] + "\n \n")
                    out_file.close()
    except:
        continue
You have to read the lines of tweet_file, like this:

lines = tweet_file.readlines()
for line in lines:
    ...

Also, if you want to see the errors, don't catch them. Some good reading: the Zen of Python.
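As a hedged aside, based on the sample tweet near the top of this page, where lang is a top-level field of each tweet object: if the file holds one tweet per line rather than a search response with a statuses array, the range loop can go away entirely. A minimal sketch:

for line in tweets_file:
    try:
        tweet = json.loads(line.strip())
    except ValueError:  # narrower than a bare except, so real bugs still surface
        continue
    # 'lang' sits at the top level of a streamed tweet object
    if tweet.get('lang') == 'en' and 'text' in tweet:
        print(tweet['text'])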
I want to use out_file in Python
I've been studying Python for two months. My goal is to do sentiment analysis! But self-study is hard, so I want to ask for help. I collected data from the Twitter API, and I put the data into Notepad. Each record is long, like this:

{created_at":"Fri Nov 03 03:28:33 +0000 2017", ~~ id, tweet, unicode}

I converted the data to a simpler form in the IPython console (Spyder). It looks like "Fri Nov 03 03:46:46 +0000 2017 @user blah blah [hash tags] time stamp". Now I want to write that simplified data back out to a file. The code is written as follows. What should I change in the out_file part?

try:
    import json
except ImportError:
    import simplejson as json

tweets_filename = 'C:/Users/ID500/Desktop/SA/Corpus/siri/siri_0.txt' # Not-converted data
tweets_file = open(tweets_filename, "r")

for line in tweets_file:
    try:
        tweet = json.loads(line.strip())
        if 'text' in tweet:
            print(tweet['id'])
            print(tweet['created_at'])
            print(tweet['text'])
            print(tweet['user']['id'])
            print(tweet['user']['name'])
            print(tweet['user']['screen_name'])
            hashtags = []
            for hashtag in tweet['entities']['hashtags']:
                hashtags.append(hashtag['text'])
            print(hashtags)
            out_file = open("C:/Users/ID500/Desktop/SA/Corpus/final/fn_siri_1.txt", 'a') # I want to put data at that path
            out_file.write() # What can I write here?
            out_file.close()
    except:
        continue

Thank you!
You can open two files at once, and you shouldn't reopen the output file inside the loop. For example:

with open(tweets_filename) as tweets_file, open(output, "a") as out_file:
    for line in tweets_file:
        # parse the line here
        out_file.write(line)
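A slightly fuller sketch tying this into the question's loop; the output line format (created_at followed by text) is only a guess modeled on the simplified format described above, and the paths are the asker's:

output = "C:/Users/ID500/Desktop/SA/Corpus/final/fn_siri_1.txt"

with open(tweets_filename) as tweets_file, open(output, "a") as out_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line.strip())
        except ValueError:
            continue  # skip lines that aren't valid JSON
        if 'text' in tweet:
            # one simplified record per line
            out_file.write(tweet['created_at'] + " " + tweet['text'] + "\n")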
Why is my function returning an empty string in Python?
What I am doing is removing all parts of speech except nouns from a text. I have written a function for that. It may not be the best or most optimized code, because I have just started coding in Python. I am sure the bug must be very basic, but I just can't figure it out. The function takes two parameters: the location of the text on the hard drive, and the location of the file where we want the output. Here is the code:

def extract_nouns(i_location, o_location):
    import nltk
    with open(i_location, "r") as myfile:
        data = myfile.read().replace('\n', '')
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)
    length = len(tagged)
    a = list()
    for i in range(0, length):
        print(i)
        log = (tagged[i][1][0] == 'N')
        if log == False:
            a.append(tagged[i][0])
    fin = open(i_location, 'r')
    fout = open(o_location, "w+")
    for line in fin:
        for word in a:
            line = line.replace(word, "")
        fout.write(line)
    with open(o_location, "r") as myfile_new:
        data_out = myfile_new.read().replace('\n', '')
    return data_out

When I call the function, it works fine in the sense that I get the intended output on the hard disk, but it returns a blank string on the interface instead of the actual output string. This is how I call it:

t = extract_nouns("input.txt", "output.txt")

If you want to try it, use the following as the content of the input file:

"At eight o'clock on Thursday film morning word line test best beautiful Ram Aaron design"

This is the output I get in the output file (output.txt) when I call the function, while the function itself returns a blank string:

" Thursday film morning word line test Ram Aar design"
You need to close the file first:

for line in fin:
    for word in a:
        line = line.replace(word, "")
    fout.write(line)
fout.close()

Using with is usually the best way to open files, as it closes them automatically, and file.seek() lets you go back to the start of the file to read it:

def extract_nouns(i_location, o_location):
    import nltk
    with open(i_location, "r") as myfile:
        data = myfile.read().replace('\n', '')
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)
    length = len(tagged)
    a = []
    for i in range(0, length):
        print(i)
        log = (tagged[i][1][0] == 'N')
        if not log:
            a.append(tagged[i][0])
    with open(i_location, 'r') as fin, open(o_location, "w+") as fout:
        for line in fin:
            for word in a:
                line = line.replace(word, "")
            fout.write(line)
        fout.seek(0) # go back to the start of the file
        data_out = fout.read().replace('\n', '')
    return data_out
The last statement in the function should be the return. If the function instead ends with print data_out, you return the return value of print, which is None. E.g.:

In []: def test():
  ..:     print 'Hello!'
  ..:

In []: res = test()
Hello!

In []: res is None
Out[]: True