How to save a tweepy Twitter stream to a file? - python

I have a working script that successfully gathers tweets that mention "stackoverflow". However, I want to run the script in IPython (rather than executing a separate .py file). Ideally, I just want to open the .ipynb file, select "Run All", and let it run for a week or so (not closing my laptop, of course), and as a result have a .json file with a week's worth of tweets.
Here is what I have so far:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
# Twitter API credentials (placeholder values).
access_token = "x"
access_token_secret = "x"
consumer_key = "x"
consumer_secret = "x"
# The file name is the first argument to open(); 'a' opens it in append mode.
# NOTE(review): nothing in the visible script writes to or closes save_file.
save_file = open('data.json', 'a')
class listener(StreamListener):
    """Stream listener that echoes every raw payload to standard output."""

    def on_data(self, data):
        """Print the raw payload and return True."""
        print(data)
        return True

    def on_error(self, status):
        """Print the status value passed to the error callback."""
        print(status)
# Build the OAuth handler from the consumer key pair, then attach the access token.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Create the stream with a listener instance and filter on the keyword "stackoverflow".
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["stackoverflow"])

Add the following code to your existing code. 'fetched_tweets.txt' is the name of the file in which you want to save the tweets; it is opened in 'a' (append) mode.
class StdOutListener(StreamListener):
    """Stream listener that appends every raw payload to 'fetched_tweets.txt'."""

    def on_data(self, data):
        """Append the raw payload to the output file and return True."""
        # Opening in append mode inside a with-block closes the file after
        # every payload, so each write is complete before the next arrives.
        with open('fetched_tweets.txt', 'a') as tf:
            tf.write(data)
        return True

    def on_error(self, status):
        """Print the status value passed to the error callback."""
        # Call form of print works on both Python 2 and Python 3; the bare
        # `print status` statement is a SyntaxError on Python 3.
        print(status)

You can do it by redirecting output to a file:
in Terminal/CMD just type python twitter_streaming.py > twitter_data.txt
for appending to an existing file use >> instead of >.

Related

Stream with Tweepy : 'str' object has no attribute 'entities'

I tried to get any tweet that contains images. But when I access the tweet data in the line if 'media' in data.entities:, I get the error AttributeError: 'str' object has no attribute 'entities'.
I tried adding to the line
twitterStream = Stream (auth, listener (), include_entities = 1)
but it does not work either
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import pprint
# Twitter API credentials (placeholder values).
consumer_key = "xxxxxxxxxxxxx"
consumer_secret = "xxxxxxxxxxxxxxxxx"
access_token = "xxxxxxxxxxxxxxxxxx"
access_secret = "xxxxxxxxxxxx"
class listener(StreamListener):
    """Listener that prints payloads whose ``entities`` contain 'media'."""

    def on_data(self, data):
        """Print ``data`` when 'media' is found in ``data.entities``."""
        if 'media' not in data.entities:
            return
        print(data)
        # for image in data.extended_entities['media']:
        #     print(image['media_url'])
        # return(True)

    def on_error(self, status):
        """Print the literal "error" followed by the status value."""
        print("error")
        print(status)
# Build the OAuth handler and attach the access token.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# Create the stream and filter on the "#picture" hashtag; whatever filter()
# returns is bound to `tweets`.
twitterStream = Stream(auth, listener())
tweets=twitterStream.filter(track=["#picture"])
Tweepy passes the raw text data to tweepy.StreamListener's on_data() method, which is used for handling the raw data from the API (so you need to parse the JSON string and construct a tweepy.Status object yourself).
If you only handle normal status objects, you'd better use the on_status() method rather than on_data(). This method takes Tweepy's normal Status object as an argument, so you can use this status object as usual.
So following code
class MyStreamListener(tweepy.StreamListener):
    """Listener that prints each status and returns False on error code 420."""

    def on_status(self, status):
        """Print a marker line, the type of ``status``, and its text."""
        print('#on_status')
        print(type(status))
        print(status.text)

    def on_error(self, error_code):
        """Print the error code; return False for 420, otherwise None."""
        print('#on_error')
        print(error_code)
        if error_code != 420:
            return None
        return False
# NOTE(review): `api` is not defined in this snippet; presumably an
# authenticated tweepy.API instance created earlier — confirm.
stream = tweepy.Stream(api.auth, MyStreamListener())
stream.filter(track=["#picture"])
will print like this:
#on_status
<class 'tweepy.models.Status'>
test1! #picture
#on_status
<class 'tweepy.models.Status'>
This is test picture tweet2! #picture
See also: Streaming With Tweepy — tweepy 3.6.0 documentation

Dictionary key error when parsing tweets with twitter api

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import pw
import json
# Credentials are read from attributes of the imported `pw` module.
access_token = pw.access_token
access_token_secret = pw.access_token_secret
consumer_key = pw.consumer_key
consumer_secret = pw.consumer_secret
class StdOutListener(StreamListener):
    """Listener that decodes each payload as JSON and prints its 'user' entry."""

    def on_data(self, data):
        """Parse ``data`` with json.loads, print the 'user' value, return True."""
        parsed = json.loads(data)
        user = parsed['user']
        print(user)
        return True

    def on_error(self, status):
        """Print the status value passed to the error callback."""
        print(status)
# Build the OAuth handler and attach the access token.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener())
# `track` takes a list of phrases. A bare string is iterated character by
# character when the phrases are joined, so "music" would be sent as the
# five one-letter terms m, u, s, i, c.
stream.filter(track=["music"])
So I just followed a simple tutorial to process tweets. I'm just trying to parse the tweets; however, when I use print(dicto['user']) it prints some and then throws a KeyError. The weird thing is that it's always after the 48th one. If I simply print the whole dictionary then it happily prints away. Right after the 48th piece of information it says Traceback (most recent call last).
I'm a little confused what's going on here.

Python API Streaming, write new file after certain size

I have a python script that maintains an open connection to the Twitter Streaming API, and writes the data into a json file. Is it possible to write to a new file, without breaking the connection, after the current file being written reaches a certain size? For example, I just streamed data for over 1 week, but all the data is contained in a single file (~2gb) making it slow to parse. If I could write to a new file after, say 500mb, then I would have 4 smaller files (e.g. dump1.json, dump2.json etc) to parse instead of one large one.
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
# Consumer/access tokens for the Twitter API (placeholder values)
consumer_key = '-----'
consumer_secret = '-----'
access_token = '-----'
access_secret = '-----'
# Build the OAuth handler, attach the access token, and create an API client from it
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
# Define a StreamListener subclass that appends each raw payload to a JSON file
class MyListener(StreamListener):
    """Listener that appends every raw stream payload to Raw_tweets.json."""

    def on_data(self, data):
        """Append ``data`` to the output file.

        Returns True whether or not the write succeeds; a failed write is
        reported on standard output instead of being raised.
        """
        try:
            # Raw string: in a normal literal the backslashes of a Windows
            # path start escape sequences ('\x' must be followed by hex digits).
            with open(r'G:\xxxx\Raw_tweets.json', 'a') as f:
                f.write(data)
        except Exception as e:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit still propagate and the script can be stopped.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the status value and return True."""
        print(status)
        return True
# NOTE(review): presumably [west_lon, south_lat, east_lon, north_lat] — confirm
# against the streaming API's `locations` parameter documentation.
bounding_box = [-77.2157,38.2036,-76.5215,39.3365]#filtering by location
# Only used by the commented-out keyword filter at the end of this script.
keyword_list = ['']#filtering by keyword
twitter_stream = Stream(auth, MyListener())
twitter_stream.filter(locations=bounding_box) # Filter Tweets in stream by location bounding box
#twitter_stream.filter(track=keyword_list) # Filter Tweets in stream by keyword
Since you re-open your file every time, it is rather simple - use an index in file name and advance it if your file size reaches threshold
import os


class MyListener(StreamListener):
    """Listener that appends payloads to numbered files, rolling over by size."""

    # Roll over to the next file once the current one exceeds this many bytes
    # (500 MiB, the size suggested in the question).
    MAX_FILE_BYTES = 500 * 2**20
    # Raw string so the backslashes of the Windows path are kept literally.
    FILE_TEMPLATE = r'G:\xxxx\Raw_tweets{}.json'

    def __init__(self, api=None):
        """Initialise the base listener and start at file index 0."""
        super(MyListener, self).__init__(api)
        self._file_index = 0

    def on_data(self, data):
        """Append ``data`` to the current output file and return True.

        A failed write is reported on standard output instead of being raised.
        """
        tweets_file = self.FILE_TEMPLATE.format(self._file_index)
        # Skip past files that are already over the limit. Because the check
        # looks at what is on disk, it also finds the right file after the
        # script has been restarted with the index back at 0.
        while (os.path.exists(tweets_file)
               and os.stat(tweets_file).st_size > self.MAX_FILE_BYTES):
            self._file_index += 1
            tweets_file = self.FILE_TEMPLATE.format(self._file_index)
        try:
            with open(tweets_file, 'a') as f:
                f.write(data)
        except Exception as e:
            print("Error on_data: %s" % str(e))
        return True
The loop also takes care of your app being restarted: it skips past any existing files that have already reached the size limit.

Retrieve tweets tweepy

How can I retrieve only my tweets with a stream? I test that but I don't see my tweets.
My first attempt:
# First attempt: start the user stream with _with='followings', then call
# filter() with no arguments.
# NOTE(review): if userstream() blocks while streaming, the filter() call
# below is never reached — confirm against the tweepy version in use.
streamingAPI = tweepy.streaming.Stream(auth, CustomStreamListener())
streamingAPI.userstream(_with='followings')
streamingAPI.filter()
My second attempt:
# Second attempt: filter the stream on a single user ID passed via `follow`.
streamingAPI = tweepy.streaming.Stream(auth, CustomStreamListener())
streamingAPI.filter(follow= ['2466458114'])
Thanks a lot.
If you want stream only tweets on your user, you can use the following lines:
from tweepy import StreamListener
from tweepy import Stream
import tweepy
# Twitter API credentials (left empty here; fill in before running).
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
# Build the OAuth handler, attach the access token, and create an API client from it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
class CustomStreamListener(StreamListener):
    """Listener that prints raw payloads and error statuses."""

    def on_data(self, data):
        """Print the raw payload."""
        print(data)

    def on_error(self, status):
        """Print the status value passed to the error callback."""
        print(status)
if __name__ == '__main__':
    # Create the listener, bind it to a stream, and follow a single user ID.
    listener = CustomStreamListener()
    twitterStream = Stream(auth, listener)
    twitterStream.filter(follow=['2466458114'])
In your question, you said that you can't see your tweets. I don't know whether this is clear, but just to be sure: with streaming you can see only "real time" tweets. So even with my code, if you don't tweet anything, you won't see anything.
UPDATE AFTER CHAT IN COMMENTS
Since the official Twitter API has an annoying time limitation, you can't get tweets older than a week.
For this task I suggest you use this great Python library.
It lets you get as many tweets as you want, written whenever you want.
As documentation says, you can simply use it in this way:
tweetCriteria = got.manager.TweetCriteria().setUsername('<user_without_#>').setSince("2015-05-01").setUntil("2015-09-30")
If you are using Python 2.x you can use got; if you are using Python 3.x, use got3 instead.
I prepared an example in Python 3:
from getOldTweets import got3
# Select tweets by user name within a date range.
tweetCriteria = got3.manager.TweetCriteria().setUsername('barackobama').setSince("2015-09-01").setUntil("2015-09-30")
# Fetch the tweets matching the criteria and print the text of each one.
tweets_list = got3.manager.TweetManager.getTweets(tweetCriteria)
for tweet in tweets_list:
    print(tweet.text)
Let me know.

Blank space in Twitter direct messages with tweepy from JSON API

So I have the following python code which receives notification of received direct messages via Tweepy:
#!/usr/bin/env python
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import simplejson as json
# Twitter API credentials (placeholder values).
consumer_key = "secret"
consumer_secret = "secret"
access_token = "secret"
access_token_secret = "secret"
class StdOutListener(StreamListener):
def __init__(self):
print "init"
def on_connect(self):
print "Connected"
def on_disconnect(self, status):
print ("Disconnected", status)
def on_direct_message(self, status):
print ("on_direct_message", status)
def on_data(self, status):
# print ("on_data", status)
decoded = json.loads(status)
## grab the direct message
directMessage = decoded['direct_message']
message = directMessage.get('text', None)
message.strip()
print "message:*", message, "*"
return True
def on_error(self, status):
print ("on_error", status)
if __name__ == '__main__':
    # Connect to Twitter: build the OAuth handler and attach the access token
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # Create the StreamListener and start the user stream
    print ("Init Stream")
    l = StdOutListener()
    stream = Stream(auth, l)
    stream.userstream()
When it runs and I send the direct message "test message" I get the following printed to the console:
message:* test message *
i.e. the message was received and parsed but padded with a space at either end, which message.strip() didn't even correct. If I uncomment the print directMessage line and view the JSON sent by Twitter, there is no space.
I cannot work out if there is a problem with my JSON editing or usage of tweepy or something else.
I've also tried using the json package as well as simplejson.

Categories