I am using Tweepy to capture streaming tweets based off of the hashtag #WorldCup, as seen by the code below. It works as expected.
class StdOutListener(StreamListener):
    """Stream listener that prints each tweet's text and hashtags to stdout."""

    def on_status(self, status):
        # Prints the text of the tweet.
        print('Tweet text: ' + status.text)
        # There are many options in the status object; hashtags can be very
        # easily accessed.  Fixed: the attribute is `entities`, not `entries`
        # (the original raised AttributeError here).
        for hashtag in status.entities['hashtags']:
            print(hashtag['text'])
        return True  # Fixed: was lowercase `true`, a NameError at runtime.

    def on_error(self, status_code):
        # Report the HTTP error code received from the streaming endpoint.
        print('Got an error with status code: ' + str(status_code))
        return True  # To continue listening

    def on_timeout(self):
        print('Timeout...')
        return True  # To continue listening
# Script entry point: authenticate and open a filtered stream.
if __name__ == '__main__':
    listener = StdOutListener()
    # NOTE(review): consumer_key / consumer_secret / access_token /
    # access_token_secret are assumed to be defined earlier in the real
    # script -- confirm before running.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, listener)
    # Track the #WorldCup hashtag plus tweets from one followed account id.
    stream.filter(follow=[38744894], track=['#WorldCup'])
Because this is a hot hashtag right now, searches don't take too long to catch the maximum amount of tweets that Tweepy lets you get in one transaction. However, if I was going to search on #StackOverflow, it might be much slower, and therefore, I'd like a way to kill the stream. I could do this on several parameters, such as stopping after 100 tweets, stopping after 3 minutes, after a text output file has reached 150 lines, etc. I do know that the socket timeout time isn't used to achieve this.
I have taken a look at this similar question:
Tweepy Streaming - Stop collecting tweets at x amount
However, it appears to not use the streaming API. The data that it collects is also very messy, whereas this text output is clean.
Can anyone suggest a way to stop Tweepy (when using the stream in this method), based on some user input parameter, besides a keyboard interrupt?
Thanks
I solved this, so I'm going to be one of those internet heroes that answers their own question.
This is achieved by using static Python variables for the counter and for the stop value (e.g. stop after you grab 20 tweets). This is currently a geolocation search, but you could easily swap it for a hashtag search by using the getTweetsByHashtag() method.
#!/usr/bin/env python
from tweepy import (Stream, OAuthHandler)
from tweepy.streaming import StreamListener
class Listener(StreamListener):
    """Stream listener that stops itself after a fixed number of tweets.

    The counter and the stop threshold are class ("static") attributes so a
    single limit is shared by every Listener instance on the stream.
    """

    tweet_counter = 0  # Static variable: tweets seen so far.

    def login(self):
        """Build an authenticated OAuth handler.

        TODO: fill in real credentials.  (Fixed: the original lines had
        empty right-hand sides, which is a SyntaxError.)
        """
        CONSUMER_KEY = ''
        CONSUMER_SECRET = ''
        ACCESS_TOKEN = ''
        ACCESS_TOKEN_SECRET = ''
        auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

    def on_status(self, status):
        """Print one tweet; return False (disconnect) once the limit is hit."""
        Listener.tweet_counter += 1
        print(str(Listener.tweet_counter) + '. Screen name = "%s" Tweet = "%s"'
              % (status.author.screen_name, status.text.replace('\n', ' ')))
        if Listener.tweet_counter < Listener.stop_at:
            return True
        print('Max num reached = ' + str(Listener.tweet_counter))
        return False  # Returning False tells tweepy to close the stream.

    def getTweetsByGPS(self, stop_at_number, latitude_start, longitude_start, latitude_finish, longitude_finish):
        """Stream tweets inside a bounding box, stopping after stop_at_number."""
        try:
            Listener.stop_at = stop_at_number  # Create static variable
            auth = self.login()
            streaming_api = Stream(auth, Listener(), timeout=60)  # Socket timeout value
            streaming_api.filter(follow=None, locations=[latitude_start, longitude_start, latitude_finish, longitude_finish])
        except KeyboardInterrupt:
            print('Got keyboard interrupt')

    def getTweetsByHashtag(self, stop_at_number, hashtag):
        """Stream tweets matching a hashtag, stopping after stop_at_number.

        Fixed: the original assigned `Listener.stopAt`, but on_status checks
        `Listener.stop_at`, so the hashtag search either reused the GPS
        search's limit or raised AttributeError.
        """
        try:
            Listener.stop_at = stop_at_number
            auth = self.login()
            streaming_api = Stream(auth, Listener(), timeout=60)
            streaming_api.filter(track=[hashtag])
        except KeyboardInterrupt:
            print('Got keyboard interrupt')
# Drive the search: grab 20 geolocated tweets from the Atlanta bounding box.
listener = Listener()
listener.getTweetsByGPS(20, -84.395198, 33.746876, -84.385585, 33.841601) # Atlanta area.
The above solution was helpful in getting tweets by hashtag, even though there is a small error in the definition of the getTweetsByHashtag function: you had used Listener.stopAt instead of Listener.stop_at = stop_at_number.
I have tweaked the code a little bit, so you can easily kill the code for a specified number of seconds.
I defined a new __init__ function to help tweak the seconds, and an "on_data" handler, which receives more information than the on_status function.
Enjoy:
import time

from tweepy import (Stream, OAuthHandler)
from tweepy.streaming import StreamListener
class Listener(StreamListener):
    """Stream listener that stops after a tweet-count limit OR a time limit.

    tweet_counter and stop_at are class ("static") attributes shared by all
    instances; the time limit is per-instance and set in __init__.
    """

    tweet_counter = 0  # Static variable: tweets seen so far.

    def login(self):
        """Build an authenticated OAuth handler.

        TODO: fill in real credentials.  (Fixed: the original lines had
        empty right-hand sides, which is a SyntaxError.)
        """
        CONSUMER_KEY = ''
        CONSUMER_SECRET = ''
        ACCESS_TOKEN = ''
        ACCESS_TOKEN_SECRET = ''
        auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

    def __init__(self, time_limit=8):
        # Wall-clock start; the stream stops time_limit seconds after this.
        self.start_time = time.time()
        self.limit = time_limit
        super(Listener, self).__init__()

    def on_data(self, data):
        """Print each raw message until the count or the time limit is hit."""
        Listener.tweet_counter += 1
        if (time.time() - self.start_time) < self.limit and Listener.tweet_counter < Listener.stop_at:
            print(str(Listener.tweet_counter)+data)
            return True
        else:
            print("Either Max number reached or time limit up at:"+ str(Listener.tweet_counter)+" outputs")
            # Fixed: the original called self.saveFile.close() here, but no
            # saveFile attribute is ever created, so stopping raised
            # AttributeError instead of ending cleanly.
            return False

    def getTweetsByGPS(self, stop_at_number, latitude_start, longitude_start, latitude_finish, longitude_finish):
        """Stream tweets inside a bounding box, stopping after stop_at_number."""
        try:
            Listener.stop_at = stop_at_number  # Create static variable
            auth = self.login()
            streaming_api = Stream(auth, Listener(), timeout=60)  # Socket timeout value
            streaming_api.filter(follow=None, locations=[latitude_start, longitude_start, latitude_finish, longitude_finish])
        except KeyboardInterrupt:
            print('Got keyboard interrupt')

    def getTweetsByHashtag(self, stop_at_number, hashtag):
        """Stream tweets matching a hashtag, stopping after stop_at_number."""
        try:
            Listener.stop_at = stop_at_number
            auth = self.login()
            streaming_api = Stream(auth, Listener(), timeout=60)
            streaming_api.filter(track=[hashtag])
        except KeyboardInterrupt:
            print('Got keyboard interrupt')
# Grab up to 1000 tweets containing "hi" (or stop at the 8-second default
# time limit from __init__, whichever comes first).
listener = Listener()
#listener.getTweetsByGPS(20, -84.395198, 33.746876, -84.385585, 33.841601) # Atlanta area.
listener.getTweetsByHashtag(1000,"hi")
You can change the 1000 value to the maximum number of tweets you want, and the "hi" to the keyword you need to find. Under the __init__ function, change the 8-second time_limit to the value you want in seconds. So you can use it depending on what you want.
You can either set limited time and adjust the count to a very high value, or set the count of tweets needed and give a higher time value, so it can get to the count. Your choice!
Chukwu Gozie unu (God bless!)
Related
I want to build a complex network of twitter followers.
I'm using the function api.GetFriends :
def get_friend_list_by_user(user, api):
    """Return the accounts *user* follows, via the python-twitter client."""
    # repr() turns the numeric id into a string form, as the original did.
    user_id_string = repr(user.id)
    return api.GetFriends(user_id_string)
The problem is that for the same twitter users, sometimes it works and sometimes doesn't.
When I'm debugging it,
the code is dead at that part on the api.py:
if enforce_auth:
if not self.__auth:
raise TwitterError("The twitter.Api instance must be authenticated.")
if url and self.sleep_on_rate_limit:
limit = self.CheckRateLimit(url)
if limit.remaining == 0:
try:
stime = max(int(limit.reset - time.time()) + 10, 0)
logger.debug('Rate limited requesting [%s], sleeping for [%s]', url, stime)
time.sleep(stime)
except ValueError:
pass
if not data:
data = {}
The stime value is 443.
1.The api: stream.filter(). I read the documentation which said that all parameters can be optional. However, when I left it empty, it won't work.
Still the question with api. It is said that if I write code like below:
twitter_stream.filter(locations = [-180,-90, 180, 90])
It can filter all tweets with geological information. However, when I check the json data, I still find many tweets, the value of their attribute geo are still null.
3. I tried to use the stream to get as many tweets as possible. However, it is said that it can only get tweets in real time. Is there any parameter to set the time window,
like to collect tweets from 2013 to 2015
4.I tried to collect data through users and their followers and continue the same step until I get as many tweets as I want. So my code is like below:
import tweepy
import chardet
import json
import sys
#set one global list to store all user_names
# users_unused: screen names still queued for crawling (seeded with one);
# users_used: screen names already crawled.
users_unused = ["Raithan8"]
users_used = []
def process_or_store(tweet):
    """Emit the tweet as a JSON string on stdout (placeholder for storage)."""
    serialized = json.dumps(tweet)
    print(serialized)
# Twitter API credentials (redacted in the post).
# NOTE(review): the empty right-hand sides below are a SyntaxError as posted;
# real values must be supplied before this script can run.
consumer_key =
consumer_secret =
access_token =
access_token_secret =
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True makes tweepy sleep through rate-limit windows
# instead of raising RateLimitError (see the discussion below).
api = tweepy.API(auth, wait_on_rate_limit=True)
def getAllTweets():
    """Crawl the next queued user and return all of their tweets.

    Pops the first screen name off the global users_unused queue, moves it to
    users_used, enqueues that user's unseen friends and followers, then pages
    through the user's timeline 200 tweets at a time.
    """
    screen_name = users_unused[0]
    users_unused.remove(screen_name)
    users_used.append(screen_name)
    print("this is the current user: " + screen_name)
    # Enqueue friends and followers we have not seen yet.
    # Fixed: the original tested the User *object* for membership in lists of
    # screen-name strings, so the dedup check could never match.
    for friend in tweepy.Cursor(api.friends, screen_name=screen_name).items():
        if friend.screen_name not in users_unused and friend.screen_name not in users_used:
            users_unused.append(friend.screen_name)
    for follower in tweepy.Cursor(api.followers, screen_name=screen_name).items():
        if follower.screen_name not in users_unused and follower.screen_name not in users_used:
            users_unused.append(follower.screen_name)
    print(users_unused)
    print(users_used)
    alltweets = []
    # tweepy limits at most 200 tweets each time
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)
    if not alltweets:
        return alltweets
    oldest = alltweets[-1].id - 1
    # Page backwards until a request returns no more tweets.
    # Fixed: the original looped `while(len(new_tweets) <= 0)`, which never
    # executes after a non-empty first page, so only 200 tweets were fetched.
    while len(new_tweets) > 0:
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        oldest = alltweets[-1].id - 1
    return alltweets
def storeTweets(alltweets, file_name="tweets.json"):
    """Append each geotagged tweet's raw JSON to file_name, one per line.

    Tweets whose 'geo' field is null (None after parsing) are skipped.
    """
    # Open the file once for the whole batch instead of re-opening it for
    # every tweet, as the original did inside the loop.
    with open(file_name, "a") as f:
        for tweet in alltweets:
            json_data = tweet._json
            if json_data['geo'] is not None:
                f.write(json.dumps(json_data))
                f.write("\n")
if __name__ == "__main__":
    # Keep crawling and storing until the queue of unseen users is empty.
    while(1):
        if not users_unused:
            break
        storeTweets(getAllTweets())
I don't know why it runs so slowly. Maybe it is mainly because I initialize the tweepy API as below
api = tweepy.API(auth, wait_on_rate_limit=True)
But if I don't initialize it in this way, it will raise error below:
raise RateLimitError(error_msg, resp)
tweepy.error.RateLimitError: [{'message': 'Rate limit exceeded', 'code': 88}]
2) There's a difference between a tweet with coordinates and filtering by location.
Filtering by location means that the sender is located in the range of your filter. If you set it globally twitter_stream.filter(locations = [-180,-90, 180, 90]) it will return tweets for people who set their country name in their preferences.
If you need to filter by coordinates (a tweet that has a coordinates) you can take a look at my blog post. But basically you need to set a listener and then check if the tweet have some coordinates.
3 and 4) Twitter's Search API and Twitter's Streaming API are different in many ways and restrictions about rate limits (Tweepy) and Twitter rate limit.
You have a limitation about how many tweets you want to get (in the past).
Check again Tweepy API because wait_on_rate_limit set as true just wait that your current limit window is available again. That's why it's "slow" as you said.
However using streaming API doesn't have such restrictions.
My environment is below:
Python Python 3.6.5
sqlite3 3.28.0
import tweepy
import sqlite3
# Set up the authentication keys.
consumer_key = \
"XXXXXXXX"
consumer_secret = "XXXXXXXX"
access_token = "XXXXXXXX"
access_token_secret = "XXXXXXXX"
# OAuth authentication.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
class Database:
    """Thin wrapper holding a sqlite3 connection and a shared cursor."""

    def __init__(self, dbpath='db.sqlite3'):
        # dbpath is now a parameter (default unchanged, so existing callers
        # are unaffected); this lets tests pass ':memory:' and lets callers
        # point at other database files.
        self.dbpath = dbpath
        self.conn = sqlite3.connect(self.dbpath)
        self.c = self.conn.cursor()
# Module-level connection shared by the helper functions below.
db = Database()
def output_users_from_db():
    """Return a cursor iterating (name, comment) rows from twitter_users."""
    query = 'select name, comment from twitter_users'
    return db.c.execute(query)
def update_comment(name, comment='null'):
    """Set twitter_users.comment for the row matching *name*, then commit."""
    sql = "update twitter_users set comment = ? where name = ?"
    db.c.execute(sql, (comment, name))
    db.conn.commit()
if __name__ == "__main__":
    api = tweepy.API(auth)
    # Cursor of (name, comment) rows from the database.
    users_info_from_db = output_users_from_db()
    # NOTE(review): update_comment() reuses the same shared cursor (db.c)
    # that this loop is iterating; calling it inside the loop would replace
    # the cursor's pending result set -- presumably why the variant shown
    # later only prints the first account.  Confirm against sqlite3 docs.
    for i, user_info_on_db in enumerate(users_info_from_db):
        print(user_info_on_db[0])
        time_line = api.user_timeline(screen_name=user_info_on_db[0])
        for i, info in enumerate(time_line):
            # Below print functions can out put all of characters from twitter
            print(user_info_on_db[0]) # user account
            print(info.text) # tweet
            break
Code above works. But if I write code below,
time_line = api.user_timeline(screen_name=user_info_on_db[0])
for i, info in enumerate(time_line):
# Below print functions can out put all of characters from twitter
print(user_info_on_db[0]) # user account
print(info.text) # tweet
update_comment(user_info_on_db[0], comment=info.text)
break
print() only works once; it cannot print the 2nd account's tweet. Why is it that when the code includes update_comment(user_info_on_db[0], comment=info.text), print() cannot output the tweet of the 2nd account?
I am trying to extract tweet locations from a specific area with python using tweepy + writing it into a csv-file.
I am not very much into Python, but I could manage to put together the following script, which kind of works:
import json
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
#Enter Twitter API Key information
consumer_key = 'cons_key'
consumer_secret = 'cons_secret'
access_token = 'acc_token'
access_secret = 'acc-secret'
# Output CSV of tweet coordinates; write the header row up front.
# NOTE(review): a raw string (r"C:\...") would be safer for this Windows
# path, although \P and \O happen not to be escape sequences.
file = open("C:\Python27\Output2.csv", "w")
file.write("X,Y\n")
data_list = []  # parsed tweet JSON objects collected so far
count = 0  # number of geotagged tweets written to the CSV
class listener(StreamListener):
def on_data(self, data):
global count
#How many tweets you want to find, could change to time based
if count <= 100:
json_data = json.loads(data)
coords = json_data["coordinates"]
if coords is not None:
print coords["coordinates"]
lon = coords["coordinates"][0]
lat = coords["coordinates"][1]
data_list.append(json_data)
file.write(str(lon) + ",")
file.write(str(lat) + "\n")
count += 1
return True
else:
file.close()
return False
def on_error(self, status):
print status
# Authenticate and start streaming tweets from the given bounding box.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
twitterStream = Stream(auth, listener())
#What you want to search for here
# locations takes [west_lon, south_lat, east_lon, north_lat] -- presumably
# per the streaming API's bounding-box format; verify against the docs.
twitterStream.filter(locations=[11.01,47.85,12.09,48.43])
the problem is, that it extracts the coordinates very slowly (like 10 entries per 30 minutes). Would there be a way to make this faster?
How can I add the timestamps for each tweet?
Is there way to make sure to retrieve all tweets possible for the specific region (I guess the max is all tweets of the past week)?
thanks very much in advance!
Twitter’s standard streaming API provides a 1% sample of all the Tweets posted. In addition, very few Tweets have location data added to them. So, I’m not surprised that you’re only getting a small number of Tweets in a 30 minute timespan for one specific bounding box. The only way to improve the volume would be to pay for the enterprise PowerTrack API.
Tweets all contain a created_at value which is the time stamp you’ll want to record.
I'm trying to modify this script to only save the JSONs of tweets that have a location attached to them and am running into an issue with Python where checking that something isn't null doesn't seem to work. Has Key isn't working correctly, because they all have the key, most of them are just 'null'. Is not None isn't working because Python thinks null and None are different and checking it as text to not be "null" also didn't work. Does anyone have a clever idea on how to solve this?
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import pymongo
import tweepy
import json
#Variables that contains the user credentials to access Twitter API
access_key = '' #redacted for privacy and such
access_secret = ''
consumer_key = ''
consumer_secret = ''
#Runs auth to Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
# REST client handle; passed into CustomStreamListener below.
api = tweepy.API(auth)
#This is a basic listener that will print incoming data to stdout
class StdOutListener(StreamListener):
    def on_data(self, data):
        # Echo the raw JSON message exactly as received (Python 2 print).
        print data
        return True  # keep the stream alive
    def on_error(self, status):
        # Print the HTTP error status; the implicit None return leaves
        # tweepy's default error handling in place.
        print status
#Customizes the stream and saves text and lang to databases
class CustomStreamListener(tweepy.StreamListener):
    """Listener that stores geotagged tweets (text/coordinates/lang) in MongoDB."""

    def __init__(self, api):
        self.api = api
        # Fixed: super() must name THIS class, not the parent; the original
        # super(tweepy.StreamListener, self) started the MRO walk one level
        # too high and skipped tweepy.StreamListener's own __init__.
        super(CustomStreamListener, self).__init__()
        self.db = pymongo.MongoClient('localhost', 27017).crime

    def on_data(self, data):
        jd = json.loads(data)
        # Fixed: has_key('coordinates') is true even when the value is JSON
        # null (None after parsing), so non-geotagged tweets were inserted
        # too -- exactly the problem described above.  Test the value itself.
        # (dict.has_key is also Python-2-only; .get works on both.)
        if jd.get('coordinates') is not None:
            self.db.tweets.insert( { 'text' : jd['text'], 'coordinates' : jd['coordinates'], 'lang' : jd['lang'] } )
        return True  # keep listening, matching the other handlers

    def on_error(self, status_code):
        return True # Don't kill the stream

    def on_timeout(self):
        return True # Don't kill the stream
#Calls on StreamListerner and provides specifications of tracking
l = tweepy.streaming.Stream(auth, CustomStreamListener(api))
l.filter(track=['guns'])
You could try something like checking the length of the string:
if len( jd['coordinates'] ) > 1:
self.db.tweets.insert( { 'text' : jd['text'], 'coordinates' : jd['coordinates'], 'lang' : jd['lang'] } )