I have a list of tweet IDs whose text content I would like to download. Is there an easy way to do this, preferably through a Python script? I had a look at libraries like Tweepy and things don't appear to be that simple, and downloading the tweets manually is out of the question since my list is very long.
You can access specific tweets by their id with the statuses/show/:id API route. Most Python Twitter libraries follow the exact same patterns, or offer 'friendly' names for the methods.
For example, Twython offers several show_* methods, including Twython.show_status() that lets you load specific tweets:
CONSUMER_KEY = "<consumer key>"
CONSUMER_SECRET = "<consumer secret>"
OAUTH_TOKEN = "<application key>"
OAUTH_TOKEN_SECRET = "<application secret"
twitter = Twython(
CONSUMER_KEY, CONSUMER_SECRET,
OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
tweet = twitter.show_status(id=id_of_tweet)
print(tweet['text'])
and the returned dictionary follows the Tweet object definition given by the API.
The tweepy library uses tweepy.API.get_status():
import tweepy

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)

tweet = api.get_status(id_of_tweet)
print(tweet.text)
This returns a slightly richer object, but its attributes again reflect the published API.
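For example, beyond .text you can read the other documented Tweet fields as attributes, or fall back to the raw dictionary. A small sketch, reusing the api and id_of_tweet names from the snippet above (field availability depends on the tweet):

# Sketch: a few common fields of the returned Status object.
tweet = api.get_status(id_of_tweet)
print(tweet.id)                # numeric tweet ID
print(tweet.created_at)        # creation time as a datetime
print(tweet.user.screen_name)  # author's handle
print(tweet._json['text'])     # the underlying Tweet dictionary, if you prefer dict access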
Sharing my work, which was vastly accelerated by the previous answers (thank you). This Python 2.7 script fetches the text for tweet IDs stored in a file. Adjust get_tweet_id() for your input data format; the original is configured for the data at https://github.com/mdredze/twitter_sandy
Update April 2018: responding late to a bug report (thank you). This script no longer discards every 100th tweet ID (that was my bug). Please note that if a tweet is unavailable for whatever reason, the bulk fetch silently skips it. The script now warns if the response size differs from the request size.
'''
Gets text content for tweet IDs
'''

# standard
from __future__ import print_function
import getopt
import logging
import os
import sys
# import traceback

# third-party: `pip install tweepy`
import tweepy

# global logger level is configured in main()
Logger = None

# Generate your own at https://apps.twitter.com/app
CONSUMER_KEY = 'Consumer Key (API key)'
CONSUMER_SECRET = 'Consumer Secret (API Secret)'
OAUTH_TOKEN = 'Access Token'
OAUTH_TOKEN_SECRET = 'Access Token Secret'

# batch size depends on Twitter limit, 100 at this time
batch_size = 100


def get_tweet_id(line):
    '''
    Extracts and returns tweet ID from a line in the input.
    '''
    (tagid, _timestamp, _sandyflag) = line.split('\t')
    (_tag, _search, tweet_id) = tagid.split(':')
    return tweet_id


def get_tweets_single(twapi, idfilepath):
    '''
    Fetches content for tweet IDs in a file one at a time,
    which means a ton of HTTPS requests, so NOT recommended.

    `twapi`: Initialized, authorized API object from Tweepy
    `idfilepath`: Path to file containing IDs
    '''
    # process IDs from the file
    with open(idfilepath, 'rb') as idfile:
        for line in idfile:
            tweet_id = get_tweet_id(line)
            Logger.debug('get_tweets_single: fetching tweet for ID %s', tweet_id)
            try:
                tweet = twapi.get_status(tweet_id)
                print('%s,%s' % (tweet_id, tweet.text.encode('UTF-8')))
            except tweepy.TweepError as te:
                Logger.warn('get_tweets_single: failed to get tweet ID %s: %s', tweet_id, te.message)
                # traceback.print_exc(file=sys.stderr)
        # for
    # with


def get_tweet_list(twapi, idlist):
    '''
    Invokes bulk lookup method.
    Raises an exception if rate limit is exceeded.
    '''
    # fetch as little metadata as possible
    tweets = twapi.statuses_lookup(id_=idlist, include_entities=False, trim_user=True)
    if len(idlist) != len(tweets):
        Logger.warn('get_tweet_list: unexpected response size %d, expected %d', len(tweets), len(idlist))
    for tweet in tweets:
        print('%s,%s' % (tweet.id, tweet.text.encode('UTF-8')))


def get_tweets_bulk(twapi, idfilepath):
    '''
    Fetches content for tweet IDs in a file using bulk request method,
    which vastly reduces number of HTTPS requests compared to above;
    however, it does not warn about IDs that yield no tweet.

    `twapi`: Initialized, authorized API object from Tweepy
    `idfilepath`: Path to file containing IDs
    '''
    # process IDs from the file
    tweet_ids = list()
    with open(idfilepath, 'rb') as idfile:
        for line in idfile:
            tweet_id = get_tweet_id(line)
            Logger.debug('Enqueing tweet ID %s', tweet_id)
            tweet_ids.append(tweet_id)
            # API limits batch size
            if len(tweet_ids) == batch_size:
                Logger.debug('get_tweets_bulk: fetching batch of size %d', batch_size)
                get_tweet_list(twapi, tweet_ids)
                tweet_ids = list()
    # process remainder
    if len(tweet_ids) > 0:
        Logger.debug('get_tweets_bulk: fetching last batch of size %d', len(tweet_ids))
        get_tweet_list(twapi, tweet_ids)


def usage():
    print('Usage: get_tweets_by_id.py [options] file')
    print('    -s (single) makes one HTTPS request per tweet ID')
    print('    -v (verbose) enables detailed logging')
    sys.exit()


def main(args):
    logging.basicConfig(level=logging.WARN)
    global Logger
    Logger = logging.getLogger('get_tweets_by_id')
    bulk = True
    try:
        opts, args = getopt.getopt(args, 'sv')
    except getopt.GetoptError:
        usage()
    for opt, _optarg in opts:
        if opt in ('-s',):
            bulk = False
        elif opt in ('-v',):
            Logger.setLevel(logging.DEBUG)
            Logger.debug("main: verbose mode on")
        else:
            usage()
    if len(args) != 1:
        usage()
    idfile = args[0]
    if not os.path.isfile(idfile):
        print('Not found or not a file: %s' % idfile, file=sys.stderr)
        usage()

    # connect to twitter
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
    api = tweepy.API(auth)

    # hydrate tweet IDs
    if bulk:
        get_tweets_bulk(api, idfile)
    else:
        get_tweets_single(api, idfile)


if __name__ == '__main__':
    main(sys.argv[1:])
You can access tweets in bulk (up to 100 at a time) with the statuses/lookup endpoint: https://dev.twitter.com/rest/reference/get/statuses/lookup
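In tweepy this endpoint is exposed as API.statuses_lookup(). A minimal sketch, assuming api is an authenticated tweepy.API instance and tweet_ids is a list of at most 100 IDs (older tweepy versions name the first parameter id_, newer ones id, so it is passed positionally here):

# Sketch: bulk-hydrate up to 100 tweet IDs in a single request.
tweets = api.statuses_lookup(tweet_ids, include_entities=False, trim_user=True)
for tweet in tweets:
    print('%s,%s' % (tweet.id, tweet.text))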
I don't have enough reputation to add an actual comment, so sadly this is the way to go:
I found a bug and an odd thing in chrisinmtown's answer:
Every 100th tweet will be skipped due to the bug. Here is a simple fix:
if len(tweet_ids) < 100:
    tweet_ids.append(tweet_id)
else:
    tweet_ids.append(tweet_id)
    get_tweet_list(twapi, tweet_ids)
    tweet_ids = list()
Using the following is better, since it keeps working even past the rate limit:
api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
I am trying to stream data from Twitter to an AWS S3 bucket. The good news is I can get the data to stream to my bucket, but the data arrives in roughly 20 KB chunks (I think this may be due to some Firehose settings) and it is not being saved as JSON even though I specified that in my Python code using json.loads. Rather than being saved as JSON, the objects in my S3 bucket have no file extension and just a long string of alphanumeric characters as names. I think it may be something to do with the parameters used in client.put_record().
Any help is greatly appreciated!
Please find my code below, which I got from github here.
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import boto3
import time

# Variables that contain the user credentials to access the Twitter API
consumer_key = "MY_CONSUMER_KEY"
consumer_secret = "MY_CONSUMER_SECRET"
access_token = "MY_ACCESS_TOKEN"
access_token_secret = "MY_SECRET_ACCESS_TOKEN"


# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        tweet = json.loads(data)
        try:
            if 'extended_tweet' in tweet.keys():
                # print(tweet['text'])
                message_lst = [str(tweet['id']),
                               str(tweet['user']['name']),
                               str(tweet['user']['screen_name']),
                               tweet['extended_tweet']['full_text'],
                               str(tweet['user']['followers_count']),
                               str(tweet['user']['location']),
                               str(tweet['geo']),
                               str(tweet['created_at']),
                               '\n'
                               ]
                message = '\t'.join(message_lst)
                print(message)
                client.put_record(
                    DeliveryStreamName=delivery_stream,
                    Record={
                        'Data': message
                    }
                )
            elif 'text' in tweet.keys():
                # print(tweet['text'])
                message_lst = [str(tweet['id']),
                               str(tweet['user']['name']),
                               str(tweet['user']['screen_name']),
                               tweet['text'].replace('\n', ' ').replace('\r', ' '),
                               str(tweet['user']['followers_count']),
                               str(tweet['user']['location']),
                               str(tweet['geo']),
                               str(tweet['created_at']),
                               '\n'
                               ]
                message = '\t'.join(message_lst)
                print(message)
                client.put_record(
                    DeliveryStreamName=delivery_stream,
                    Record={
                        'Data': message
                    }
                )
        except (AttributeError, Exception) as e:
            print(e)
        return True

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # tweets = Table('tweets_ft', connection=conn)
    client = boto3.client('firehose',
                          region_name='us-east-1',
                          aws_access_key_id='MY ACCESS KEY',
                          aws_secret_access_key='MY SECRET KEY'
                          )
    delivery_stream = 'my_firehose'
    # This line filters the Twitter stream to capture data by the keywords: 'python', 'javascript', 'ruby'
    # stream.filter(track=['trump'], stall_warnings=True)
    while True:
        try:
            print('Twitter streaming...')
            stream = Stream(auth, listener)
            stream.filter(track=['brexit'], languages=['en'], stall_warnings=True)
        except Exception as e:
            print(e)
            print('Disconnected...')
            time.sleep(5)
            continue
It's possible that you have enabled S3 compression for your Firehose delivery stream. Please ensure that compression is disabled if you want to store raw JSON data in your bucket.
You could also have a transformation applied to your Firehose stream that encodes or otherwise transforms your JSON messages into some other format.
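If you prefer to verify this from code rather than the console, a describe call on the delivery stream shows both the compression format and the buffering hints. A sketch, assuming the same region and stream name ('my_firehose') used in the question:

import boto3

# Sketch: inspect the delivery stream's S3 destination settings.
client = boto3.client('firehose', region_name='us-east-1')
desc = client.describe_delivery_stream(DeliveryStreamName='my_firehose')
dest = desc['DeliveryStreamDescription']['Destinations'][0]
s3_conf = dest.get('ExtendedS3DestinationDescription') or dest.get('S3DestinationDescription', {})
print('CompressionFormat:', s3_conf.get('CompressionFormat'))  # should be 'UNCOMPRESSED' for raw JSON
print('BufferingHints:', s3_conf.get('BufferingHints'))        # controls how large/old a batch gets before delivery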
So it looks like the files were coming in with JSON formatting after all; I just had to open the files from S3 in Firefox and I could see their contents. The issue with the file sizes is due to the Firehose buffer settings: I have them set to the lowest values, which is why the files were being delivered in such small chunks.
I'm trying to download my Twitter followers and the followers of my followers.
The code seems to work, but it doesn't download all of my followers. It downloads a portion, and within that portion it appears to work well. But why not all of them?
# -*- coding: utf-8 -*-
"""
#author: Mik
"""
import csv
import time
import tweepy

# Copy the API key, the API secret, the access token and the access token secret
# from the relevant page on your Twitter app
api_key = ''
api_secret = ''
access_token = '-'
access_token_secret = ''

# You don't need to make any changes below here.
# This bit authorises you to ask for information from Twitter.
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_token_secret)

# The api object gives you access to all of the http calls that Twitter accepts
api = tweepy.API(auth)

# User we want to use as initial node
user = ''

# This creates a csv file and defines that each new entry will be in a new line
csvfile = open(user + 'network2.csv', 'w')
spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)


# This is the function that takes a node (user), looks for all its followers,
# prints them into a CSV file... and looks for the followers of each follower...
def fib(n, user, spamwriter):
    if n > 0:
        # There is a limit to the traffic you can have with the API, so you need to wait
        # a few seconds per call or after a few calls it will restrict your traffic
        # for 15 minutes. This parameter can be tweaked.
        time.sleep(40)
        # This is for private users whose followers we won't be able to see
        try:
            users = tweepy.Cursor(api.followers, screen_name=user, wait_on_rate_limit=True).items()
            for follower in users:
                spamwriter.writerow([user + ';' + follower.screen_name])
                fib(n - 1, follower.screen_name, spamwriter)
                # n defines the level of autorecurrence
        except tweepy.TweepError:
            print("Failed to run the command on that user, skipping...")


n = 2
fib(n, user, spamwriter)
If I understood correctly, you want to get the IDs of all followers of each of your followers.
Use logic like the following; it will get you details of 3,000 of your followers per 15 minutes:
import tweepy

# twitter credentials here---------------------------------------------------
auth = tweepy.OAuthHandler(your keys)
auth.set_access_token(your keys)
api = tweepy.API(auth)

iter1 = tweepy.Cursor(api.followers, screen_name='your_screen_name', count=200).pages()

for request in range(15):
    your_200_followers = next(iter1)
    for each_follower in your_200_followers:
        variable = each_follower.followers_ids
        # store the <list> variable somewhere
I have a Python script that maintains an open connection to the Twitter Streaming API and writes the data into a JSON file. Is it possible to write to a new file, without breaking the connection, after the file currently being written reaches a certain size? For example, I just streamed data for over a week, but all the data is contained in a single file (~2 GB), making it slow to parse. If I could roll over to a new file after, say, 500 MB, then I would have four smaller files (e.g. dump1.json, dump2.json, etc.) to parse instead of one large one.
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# Add consumer/access tokens for Twitter API
consumer_key = '-----'
consumer_secret = '-----'
access_token = '-----'
access_secret = '-----'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)


# Define streamlistener class to open a connection to Twitter and begin consuming data
class MyListener(StreamListener):

    def on_data(self, data):
        try:
            with open(r'G:\xxxx\Raw_tweets.json', 'a') as f:
                f.write(data)
                return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        print(status)
        return True


bounding_box = [-77.2157, 38.2036, -76.5215, 39.3365]  # filtering by location
keyword_list = ['']  # filtering by keyword

twitter_stream = Stream(auth, MyListener())
twitter_stream.filter(locations=bounding_box)  # Filter Tweets in stream by location bounding box
# twitter_stream.filter(track=keyword_list)  # Filter Tweets in stream by keyword
Since you re-open the file every time, it is rather simple: use an index in the file name and advance it when the file size reaches a threshold.
class MyListener(StreamListener):

    def __init__(self):
        super(MyListener, self).__init__()
        self._file_index = 0

    def on_data(self, data):
        tweets_file = r'G:\xxxx\Raw_tweets{}.json'.format(self._file_index)
        # 2**10 bytes is only 1 KB; use e.g. 500 * 2**20 for a 500 MB threshold
        while os.path.exists(tweets_file) and os.stat(tweets_file).st_size > 2**10:
            self._file_index += 1
            tweets_file = r'G:\xxxx\Raw_tweets{}.json'.format(self._file_index)
        ....
The loop also takes care of the case where your app is restarted.
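For completeness, here is a minimal sketch of what the full listener might look like under this approach. The class name RollingFileListener, the path pattern, and the 500 MB threshold are my own assumptions based on the question, not part of the original answer:

import os
from tweepy.streaming import StreamListener

MAX_BYTES = 500 * 2**20  # roll over to a new file after ~500 MB (assumption from the question)


class RollingFileListener(StreamListener):
    """Writes each incoming tweet to Raw_tweets<N>.json, advancing N when the file grows too large."""

    def __init__(self, basedir='.'):
        super(RollingFileListener, self).__init__()
        self._basedir = basedir
        self._file_index = 0

    def _current_path(self):
        return os.path.join(self._basedir, 'Raw_tweets{}.json'.format(self._file_index))

    def on_data(self, data):
        # advance the index until we find a file below the size threshold
        path = self._current_path()
        while os.path.exists(path) and os.stat(path).st_size > MAX_BYTES:
            self._file_index += 1
            path = self._current_path()
        try:
            with open(path, 'a') as f:
                f.write(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
            return True

    def on_error(self, status):
        print(status)
        return True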
I've been having some trouble using the GetStreamFilter function from the Python-Twitter library.
I have used this code:
import twitter
import time

consumer_key = 'myConsumerKey'
consumer_secret = 'myConsumerSecret'
access_token = 'myAccessToken'
access_token_secret = 'myAccessTokenSecret'

apiTest = twitter.Api(consumer_key,
                      consumer_secret,
                      access_token,
                      access_token_secret)

# print(apiTest.VerifyCredentials())

while (True):
    stream = apiTest.GetStreamFilter(None, ['someWord'])
    try:
        print(stream.next())
    except:
        print("No posts")
    time.sleep(3)
What I want to do is fetch new tweets that include the word "someWord", and do this every three seconds (or whenever a new one is published).
You need to create the stream only once and outside the loop.
stream = apiTest.GetStreamFilter(None, ['someWord'])

while True:
    for tweet in stream:
        print(tweet)
How about replacing your while True with a loop that extracts things out of the stream?
for tweet in apiTest.GetStreamFilter(track=['someWord']):
    print(tweet)
Please forgive me if this is a gross repeat of a question previously answered elsewhere, but I am lost on how to use the tweepy API search function. Is there any documentation available on how to search for tweets using the api.search() function?
Is there any way I can control features such as number of tweets returned, results type etc.?
The results seem to max out at 100 for some reason.
The code snippet I use is as follows:
searched_tweets = self.api.search(q=query,rpp=100,count=1000)
I originally worked out a solution based on Yuva Raj's suggestion to use additional parameters in GET search/tweets - the max_id parameter in conjunction with the id of the last tweet returned in each iteration of a loop that also checks for the occurrence of a TweepError.
However, I discovered there is a far simpler way to solve the problem using a tweepy.Cursor (see tweepy Cursor tutorial for more on using Cursor).
The following code fetches the most recent 1000 mentions of 'python'.
import tweepy
# assuming twitter_authentication.py contains each of the 4 oauth elements (1 per line)
from twitter_authentication import API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
query = 'python'
max_tweets = 1000
searched_tweets = [status for status in tweepy.Cursor(api.search, q=query).items(max_tweets)]
Update: in response to Andre Petre's comment about potential memory consumption issues with tweepy.Cursor, I'll include my original solution, replacing the single statement list comprehension used above to compute searched_tweets with the following:
searched_tweets = []
last_id = -1
while len(searched_tweets) < max_tweets:
    count = max_tweets - len(searched_tweets)
    try:
        new_tweets = api.search(q=query, count=count, max_id=str(last_id - 1))
        if not new_tweets:
            break
        searched_tweets.extend(new_tweets)
        last_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        # depending on TweepError.code, one may want to retry or wait
        # to keep things simple, we will give up on an error
        break
There's a problem in your code. Based on the Twitter documentation for GET search/tweets:
The number of tweets to return per page, up to a maximum of 100. Defaults to 15. This was
formerly the "rpp" parameter in the old Search API.
Your code should be:
CONSUMER_KEY = '....'
CONSUMER_SECRET = '....'
ACCESS_KEY = '....'
ACCESS_SECRET = '....'

auth = tweepy.auth.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)

search_results = api.search(q="hello", count=100)

for i in search_results:
    # Do whatever you need with each result here
    print(i.text)
The other questions are old and the API has changed a lot.
The easy way is with Cursor (see the Cursor tutorial). pages() returns a list of elements per page (you can limit how many pages it returns; .pages(5) only returns 5 pages):
for page in tweepy.Cursor(api.search, q='python', count=100, tweet_mode='extended').pages():
    # process status here
    process_page(page)
Here q is the query, count is how many tweets to fetch per request (100 is the maximum per request), and tweet_mode='extended' gives you the full text (without it the text is truncated to 140 characters). More info here. Retweets are still truncated, as confirmed by jaycech3n.
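As a side note on the retweet truncation: with tweet_mode='extended', the untruncated text of a retweet is generally available on the retweeted status itself. A small sketch (full_text is the extended-mode attribute on tweepy Status objects):

# Sketch: recover the full text of a status fetched with tweet_mode='extended'.
def full_text(status):
    if hasattr(status, "retweeted_status"):
        return status.retweeted_status.full_text
    return status.full_text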
If you don't want to use tweepy.Cursor, you need to pass max_id to fetch the next chunk. See the GET search/tweets documentation for more info.
last_id = None
result = True
while result:
    result = api.search(q='python', count=100, tweet_mode='extended', max_id=last_id)
    if not result:
        break
    process_result(result)
    # subtract one so the same tweet is not returned again
    last_id = result[-1]._json['id'] - 1
I am working on extracting Twitter data around a location (here, around India), for all tweets that include a specific keyword or a list of keywords.
import tweepy
import credentials  # all my Twitter API credentials are in this file; it should be in the same directory as this script

## set API connection
auth = tweepy.OAuthHandler(credentials.consumer_key,
                           credentials.consumer_secret)
auth.set_access_token(credentials.access_token,
                      credentials.access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)  # set wait_on_rate_limit=True, as Twitter may block you from querying if it finds you exceeding some limits

search_words = ["#covid19", "2020", "lockdown"]
date_since = "2020-05-21"

tweets = tweepy.Cursor(api.search, q=search_words,
                       geocode="20.5937,78.9629,3000km",
                       lang="en", since=date_since).items(10)
## the geocode is for India; the format is geocode="latitude,longitude,radius"
## radius should be in miles (mi) or kilometres (km)

for tweet in tweets:
    print("created_at: {}\nuser: {}\ntweet text: {}\ngeo_location: {}".
          format(tweet.created_at, tweet.user.screen_name, tweet.text, tweet.user.location))
    print("\n")
    ## tweet.user.location gives the general location of the user, not the location of the tweet itself; as it turns out, most users do not share the exact location of their tweets
Results:
created_at: 2020-05-28 16:48:23
user: XXXXXXXXX
tweet text: RT #Eatala_Rajender: Media Bulletin on status of positive cases #COVID19 in Telangana. (Dated. 28.05.2020)
# TelanganaFightsCorona
# StayHom…
geo_location: Hyderabad, India
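If you do need the tweet-level location when it is available, the Status object also carries place and coordinates fields, which are None on most tweets. A minimal sketch, assuming a tweets iterator like the one created in the script above:

# Sketch: prefer tweet-level location data when present, fall back to the user profile location.
for tweet in tweets:
    if tweet.coordinates:
        print("exact coordinates:", tweet.coordinates["coordinates"])  # [longitude, latitude]
    elif tweet.place:
        print("tagged place:", tweet.place.full_name)
    else:
        print("profile location only:", tweet.user.location)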
You can search for tweets containing specific strings as shown below:
tweets = api.search('Artificial Intelligence', count=200)
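The call returns a list-like collection of Status objects, so you can iterate over it directly; note that a single request returns at most 100 tweets, as mentioned earlier. A small sketch, assuming api is an authenticated tweepy.API instance:

# Sketch: iterate over the returned Status objects and print a couple of fields.
tweets = api.search('Artificial Intelligence', count=100)
for tweet in tweets:
    print(tweet.user.screen_name, tweet.text)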