I'm trying to extract the amount of #btc since 2019-01-01 per day.
I know the error is about permission, but I'm already using the keys generated from Twitter developer's portal.
Here's my code, I deleted my developer keys.
# Python Script to Extract tweets of a
# particular Hashtag using Tweepy and Pandas
# import modules
import pandas as pd
import tweepy
# function to display data of each tweet
def printtweetdata(n, ith_tweet):
print()
print(f"Tweet {n}:")
print(f"Username:{ith_tweet[0]}")
print(f"Description:{ith_tweet[1]}")
print(f"Location:{ith_tweet[2]}")
print(f"Following Count:{ith_tweet[3]}")
print(f"Follower Count:{ith_tweet[4]}")
print(f"Total Tweets:{ith_tweet[5]}")
print(f"Retweet Count:{ith_tweet[6]}")
print(f"Tweet Text:{ith_tweet[7]}")
print(f"Hashtags Used:{ith_tweet[8]}")
# function to perform data extraction
def scrape(words, date_since, numtweet):
# Creating DataFrame using pandas
db = pd.DataFrame(columns=['username', 'description', 'location', 'following',
'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags'])
# We are using .Cursor() to search through twitter for the required tweets.
# The number of tweets can be restricted using .items(number of tweets)
tweets = tweepy.Cursor(api.search, q=words, lang="en",
since=date_since, tweet_mode='extended').items(numtweet)
# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes that you can access to
# get information about each tweet
list_tweets = [tweet for tweet in tweets]
# Counter to maintain Tweet Count
i = 1
# we will iterate over each tweet in the list for extracting information about each tweet
for tweet in list_tweets:
username = tweet.user.screen_name
description = tweet.user.description
location = tweet.user.location
following = tweet.user.friends_count
followers = tweet.user.followers_count
totaltweets = tweet.user.statuses_count
retweetcount = tweet.retweet_count
hashtags = tweet.entities['hashtags']
# Retweets can be distinguished by a retweeted_status attribute,
# in case it is an invalid reference, except block will be executed
try:
text = tweet.retweeted_status.full_text
except AttributeError:
text = tweet.full_text
hashtext = list()
for j in range(0, len(hashtags)):
hashtext.append(hashtags[j]['text'])
# Here we are appending all the extracted information in the DataFrame
ith_tweet = [username, description, location, following,
followers, totaltweets, retweetcount, text, hashtext]
db.loc[len(db)] = ith_tweet
# Function call to print tweet data on screen
printtweetdata(i, ith_tweet)
i = i+1
filename = 'scraped_tweets.csv'
# we will save our database as a CSV file.
db.to_csv(filename)
if __name__ == '__main__':
# Enter your own credentials obtained
# from your developer account
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Enter Hashtag and initial date
print("Enter Twitter HashTag to search for")
words = input()
print("Enter Date since The Tweets are required in yyyy-mm--dd")
date_since = input()
# number of tweets you want to extract in one run
numtweet = 100
scrape(words, date_since, numtweet)
print('Scraping has completed!')
Here's the error:
---------------------------------------------------------------------------
TweepError Traceback (most recent call last)
<ipython-input-4-dee0a1a7784b> in <module>()
98 # number of tweets you want to extract in one run
99 numtweet = 100
--> 100 scrape(words, date_since, numtweet)
101 print('Scraping has completed!')
6 frames
<ipython-input-4-dee0a1a7784b> in scrape(words, date_since, numtweet)
38 # the iterator has various attributes that you can access to
39 # get information about each tweet
---> 40 list_tweets = [tweet for tweet in tweets]
41
42 # Counter to maintain Tweet Count
<ipython-input-4-dee0a1a7784b> in <listcomp>(.0)
38 # the iterator has various attributes that you can access to
39 # get information about each tweet
---> 40 list_tweets = [tweet for tweet in tweets]
41
42 # Counter to maintain Tweet Count
/usr/local/lib/python3.7/dist-packages/tweepy/cursor.py in __next__(self)
49
50 def __next__(self):
---> 51 return self.next()
52
53 def next(self):
/usr/local/lib/python3.7/dist-packages/tweepy/cursor.py in next(self)
241 if self.current_page is None or self.page_index == len(self.current_page) - 1:
242 # Reached end of current page, get the next page...
--> 243 self.current_page = self.page_iterator.next()
244 while len(self.current_page) == 0:
245 self.current_page = self.page_iterator.next()
/usr/local/lib/python3.7/dist-packages/tweepy/cursor.py in next(self)
130
131 if self.index >= len(self.results) - 1:
--> 132 data = self.method(max_id=self.max_id, parser=RawParser(), *self.args, **self.kwargs)
133
134 if hasattr(self.method, '__self__'):
/usr/local/lib/python3.7/dist-packages/tweepy/binder.py in _call(*args, **kwargs)
251 return method
252 else:
--> 253 return method.execute()
254 finally:
255 method.session.close()
/usr/local/lib/python3.7/dist-packages/tweepy/binder.py in execute(self)
232 raise RateLimitError(error_msg, resp)
233 else:
--> 234 raise TweepError(error_msg, resp, api_code=api_error_code)
235
236 # Parse the response payload
TweepError: Twitter error response: status code = 403
I just had this issue and I resolved it by going back to my Developer Portal and applying for "Elevated Access". All 403 status code errors are related to authentication issues.
Related
I am trying to extract tweets that include a specific hashtag(#nike) and I have essential level access(Twitter developer account). I only want to collect 100 tweets. Help, please! Thank you!
I keep getting this error:
Forbidden Traceback (most recent call last)
Forbidden: 403 Forbidden
453 - You currently have Essential access which includes access to Twitter API v2 endpoints only. If you need access to this endpoint, you’ll need to apply for Elevated access via the Developer Portal. You can learn more here: https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api#v2-access-leve
import tweepy
# function to display data of each tweet
def printtweetdata(n, ith_tweet):
print()
print(f"Tweet {n}:")
print(f"Username:{ith_tweet[0]}")
print(f"Description:{ith_tweet[1]}")
print(f"Location:{ith_tweet[2]}")
print(f"Following Count:{ith_tweet[3]}")
print(f"Follower Count:{ith_tweet[4]}")
print(f"Total Tweets:{ith_tweet[5]}")
print(f"Retweet Count:{ith_tweet[6]}")
print(f"Tweet Text:{ith_tweet[7]}")
print(f"Hashtags Used:{ith_tweet[8]}")
# function to perform data extraction
def scrape(words, date_since, numtweet):
# Creating DataFrame using pandas
db = pd.DataFrame(columns=['username',
'description',
'location',
'following',
'followers',
'totaltweets',
'retweetcount',
'text',
'hashtags'])
# We are using .Cursor() to search
# through twitter for the required tweets.
# The number of tweets can be
# restricted using .items(number of tweets)
tweets = tweepy.Cursor(api.search_tweets,
words, lang="en",
since_id=date_since,
tweet_mode='extended').items(numtweet)
# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes
# that you can access to
# get information about each tweet
list_tweets = [tweet for tweet in tweets]
# Counter to maintain Tweet Count
i = 1
# we will iterate over each tweet in the
# list for extracting information about each tweet
for tweet in list_tweets:
username = tweet.user.screen_name
description = tweet.user.description
location = tweet.user.location
following = tweet.user.friends_count
followers = tweet.user.followers_count
totaltweets = tweet.user.statuses_count
retweetcount = tweet.retweet_count
hashtags = tweet.entities['hashtags']
# Retweets can be distinguished by
# a retweeted_status attribute,
# in case it is an invalid reference,
# except block will be executed
try:
text = tweet.retweeted_status.full_text
except AttributeError:
text = tweet.full_text
hashtext = list()
for j in range(0, len(hashtags)):
hashtext.append(hashtags[j]['text'])
# Here we are appending all the
# extracted information in the DataFrame
ith_tweet = [username, description,
location, following,
followers, totaltweets,
retweetcount, text, hashtext]
db.loc[len(db)] = ith_tweet
# Function call to print tweet data on screen
printtweetdata(i, ith_tweet)
i = i+1
filename = 'scraped_tweets.csv'
# we will save our database as a CSV file.
db.to_csv(filename)
if __name__ == '__main__':
# Enter your own credentials obtained
# from your developer account
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Enter Hashtag and initial date
print("#nike")
words = input("#nike")
print("2022-01-01")
date_since = input("2022-01-01")
# number of tweets you want to extract in one run
numtweet = 10
scrape(words, date_since, numtweet)
print('Scraping has completed!')```
I want to retrieve At least 1000 tweets from a {user} timeline replies included
● At least 100 tweets of the 1000 tweets are related to Covid-19 keyword like ["covid19", "wuhan", "mask", "lockdown", "quarantine", "sars-cov-2"] etc.
I wrote the function to retrieve the tweets:
def get_tweets_by_user(self, screen_name):
'''
Use user_timeline api to fetch POI related tweets, some postprocessing may be required.
:return: List
'''
result = []
tweets = api.user_timeline(screen_name=screen_name,
# 200 is the maximum allowed count
count=200,
include_rts = True,
# Necessary to keep full_text
# otherwise only the first 140 words are extracted
tweet_mode = 'extended'
)
for tw in tweets:
result.append(tw)
return result
Now how do I retrieve 100 tweets related to covid-19 keywords from user timeline?
Register for Twitter developer API. You'll need a couple consumer keys. Tell them you're a student.
import requests as re
import json
import twitter # install this library to work with twitter dev.
consumer_key = "your key"
consumer_secret = "your key"
access_token = "your key"
access_token_secret = "your key"
api = twitter.Api(consumer_key=yourkey,
consumer_secret=yoursecret,
access_token_key=youraccesstoken,
access_token_secret=yourtokensecret)
FILTER = ["covid-19 string here"] # PUT YOUR COVID 19 STRING HERE
LANGUAGES = ['en']
store_file = "outputfileforcovidtweets.txt"
_location = ["put coordinates here"]
def main():
with open(store_file, 'a') as z:
for line in api.GetStreamFilter(track=FILTER, languages=LANGUAGES, locations=_location):
z.write(json.dumps(line))
z.write('\n')
main()
This will collect real-time tweets to your output file. :)
Through the basic Academic Research Developer Account, I'm using the Tweepy API to collect tweets containing specified keywords or hashtags. This enables me to collect 10,000,000 tweets per month. Using the entire archive search, I'm trying to collect tweets from one whole calendar date at a time. I've gotten a rate limit error (despite the wait_on_rate_limit flag being set to true) Now there's an error with the request limit.
here is the code
import pandas as pd
import tweepy
# function to display data of each tweet
def printtweetdata(n, ith_tweet):
print()
print(f"Tweet {n}:")
print(f"Username:{ith_tweet[0]}")
print(f"tweet_ID:{ith_tweet[1]}")
print(f"userID:{ith_tweet[2]}")
print(f"creation:{ith_tweet[3]}")
print(f"location:{ith_tweet[4]}")
print(f"Total Tweets:{ith_tweet[5]}")
print(f"likes:{ith_tweet[6]}")
print(f"retweets:{ith_tweet[7]}")
print(f"hashtag:{ith_tweet[8]}")
# function to perform data extraction
def scrape(words, numtweet, since_date, until_date):
# Creating DataFrame using pandas
db = pd.DataFrame(columns=['username', 'tweet_ID', 'userID',
'creation', 'location', 'text','likes','retweets', 'hashtags'])
# We are using .Cursor() to search through twitter for the required tweets.
# The number of tweets can be restricted using .items(number of tweets)
tweets = tweepy.Cursor(api.search_full_archive,'research',query=words,
fromDate=since_date, toDate=until_date).items(numtweet)
# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes that you can access to
# get information about each tweet
list_tweets = [tweet for tweet in tweets]
# Counter to maintain Tweet Count
i = 1
# we will iterate over each tweet in the list for extracting information about each tweet
for tweet in list_tweets:
username = tweet.user.screen_name
tweet_ID = tweet.id
userID= tweet.author.id
creation = tweet.created_at
location = tweet.user.location
likes = tweet.favorite_count
retweets = tweet.retweet_count
hashtags = tweet.entities['hashtags']
# Retweets can be distinguished by a retweeted_status attribute,
# in case it is an invalid reference, except block will be executed
try:
text = tweet.retweeted_status.full_text
except AttributeError:
text = tweet.text
hashtext = list()
for j in range(0, len(hashtags)):
hashtext.append(hashtags[j]['text'])
# Here we are appending all the extracted information in the DataFrame
ith_tweet = [username, tweet_ID, userID,
creation, location, text, likes,retweets,hashtext]
db.loc[len(db)] = ith_tweet
# Function call to print tweet data on screen
printtweetdata(i, ith_tweet)
i = i+1
filename = 'C:/Users/USER/Desktop/الجامعة الالمانية/output/twitter.csv'
# we will save our database as a CSV file.
db.to_csv(filename)
if __name__ == '__main__':
consumer_key = "####"
consumer_secret = "###"
access_token = "###"
access_token_secret = "###"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
since_date = '200701010000'
until_date = '202101012359'
words = "#USA"
# number of tweets you want to extract in one run
numtweet = 1000
scrape(words, numtweet, since_date, until_date)
print('Scraping has completed!')
I got this error:
TooManyRequests: 429 Too Many Requests
Request exceeds account’s current package request limits. Please upgrade your package and retry or contact Twitter about enterprise access.
Unfortunately, I believe this is due to the Sandbox quota. For a premium account it would be more.
Tweepy API Documentation
You may check out this answer here - Limit
I'm trying to take every open tweets in a hashtag but my code does not go further than 299 tweets.
I also trying to take tweets from a specific time line like tweets only in May 2015 and July 2016. Are there any way to do it in the main process or should I write a little code for it?
Here is my code:
# if this is the first time, creates a new array which
# will store max id of the tweets for each keyword
if not os.path.isfile("max_ids.npy"):
max_ids = np.empty(len(keywords))
# every value is initialized as -1 in order to start from the beginning the first time program run
max_ids.fill(-1)
else:
max_ids = np.load("max_ids.npy") # loads the previous max ids
# if there is any new keywords added, extends the max_ids array in order to correspond every keyword
if len(keywords) > len(max_ids):
new_indexes = np.empty(len(keywords) - len(max_ids))
new_indexes.fill(-1)
max_ids = np.append(arr=max_ids, values=new_indexes)
count = 0
for i in range(len(keywords)):
since_date="2015-01-01"
sinceId = None
tweetCount = 0
maxTweets = 5000000000000000000000 # maximum tweets to find per keyword
tweetsPerQry = 100
searchQuery = "#{0}".format(keywords[i])
while tweetCount < maxTweets:
if max_ids[i] < 0:
if (not sinceId):
new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
else:
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
since_id=sinceId)
else:
if (not sinceId):
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
max_id=str(max_ids - 1))
else:
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
max_id=str(max_ids - 1),
since_id=sinceId)
if not new_tweets:
print("Keyword: {0} No more tweets found".format(searchQuery))
break
for tweet in new_tweets:
count += 1
print(count)
file_write.write(
.
.
.
)
item = {
.
.
.
.
.
}
# instead of using mongo's id for _id, using tweet's id
raw_data = tweet._json
raw_data["_id"] = tweet.id
raw_data.pop("id", None)
try:
db["Tweets"].insert_one(item)
except pymongo.errors.DuplicateKeyError as e:
print("Already exists in 'Tweets' collection.")
try:
db["RawTweets"].insert_one(raw_data)
except pymongo.errors.DuplicateKeyError as e:
print("Already exists in 'RawTweets' collection.")
tweetCount += len(new_tweets)
print("Downloaded {0} tweets".format(tweetCount))
max_ids[i] = new_tweets[-1].id
np.save(arr=max_ids, file="max_ids.npy") # saving in order to continue mining from where left next time program run
Have a look at this: https://tweepy.readthedocs.io/en/v3.5.0/cursor_tutorial.html
And try this:
import tweepy
auth = tweepy.OAuthHandler(CONSUMER_TOKEN, CONSUMER_SECRET)
api = tweepy.API(auth)
for tweet in tweepy.Cursor(api.search, q='#python', rpp=100).items():
# Do something
pass
In your case you have a max number of tweets to get, so as per the linked tutorial you could do:
import tweepy
MAX_TWEETS = 5000000000000000000000
auth = tweepy.OAuthHandler(CONSUMER_TOKEN, CONSUMER_SECRET)
api = tweepy.API(auth)
for tweet in tweepy.Cursor(api.search, q='#python', rpp=100).items(MAX_TWEETS):
# Do something
pass
If you want tweets after a given ID, you can also pass that argument.
Sorry, I can't answer in comment, too long. :)
Sure :) Check this example:
Advanced searched for #data keyword 2015 may - 2016 july
Got this url: https://twitter.com/search?l=&q=%23data%20since%3A2015-05-01%20until%3A2016-07-31&src=typd
session = requests.session()
keyword = 'data'
date1 = '2015-05-01'
date2 = '2016-07-31'
session.get('https://twitter.com/search?l=&q=%23+keyword+%20since%3A+date1+%20until%3A+date2&src=typd', streaming = True)
Now we have all the requested tweets,
Probably you could have problems with 'pagination'
Pagination url ->
https://twitter.com/i/search/timeline?vertical=news&q=%23data%20since%3A2015-05-01%20until%3A2016-07-31&src=typd&include_available_features=1&include_entities=1&max_position=TWEET-759522481271078912-759538448860581892-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&reset_error_state=false
Probably you could put a random tweet id, or you can parse first, or requests some data from twitter. It can be done.
Use Chrome's networking tab to find all the requested information :)
This code worked for me.
import tweepy
import pandas as pd
import os
#Twitter Access
auth = tweepy.OAuthHandler( 'xxx','xxx')
auth.set_access_token('xxx-xxx','xxx')
api = tweepy.API(auth,wait_on_rate_limit = True)
df = pd.DataFrame(columns=['text', 'source', 'url'])
msgs = []
msg =[]
for tweet in tweepy.Cursor(api.search, q='#bmw', rpp=100).items(10):
msg = [tweet.text, tweet.source, tweet.source_url]
msg = tuple(msg)
msgs.append(msg)
df = pd.DataFrame(msgs)
Check twitter api documentation, probably it allows just 300 tweets to parse.
I would recommend to forget api, make it with requests with streaming. The api is an implementation of requests with limitations.
I am using the python-twitter module to get the tweets of my friends. I used the following code:
import twitter
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
count = 0
for tweets in twitter_api.statuses.user_timeline(screen_name="thekiranbedi",count=500):
try:
print tweets['text']
count += 1
except Exception:
pass
print count
But as the result says, the value of count remains 200, so I am getting only the 200 recent tweets from the id with screen_name='thekiranbedi. But I want all the tweets. How can that be done?
That is a limitation of Twitter API, not of python-twitter module:
https://dev.twitter.com/rest/reference/get/statuses/user_timeline
count - Specifies the number of tweets to try and retrieve, up to a maximum of 200 per distinct request.
So as I understood you have to use 'since_id' and 'max_id' arguments to collect next portion of tweets.