Tweepy API doesn't download all tweets - python

So I created a hashtag #tweet230720211255 and I want to download each and every tweet posted with this hashtag from last 7 days. So I used Tweepy and so far it does a good job of downloading tweets. But there is a problem that I'm facing.
Tweepy only downloads the tweets that has text in them. By that what I mean is if you post a tweet like this or this, basically without any text except the hashtag, then it won't get downloaded. Would like some help here please. The code I have used is below:
#Scraping
import tweepy # for tweet mining
import csv # to read and write csv files
import glob
#Processing
import pandas as pd
import preprocessor as p
import requests
import string
import re # In-built regular expressions library
from collections import Counter
CONSUMER_KEY = 'xxxx'
CONSUMER_SECRET = 'xxxx'
ACCESS_KEY = 'xxxx'
ACCESS_SECRET = 'xxxx'
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) # Pass in Consumer key and secret for authentication by API
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET) # Pass in Access key and secret for authentication by API
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True) # Sleeps when API limit is reached
def get_tweets_in_data_range(data, phrase, max_tweets, end_date, start_date = None, start_via_tweet_id = None):
search_query = phrase
#+ " -filter:links AND -filter:retweets AND -filter:replies" #Exclude links, retweets, replies
for i in tweepy.Cursor(api.search, q = search_query, since = start_date, until = end_date, since_id = start_via_tweet_id, lang = "en", tweet_mode = "extended").items(max_tweets):
data.append([i.full_text, i.id, i.created_at, i.coordinates, i.retweet_count, i.favorite_count]) # Embedded Twitter parameters
scraped_data = []
scraped_data.append(["text", "id", "time", "location", "retweet_count", "fav_count"])
PHRASE = "\"#tweet230720211255\""
MAX_TWEETS = 1000 #Maximum number of tweets to scrape
START_DATE = '2021-07-23' #only last 7 days supported
END_DATE = '2021-07-27' #only last 7 days supported
get_tweets_in_data_range(scraped_data, PHRASE, MAX_TWEETS, END_DATE, START_DATE) #call to get tweets between date ranges
cursor = tweepy.Cursor(api.user_timeline, id='burgerking', tweet_mode = "extended").items(1)
for i in cursor:
print(dir(i))
tweets = pd.DataFrame(scraped_data[1:],columns=scraped_data[0])
tweets_csv = tweets.to_csv('download.csv', index=True) #saves a csv file with the data scraped
with pd.option_context('display.max_rows', None,
'display.max_columns', None,
'display.precision', 3,
):
print(tweets)
Please point out where I'm going wrong.

Related

There is a problem using "since" in Tweety function to extract Covaxin related hashtag tweets from the starting time of Covid'19

It is saying unexpected parameter :since when running the below code
import tweepy
# Enter your own credentials obtained
# from your developer account
consumer_key = "wwww"
consumer_secret = "xxxx"
access_key = "yyyy"
access_secret = "zzzz"
# The above keys are mentioned correctly in the programming code
# Twitter authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
# Creating an API object
api = tweepy.API(auth)
hashtag_tweets = tweepy.Cursor(api.search_tweets, q="#Covaxin", tweet_mode='extended').items(5)
date_tweets = tweepy.Cursor(api.search_tweets, q="#Covaxin", since="2020-03-31", until="2022-01-09",tweet_mode='extended').items(5)
list = []
for tweet in date_tweets:
text = tweet._json["full_text"]
#print(text)
refined_tweet = {'text' : text,
'favorite_count' : tweet.favorite_count,
'retweet_count' : tweet.retweet_count,
'created_at' : tweet.created_at}
list.append(refined_tweet)
#print(list)
import pandas as pd
df = pd.DataFrame(list)
print(df)
df.to_csv('refined_tweets.csv')
It is saying unexpected parameter :since when running the code
I was trying to get the output for all tweets satisfying date query for the particular hashtag Covaxin from the starting of Covid days till now.

tweepy favorite_count repeats the original tweet likes

The problem I'm facing is that whenever I try to retrieve a tweets from a hashtag most of the tweets are retweets of an origin tweet and they all repeat the same like and retweet number. For example if I have a tweet with over 100 likes and 20 retweets and there are over 10 retweets of my tweet all 10 of those tweets will have 100 likes and 20 retweets, which is redundant data. This is a very big issue especially because I usually retrieve about 5000 - 10000 tweets for analysis.
Code:
from os import access
import tweepy
import configparser
import pandas as pd
api_key = ''
api_key_secret = ''
access_token = ''
access_token_secret = ''
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# user = '#elonmusk'
keywords = '#AsiaCup2022'
limit = 10000
tweets = tweepy.Cursor(api.search_tweets, q = keywords, count = 100, tweet_mode = 'extended').items(limit)
# tweets = api.user_timeline(screen_name = user, count = limit, tweet_mode = 'extended')
columns = ['User', 'Tweet', 'Likes', 'Retweets']
data = []
for tweet in tweets:
try:
data.append([tweet.user.screen_name, tweet.full_text, tweet.retweeted_status.favorite_count, tweet.retweet_count])
except:
data.append([tweet.user.screen_name, tweet.full_text, tweet.favorite_count, tweet.retweet_count])
df = pd.DataFrame(data, columns=columns)
df.to_excel("Cup2022.xlsx")
An example of what my issue is:
As you can see in the pic the same tweet has been retweeted by two different users and they have the same like and retweet count as the original tweet. Any help would be appreciated, this is a really big problem for me considering it messes up my entire result.
You can exclude the retweets from the results of your search.
keywords = '#AsiaCup2022 -filter:retweets'
That way you will get only the original tweets and you will avoid that redundancy.

TooManyRequests: 429 Too Many Requests while running tweepy

Through the basic Academic Research Developer Account, I'm using the Tweepy API to collect tweets containing specified keywords or hashtags. This enables me to collect 10,000,000 tweets per month. Using the entire archive search, I'm trying to collect tweets from one whole calendar date at a time. I've gotten a rate limit error (despite the wait_on_rate_limit flag being set to true) Now there's an error with the request limit.
here is the code
import pandas as pd
import tweepy
# function to display data of each tweet
def printtweetdata(n, ith_tweet):
print()
print(f"Tweet {n}:")
print(f"Username:{ith_tweet[0]}")
print(f"tweet_ID:{ith_tweet[1]}")
print(f"userID:{ith_tweet[2]}")
print(f"creation:{ith_tweet[3]}")
print(f"location:{ith_tweet[4]}")
print(f"Total Tweets:{ith_tweet[5]}")
print(f"likes:{ith_tweet[6]}")
print(f"retweets:{ith_tweet[7]}")
print(f"hashtag:{ith_tweet[8]}")
# function to perform data extraction
def scrape(words, numtweet, since_date, until_date):
# Creating DataFrame using pandas
db = pd.DataFrame(columns=['username', 'tweet_ID', 'userID',
'creation', 'location', 'text','likes','retweets', 'hashtags'])
# We are using .Cursor() to search through twitter for the required tweets.
# The number of tweets can be restricted using .items(number of tweets)
tweets = tweepy.Cursor(api.search_full_archive,'research',query=words,
fromDate=since_date, toDate=until_date).items(numtweet)
# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes that you can access to
# get information about each tweet
list_tweets = [tweet for tweet in tweets]
# Counter to maintain Tweet Count
i = 1
# we will iterate over each tweet in the list for extracting information about each tweet
for tweet in list_tweets:
username = tweet.user.screen_name
tweet_ID = tweet.id
userID= tweet.author.id
creation = tweet.created_at
location = tweet.user.location
likes = tweet.favorite_count
retweets = tweet.retweet_count
hashtags = tweet.entities['hashtags']
# Retweets can be distinguished by a retweeted_status attribute,
# in case it is an invalid reference, except block will be executed
try:
text = tweet.retweeted_status.full_text
except AttributeError:
text = tweet.text
hashtext = list()
for j in range(0, len(hashtags)):
hashtext.append(hashtags[j]['text'])
# Here we are appending all the extracted information in the DataFrame
ith_tweet = [username, tweet_ID, userID,
creation, location, text, likes,retweets,hashtext]
db.loc[len(db)] = ith_tweet
# Function call to print tweet data on screen
printtweetdata(i, ith_tweet)
i = i+1
filename = 'C:/Users/USER/Desktop/الجامعة الالمانية/output/twitter.csv'
# we will save our database as a CSV file.
db.to_csv(filename)
if __name__ == '__main__':
consumer_key = "####"
consumer_secret = "###"
access_token = "###"
access_token_secret = "###"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
since_date = '200701010000'
until_date = '202101012359'
words = "#USA"
# number of tweets you want to extract in one run
numtweet = 1000
scrape(words, numtweet, since_date, until_date)
print('Scraping has completed!')
I got this error:
TooManyRequests: 429 Too Many Requests
Request exceeds account’s current package request limits. Please upgrade your package and retry or contact Twitter about enterprise access.
Unfortunately, I believe this is due to the Sandbox quota. For a premium account it would be more.
Tweepy API Documentation
You may check out this answer here - Limit

Python: Hashtag search with Tweepy

I'd like to get Tweets with #MeTooMen using Tweepy.
There are many Tweets using this hashtag as far as I searched Twitter, but I get 0 result when I try to get these Tweets with Tweepy. Do you have any idea what I can do to improve this code?
import os
import tweepy as tw
import pandas as pd
api_key = '*'
api_secret_key = '*'
access_token = '*'
access_token_secret = '*'
auth = tw.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)
# Define the search term and the date_since date as variables
search_words = "#metoomen"
date_since = "2017-10-17"
date_until = "2018-01-31"
tweets = tw.Cursor(api.search,
q = search_words,
lang = "en",
since = date_since,
until = date_until).items(5)
users_locs = [[tweet.user.screen_name, tweet.user.location, tweet.text] for tweet in tweets]
users_locs
>>> []
API.search uses Twitter's standard search API and doesn't accept date_since or date_until parameters:
Keep in mind that the search index has a 7-day limit. In other words, no tweets will be found for a date older than one week.
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/guides/standard-operators also says:
[It] is not a complete index of all Tweets, but instead an index of recent Tweets. The index includes between 6-9 days of Tweets.
You'll need to use the Full-archive premium search API endpoint, with API.search_full_archive, instead.

How to print tweet from from specific profile in python using twitter api

I want to print the tweets from a profile but I can't. I guess that I'm not using the right commands or something. I'm new in coding so I don't uderstand to much about api's.
I can get info about the profile so the conection is right.
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import sys
import tweepy
#I don't put the secret token and all of that
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
auth_api = API(auth)
account_list = ["jufut390"]
if len(account_list) > 0:
for target in account_list:
print("Getting data for " + target)
item = auth_api.get_user(target)
print("screen_name: " + item.screen_name)
#Get info about tweets
end_date = datetime.utcnow() - timedelta(days=5)
for status in Cursor(auth_api.user_timeline, id=target, tweet_mode = "extended").items():
#print tweets
if status.created_at < end_date:
break
In this line :
for status in Cursor(auth_api.user_timeline, id=target, tweet_mode = "extended").items():
The id parameter has no effect. It should be user_id and a valid user ID (numeric) See : https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html
In you case, you can use the screen_name.
Secondly, you say you want to print the tweets, so write a print. Try this :
#Get info about tweets
end_date = datetime.utcnow() - timedelta(days=5)
for status in Cursor(auth_api.user_timeline, screen_name=item.screen_name, tweet_mode = "extended").items():
print(status.full_text)
if status.created_at < end_date:
break

Categories