Function to extract tweets from a user's timeline keeps looping - Python

When I run the function, it brings back around 200 tweets and then loops, repeating the same 200 tweets over and over until the list reaches the total number of tweets the user actually has.
def extract_tweets(userid):
    tweets = api.user_timeline(screen_name=userid,
                               count=200,
                               include_rts=True,
                               tweet_mode='extended')
    for info in tweets[:3]:
        print('ID: {}'.format(info.id))
        print(info.created_at)
        print(info.full_text)
        print('\n')

    all_tweets = []
    all_tweets.extend(tweets)
    oldest_id = tweets[-1].id
    tweet_num = api.get_user(userid).statuses_count

    while len(all_tweets) < tweet_num:
        tweets = api.user_timeline(screen_name=userid,
                                   count=200,
                                   tweet_mode='extended')
        if len(tweets) == 1000:
            break
        oldest_id = tweets[-1].id
        all_tweets.extend(tweets)

    outtweets = [[tweet.id_str,
                  tweet.created_at,
                  tweet.favorite_count,
                  tweet.retweet_count,
                  tweet.full_text.encode('utf-8').decode('utf-8')]
                 for idx, tweet in enumerate(all_tweets)]
    df = DataFrame(outtweets, columns=['id', 'created_at', 'favorite_count', 'retweet_count', 'text'])
    df.to_csv('%s_tweets.csv' % userid, index=False)
    df.head(3)

Your break condition (len(tweets) == 1000) will never be true, because the maximum number of tweets returned per call is 200. If you want to exit the loop once all of the user's tweets have been collected, the break condition should be:
if len(tweets) == 0:
    break
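Note also that the api.user_timeline call inside the while loop never passes oldest_id, so each request returns the newest 200 tweets again, which is why the same tweets repeat. Below is a minimal sketch of the paginated loop using max_id (not the original poster's code; it assumes the same api object and the standard v1.1 user_timeline endpoint):

all_tweets = list(api.user_timeline(screen_name=userid, count=200,
                                    include_rts=True, tweet_mode='extended'))
while all_tweets:
    oldest_id = all_tweets[-1].id
    older = api.user_timeline(screen_name=userid, count=200,
                              include_rts=True, tweet_mode='extended',
                              max_id=oldest_id - 1)  # only tweets older than the last one collected
    if len(older) == 0:  # nothing older is available, stop paginating
        break
    all_tweets.extend(older)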

How to update search parameters within a for loop

How can I alter the if tweets_per_day == 100 statement so that, on the next iteration of the for loop, the day variable is set to the prior day? For example, here is what I wish to do:
1. On the first iteration of the for loop, 100 tweets created within the specified parameters are appended to the all_tweets list. (Currently working properly.)
2. Once 100 tweets have been appended, the day count is incremented by 1. (Currently working properly.)
3. Within the mentioned if statement, once 100 tweets for the day have been appended, the search parameters are updated so that only tweets created one day earlier than the previous day are returned. (Not working.)
4. The for loop then restarts with the updated parameters in mind.
from datetime import datetime, timedelta, time
import pandas as pd
import numpy as np
import sys
import tweepy
from workalendar.usa import NewYork
import re
import pytz
import configparser

# Read configs
config = configparser.ConfigParser()
config.read('config.ini')

# Twitter API credentials
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

# Authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

# Make an API object
api = tweepy.API(auth)

stock = input('Enter a stock symbol: ').upper()
stock = '$' + stock  # Convert user input to Twitter cashtag
print(stock)

start_date = datetime.now().date()
search_parameters = {
    'q': stock,
    'lang': 'en',
    'count': 100,
    'until': start_date,
    'tweet_mode': 'ext'
}
cursor = tweepy.Cursor(api.search_tweets, **search_parameters)

# Create a list of market holidays, where the stock exchange is closed
cal = NewYork()
cal.holidays(2023)

# Create a dataframe of stock market holidays
holiday_df = pd.DataFrame(cal.holidays(2023), columns=['Date', 'Holiday'])
holiday_df = holiday_df.drop([0, 3, 4, 9, 10, 11, 12])  # Remove non-market-holidays
# Add additional market holidays
holiday_df.loc[9] = ['2023-04-07', 'Good Friday']
holiday_df.loc[10] = ['2023-06-19', 'Juneteenth']
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])

tweets_per_day = 0
day_count = 0
dates = []       # empty list to store the dates of appended tweets
all_tweets = []  # empty list to store all tweets being used
tweet_iterator = cursor.items(limit=1000)

for tweet in cursor.items(limit=10000):
    day = tweet.created_at
    print(f'Start of for loop day value: {day}')
    tweet_date = pd.to_datetime(day.date().strftime('%Y-%m-%d'))
    # Only add tweets which were made before market open on weekdays
    if day.time() < time(hour=9, minute=30) and day.weekday() < 5:
        if not holiday_df['Date'].eq(tweet_date).any():
            print(f'Tweet date {tweet.created_at.date()}, day date {day.date()}')
            while tweets_per_day < 100:
                if tweet.created_at.date() != day.date():  # If the day changes before 100 tweets, update day count
                    day_count += 1
                    tweets_per_day = 0
                    print('day != block')
                    break
                all_tweets.append(tweet.text)
                tweets_per_day += 1
                dates.append(tweet.created_at.date())
                print(f'Appended Tweet date: {tweet.created_at}')
                print(f'Tweets per day: {tweets_per_day}')
                tweet = next(tweet_iterator)
            if tweets_per_day == 100:
                day_count += 1
                search_parameters['until'] -= timedelta(days=1)
                cursor = tweepy.Cursor(api.search_tweets, **search_parameters)  # create a new Cursor with the updated search parameters
                tweet_iterator = cursor.items(limit=1000)  # update the tweet iterator with the new Cursor
                tweets_per_day = 0
    if day_count >= 5:
        print(f'Day count: {day_count}')
        break
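One way this per-day collection can be restructured (a rough sketch, not from the original post) is to drive the search from an outer day loop and create a fresh Cursor for each day, rather than swapping the iterator from inside the for loop. It assumes the same api, stock, and holiday_df objects and the same imports as the snippet above:

day_count = 0
until = datetime.now().date()
all_tweets = []
while day_count < 5:
    target_day = until - timedelta(days=1)  # 'until' is exclusive, so the newest results are from the previous day
    params = {'q': stock, 'lang': 'en', 'count': 100,
              'until': until, 'tweet_mode': 'extended'}
    tweets_per_day = 0
    for tweet in tweepy.Cursor(api.search_tweets, **params).items(1000):
        created = tweet.created_at
        if created.date() != target_day:
            continue  # ignore anything outside the day currently being collected
        is_holiday = holiday_df['Date'].eq(pd.Timestamp(target_day)).any()
        if created.weekday() < 5 and not is_holiday and created.time() < time(hour=9, minute=30):
            all_tweets.append(tweet.text)
            tweets_per_day += 1
            if tweets_per_day == 100:
                break
    day_count += 1
    until -= timedelta(days=1)  # the next pass only sees tweets from one day earlier

Whether weekends and holidays should also consume one of the five days depends on the intended sampling, so the day_count handling may need adjusting.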

If no tweets in period X continue to period Y with next_token

I am using Tweepy to extract tweets from the Twitter API v2.0. Note: I have Academic Research access.
My code loops over five lists: start_time, end_time, search_query, date, and username. More description below.
My code collects up to 20 tweets for each day in each list. However, if there are no tweets for a specific day, the code goes into an infinite loop in which it tries to find the next_token without success.
If no tweets are found for a specific day, the code should move on to the subsequent day/elements in the start_time and end_time lists. How can that be done?
My code extracts tweets in the time window between corresponding elements of the lists start_date and end_date, each covering 24 hours. The two lists are iterated in parallel. Both start_date and end_date contain three lists with 11 elements each.
start_date = [['2022-02-06T00:00:00.000Z', '2022-02-07T00:00:00.000Z', '2022-02-08T00:00:00.000Z', '2022-02-09T00:00:00.000Z', '2022-02-10T00:00:00.000Z', '2022-02-11T00:00:00.000Z', '2022-02-12T00:00:00.000Z', '2022-02-13T00:00:00.000Z', '2022-02-14T00:00:00.000Z', '2022-02-15T00:00:00.000Z', '2022-02-16T00:00:00.000Z'], ['2022-01-28T00:00:00.000Z', '2022-01-29T00:00:00.000Z', '2022-01-30T00:00:00.000Z', '2022-01-31T00:00:00.000Z', '2022-02-01T00:00:00.000Z', '2022-02-02T00:00:00.000Z', '2022-02-03T00:00:00.000Z', '2022-02-04T00:00:00.000Z', '2022-02-05T00:00:00.000Z', '2022-02-06T00:00:00.000Z', '2022-02-07T00:00:00.000Z'], ['2022-01-28T00:00:00.000Z', '2022-01-29T00:00:00.000Z', '2022-01-30T00:00:00.000Z', '2022-01-31T00:00:00.000Z', '2022-02-01T00:00:00.000Z', '2022-02-02T00:00:00.000Z', '2022-02-03T00:00:00.000Z', '2022-02-04T00:00:00.000Z', '2022-02-05T00:00:00.000Z', '2022-02-06T00:00:00.000Z', '2022-02-07T00:00:00.000Z']]
end_date = [['2022-02-06T23:59:59.000Z', '2022-02-07T23:59:59.000Z', '2022-02-08T23:59:59.000Z', '2022-02-09T23:59:59.000Z', '2022-02-10T23:59:59.000Z', '2022-02-11T23:59:59.000Z', '2022-02-12T23:59:59.000Z', '2022-02-13T23:59:59.000Z', '2022-02-14T23:59:59.000Z', '2022-02-15T23:59:59.000Z', '2022-02-16T23:59:59.000Z'], ['2022-01-28T23:59:59.000Z', '2022-01-29T23:59:59.000Z', '2022-01-30T23:59:59.000Z', '2022-01-31T23:59:59.000Z', '2022-02-01T23:59:59.000Z', '2022-02-02T23:59:59.000Z', '2022-02-03T23:59:59.000Z', '2022-02-04T23:59:59.000Z', '2022-02-05T23:59:59.000Z', '2022-02-06T23:59:59.000Z', '2022-02-07T23:59:59.000Z'], ['2022-01-28T23:59:59.000Z', '2022-01-29T23:59:59.000Z', '2022-01-30T23:59:59.000Z', '2022-01-31T23:59:59.000Z', '2022-02-01T23:59:59.000Z', '2022-02-02T23:59:59.000Z', '2022-02-03T23:59:59.000Z', '2022-02-04T23:59:59.000Z', '2022-02-05T23:59:59.000Z', '2022-02-06T23:59:59.000Z', '2022-02-07T23:59:59.000Z']]
For each list in start_date and end_date there is a search_query. The two lists date and username are used to name the CSV files containing the extracted tweets.
search_query = ['(#brikeilarcnn OR "Brianna Keilar") -is:retweet', '(#brianstelter OR "Brian Stelter") -is:retweet', '(#Acosta OR "Jim Acosta") -is:retweet']
username = ['#brikeilarcnn', '#brianstelter', '#Acosta']
date = ['2022-02-11', '2022-02-02', '2022-02-02']
Apart from the lists above, my code below relies on three important functions: create_url, connect_to_endpoint, and append_to_csv. These functions work as intended. However, if their code is needed to answer my question, I can provide it.
for suffixes_1, suffixes_2, name, day, user_handle in zip(start_time, end_time, search_query,
                                                          date, username):
    for s1, s2 in zip(suffixes_1, suffixes_2):
        # Inputs
        count = 0       # Counting tweets per time period/journalist
        max_count = 20  # Max tweets per time period/journalist
        flag = True
        next_token = None

        # create csv files named after date
        csvFile = open(day + "_" + user_handle + ".csv", "a", newline="", encoding='utf-8')
        csvWriter = csv.writer(csvFile)
        # create headers for the four variables: author_id, created_at, id, and tweet
        csvWriter.writerow(['author_id', 'created_at', 'id', 'tweet'])
        csvFile.close()

        # create url for tweet extraction based on for loop:
        # loop over queries/name, start_time/s1 and end_time/s2
        url = create_url(name, s1, s2, max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']

        # Check if flag is true
        while flag:
            # Check if max_count reached
            if count >= max_count:
                break
            print("-------------------")
            print("Token: ", next_token)  # The line that is continuously printed when next_token = None
            if 'next_token' in json_response['meta']:
                # Save the token to use for next call
                next_token = json_response['meta']['next_token']
                print("Next Token: ", next_token)
                if result_count is not None and result_count > 0 and next_token is not None:
                    print("Start Date: ", s1, "Name of journalist:", user_handle)
                    append_to_csv(json_response, day + "_" + user_handle + ".csv")
                    count += result_count
                    total_tweets += result_count
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    sleep(5)  # sleep for 5 sec. to avoid flooding
                # If no next token exists
                else:
                    if result_count is not None and result_count > 0:
                        print("-------------------")
                        print("Start Date: ", s1, "Name of journalist:", user_handle)
                        append_to_csv(json_response, day + "_" + user_handle + ".csv")
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        sleep(5)  # sleep for 5 sec. to avoid flooding
                    # Since this is the final request, turn flag to false to move to the next time period.
                    flag = False
                    next_token = None
            sleep(5)  # sleep for 5 sec. to avoid flooding
print("Total number of results: ", total_tweets)
Output when the code reaches a search_query (name), start_time (s1), and end_time (s2) combination that matches 0 tweets. The console keeps printing the following output because it cannot find a next_token. Is it possible to skip to the next combination of search_query, s1, and s2?
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Token: None
-------------------
Quick fix: Remove any search_query (i.e. journalists) for which there are days with no tweets causing Token: None. This will 'solve' the issue, though it possibly adds bias to the dataset.
NOTE: My real data contains 138 search_query entries. However, for simplicity, I have reduced it to three.
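Beyond the quick fix, the pagination loop can be made to terminate explicitly when a day yields no tweets and no next_token, so the outer zip() loop simply moves on to the next day. A rough sketch of the inner while loop only (not part of the original answer; it assumes the same create_url/connect_to_endpoint/append_to_csv helpers and surrounding variables as in the question):

while flag:
    if count >= max_count:
        break
    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)  # fetch a page on every pass
    result_count = json_response['meta']['result_count']
    if result_count is not None and result_count > 0:
        append_to_csv(json_response, day + "_" + user_handle + ".csv")
        count += result_count
        total_tweets += result_count
    if 'next_token' in json_response['meta']:
        next_token = json_response['meta']['next_token']  # there is another page for this day
    else:
        flag = False       # no more pages (or no tweets at all) for this day -> move on to the next day
        next_token = None
    sleep(5)  # stay under the rate limit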

Extracting tweets using Python and Tweepy

I am trying to look at President Trump's tweets on immigration and do some sentiment analysis on them. My code is:
import pprint
import datetime

# startDate = datetime.datetime(2019, 4, 20, 0, 0, 0)
# endDate = datetime.datetime(2020, 4, 29, 0, 0, 0)

username = "realDonaldTrump"
page = 1
stop_loop = False
finalList1 = []

curs = tweepy.Cursor(api.user_timeline, id=username)
for item in curs.items():
    finalList1.append(item)
print(len(finalList1))

data = pd.DataFrame(data=[tweet.text for tweet in finalList1], columns=['Tweets'])

# Add relevant data
data['len'] = np.array([len(tweet.text) for tweet in finalList1])
data['ID'] = np.array([tweet.id for tweet in finalList1])
data['Date'] = np.array([tweet.created_at for tweet in finalList1])
data['Source'] = np.array([tweet.source for tweet in finalList1])
data['Likes'] = np.array([tweet.favorite_count for tweet in finalList1])
data['RTs'] = np.array([tweet.retweet_count for tweet in finalList1])

# Sentiment analysis
from textblob import TextBlob
import re

def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1

data['SA'] = np.array([analize_sentiment(tweet) for tweet in data['Tweets']])
The code works perfectly fine. However, I have two questions:
How do I get access to tweets before these? The API gives me 3,200 tweets; how do I get the ones before that?
How do I get Donald Trump's tweets that contain specific keywords like 'immigration', 'refugee', 'china', etc.?
I have been trying to figure out a way but have been unsuccessful.
For searching for specific keywords you can use API.search (http://docs.tweepy.org/en/latest/api.html). For example:
max_tweets = 1000  # however many results you want to collect
query = "immigration"
searched_tweets = [status for status in tweepy.Cursor(api.search, q=query).items(max_tweets)]
See also: Managing Tweepy API Search.
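Alternatively (not part of the original answer), since the timeline tweets are already in the data DataFrame built above, they can simply be filtered for keywords locally; the keyword list here is just an example:

keywords = ['immigration', 'refugee', 'china']
mask = data['Tweets'].str.contains('|'.join(keywords), case=False, regex=True)
keyword_tweets = data[mask].reset_index(drop=True)
print(len(keyword_tweets))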

converting a python script into a function to iterate over each row

How can I convert the Python script below into a function so that I can call it on each row of a dataframe, keeping a few variables such as screen_name and domain dynamic?
# We create a tweet list as follows:
tweets = extractor.user_timeline(screen_name="abhi98358", count=200)
data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])

# We add relevant data:
data['ID'] = np.array([tweet.id for tweet in tweets])
data['Date'] = np.array([tweet.created_at for tweet in tweets])
data['text'] = np.array([tweet.text for tweet in tweets])
# data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=1)
data = data[(data['Date'] > created_time) &
            (data['Date'] < datetime.datetime.utcnow())]

my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy',
           'Problem', 'Outage', 'Service', 'Interruption', 'voice-comms', 'Downtime']
ndata = data[data['Tweets'].str.contains(
    "|".join(my_list), regex=True)].reset_index(drop=True)

slack = Slacker('xoxb-34234-44232424-sdkjfksdfjksd')
# message = "test message"
slack.chat.post_message('#ops-twitter-alerts', 'domain :' + ' ' + ndata['Tweets'] + '<!channel|>')
My dataframe looks like the one below:
inp = [{'client': 'epic', 'domain':'fnwp','twittername':'FortniteGame'},{'client': 'epic', 'domain':'fnwp','twittername':'Rainbow6Game'},{'client': 'abhi', 'domain':'abhi','twittername':'abhi98358'}]
df = pd.DataFrame(inp)
I want to iterate over the rows one by one: scrape the data, send the Slack notification, and then move on to the next row.
I have already gone through How to iterate over rows in a DataFrame in Pandas?
Here you go buddy :-
for index, row in df.iterrows():
    twt = row['twittername']
    domain = row['domain']
    print(twt)
    print(domain)
    extractor = twitter_setup()
    # We create a tweet list as follows:
    tweets = extractor.user_timeline(screen_name=twt, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    # We add relevant data:
    data['ID'] = np.array([tweet.id for tweet in tweets])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    data['text'] = np.array([tweet.text for tweet in tweets])
    # data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=160)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy', 'hound',
               'Problem', 'Outage', 'Service', 'Interruption', 'ready', 'voice-comms', 'Downtime', 'Patch']
    ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
    print(ndata)
    if len(ndata['Tweets']) > 0:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + ndata['Tweets'] + '<!channel|>')
    else:
        print('hi')
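Since the question asked for a function, the same logic can also be wrapped up so that each row is handled by a single call. This is only a sketch using the names from the snippets above (twitter_setup, my_list, slack, and df are assumed to be defined as shown earlier):

def scrape_and_alert(twittername, domain):
    extractor = twitter_setup()
    tweets = extractor.user_timeline(screen_name=twittername, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    # keep only tweets from the last 160 minutes
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=160)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    matches = data[data['Tweets'].str.contains('|'.join(my_list), regex=True)]
    for text in matches['Tweets']:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + text + '<!channel|>')

for _, row in df.iterrows():
    scrape_and_alert(row['twittername'], row['domain'])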

Appending a python dict from a while loop gives unexpected results

The max number of records in my input JSON is 100; however, there is a paging 'next' link that provides the next 100 records. Below is what I have, but it returns a dict with only 100 entries, and I know there are more. How should I modify this function to get all the records?
def process_comment_json(comment_json):
    post_comment_dict = dict()
    next_links = list()
    if 'comments' in comment_json.keys():
        try:
            for y in comment_json['comments']['data']:
                post_id = comment_json['id']
                commentor_name = y['from']['name']
                commentor_id = y['from']['id']
                created_time = y['created_time']
                message = remove_non_ascii(y['message'])
                sentiment = return_sentiment_score(message)
                post_comment_dict[commentor_id] = {'commentor_name': commentor_name,
                                                   'created_time': created_time, 'message': message,
                                                   'sentiment': sentiment}
        except:
            print("malformed data, skipping this comment in round1")
        if 'next' in comment_json['comments']['paging']:
            print('found_next appending')
            next_links.append(comment_json['comments']['paging']['next'])
        else:
            return post_comment_dict

    while next_links:
        print("processing next_links")
        print("current len of post_comment_dict is:", len(post_comment_dict))
        for next_link in next_links:
            t = requests.get(next_link)
            nl_json = t.json()
            next_links.pop()
            if "data" in list(nl_json.keys()):
                for record in nl_json['data']:
                    try:
                        for y in comment_json['comments']['data']:
                            post_id = comment_json['id']
                            commentor_name = y['from']['name']
                            commentor_id = y['from']['id']
                            created_time = y['created_time']
                            message = remove_non_ascii(y['message'])
                            sentiment = return_sentiment_score(message)
                            post_comment_dict[commentor_id] = {'commentor_name': commentor_name,
                                                               'created_time': created_time, 'message': message,
                                                               'sentiment': sentiment}
                    except:
                        print("malformed data, skipping this comment from the next_links list")
            if 'next' in comment_json['comments']['paging']:
                print('found_next appending')
                next_links.append(comment_json['comments']['paging']['next'])
            else:
                return post_comment_dict
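A rough sketch of a fixed version (not from the original post): each page fetched from the 'next' link carries its own data and paging keys at the top level, so the loop should read records from that fetched page rather than re-reading comment_json, and keying the dict by the comment id (instead of the commentor id) keeps multiple comments from the same person:

def process_comment_json(comment_json):
    post_comment_dict = {}
    page = comment_json.get('comments', {})
    while page:
        for y in page.get('data', []):
            try:
                message = remove_non_ascii(y['message'])
                post_comment_dict[y['id']] = {          # key by comment id so nothing is overwritten
                    'commentor_name': y['from']['name'],
                    'commentor_id': y['from']['id'],
                    'created_time': y['created_time'],
                    'message': message,
                    'sentiment': return_sentiment_score(message),
                }
            except (KeyError, TypeError):
                print("malformed data, skipping this comment")
        next_link = page.get('paging', {}).get('next')
        if not next_link:
            break
        page = requests.get(next_link).json()  # the next page has its own 'data' and 'paging'
    return post_comment_dict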
