Extracting tweets using Python and Tweepy

I am trying to look at President Trump's tweets on immigration and do some sentiment analysis on them. My code is:
import tweepy
import pandas as pd
import numpy as np
import pprint
import datetime
# startDate = datetime.datetime(2019, 4, 20, 0, 0, 0)
# endDate = datetime.datetime(2020, 4, 29, 0, 0, 0)
username = "realDonaldTrump"
page = 1
stop_loop = False
finalList1 = []
# api: an authenticated tweepy.API object (authentication omitted in the question)
curs = tweepy.Cursor(api.user_timeline, id=username)
for item in curs.items():
    finalList1.append(item)
print(len(finalList1))
data = pd.DataFrame(data=[tweet.text for tweet in finalList1], columns=['Tweets'])
# Add relevant data
data['len'] = np.array([len(tweet.text) for tweet in finalList1])
data['ID'] = np.array([tweet.id for tweet in finalList1])
data['Date'] = np.array([tweet.created_at for tweet in finalList1])
data['Source'] = np.array([tweet.source for tweet in finalList1])
data['Likes'] = np.array([tweet.favorite_count for tweet in finalList1])
data['RTs'] = np.array([tweet.retweet_count for tweet in finalList1])
# Sentiment analysis
from textblob import TextBlob
import re
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1
data['SA'] = np.array([analize_sentiment(tweet) for tweet in data['Tweets']])
The code works fine. However, I have two questions:
1. How do I get access to tweets before these? The API gives me about 3,200 tweets; how do I get the ones before that?
2. How do I get Donald Trump's tweets that contain specific keywords like 'immigration', 'refugee', 'china', etc.?
I have been trying to figure out a way, but without success.

For searching for specific keywords you can use [API.search][1], for example:
query = "immigration"
max_tweets = 100  # not defined in the original answer; any cap works
searched_tweets = [status for status in tweepy.Cursor(api.search, q=query).items(max_tweets)]
[1]: http://docs.tweepy.org/en/latest/api.html
See also: Managing Tweepy API Search
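To restrict the search to one account's tweets, the from: operator can be added to the query; a minimal sketch, assuming api is the same authenticated tweepy.API object as above (the keyword list is illustrative):
query = "from:realDonaldTrump immigration OR refugee OR china"
max_tweets = 100  # illustrative cap
keyword_tweets = [status for status in tweepy.Cursor(api.search, q=query).items(max_tweets)]
Note that the standard search endpoint only covered roughly the previous seven days of tweets, so neither it nor user_timeline (capped at about 3,200 recent tweets) reaches further back into a timeline.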

Related

How to get only tweets from Snscrape?

After scraping data from Twitter using Snscrape, I am unable to get only tweets.
Under the tweet.sourceLabel column, I am getting a mixture of Twitter, Instagram, and Foursquare.
import snscrape.modules.twitter as sntwitter
keyword = '(COVID OR Corona Vírus)'
maxTweets = 30
tweets = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(keyword + ' since:2020-01-01 lang:pt').get_items()):
    if i > maxTweets:
        break
    tweets.append([tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.sourceLabel])
I'm not seeing any social media source other than Twitter for tweet.sourceLabel. I have fixed a few typos in your code as well.
import snscrape.modules.twitter as sntwitter
keyword = '(COVID OR Corona Vírus)'
maxTweets = 30
tweets = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(keyword + ' since:2020-01-01 lang:pt').get_items()):
    if i > maxTweets:
        break
    tweets.append([tweet.sourceLabel])
print(tweets)
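If other source labels do show up in a larger crawl, they can be filtered inside the loop; a minimal sketch (the startswith('Twitter') check is an assumption about how client labels such as 'Twitter for iPhone' or 'Twitter Web App' are spelled):
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(keyword + ' since:2020-01-01 lang:pt').get_items()):
    if i > maxTweets:
        break
    if tweet.sourceLabel.startswith('Twitter'):  # keep only Twitter clients
        tweets.append([tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.sourceLabel])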

How to update search parameters within a for loop

How can I alter the if tweets_per_day == 100 statement so that when the for loop does its next iteration, the day variable is equal to the prior day? For example, here is what I wish to do:
1. On the first iteration of the for loop, 100 tweets created within the specified parameters are appended to the all_tweets list. (Currently working properly.)
2. Once 100 tweets have been appended, the day count is incremented by 1. (Currently working properly.)
3. Within the mentioned if statement, since 100 tweets for the day have been appended, the search parameters are updated so that only tweets created one day earlier than the previous day are returned. (Not working.)
4. The for loop restarts with the updated parameters in mind.
from datetime import datetime, timedelta, time
import pandas as pd
import sys
import numpy as np
import tweepy
from workalendar.usa import NewYork
import re
import pytz
import configparser
# Read configs
config = configparser.ConfigParser()
config.read('config.ini')
# Twitter API credentials
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']
# Authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
# Make an API object
api = tweepy.API(auth)
stock = input('Enter a stock symbol: ').upper()
stock = '$' + stock  # Convert user input to Twitter cashtag
print(stock)
start_date = datetime.now().date()
search_parameters = {
    'q': stock,
    'lang': 'en',
    'count': 100,
    'until': start_date,
    'tweet_mode': 'extended'
}
cursor = tweepy.Cursor(api.search_tweets, **search_parameters)
# Create a list of market holidays, where the stock exchange is closed
cal = NewYork()
# Create a dataframe of stock market holidays
holiday_df = pd.DataFrame(cal.holidays(2023), columns=['Date', 'Holiday'])
holiday_df = holiday_df.drop([0, 3, 4, 9, 10, 11, 12])  # Remove non-market-holidays
# Add additional market holidays
holiday_df.loc[9] = ['2023-04-07', 'Good Friday']
holiday_df.loc[10] = ['2023-06-19', 'Juneteenth']
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])
tweets_per_day = 0
day_count = 0
dates = []  # empty list to store the dates of appended tweets
all_tweets = []  # empty list to store all tweets being used
tweet_iterator = cursor.items(limit=1000)
for tweet in cursor.items(limit=10000):
    day = tweet.created_at
    print(f'Start of for loop day value: {day}')
    tweet_date = pd.to_datetime(day.date().strftime('%Y-%m-%d'))
    # Only add tweets which were made before market open on weekdays
    if day.time() < time(hour=9, minute=30) and day.weekday() < 5:
        if not holiday_df['Date'].eq(tweet_date).any():
            print(f'Tweet date {tweet.created_at.date()}, day date {day.date()}')
            while tweets_per_day < 100:
                if tweet.created_at.date() != day.date():  # If the day changes before 100 tweets, update day count
                    day_count += 1
                    tweets_per_day = 0
                    print('day != block')
                    break
                all_tweets.append(tweet.full_text)
                tweets_per_day += 1
                dates.append(tweet.created_at.date())
                print(f'Appended Tweet date: {tweet.created_at}')
                print(f'Tweets per day: {tweets_per_day}')
                tweet = next(tweet_iterator)
            if tweets_per_day == 100:
                day_count += 1
                search_parameters['until'] -= timedelta(days=1)
                cursor = tweepy.Cursor(api.search_tweets, **search_parameters)  # create a new Cursor with the updated search parameters
                tweet_iterator = cursor.items(limit=1000)  # update the tweet iterator with the new Cursor
                tweets_per_day = 0
    if day_count >= 5:
        print(f'Day count: {day_count}')
        break
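For what it's worth, one way to avoid swapping the cursor and iterator mid-loop is a single pass that buckets tweets by calendar day; a minimal sketch, assuming the same api and search_parameters as above (the pre-market and holiday filters are omitted for brevity):
from collections import defaultdict
per_day = defaultdict(list)  # date -> tweets collected for that day
for tweet in tweepy.Cursor(api.search_tweets, **search_parameters).items(10000):
    d = tweet.created_at.date()
    if len(per_day) == 5 and d not in per_day:
        break  # the 5 most recent days are already filled
    if len(per_day[d]) < 100:  # cap of 100 tweets per day
        per_day[d].append(tweet.full_text)
Since search results come back newest first, the cursor naturally walks backwards through the roughly seven days the standard index covers, so the until parameter never has to be rewound by hand.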

How to get the exact frequency for trigram in text data?

I would like to know how to get the exact frequency for trigrams. I think the functions I used measure something closer to "importance"; it is similar to frequency, but not the same.
To be clear, a trigram is 3 words in a row, and punctuation should not affect the trigram unit (at least, I don't want it to).
My definition of frequency is: the number of comments in which the trigram appears at least once.
Here's how I obtained my database with web scraping:
import re
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = ['{root}?page={i}'.format(root=root_url, i=i) for i in range(1, 807)]
comms = []
notes = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_='review-content__text').text.strip()
        except:
            comm = container.find('a', class_='link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_='star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)
data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})
data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data.to_csv('file.csv', sep=';', index=False)
Here's the function I used to obtain my comms_clean:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    # Keep only nouns
    text = [word for word, pos in text if pos in ('NN', 'NNP', 'NNS', 'NNPS')]
    text = [word for word in text if word not in stop_words]
    text = [word for word in text if len(word) > 2]  # remove very short words
    final_text = ' '.join(text)
    return final_text
data['comms_clean'] = data['comms'].apply(lambda x: clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
And here are some rows of my database (shown as an image in the original post).
And here is the function I used to obtain the frequency of trigrams in my database:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=True)
    return corpus
Here's the result with these lines of code:
trigram = get_top_n_gram(data['comms_clean'], (3,3), 10)
trigram = process(trigram)
trigram.sort_values('count', ascending=False, inplace=True)
trigram.head(10)
(the resulting table of the top 10 trigrams and their counts was shown as an image)
Let me show how it seems inconsistent, if only by a small amount, by checking the first trigrams from the table above:
df = data[data['comms_clean'].str.contains('très bon état',regex=False, case=False, na=False)]
df.shape
(150, 5)
df = data[data['comms_clean'].str.contains('rapport qualité prix',regex=False, case=False, na=False)]
df.shape
(148, 5)
df = data[data['comms_clean'].str.contains('très bien passé',regex=False, case=False, na=False)]
df.shape
(129, 5)
So my function gives:
146
143
114
and when I checked the number of comments containing each trigram, I obtained:
150
148
129
The numbers are not far apart, but I would rather have the exact count.
So I would like to know: how do I get the exact frequency for these trigrams, rather than some kind of importance score? The importance measure is fine, don't get me wrong, but I would also like to know the exact number.
I tried this:
from collections import Counter
from nltk.util import ngrams
for i in range(1, 16120):
    Counter(ngrams(data['comms_clean'][i].split(), 3))
But I cannot find how to combine all the Counters from the loop.
Thank you.
EDIT:
import nltk
from nltk.corpus import stopwords
from spacy.lang.fr import French  # assuming spaCy 2.x for the lemmatizer below
stop_words = set(stopwords.words('french'))
stop_words.update(("Gamm", "gamm"))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = French.Defaults.create_lemmatizer()
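On combining the Counters: the number of comments containing each trigram at least once is its document frequency, and the per-comment Counters can be accumulated with Counter.update. A minimal sketch over the comms_clean column built above; wrapping the ngrams in set() counts each trigram at most once per comment, so the totals are counts of comments rather than occurrences:
from collections import Counter
from nltk.util import ngrams
doc_freq = Counter()
for comment in data['comms_clean']:
    doc_freq.update(set(ngrams(comment.split(), 3)))
print(doc_freq.most_common(10))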

How can I collect a number of tweets between 2 given dates using tweepy?

For example, I want to collect 20 tweets each day from 1 June to today's date, but right now I am only able to view the 20 most recent tweets from today.
import datetime
import tweepy as tw
def get_tweets_and_tones_json(searchTerm, NoOfTerms):
    output = []
    geo = "21.1498134,79.0820556,1045km"
    start_date = datetime.date(2020, 6, 1)
    end_date = datetime.date.today()
    tweets = tw.Cursor(api.search, q=searchTerm, count=NoOfTerms, geocode=geo, lang='en',
                       since=start_date, until=end_date).items(20)
    for i, tweet in enumerate(tweets):
        tweetedText = tweet.text
        jsonFile1 = analyze_tone(tweetedText)
        x = tweet.created_at
        print(tweetedText, '\n', x)
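One caveat worth knowing: the standard v1.1 search endpoint only reached back about seven days, so a 1 June start date returns nothing through api.search regardless of the since/until values. Within that window, a day-by-day loop over since:/until: query operators can cap the results per day; a minimal sketch (the helper name and per_day default are illustrative, and api is assumed to be an authenticated tweepy.API object):
import datetime
import tweepy as tw
def tweets_per_day(search_term, geo, start, end, per_day=20):
    # Walk one day at a time, yielding up to per_day tweets per day
    day = start
    while day < end:
        next_day = day + datetime.timedelta(days=1)
        q = '{} since:{} until:{}'.format(search_term, day, next_day)
        for tweet in tw.Cursor(api.search, q=q, geocode=geo, lang='en').items(per_day):
            yield tweet
        day = next_day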

converting a python script into a function to iterate over each row

How can I convert the below Python script into a function, so that I can call it for each row of a dataframe while keeping a few variables dynamic, like screen_name and domain?
import datetime
import pandas as pd
import numpy as np
from slacker import Slacker
# We create a tweet list as follows:
tweets = extractor.user_timeline(screen_name="abhi98358", count=200)
data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
# We add relevant data:
data['ID'] = np.array([tweet.id for tweet in tweets])
data['Date'] = np.array([tweet.created_at for tweet in tweets])
data['text'] = np.array([tweet.text for tweet in tweets])
#data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=1)
data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy',
           'Problem', 'Outage', 'Service', 'Interruption', 'voice-comms', 'Downtime']
ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
slack = Slacker('xoxb-34234-44232424-sdkjfksdfjksd')
#message = "test message"
slack.chat.post_message('#ops-twitter-alerts', 'domain :' + ' ' + ndata['Tweets'] + '<!channel|>')
My dataframe is like below:
inp = [{'client': 'epic', 'domain': 'fnwp', 'twittername': 'FortniteGame'},
       {'client': 'epic', 'domain': 'fnwp', 'twittername': 'Rainbow6Game'},
       {'client': 'abhi', 'domain': 'abhi', 'twittername': 'abhi98358'}]
df = pd.DataFrame(inp)
I want to iterate over the rows one by one: scrape the data, send the Slack notification, and then move on to the next row.
I have already gone through How to iterate over rows in a DataFrame in Pandas?
Here you go buddy :-
for index, row in df.iterrows():
    twt = row['twittername']
    domain = row['domain']
    print(twt)
    print(domain)
    extractor = twitter_setup()
    # We create a tweet list as follows:
    tweets = extractor.user_timeline(screen_name=twt, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    # We add relevant data:
    data['ID'] = np.array([tweet.id for tweet in tweets])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    data['text'] = np.array([tweet.text for tweet in tweets])
    #data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=160)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy', 'hound',
               'Problem', 'Outage', 'Service', 'Interruption', 'ready', 'voice-comms', 'Downtime', 'Patch']
    ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
    print(ndata)
    if len(ndata['Tweets']) > 0:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + ndata['Tweets'] + '<!channel|>')
    else:
        print('hi')
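If an actual function is preferred over the bare loop, the loop body can be lifted out so each row's twittername and domain become parameters; a sketch reusing the same logic (twitter_setup, slack, and my_list are assumed to be defined as above):
def scrape_and_notify(twt, domain):
    # Scrape recent tweets for one account and alert Slack on keyword matches
    extractor = twitter_setup()
    tweets = extractor.user_timeline(screen_name=twt, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=160)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
    if len(ndata['Tweets']) > 0:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + ndata['Tweets'] + '<!channel|>')
for index, row in df.iterrows():
    scrape_and_notify(row['twittername'], row['domain'])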
