I'm trying to learn how to scrape tweets using python. I'm trying to use the following code, but I keep getting the error. I'm not sure how to rectify it.
def fetch_tweets(query, count = 50):
api = connect() # Gets the tweepy API object
tweets = [] # Empty list that stores all the tweets
try:
fetched_data = api.search_tweets(q = query + ' -filter:retweets', count = count)
for tweet in fetched_data:
txt = tweet.text
clean_txt = cleanText(txt) # Cleans the tweet
stem_txt = TextBlob(stem(clean_txt)) # Stems the tweet
sent = sentiment(stem_txt) # Gets the sentiment from the tweet
tweets.append((txt, clean_txt, sent))
return tweets
except tweepy.TweepyException as e:
print("Error: "+ str(e))
exit(1)
tweets = fetch_tweets(query = 'Birdman', count = 200)
# Converting the list into a pandas Dataframe
df = pd.DataFrame(tweets, columns= ['tweets', 'clean_tweets','sentiment'])
# Dropping the duplicate values just in case there are some tweets that are copied and then stores the data in a csv file
df = df.drop_duplicates(subset='clean_tweets')
df.to_csv('data.csv', index= False)
ptweets = df[df['sentiment'] == 'positive']
p_perc = 100 * len(ptweets)/len(tweets)
ntweets = df[df['sentiment'] == 'negative']
n_perc = 100 * len(ntweets)/len(tweets)
print(f'Positive tweets {p_perc} %')
print(f'Neutral tweets {100 - p_perc - n_perc} %')
print(f'Negative tweets {n_perc} %')
I keep getting the following error
TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>
And this is the stem(text) function, which is where the problem seems to occur:
def stem(text):
porter = PorterStemmer()
token_words = word_tokenize(text)
stem_sentence = []
for word in token_words:
stem_sentence.append(porter.stem(word))
return " ".join(stem_sentence)
I saw this response in a different place but since i'm new to coding, I wasn't sure how to use it?
df['data'].apply(lambda x: sentiment(' '.join(x)))
Related
I have been trying to write a python code to use snscrape to retrieve tweets about a hashtag within an hour. But my code has been returning an empty dataframe each time I tried.
This is what I have tried so far:
now = datetime.utcnow()
since = now - timedelta(hours=1)
since_str = since.strftime('%Y-%m-%d %H:%M:%S.%f%z')
until_str = now.strftime('%Y-%m-%d %H:%M:%S.%f%z')
# Query tweets with hashtag #SOSREX in the last one hour
query = '#SOSREX Since:' + since_str + ' until:' + until_str
SOSREX_data = []
SOSREX_data=[]
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
if len(SOSREX_data)>100:
break
else:
SOSREX_data.append([tweet.date,tweet.user.username,tweet.user.displayname,
tweet.content,tweet.likeCount,tweet.retweetCount,
tweet.sourceLabel,tweet.user.followersCount,tweet.user.location
])
# Creating a dataframe from the tweets list above
Tweets_data = pd.DataFrame(SOSREX_data,
columns=["Date_tweeted","username","display_name",
"Tweets","Number_of_Likes","Number_retweets",
"Source_of_Tweet",
"number_of_followers","location"
])
print("Tweets_data")
When I try to run my sreamlit app having function:
def get_tweets(Topic,Count):
i=0
#my_bar = st.progress(100) # To track progress of Extracted tweets
for tweet in tweepy.Cursor(api.search_tweets, q=Topic,count=100, lang="en",exclude='retweets').items():
time.sleep(0.1)
#my_bar.progress(i)
df.loc[i,"Date"] = tweet.created_at
df.loc[i,"User"] = tweet.user.name
df.loc[i,"IsVerified"] = tweet.user.verified
df.loc[i,"Tweet"] = tweet.text
df.loc[i,"Likes"] = tweet.favorite_count
df.loc[i,"RT"] = tweet.retweet_count
df.loc[i,"User_location"] = tweet.user.location
df.to_csv("TweetDataset.csv",index=False)
df.to_excel('{}.xlsx'.format("TweetDataset"),index=False) ## Save as Excel
i=i+1
if i>Count:
break
else:
pass
I get this error:
ValueError: Excel does not support datetimes with timezones. Please ensure that datetimes are timezone unaware before writing to Excel.
I am trying to make a sentimental analysis in London's garden comment, but I can't add the geolocation in London and can't format a list of these tweet:(
but it shows for tweet in tweets:
TypeError: 'Cursor' object is not iterable
after that, I was trying to follow the YouTube tutorial to create a if loop to clean data.I want to delete the RT, #hashtag, #mention and HTTP link . but I can't find a efficient way to clean the data
api = tweepy.API(auth, wait_on_rate_limit=True)
#sentiment Analysis
keyword = ["Park","garden"]
noOfTweet = 500
date_since = "2020-01-01T00:00:00Z"
date_until= "2020-12-31T00:00:00Z"
tweets = tweepy.Cursor(api.search_tweets,
query = keyword,
start_time = date_since,
end_time = date_until,
tweet_mode ='extend',
geocode = "-0.098369,51.513557,70km",
lang ='en',
count = noOfTweet)
#try to format a list
all.tweets = []
for i in tweets:
all.tweets.append(i)
for tweet in tweets:
final_text = tweet.text.replace('RT','')
if final_text.startswith(' #'):
position = final_text.index(':')
final_text = final_text[position+2:]
elif final_text.startswith('#'):
position = final_text.index(' ')
final_text = final_text[position + 2:]
I am working on my thesis for economics and I am trying to scrape tweets between two dates for a list of users. Unfortunately, my program, which works fine for a single user breaks and throws this error when I try to loop it for the followers of an influencer. Anyone have suggestions?
Also once I get that fixed I will need to sort between two dates (I was just going to download a massive amount and then sort later using SPSS, but there must be a better way). Does anyone know a way to do this I tried this: tweepy get tweets between two dates
but it didn't work and gave me super irregular results. Also if anyone knows how to make this not trip rate limits that would be great because I think that will be the next problem. :)
Sorry if the code is a little messy it is my first time coding.
The error (I am working in spyder so its a bit long):
Traceback (most recent call last):
File "C:\Users\XPS.ipython\OG + BUILD UP FROM SCRACH.py", line 91, in
extract_followers(user)
File "C:\Users\XPS.ipython\OG + BUILD UP FROM SCRACH.py", line 66, in extract_followers
posts = api.user_timeline(screen_name = user, count = 100, language = "en", tweet_mode="extended", include_rts = True)
File "C:\Users\XPS\Python\lib\site-packages\tweepy\binder.py", line 252, in _call
return method.execute()
File "C:\Users\XPS\Python\lib\site-packages\tweepy\binder.py", line 238, in execute
result = self.parser.parse(self, resp.text, return_cursors=self.return_cursors)
File "C:\Users\XPS\Python\lib\site-packages\tweepy\parsers.py", line 98, in parse
result = model.parse_list(method.api, json)
File "C:\Users\XPS\Python\lib\site-packages\tweepy\models.py", line 75, in parse_list
results.append(cls.parse(api, obj))
File "C:\Users\XPS\Python\lib\site-packages\tweepy\models.py", line 89, in parse
for k, v in json.items():
AttributeError: 'str' object has no attribute 'items'
My Code
Import the libraries
import tweepy
from textblob import TextBlob
import pandas as pd
import re
import matplotlib.pyplot as plt
import csv
plt.style.use('fivethirtyeight')
Twitter API Credentials
consumerkey = ('a')
consumersecret = ('a')
bearer = ('a')
token = ('a')
tokensecret = ('a')
Create the authentication object
authenticate = tweepy.OAuthHandler(consumerkey, consumersecret)
#Set the access token
authenticate.set_access_token(token, tokensecret)
#create the API object while passing in the auth info
api = tweepy.API(authenticate, wait_on_rate_limit= True, wait_on_rate_limit_notify=True)
Create a function to clean the tweets
def cleanTxt(text):
text = re.sub('#[A-Za-z0–9]+', '', text) #Removing #mentions
text = re.sub('#', '', text) # Removing '#' hash tag
text = re.sub('RT[\s]+', '', text) # Removing RT
text = re.sub('https?:\/\/\S+', '', text) # Removing hyperlink
text = re.sub('https?:\/\/\S+', '', text) # Removing hyperlink
return text
Create a function to get the subjectivity
def getSubjectivity(text):
return TextBlob(text).sentiment.subjectivity
Create a function to get the polarity (how positive or negative the txt is)
def getPolarity(text):
return TextBlob(text).sentiment.polarity
#list of followers
name_list = ['_prashantnair','urxnlc', 'Gurmeet1018', 'arpit8691yahooc', 'bnirmaljain', 'anoldschoolboy', 'rrpatange']
In some versions I just call an excel - The full list is a few thousand per influencer
Create function to extract 100 tweets from the influencer with dates
def extract_followers (user):
results = []
posts = api.user_timeline(screen_name = user, count = 100, language = "en", tweet_mode="extended", include_rts = True)
for tweet in posts:
data = (
tweet.full_text,
tweet.created_at,
tweet.user.screen_name)
results.append(data)
cols = "Tweets Date screen_name".split()
global df
df = pd.DataFrame(results, columns=cols)
print("df original")
print (df)
for tweet in posts:
cleaned_text = cleanTxt(tweet.full_text)
with open('influencer.csv', 'a', newline= '') as f:
worksheet = csv.writer(f)
worksheet.writerow([str(tweet.user.screen_name), str(tweet.created_at), str(getSubjectivity(cleaned_text)), str(getPolarity(cleaned_text))])
print("Tweet Added")
Call extract tweets function
for user in name_list:
extract_followers(user)
Clean the tweets
df['Tweets'] = df['Tweets'].apply(cleanTxt)
Show the cleaned tweets
print('df cleaned')
print (df)
Create two new columns 'Subjectivity' & 'Polarity'
df['Subjectivity'] = df['Tweets'].apply(getSubjectivity)
df['Polarity'] = df['Tweets'].apply(getPolarity)
Show the new dataframe with columns 'Subjectivity' & 'Polarity'
print ("df with subjectivity")
print (df)
I'm building a program that collects a specified number of tweets(no specific hashtags, just random posts) from a specific country (based on co-ordinates) over the span of 1-2 months.
For example, I'm collecting 200 tweets/status updates from the United States which were posted anywhere between September and October.
The reason I'm doing this is because I want to gather these tweets and perform sentiment analysis on the to see whether or not the average tweet from a specified country is negative/positive.
The problem I'm having is that I don't know how to "filter" for random tweets/status updates because these kind of tweets don't have hashtags. Furthermore, I'm not sure if Twitter allows me to collect tweets which are 2 months old. Any suggestions?
code
import tweepy
from tweepy import OAuthHandler
import json
import datetime as dt
import time
import os
import sys
'''
I created a twitter account for anyone to use if they want to test the code!
I used Python 3 and tweepy version 3.5.0.
'''
def load_api():
''' Function that loads the twitter API after authorizing the user. '''
consumer_key = 'nn'
consumer_secret = 'nn'
access_token = 'nn'
access_secret = 'nnn'
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
# load the twitter API via tweepy
return tweepy.API(auth)
def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
''' Function that takes in a search string 'query', the maximum
number of tweets 'max_tweets', and the minimum (i.e., starting)
tweet id. It returns a list of tweepy.models.Status objects. '''
searched_tweets = []
while len(searched_tweets) < max_tweets:
remaining_tweets = max_tweets - len(searched_tweets)
try:
new_tweets = api.search(q=query, count=remaining_tweets,
since_id=str(since_id),
max_id=str(max_id-1))
# geocode=geocode)
print('found',len(new_tweets),'tweets')
if not new_tweets:
print('no tweets found')
break
searched_tweets.extend(new_tweets)
max_id = new_tweets[-1].id
except tweepy.TweepError:
print('exception raised, waiting 15 minutes')
print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
time.sleep(15*60)
break # stop the loop
return searched_tweets, max_id
def get_tweet_id(api, date='', days_ago=9, query='a'):
''' Function that gets the ID of a tweet. This ID can then be
used as a 'starting point' from which to search. The query is
required and has been set to a commonly used word by default.
The variable 'days_ago' has been initialized to the maximum
amount we are able to search back in time (9).'''
if date:
# return an ID from the start of the given day
td = date + dt.timedelta(days=1)
tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
tweet = api.search(q=query, count=1, until=tweet_date)
else:
# return an ID from __ days ago
td = dt.datetime.now() - dt.timedelta(days=days_ago)
tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
# get list of up to 10 tweets
tweet = api.search(q=query, count=10, until=tweet_date)
print('search limit (start/stop):',tweet[0].created_at)
# return the id of the first tweet in the list
return tweet[0].id
def write_tweets(tweets, filename):
''' Function that appends tweets to a file. '''
with open(filename, 'a') as f:
for tweet in tweets:
json.dump(tweet._json, f)
f.write('\n')
def main():
''' This is a script that continuously searches for tweets
that were created over a given number of days. The search
dates and search phrase can be changed below. '''
''' search variables: '''
search_phrases = ['#PythonPleaseWork']
time_limit = 1.0 # runtime limit in hours
max_tweets = 20 # number of tweets per search but it doesn't seem to be working
min_days_old, max_days_old = 1, 1 # search limits e.g., from 7 to 8
# gives current weekday from last week,
# min_days_old=0 will search from right now
USA = '39.8,-95.583068847656,2500km' # this geocode includes nearly all American
# states (and a large portion of Canada)
# but it still fetches from outside the USA
# loop over search items,
# creating a new file for each
for search_phrase in search_phrases:
print('Search phrase =', search_phrase)
''' other variables '''
name = search_phrase.split()[0]
json_file_root = name + '/' + name
os.makedirs(os.path.dirname(json_file_root), exist_ok=True)
read_IDs = False
# open a file in which to store the tweets
if max_days_old - min_days_old == 1:
d = dt.datetime.now() - dt.timedelta(days=min_days_old)
day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
else:
d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
json_file = json_file_root + '_' + day + '.json'
if os.path.isfile(json_file):
print('Appending tweets to file named: ',json_file)
read_IDs = True
# authorize and load the twitter API
api = load_api()
# set the 'starting point' ID for tweet collection
if read_IDs:
# open the json file and get the latest tweet ID
with open(json_file, 'r') as f:
lines = f.readlines()
max_id = json.loads(lines[-1])['id']
print('Searching from the bottom ID in file')
else:
# get the ID of a tweet that is min_days_old
if min_days_old == 0:
max_id = -1
else:
max_id = get_tweet_id(api, days_ago=(min_days_old-1))
# set the smallest ID to search for
since_id = get_tweet_id(api, days_ago=(max_days_old-1))
print('max id (starting point) =', max_id)
print('since id (ending point) =', since_id)
''' tweet gathering loop '''
start = dt.datetime.now()
end = start + dt.timedelta(hours=time_limit)
count, exitcount = 0, 0
while dt.datetime.now() < end:
count += 1
print('count =',count)
# collect tweets and update max_id
tweets, max_id = tweet_search(api, search_phrase, max_tweets,
max_id=max_id, since_id=since_id,
geocode=USA)
# write tweets to file in JSON format
if tweets:
write_tweets(tweets, json_file)
exitcount = 0
else:
exitcount += 1
if exitcount == 3:
if search_phrase == search_phrases[-1]:
sys.exit('Maximum number of empty tweet strings reached - exiting')
else:
print('Maximum number of empty tweet strings reached - breaking')
break
if __name__ == "__main__":
main()
You can not get 2 months historical data with Search API.
"The Twitter Search API searches against a sampling of recent Tweets published in the past 7 days.
Before getting involved, it’s important to know that the Search API is focused on relevance and not completeness. This means that some Tweets and users may be missing from search results."
https://developer.twitter.com/en/docs/tweets/search/overview/basic-search
You can use Streaming api with country filter and instead of hashtags you can use a few stop words. Example, for US you can use "the,and" , for France "le,la,et" etc.
In addition, it is not a good idea to share your access tokens.