Twitter stream not delivering to S3 in json format - python

I'm trying to stream Twitter data into AWS S3, however the data delivered by the stream is not in JSON format.
The S3 bucket does not use any compression and Firehose does not apply any data transform. I'm getting the tweets, but what lands in S3 is not JSON. I need to load the data into a pandas DataFrame.
class TwitterStreamer():
    """
    Class for streaming and processing fetched tweets
    """
    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        # Handles Twitter authentication and the connection to the streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(token_key, token_secret)
        stream = Stream(auth, listener)
        # Filter the stream
        stream.filter(track=hash_tag_list, languages=['en'])

# Listener class that prints received tweets - inherits from StreamListener
class TwitterListener(StreamListener):
    """
    This is a basic listener class that just prints received tweets
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        tweet = json.loads(data)
        if 'text' in tweet.keys():
            message_lst = [tweet['text'].replace('\n', ' ').replace('\r', ' '),
                           str(tweet['created_at']), '\n']
            message = '\t'.join(message_lst)
            print(message)
            response = client.put_record(DeliveryStreamName=DeliveryStreamName,
                                         Record={'Data': message})
            print('Status: ' + str(response['ResponseMetadata']['HTTPStatusCode']))
        try:
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(message)
            return True
        except BaseException as e:
            print('error:', e)
            return True

    def on_error(self, status):
        print(status)

if __name__ == "__main__":
    client = boto3.client('firehose',
                          region_name='us-east-1',
                          aws_access_key_id='',
                          aws_secret_access_key='')
    hash_tag_list = ['omicron']
    fetched_tweets_filename = 'tweets.json'
    DeliveryStreamName = 'twitter-delivery-stream'
    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)
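The records show up in S3 exactly as they are handed to put_record, so sending the tab-joined string above is what produces the non-JSON output. A minimal sketch of the change, assuming the same client and DeliveryStreamName as in the question (forward_tweet is a hypothetical helper): serialize the fields back to JSON and append a newline, so each S3 object becomes newline-delimited JSON that pandas can read directly.

import json
import pandas as pd

def forward_tweet(client, delivery_stream_name, tweet):
    # Send one JSON object per record; the trailing newline keeps records
    # separated when Firehose concatenates them into a single S3 object.
    record = {'text': tweet['text'], 'created_at': tweet['created_at']}
    client.put_record(DeliveryStreamName=delivery_stream_name,
                      Record={'Data': json.dumps(record) + '\n'})

# Once delivered, each S3 object is newline-delimited JSON
# (reading an s3:// path directly requires s3fs):
# df = pd.read_json('s3://your-bucket/path/to/object', lines=True)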

Related

How to get Tweets of a Keyword

I'm trying to get tweets for a certain keyword, 'comfama', but I can't seem to get any results. Is something wrong with my code? I tried with 'donald trump' and that keyword shows results, but with 'comfama' nothing happens.
import tweepy
import pandas
import json  # The API returns JSON formatted text

TRACKING_KEYWORDS = ['comfama']
OUTPUT_FILE = "comfama_tweets.txt"
TWEETS_TO_CAPTURE = 10

access_token = "xxx"
access_token_secret = "xxx"
consumer_key = "xxx"
consumer_secret = "xxx"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

class MyStreamListener(tweepy.StreamListener):
    """
    Twitter listener, collects streaming tweets and outputs them to a file
    """
    def __init__(self, api=None):
        super(MyStreamListener, self).__init__()
        self.num_tweets = 0
        self.file = open(OUTPUT_FILE, "w")

    def on_status(self, status):
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1
        # Stops streaming when it reaches the limit
        if self.num_tweets <= TWEETS_TO_CAPTURE:
            if self.num_tweets % 100 == 0:  # just to see some progress...
                print('Number of tweets captured so far: {}'.format(self.num_tweets))
            return True
        else:
            return False
        self.file.close()

    def on_error(self, status):
        print(status)

# Initialize Stream listener
l = MyStreamListener()
# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)
# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=[TRACKING_KEYWORDS])
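One thing worth checking, as an aside: TRACKING_KEYWORDS is already a list, so wrapping it again passes a nested list to filter, which expects a flat list of phrase strings. A minimal sketch of the call (the low volume of a niche keyword like 'comfama' can also simply mean long waits between matching tweets):

# track expects a flat list of keyword/phrase strings
stream.filter(track=TRACKING_KEYWORDS)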

Tweepy streaming in MongoDB with full_text

I need help developing Python code that works with the Tweepy library and MongoDB. I cannot stream tweets with their full text, and I don't know how to adapt this code for that purpose.
Please look at this code:
keywords = ['trump']
language = ['en']
analyzer = SentimentIntensityAnalyzer()

class StdOutListener(StreamListener):
    def on_status(self, status):
        if hasattr(self, status):
            try:
                tweet = status.retweted_status.extended_tweet["full_text"]
            except:
                tweet = status.retweeted_status.text
        else:
            try:
                tweet = status.extended_tweet["full_text"]
            except AttributeError:
                tweet = status.text

    def on_data(self, data):
        t = json.loads(data)
        tweet_id = t['id_str']
        username = t['user']['screen_name']
        followers = t['user']['followers_count']
        tweet = unidecode(t['text'])
        text = t['full_text']
        hashtags = t['entities']['hashtags']
        dt = t['created_at']
        language = t['lang']
        blob = analyzer.polarity_scores(tweet)
        sentiment = blob['compound']
        created = datetime.datetime.strptime(dt, '%a %b %d %H:%M:%S +0000 %Y')
        tweet = {'id': tweet_id, 'username': username, 'followers': followers, 'text': text, 'hashtags': hashtags, 'language': language, 'created': created, 'sentiment': sentiment}
        print(username + ':' + ' ' + text)
        return True

    def on_error(self, status_code):
        if status_code == 420:
            print(status)
            return False

if __name__ == '__main__':
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l, tweet_mode="extended")
    tweets = stream.filter(track=keywords, languages=language)
I've tried some quick fixes, for example replacing text with full_text, but it doesn't work. I hope you can help me.
Thanks for your time!
if hasattr(self, status): will raise a TypeError, as hasattr expects the attribute name as a string (and the object being checked here should be status, not self).
You should refer to Tweepy's documentation on extended tweets, which has an example very similar to what I think you're trying to do.
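A minimal sketch of that documented pattern, assuming Tweepy 3.x. Note that because on_data is also overridden above, the custom on_status is never actually invoked, so the full-text logic has to live in whichever handler the stream really calls:

from tweepy.streaming import StreamListener

class StdOutListener(StreamListener):
    def on_status(self, status):
        # Retweets carry the full text on the retweeted status
        if hasattr(status, 'retweeted_status'):
            try:
                text = status.retweeted_status.extended_tweet['full_text']
            except AttributeError:
                text = status.retweeted_status.text
        else:
            try:
                text = status.extended_tweet['full_text']
            except AttributeError:
                text = status.text
        print(text)
        # text can now go into the MongoDB document built in on_data above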

How to accept twitter stream using tweepy in streamparse spout and pass the tweets to bolt?

Recently I started working on Storm and, being more comfortable with Python, I decided to use streamparse. I am planning to accept a Twitter stream in the spout and perform some computations in the bolt, but I cannot figure out how to code that in the spout. I have gone through various streamparse tutorials, but they all show a spout emitting tuples from a static list rather than a continuous stream like the one the Twitter streaming API provides.
This is my code for Storm:
class WordSpout(Spout):
    def initialize(self, stormconf, context):
        self.words = itertools.cycle(['dog', 'cat', 'zebra', 'elephant'])

    def next_tuple(self):
        word = next(self.words)
        self.emit([word])
This is my code for tweepy:
class listener(StreamListener):
    def on_status(self, status):
        print(status.text)
        print "--------------------------------"
        return(True)

    def on_error(self, status):
        print "error"

    def on_connect(self):
        print "CONNECTED"

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=["california"])
How should I integrate these two pieces of code?
To do this, I set up a Kafka queue: the Tweepy listener writes status.text into the queue using pykafka, and the spout continuously reads from the queue to perform the analytics. My code looks a bit like this:
listener.py:
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        client = KafkaClient(hosts='127.0.0.1:9092')
        topic = client.topics[str('tweets')]
        with topic.get_producer(delivery_reports=False) as producer:
            sentence = status.text
            for word in sentence.split(" "):
                if word is None:
                    continue
                try:
                    word = str(word)
                    producer.produce(word)
                except:
                    continue

    def on_error(self, status_code):
        if status_code == 420:  # exceeded the rate limit
            return False
        else:
            print("Failing with status code " + str(status_code))
            return False

auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
myStream.filter(track=['is'])
Spout File:
from streamparse.spout import Spout
from pykafka import KafkaClient

class TweetSpout(Spout):
    words = []

    def initialize(self, stormconf, context):
        client = KafkaClient(hosts='127.0.0.1:9092')
        self.topic = client.topics[str('tweets')]

    def next_tuple(self):
        consumer = self.topic.get_simple_consumer()
        for message in consumer:
            if message is not None:
                self.emit([message.value])
            else:
                self.emit()
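One refinement worth considering, as a sketch under the same pykafka setup (host and topic name taken from the answer above): create the consumer once in initialize and pull at most one message per next_tuple call, so the spout does not rebuild a consumer and loop over the whole topic on every invocation.

from streamparse.spout import Spout
from pykafka import KafkaClient

class TweetSpout(Spout):
    def initialize(self, stormconf, context):
        client = KafkaClient(hosts='127.0.0.1:9092')
        topic = client.topics[str('tweets')]
        # One consumer for the lifetime of the spout; the timeout makes
        # consume() return None instead of blocking when the topic is idle.
        self.consumer = topic.get_simple_consumer(consumer_timeout_ms=100)

    def next_tuple(self):
        message = self.consumer.consume()
        if message is not None:
            self.emit([message.value])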

save twitter user information into a file using StreamListener

I want to save Twitter user info such as the name, status count, and tweet text to a file (JSON, TXT, CSV, or any other JSON or text format is preferred). I tried this code and some similar variants, but none of them work. Please have a look at the code below and suggest what changes I should make.
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import json

ckey = '**********'
consumer_secret = '**********'
access_token_key = '**********'
access_token_secret = '**********'

start_time = time.time()  # grabs the system time
keyword_list = ['twitter']  # track list

# Listener class override
class listener(StreamListener):
    def __init__(self, start_time, time_limit=60):
        self.time = start_time
        self.limit = time_limit

    def on_data(self, data):
        while (time.time() - self.time) < self.limit:
            try:
                all_data = json.loads["text"]
                username = all_data["user"]["name"]
                tweets = all_date["user"]["statuses"]
                saveFile = open('raw_tweets29.json', 'a')
                saveFile.write(username)
                saveFile.write('\n')
                saveFile.close()
                return True
            except BaseException, e:
                print 'failed ondata,', str(e)
                time.sleep(5)
                pass
        exit()

    def on_error(self, status):
        print statuses

auth = OAuthHandler(ckey, consumer_secret)  # OAuth object
auth.set_access_token(access_token_key, access_token_secret)

twitterStream = Stream(auth, listener(start_time, time_limit=20))
twitterStream.filter(track=['twitter'])
When I run the code above, it gives me this error:
failed ondata, 'function' object has no attribute '__getitem__'
I would greatly appreciate any help you can give me with this problem.
I was making a mistake and have now figured it out: there is no need for the "text" subscript on json.loads; what I need to do is load the actual data first.
One more thing that is required is encoding the text before writing it.
Thanks, everyone, for your time.
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os, sys
import json

ckey = '***'
consumer_secret = '***'
access_token_key = '***'
access_token_secret = '***'

start_time = time.time()

class listener(StreamListener):
    def __init__(self, start_time, time_limit=300):
        self.time = start_time
        self.limit = time_limit

    def on_data(self, data):
        while (time.time() - self.time) < self.limit:
            try:
                tweet = json.loads(data)
                user_name = tweet['user']['name']
                tweet_count = tweet['user']['statuses_count']
                text = tweet['text']
                saveFile = open('user_tweets29.json', 'a')
                saveFile.write(text.encode('utf8'))
                saveFile.write('\n')
                saveFile.close()
                return True
            except BaseException, e:
                print 'failed ondata,', str(e)
                time.sleep(5)
                pass
        exit()

    def on_error(self, status):
        print status

auth = OAuthHandler(ckey, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)

twitterStream = Stream(auth, listener(start_time, time_limit=60))
twitterStream.filter(track=['twitter'])
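If the goal is to keep the user name and status count as well as the tweet text, one option is to write one JSON object per line instead of only the text. A minimal sketch of such a listener, reusing the same imports and time-limit pattern as above (JSONLineListener and the output file name are just illustrative):

class JSONLineListener(StreamListener):
    def __init__(self, start_time, time_limit=60):
        self.time = start_time
        self.limit = time_limit

    def on_data(self, data):
        if (time.time() - self.time) < self.limit:
            tweet = json.loads(data)
            record = {
                'user_name': tweet['user']['name'],
                'statuses_count': tweet['user']['statuses_count'],
                'text': tweet['text'],
            }
            # One JSON object per line keeps the file easy to parse later
            with open('user_tweets.json', 'a') as save_file:
                save_file.write(json.dumps(record) + '\n')
            return True
        return False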

Tweepy odd streaming error - python

I am attempting to make a script that searches the user timeline and then favorites tweets. For some reason, it isn't working.
I wrote this code:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import *
import tweepy, json

class StdOutListener(StreamListener):
    def on_data(self, data):
        data = json.loads(data)
        try:
            api.create_favorite(data[id])
        except:
            pass
        print 'Favoriting tweet id ' + data[id] + ' in twitter timeline...'
        return True

    def on_error(self, status):
        print status

l = StdOutListener()
auth = tweepy.OAuthHandler('x', 'x')
auth.set_access_token('x-x', 'x')
api = tweepy.API(auth)

stream = Stream(auth, l)
userz = api.followers_ids(screen_name='smileytechguy')
keywords = ['ebook', 'bot']
stream.filter(track=keywords, follow=userz)
But I am getting this error message:
Traceback (most recent call last):
File "FavTL.py", line 27, in <module>
stream.filter(track=keywords, follow=userz)
File "build\bdist.win-amd64\egg\tweepy\streaming.py", line 310, in filter
AttributeError: 'long' object has no attribute 'encode'
Any idea how I can fix it?
This code should work. Don't forget to enable write permissions on your API keys.
import time
import json
import tweepy
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

consumer_key = '..'
consumer_secret = '..'
access_token = '..'
access_secret = '..'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)

class StdOutListener(StreamListener):
    def on_data(self, data):
        # Twitter returns data in JSON format - we need to decode it first
        decoded = json.loads(data)
        tweet_id = decoded['id']
        api.create_favorite(tweet_id)
        print 'Favoriting tweet id ' + str(tweet_id) + ' in twitter timeline...'
        time.sleep(65)
        return True

    def on_error(self, status):
        if status == 420:
            print "Twitter is limiting this account."
        else:
            print "Error Status " + str(status)

l = StdOutListener()
stream = Stream(auth, l)
userz = api.followers_ids('smileytechguy')
keywords = ['ebook', 'bot']
stream.filter(track=keywords, follow=str(userz))
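As a side note on the original traceback: filter's follow parameter expects an iterable of user ID strings, while api.followers_ids returns numeric IDs, which is what triggers 'long' object has no attribute 'encode'. A minimal sketch of the conversion:

userz = api.followers_ids(screen_name='smileytechguy')
# Stream.filter encodes each ID, so the IDs have to be passed as strings
stream.filter(track=keywords, follow=[str(user_id) for user_id in userz])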
