I use Python 2 and Spark. I am following the instructions for counting words from Twitter at this link: https://github.com/Ruthvicp/CS5590_BigDataProgramming/wiki/Lab-Assignment-4----Spark-MLlib-classification-algorithms,-word-count-on-twitter-streaming
I have two files.
TSWordCount
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import desc
from collections import namedtuple
import os
os.environ["SPARK_HOME"] = "C:\\spark-2.3.1-bin-hadoop2.7\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\winutils\\"
def main():
    """Count the words received on localhost:5678 in 5-second batches and print them."""
    sc = SparkContext(appName="Countwords1234")
    ssc = StreamingContext(sc, 5)
    lines = ssc.socketTextStream("localhost", 5678)

    # Each result row is exposed as a (word, count) record.
    Tweet = namedtuple('Text', ("word", "count"))

    words = lines.flatMap(lambda text: text.split(" "))
    pairs = words.map(lambda word: (word, 1))
    totals = pairs.reduceByKey(lambda a, b: a + b)
    counts = totals.map(lambda rec: Tweet(rec[0], rec[1]))
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    main()
When I run this file it succeeds and the output is "Listening to port 5678". My second file is TwitterListener:
import findspark
findspark.init()
import pyspark
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json
import time
# Twitter API credentials (redacted here); replace them with the values
# from your own Twitter developer app.
consumer_key = '30f****'
consumer_secret = 'smu7B******'
access_token = '153*******'
access_secret = 'QIizsB***'
# Build the OAuth handler from the consumer key/secret and attach the
# access token/secret to it.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
class TweetsListener(StreamListener):
    """Forward the text of each streamed tweet to a connected client socket.

    Tweets are written one per line so that a line-oriented reader on the
    other end of the socket can split the byte stream back into tweets.
    """

    def __init__(self, csocket):
        # Run the base-class initialiser too, so the listener state tweepy
        # sets up itself is not skipped.
        super(TweetsListener, self).__init__()
        self.client_socket = csocket

    def on_data(self, data):
        """Send the tweet text found in one raw JSON message.

        Always returns True so the stream stays open, even when a message
        cannot be processed.
        """
        try:
            msg = json.loads(data)
            payload = msg['text'].encode('utf-8')
            print(payload)
            # sendall() keeps writing until every byte is out; the trailing
            # newline marks where this tweet ends for the receiver.
            self.client_socket.sendall(payload + b"\n")
        except Exception as e:
            # Covers messages without a 'text' key as well as socket errors.
            # Exception (not BaseException) lets Ctrl-C still stop the script.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the status passed in by the stream and keep it open."""
        print(status)
        return True
def sendData(c_socket):
    """Authenticate with Twitter and stream tweets matching 'fifa' to *c_socket*."""
    handler = OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_secret)
    tweet_listener = TweetsListener(c_socket)
    twitter_stream = Stream(handler, tweet_listener)
    twitter_stream.filter(track=['fifa'])
if __name__ == "__main__":
    host = "localhost"  # Only accept connections from this machine.
    port = 5678  # Port the client is expected to connect to.
    s = socket.socket()
    try:
        s.bind((host, port))
        print("Listening on port: %s" % str(port))
        s.listen(5)
        c, addr = s.accept()  # Blocks until a client connects.
        try:
            print("Received request from: " + str(addr))
            # Short pause before streaming -- presumably to give the client
            # time to finish starting up; confirm before changing it.
            time.sleep(5)
            sendData(c)
        finally:
            # Release the client connection even if streaming fails.
            c.close()
    finally:
        # Always free the listening port so the script can be restarted.
        s.close()
As you can see, the TwitterListener file listens on localhost:5678. In the TSWordCount file I use SparkContext(appName=""); I thought I should put the name of my Twitter app there, so I used Countwords1234. I then connect to the port with ssc.socketTextStream("localhost", 5678). But when I run TSWordCount I get an error saying:
Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by
I searched for the error and found a suggestion to use sc.stop(), so I put it after ssc.awaitTermination(), but it didn't work. What should I do now?
I found an answer: I replaced sc = SparkContext(appName="Countwords1234") with sc = SparkContext.getOrCreate() and everything worked. I still don't understand why, but at the end of the day the result is what matters, LOL.
Related
I was trying to retrieve tweets via tweepy API with the following code but the json dictionary that was retrieved had an error.
The Code:
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
import socket
import json
consumer_key="****"
consumer_secret="****"
access_token="****"
access_secret="****"
class TweetListener(Stream):
    """tweepy ``Stream`` subclass that relays tweet text to a client socket."""

    def __init__(self, *args, csocket):
        # Positional arguments are passed straight through to Stream; the
        # client socket has to be given by keyword.
        super().__init__(*args)
        self.client_socket = csocket

    def on_data(self, data):
        """Decode one raw message and forward its text; always returns True."""
        try:
            msg = json.loads(data)
            # NOTE: msg is *called* here rather than indexed; whatever that
            # raises is caught and printed by the handler below.
            print(msg('text').encode('utf=8'))
            self.client_socket.send(msg('text').encode('utf=8'))
            return True
        except BaseException as e:
            print('Error %s'%str(e))
            return True

    def on_error(self, status):
        """Print the status passed in by the stream and keep it open."""
        print(status)
        return True
def send_data(c_socket):
    """Open a Twitter stream filtered on 'ETH' that writes to *c_socket*."""
    credentials = (consumer_key, consumer_secret, access_token, access_secret)
    twtr_stream = TweetListener(*credentials, csocket=c_socket)
    twtr_stream.filter(track=['ETH'])
# Accept a single client on 127.0.0.1:5000 and stream tweets to it.
host, port = "127.0.0.1", 5000
s = socket.socket()
s.bind((host, port))
print("Active port %s" % str(port))
s.listen(5)

# accept() blocks until the client connects.
c, addr = s.accept()
print("request from addr " + str(addr))
send_data(c)
send_data(c) caused The Error:
Error 'dict' object is not callable
which kept on repeating.
I have another file that is associated with it; both scripts need to be run at the same time.
Code:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext(appName='StreamingTwitterAnalysis')
sc.setLogLevel("ERROR")

# 10-second batches read from the tweet-forwarding socket.
ssc = StreamingContext(sc, 10)
socket_stream = ssc.socketTextStream("127.0.0.1", 5000)
# Each result covers the last 20 seconds of input.
lines = socket_stream.window(20)

# Count every whitespace-separated token that starts with '#',
# case-insensitively.
hashtags = (
    lines.flatMap(lambda text: text.split(" "))
    .filter(lambda word: word.lower().startswith("#"))
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda a, b: a + b)
)

# Most frequent hashtag first; ties are ordered alphabetically. A single sort
# on a composite key is used because the result of two chained sorts does not
# reliably keep the order of the first one.
dstream = hashtags.transform(
    lambda rdd: rdd.sortBy(lambda pair: (-pair[1], pair[0]))
)
dstream.pprint()

ssc.start()
ssc.awaitTermination()
Notebook Snippet:
In line no. 17 of the code you uploaded on pastebin, you load a JSON object msg, which is presumably a dict:
msg = json.loads(data)
In line no. 18 you then call that object with a string parameter:
print(msg('text').encode('utf=8'))
Since dicts are not callable, you get the mentioned error.
So I presume you wanted to access a key by that name:
print(msg['text'].encode('utf=8'))
I am following a guide on how to stream data to Kafka with Python.
After making the modifications suggested by #MarkTolonen I am now getting the following error:
AttributeError: 'NoneType' object has no attribute 'encode'
The full code is as follows:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaClient
#from kafka import SimpleProducer
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""
#Configure Kafka
kafkaBrokers = '127.0.1.1:9092'
#producer = KafkaProducer(bootstrap_servers=kafkaBrokers,key_serializer=lambda k: k.encode('ascii','ignore'),value_serializer=lambda x: dumps(x).encode('utf-8'))
class StdOutListener(StreamListener):
    """Stream listener that hands each raw message to ``producer`` and prints it."""

    def on_data(self, data):
        """Send *data* (UTF-8 encoded) under the name "trump", echo it, return True."""
        # NOTE(review): `producer` is not assigned anywhere in this snippet --
        # the lines that would create it are commented out.
        producer.send("trump", data.encode('utf-8'))
        print (data)
        return True

    def on_error(self, status):
        """Print the status passed in by the stream."""
        print (status)
kafka = 'localhost:9092'
#kafka = KafkaClient("localhost:9092")
#producer = SimpleProducer(kafka)

# Wire the listener to an authenticated Twitter stream.
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)

# `track` must be a list of phrases: tweepy joins its items with commas, so a
# bare string would be sent as the single letters "t,r,u,m,p".
stream.filter(track=["trump"])
You need to consult the documentation to find out why data could ever be None, but you can work around it like so:
def on_data(self, data):
    """Forward non-empty *data* to the producer; return False when there is none."""
    # Guard clause: nothing was received, so there is nothing to encode.
    if not data:
        return False
    producer.send("trump", data.encode('utf-8'))
    print(data)
    return True
I have two programs that communicate via sockets. One is a tweepy StreamListener, in which I also preprocess the data with the library "tweet-preprocessor". The other program connects to that socket and analyzes the data with Spark Structured Streaming. The problem is that Spark doesn't receive any batches when I preprocess the data before sending it.
This is the StreamListener
import tweepy
import socket
import json
import preprocessor as p
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
class MyStreamListener(tweepy.StreamListener):
    """Clean the text of each streamed tweet and send it to a client socket."""

    def __init__(self, csocket):
        self.client_socket = csocket

    def on_data(self, raw_data):
        """Clean and forward the text of one raw JSON message; always returns True."""
        try:
            data = json.loads(raw_data)
            # p.clean() applies the options configured with p.set_options().
            clean_text = p.clean(data["text"])
            print(clean_text)
            # The encoded text is sent exactly as-is; nothing is appended to it.
            self.client_socket.send(clean_text.encode("utf-8"))
            return True
        except BaseException as e:
            print("Error: " + str(e))
            return True

    def on_error(self, status_code):
        """Print the status code passed in by the stream and keep it open."""
        print(status_code)
        return True
# Wait for one client on localhost:5555 before opening the Twitter stream.
host, port = "localhost", 5555
skt = socket.socket()
skt.bind((host, port))
skt.listen()
client, address = skt.accept()

# Every tweet received is handed to the listener, which writes to `client`.
myStreamListener = MyStreamListener(csocket=client)
myStream = tweepy.Stream(auth=auth, listener=myStreamListener)
myStream.filter(track=["Trump"], languages=["en"])
And simple Spark code:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, size
spark = SparkSession.builder.appName("TwitterSpark").getOrCreate()

# Read text from the socket at localhost:5555 as a streaming DataFrame.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 5555)
    .load()
)
#tweetlength = lines.select(
# size(split(lines.value, " ")).alias("tweetlength")
#)

# Print every update to the console and block until the query stops.
query = (
    lines.writeStream
    .outputMode("update")
    .format("console")
    .start()
)
query.awaitTermination()
Most likely clean_text does not have a new line character (\n) at the end. Unlike print(clean_text), which automatically adds a new line, socket.send() sends the bytes from clean_text.encode("utf-8") as-is and you need to add the \n explicitly:
self.client_socket.send((clean_text + "\n").encode("utf-8"))
With no \n to separate the lines in the socket data, Spark sees the input as one growing line, unless there are new lines in the tweet text itself.
Guys, I want to save Twitter user info such as name, statuses and tweets to a file (JSON or plain text preferred, but CSV or anything else is fine). I tried this code and some similar ones, but none of them work. Please have a look at the code below and suggest what changes I should make.
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import json
ckey = '**********'
consumer_secret = '**********'
access_token_key = '**********'
access_token_secret = '**********'
start_time = time.time() #grabs the system time
keyword_list = ['twitter'] #track list
#Listener Class Override
class listener(StreamListener):
    """Stream listener intended to save tweet data until a time limit is reached."""

    def __init__(self, start_time, time_limit=60):
        self.time = start_time    # reference timestamp the limit is measured from
        self.limit = time_limit   # seconds allowed after start_time

    def on_data(self, data):
        # Keep working while fewer than `limit` seconds have elapsed.
        while (time.time() - self.time) < self.limit:
            try:
                # NOTE: json.loads is subscripted here instead of being
                # called on `data`.
                all_data = json.loads["text"]
                username = all_data["user"]["name"]
                # NOTE: `all_date` is not defined in this snippet (the
                # variable assigned above is `all_data`).
                tweets = all_date["user"]["statuses"]
                saveFile = open('raw_tweets29.json', 'a')
                saveFile.write(username)
                saveFile.write('\n')
                saveFile.close()
                return True
            except BaseException, e:
                # Report the failure, wait 5 seconds, then loop again.
                print 'failed ondata,', str(e)
                time.sleep(5)
                pass
        # Reached once the time limit has passed.
        exit()

    def on_error(self, status):
        # NOTE: prints `statuses`, not the `status` argument.
        print statuses
# Authenticate, then stream tweets for the track list through a listener
# with a 20-second time limit.
auth = OAuthHandler(ckey, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)
tweet_listener = listener(start_time, time_limit=20)
twitterStream = Stream(auth, tweet_listener)
twitterStream.filter(track=['twitter'])
When I run the above code it gives me this error:
failed ondata, 'function' object has no attribute '__getitem__'
I would greatly appreciate any help you can give me in working this problem
I was making a mistake; I have now figured out that there is no need for the temporary variable 'text' - what I need to do is load the actual data.
One more thing that is required is encoding.
Thanks, everyone, for your time.
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os,sys
import json
ckey = '***'
consumer_secret = '***'
access_token_key = '***'
access_token_secret = '***'
start_time = time.time()
class listener(StreamListener):
    """Append the text of streamed tweets to a file until a time limit is reached."""

    def __init__(self, start_time, time_limit=300):
        # Initialise the tweepy base class as well as our own state.
        super(listener, self).__init__()
        self.time = start_time    # reference timestamp the limit is measured from
        self.limit = time_limit   # seconds allowed after start_time

    def on_data(self, data):
        """Save the text of one tweet to 'user_tweets29.json'.

        Returns True to keep streaming, or False once ``time_limit`` seconds
        have passed since ``start_time`` so that the stream is shut down
        cleanly instead of killing the interpreter from inside the callback.
        """
        if (time.time() - self.time) >= self.limit:
            return False
        try:
            tweet = json.loads(data)
            text = tweet['text']
            # Binary append mode takes the UTF-8 bytes unchanged; `with`
            # closes the file even if the write fails.
            with open('user_tweets29.json', 'ab') as save_file:
                save_file.write(text.encode('utf8') + b'\n')
        except Exception as e:
            # A message that cannot be saved is reported and skipped; retrying
            # the same message would only stall the stream.
            print('failed ondata, ' + str(e))
        return True

    def on_error(self, status):
        """Print the status passed in by the stream."""
        print(status)
# Authenticate, then stream tweets that match 'twitter' through a listener
# with a 60-second time limit.
auth = OAuthHandler(ckey, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)
tweet_listener = listener(start_time, time_limit=60)
twitterStream = Stream(auth, tweet_listener)
twitterStream.filter(track=['twitter'])
I'm trying to make a Python server that I can call from other applications to request Twitter data. I usually work with Python as a scripting language, so if there are any red flags anyone sees in my code, I'm all ears!
This is basically what I have so far, which works well when I ping the server, it gets 10 tweets from my timeline and sends them back to my other applications. My main issue is that I'd like to combine streaming and searching. That way I can have the stream open for a specific hash tag that I'd like to have sent to my other applications in real-time, but then I'd periodically search for other things that don't need to be coming down to me in real-time.
I've had success using both separately, but not sure where to start if I wanted to implement both, which in this case I'd like to bring the stream functionality into this.
I'm using Python Twitter Tools 1.10.2 - http://mike.verdone.ca/twitter/
and Python 3.3
Code below, thanks!
EDIT:I was able to get a step further by adding the twitter streaming connection after the if data == "SEARCH_NOW" if statement. But this brings up the original issue I was having. Once the twitter stream is open, the code seems to just wait there. If i put it before timeline lookup, then I can never call the timeline lookup. Updated code to reflect.
EDIT 2: Putting the search request inside of the twitter stream loop gets a little closer. I can now have the stream open and every time I get a tweet that matches the search term, then I can also do a request. But still not independently...
File: network_settings.py
#!/usr/bin/env python
#network settings
import socket
#set server variables: address/port to listen on and the maximum number of
#bytes read per recv() call
TCP_IP = '127.0.0.1'
TCP_PORT = 7001
BUFFER_SIZE = 20
#create the listening TCP socket; this runs at import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((TCP_IP, TCP_PORT))
s.listen(1)
#block until one client connects; conn is the socket used to talk to it
conn, addr = s.accept()
#print connection address when someone connects
print ('Connection address:', addr)
File: twitter_settings.py
from twitter import *
import re
#Twitter API credentials; left empty here, fill them in before running
OAUTH_TOKEN = ''
OAUTH_SECRET = ''
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
#client used by the server for the home-timeline lookup
t = Twitter(auth=OAuth(OAUTH_TOKEN, OAUTH_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
#second OAuth object, passed to the streaming client
auth = OAuth(OAUTH_TOKEN, OAUTH_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
stream = TwitterStream(auth = auth, secure = True)
File: python_server.py
#python server
import json
from network_settings import *
from twitter_settings import *
# NOTE(review): the indentation of this script was lost; the nesting below is
# reconstructed from the accompanying description ("putting the search request
# inside of the twitter stream loop") and should be confirmed.
search_term = 'test'
while 1:
    # Open the filtered stream; iterating over it waits for each new message.
    tweet_iter = stream.statuses.filter(track = search_term)
    for tweet in tweet_iter:
        # check whether this is a valid tweet
        if tweet.get('text'):
            userName = tweet["user"]["screen_name"]
            userTweet = tweet["text"]
            # now print our tweet
            print ('user: ', userName)
            print ('tweet: ', userTweet)
            #send data back
            delivery1 = json.dumps({'type':'showdown','userName':userName,'userTweet':userTweet})
            conn.send(delivery1.encode('utf-8'))
            # Read at most BUFFER_SIZE bytes from the client; this only runs
            # after a streamed tweet has been handled.
            data = conn.recv(BUFFER_SIZE)
            data = data.decode('utf-8')
            if data == "SEARCH_NOW":
                print ('request newest IDS tweets')
                x = t.statuses.home_timeline(count=10)
                for i in range(10):
                    try:
                        #print(x[i])
                        # Uses the first user mention of the tweet as the name.
                        userName = x[i]['entities']['user_mentions'][0]['screen_name']
                        userTweet = x[i]['text']
                        print('username: ', userName)
                        print('tweet: ', userTweet)
                        delivery = json.dumps({'type':'display','userName':userName,'userTweet':userTweet})
                        conn.send(delivery.encode('utf-8'))
                    except:
                        # Any failure above (including a tweet with no user
                        # mention) ends up here.
                        print('not valid tweet')
# Only reachable if the `while 1` loop above ever ends.
conn.close()
So I finally figured out a solution for this. I ended up using threading to run the stream in its own thread, and I open another thread every time I do a search. I'm not sure whether I need to close each thread or whether the return takes care of that. If anyone sees anything they think could be improved, I'm all ears!
Code below:
#!/usr/bin/env python
#python server
import json
import threading
import time
import socket
from twitter import *
import re
#lock shared by the worker threads (acquired in pythonSearch)
thread_lock = threading.Lock()
#set server variables: address/port to listen on and the maximum number of
#bytes read per recv() call in the main loop
TCP_IP = '127.0.0.1'
TCP_PORT = 7001
BUFFER_SIZE = 20
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((TCP_IP, TCP_PORT))
s.listen(1)
#block until one client connects; conn is then used for all traffic
conn, addr = s.accept()
#print connection address when someone connects
print ('Connection address:', addr)
#fill these in your app!
#twitter auth keys
OAUTH_TOKEN = ''
OAUTH_SECRET = ''
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
#client used by pythonSearch for the home-timeline lookup
t = Twitter(auth=OAuth(OAUTH_TOKEN, OAUTH_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
#second OAuth object, passed to the streaming client used by pythonStream
auth = OAuth(OAUTH_TOKEN, OAUTH_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
stream = TwitterStream(auth = auth, secure = True)
#twitter functions
def pythonSearch():
    """Fetch up to 10 home-timeline tweets and send each one to the client.

    For every tweet that mentions at least one user, a JSON message of type
    'display' with the first mentioned screen name and the tweet text is
    written to ``conn``. ``thread_lock`` is held for the whole batch so the
    results are sent as one uninterrupted group.
    """
    # The context manager releases the lock even if the API call or a send
    # raises, so a failed search can no longer block every later one.
    with thread_lock:
        print('request newest tweets')
        timeline = t.statuses.home_timeline(count=10)
        # Walk over whatever was returned instead of assuming 10 entries.
        for status in timeline:
            try:
                userName = status['entities']['user_mentions'][0]['screen_name']
                userTweet = status['text']
            except (KeyError, IndexError):
                # No user mention (or an expected field is missing): skip it.
                print('not valid tweet')
                continue
            print('username: ', userName)
            print('tweet: ', userTweet)
            delivery = json.dumps({'type': 'display', 'userName': userName, 'userTweet': userTweet})
            # sendall() keeps writing until the whole message is out.
            conn.sendall(delivery.encode('utf-8'))
def pythonStream():
    """Stream tweets matching 'TESTING' and forward each one to the client.

    Every message that carries tweet text is sent to ``conn`` as a JSON
    message of type 'showdown' with the author's screen name and the text.
    """
    search_term = 'TESTING'
    tweet_iter = stream.statuses.filter(track=search_term)
    for tweet in tweet_iter:
        # Skip messages that carry no tweet text.
        if not tweet.get('text'):
            continue
        userName = tweet["user"]["screen_name"]
        userTweet = tweet["text"]
        print('user: ', userName)
        print('tweet: ', userTweet)
        delivery1 = json.dumps({'type': 'showdown', 'userName': userName, 'userTweet': userTweet})
        # Take the same lock pythonSearch holds while it sends, so a streamed
        # tweet is never written to the socket in the middle of a batch of
        # search results.
        with thread_lock:
            conn.sendall(delivery1.encode('utf-8'))
#start main loop
try:
    while True:
        #listen for calls
        data = conn.recv(BUFFER_SIZE)
        # recv() returns b'' once the client has closed the connection;
        # without this check the loop would spin forever on a dead socket.
        if not data:
            break
        data = data.decode('utf-8')
        #if someone calls search, do a search
        if data == 'SEARCH':
            threading.Thread(target=pythonSearch).start()
        elif data == 'STREAM':
            threading.Thread(target=pythonStream).start()
finally:
    # Runs on disconnect or on any error, so both sockets are released.
    conn.close()
    s.close()