I use tweepy to stream tweets filtered by location coordinates. I've been streaming tweets from two different neighborhoods in a city and found that the tweets are the same despite the difference in coordinates (the areas are non-overlapping).
Is there a minimum resolution of the coordinates down to which the Twitter API can filter tweets?
Here's sample code for a streamer (with sample locations):
import handler  # local helper module that writes tweet data to file
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from tweepy import API

ckey = 'blahblah..'
csecret = 'blah blah...'
atoken = 'blah blah...'
asecret = 'blah blah...'

# Bounding boxes as [west_lon, south_lat, east_lon, north_lat]
Loc1 = [-70.981963, 41.591322, -70.89941601, 41.75938768]
Loc2 = [-70.919537579, 41.608616525, -70.905971579, 41.617116525]
Loc3 = [-70.92909611, 41.621725545, -70.92084611, 41.632153545]

class listener(StreamListener):
    def on_status(self, status):
        try:
            x = handler.editor('file.csv', status)  # used to save some data to file
            print x.encode("utf-8")
            return True
        except BaseException, e:
            print 'Exception: ', str(e)

    def on_error(self, status):
        print status

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(locations=Loc1)
I run several instances of this code with different authorization details and locations. In this case Loc2 and Loc3 are neighborhoods in Loc1.
I think the resolution is in increments of 0.1 degree; see https://dev.twitter.com/docs/api/1.1/post/statuses/filter.
You will have to take all tweets from the larger region and filter them yourself by their coordinates.
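A rough sketch of that client-side filtering, assuming you have the tweet as a decoded JSON dict (e.g. status._json in an on_status handler, or json.loads(data) in on_data); the helper name in_bbox is made up for illustration:

def in_bbox(tweet, bbox):
    # bbox uses the same order as the streaming API:
    # [west_lon, south_lat, east_lon, north_lat]
    coords = tweet.get('coordinates')
    if coords is None:
        return False
    lon, lat = coords['coordinates']  # GeoJSON order is [longitude, latitude]
    return bbox[0] <= lon <= bbox[2] and bbox[1] <= lat <= bbox[3]

# e.g. stream on Loc1 and keep only tweets that actually fall inside Loc2:
# if in_bbox(status._json, Loc2): ...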
Related
Looking for some assistance with tweepy, please.
I'm attempting to remove retweeted entries from my stream but am not having much luck. The script I've made attempts to stream tweets from a particular user (in this example @Olympics) but ends up getting mainly retweets.
Code is:
#!/usr/bin/env python
from tweepy import Stream, OAuthHandler
from tweepy.streaming import StreamListener
import time
import json

# Add your own
access_token = ''
access_token_secret = ''
consumer_key = ''
consumer_secret = ''

class listener(StreamListener):
    def on_data(self, data):
        # Twitter returns data in JSON format - we need to decode it first
        tweet = json.loads(data)
        print('#%s: %s' % (tweet['user']['screen_name'], tweet['text'].encode('ascii', 'ignore')))
        with open('fetched_tweets.csv', 'a') as tf:
            tf.write('%d,\"%s\",%s,%d,%d,\"%s\"\n' % (tweet['id'], tweet['created_at'], tweet['user']['screen_name'],
                                                      tweet['user']['followers_count'], tweet['user']['friends_count'],
                                                      tweet['text'].encode('ascii', 'ignore')  # .replace('\n',' ', 100).replace(',',' ', 100)
                                                      ))
        return True

    def on_error(self, status_code):
        print(status_code)

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
twitterStream = Stream(auth, listener())
twitterStream.filter(follow=["52422878"], is_async=True)
Sample output is:
#FanaiMahriati: b'#Olympics #ittfworld '
#ArnHub: b'RT #Olympics: Tom Dumoulin of #NED takes #silver in the #CyclingRoad mens individual time trial.\n\n#StrongerTogether | #Tokyo2020 | #UCI_cy'
#ftw_cdhn: b'RT #Olympics: Its #gold for #rogla of #SLO in the #CyclingRoad mens individual time trial!\n\n#StrongerTogether | #Tokyo2020 | #UCI_cycling'
#ftw_cdhn: b'RT #Olympics: Tom Dumoulin of #NED takes #silver in the #CyclingRoad mens individual time trial.\n\n#StrongerTogether | #Tokyo2020 | #UCI_cy'
#Real_rafel20: b'RT #Olympics: Tom Dumoulin of #NED takes #silver in the #CyclingRoad mens individual time trial.\n\n#StrongerTogether | #Tokyo2020 | #UCI_cy'
#tezol_mutlu: b'#Olympics Bravo Tom, Bravo Dumoulin. Congratulation.'
Any help and suggestions are much appreciated! Thanks in advance!
Just check whether the tweet object contains a retweeted_status entry. Twitter only includes that field on retweets, so testing for the key's presence filters them out:
class listener(StreamListener):
    def on_data(self, data):
        tweet = json.loads(data)
        # Only retweets carry the 'retweeted_status' key, so its
        # absence means the tweet is an original.
        if 'retweeted_status' not in tweet:
            print('#%s: %s' % (tweet['user']['screen_name'], tweet['text'].encode('ascii', 'ignore')))
            with open('fetched_tweets.csv', 'a') as tf:
                tf.write('%d,\"%s\",%s,%d,%d,\"%s\"\n' % (tweet['id'], tweet['created_at'], tweet['user']['screen_name'],
                                                          tweet['user']['followers_count'], tweet['user']['friends_count'],
                                                          tweet['text'].encode('ascii', 'ignore')  # .replace('\n',' ', 100).replace(',',' ', 100)
                                                          ))
        return True
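As a side note (an extra filter, not asked for in the question): quote tweets carry a quoted_status key and can be skipped the same way.

# Keep only tweets that are neither retweets nor quote tweets
# (assumes `tweet` is the dict decoded in on_data above):
if 'retweeted_status' not in tweet and 'quoted_status' not in tweet:
    pass  # handle the original tweet here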
I am working with Twitter's API and tweepy in the hopes of scraping available geolocation coordinates from Tweets. My end goal is to store only the coordinates of each Tweet in a table.
My issue is that when a Tweet does include location data, I get back more information than just the coordinates:
My code thus far is as follows:
import pandas as pd
import json
import tweepy
import csv

class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        if status.retweeted:
            return
        coords = status.coordinates
        geo = status.geo
        if geo is not None:
            geo = json.dumps(geo)
        if coords is not None:
            coords = json.dumps(coords)
        print(coords, geo)
        with open('coordinates_data.csv', 'a') as f:
            writer = csv.writer(f)
            writer.writerow([coords, geo])

    def on_error(self, status_code):
        if status_code == 420:
            # returning False in on_error disconnects the stream
            return False

LOCATIONS = [-124.7771694, 24.520833, -66.947028, 49.384472,   # Contiguous US
             -164.639405, 58.806859, -144.152365, 71.76871,    # Alaska
             -160.161542, 18.776344, -154.641396, 22.878623]   # Hawaii

auth = tweepy.OAuthHandler('access auths', 'access auths')
auth.set_access_token('token', 'token')
api = tweepy.API(auth)
user = api.me()

myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
myStream.filter(locations=LOCATIONS)
I'm sure this issue relates to my lack of 'json' understanding, or that I need to use a data dictionary.
I would appreciate any help!
Just to clarify, Tweepy is a third-party library that interfaces with Twitter's API.
That's just how Twitter represents coordinates objects. Tweepy parses the coordinates attribute of the Status/Tweet object as the dictionary it is, so you can access its coordinates field as a dictionary key to get the [longitude, latitude] list.
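For example, a minimal sketch inside on_status (variable names are illustrative):

def on_status(self, status):
    if status.coordinates is not None:
        # GeoJSON point: {'type': 'Point', 'coordinates': [lon, lat]}
        lon, lat = status.coordinates['coordinates']
        print(lon, lat)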
You also have a missing quotation mark, ', where you initialize auth, but I assume that's a typo from when you replaced your credentials for this question.
I am trying to extract tweet locations from a specific area with Python using tweepy, and write them to a CSV file.
I am not very experienced with Python, but I managed to put together the following script, which kind of works:
import json
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

# Enter Twitter API key information
consumer_key = 'cons_key'
consumer_secret = 'cons_secret'
access_token = 'acc_token'
access_secret = 'acc-secret'

file = open(r"C:\Python27\Output2.csv", "w")
file.write("X,Y\n")
data_list = []
count = 0

class listener(StreamListener):
    def on_data(self, data):
        global count
        # How many tweets you want to find; could change to time-based
        if count <= 100:
            json_data = json.loads(data)
            coords = json_data["coordinates"]
            if coords is not None:
                print coords["coordinates"]
                lon = coords["coordinates"][0]
                lat = coords["coordinates"][1]
                data_list.append(json_data)
                file.write(str(lon) + ",")
                file.write(str(lat) + "\n")
                count += 1
            return True
        else:
            file.close()
            return False

    def on_error(self, status):
        print status

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
twitterStream = Stream(auth, listener())
# What you want to search for here
twitterStream.filter(locations=[11.01, 47.85, 12.09, 48.43])
The problem is that it extracts the coordinates very slowly (around 10 entries per 30 minutes). Is there a way to make this faster?
How can I add the timestamp for each tweet?
Is there a way to make sure I retrieve all possible tweets for the specific region (I guess the max is all tweets from the past week)?
Thanks very much in advance!
Twitter’s standard streaming API provides a 1% sample of all the Tweets posted. In addition, very few Tweets have location data added to them. So, I’m not surprised that you’re only getting a small number of Tweets in a 30 minute timespan for one specific bounding box. The only way to improve the volume would be to pay for the enterprise PowerTrack API.
Tweets all contain a created_at value, which is the timestamp you'll want to record.
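For example, in the on_data handler above, the timestamp can be written as one more CSV column (a minimal sketch reusing the json_data and file variables from the script; remember to extend the "X,Y" header line to match):

if coords is not None:
    created = json_data["created_at"]  # e.g. "Sat Jul 24 10:25:03 +0000 2021"
    lon = coords["coordinates"][0]
    lat = coords["coordinates"][1]
    file.write(str(lon) + "," + str(lat) + "," + created + "\n")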
Using the tweepy and elasticsearch Python modules I can create the index, but the documents never get created/added. I've taken the code from an example that worked for its author (isn't that always the case).
#!/usr/bin/env python
#
import tweepy
import sys
import json
from textwrap import TextWrapper
from datetime import datetime
from elasticsearch import Elasticsearch

consumer_key = "abcd"
consumer_secret = "1234"
access_token = "qwerty-5678"
access_secret = "huffalump"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

es = Elasticsearch()
#es.indices.create(index="twitter", ignore=400)

class StreamListener(tweepy.StreamListener):
    status_wrapper = TextWrapper(width=60, initial_indent=' ', subsequent_indent=' ')

    def on_status(self, status):
        try:
            print '\n%s %s' % (status.author.screen_name, status.created_at)
            json_data = status._json
            print json_data['text']
            es.create(index="twitter", doc_type="twitter_twp", body=json_data)
        except Exception, e:
            print e
            pass

streamer = tweepy.Stream(auth=auth, listener=StreamListener(), timeout=3000000000)
# Fill with your own keywords below
terms = ['cyber']
streamer.filter(None, terms)
#streamer.userstream(None)
I monitor my Elasticsearch index at http://192.168.1.7:9200/_cat/indices?v and the data never changes:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open twitter WDYgTflkRZ-5dTRHx2zuEw 3 2 0 0 390b 390b
I've tried everything - even reading the docs. Why aren't my docs going in my index!?
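Since the post doesn't include a traceback, this is only a guess: the bare except is printing (and then swallowing) whatever the real error is, and es.create() is the most likely culprit, since the create API wants an explicit document id. Two things worth trying inside on_status:

# 1) Let Elasticsearch auto-generate the document id:
es.index(index="twitter", doc_type="twitter_twp", body=json_data)

# 2) Or keep create(), but pass the tweet's own id explicitly:
es.create(index="twitter", doc_type="twitter_twp", id=json_data['id_str'], body=json_data)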
I'm trying to modify this script to only save the JSON of tweets that have a location attached to them, and I'm running into an issue with checking that something isn't null. has_key() isn't working correctly, because all the tweets have the 'coordinates' key; in most of them the value is just null. is not None isn't working because Python seems to treat null and None as different, and checking the value as text against "null" also didn't work. Does anyone have a clever idea on how to solve this?
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import pymongo
import tweepy
import json

# Variables that contain the user credentials to access the Twitter API
access_key = ''  # redacted for privacy and such
access_secret = ''
consumer_key = ''
consumer_secret = ''

# Runs auth to Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# This is a basic listener that will print incoming data to stdout
class StdOutListener(StreamListener):
    def on_data(self, data):
        print data
        return True

    def on_error(self, status):
        print status

# Customizes the stream and saves text, coordinates and lang to the database
class CustomStreamListener(tweepy.StreamListener):
    def __init__(self, api):
        self.api = api
        super(CustomStreamListener, self).__init__()
        self.db = pymongo.MongoClient('localhost', 27017).crime

    def on_data(self, data):
        jd = json.loads(data)
        if jd.has_key('coordinates'):
            self.db.tweets.insert({'text': jd['text'], 'coordinates': jd['coordinates'], 'lang': jd['lang']})

    def on_error(self, status_code):
        return True  # Don't kill the stream

    def on_timeout(self):
        return True  # Don't kill the stream

# Calls on StreamListener and provides specifications of tracking
l = tweepy.streaming.Stream(auth, CustomStreamListener(api))
l.filter(track=['guns'])
You could check the decoded value against None: json.loads converts JSON null to Python None, so a simple is not None test works here:

if jd.get('coordinates') is not None:
    self.db.tweets.insert({'text': jd['text'], 'coordinates': jd['coordinates'], 'lang': jd['lang']})
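A quick standalone check of the null/None behaviour (illustrative values):

import json

jd = json.loads('{"coordinates": null}')
print jd.has_key('coordinates')   # True - the key is present even when the value is null
print jd['coordinates'] is None   # True - JSON null decodes to Python None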