This is Twitter scraping code that extracts tweets containing popular keywords.
I want to repeat the entire code below every 12 hours (or every 12 hours plus a 10-minute break). Can you give me advice on how to repeat it?
import tweepy
import time
import os
import json
import simplejson

search_term = 'word1'
search_term2 = 'word2'
search_term3 = 'word3'

lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

# One placeholder per search term, joined with OR.
c = tweepy.Cursor(api.search,
                  q="{} OR {} OR {}".format(search_term, search_term2, search_term3),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

# Print each matching tweet with a running index.
data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

# Write the tweet texts to a file, one per line.
wfile = open(os.getcwd() + "/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
wfile.close()
You could set up a cron job that executes your script every 12 hours. To do so, save your script with a .py extension and make it executable, then add an entry to your crontab:
0 */12 * * * /usr/bin/python yourscript.py
For more detail, have a look at this question. Alternatively, there are Python packages (e.g. APScheduler) that help you achieve this. In APScheduler you can define a job like this:
from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

@sched.scheduled_job('interval', hours=12)
def timed_job():
    print('This job is run every 12 hours.')

sched.start()
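If you specifically want the "every 12 hours plus a 10-minute break" rhythm from the question, a plain loop with time.sleep is also enough. A minimal sketch, assuming the scraping code above is wrapped in a function called scrape() (a name introduced here just for illustration):

import time

def scrape():
    # put the Tweepy scraping code from the question inside this function
    ...

while True:
    scrape()
    # wait 12 hours plus a 10-minute break before the next run
    time.sleep(12 * 60 * 60 + 10 * 60)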
Related
Determine when to end and restart (loop) Twitter scraping
Hello. This is Twitter scraping code that collects tweets containing keywords.
What I want to do is end the crawl after 10 hours and then restart it, so that each run adds cumulatively to the current output.
I'm posting this to ask for advice on how to code this; a sketch of one approach follows after the code.
import tweepy
import time
import os
import json
import simplejson

search_term = 'word1'
search_term2 = 'word2'
search_term3 = 'word3'

lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

# One placeholder per search term, joined with OR.
c = tweepy.Cursor(api.search,
                  q="{} OR {} OR {}".format(search_term, search_term2, search_term3),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

# Print each matching tweet with a running index.
data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

# Write the tweet texts to a file, one per line.
wfile = open(os.getcwd() + "/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
wfile.close()
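One straightforward way is to record a start time, break out of the tweet loop once 10 hours have passed, and open the output file in append mode ('a') so that each restart adds to what is already there. A minimal sketch along those lines, reusing the api, search terms, and location defined above (the outer while loop and the 10-hour constant are additions for illustration, not part of the original code):

import os
import time
import tweepy

RUN_SECONDS = 10 * 60 * 60  # stop each crawl after 10 hours

while True:
    start = time.time()
    cursor = tweepy.Cursor(api.search,
                           q="{} OR {} OR {}".format(search_term, search_term2, search_term3),
                           geocode=location,
                           include_entities=True)
    # mode 'a' appends, so the output accumulates across restarts
    with open(os.getcwd() + "/workk2.txt", mode='a') as wfile:
        for tweet in cursor.items():
            if time.time() - start > RUN_SECONDS:
                break  # 10 hours are up; end this crawl and start over
            wfile.write(tweet.text + '\n')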
" .txt " files can not be created.
The code has been created, but the file is not created.
I've been advised to use " pickle ".
But I don't know how to use " pickle. "
How can I use this code to save it as a file
Also, I would like to place the number in order to save.
import tweepy
import time
import os
import json
import simplejson

search_term = 'word1'
search_term2 = 'word2'
search_term3 = 'word3'

lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

# One placeholder per search term, joined with OR.
c = tweepy.Cursor(api.search,
                  q="{} OR {} OR {}".format(search_term, search_term2, search_term3),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

# Print each matching tweet with a running index.
data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

# Write the tweet texts to a file, one per line.
wfile = open(os.getcwd() + "/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
wfile.close()
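One likely reason no file appears is that the script spends all its time in the first loop (it sleeps one second per tweet, and the cursor may be exhausted before the second loop starts), so the write loop has nothing left to do. Collecting, numbering, and writing in a single pass, with a with block so the file is always closed, could look like the minimal sketch below, reusing the Cursor c from the code above; the pickle part shows one possible way to dump the collected texts as well (both are illustrative, not the only way):

import os
import pickle

texts = []

# Collect, print, and write the tweets in one pass, numbering each one.
with open(os.getcwd() + "/workk2.txt", mode='w', encoding='utf-8') as wfile:
    for i, tweet in enumerate(c.items(), start=1):
        print(i, ":", tweet.text)
        wfile.write("{} {}\n".format(i, tweet.text))
        texts.append(tweet.text)

# Optionally also dump the collected list with pickle (binary mode is required).
with open(os.getcwd() + "/workk2.pkl", mode='wb') as pfile:
    pickle.dump(texts, pfile)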
I'm trying to retrieve tweets along with the dates they were created. This is what my code looks like so far:
import tweepy
import json
import urllib
import sys
import datetime
from tweepy import OAuthHandler

user = "billgates"
count = 1

def twitter_fetch(screen_name=user, maxnumtweets=count):
    consumer_token = 'INSERT CONSUMER TOKEN'
    consumer_secret = 'INSERT CONSUMER SECRET'
    access_token = 'INSERT ACCESS TOKEN'
    access_secret = 'INSERT ACCESS SECRET'

    auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)

    for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(count):
        print(status.text + '\n')

if __name__ == '__main__':
    twitter_fetch(user, count)
I know that I presumably need to use "created_at" to get the date, but I'm not exactly sure where to put it in order to retrieve it. How can I do this?
As Wander Nauta said, changing the lines:
for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(count):
    print(status.text + '\n')
to:
for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(count):
    print(status.text + ' ' + str(status.created_at) + '\n')
should print the tweet along with the date and time it was created.
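As a follow-up note, status.created_at is a datetime.datetime object (as far as I can tell, in UTC), so you can also format it explicitly inside the same loop, for example:

for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(count):
    # created_at is a datetime.datetime; format it however you prefer
    print(status.text + ' ' + status.created_at.strftime('%Y-%m-%d %H:%M:%S'))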
I am not sure whether this is exactly what you are looking for, but this code should work:
import tweepy
import json
import urllib
import sys
import datetime
from tweepy import OAuthHandler

user = "billgates"
count = 1

def twitter_fetch(screen_name=user, maxnumtweets=count):
    consumer_token = 'INSERT CONSUMER TOKEN'
    consumer_secret = 'INSERT CONSUMER SECRET'
    access_token = 'INSERT ACCESS TOKEN'
    access_secret = 'INSERT ACCESS SECRET'

    auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)

    for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(count):
        print(status.text + '\n')
        print(status.created_at)

if __name__ == '__main__':
    twitter_fetch(user, count)
I just added the line "print(status.created_at)" to your code, which prints the date and time each tweet was created (the type is datetime.datetime).
My code gives a continuous stream of data, but I want to filter the data to the last five minutes and report it every minute. What do I need to do for that? (A sketch of one approach follows after the code.)
try:
    import json
except ImportError:
    import simplejson as json

from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

ACCESS_TOKEN = 'secret'
ACCESS_SECRET = 'secret'
CONSUMER_KEY = 'secret'
CONSUMER_SECRET = 'secret'

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter_stream = TwitterStream(auth=oauth)
iterator = twitter_stream.statuses.filter(track="car", language="en")

hashtags = []  # collected hashtag texts

for tweet in iterator:
    try:
        if 'text' in tweet:
            print(tweet['user']['name'])
            print(tweet['user']['statuses_count'])
            for hashtag in tweet['entities']['hashtags']:
                hashtags.append(hashtag['text'])
            print(hashtags)
    except Exception:
        continue
Thanks in advance.
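One way to get the five-minute window with a report every minute is to timestamp each incoming tweet, keep only entries newer than five minutes in a deque, and print a summary whenever a minute has passed since the last report. A minimal sketch along those lines, reusing the iterator from the code above (the window and report intervals are additions for illustration, not part of the twitter library's API):

import time
from collections import deque

WINDOW = 5 * 60   # keep only the last five minutes of data
REPORT = 60       # report once per minute

window = deque()            # (timestamp, hashtag) pairs seen in the last five minutes
last_report = time.time()

for tweet in iterator:
    now = time.time()
    if 'text' in tweet:
        for hashtag in tweet.get('entities', {}).get('hashtags', []):
            window.append((now, hashtag['text']))

    # drop entries older than five minutes
    while window and now - window[0][0] > WINDOW:
        window.popleft()

    # once a minute, report what is currently in the window
    # (a report is only printed when a new tweet arrives, since the iterator blocks)
    if now - last_report >= REPORT:
        print("hashtags in the last 5 minutes:", [h for _, h in window])
        last_report = now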
I am using the code below to retrieve Twitter hashtag data with tweepy, but it only retrieves the tweet message and the creation time. I need to retrieve the metadata for that hashtag as well. Any help?
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import datetime

# setting up the keys
consumer_key = '-------'
consumer_secret = '----------'
access_token = '--------'
access_secret = '-----------'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

def date_range(start, end):
    current = start
    while (end - current).days >= 0:
        yield current
        current = current + datetime.timedelta(seconds=1)

class TweetListener(StreamListener):
    def on_status(self, status):
        # api = tweepy.API(auth_handler=auth)
        # status.created_at += timedelta(hours=900)
        startDate = datetime.datetime(2014, 3, 11)
        stopDate = datetime.datetime(2014, 3, 13)
        for date in date_range(startDate, stopDate):
            status.created_at = date
            print("tweet " + str(status.created_at) + "\n")
            print(status.text + "\n")

stream = Stream(auth, TweetListener(), secure=True)
t = u"#سوريا"
stream.filter(track=[t])
This might help: the full specification of the status object.
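If what you want is the full metadata of each tweet (entities, hashtags, user fields, and so on), one option is to work with the raw JSON that tweepy attaches to the status object. A minimal sketch of an alternative listener, assuming status._json is available (it is in recent tweepy 3.x versions) and printing a few standard Twitter API v1.1 fields:

import json
from tweepy.streaming import StreamListener

class MetadataListener(StreamListener):
    def on_status(self, status):
        data = status._json  # the raw JSON payload of the tweet
        print("created_at:", data['created_at'])
        print("text:", data['text'])
        print("hashtags:", [h['text'] for h in data['entities']['hashtags']])
        print("user:", data['user']['screen_name'])
        # dump everything to inspect all available metadata
        print(json.dumps(data, indent=2, ensure_ascii=False))

You would then pass MetadataListener() to Stream(...) in place of TweetListener() above.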