Determine when to end and restart(loop) twitter scraping
Hello This is twitter scraping code. You are importing tweets that contain keywords.
What I want to do is end the crawl after 10 hours. And it is cumulative restart to the current output.
I left a note to hear the code advice on how to do this.
import tweepy
import time
import os
import json
import simplejson
search_term = 'word1'
search_term2= 'word2'
search_term3='word3'
lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)
API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)
c=tweepy.Cursor(api.search,
q="{}+OR+{}".format(search_term, search_term2, search_term3),
rpp=1000,
geocode=location,
include_entities=True)
data = {}
i = 1
for tweet in c.items():
data['text'] = tweet.text
print(i, ":", data)
i += 1
time.sleep(1)
wfile = open(os.getcwd()+"/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
data['text'] = tweet.text
wfile.write(data['text']+'\n')
i += 1
wfile.close()
Related
I'm trying to make a bot in Python to manage some tasks on twitter using tweepy.
I'm saving the credentials for auth in several files but I want one file only. Here is an example:
for x in range(0, 3):
if x == 0 : from keysaccount1 import keys
if x == 1 : from keysaccount2 import keys
if x == 2 : from keysaccount3 import keys
if x == 3 : from keysaccount4 import keys
CONSUMER_KEY = keys['consumer_key']
CONSUMER_SECRET = keys['consumer_secret']
ACCESS_TOKEN = keys['access_token']
ACCESS_TOKEN_SECRET = keys['access_token_secret']
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)
The file on which the data are stored is in this format, for example keysaccount1 :
keys = dict(
consumer_key = 'xxxxxxxxxxxxx',
consumer_secret = 'xxxxxxxxxxxxx',
access_token = 'xxxxxxxxxxxxx',
access_token_secret = 'xxxxxxxxxxxxx',
)
Is there a way to save all this on the same file, as I have the x variable on the cycle that I could use to choose which account I'm managing?
You can use a single dictionary:
auth_data = {
0: {
'consumer_key': 'xxxxxxxxxx',
'consumer_secret': 'xxxxxxxxxxxxx',
'access_token': 'xxxxxxxxxx',
'access_token_secret': 'xxxxxxxxxxxxxx',
}
...
You can access particular fields like this then:
for x in range(0, 3):
keyset = auth_data[x]
auth = tweepy.OAuthHandler(keyset['consumer_key'], keyset['consumer_secret'])
auth.set_access_token(keyset['access_token'], keyset['access_token_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)
To save the data, you can just use the python module pickle:
import pickle
def read():
with open('data/keys.p', 'r') as f:
return pickle.load(f)
def write(data):
with open('data/keys.p', 'w+') as f:
return pickle.dump(f, data)
# Read data
auth_data = read()
# Write data
write(auth_data)
This is a Twitter scraping code that extracts tweets which contain famous keywords.
I want to repeat the entire code below every 12 hours. (Or 12 hours + 10 minutes breaks). Can you give me advice on repeating phrases?
import tweepy
import time
import os
import json
import simplejson
search_term = 'word1'
search_term2= 'word2'
search_term3='word3'
lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)
API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)
c=tweepy.Cursor(api.search,
q="{}+OR+{}".format(search_term, search_term2, search_term3),
rpp=1000,
geocode=location,
include_entities=True)
data = {}
i = 1
for tweet in c.items():
data['text'] = tweet.text
print(i, ":", data)
i += 1
time.sleep(1)
wfile = open(os.getcwd()+"/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
data['text'] = tweet.text
wfile.write(data['text']+'\n')
i += 1
wfile.close()
You could set a Cron job that executes your script every 12 hours. To do so you should save your script with .py extension and make it executable. Then add it to your crontab:
0 0 0/12 * * ? /usr/bin/python yourscript.py
For more detail have a look at this question. Alternatively there are packages in python (e.g. APScheduler) that help you achieve this. In APScheduler you can define a job like this:
from apscheduler.schedulers.blocking import BlockingScheduler
sched = BlockingScheduler()
#sched.scheduled_job('interval', hours=12)
def timed_job():
print('This job is run every 12 hours.')
sched.configure(options_from_ini_file)
sched.start()
" .txt " files can not be created.
The code has been created, but the file is not created.
I've been advised to use " pickle ".
But I don't know how to use " pickle. "
How can I use this code to save it as a file
Also, I would like to place the number in order to save.
import tweepy
import time
import os
import json
import simplejson
search_term = 'word1'
search_term2= 'word2'
search_term3='word3'
lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)
API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)
c=tweepy.Cursor(api.search,
q="{}+OR+{}".format(search_term, search_term2, search_term3),
rpp=1000,
geocode=location,
include_entities=True)
data = {}
i = 1
for tweet in c.items():
data['text'] = tweet.text
print(i, ":", data)
i += 1
time.sleep(1)
wfile = open(os.getcwd()+"/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
data['text'] = tweet.text
wfile.write(data['text']+'\n')
i += 1
wfile.close()
My code gives continuous data, but I wanted to filter the data to last five minutes. Additionally, I wanted to report it every 1 minute. What I need to do for that?
try:
import json
except ImportError:
import simplejson as json
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream
ACCESS_TOKEN = 'secret'
ACCESS_SECRET = 'secret'
CONSUMER_KEY = 'secret'
CONSUMER_SECRET = 'secret'
oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter_stream = TwitterStream(auth=oauth)
iterator = twitter_stream.statuses.filter(track="car", language="en")
for tweet in iterator:
try:
if 'text' in tweet:
print tweet['user']['name']
print tweet['user']['statuses_count']
# print '\n'
for hashtag in tweet['entities']['hashtags']:
hashtags.append(hashtag['text'])
print hashtags
except:
continue
Thanks in advance.
I am using this code below to retrieve twitter hashtag data using tweepy, but this code only retrieve the tweet message and the time created, but I need to retrieve the metadata for that hashtag, any help!!
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import datetime
#setting up the keys
consumer_key = '-------'
consumer_secret = '----------'
access_token = '--------'
access_secret = '-----------'
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
def date_range(start,end):
current = start
while (end - current).days >= 0:
yield current
current = current + datetime.timedelta(seconds=1)
class TweetListener(StreamListener):
def on_status(self, status):
#api = tweepy.API(auth_handler=auth)
#status.created_at += timedelta(hours=900)
startDate = datetime.datetime(2014, 03, 11)
stopDate = datetime.datetime(2014, 03, 13)
for date in date_range(startDate,stopDate):
status.created_at = date
print "tweet " + str(status.created_at) +"\n"
print status.text + "\n"
stream = Stream(auth, TweetListener(), secure=True, )
t = u"#سوريا"
stream.filter(track=[t])
this might help, a full specification of the status object.