I'm trying to download Twitter followers from a list of accounts. My function (which uses Twython) works well for short account lists but raises an error for longer ones. It is not a rate-limit problem, since my function sleeps until the next time window if the rate limit is hit.
The error is this:
twythonerror: ('Connection aborted.', error(10054, ''))
Others seem to have the same problem, and the proposed solution is to make the function sleep between different REST API calls, so I implemented the following code:
del twapi
sleep(nap[afternoon])
afternoon = afternoon + 1
twapi = Twython(app_key=app_key, app_secret=app_secret,
                oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
nap is a list of intervals in seconds and afternoon is an index.
Despite this suggestion I still have the exact same problem; it seems that the sleep doesn't resolve it.
Can anyone help me?
Here is the whole function:
def download_follower(serie_lst):
    """Creates account named txt files containing followers ids. Uses for loop on accounts names list."""
    nap = [1, 2, 4, 8, 16, 32, 64, 128]
    afternoon = 0
    for exemplar in serie_lst:
        #username from serie_lst entries
        account_name = exemplar
        twapi = Twython(app_key=app_key, app_secret=app_secret,
                        oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
        try:
            #initializations
            del twapi
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""
            #user info
            user = twapi.lookup_user(screen_name = account_name)
            #store user name
            result['screen_name'] = account_name
            #loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start = 1, stop = 15, step = 1))
                call_result = twapi.get_followers_ids(screen_name = account_name, cursor = next_cursor)
                #loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"] #new next_cursor
                iteration = iteration + 1
                if (iteration > 13): #skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901) #15min + 1sec
                    iteration = 0
            #output file
            file_name = "".join([account_name, ".txt"])
            #print output
            out_file = open(file_name, "w") #open file "account_name.txt"
            #out_file.write(str(result["followers"])) #standard format
            for i in result["followers"]: #R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
        except twython.TwythonRateLimitError:
            #wait
            error_msg = localtime()
            error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
            error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
            print(error_msg)
            del error_msg
            del twapi
            sleep(901) #15min + 1sec
            #initializations
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""
            #user info
            user = twapi.lookup_user(screen_name = account_name)
            #store user name
            result['screen_name'] = account_name
            #loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start = 1, stop = 15, step = 1))
                call_result = twapi.get_followers_ids(screen_name = account_name, cursor = next_cursor)
                #loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"] #new next_cursor
                iteration = iteration + 1
                if (iteration > 13): #skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901) #15min + 1sec
                    iteration = 0
            #output file
            file_name = "".join([account_name, ".txt"])
            #print output
            out_file = open(file_name, "w") #open file "account_name.txt"
            #out_file.write(str(result["followers"])) #standard format
            for i in result["followers"]: #R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
As discussed in the comments, there are a few issues with your code at present. You shouldn't need to delete your connection for it to work properly, and I think the issue arises because you initialise a second time without any handling for hitting your rate limit. Here is an example using Tweepy of how you can get the information you require:
import tweepy
from datetime import datetime


def download_followers(user, api):
    all_followers = []
    try:
        for page in tweepy.Cursor(api.followers_ids, screen_name=user).pages():
            all_followers.extend(map(str, page))
        return all_followers
    except tweepy.TweepError:
        print('Could not access user {}. Skipping...'.format(user))


# Include your keys below:
consumer_key = 'YOUR_KEY'
consumer_secret = 'YOUR_KEY'
access_token = 'YOUR_KEY'
access_token_secret = 'YOUR_KEY'

# Set up tweepy API, with handling of rate limits
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
main_api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# List of usernames to get followers for
lookup_users = ['asongtoruin', 'mbiella']

for username in lookup_users:
    user_followers = download_followers(username, main_api)
    if user_followers:
        with open(username + '.txt', 'w') as outfile:
            outfile.write('\n'.join(user_followers))
        print('Finished outputting: {} at {}'.format(username, datetime.now().strftime('%Y/%m/%d %H:%M:%S')))
Tweepy is clever enough to know when it has hit its rate limit when we use wait_on_rate_limit=True, and it checks how long it needs to sleep before it can start again. By using wait_on_rate_limit_notify=True, we allow it to print out how long it will be waiting until it can next get a page of followers (through this ID-based method, it seems as though there are 5000 IDs per page).
We additionally catch a TweepError exception - this can occur if the username provided relates to a protected account that our authenticated user does not have permission to view. In this case, we simply skip the user to allow other information to be downloaded, but print out a warning that the user could not be accessed.
Running this saves a text file of follower ids for any user it can access. For me this prints the following:
Rate limit reached. Sleeping for: 593
Finished outputting: asongtoruin at 2017/02/22 11:43:12
Could not access user mbiella. Skipping...
The follower IDs of asongtoruin (aka me) are saved as asongtoruin.txt.
There is one possible issue, in that our pages of followers start from the newest first. This could (though I don't understand the API well enough to say with certainty) cause issues with our output dataset if new users are added between our calls, as we may both miss these users and end up with duplicates in our dataset. If duplicates become an issue, you could change return all_followers to return set(all_followers).
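If you want to drop duplicates while keeping the newest-first order the API returns, here is a minimal sketch (mine, not part of the answer above) that deduplicates through a dict, whose keys preserve insertion order in Python 3.7+:
def dedupe_keep_order(ids):
    # dict keys keep insertion order (Python 3.7+), so this drops repeated
    # IDs without reshuffling the list the way set() would
    return list(dict.fromkeys(ids))

print(dedupe_keep_order(['3', '1', '2', '1', '3']))  # ['3', '1', '2']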
Related
I used the Tweepy library (for Twitter API v1.1) to get some metadata (e.g., tweet text, number of retweets, user ID, etc.) for a list of tweet IDs. Here is my code:
import tweepy

consumer_key = 'xxxxxxxxxxxx'
consumer_key_secret = 'xxxxxxxxxxxx'
access_token = 'xxxxxxxxxxxxxxxxxx'
access_token_secret = 'xxxxxxxxxxxxxxxxxx'

auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)


def createTrainingSet(corpusFile, tweetContent):
    import csv
    import time
    import json

    counter = 0
    corpus = []

    with open(corpusFile, 'r') as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',')
        for row in lineReader:
            corpus.append({"tweet_id": row[0], "unreliable": row[1], "conspiracy": row[2],
                           "clickbait": row[3], "political/biased": row[4], "date": row[5]})

    sleepTime = 2
    trainingDataSet = []

    for tweet in corpus:
        try:
            tweetFetched = api.get_status(tweet["tweet_id"])
            print("Tweet fetched" + tweetFetched.text)
            print("followers_count: " + str(tweetFetched.user.followers_count))
            print("friends_count: " + str(tweetFetched.user.friends_count))
            tweet["text"] = tweetFetched.text
            tweet["retweet_count"] = tweetFetched.retweet_count
            tweet["favorite_count"] = tweetFetched.favorite_count
            tweet["created_at"] = tweetFetched.created_at
            tweet["user_id"] = tweetFetched.user.id_str
            tweet["user_created_at"] = tweetFetched.user.created_at
            trainingDataSet.append(tweet)
            time.sleep(sleepTime)
        except:
            print("Inside the exception - no:2")
            continue


# This is the corpus dataset
corpusFile = "sample.csv"
# This is my target file
tweetContent = "tweetContent.csv"
# Call the method
resultFile = createTrainingSet(corpusFile, tweetContent)
I don't know why this code doesn't work any more (the last time it worked was about a couple of months ago). However, when I run it now, it returns "Inside the exception - no:2". Why is that?
Here are the two lines of code that helped me find the errors:
except tweepy.TweepError as e:
    print('the error code:', e.args[0][0]['code'])
    print('the error message:', e.args[0][0]['message'])
Also, thanks to Jeyekomon's answer in this post, I found that e.message[0]['code'] no longer works:
The error code used to be accessed using e.message[0]['code'] which no longer works. The message attribute has been deprecated in Python 2.6 and removed in Python 3.0. Currently you get an error 'TweepError' object has no attribute 'message'
In addition, it seems there are some other helpful attributes (api_code, reason and response) in TweepError exception class that are not in the documentation.
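As an illustrative sketch (assuming the api object and tweet dict from the question's code, and noting that exact attribute availability varies by tweepy version), those attributes could be inspected like this:
try:
    tweetFetched = api.get_status(tweet["tweet_id"])
except tweepy.TweepError as e:
    # api_code, reason and response are the undocumented attributes mentioned above
    print('api_code:', e.api_code)
    print('reason:', e.reason)
    if e.response is not None:
        print('HTTP status:', e.response.status_code)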
I'm writing this program for my A-Level Computer Science coursework, and I am trying to get a crawler to scrape all the users found in a given user's following/followers list.
The start of the script is as follows:
import requests
# import database as db
from bs4 import BeautifulSoup

debug = True


def getStartNode():  # Get the Twitter profile of the starting node
    global startNodeFollowing  # Declare the nodes vars as global for use in external functions
    global startNodeFollowers
    global startNodeLink
    if not debug:  # If debugging == False, allow the user to enter any starting node Twitter profile
        startNodeLink = input("Enter a link to the starting users Twitter profile\n[URL]: ")[:-1]  # Get profile link, remove the last char from input (space char, needed to enter link in terminal)
    else:  # If debugging == True, have predetermined starting node to save time during development
        startNodeLink = ("https://twitter.com/ckjellberg03")
    startNodeFollowers = (startNodeLink + "/followers")  # Create a new var using the starting node's Twitter profile, append for followers and following URL pages
    startNodeFollowing = (startNodeLink + "/following")
And the crawler is here:
def spider():  # Web Crawler
    getStartNode()
    print("\nUsing:", startNodeLink)
    urlFollowers = startNodeFollowers
    sourceCode = requests.get(urlFollowers)
    plainText = sourceCode.text  # Source code of the URL (urlFollowers) in plain text format
    soup = BeautifulSoup(plainText, 'lxml')  # BeautifulSoup object to search through plainText for specific items/classes etc
    for link in soup.findAll('a', {'class': 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'}):  # 'a' is a link in HTML (anchor), class is the Twitter class for a profile
        href = link.get('href')
        print(href)  # Display everything found (development purposes)
I'm pretty sure the class identifier for a user's profile link on a /followers page is "css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l" from looking at the source code, but printing the results displays nothing.
Any advice to point me in the right direction?
Thanks!
It's pretty difficult to scrape Twitter (trust me, I have tried every way). You can use the Twitter API, but it has limitations (you can't get the names of the followers, only the number). If you want to scrape some information with the Twitter API, you can use this code:
from TwitterAPI import TwitterAPI, TwitterPager
import tweepy
from tweepy import Cursor
from datetime import datetime, date, time, timedelta

consumer_key = 'consumer key'
consumer_secret = 'consumer secret'
token = 'token'
token_secret = 'token secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(token, token_secret)
api = tweepy.API(auth)

account_list = ['POTUS44']

for target in account_list:
    print("Getting data for " + target)
    item = api.get_user(target)
    print("name: " + item.name)
    print("screen_name: " + item.screen_name)
    print("description: " + item.description)
    print("statuses_count: " + str(item.statuses_count))
    print("friends_count: " + str(item.friends_count))
    print("followers_count: " + str(item.followers_count))
    tweets = item.statuses_count
    account_created_date = item.created_at
    delta = datetime.utcnow() - account_created_date
    account_age_days = delta.days
    print("Account age (in days): " + str(account_age_days))
    if account_age_days > 0:
        print("Average tweets per day: " + "%.2f" % (float(tweets) / float(account_age_days)))
    hashtags = []
    mentions = []
    tweet_count = 0
    end_date = datetime.utcnow() - timedelta(days=30)
    for status in Cursor(api.user_timeline, id=target).items():
        tweet_count += 1
        if hasattr(status, "entities"):
            entities = status.entities
            if "hashtags" in entities:
                for ent in entities["hashtags"]:
                    if ent is not None:
                        if "text" in ent:
                            hashtag = ent["text"]
                            if hashtag is not None:
                                hashtags.append(hashtag)
            if "user_mentions" in entities:
                for ent in entities["user_mentions"]:
                    if ent is not None:
                        if "screen_name" in ent:
                            name = ent["screen_name"]
                            if name is not None:
                                mentions.append(name)
        if status.created_at < end_date:
            break
Here is how to do it without the API. Part of the difficulty is using the right browser identity in the User-Agent header:
import re, requests

headers = {'User-Agent': 'UCWEB/2.0 (compatible; Googlebot/2.1; +google.com/bot.html)'}


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


content = ""
for user in ['billgates']:
    content += "============================\n\n"
    content += user + "\n\n"
    content += "============================\n\n"
    url_twitter = 'https://twitter.com/%s' % user
    resp = requests.get(url_twitter, headers=headers)  # Send request
    res = re.findall(r'<p class="TweetTextSize.*?tweet-text.*?>(.*?)</p>', resp.text)
    for x in res:
        x = cleanhtml(x)
        # unescape the HTML entities left in the tweet text
        x = x.replace("&#39;", "'")
        x = x.replace("&quot;", '"')
        x = x.replace("&nbsp;", " ")
        content += x
        content += "\n\n"
    content += "---"
    content += "\n\n"
I am pulling data from the Microsoft Academic Knowledge API and then using the json responses as dictionaries to extract the information I need. As I do this I add the information to a numpy array and at the end I change it to a pandas data frame to export. The program works just fine, but it takes a huge amount of time to run. It seems to slow down as it runs though, as the first few times through the loops, it only takes a few seconds, but later it takes minutes.
I have tried simplifying the if else statements as much as I can and this helped a little bit but not enough to make a big difference. I also reduced the number of times a query to the API is done as much as I can as well. Each query can only return 1000 results, but there are around 35000 results that I need.
rel_info = np.array([("Title", "Author_Name", "Jornal_Published_In", "Date")])

for l in range(0, loops):  # loops is defined above to be 35
    offset = 1000 * l
    # keep track of progress
    print("Progress:" + str(round((offset/total_res)*100, 2)) + "%")
    # get data with request to MAK. 1000 is the max count
    url = "https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=And(Composite(AA.AfN=='brigham young university'),Y>=1908)&model=latest&count=1000&offset="+str(offset)+"&attributes=Ti,D,AA.DAfN,AA.DAuN,J.JN"
    response = req.get(url + '&subscription-key={key}')
    data = response.json()

    for i in range(0, len(data["entities"])):
        new_data = data["entities"][i]
        # get new data
        new_title = new_data["Ti"]  # get title
        if 'J' not in new_data:  # get journal; account for keys missing from the dictionary
            new_journ = ""
        else:
            new_journ = new_data["J"]["JN"] or ""
        new_date = new_data["D"]  # get date
        new_auth = ""  # get authors affiliated with BYU only; account for keys missing from the dictionary
        for j in range(0, len(new_data["AA"])):
            if 'DAfN' not in new_data["AA"][j]:
                new_auth = new_auth + ""
            else:
                if new_data["AA"][j]["DAfN"] == "Brigham Young University" and new_auth == "":  # possibly combine conditionals to make less complex
                    new_auth = new_data["AA"][j]["DAuN"]
                elif new_data["AA"][j]["DAfN"] == "Brigham Young University" and new_auth != "":
                    new_auth = new_auth + ", " + new_data["AA"][j]["DAuN"]
        # keep adding new data to the whole dataframe
        new_info = np.array([(new_title, new_auth, new_journ, new_date)])
        rel_info = np.vstack((rel_info, new_info))
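For what it's worth, here is a minimal sketch (my own illustration, not the asker's code) of why the repeated np.vstack is the bottleneck: every call copies the entire accumulated array, so the total work grows quadratically with the number of rows, which matches the observed slowdown over time.
import time

import numpy as np

rows = np.empty((0, 4))
start = time.perf_counter()
for _ in range(5000):
    rows = np.vstack((rows, np.zeros((1, 4))))  # copies all previous rows on every iteration
print("vstack inside the loop:", round(time.perf_counter() - start, 2), "s")

buffered = []
start = time.perf_counter()
for _ in range(5000):
    buffered.append(np.zeros((1, 4)))  # cheap list append
rows = np.vstack(buffered)  # single copy at the end
print("one vstack at the end: ", round(time.perf_counter() - start, 2), "s")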
Try getting the results in a pool of worker threads using concurrent.futures like this:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']


# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()


# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
https://docs.python.org/3/library/concurrent.futures.html
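As a rough sketch of how that pattern could map onto the question's Microsoft Academic requests (untested, assuming requests and the same URL and {key} placeholder as in the question's code, and still subject to whatever rate limits the API enforces), each 1000-result offset could be fetched by its own worker:
import concurrent.futures

import requests as req

BASE_URL = ("https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate"
            "?expr=And(Composite(AA.AfN=='brigham young university'),Y>=1908)"
            "&model=latest&count=1000&attributes=Ti,D,AA.DAfN,AA.DAuN,J.JN")


def fetch_entities(offset):
    # '{key}' is the same subscription-key placeholder used in the question's code
    response = req.get(BASE_URL + "&offset=" + str(offset) + "&subscription-key={key}")
    response.raise_for_status()
    return response.json()["entities"]


offsets = [1000 * l for l in range(35)]  # 35 requests of up to 1000 results each
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map keeps the results in the same order as the offsets
    all_entities = [entity for page in executor.map(fetch_entities, offsets) for entity in page]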
I ended up solving this by changing how I was adding to the large array of data I was collecting. Instead of adding one row of data in each iteration, I built a temporary array to hold 1000 rows of data, then added this temporary array to the complete data array. This took the run time down to about a minute, as opposed to the 43 minutes it took before.
rel_info = np.array([("Title", "Author_Name", "Jornal_Published_In", "Date")])

for req_num in range(0, loops):
    offset = 1000 * req_num
    # keep track of progress
    print("Progress:" + str(round((offset/total_res)*100, 2)) + "%")
    # get data with request to MAK. 1000 is the max count
    url = "https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=And(Composite(AA.AfN=='brigham young university'),Y>=1908)&model=latest&count=1000&offset="+str(offset)+"&attributes=Ti,D,AA.DAfN,AA.DAuN,J.JN"
    response = req.get(url + '&subscription-key={key}')
    data = response.json()

    for i in range(0, len(data["entities"])):
        new_data = data["entities"][i]
        # get new data
        new_title = new_data["Ti"]  # get title
        if 'J' not in new_data:  # get journal; account for keys missing from the dictionary
            new_journ = ""
        else:
            new_journ = new_data["J"]["JN"] or ""
        new_date = new_data["D"]  # get date
        new_auth = ""  # get authors affiliated with BYU only; account for keys missing from the dictionary
        for j in range(0, len(new_data["AA"])):
            if 'DAfN' not in new_data["AA"][j]:
                new_auth = new_auth + ""
            else:
                if new_data["AA"][j]["DAfN"] == "Brigham Young University" and new_auth == "":  # possibly combine conditionals to make less complex
                    new_auth = new_data["AA"][j]["DAuN"]
                elif new_data["AA"][j]["DAfN"] == "Brigham Young University" and new_auth != "":
                    new_auth = new_auth + ", " + new_data["AA"][j]["DAuN"]
        # here are the changes
        # keep adding to a temporary array for 1000 entities
        new_info = np.array([(new_title, new_auth, new_journ, new_date)])
        if (i == 0):
            work_stack = new_info
        else:
            work_stack = np.vstack((work_stack, new_info))
    # add temporary array to whole array (this is to speed up the program)
    rel_info = np.vstack((rel_info, work_stack))
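An even simpler variant (my sketch with dummy rows, not the code above) is to append plain tuples to a Python list and convert to an array once after all the loops, which avoids np.vstack inside any loop at all:
import numpy as np

rows = [("Title", "Author_Name", "Jornal_Published_In", "Date")]

# stand-ins for the (new_title, new_auth, new_journ, new_date) tuples built above
for new_title, new_auth, new_journ, new_date in [("t1", "a1", "j1", "2020"), ("t2", "a2", "j2", "2021")]:
    rows.append((new_title, new_auth, new_journ, new_date))

rel_info = np.array(rows)  # one conversion at the end
print(rel_info.shape)  # (3, 4)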
I've written the following code in Python for follower scraping:
import tweepy
import time
import csv
import sys
import random

consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, retry_errors=set([401, 404, 500, 502, 503, 504]))

account = 'setavakfi'
log_file = '{}_followers.csv'.format(account)
retry = 0
errorCount = 0

tweepy_cursor = tweepy.Cursor(api.followers, screen_name=account, count=200, cursor=1574812962976647290).pages()
followers_count = []

while True:
    try:
        retry = 0
        user = tweepy_cursor.next()
        cursor = tweepy_cursor.next_cursor
        followers_count += user
        print 'Retrieved {} followers accounts'.format(len(followers_count))
        print 'Current cursor: {}'.format(cursor)
        with open(log_file, 'ab') as fd:
            writer = csv.writer(fd)
            for i, user in enumerate(user):
                writer.writerow([str("#" + user.screen_name), unicode(user.name).encode('utf-8'), str(user.lang), unicode(user.location).encode('utf-8')])
        print "Resting..."
        time.sleep(random.randint(60, 70))
    except tweepy.TweepError as e:
        print "Error code: {} with message: {}".format(e.api_code, e.message[0]['message'])
        errorCount += 1
        retry += 1
        print 'Retrying in {} seconds'.format(60 + retry * 5)
        time.sleep(60 + retry * 5)
        if retry == 10:
            break
    except StopIteration:
        break

print 'Done with {} errors'.format(errorCount)
The problem is that with the given cursor (1574812962976647290) and account (@setavakfi) I'm only getting error 503 (code 130). The cursor is stuck exactly at the page with 8,000 followers, while the whole account has more than 60,000 followers. I've tried this code on different accounts with more than 60,000 followers and it works. I have tried changing ISP, IP address and Twitter dev account; nothing changed.
Can you see what could be wrong with this code? Is it a problem with this single account? Is there a way to automatically jump over the problematic cursor to see whether other cursors have the same problem?
Thanks in advance.
M.
I think that error is due to a server overload on the Twitter side for this account... I don't think it's an error in your code.
I want to know if there is any way to count hashtags from Twitter using the streaming API, like hashtags.org does. I have made a script using Python and tweetstream and I can keep a count, but for trending topics the count is always around 180k; I believe there is a limit of 50 tweets/second. This is the code:
#!/usr/bin/python
import tweetstream
import sys

print sys.argv

twitterUsername = "user"
twitterPassword = "pass"
twitterWordFilter = sys.argv[1]

try:
    with tweetstream.FilterStream(twitterUsername, twitterPassword, track=twitterWordFilter) as stream:
        for tweet in stream:
            print stream.count
except tweetstream.ConnectionError, e:
    print "Disconnected from twitter. Reason:", e.reason
import json
import time
import urllib
import urllib2


def get_tweet_count(term):
    total_tweet_count = 0
    page = 1
    while True:
        url = ('http://search.twitter.com/search.json?q='
               + urllib.quote(term) + '&rpp=100&page=' + str(page))
        response = urllib2.urlopen(url)
        json_content = response.read()
        tweets = json.loads(json_content)['results']
        total_tweet_count += len(tweets)
        # Are we at the last page or have we run out of pages?
        if len(tweets) < 100 or page >= 15:
            break
        max_id = tweets[0]['id_str']
        page += 1
        # Wait so twitter doesn't get annoyed with us
        time.sleep(1)
    return total_tweet_count
I adapted this script from code on GitHub.
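A minimal, hypothetical usage sketch follows; note that the old search.twitter.com v1 endpoint this function targets was retired long ago, so the call is illustrative only:
if __name__ == '__main__':
    # '#python' is just an example search term
    print get_tweet_count('#python')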