Count tweets for a hashtag - python

I want to know if there is any way to count tweets for a hashtag from the Twitter streaming API, the way hashtags.org does. I have made a script using Python and tweetstream, and I can keep a count, but for trending topics it always tops out around 180k; I believe this is because of a limit of roughly 50 tweets/second. This is the code:
#!/usr/bin/python
import tweetstream
import sys

print sys.argv

twitterUsername = "user"
twitterPassword = "pass"
twitterWordFilter = sys.argv[1]

try:
    with tweetstream.FilterStream(twitterUsername, twitterPassword,
                                  track=twitterWordFilter) as stream:
        for tweet in stream:
            print stream.count
except tweetstream.ConnectionError, e:
    print "Disconnected from twitter. Reason:", e.reason

# Imports needed by this function (not shown in the original snippet):
import json
import time
import urllib
import urllib2

def get_tweet_count(term):
    total_tweet_count = 0
    page = 1
    while True:
        url = ('http://search.twitter.com/search.json?q='
               + urllib.quote(term) + '&rpp=100&page=' + str(page))
        response = urllib2.urlopen(url)
        json_content = response.read()
        tweets = json.loads(json_content)['results']
        total_tweet_count += len(tweets)
        # Are we at the last page or have we run out of pages?
        if len(tweets) < 100 or page >= 15:
            break
        max_id = tweets[0]['id_str']
        page += 1
        # Wait so twitter doesn't get annoyed with us
        time.sleep(1)
    return total_tweet_count
I adapted this script from code I found on GitHub.
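For reference, here is a minimal sketch of the same streaming count against the current API, assuming tweepy 4.x and an API v2 bearer token (neither appears in the original post; the hashtag rule below is a placeholder):

import tweepy

BEARER_TOKEN = "your v2 bearer token"  # assumption: you have API v2 access

class CountingStream(tweepy.StreamingClient):
    """Counts tweets that match the registered filter rule."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def on_tweet(self, tweet):
        # Called once per matching tweet; keep a running total.
        self.count += 1
        print(self.count)

stream = CountingStream(BEARER_TOKEN)
stream.add_rules(tweepy.StreamRule("#python"))  # placeholder hashtag
stream.filter()  # blocks until interrupted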

Related

Using a python web crawler to scrape twitter accounts

I'm writing this program for my A-Level Computer Science coursework, and I am trying to get a crawler to scrape all of the users found in a given user's followers/following lists.
The start of the script is as follows:
import requests
# import database as db
from bs4 import BeautifulSoup

debug = True

def getStartNode():  # Get the Twitter profile of the starting node
    global startNodeFollowing  # Declare the node vars as global for use in external functions
    global startNodeFollowers
    global startNodeLink
    if not debug:  # If debugging == False, allow the user to enter any starting node Twitter profile
        startNodeLink = input("Enter a link to the starting users Twitter profile\n[URL]: ")[:-1]  # Get profile link, remove the last char from input (space char, needed to enter link in terminal)
    else:  # If debugging == True, have a predetermined starting node to save time during development
        startNodeLink = ("https://twitter.com/ckjellberg03")
    startNodeFollowers = (startNodeLink + "/followers")  # Append to the starting node's profile URL for the followers and following pages
    startNodeFollowing = (startNodeLink + "/following")
And the crawler is here:
def spider():  # Web Crawler
    getStartNode()
    print("\nUsing:", startNodeLink)
    urlFollowers = startNodeFollowers
    sourceCode = requests.get(urlFollowers)
    plainText = sourceCode.text  # Source code of the URL (urlFollowers) in plain text format
    soup = BeautifulSoup(plainText, 'lxml')  # BeautifulSoup object to search through plainText for specific items/classes etc
    for link in soup.findAll('a', {'class': 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'}):  # 'a' is a link in HTML (anchor), class is the Twitter class for a profile
        href = link.get('href')
        print(href)  # Display everything found (development purposes)
I'm pretty sure the class identifier for a user's link to their Twitter profile on a /followers page is "css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l" from looking at the source code, but printing the results displays nothing.
Any advice to point me in the right direction?
Thanks!
It's pretty difficult to scrape Twitter (trust me, I have tried every way). You can use the Twitter API, but it has limitations (you can't get the names of the followers, only the number). If you want to pull some information with the Twitter API, you can use this code:
from TwitterAPI import TwitterAPI, TwitterPager
import tweepy
from tweepy import Cursor
from datetime import datetime, date, time, timedelta

consumer_key = 'consumer key'
consumer_secret = 'consumer secret'
token = 'token'
token_secret = 'token secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(token, token_secret)
api = tweepy.API(auth)

account_list = ['POTUS44']

for target in account_list:
    print("Getting data for " + target)
    item = api.get_user(target)
    print("name: " + item.name)
    print("screen_name: " + item.screen_name)
    print("description: " + item.description)
    print("statuses_count: " + str(item.statuses_count))
    print("friends_count: " + str(item.friends_count))
    print("followers_count: " + str(item.followers_count))
    tweets = item.statuses_count
    account_created_date = item.created_at
    delta = datetime.utcnow() - account_created_date
    account_age_days = delta.days
    print("Account age (in days): " + str(account_age_days))
    if account_age_days > 0:
        print("Average tweets per day: " + "%.2f" % (float(tweets) / float(account_age_days)))
    hashtags = []
    mentions = []
    tweet_count = 0
    end_date = datetime.utcnow() - timedelta(days=30)
    for status in Cursor(api.user_timeline, id=target).items():
        tweet_count += 1
        if hasattr(status, "entities"):
            entities = status.entities
            if "hashtags" in entities:
                for ent in entities["hashtags"]:
                    if ent is not None:
                        if "text" in ent:
                            hashtag = ent["text"]
                            if hashtag is not None:
                                hashtags.append(hashtag)
            if "user_mentions" in entities:
                for ent in entities["user_mentions"]:
                    if ent is not None:
                        if "screen_name" in ent:
                            name = ent["screen_name"]
                            if name is not None:
                                mentions.append(name)
        if status.created_at < end_date:
            break
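The hashtags and mentions lists collected above are never tallied; a short follow-on sketch using the standard library's collections.Counter (assuming the lists from the loop above) would be:

from collections import Counter

# Assumes `hashtags` and `mentions` were populated by the loop above.
hashtag_counts = Counter(hashtags)
mention_counts = Counter(mentions)
print("Top 10 hashtags:", hashtag_counts.most_common(10))
print("Top 10 mentions:", mention_counts.most_common(10))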
Here is how to do it without the API. The main difficulty is sending the right browser in the User-Agent header:
import re, requests

headers = {'User-Agent': 'UCWEB/2.0 (compatible; Googlebot/2.1; +google.com/bot.html)'}

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

content = ""
for user in ['billgates']:
    content += "============================\n\n"
    content += user + "\n\n"
    content += "============================\n\n"
    url_twitter = 'https://twitter.com/%s' % user
    resp = requests.get(url_twitter, headers=headers)  # Send request
    res = re.findall(r'<p class="TweetTextSize.*?tweet-text.*?>(.*?)</p>', resp.text)
    for x in res:
        x = cleanhtml(x)
        x = x.replace("&#39;", "'")
        x = x.replace('&quot;', '"')
        x = x.replace("&nbsp;", " ")
        content += x
        content += "\n\n"
    content += "---"
    content += "\n\n"
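As an aside, the manual entity replacements above can be handled more generally with the standard library's html.unescape; a minimal sketch:

from html import unescape

# Decodes all named and numeric HTML entities in one call.
raw = "Bill &amp; Melinda&#39;s &quot;annual letter&quot; is out"
print(unescape(raw))  # Bill & Melinda's "annual letter" is out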

Change a while true python script to run only once

I'm new to Python and I want this code to run only once and then stop, not repeat every 30 seconds, because I want to run multiple scripts like this with different access tokens every 5 seconds from the command line.
When I tried this code, it never moves on to the second script because it's a while True loop:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"

post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30  # refresh rate in seconds

while True:
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute the same piece of code for multiple access tokens. To keep things simple, put all your access tokens and URLs in lists and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time

def scrape_facebook(api_url, access_token, graph_url):
    """Scrapes the given graph URL once with the given access token."""
    post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token + "_" + "open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)

access_token = ['a', 'b', 'c']
graph_url = ['sss', 'xxx', 'ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
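A minor variation, assuming the two lists are the same length, is to pair the tokens and URLs with zip instead of indexing:

# Iterate over (token, url) pairs directly; assumes both lists have the same length.
for token, url in zip(access_token, graph_url):
    scrape_facebook(api_url, token, url)
    time.sleep(5)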

How big can the list argument of twitter api.statuses.lookup() be?

I am using the Twitter API to retrieve tweets by ID. I have 13 IDs, but tweets for only 8 of the IDs are displayed (there is no error though). The code is as follows:
from tweepy import OAuthHandler
import tweepy
import TwitterCredentials

def status_lookup(api, filename):
    id_list = [591672103788621824, 591673483832270848, 91675312032776192, 591677980394393600,
               591678618935267328, 591679831399477248, 591681597054652416, 591681654047023104,
               591681941017100288, 591693321111744513, 591712699421052928, 591712700138291201,
               591714830446301184]
    tweets = api.statuses_lookup(id_list)
    # tweets = []
    # tweets.extend(api.statuses_lookup(ids))
    a = len(tweets)
    print(a)
    try:
        for tweet in tweets:
            print("New Tweet : ", tweet.text)
        with open(filename, 'a', encoding="utf-8") as tf:
            for tweet in tweets:
                tf.write("New Tweet:" + ":" + tweet.text + "\n")
        return True
    except BaseException as e:
        print("Error on_data %s" % str(e))
        return True

if __name__ == '__main__':
    # print(len(id_list))
    fetched_tweets_filename = "Rtweets.txt"
    auth = OAuthHandler(TwitterCredentials.CONSUMER_KEY, TwitterCredentials.CONSUMER_SECRET)
    auth.set_access_token(TwitterCredentials.ACCESS_TOKEN, TwitterCredentials.ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    status_lookup(api, fetched_tweets_filename)
I am clueless! Please help!
The list of IDs passed to that method allows for up to 100 Tweet IDs.
I just checked this and it looks like for the five IDs not returned, the user accounts have been suspended and the Tweets are no longer available.
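If you ever need more than 100 IDs, a minimal sketch (assuming the same tweepy api object and statuses_lookup method used above) is to batch the list in chunks of 100:

def lookup_in_batches(api, ids, batch_size=100):
    """Look up arbitrarily many tweet IDs, at most 100 per request."""
    tweets = []
    for i in range(0, len(ids), batch_size):
        tweets.extend(api.statuses_lookup(ids[i:i + batch_size]))
    return tweets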

Twitter API connection aborted with Twython

I'm trying to download the Twitter followers of a list of accounts. My function (which uses Twython) works pretty well for short account lists but raises an error for longer lists. It is not a rate-limit problem, since my function sleeps until the next time bin if the rate limit is hit.
The error is this:
twythonerror: ('Connection aborted.', error(10054, ''))
Others seem to have the same problem, and the proposed solution is to make the function sleep between different REST API calls, so I implemented the following code:
del twapi
sleep(nap[afternoon])
afternoon = afternoon + 1
twapi = Twython(app_key=app_key, app_secret=app_secret,
                oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
nap is a list of intervals in seconds and afternoon is an index.
Despite this suggestion, I still have the exact same problem; the sleep doesn't resolve it.
Can anyone help me?
Here is the whole function:
def download_follower(serie_lst):
    """Creates account named txt files containing followers ids. Uses for loop on accounts names list."""
    nap = [1, 2, 4, 8, 16, 32, 64, 128]
    afternoon = 0
    for exemplar in serie_lst:
        # username from serie_lst entries
        account_name = exemplar
        twapi = Twython(app_key=app_key, app_secret=app_secret,
                        oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
        try:
            # initializations
            del twapi
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""
            # user info
            user = twapi.lookup_user(screen_name=account_name)
            # store user name
            result['screen_name'] = account_name
            # loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start=1, stop=15, step=1))
                call_result = twapi.get_followers_ids(screen_name=account_name, cursor=next_cursor)
                # loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"]  # new next_cursor
                iteration = iteration + 1
                if (iteration > 13):  # skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901)  # 15min + 1sec
                    iteration = 0
            # output file
            file_name = "".join([account_name, ".txt"])
            # print output
            out_file = open(file_name, "w")  # open file "account_name.txt"
            # out_file.write(str(result["followers"]))  # standard format
            for i in result["followers"]:  # R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
        except twython.TwythonRateLimitError:
            # wait
            error_msg = localtime()
            error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
            error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
            print(error_msg)
            del error_msg
            del twapi
            sleep(901)  # 15min + 1sec
            # initializations
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""
            # user info
            user = twapi.lookup_user(screen_name=account_name)
            # store user name
            result['screen_name'] = account_name
            # loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start=1, stop=15, step=1))
                call_result = twapi.get_followers_ids(screen_name=account_name, cursor=next_cursor)
                # loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"]  # new next_cursor
                iteration = iteration + 1
                if (iteration > 13):  # skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901)  # 15min + 1sec
                    iteration = 0
            # output file
            file_name = "".join([account_name, ".txt"])
            # print output
            out_file = open(file_name, "w")  # open file "account_name.txt"
            # out_file.write(str(result["followers"]))  # standard format
            for i in result["followers"]:  # R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
As discussed in the comments, there are a few issues with your code at present. You shouldn't need to delete your connection for it to function properly, and I think the issue arises because you initialise a second time without any handling for hitting your rate limit. Here is an example, using Tweepy, of how you can get the information you require:
import tweepy
from datetime import datetime

def download_followers(user, api):
    all_followers = []
    try:
        for page in tweepy.Cursor(api.followers_ids, screen_name=user).pages():
            all_followers.extend(map(str, page))
        return all_followers
    except tweepy.TweepError:
        print('Could not access user {}. Skipping...'.format(user))

# Include your keys below:
consumer_key = 'YOUR_KEY'
consumer_secret = 'YOUR_KEY'
access_token = 'YOUR_KEY'
access_token_secret = 'YOUR_KEY'

# Set up tweepy API, with handling of rate limits
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
main_api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# List of usernames to get followers for
lookup_users = ['asongtoruin', 'mbiella']

for username in lookup_users:
    user_followers = download_followers(username, main_api)
    if user_followers:
        with open(username + '.txt', 'w') as outfile:
            outfile.write('\n'.join(user_followers))
        print('Finished outputting: {} at {}'.format(username, datetime.now().strftime('%Y/%m/%d %H:%M:%S')))
Tweepy is clever enough to know when it has hit its rate limit when we use wait_on_rate_limit=True, and it checks how long it needs to sleep before it can start again. By using wait_on_rate_limit_notify=True, we allow it to print out how long it will be waiting until it can next get a page of followers (through this ID-based method, it seems as though there are 5000 IDs per page).
We additionally catch a TweepError exception - this can occur if the username provided relates to a protected account for which our authenticated user does not have permission to view. In this case, we simply skip the user to allow other information to be downloaded, but print out a warning that the user could not be accessed.
Running this saves a text file of follower ids for any user it can access. For me this prints the following:
Rate limit reached. Sleeping for: 593
Finished outputting: asongtoruin at 2017/02/22 11:43:12
Could not access user mbiella. Skipping...
With the follower IDs of asongtoruin (aka me) saved as asongtoruin.txt.
There is one possible issue, in that our pages of followers start from the newest first. This could (though I don't understand the API well enough to say with certainty) cause problems with our output dataset if new users are added between our calls, as we may both miss these users and end up with duplicates in our dataset. If duplicates become an issue, you could change return all_followers to return set(all_followers).
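If you want to drop duplicates while keeping the newest-first ordering, a minimal sketch assuming Python 3.7+ (where plain dicts preserve insertion order):

def dedupe_preserving_order(ids):
    # dict.fromkeys keeps the first occurrence of each ID and preserves insertion order.
    return list(dict.fromkeys(ids))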

Best way to make thousands of get requests in python

Right now I am working on a Python script which takes a list of URLs as an argument, performs a GET request on each URL, and then searches the output with XPath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything beyond that causes the program to slow down to the point where it stops (usually around 150 sites). Scroll down to the main app logic; the relevant code is below it. Right now I am just using 50 elements in the array and it works fine, but anything more makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0

# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys

# Get Raw HTML
def scrape(url):
    try:
        page = requests.get(url, timeout=2.0)
        if page.status_code == requests.codes.ok:
            html_page = html.fromstring(page.content)
            s = requests.session()
            s.close()
            return html_page
        else:
            s = requests.session()
            s.close()
            return False
    except:
        s = requests.session()
        s.close()
        return False

# Format URL
def format_url(url):
    if url.find("http://") == -1:
        url = "http://" + url
    if url[-1] == "/":
        url = url[:-1]
    return url

# Check if WordPress Site
def check_wordpress(tree):
    scripts = tree.xpath("//script[contains(@src,'wp-content')]")
    if len(scripts) > 0:
        return True
    return False

# Check WordPress Version
def wordpress_version(tree):
    type = tree.xpath("//meta[@name='generator']/@content")
    version = 0
    if len(type) > 0:
        details = type[0].split()
        if len(details) > 1 and details[0] == "WordPress":
            if len(details) > 1:
                version = details[1]
            else:
                version = type[0]
    return version

# Find Contact Page
def find_contact_page(tree):
    contact = tree.xpath("//a[contains(text(),'Contact')]/@href")
    try_xpath = 1
    while len(contact) == 0:
        if try_xpath == 1:
            contact = tree.xpath("//span[contains(text(),'Contact')]/../@href")
        elif try_xpath == 2:
            contact = tree.xpath("//p[contains(text(),'Contact')]/../@href")
        elif try_xpath == 3:
            break
        try_xpath += 1
    if len(contact) > 0:
        contact = contact[0]
        if contact.find('#') == -1:
            if contact[0] == '/':
                contact = url + "" + contact
            print contact

# Juicer method
def juice(url):
    url = format_url(url)
    string = url
    tree = scrape(url)
    if tree == False:
        return string + " \t\t\t No XML tree"
    elif check_wordpress(tree) == True:
        version = wordpress_version(tree)
        return string + " \t\t\t WordPress: " + str(version)
    else:
        return string + " \t\t\t Not WordPress"

# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1], 'r').read().split('\n')

# Juice url
def juice_url():
    while True:
        url = q.get()
        result = juice(url)
        print result
        q.task_done()

# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
    t = Thread(target=juice_url)
    t.daemon = True
    t.start()

# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
    q.put(url)
q.join()

# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total / 50)
