Using a Python web crawler to scrape Twitter accounts - python

I'm writing this program for my A-Level Computer Science coursework, and I am trying to get a crawler to scrape all the users found in a given user's following/followers lists.
The start of the script is as follows:
import requests
# import database as db
from bs4 import BeautifulSoup

debug = True

def getStartNode(): # Get the Twitter profile of the starting node
    global startNodeFollowing # Declare the nodes vars as global for use in external functions
    global startNodeFollowers
    global startNodeLink

    if not debug: # If debugging == False, allow the user to enter any starting node Twitter profile
        startNodeLink = input("Enter a link to the starting users Twitter profile\n[URL]: ")[:-1] # Get profile link, remove the last char from input (space char, needed to enter link in terminal)
    else: # If debugging == True, have predetermined starting node to save time during development
        startNodeLink = ("https://twitter.com/ckjellberg03")

    startNodeFollowers = (startNodeLink + "/followers") # Create a new var using the starting node's Twitter profile, append for followers and following URL pages
    startNodeFollowing = (startNodeLink + "/following")
And the crawler is here:
def spider(): # Web Crawler
    getStartNode()
    print("\nUsing:", startNodeLink)

    urlFollowers = startNodeFollowers
    sourceCode = requests.get(urlFollowers)
    plainText = sourceCode.text # Source code of the URL (urlFollowers) in plain text format
    soup = BeautifulSoup(plainText, 'lxml') # BeautifulSoup object to search through plainText for specific items/classes etc

    for link in soup.findAll('a', {'class': 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'}): # 'a' is a link in HTML (anchor), class is the Twitter class for a profile
        href = link.get('href') # The attribute name must be passed as a string
        print(href) # Display everything found (development purposes)
I'm pretty sure the class identifier for a user's profile link on a /followers page is "css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l" from looking at the source code, but printing the results displays nothing.
Any advice to point me in the right direction?
Thanks!

It's pretty difficult to scrape Twitter (trust me, I have tried every way). You can use the Twitter API, but it has limitations (you can't get the names of the followers, only their number). If you want to scrape some information with the Twitter API, you can use this code:
from TwitterAPI import TwitterAPI, TwitterPager
import tweepy
from tweepy import Cursor
from datetime import datetime, date, time, timedelta

consumer_key = 'consumer key'
consumer_secret = 'consumer secret'
token = 'token'
token_secret = 'token secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(token, token_secret)
api = tweepy.API(auth)

account_list = ['POTUS44']

for target in account_list:
    print("Getting data for " + target)
    item = api.get_user(target)
    print("name: " + item.name)
    print("screen_name: " + item.screen_name)
    print("description: " + item.description)
    print("statuses_count: " + str(item.statuses_count))
    print("friends_count: " + str(item.friends_count))
    print("followers_count: " + str(item.followers_count))

    tweets = item.statuses_count
    account_created_date = item.created_at
    delta = datetime.utcnow() - account_created_date
    account_age_days = delta.days
    print("Account age (in days): " + str(account_age_days))
    if account_age_days > 0:
        print("Average tweets per day: " + "%.2f" % (float(tweets) / float(account_age_days)))
    hashtags = []
    mentions = []
    tweet_count = 0
    end_date = datetime.utcnow() - timedelta(days=30)

    for status in Cursor(api.user_timeline, id=target).items():
        tweet_count += 1
        if hasattr(status, "entities"):
            entities = status.entities
            if "hashtags" in entities:
                for ent in entities["hashtags"]:
                    if ent is not None:
                        if "text" in ent:
                            hashtag = ent["text"]
                            if hashtag is not None:
                                hashtags.append(hashtag)
            if "user_mentions" in entities:
                for ent in entities["user_mentions"]:
                    if ent is not None:
                        if "screen_name" in ent:
                            name = ent["screen_name"]
                            if name is not None:
                                mentions.append(name)
        if status.created_at < end_date:
            break

Here is how to do it without the API. Part of the difficulty is using the right browser string in the User-Agent header:
import re, requests

headers = {'User-Agent': 'UCWEB/2.0 (compatible; Googlebot/2.1; +google.com/bot.html)'}

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

content = ""
for user in ['billgates']:
    content += "============================\n\n"
    content += user + "\n\n"
    content += "============================\n\n"

    url_twitter = 'https://twitter.com/%s' % user
    resp = requests.get(url_twitter, headers=headers) # Send request
    res = re.findall(r'<p class="TweetTextSize.*?tweet-text.*?>(.*?)</p>', resp.text)

    for x in res:
        x = cleanhtml(x)
        # Unescape the HTML entities left in the tweet text
        x = x.replace("&#39;", "'")
        x = x.replace("&quot;", '"')
        x = x.replace("&nbsp;", " ")
        content += x
        content += "\n\n"
        content += "---"
        content += "\n\n"
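If you want to keep what was scraped, a minimal follow-up (the filename is just an example) is to dump content to a text file once the loop above has finished:
# Save the scraped tweets to disk (hypothetical filename)
with open('scraped_tweets.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(content)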

Related

Why do I get this error when trying to run my Twitter weather bot

def reply_to_tweet():
    print("retrieving and replying to tweets...")
    last_seen_tweet = read_last_seen(FILE_NAME)
    # mentions_timeline() returns a list of 20 most recent mentions
    mentions = api.mentions_timeline(last_seen_tweet, tweet_mode="extended")
    # reversing to read old tweets first
    for mention in reversed(mentions):
        print(str(mention.id) + " - " + mention.full_text)
        last_seen_tweet = mention.id
        store_last_seen(FILE_NAME, last_seen_tweet)
        if "#" in mention.full_text.lower():
            print("found #")
            location = get_location(mention.full_text)
            weather = get_weather(location)
            # responding to tweet mention
            api.update_status("#" + mention.user.screen_name + weather, mention.id)

def get_location(tweet):
    # takes tweet and returns only the substring attached to hashtag
    tweet_location = [i.strip("#") for i in tweet.split() if i.startswith("#")[0]]
    tweet_location += " today weather.com"
    return tweet_location

def get_weather(query):
    for url in search(query, stop=1):
        print("Results is " + url)
        # this code sends a request and reads the webpage enclosed in the response
        request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        webpage = urlopen(request).read()
        soup = BeautifulSoup(webpage, "html.parser")
        try:
            title = soup.findAll("span", "today-daypart-title")[0].string
            phrase = soup.findAll("span", "today-daypart-wxphrase")[0].string
        except IndexError as e:
            forecast = (" could not find the weather, check back later")
            print(e)
        else:
            forecast = (" forecast for " + title + " is " + phrase)
            print(forecast)
        return forecast

while True:
    reply_to_tweet()
    time.sleep(15)
I then call reply_to_tweet(); it searches and displays the tweet, but when it tries to get the location I get:
TypeError: 'bool' object is not subscriptable
I am fairly new to bots and APIs. I am using BeautifulSoup, googlesearch and tweepy.
It looks like you're trying to index the bool returned by startswith(); try removing the [0] from tweet_location = [i.strip("#") for i in tweet.split() if i.startswith("#")[0]].
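For reference, a corrected version of the helper might look like this (a sketch based on the code in the question; the index is moved onto the finished list so get_location() still returns a single string, and an empty string is returned if no hashtag is found):
def get_location(tweet):
    # keep only the words that start with '#', with the '#' stripped off
    hashtag_words = [i.strip("#") for i in tweet.split() if i.startswith("#")]
    if not hashtag_words:
        return ""
    # build the search query from the first hashtag found
    return hashtag_words[0] + " today weather.com"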

Add key-value in JSON when calling an API

I am calling an API to get a list of properties. I obtain the JSON results in 'listings' (https://api.nestoria.es/show_example?name=search_listings_es&syntax=1).
I am using zip codes to find the properties, and I need to add its zip code to each property. At the end I transform the JSON into a CSV. I don't know how to add the zip code to each property (I would need to add a new key-value pair to each result from the API call)...
Thanks!!!
Here is the code:
from requests import get
import json
import pandas as pd
import time
import datetime
import csv

def get_nestoria(type):
    # call the api
    api = 'http://api.nestoria.es/api?action=search_listings'
    place = '&place_name=' + area_name
    listing_type = '&listing_type=' + type
    json_es = '&encoding=json&pretty=1&country=es'
    page = '&page='
    api_input = api + place + listing_type + json_es
    response = get(api_input)

    # Check if the API has worked
    if response.status_code == 200:
        print("API called successfully")
    elif response.status_code == 400:
        print("Wrong request for " + area_name + ". Check this area is searchable")
    elif response.status_code == 403:
        print("Forbidden API call. Maximum number of calls reached.")
    else:
        print("Wrong code", response.status_code)

    content_as_string = response.content.decode()
    # Decode JSON
    content = json.loads(content_as_string)
    content_response = content['response']

    # Number of total web pages needed for the area
    web_pages = content_response['total_pages']
    print('Number of pages in that area: ', web_pages)
    print("Number of total properties " + area_name, content_response['total_results'])

    # 2nd call to the API
    homes = pd.DataFrame()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}
    for i in range(1, web_pages + 1):
        api_input = api + place + listing_type + json_es + page + str(i)
        response = get(api_input, headers=headers)

        content_as_string = response.content.decode()
        content = json.loads(content_as_string)
        content_response = content['response']
        listings = content_response['listings']
        listings = pd.DataFrame(listings)

        if i == 1:
            homes = listings
        else:
            homes = homes.append(listings, sort=True)
        time.sleep(3)

    if homes.empty:
        homes = homes
    else:
        homes = homes[['bathroom_number','bedroom_number','car_spaces','commission','construction_year','datasource_name','img_height','img_url','img_width','floor',
                       'keywords','latitude','lister_url','listing_type','location_accuracy','longitude','price','price_currency','price_formatted','price_high','price_low',
                       'property_type','room_number','size','size_type','size_unit','summary','thumb_height','thumb_url','thumb_width','title','updated_in_days','updated_in_days_formatted']]

    return homes

homes = pd.DataFrame()
codigos_postales = ['01008']
today = datetime.date.today() # to change the name of the file

for i in codigos_postales:
    area_name = i
    temp = get_nestoria('buy')

    if i == 0:
        homes = temp
    else:
        homes = homes.append(temp, sort=True)

print('Number of extracted properties ', len(homes))
print(homes.head())
homes.to_csv('D:\\a000Master Big Data\\Prácticas\\Web scrapping\\Nestoria\\GranadaVenta' + str(today) + '.csv')
data = response.json()
Here data is the parsed response you get from sending the request.
You can then update the listings like this:
for i in data['listings']:
    i['ZipCode'] = zipcode
zipcode being the one you want to assign, i.e. the one you sent in the request.
You can either convert the data object into a DataFrame and then call pd.to_csv, or use Python's built-in csv.writer.
If you don't have the zip code, you can use the Google Maps Geocoding API to get it:
http://maps.googleapis.com/maps/api/geocode/json?address=valencia&sensor=true_or_false&key=YOUR_API_KEY
You will have to sign up to get your API key, and then you can read the zip code from the JSON response.
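As a rough sketch (the address value and YOUR_API_KEY are placeholders, and the parsing assumes the standard Geocoding JSON layout), extracting the postal code could look like this:
from requests import get

# Hypothetical lookup: geocode an area name and pull out its postal code, if any
geo_url = ('http://maps.googleapis.com/maps/api/geocode/json'
           '?address=valencia&key=YOUR_API_KEY')
geo = get(geo_url).json()

zipcode = None
results = geo.get('results', [])
if results:
    for component in results[0]['address_components']:
        if 'postal_code' in component['types']:
            zipcode = component['long_name']
            break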
UPDATE:
Here is an example on how to use it.
api = 'http://api.nestoria.es/api?action=search_listings'
place = '&place_name=' + area_name
listing_type = '&listing_type=' + type
json_es = '&encoding=json&pretty=1&country=es'
page = '&page='
api_input = api + place + listing_type + json_es
response = get(api_input)

update = response.json()['response']['listings']
for i in update:
    i['Zipcode'] = zipcode
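And since each listing now carries its Zipcode, writing the CSV is the usual pandas step; a minimal sketch (the output filename is arbitrary):
import pandas as pd

# The 'Zipcode' key added above becomes a normal column in the DataFrame
listings_df = pd.DataFrame(update)
listings_df.to_csv('listings_with_zipcode.csv', index=False)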

Twitter API connection aborted with Twython

I'm trying to download Twitter followers from a list of accounts. My function (which uses Twython) works pretty well for short account lists but raises an error for longer lists. It is not a rate-limit problem, since my function sleeps until the next time bin if the rate limit is hit.
The error is this:
twythonerror: ('Connection aborted.', error(10054, ''))
Others seem to have the same problem, and the proposed solution is to make the function sleep between different REST API calls, so I implemented the following code:
del twapi
sleep(nap[afternoon])
afternoon = afternoon + 1
twapi = Twython(app_key=app_key, app_secret=app_secret,
oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
nap is a list of intervals in seconds and afternoon is an index.
Despite this suggestion, I still have exactly the same problem. It seems that the sleep doesn't resolve the problem.
Can anyone help me?
Here is the whole function:
def download_follower(serie_lst):
    """Creates account named txt files containing followers ids. Uses for loop on accounts names list."""
    nap = [1, 2, 4, 8, 16, 32, 64, 128]
    afternoon = 0

    for exemplar in serie_lst:
        # username from serie_lst entries
        account_name = exemplar
        twapi = Twython(app_key=app_key, app_secret=app_secret,
                        oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
        try:
            # initializations
            del twapi
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""

            # user info
            user = twapi.lookup_user(screen_name=account_name)

            # store user name
            result['screen_name'] = account_name

            # loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start=1, stop=15, step=1))
                call_result = twapi.get_followers_ids(screen_name=account_name, cursor=next_cursor)
                # loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"] # new next_cursor
                iteration = iteration + 1
                if (iteration > 13): # skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901) # 15min + 1sec
                    iteration = 0

            # output file
            file_name = "".join([account_name, ".txt"])

            # print output
            out_file = open(file_name, "w") # open file "account_name.txt"
            # out_file.write(str(result["followers"])) # standard format
            for i in result["followers"]: # R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
        except twython.TwythonRateLimitError:
            # wait
            error_msg = localtime()
            error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
            error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
            print(error_msg)
            del error_msg
            del twapi
            sleep(901) # 15min + 1sec

            # initializations
            if afternoon >= 7:
                afternoon = 0
            sleep(nap[afternoon])
            afternoon = afternoon + 1
            twapi = Twython(app_key=app_key, app_secret=app_secret,
                            oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
            next_cursor = -1
            result = {}
            result["screen_name"] = ""
            result["followers"] = []
            iteration = 0
            file_name = ""

            # user info
            user = twapi.lookup_user(screen_name=account_name)

            # store user name
            result['screen_name'] = account_name

            # loop until all cursored results are stored
            while (next_cursor != 0):
                sleep(random.randrange(start=1, stop=15, step=1))
                call_result = twapi.get_followers_ids(screen_name=account_name, cursor=next_cursor)
                # loop over each entry of followers id and append each entry to results_follower
                for i in call_result["ids"]:
                    result["followers"].append(i)
                next_cursor = call_result["next_cursor"] # new next_cursor
                iteration = iteration + 1
                if (iteration > 13): # skip sleep if all cursored pages are processed
                    error_msg = localtime()
                    error_msg = "".join([str(error_msg.tm_mon), "/", str(error_msg.tm_mday), "/", str(error_msg.tm_year), " at ", str(error_msg.tm_hour), ":", str(error_msg.tm_min)])
                    error_msg = "".join(["Twitter API Request Rate Limit hit on ", error_msg, ", wait..."])
                    print(error_msg)
                    del error_msg
                    sleep(901) # 15min + 1sec
                    iteration = 0

            # output file
            file_name = "".join([account_name, ".txt"])

            # print output
            out_file = open(file_name, "w") # open file "account_name.txt"
            # out_file.write(str(result["followers"])) # standard format
            for i in result["followers"]: # R friendly table format
                out_file.write(str(i))
                out_file.write("\n")
            out_file.close()
As discussed in the comments, there are a few issues with your code at present. You shouldn't need to delete your connection for it to function properly, and I think the issue arises because you initialise a second time without any catches for hitting your rate limit. Here is an example using Tweepy of how you can get the information you require:
import tweepy
from datetime import datetime

def download_followers(user, api):
    all_followers = []
    try:
        for page in tweepy.Cursor(api.followers_ids, screen_name=user).pages():
            all_followers.extend(map(str, page))
        return all_followers
    except tweepy.TweepError:
        print('Could not access user {}. Skipping...'.format(user))

# Include your keys below:
consumer_key = 'YOUR_KEY'
consumer_secret = 'YOUR_KEY'
access_token = 'YOUR_KEY'
access_token_secret = 'YOUR_KEY'

# Set up tweepy API, with handling of rate limits
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
main_api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# List of usernames to get followers for
lookup_users = ['asongtoruin', 'mbiella']

for username in lookup_users:
    user_followers = download_followers(username, main_api)
    if user_followers:
        with open(username + '.txt', 'w') as outfile:
            outfile.write('\n'.join(user_followers))
        print('Finished outputting: {} at {}'.format(username, datetime.now().strftime('%Y/%m/%d %H:%M:%S')))
Tweepy is clever enough to know when it has hit its rate limit when we use wait_on_rate_limit=True, and it checks how long it needs to sleep for before it can start again. By using wait_on_rate_limit_notify=True, we allow it to print out how long it will be waiting until it can next get a page of followers (through this ID-based method, it seems as though there are 5000 IDs per page).
We additionally catch a TweepError exception - this can occur if the username provided relates to a protected account for which our authenticated user does not have permission to view. In this case, we simply skip the user to allow other information to be downloaded, but print out a warning that the user could not be accessed.
Running this saves a text file of follower ids for any user it can access. For me this prints the following:
Rate limit reached. Sleeping for: 593
Finished outputting: asongtoruin at 2017/02/22 11:43:12
Could not access user mbiella. Skipping...
The follower IDs of asongtoruin (aka me) are saved as asongtoruin.txt.
There is one possible issue, in that our pages of followers start from the newest first. This could (though I don't understand the API well enough to say with certainty) cause issues with our output dataset if new users are added between our calls, as we may both miss these users and end up with duplicates in our dataset. If duplicates become an issue, you could change return all_followers to return set(all_followers).
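If you do make that change, only the last line of the try block in download_followers needs to be touched, for example:
        # de-duplicated, but note that set() does not preserve the order the IDs arrived in
        return set(all_followers)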

Getting 403 error when trying to parse dropbox events page with python and mechanize

I use this script to get a list of all file updates to a certain directory. I then parse that list to get a list of time slots I have been active in that directory. That way I can quickly see how much time I have spent on the project and know what to charge my client.
I have written a small python script, adapted from this: https://github.com/jncraton/PythonDropboxUploader
I added the bottom function to retrieve a specific events page from https://www.dropbox.com/events?ns=false&n=50
I used the script 2 months ago and it worked well, but now I am getting 403: Forbidden errors on:
eventSrc = self.browser.open(req).read()
Dropbox probably tries to block scrapers like mine to push programmers to use their API instead, but unfortunately the API doesn't support listing the events.
Can anybody help me out to get it working again?
This is the python code to create the connection:
import mechanize
import urllib
import urllib2  # used by get_dir_list below
import re
import json

class DropboxConnection:
    """ Creates a connection to Dropbox """

    email = ""
    password = ""
    root_ns = ""
    token = ""
    browser = None

    def __init__(self, email, password):
        self.email = email
        self.password = password
        self.login()
        self.get_constants()

    def login(self):
        """ Login to Dropbox and return mechanize browser instance """
        # Fire up a browser using mechanize
        self.browser = mechanize.Browser()
        self.browser.set_handle_equiv(False)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]
        # Browse to the login page
        self.browser.open('https://www.dropbox.com/login')
        # Enter the username and password into the login form
        isLoginForm = lambda l: l.action == "https://www.dropbox.com/login" and l.method == "POST"
        try:
            self.browser.select_form(predicate=isLoginForm)
        except:
            self.browser = None
            raise(Exception('Unable to find login form'))
        self.browser['login_email'] = self.email
        self.browser['login_password'] = self.password
        self.browser['t'] = "1230"
        # Send the form
        response = self.browser.submit()

    def get_constants(self):
        """ Load constants from page """
        home_src = self.browser.open('https://www.dropbox.com/home').read()
        try:
            self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
            self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
        except:
            raise(Exception("Unable to find constants for AJAX requests"))

    def upload_file(self, local_file, remote_dir, remote_file):
        """ Upload a local file to Dropbox """
        if(not self.is_logged_in()):
            raise(Exception("Can't upload when not logged in"))
        self.browser.open('https://www.dropbox.com/')
        # Add our file upload to the upload form
        isUploadForm = lambda u: u.action == "https://dl-web.dropbox.com/upload" and u.method == "POST"
        try:
            self.browser.select_form(predicate=isUploadForm)
        except:
            raise(Exception('Unable to find upload form'))
        self.browser.form.find_control("dest").readonly = False
        self.browser.form.set_value(remote_dir, "dest")
        self.browser.form.add_file(open(local_file, "rb"), "", remote_file)
        # Submit the form with the file
        self.browser.submit()

    def get_dir_list(self, remote_dir):
        """ Get file info for a directory """
        if(not self.is_logged_in()):
            raise(Exception("Can't download when not logged in"))
        req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token
        req = urllib2.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
        req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)
        dir_info = json.loads(self.browser.open(req).read())
        dir_list = {}
        for item in dir_info['file_info']:
            # Eliminate directories
            if(item[0] == False):
                # get local filename
                absolute_filename = item[3]
                local_filename = re.findall(r".*\/(.*)", absolute_filename)[0]
                # get file URL and add it to the dictionary
                file_url = item[8]
                dir_list[local_filename] = file_url
        return dir_list

    def get_download_url(self, remote_dir, remote_file):
        """ Get the URL to download a file """
        return self.get_dir_list(remote_dir)[remote_file]

    def download_file(self, remote_dir, remote_file, local_file):
        """ Download a file and save it locally """
        fh = open(local_file, "wb")
        fh.write(self.browser.open(self.get_download_url(remote_dir, remote_file)).read())
        fh.close()

    def is_logged_in(self):
        """ Checks if a login has been established """
        if(self.browser):
            return True
        else:
            return False

    def getEventsPage(self, n):
        if(not self.is_logged_in()):
            raise(Exception("Can't get event page when not logged in"))
        url = 'https://www.dropbox.com/next_events'
        values = {'cur_page': n, 'ns_id': 'false'}
        data = urllib.urlencode(values)
        req = mechanize.Request(url, data)
        # print url + '?' + data
        eventSrc = self.browser.open(req).read()
        return eventSrc
And this is the loop that parses the events pages:
from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc

c = pdc.Constants()
p = pdt.Calendar(c)

email = "myemail#gmail.com" # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")

dateFile = open('all_file_updates.txt', "wb")

try:
    # Create the connection
    conn = DropboxConnection(email, password)
except:
    print("Connection failed")
else:
    print("Connection successful")
    n = 250
    found = 0
    while(n >= 0):
        eventsPageSrc = conn.getEventsPage(n)
        soup = BeautifulSoup(eventsPageSrc)
        table = soup.find("table", {"id": "events"})
        for row in table.findAll('tr'):
            link = row.find("a", href=re.compile('^https://dl-web.dropbox.com/get/ProjectName'))
            if(link != None):
                dateString = row.find("td", attrs={'class': 'modified'}).string
                date = p.parse(dateString)
                dateFile.write('Date: ' + str(date) + ' file: ' + link.string + '\n')
                found = found + 1
        n = n - 1
        print 'page: ' + str(n) + ' Total found: ' + str(found)
In def get_constants(self): change
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
to
self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]
Dropbox has changed the way it stores constants.
Hope it helps.

Is it possible to use OAUTH 2 with the Google Reporting API?

I am currently using OAuth 1 for auth with the Reporting API with GData and Python. Is it possible to use OAuth 2? I can't find a reference that this is doable.
I wasn't able to find any reference for OAuth 2 and the Reporting API, but by following the samples for the GData libraries (http://code.google.com/p/gdata-python-client/source/browse/#hg%2Fsamples%2Fapps) I was able to cobble this together:
#!/usr/bin/python

import sys
import os
import time

import gdata.gauth
import gdata.client
import httplib2
import oauth2client.file
import oauth2client.tools

REPORTING_URI = 'https://www.google.com/hosted/services/v1.0/reports/ReportingData'
REPORTING_XML_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<rest xmlns="google:accounts:rest:protocol"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<type>Report</type>
<domain>%s</domain>
<date>%s</date>
<page>%s</page>
<reportType>daily</reportType>
<reportName>%s</reportName>
</rest>'''

OAUTH2FILENAME = 'oauth_20.dat'
OAUTH2JSONFILE = 'client_secrets.json'
OAUTH2SCOPES = 'https://www.google.com/hosted/services/v1.0/reports/ReportingData'
OAUTH2USERAGENT = 'REPORTING'
CLIENTSOURCE = 'REPORTING'

MISSING_OAUTHJSON_FILE_MESSAGE = """
WARNING: Please configure OAuth 2.0
To continue you will need to populate the client_secrets.json file:
%s
with information from the APIs Console <https://code.google.com/apis/console>.
""" % os.path.join(os.path.dirname(__file__), OAUTH2JSONFILE)

### Reporting
def RunReport(http_object, domain, report=None, date=None):
    if date is None:
        now = time.time()
        report_time = time.gmtime(now)
        date = time.strftime("%Y-%m-%d", report_time)
    if report is None:
        report = 'accounts'
    report_data = RequestReport(http_object, domain=domain, report=report, date=date)
    if not report_data:
        print 'No report data'
    return report_data

def RequestReport(http_object, domain=None, report=None, date=None):
    """Retrieves a report

    Args:
        domain: string
        report: string: accounts, activity, disk_space, email_clients, summary
        date: string: YYYY-MM-DD

    Returns:
        String, the report data
    """
    report_data = ''
    uri = REPORTING_URI
    if not report or report is None:
        return report_data
    if not date or date is None:
        return report_data
    if not domain or domain is None:
        domain = self.domain
    page = 1
    while True:
        report_xml = REPORTING_XML_TEMPLATE % (domain, date, page, report)
        response = ''
        report_page = ''
        try:
            response, report_page = http_object.request(
                uri, method='POST', body=report_xml)
        except Exception, rexcept:
            print 'Exception: ', rexcept
            report_page = ''
            break
        if response.status != 200:
            print 'Error: ', response.status
            report_page = ''
            break
        if not report_page or report_page == 'End-Of-Report':
            break
        else:
            report_data += report_page
            page = page + 1
    return report_data

scopes = OAUTH2SCOPES
user_agent = OAUTH2USERAGENT
client_source = CLIENTSOURCE
str_oauth2file = OAUTH2FILENAME
str_oauthjsonfile = OAUTH2JSONFILE

domain = 'somedomain'
report_name = 'accounts'

client_id = 'string'
client_secret = 'string'

report_data = ''
oauth2_flow = ''

now = time.time()
report_time = time.gmtime(now)
report_date = time.strftime("%Y-%m-%d", report_time)

if not os.path.isfile(str_oauth2file):
    token = gdata.gauth.OAuth2Token(client_id=client_id,
        client_secret=client_secret, scope=scopes, user_agent=user_agent)
    uri = token.generate_authorize_url()
    print 'Please visit this URL to authorize the application:'
    print uri
    # Get the verification code from the standard input.
    code = raw_input('What is the verification code? ').strip()
    token.get_access_token(code)
    oauth2_flow = oauth2client.client.flow_from_clientsecrets(str_oauthjsonfile,
        scope=scopes, message=MISSING_OAUTHJSON_FILE_MESSAGE)

storage = oauth2client.file.Storage(str_oauth2file)
oauth2_credentials = storage.get()
if oauth2_credentials is None or oauth2_credentials.invalid:
    if not oauth2_flow:
        oauth2_flow = oauth2client.client.flow_from_clientsecrets(str_oauthjsonfile,
            scope=scopes, message=MISSING_OAUTHJSON_FILE_MESSAGE)
    print '\nYou must authorize access to the request APIS.\n'
    # Save the credentials in storage to be used in subsequent runs.
    oauth2_credentials = oauth2client.tools.run(oauth2_flow, storage)

http_oauth2_object = httplib2.Http()
http_oauth2_object = oauth2_credentials.authorize(http_oauth2_object)

report_data = RunReport(
    http_oauth2_object, domain, report=report_name, date=report_date)

if report_data:
    print report_data

sys.exit(0)
