I have written a simple Python script that reads a list of domains from a txt file and checks whether each one is a WordPress site, based on the returned response.
The code is as follows:
import requests

#Loop domains list
with open('domains2') as f:
    for line in f:
        domain = line
        source = requests.get(domain)
        if "wp-include" in source:
            results = 'Yes'
        else:
            results = 'No'
        print(line, ' : ', results)
The error is as follows:
Traceback (most recent call last):
File "./test4.py", line 8, in <module>
source = requests.get(domain)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='testing.com%0a', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd5a00c4d50>: Failed to establish a new connection: [Errno -2] Name or service not known',))
I was only able to run my code when I set the value of source manually as follows (not reading the domains from the list), and then the results were correct:
source = requests.get(domain).text
import requests

#Loop domains list
with open('domains2') as f:
    for line in f:
        domain = line.rstrip()
        source = requests.get(domain)
        if "wp-include" in source.text:
            results = 'Yes'
        else:
            results = 'No'
        print(line, ' : ', results)
Use source.text to get the response body, and rstrip() to remove the trailing \n; the %0a in the traceback's host (testing.com%0a) is that URL-encoded newline.
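Since an unreachable or malformed domain in the list would still raise a ConnectionError and stop the whole loop, here is a minimal sketch of the same loop with error handling added (the timeout value and the 'Error' label are my own additions, not from the original code):

import requests

with open('domains2') as f:
    for line in f:
        domain = line.rstrip()
        try:
            source = requests.get(domain, timeout=10)  # timeout chosen arbitrarily
            results = 'Yes' if 'wp-include' in source.text else 'No'
        except requests.exceptions.RequestException:
            results = 'Error'  # could not resolve or connect to the domain
        print(domain, ' : ', results)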
With domain transformation into a valid URL (for requests), Python 3:
#!/usr/bin/env python
import requests
import re
from urllib import parse


def get_domains(file):
    res = []
    with open(file) as f:
        for x in f:
            url = x.strip()
            p = parse.urlparse(url, 'http')
            netloc = p.netloc or p.path
            path = p.path if p.netloc else ''
            if not netloc.startswith('www.'):
                netloc = 'www.' + netloc
            p = parse.ParseResult('http', netloc, path, *p[3:])
            res.append(p.geturl())
    return res


def is_wordpress(url):
    print(f"getting: {url}")
    content = requests.get(url).text
    if re.search('wp-include', content):
        return True
    else:
        return False


def main():
    result = {}
    for domain in get_domains('domain.txt'):
        result[domain] = is_wordpress(domain)
    print(result)


if __name__ == '__main__':
    main()
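A quick illustration of why the netloc or p.path fallback above is needed: when an input line has no scheme, urlparse puts the whole host into path and leaves netloc empty (example.com below is just a placeholder domain):

from urllib import parse

# No scheme: the host ends up in .path, .netloc is empty
p = parse.urlparse('example.com', 'http')
print(p.netloc, p.path)   # -> '' 'example.com'

# With a scheme: the host lands in .netloc as expected
p = parse.urlparse('http://example.com/blog', 'http')
print(p.netloc, p.path)   # -> 'example.com' '/blog'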
Related
I am trying to implement the Amazon Web Scraper mentioned here. However, I get the output mentioned below. The output repeats until it stops with RecursionError: maximum recursion depth exceeded.
I have already tried downgrading eventlet to version 0.17.4 as mentioned here.
Also, the requests module is getting patched, as you can see in helpers.py.
helpers.py
import os
import random
from datetime import datetime
from urllib.parse import urlparse

import eventlet
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')

import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

num_requests = 0

redis = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)


def make_request(url, return_soup=True):
    # global request building and response handling
    url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs

    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))

    proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException as e:
        log("WARNING: Request for {} failed, trying again.".format(url))
    num_requests += 1

    if r.status_code != 200:
        os.system('say "Got non-200 Response"')
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None

    if return_soup:
        return BeautifulSoup(r.text), r.text
    return r


def format_url(url):
    # make sure URLs aren't relative, and strip unnecessary query args
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.de"
    path = u.path

    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            k, v = piece.split("=")
            if k in settings.allowed_params:
                query += "{k}={v}&".format(**locals())
        query = query[:-1]
    return "{scheme}://{host}{path}{query}".format(**locals())


def log(msg):
    # global logging function
    if settings.log_stdout:
        try:
            print("{}: {}".format(datetime.now(), msg))
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text


def get_proxy():
    # choose a proxy server to use for this request, if we need one
    if not settings.proxies or len(settings.proxies) == 0:
        return None
    proxy = random.choice(settings.proxies)
proxy_url = "socks5://{user}:{passwd}#{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy,
        port=settings.proxy_port,
    )
    return {
        "http": proxy_url,
        "https": proxy_url
    }


if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    print(r.text)
output
Traceback (most recent call last):
File "helpers.py", line 112, in <module>
r = make_request('https://api.ipify.org?format=json', return_soup=False)
File "helpers.py", line 36, in make_request
r = requests.get(url, headers=settings.headers, proxies=proxies)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/contrib/socks.py", line 99, in _new_conn
**extra_kw
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 199, in create_connection
sock.connect((remote_host, remote_port))
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 47, in wrapper
return function(*args, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 774, in connect
super(socksocket, self).settimeout(self._timeout)
File "/home/ec2-user/env/lib64/python3.7/site-packages/eventlet/greenio/base.py", line 395, in settimeout
self.setblocking(True)
What might be the problem here?
It turned out that removing eventlet.monkey_patch() and import eventlet solved the problem.
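For illustration only, a hedged before/after sketch of that change, assuming the global monkey-patching lived at the top of the entry-point script (the file name and layout are my assumptions; they are not shown in the question):

# Before (hypothetical entry point, e.g. run.py):
#   import eventlet
#   eventlet.monkey_patch()   # global patch of socket/time, clashes with import_patched
#   import helpers
#
# After: drop the global monkey_patch() and the bare "import eventlet" there,
# and rely on the targeted patching already done inside helpers.py via
# eventlet.import_patched('requests.__init__').
import helpers

r = helpers.make_request('https://api.ipify.org?format=json', return_soup=False)
print(r.text)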
I am trying to connect to a website via a RESTful API. A token has to be generated to access the methods. It is working fine, since I can access data through all the other methods, but I am stuck on this one.
My code so far:
import requests


class FlipkartAPI:
    def __init__(self, token, sandbox=False):
        self.token = token
        self.session = self.get_session()
        self.sandbox = sandbox

    def get_session(self):
        session = requests.Session()
        session.headers.update({
            'Authorization': 'Bearer %s' % self.token,
            'Content-type': 'application/json',
        })
        return session

    def returns(self, source, modified_after=None, created_after=None):
        if self.sandbox == False:
            url = "http://api.flipkart.net/returns"
        else:
            url = "http://sandbox-api.flipkart.net/returns"
        payload = {'source': source,
                   'modifiedAfter': modified_after,
                   'createdAfter': created_after}
        return self.session.get(url, params=payload)
test.py:
class ListOrders:
    def __init__(self):
        self.app_id = 'app_id'
        self.app_secret = 'app_secret'
        auth = Authentication(self.app_id, self.app_secret, sandbox=False)
        get_token = auth.get_access_token()
        token_str = get_token.json()
        token = token_str['access_token']
        self.flipkart = FlipkartAPI(token, sandbox=False)

    def ret(self):
        r = self.flipkart.returns('customer_return', modified_after='2015-09-01', created_after='2015-09-01')
        print r.url
        print r.status_code
The problem is that I am getting a max retries exceeded error every time I call the ret method, and it doesn't even print the URL and the status code for the request. Link to Documentation. What am I doing wrong? I can access the other methods, so there is no problem with the token generation.
Traceback:
Traceback (most recent call last):
File "test.py", line 131, in <module>
r = x.ret()
File "test.py", line 123, in ret
r = self.flipkart.returns('customer_return')
File "/home/manish/Desktop/Flipkart_Api_Main/api.py", line 77, in returns
return self.session.get(url)
File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 467, in get
return self.request('GET', url, **kwargs)
File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 455, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 558, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python2.7/dist-packages/requests/adapters.py", line 378, in send
raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='api.flipkart.net', port=80): Max retries exceeded with url: /returns (Caused by <class 'socket.error'>: [Errno 111] Connection refused)
EDIT: POSTMAN APP DATA
I am using rauth and requests to make calls to the Beatport API. The calls work, but I quite occasionally get the following error: ConnectionError: HTTPSConnectionPool(host='oauth-api.beatport.com', port=443): Max retries exceeded with url
Here is the traceback.
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "scraper/songlist_top100.py", line 88, in <module>
'sortBy': 'releaseDate ASC'})
File "C:\Python27\lib\site-packages\requests\sessions.py", line 347, in get
return self.request('GET', url, **kwargs)
File "C:\Python27\lib\site-packages\rauth\session.py", line 208, in request
return super(OAuth1Session, self).request(method, url, **req_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 335, in reques
t
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 438, in send
r = adapter.send(request, **kwargs)
File "C:\Python27\lib\site-packages\requests\adapters.py", line 327, in send
raise ConnectionError(e)
ConnectionError: HTTPSConnectionPool(host='oauth-api.beatport.com', port=443):
Max retries exceeded with url: /catalog/3/tracks?perPage=150&
oauth_nonce=xxxxx&oauth_timestamp=xxxxx&facets=artistName%3A
Avicii&oauth_signature_method=HMAC-SHA1&oauth_version=1.0&
oauth_consumer_key=xxxxx&oauth_token=xxxxxx&sortBy=releaseDate+ASC
&oauth_signature=xxxxx%3D&page=3 (Caused by <class 'httplib.BadStatusLine'>: '')
Here is my script
from rauth import OAuth1Service
import requests
from hunt.models import DJ, Song


def get_obj_or_none(model, **kwargs):
    try:
        return model.objects.get(**kwargs)
    except model.DoesNotExist:
        return None


beatport_login = 'xxx'
beatport_pass = 'xxx'

beatport = OAuth1Service(
    name='beatport',
    consumer_key='xxxxx',
    consumer_secret='xxxxx',
    request_token_url='https://oauth-api.beatport.com/identity/1/oauth/request-token',
    access_token_url='https://oauth-api.beatport.com/identity/1/oauth/access-token',
    authorize_url='https://oauth-api.beatport.com/identity/1/oauth/authorize',
    base_url='https://oauth-api.beatport.com/json/catalog')

request_token, request_token_secret = beatport.get_request_token(method='POST', data={
    'oauth_callback': 'http://www.edmhunters.com'})

authorize_url = beatport.get_authorize_url(request_token)

values = {
    'oauth_token': request_token,
    'username': beatport_login,
    'password': beatport_pass,
    'submit': 'Login',
}

r = requests.post('https://oauth-api.beatport.com/identity/1/oauth/authorize-submit', data=values)
verifier = r.url.split("oauth_verifier=", 1)[1]

tokens = beatport.get_raw_access_token(request_token, request_token_secret, method='POST', data={
    'oauth_verifier': verifier})

token_string = tokens.content
access_token = token_string[token_string.find('=') + 1:token_string.find('&')]
access_token_secret = token_string[token_string.find('t=') + 2:token_string.rfind('&s')]

session = beatport.get_session((access_token, access_token_secret))

for dj in DJ.objects.all():
    r = session.get('https://oauth-api.beatport.com/catalog/3/tracks', params={'facets': "artistName:" + dj.name, 'perPage': 150})
    count_response = r.json()
    results = []
    for i in range(1, count_response['metadata']['totalPages'] + 1):
        r1 = session.get('https://oauth-api.beatport.com/catalog/3/tracks', params={'facets': "artistName:" + dj.name,
                                                                                    'page': i,
                                                                                    'perPage': 150,
                                                                                    'sortBy': 'releaseDate ASC'})
        json_response = r1.json()
        results += json_response['results']
    song_list = []
    for song in results:
        artists = [artist['name'] for artist in song['artists'] if str(artist['type']) == 'artist']
        remixers = [artist['name'] for artist in song['artists'] if str(artist['type']) == 'remixer']
        if not ((dj.name in artists) and ((dj.name not in remixers) if len(remixers) > 0 else False)):
            song_list.append(song)
    for song in song_list:
        artists = [artist['name'] for artist in song['artists'] if str(artist['type']) == 'artist']
        remixers = [artist['name'] for artist in song['artists'] if str(artist['type']) == 'remixer']
        artist_list = ', '.join(artists)
        remixer_list = ', '.join(remixers)
        song_name = song['name']
        if not (song_name.lower().find("feat.") == -1):
            normal_name = song_name[0:song_name.lower().find("feat.")].rstrip()
        else:
            normal_name = song_name
        genre_list = []
        for genre in song['genres']:
            genre_list.append(genre['name'])
        genres = ', '.join(genre_list)
        if not get_obj_or_none(Song, name__iexact=song_name, artist=dj):
            s = Song(song_id=song['id'],
                     name=song_name,
                     title=song['title'],
                     normalized_name=normal_name,
                     artist=dj,
                     artists=artist_list,
                     remixers=remixer_list,
                     release_date=song['releaseDate'],
                     slug=song['slug'],
                     artwork=song['images']['large']['url'],
                     genres=genres)
            s.save()
            print "Added song:", s.song_id, s.artist
Why do I get the above mentioned error?
It looks as if the Beatport API is overloaded and sometimes closes the connection prematurely. Your first set of requests succeeded just fine; it was page 3 that threw the error, because the response was empty.
You really should report this to Beatport, but you could perhaps work around this issue by instructing the requests module to retry requests:
from requests.adapters import HTTPAdapter
# ....
session = beatport.get_session((access_token, access_token_secret))
session.mount('https://oauth-api.beatport.com', HTTPAdapter(max_retries=5))
This would retry your requests a few more times in case an error occurs.
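For finer-grained control, you could mount the same adapter with a urllib3 Retry object instead of a bare count; the total, backoff_factor and status_forcelist values below are illustrative choices, not part of the original answer:

from requests.adapters import HTTPAdapter
try:
    from urllib3.util.retry import Retry
except ImportError:  # older requests versions bundle urllib3
    from requests.packages.urllib3.util.retry import Retry

retries = Retry(
    total=5,                                # up to 5 attempts per request
    backoff_factor=0.5,                     # sleep 0.5s, 1s, 2s, ... between attempts
    status_forcelist=[500, 502, 503, 504],  # also retry on these server errors
)
session.mount('https://oauth-api.beatport.com', HTTPAdapter(max_retries=retries))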
I'm trying to write a script to programmatically log in to Google Finance, view my portfolio and then display the results on my desktop. I'm using the requests module and am currently stuck on the login part.
I keep getting this error requests.cookies.CookieConflictError: There are multiple cookies with name, 'APISID'
Here is the entire script; the error is thrown on line 48. I'm guessing it has something to do with requests' keep-alive and the connection not being recycled properly?
#!/usr/bin/env python
import getpass
import re
import requests
email = raw_input("Enter your Google username: ")
password = getpass.getpass("Enter your password: ")
session = requests.Session()
# Define URLs
login_page_url = 'https://accounts.google.com/ServiceLogin?passive=true&service=finance'
authenticate_url = 'https://accounts.google.com/ServiceLoginAuth?service=finance'
gf_home_page_url = 'http://www.google.com/finance/portfolio'
login_page_contents = session.get(login_page_url).text
# Find GALX value
galx_match_obj = re.search(r'name="GALX"\s*value="([^"]+)"', login_page_contents, re.IGNORECASE)
galx_value = galx_match_obj.group(1) if galx_match_obj.group(1) is not None else ''
# Find DSH value
dsh_match_obj = re.search(r'id="dsh"\s*value="([^"]+)"', login_page_contents, re.IGNORECASE)
dsh_value = dsh_match_obj.group(1) if dsh_match_obj.group(1) is not None else ''
# Set up login credentials
login_params = {
    'Email': email,
    'Passwd': password,
    'continue': 'http://www.google.com/finance/portfolio',
    'followup': 'http://www.google.com/finance/portfolio',
    'service': 'finance',
    'GALX': galx_value,
    'pstMsg': 0,
    'dnConn': '',
    'checkConnection': '',
    'timeStmp': '',
    'secTok': '',
    'bgresponse': 'js_disabled',
    'PersistentCookie': 'yes'
}
print galx_value
print dsh_value
# Login
r = session.post(authenticate_url, params=login_params) # <- Error thrown here
print r.text
exit
Traceback:
Traceback (most recent call last):
File "crawl.py", line 48, in <module>
r = session.post(authenticate_url, params=login_params)
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/sessions.py", line 358, in post
return self.request('POST', url, data=data, **kwargs)
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/sessions.py", line 312, in request
resp = self.send(prep, **send_kwargs)
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/sessions.py", line 426, in send
history = [resp for resp in gen] if allow_redirects else []
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/sessions.py", line 163, in resolve_redirects
resp.cookies.update(cookiejar)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_abcoll.py", line 494, in update
self[key] = other[key]
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/cookies.py", line 246, in __getitem__
return self._find_no_duplicates(name)
File "/Users/nathan/Development/Scripts/google-finance-crawler/requests/cookies.py", line 285, in _find_no_duplicates
raise CookieConflictError('There are multiple cookies with name, %r' % (name))
requests.cookies.CookieConflictError: There are multiple cookies with name, 'APISID'
It's a bug in requests; see issue 1189.
The current proposed fix is to simply delete line 163 of requests/sessions.py:
resp.cookies.update(cookiejar)
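Until you are on a release with that fix, one possible workaround (a sketch only; it assumes the duplicate 'APISID' cookies were already collected by the earlier GET of the login page, which may not hold if they are only set during the redirect chain itself) is to deduplicate the session jar before the POST:

def drop_duplicate_cookies(jar, name):
    # keep the first cookie with this name and clear any later duplicates
    seen = False
    for cookie in list(jar):
        if cookie.name == name:
            if seen:
                jar.clear(domain=cookie.domain, path=cookie.path, name=cookie.name)
            seen = True

drop_duplicate_cookies(session.cookies, 'APISID')
r = session.post(authenticate_url, params=login_params)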
I need a Python script that gets the Google AdSense earnings, and I found adsense_scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape Google AdSense earnings data. When I use it, I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there a better way to get the data via Python? Thanks.
There are several errors with the package; you mentioned only the first one.
1) The twill package does not handle Google's redirects correctly. Adding
newurl = newurl.strip("'")
to twill/other_packages/_mechanize_dist/_http.py:108 before
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that.
2) You have to have the correct language set in AdSense - English.
3) There are several problems in the original adsense_scraper:
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master

Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]
"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::
    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/
Try this::
    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number
    """
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]
    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
        celltext = filter(lambda x: x != "", celltext)
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))
    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.
    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.
    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data


if __name__ == '__main__':
    data = main()