Could anyone tell me what I am doing wrong? I keep getting an error with this code.
I'm trying to download all of the SWF files from primaryschoolgames, just as an experiment, but I can't seem to get it to work:
#!/usr/bin/env python
# encoding: utf-8
import sys, getopt
import os, urllib, urllib2, re, string, math
# Usage text raised by main() when -h/--help is given.
help_message = '''
'''
# Usage text raised by main() when no positional argument is given.
no_param = '''
'''
# Switched on by the -v option in main().
verbose = False
# Switched on by the -f option in main(); go() then only reports each file
# instead of downloading it.
fakeMode = False
# Current working directory with a trailing slash; go() builds the output
# folder underneath it.
curPath = os.getcwd() + "/"
# The four globals below are assigned by go() for each requested name.
urlRegex = ''
FileRegex = ''
outputPath = ''
currentFile = ''
def removeDuplicates(seq):
    """Return the distinct items of *seq*; the original order is not kept.

    The items are collected as dictionary keys, so they must be hashable.
    """
    return dict.fromkeys(seq, 1).keys()
def go(filename):
    """Scrape every listing page for *filename* and download the files found.

    Assigns the module-level ``urlRegex``, ``FileRegex``, ``outputPath`` and
    ``currentFile`` globals; scrapePage() reads the two patterns.

    filename -- site path to scan, with or without a leading slash.
    """
    global urlRegex, FileRegex, outputPath, currentFile
    print("Having a look at " + string.capwords(filename))

    base = 'http://cdn.primarygames.com'
    # Without a separating slash the name is glued onto the host name
    # ("...primarygames.comfoo"), and the lookup fails with
    # "getaddrinfo failed".
    path = filename if filename.startswith('/') else '/' + filename
    url = base + path
    urlRegex = '/'+filename+'/.+/download'
    FileRegex = '/'+filename+'/(.*?)/download'
    outputPath = curPath+"Swfs"+"/"
    if not os.path.exists(outputPath):
        os.makedirs(outputPath)

    filelist = []
    # scrapePage() returns '' as the next address once the last page is done.
    while url:
        newlist, url = scrapePage(url, filename)
        filelist.extend(newlist)
    print('Found %s Files.' % len(filelist))

    for swf in filelist:
        name = swf['name']
        currentFile = name
        if fakeMode:
            print('Not downloading %s.' % name)
            continue
        target = outputPath + name
        # scrapePage() builds names of the form "<filename>/<title>.swf", so
        # the sub-folder has to exist before the file can be written.
        target_dir = os.path.dirname(target)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        urllib.urlretrieve(base + swf['url'], target)
    print("All done with %s!" % filename)
def scrapePage(url, filename):
    """Fetch *url* and collect the download links it contains.

    Uses the module-level ``urlRegex`` and ``FileRegex`` patterns that go()
    assigns before calling this function.

    url      -- absolute address of the listing page to fetch.
    filename -- name being scanned; becomes the folder part of each file name.

    Returns ``(swfs, nextUrl)``: a list of ``{'name': ..., 'url': ...}``
    dictionaries and the absolute address of the next page, or ``''`` when
    there is none.
    """
    from contextlib import closing

    print('Looking through '+url)
    # Close the connection as soon as the page has been read.
    with closing(urllib2.urlopen(url)) as page:
        html = page.read()

    name_pattern = re.compile(FileRegex)  # compiled once, not once per link
    swfs = []
    for swfurl in removeDuplicates(re.findall(urlRegex, html)):
        swfname = name_pattern.search(swfurl).group(1).replace('-', ' ')
        name = string.capwords(filename + "/" + swfname + ".swf")
        # This used to append to ``swf``, a name that does not exist in this
        # function (NameError); the result list is ``swfs``.
        swfs.append({'name': name, 'url': swfurl})

    # ``nextRegex`` is not defined in the visible part of this file, so it is
    # looked up defensively: without it the page is treated as the last one
    # instead of raising NameError.
    next_pattern = globals().get('nextRegex')
    result = re.search(next_pattern, html) if next_pattern else None
    if result:
        nextUrl = 'http://cdn.primarygames.com' + result.group(1)
    else:
        nextUrl = ''
    return swfs, nextUrl
def main(argv=None):
    """Parse the command line and run go() for every positional argument.

    argv -- argument list including the program name; defaults to sys.argv.

    Options: -h/--help, -v/--verbose, -f/--fake, -o/--output PATH.
    Returns 2 after reporting a usage problem on stderr, otherwise None.

    NOTE(review): relies on a ``Usage`` exception class exposing ``.msg``
    that is not defined in the visible part of this file.
    """
    global verbose, fakeMode
    if argv is None:
        argv = sys.argv
    try:
        try:
            # "fake" and "verbose" are registered so that the long spellings
            # tested below are actually accepted by getopt.
            opts, args = getopt.getopt(
                argv[1:], "ho:vf", ["help", "output=", "fake", "verbose"])
        except getopt.error as msg:
            raise Usage(msg)
        # option processing
        for option, value in opts:
            if option in ("-v", "--verbose"):
                verbose = True
            if option in ("-f", "--fake"):
                fakeMode = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-o", "--output"):
                # NOTE: parsed but not used; go() picks its own output folder.
                output = value
        if not args:
            raise Usage(no_param)
        swfs = args
    except Usage as err:
        # basename() also strips Windows-style directory prefixes, which
        # splitting on "/" did not.
        program = os.path.basename(sys.argv[0])
        sys.stderr.write(program + ": " + str(err.msg) + "\n")
        if err.msg != help_message:
            sys.stderr.write("\t for help use --help\n")
        return 2
    for swf in swfs:
        go(swf)
# Run the downloader when executed as a script; the value returned by main()
# becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
This is the error I keep getting:
Having a look at *
Looking through http://cdn.primarygames.com/*
Traceback (most recent call last):
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 129, in <module>
sys.exit(main())
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 125, in main
go(swf)
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 48, in go
newlist, url = scrapePage(url, filename)
File "C:\Users\Terrii\Desktop\VB Extra's\PrimarySchoolGames Swf Downloader.py"
, line 67, in scrapePage
html = urllib2.urlopen(url).read()
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 418, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1177, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 11004] getaddrinfo failed>
A failed getaddrinfo normally suggests that something is wrong with the URL you're providing. Since I am able to resolve the address myself, are you sure you aren't behind a proxy server? A proxy could cause the DNS lookup to fail, which results in exactly this message.
How Python determines which proxy to use on Windows:
In a Windows environment, if no proxy environment variables are set,
proxy settings are obtained from the registry’s Internet Settings
section.
For more help, I concur with #MikeHunter. I tried to fix your code, but since I had to implement your exception class myself to get the code running at all, I think you should re-indent your code and provide more information. Sorry.
Related
I'm trying to write a small python 3 utility script that checks to see if a file exists on my server.
So I have the code below that has a big array of string values that I pass to a simple function that returns the url and the response code.
However, when I run it I get all these errors I don't even know where to start:
$ python ReturnPath.py
Traceback (most recent call last):
File "ReturnPath.py", line 86, in <module>
checkResponse(u)
File "ReturnPath.py", line 5, in checkResponse
code = urllib.request.urlopen(url).getcode()
File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python37\lib\urllib\request.py", line 510, in open
req = Request(fullurl, data)
File "C:\Program Files\Python37\lib\urllib\request.py", line 328, in __init__
self.full_url = url
File "C:\Program Files\Python37\lib\urllib\request.py", line 354, in full_url
self._parse()
File "C:\Program Files\Python37\lib\urllib\request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '"https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg"'
Here is my code:
import urllib.request
def checkResponse(url):
    """Open *url* and print it together with the status code of the reply.

    url -- address to request; surrounding whitespace and stray quote
           characters are removed before it is used.

    Returns None. Errors raised by urlopen propagate to the caller.
    """
    # A URL that still carries its quotes ('"https://..."') makes urlopen
    # fail with "unknown url type".
    url = url.strip().strip('"\'')
    # The with-block closes the connection once the status has been read.
    with urllib.request.urlopen(url) as response:
        code = response.getcode()
    # getcode() does not return a string, so convert it before joining it to
    # the URL text (str + int raises TypeError).
    print(url + " = " + str(code))
# Addresses whose availability is checked.
arrCases = [
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
]

# Report the status of each address in turn.
for u in arrCases:
    checkResponse(u)
What am I doing wrong?
You have to catch errors from broken URLs. I also increased speed through multiprocessing.Pool.
import urllib.request
from urllib.error import HTTPError, URLError
import multiprocessing
def checkResponse(url):
    """Open *url* (1 second timeout) and print its status code or its error.

    url -- address to request.

    Returns None. Failures are printed next to the URL instead of being
    raised, so one bad address does not stop a batch run.
    """
    try:
        # The with-block closes the connection once the status has been read.
        with urllib.request.urlopen(url, timeout=1) as response:
            code = response.getcode()
    except (HTTPError, URLError, OSError, ValueError) as error:
        # ValueError covers malformed addresses ("unknown url type");
        # OSError covers socket-level failures such as timeouts.
        print(url, " = ", error)
    else:
        print(url, " = ", code)
# Addresses whose availability is checked.
arrCases = [
    "https://i.stack.imgur.com/DsNOB.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
]

# Worker processes started with the "spawn" method re-import this module, so
# the pool may only be created when the file runs as the main script;
# otherwise every worker would try to start a pool of its own.
if __name__ == "__main__":
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(checkResponse, arrCases)
I am implementing external authentication in ejabberd using a python script that call a HTTPS endpoint. But it gives me this error:
[17824] [ERROR] Error authenticating user
Traceback (most recent call last):
File "/opt/ejabberd-18.09/conf/external_auth.py", line 25, in auth_login
response = json.load(urllib2.urlopen(request))
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 452, in _open
'unknown_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1266, in unknown_open
raise URLError('unknown url type: %s' % type)
URLError: <urlopen error unknown url type: https>
The extauth script below:
#!/usr/bin/env python
import sys
import struct
import logging
import os
import urllib2
import urllib
import json
# File that receives the records written through the logging module.
LOGFILE = '/var/log/ejabberd/extauth.log'
# File that replaces sys.stderr when the script runs as a program.
ERRFILE = '/var/log/ejabberd/extauth.err'
# HTTPS endpoint that auth_login() sends the token to.
AUTH_URL= "https://www.mocky.io/v2/5185415ba171ea3a00704eed"
def auth_login(token):
    """Ask the endpoint at AUTH_URL whether *token* is valid.

    token -- credential taken from the ``auth`` request.

    Returns the ``valid`` field of the JSON reply. Returns False when the
    request fails, the reply cannot be decoded, or the field is missing.
    """
    headers = {'Content-Type': 'application/json'}
    payload = {'token': token}
    logging.info('Url formatted: %s', AUTH_URL)
    try:
        request = urllib2.Request(AUTH_URL, json.dumps(payload), headers)
        reply = urllib2.urlopen(request)
        try:
            response = json.load(reply)
        finally:
            reply.close()
        logging.info('Response: %s', response)
        # Read inside the try: a reply without 'valid' used to raise KeyError
        # out of this function, which ended the whole service loop.
        return response['valid']
    except urllib2.HTTPError as http_err:
        # Previously swallowed without a trace; keep the False result but
        # leave a record of the rejection.
        logging.info('Auth endpoint returned an HTTP error: %s', http_err)
        return False
    except Exception:
        logging.exception("Error authenticating user")
        return False
def read():
    """Read one length-prefixed request from stdin and answer it via write().

    A request is a 2-byte big-endian length followed by that many bytes of
    ``command:arg1:arg2[:arg3]`` text. Only ``auth`` is verified (through
    auth_login()); every other command is answered with False.

    Raises struct.error when fewer than two length bytes can be read.

    NOTE(review): ``write`` is not defined in the visible part of this file;
    presumably it sends the boolean answer back to ejabberd - confirm.
    """
    (pkt_size,) = struct.unpack('>H', sys.stdin.read(2))
    # Split at most three times so that a credential containing ':' stays in
    # one piece instead of being cut at its first colon.
    pkt = sys.stdin.read(pkt_size).split(':', 3)
    cmd = pkt[0]
    args_num = len(pkt) - 1
    if cmd == 'auth' and args_num >= 3:
        logging.debug('User trying to auth')
        # NOTE(review): both messages below write pkt[3], the credential,
        # into the log file - consider dropping it.
        if auth_login(pkt[3]):
            logging.debug('Logged User :'+pkt[1]+":"+pkt[2]+":"+pkt[3])
            write(True)
        else:
            logging.info('Error on authenticating user:'+pkt[1]+":"+pkt[2]+":"+pkt[3])
            write(False)
    elif cmd == 'isuser' and args_num == 2:
        logging.debug('isuser received')
        logging.debug(':'.join(pkt[:2]))
        write(False)
    elif cmd == 'setpass' and args_num >= 3:
        logging.debug('setpass received')
        logging.debug(':'.join(pkt[:3]))
        write(False)
    elif cmd == 'tryregister' and args_num >= 3:
        logging.debug('tryregister received')
        logging.debug(':'.join(pkt[:3]))
        write(False)
    elif cmd == 'removeuser' and args_num == 2:
        logging.debug('removeuser received')
        logging.debug(':'.join(pkt[:2]))
        write(False)
    elif cmd == 'removeuser3' and args_num >= 3:
        logging.debug('removeuser3 received')
        logging.debug(':'.join(pkt[:3]))
        write(False)
    else:
        write(False)
    # The trailing read() call was removed: loop() already calls this
    # function repeatedly, and calling it again from here nests one call per
    # request until the recursion limit is reached.
def loop():
    """Serve requests with read() until interrupted or until one fails.

    A keyboard interrupt is logged as a user-requested stop; any other
    exception is logged together with its traceback. Both end the loop.
    """
    while True:
        try:
            read()
        except KeyboardInterrupt:
            logging.info('Terminating by user input')
            return
        except Exception:
            logging.exception('Input error: ')
            return
# Script entry point: redirect stderr to ERRFILE, send log records to LOGFILE
# and then serve requests until loop() returns.
if __name__ == "__main__":
    PID = str(os.getpid())
    # Every log record carries the process id of this helper.
    FMT = '[%(asctime)s] ['+PID+'] [%(levelname)s] %(message)s'
    sys.stderr = open(ERRFILE, 'a+')
    logging.basicConfig(level=logging.DEBUG, format=FMT, filename=LOGFILE)
    try:
        loop()
    except struct.error:
        # loop() already catches Exception around read(), so this is only a
        # last-resort guard.
        pass
In the host config section on ejabberd.yml, the extauth_program property is defined as:
extauth_program: "python /opt/ejabberd-18.09/conf/external_auth.py"
Actually I am upgrading Ejabberd version from 16.01 to 18.09. In Ejabberd 16.01 this code runs correctly, and when I call this script on a terminal it runs ok too. Can anybody help me on this?
I have the following code, which returns the public IPs:
def gather_public_ip():
    """Collect public IP addresses from the configured AWS regions.

    Reads the access and secret keys from the ``aws`` section of ``config``
    and calls ``describe_addresses`` in each region. Only entries that carry
    a ``PrivateIpAddress`` key are kept.

    Returns a list with one public IP string per kept entry (possibly empty).
    """
    ACCESS_KEY = config.get('aws','access_key')
    SECRET_KEY = config.get('aws','secret_key')
    regions = ['us-west-2','eu-central-1','ap-southeast-1']
    # regions = config.get('aws','region').split(',')
    all_EIP = []
    for region in regions:
        client = boto3.client(
            'ec2',
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=SECRET_KEY,
            region_name=region,
        )
        for eip_dict in client.describe_addresses()['Addresses']:
            if 'PrivateIpAddress' in eip_dict:
                print(eip_dict['PublicIp'])
                all_EIP.append(eip_dict['PublicIp'])
    print(all_EIP)
    # Return the list itself. str(all_EIP) made the caller's for-loop walk
    # over single characters ("[", "'", "5", ...); the lone quote character
    # is what the scanner's shlex parsing rejects with "No closing quotation".
    return all_EIP
This is called and returned as :
# Scan every gathered address in turn; r holds the result of the latest scan.
net_range = gather_public_ip()
for ip in net_range:
    r = s.run(ip)
run looks like :
def run(self, targets="" ,options="-Pn"):
    """Run an nmap scan and return the parsed report.

    targets -- target specification handed to NmapProcess.
    options -- nmap command-line options (default "-Pn").

    Stores the result in ``self.report`` and returns it; both are None when
    the scan output could not be parsed.
    """
    # start a new nmap scan with the given options
    syslog.syslog("Scan started")
    parsed = None
    nmproc = NmapProcess(targets, options)
    rc = nmproc.run()
    if rc != 0:
        syslog.syslog("nmap scan failed: {0}".format(nmproc.stderr))
    try:
        parsed = NmapParser.parse(nmproc.stdout)
    except NmapParserException as e:
        syslog.syslog("Exception raised while parsing scan: {0}".format(e.msg))
    syslog.syslog("Scan complete")
    # parsed is still None when parsing failed; reading .elapsed from it used
    # to raise AttributeError right after the parse error had been logged.
    if parsed is not None:
        syslog.syslog("Scan duration: "+ str(parsed.elapsed))
    self.report = parsed
    return parsed
after printing the list , this throws me :
Traceback (most recent call last):
File "portwatch.py", line 300, in <module>
r = s.run(ip)
File "portwatch.py", line 239, in run
rc = nmproc.run()
File "/usr/local/lib/python2.7/dist-packages/libnmap/process.py", line 257, in run
else shlex.split(self.__nmap_command_line)
File "/usr/lib/python2.7/shlex.py", line 279, in split
return list(lex)
File "/usr/lib/python2.7/shlex.py", line 269, in next
token = self.get_token()
File "/usr/lib/python2.7/shlex.py", line 96, in get_token
raw = self.read_token()
File "/usr/lib/python2.7/shlex.py", line 172, in read_token
raise ValueError, "No closing quotation"
ValueError: No closing quotation
Make sure your ip is not "" (an empty string), or shlex will fail; cf. "Which exception to raise if a given string does not match some format?"
I am getting this error at qr_image = urllib2.urlopen(url).read(). I have gone through all the questions similar to this one, here and here. The answers to these questions suggest that there is a proxy connection problem. However, I am not using any proxies. I may be making a simple mistake, but I am totally stuck here.
I have cleared the proxy setting that I was using earlier in Internet Options > Connections > LAN settings. I have also cleared the http_proxy and https_proxy environment variables.
EDIT: I was previously using a proxy connection but now switched to dialup modem.
import urllib2
import urllib
import gspread
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
# Width of the ID card in pixels; used to centre the ID text horizontally.
w = 420
gc = gspread.login('panxxxxxx#gmail.com', 'xxxxxxxx')
wks = gc.open("Spreadsheet").sheet1  # "Spreadsheet" is the name of the Google Spreadsheet made in the account
val = wks.acell('A1').value
print(val)
list_of_lists = wks.get_all_values()
print(list_of_lists)

# The fonts are the same for every card, so they are loaded once.
id_usr_font = ImageFont.truetype("resources/OpenSans-Regular.ttf", 25)
name_usr_font = ImageFont.truetype("resources/OpenSans-Regular.ttf", 30)

# Slicing (instead of indexing range(5)) avoids an IndexError when the sheet
# holds fewer than five rows. Each row is read as: id, name, e-mail, number.
for row in list_of_lists[:5]:
    idi = row[0]
    name = row[1]
    name_url = urllib.quote_plus(name)  # url encoded name for api call
    email = urllib.quote_plus(row[2])
    number = urllib.quote_plus(row[3])
    # vCard payload for the QR code service; the sample address that used to
    # be assigned before the loop was never requested and has been dropped.
    url = ("https://api.qrserver.com/v1/create-qr-code/?data=BEGIN%3AVCARD%0AVERSION%3A2.1%0AFN%3A" + name_url
           + "%0AN%3A%3B" + name_url
           + "%0ATEL%3BHOME%3BVOICE%3A" + number
           + "%0AEMAIL%3BHOME%3BINTERNET%3A" + email
           + "%0AEND%3AVCARD%0A&size=50x50")
    qr_image = urllib2.urlopen(url).read()

    name_qr = name + ".png"
    # The with-block closes the file even if the write fails.
    with open(name_qr, 'wb') as outfile:
        outfile.write(qr_image)

    qr = Image.open(name_qr)
    qr.thumbnail((120, 120))
    thid = Image.open("tinkerhub_card.png")  # Blank ID Card over which the QR code has to be placed
    thid_new = Image.new('RGB', (420, 680))
    thid_new.paste(thid, (0, 0))
    thid_new.paste(qr, (150, 220))

    # One drawing context serves both text lines.
    draw = ImageDraw.Draw(thid_new)
    w1, h1 = id_usr_font.getsize(idi)
    draw.text(((w - w1) / 2, 150), idi, (0, 0, 0), font=id_usr_font)
    draw.text((90, 365), name, (0, 0, 0), font=name_usr_font)
    thid_new.show()
Traceback
Traceback (most recent call last):
File "C:/Users/Harshil/Downloads/id-card-tinkerhub.py", line 41, in <module>
qr_image = urllib2.urlopen(url).read()
File "C:\Anaconda\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Anaconda\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "C:\Anaconda\lib\urllib2.py", line 449, in _open
'_open', req)
File "C:\Anaconda\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Anaconda\lib\urllib2.py", line 1240, in https_open
context=self._context)
File "C:\Anaconda\lib\urllib2.py", line 1197, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
Process finished with exit code 1
I need a python script that gets the google adsense earnings and I found adsense scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape google adsense earnings data. When I use it I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there even a better way to get the data via python? Thanks
There are several errors with the package; you mentioned only the first one.
1) twill package does not handle google's redirects correctly, adding
newurl = newurl.strip( "'" )
to twill/other_packages/_mechanize_dist/_http.py:108 before
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that
2) you have to have the correct language set in adsense - English
3) there are several problems in the original adsense_scraper
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill
Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master
Usage::
from adsense_scraper import get_adsense, get_time_period
b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
rows = get_time_period(b, 'yesterday')
# The summary data is always the first row with channel == ''
print 'I earned this much yesterday: $%(earnings)s' % rows[0]
"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree
try:
from html5lib import HTMLParser
import twill.commands
except ImportError:
print >>sys.stderr, """\
adsense_scraper has dependencies::
Twill 0.9 http://twill.idyll.org/
html5lib 0.11 http://code.google.com/p/html5lib/
Try this::
$ easy_install twill html5lib
"""
raise SystemExit()
__version__ = '0.5'

# Login page that get_adsense() opens. The query string must contain
# "&ltmpl=adsense"; an HTML-unescaping step had turned "&lt" into "<" and
# left the broken text "true<mpl=adsense".
SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"

# Report page; get_time_period() appends one of TIME_PERIODS to it.
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

# Time periods that main() fetches, in this order.
TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]
def parse_decimal(s):
    """Check that *s* is a human-readable number and return its text.

    The euro sign is stripped from both ends. For the check only, commas
    become dots, a trailing ``%`` and a leading ``$`` are removed. When the
    result parses as an int or a float, the euro-stripped text is returned
    unchanged; otherwise it is passed to decimal.Decimal, which raises
    decimal.InvalidOperation for non-numeric text.
    """
    without_euro = s.strip(u'\u20ac')
    candidate = without_euro.replace(',', '.').rstrip('%').lstrip('$')
    for numeric_type in (int, float):
        try:
            numeric_type(candidate)
        except ValueError:
            continue
        return without_euro
    return decimal.Decimal(candidate)
def parse_summary_table(doc):
    """Extract the rows of the table whose id is ``summarytable``.

    Each row with exactly one non-empty cell per field yields a dict keyed by
    ``impressions``, ``clicks``, ``ctr``, ``ecpm`` and ``earnings``; the
    values are whatever parse_decimal() returns for the cell text. Rows with
    a different number of non-empty cells, or with a cell for which
    parse_decimal() raises decimal.InvalidOperation, are skipped.

    Raises ValueError when *doc* contains no such table.
    """
    table = None
    for candidate in doc.findall('.//table'):
        if candidate.attrib.get('id') == 'summarytable':
            table = candidate
            break
    if table is None:
        raise ValueError("summary table not found")

    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    res = []
    for row in table.findall('.//tr'):
        celltext = []
        for cell in row.findall('td'):
            # adsense inserts an empty span if a row has a period in it, so
            # the text after the link's first child is appended to the cell.
            link = cell.find('a')
            tail = ''
            if link is not None and len(link):
                tail = link[0].tail or ''
            text = (cell.text or cell.findtext('a') or '').strip()
            celltext.append('%s%s' % (text, tail.strip()))
        celltext = [text for text in celltext if text != ""]
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = [parse_decimal(text) for text in celltext]
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))
    return res
def get_adsense(login, password):
    """Log in to AdSense and return the twill browser used for it.

    Opens SERVICE_LOGIN_BOX_URL, fills *login* and *password* into the first
    form that accepts both an ``Email`` and a ``Passwd`` field, and submits
    that form. The browser is returned as it is after the submit; it may not
    be on the page the caller wants data from.

    Raises ValueError when no form on the page accepts both fields.
    """
    browser = twill.commands.get_browser()
    browser.go(SERVICE_LOGIN_BOX_URL)

    login_form = None
    for candidate in browser.get_all_forms():
        try:
            candidate['Email'] = login
            candidate['Passwd'] = password
        except ValueError:
            # This form lacks one of the fields; try the next one.
            continue
        login_form = candidate
        break
    if login_form is None:
        raise ValueError("Could not find login form on page")

    # Select exactly the form object that was filled in above.
    browser._browser.select_form(predicate=lambda f: f is login_form)
    browser.submit()
    return browser
def get_time_period(b, period):
    """Fetch the overview page for *period* and return its parsed summary.

    *b* should be the browser returned by get_adsense(). *period* is appended
    to OVERVIEW_URL and must be a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``, ``'lastmonth'``,
    ``'sincelastpayment'``.
    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough to use
    # directly, so the page is parsed by html5lib first and its XML dump is
    # then read back into cElementTree.
    parser = HTMLParser()
    parsed_page = parser.parse(b.get_html())
    doc = cElementTree.fromstring(parsed_page.toxml())
    return parse_summary_table(doc)
def main():
    """Log in with the credentials from the command line and print all reports.

    Expects exactly two arguments, LOGIN and PASSWORD; otherwise exits with a
    usage message. Returns a dict that maps each entry of TIME_PERIODS to the
    rows returned by get_time_period().
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    login, password = cli_args

    # Twill's own output goes into a throw-away buffer while scraping.
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = dict((period, get_time_period(b, period)) for period in TIME_PERIODS)
    pprint.pprint(data)
    twill.set_output(None)
    return data
# Run the scraper when executed as a script; the result of main() is kept in
# the module-level name data.
if __name__ == '__main__':
    data = main()