proxy handler in python - python

when i disable proxy server in web browser setting and commenting the proxy handler coding it below code works fine.
import urllib2
import urllib2_file
import urllib
import random
import mimetypes
import string
from os import listdir
import time
from google.refine import refine
from google.refine import facet
proxy = urllib2.ProxyHandler({'http': '10.200.1.26'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
def encode_multipart(fields, files, boundary=None):
def escape_quote(s):
return s.replace('"', '\\"')
if boundary is None:
boundary = ''.join(random.choice(_BOUNDARY_CHARS) for i in range(30))
lines = []
for name, value in fields.items():
lines.extend((
'--{0}'.format(boundary),
'Content-Disposition: form-data; name="upload"', #.format(escape_quote(name)),
'',
str(value),
))
for name, value in files.items():
filename = value['filename']
if 'mimetype' in value:
mimetype = value['mimetype']
else:
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
lines.extend((
'--{0}'.format(boundary),
'Content-Disposition: form-data; name="upload"; filename="{0}"'.format(escape_quote(filename)),
'Content-Type: {0}'.format(mimetype),
'',
value['content'],
))
lines.extend((
'--{0}--'.format(boundary),
'',
))
body = '\r\n'.join(lines)
headers = {
'Content-Type': 'multipart/form-data; boundary={0}'.format(boundary),
'Content-Length': str(len(body)),
}
return (body, headers)
_BOUNDARY_CHARS = string.digits + string.ascii_letters
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
a=u.read()
id=""
for i in a:
if(i.isdigit()):
id+=str(i)
# sample output '{ "jobID" : 1393566803991 }'
files = {}
pathtoXML = r"C:\75"
#pathtoXML = r"C:\AM\trial"
for i in listdir(pathtoXML):
files[i] = {'filename': i, 'content': open(pathtoXML + "\\"+ i).read()}
#load raw data using the job id found in
url = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobI="+id+"&subCommand=load-raw-data"
data,headers = encode_multipart({}, files)
#print len(data)
#print headers
req = urllib2.Request(url, data=data, headers=headers)
f = urllib2.urlopen(req)
f.read()
# get job status
u=urllib2.urlopen("http://127.0.0.1:3333/command/core/get-importing-job-status?jobID="+id+"", "test")
u.read()
#from fileSelection update file selection
u=urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&subCommand=update-file-selection&jobID="+id+"", "fileSelection=%5B0%2C1%2C2%2C3%5D")
u.read()
#init parser format text
u=urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=initialize-parser-ui&format=text%2Fxml")
u.read()
#update format and options
updateformatoptionurl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=update-format-and-options"
d=urllib.urlencode({"format":"text/xml","options":{"recordPath":["ArrayOfAfiles","Afiles"],"limit":-1,"includeFileSources":"false","guessCellValueTypes":"false"}})
u=urllib2.urlopen(updateformatoptionurl,d)
u.read()
'{"status":"ok"}'
#get-models
u=urllib2.urlopen("http://127.0.0.1:3333/command/core/get-models?importingJobID="+id)
u.read()
# create project from import job
createfromimporturl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=create-project"
d=urllib.urlencode({"format":"text/xml","options":{"recordPath":["ArrayOfAfiles","Afiles"],"limit":-1,"includeFileSources":"false","projectName":time.ctime()}})
u=urllib2.urlopen(createfromimporturl, d)
r=u.read()
After embedding proxy handler coding its not working when i ran the code its complaining:
Traceback (most recent call last):
File "C:\hari\trial.py", line 87, in <module>
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 391, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 409, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\urllib2_file.py", line 207, in http_open
return self.do_open(httplib.HTTP, req)
File "C:\Python27\urllib2_file.py", line 298, in do_open
return self.parent.error('http', req, fp, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 435, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found

Related

Python Twitter API trying to retrieve tweet but error: AttributeError: 'int' object has no attribute 'encode'

why am I getting an AttributeError: 'int' object has no attribute 'encode'?
I am trying to retrieve a tweet using the Twitter API on Python. Full traceback here:
Traceback (most recent call last):
File "C:/Python27/lol.py", line 34, in <module>
headers = req.to_header()
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 398, in to_header
params_header = ', '.join(header_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 397, in <genexpr>
header_params = ('%s="%s"' % (k, v) for k, v in stringy_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 396, in <genexpr>
stringy_params = ((k, escape(v)) for k, v in oauth_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 163, in escape
s = s.encode('utf-8')
AttributeError: 'int' object has no attribute 'encode'
Below is the code I'm using.
import oauth2
import time
import urllib2
import json
url1="https://api.twitter.com/1.1/search/tweets.json"
params = {
"oauth_version": "1.9.0",
"oauth_nonce": oauth2.generate_nonce(),
"oauth_timestamp": int(time.time())
}
consumer = oauth2.Consumer(key="*********", secret="*********")
token = oauth2.Token(key="*********", secret="*********")
params["oauth_consumer_key"] = consumer.key
params["oauth_token"] = token.key
for i in range(1):
url = url1
req = oauth2.Request(method="GET", url=url, parameters=params)
signature_method = oauth2.SignatureMethod_HMAC_SHA1()
req.sign_request(signature_method, consumer, token)
headers = req.to_url()
print headers
print url
for i in range(1):
url = url1
params["q"] = "pictorial"
params["count"] = 2
req = oauth2.Request(method="GET", url=url, parameters=params)
signature_method = oauth2.SignatureMethod_HMAC_SHA1()
req.sign_request(signature_method, consumer, token)
headers = req.to_header()
url = req.to_url()
response = urllib2.Request(url)
data = json.load(urllib2.urlopen(response))
if data["statuses"] == []:
print "end of data"
break
else:
print data
And if I change int(time.time()) into str(time.time())
I get the following error:
Traceback (most recent call last):
File "C:/Python27/lol.py", line 37, in <module>
data = json.load(urllib2.urlopen(response))
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 437, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 475, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 400: Bad Request
"oauth_timestamp": int(time.time())
here you use an int, but that field must be a string.

Problems with uploading content to website

I am just a newly self-learned programmer. I was hoping to upload some numbers to my website by python But somewhat I failed. Could you help me figure out what is wrong?
Here is my original python code.
#!/usr/bin/python
import time
import urllib.request
import random
import datetime
from urllib.request import Request,urlopen
basic_web = 'http://ihome.ust.hk/~xxxxx/cgi-bin/datafile.php?'
message=""
while(True):
local_time= time.time()
web_x = basic_web
file1 = open("datalist1.txt", "r")
queue1 = file1.read()
file1.close()
web_x += "&queue1=" + queue1
file2 = open("datalist2.txt", "r")
queue2 = file2.read()
file2.close()
web_x += "&queue2=" + queue2
web_x += "&local_time=" + str (local_time)
print (web_x)
#req = Request (web_x)
#html = urlopen(req).read()
response = urllib.request.urlopen(web_x, timeout = 1)
html = response.read()
print(html)
time.sleep(0.1)
print ("hehe")
And here is the output error that I got:
Traceback (most recent call last):
File "C:\web bus stop\local\datauploader.py", line 25, in <module>
response = urllib.request.urlopen(web_x)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: connected! queue1queue2 finish sir!
I would really appreciate it if you guys could help me figure out what is the bug.
Never mind.
I changed a computer to run it and it worked now.

Python Trackback Issue

When I execute it says Trackback most recent call last and line 22 also line 410 in lib and so on with problems within python. Error on home = opener.open() Traceback (most recent call last): File "", line 1, in File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 540, in runfile execfile(filename, namespace) File "C:/u/u/d/WinPython-32bit-2.7.6.2/python-2.7.6/Scripts/ox.py", line 20, in home = opener.open('', data) File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\urllib2.py", line 410, in open response = meth(req, response) File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\urllib2.py", line 523, in http_response 'http', request, response, code, msg, hdrs) File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\urllib2.py", line 448, in error return self._call_chain(*args) File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\urllib2.py", line 382, in _call_chain result = func(*args) File "C:\u\u\d\WinPython-32bit-2.7.6.2\python-2.7.6\lib\urllib2.py", line 531, in http_error_default raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) urllib2.HTTPError: HTTP Error 500: Internal Server Error
import urllib
import urllib2
import cookielib
import re
import os
from random import choice
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [
("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0"),
("Cookie", ".SECURITY=232")
]
f = 58454700
print "",
url = 'http://www.myhost.com/increment'
x = 0
while x < 1000000000:
f = f + 1
z = str(f)
url = 'http://www.myhost.com/Send/SentIncrement'
data = urllib.urlencode({"Increment": z, "VerificationToken": "Verified"})
home = opener.open('http://www.myhost.com/Send/SentIncrement', data)
os.system("cls")
print f
print data
x = x + 1
raw_input("")

Error for M2Crypto https get thru a web proxy with cac card authentication

I'm trying to make a https get request from behind a squid proxy with cac card authentication. Loading the opensc engine and grabbing the cert and private key seem to work fine. Below is the traceback and the code.
Any help is greatly appreciated.
Traceback
Traceback (most recent call last):
File "testM2Crypto.py", line 64, in <module>
res = m2urllib2.urlopen(req)
File "c:\Python27\lib\urllib2.py", line 135, in urlopen
return _opener.open(url, data, timeout)
File "c:\Python27\lib\urllib2.py", line 415, in open
response = self._open(req, data)
File "c:\Python27\lib\urllib2.py", line 433, in _open
'_open', req)
File "c:\Python27\lib\urllib2.py", line 387, in _call_chain
result = func(*args)
File "c:\Python27\lib\site-packages\M2Crypto\m2urllib2.py", line 94, in https_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "c:\Python27\lib\httplib.py", line 963, in request
self._send_request(method, url, body, headers)
File "c:\Python27\lib\httplib.py", line 994, in _send_request
self.putrequest(method, url, **skips)
File "c:\Python27\lib\site-packages\M2Crypto\httpslib.py", line 140, in putrequest
raise ValueError, "unknown URL type: %s" % url
ValueError: unknown URL type: /index.asp?site=SomeSite
Code
from M2Crypto import httpslib, m2urllib2, m2, SSL, Engine
import urllib2
url = 'https://some.domain.com/index.asp?site=SomeSite'
e = Engine.load_dynamic_engine("pkcs11", "c:/windows/syswow64/engine_pkcs11.dll")
pk = Engine.Engine("pkcs11")
pk.ctrl_cmd_string("MODULE_PATH", "c:/windows/syswow64/opensc-pkcs11.dll")
m2.engine_init(m2.engine_by_id("pkcs11"))
cert = e.load_certificate("slot_01-id_01")
privatekey = e.load_private_key("slot_01-id_01")
ctx = SSL.Context("sslv23")
ctx.set_cipher_list("HIGH:!aNULL:!eNULL:#STRENGTH")
ctx.set_session_id_ctx("foobar")
m2.ssl_ctx_use_x509(ctx.ctx, cert.x509)
m2.ssl_ctx_use_pkey_privkey(ctx.ctx, privatekey.pkey)
proxy_support=urllib2.ProxyHandler({'https':'https://proxy:3128'})
opener = m2urllib2.build_opener(ctx, proxy_support)
m2urllib2.install_opener(opener)
req = m2urllib2.Request(url)
res = m2urllib2.urlopen(req)
I finally was able to solve the problem yesterday. I had to make a few modifications to the code. I also had to patch a bug in the M2Crypto library that was preventing https through a proxy(Thanks to Miloslav Trmač from redhat for the patch). The solution is below for anyone else who might be running into a similar problem. Hope this helps.
Code
from M2Crypto import httpslib, m2urllib2, m2, SSL, Engine
import urllib2
userPin = "123456"
rootCertPath = 'd:/path/to/rootCert.pem'
url = 'https://some.domain.com/index.asp?site=SomeSite'
e = Engine.load_dynamic_engine("pkcs11", "c:/windows/syswow64/engine_pkcs11.dll")
pk = Engine.Engine("pkcs11")
pk.ctrl_cmd_string("MODULE_PATH", "c:/windows/syswow64/opensc-pkcs11.dll")
if len(userPin) > 0: pk.ctrl_cmd_string("PIN", userPin)
m2.engine_init(m2.engine_by_id("pkcs11"))
rootcert = X509.load_cert(rootCertPath)
cert = e.load_certificate("slot_01-id_01")
privatekey = e.load_private_key("slot_01-id_01")
ctx = SSL.Context("sslv23")
ctx.set_cipher_list("HIGH:!aNULL:!eNULL:#STRENGTH")
ctx.set_session_id_ctx("foobar")
ctx.load_verify_locations(cafile=rootcert)
m2.ssl_ctx_use_x509(ctx.ctx, cert.x509)
m2.ssl_ctx_use_pkey_privkey(ctx.ctx, privatekey.pkey)
proxy_support=urllib2.ProxyHandler({'https':'https://proxy:3128'})
opener = m2urllib2.build_opener(ctx, proxy_support)
m2urllib2.install_opener(opener)
req = m2urllib2.Request(url)
try:
res = m2urllib2.urlopen(req)
print '\nsuccess'
except urllib2.HTTPError, err:
print '\nerror'
print 'err.code: '+str(err.code)
print 'err.reason: '+str(err.reason)
print 'err.read(): '+str(err.read())
Thanks to Miloslav Trmač from redhat for the patch. I found this patch at the following url, http://arm.koji.fedoraproject.org/koji/buildinfo?buildID=61225 .
M2Crypto Patch
diff -urN M2Crypto/M2Crypto/httpslib.py M2Crypto-0.21.1/M2Crypto/httpslib.py
--- M2Crypto/M2Crypto/httpslib.py 2012-03-15 03:27:22.181524406 +0100
+++ M2Crypto-0.21.1/M2Crypto/httpslib.py 2012-03-15 03:27:40.467485033 +0100
## -182,14 +182,14 ##
else:
HTTPSConnection.putheader(self, header, value)
- def endheaders(self):
+ def endheaders(self, *args, **kwargs):
# We've recieved all of hte headers. Use the supplied username
# and password for authorization, possibly overriding the authstring
# supplied in the headers.
if not self._proxy_auth:
self._proxy_auth = self._encode_auth()
- HTTPSConnection.endheaders(self)
+ HTTPSConnection.endheaders(self, *args, **kwargs)
def connect(self):
HTTPConnection.connect(self)
diff -urN M2Crypto/M2Crypto/m2urllib2.py M2Crypto-0.21.1/M2Crypto/m2urllib2.py
--- M2Crypto/M2Crypto/m2urllib2.py 2011-01-15 20:10:05.000000000 +0100
+++ M2Crypto-0.21.1/M2Crypto/m2urllib2.py 2012-03-15 03:27:40.467485033 +0100
## -64,8 +64,10 ##
target_host = urlparse.urlparse(full_url)[1]
if (target_host != host):
+ request_uri = urlparse.urldefrag(full_url)[0]
h = httpslib.ProxyHTTPSConnection(host = host, ssl_context = self.ctx)
else:
+ request_uri = req.get_selector()
h = httpslib.HTTPSConnection(host = host, ssl_context = self.ctx)
# End our change
h.set_debuglevel(self._debuglevel)
## -80,7 +82,7 ##
# request.
headers["Connection"] = "close"
try:
- h.request(req.get_method(), req.get_selector(), req.data, headers)
+ h.request(req.get_method(), request_uri, req.data, headers)
r = h.getresponse()
except socket.error, err: # XXX what error?
raise URLError(err)

python: get google adsense earnings report

I need a python script that gets the google adsense earnings and I found adsense scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape google adsense earnings data. When I use it I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there even a better way to get the data via python? Thanks
there are several errors with the package, you mentioned only the first one
1) twill package does not handle google's redirects correctly, adding
newurl = newurl.strip( "'" )
to twill/other_packages/_mechanize_dist/_http.py:108 before
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that
2) you have to have the correct language set in adsense - English
3) there are several problems in the orignal adsense_scraper
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill
Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master
Usage::
from adsense_scraper import get_adsense, get_time_period
b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
rows = get_time_period(b, 'yesterday')
# The summary data is always the first row with channel == ''
print 'I earned this much yesterday: $%(earnings)s' % rows[0]
"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree
try:
from html5lib import HTMLParser
import twill.commands
except ImportError:
print >>sys.stderr, """\
adsense_scraper has dependencies::
Twill 0.9 http://twill.idyll.org/
html5lib 0.11 http://code.google.com/p/html5lib/
Try this::
$ easy_install twill html5lib
"""
raise SystemExit()
__version__ = '0.5'
SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="
TIME_PERIODS = [
'today',
'yesterday',
'thismonth',
'lastmonth',
'sincelastpayment',
]
def parse_decimal(s):
"""Return an int or decimal.Decimal given a human-readable number
"""
light_stripped = s.strip(u'\u20ac')
stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
try:
int(stripped)
return light_stripped
except ValueError:
pass
try:
float(stripped)
return light_stripped
except ValueError:
return decimal.Decimal(stripped)
def parse_summary_table(doc):
"""
Parse the etree doc for summarytable, returns::
[{'channel': unicode,
'impressions': int,
'clicks': int,
'ctr': decimal.Decimal,
'ecpm': decimal.Decimal,
'earnings': decimal.Decimal}]
"""
for t in doc.findall('.//table'):
if t.attrib.get('id') == 'summarytable':
break
else:
raise ValueError("summary table not found")
res = []
FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
for row in t.findall('.//tr'):
celltext = []
for c in row.findall('td'):
tail = ''
# adsense inserts an empty span if a row has a period in it, so
# get the children and find the tail element to append to the text
if c.find('a') and c.find('a').getchildren():
tail = c.find('a').getchildren()[0].tail or ''
celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
celltext = filter( lambda x: x != "" , celltext )
if len(celltext) != len(FIELDS):
continue
try:
value_cols = map(parse_decimal, celltext)
except decimal.InvalidOperation:
continue
res.append(dict(zip(FIELDS, value_cols)))
return res
def get_adsense(login, password):
"""Returns a twill browser instance after having logged in to AdSense
with *login* and *password*.
The returned browser will have all of the appropriate cookies set but may
not be at the exact page that you want data from.
"""
b = twill.commands.get_browser()
b.go(SERVICE_LOGIN_BOX_URL)
for form in b.get_all_forms():
try:
form['Email'] = login
form['Passwd'] = password
except ValueError:
continue
else:
break
else:
raise ValueError("Could not find login form on page")
b._browser.select_form(predicate=lambda f: f is form)
b.submit()
return b
def get_time_period(b, period):
"""Returns the parsed summarytable for the time period *period* given
*b* which should be the result of a get_adsense call. *period* must be
a time period that AdSense supports:
``'today'``, ``'yesterday'``, ``'thismonth'``,
``'lastmonth'``, ``'sincelastpayment'``.
"""
b.go(OVERVIEW_URL + period)
# The cElementTree treebuilder doesn't work reliably enough
# to use directly, so we parse and then dump into cElementTree.
doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
return parse_summary_table(doc)
def main():
try:
login, password = sys.argv[1:]
except ValueError:
raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
twill.set_output(StringIO())
twill.commands.reset_browser()
b = get_adsense(login, password)
data = {}
for period in TIME_PERIODS:
data[period] = get_time_period(b, period)
pprint.pprint(data)
twill.set_output(None)
return data
if __name__ == '__main__':
data = main()

Categories