I am trying to implement the Amazon Web Scraper mentioned here. However, I get the output shown below, which repeats until it stops with RecursionError: maximum recursion depth exceeded.
I have already tried downgrading eventlet to version 0.17.4 as mentioned here.
Also, the requests module is getting patched, as you can see in helpers.py.
helpers.py
import os
import random
from datetime import datetime
from urllib.parse import urlparse

import eventlet
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')
import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

num_requests = 0

redis = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)


def make_request(url, return_soup=True):
    # global request building and response handling
    url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs

    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))

    proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException as e:
        log("WARNING: Request for {} failed, trying again.".format(url))

    num_requests += 1
    if r.status_code != 200:
        os.system('say "Got non-200 Response"')
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None

    if return_soup:
        return BeautifulSoup(r.text), r.text
    return r


def format_url(url):
    # make sure URLs aren't relative, and strip unnecessary query args
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.de"
    path = u.path
    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            k, v = piece.split("=")
            if k in settings.allowed_params:
                query += "{k}={v}&".format(**locals())
        query = query[:-1]
    return "{scheme}://{host}{path}{query}".format(**locals())


def log(msg):
    # global logging function
    if settings.log_stdout:
        try:
            print("{}: {}".format(datetime.now(), msg))
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text


def get_proxy():
    # choose a proxy server to use for this request, if we need one
    if not settings.proxies or len(settings.proxies) == 0:
        return None
    proxy = random.choice(settings.proxies)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy,
        port=settings.proxy_port,
    )
    return {
        "http": proxy_url,
        "https": proxy_url,
    }


if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    print(r.text)
output
Traceback (most recent call last):
File "helpers.py", line 112, in <module>
r = make_request('https://api.ipify.org?format=json', return_soup=False)
File "helpers.py", line 36, in make_request
r = requests.get(url, headers=settings.headers, proxies=proxies)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/contrib/socks.py", line 99, in _new_conn
**extra_kw
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 199, in create_connection
sock.connect((remote_host, remote_port))
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 47, in wrapper
return function(*args, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 774, in connect
super(socksocket, self).settimeout(self._timeout)
File "/home/ec2-user/env/lib64/python3.7/site-packages/eventlet/greenio/base.py", line 395, in settimeout
self.setblocking(True)
What might be the problem here?
It turns out that removing eventlet.monkey_patch() and import eventlet solved the problem.
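For what it's worth, here is a minimal sketch of what the top of helpers.py could look like with eventlet taken out entirely, using the stock requests and time modules instead of the import_patched versions. The rest of the file stays the same; this only illustrates the fix described above, it is not the exact code that was used:

# Hypothetical helpers.py header with eventlet removed:
# plain imports instead of eventlet.import_patched, and no
# eventlet.monkey_patch() anywhere else in the project.
import os
import random
import time
from datetime import datetime
from urllib.parse import urlparse

import redis
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings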
When I disable the proxy server in the web browser settings and comment out the proxy handler code, the code below works fine.
import urllib2
import urllib2_file
import urllib
import random
import mimetypes
import string
from os import listdir
import time
from google.refine import refine
from google.refine import facet

proxy = urllib2.ProxyHandler({'http': '10.200.1.26'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)


def encode_multipart(fields, files, boundary=None):
    def escape_quote(s):
        return s.replace('"', '\\"')

    if boundary is None:
        boundary = ''.join(random.choice(_BOUNDARY_CHARS) for i in range(30))
    lines = []

    for name, value in fields.items():
        lines.extend((
            '--{0}'.format(boundary),
            'Content-Disposition: form-data; name="upload"',  # .format(escape_quote(name)),
            '',
            str(value),
        ))

    for name, value in files.items():
        filename = value['filename']
        if 'mimetype' in value:
            mimetype = value['mimetype']
        else:
            mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
        lines.extend((
            '--{0}'.format(boundary),
            'Content-Disposition: form-data; name="upload"; filename="{0}"'.format(escape_quote(filename)),
            'Content-Type: {0}'.format(mimetype),
            '',
            value['content'],
        ))

    lines.extend((
        '--{0}--'.format(boundary),
        '',
    ))
    body = '\r\n'.join(lines)

    headers = {
        'Content-Type': 'multipart/form-data; boundary={0}'.format(boundary),
        'Content-Length': str(len(body)),
    }
    return (body, headers)


_BOUNDARY_CHARS = string.digits + string.ascii_letters

u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job", data=urllib.urlencode({"test": ""}))
a = u.read()
id = ""
for i in a:
    if i.isdigit():
        id += str(i)
# sample output '{ "jobID" : 1393566803991 }'

files = {}
pathtoXML = r"C:\75"
#pathtoXML = r"C:\AM\trial"
for i in listdir(pathtoXML):
    files[i] = {'filename': i, 'content': open(pathtoXML + "\\" + i).read()}

#load raw data using the job id found in
url = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobI="+id+"&subCommand=load-raw-data"
data, headers = encode_multipart({}, files)
#print len(data)
#print headers
req = urllib2.Request(url, data=data, headers=headers)
f = urllib2.urlopen(req)
f.read()

# get job status
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/get-importing-job-status?jobID="+id+"", "test")
u.read()

# from fileSelection update file selection
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&subCommand=update-file-selection&jobID="+id+"", "fileSelection=%5B0%2C1%2C2%2C3%5D")
u.read()

# init parser format text
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=initialize-parser-ui&format=text%2Fxml")
u.read()

# update format and options
updateformatoptionurl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=update-format-and-options"
d = urllib.urlencode({"format": "text/xml", "options": {"recordPath": ["ArrayOfAfiles", "Afiles"], "limit": -1, "includeFileSources": "false", "guessCellValueTypes": "false"}})
u = urllib2.urlopen(updateformatoptionurl, d)
u.read()
'{"status":"ok"}'

# get-models
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/get-models?importingJobID="+id)
u.read()

# create project from import job
createfromimporturl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID="+id+"&subCommand=create-project"
d = urllib.urlencode({"format": "text/xml", "options": {"recordPath": ["ArrayOfAfiles", "Afiles"], "limit": -1, "includeFileSources": "false", "projectName": time.ctime()}})
u = urllib2.urlopen(createfromimporturl, d)
r = u.read()
After adding the proxy handler code it no longer works; when I run the code it complains:
Traceback (most recent call last):
File "C:\hari\trial.py", line 87, in <module>
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 391, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 409, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\urllib2_file.py", line 207, in http_open
return self.do_open(httplib.HTTP, req)
File "C:\Python27\urllib2_file.py", line 298, in do_open
return self.parent.error('http', req, fp, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 435, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
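Since the script works once the proxy handler is commented out, one way to read this is that install_opener routes even the 127.0.0.1 requests through 10.200.1.26, which could explain the 404. Below is a minimal, untested sketch that keeps the proxy available for external hosts but lets the local OpenRefine calls bypass it; the opener names are made up for illustration:

import urllib
import urllib2

# Opener that goes through the proxy, for external hosts only.
proxied_opener = urllib2.build_opener(urllib2.ProxyHandler({'http': '10.200.1.26'}))

# Opener with an empty proxy map, so nothing is proxied.
direct_opener = urllib2.build_opener(urllib2.ProxyHandler({}))

# Local OpenRefine API calls use the direct opener instead of a globally
# installed proxied one.
u = direct_opener.open("http://127.0.0.1:3333/command/core/create-importing-job",
                       urllib.urlencode({"test": ""}))
print u.read()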
Could anyone tell me what I am doing wrong? I keep getting an error with this code.
I'm trying to download all of the SWFs from primaryschoolgames, just as an experiment, but I can't seem to do it:
#!/usr/bin/env python
# encoding: utf-8

import sys, getopt
import os, urllib, urllib2, re, string, math

help_message = '''
'''

no_param = '''
'''

verbose = False
fakeMode = False

curPath = os.getcwd() + "/"
urlRegex = ''
FileRegex = ''
outputPath = ''
currentFile = ''


def removeDuplicates(seq):
    # Not order preserving
    keys = {}
    for e in seq:
        keys[e] = 1
    return keys.keys()


def go(filename):
    print "Having a look at " + string.capwords(filename)
    global urlRegex, FileRegex, outputPath, currentFile
    url = 'http://cdn.primarygames.com' + filename
    urlRegex = '/'+filename+'/.+/download'
    FileRegex = '/'+filename+'/(.*?)/download'
    outputPath = curPath+"Swfs"+"/"
    if not os.path.exists(outputPath):
        os.makedirs(outputPath)

    filelist = []
    while(len(url)):
        # looping system
        newlist, url = scrapePage(url, filename)
        filelist.extend(newlist)
    print 'Found %s Files.' % len(filelist)

    for swf in filelist:
        swfurl = swf['url']
        name = swf['name']
        currentFile = name
        #print 'Downloading '+name,
        if not fakeMode:
            #print ''
            urllib.urlretrieve('http://cdn.primarygames.com' + swfurl, outputPath+name)
        else:
            print 'Not downloading %s.' % name
    print "All done with %s!" % filename


def scrapePage(url, filename):
    print 'Looking through '+url
    html = urllib2.urlopen(url).read()

    swflist = re.findall(urlRegex, html)
    swflist = removeDuplicates(swflist)

    swfs = []
    for swfurl in swflist:
        r = re.compile(FileRegex)
        swfname = r.search(swfurl).group(1)
        swfname = swfname.replace('-', ' ')
        name = filename + "/" + swfname + ".swf"
        name = string.capwords(name)
        swf.append({'name': name, 'url': swfurl})

    r = re.compile(nextRegex)
    result = r.search(html)
    if result:
        nextUrl = 'http://cdn.primarygames.com' + result.group(1)
    else:
        nextUrl = ''
    return swfs, nextUrl


def main(argv=None):
    global verbose, fakeMode
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "ho:vf", ["help", "output="])
        except getopt.error, msg:
            raise Usage(msg)

        # option processing
        for option, value in opts:
            if option == "-v":
                verbose = True
            if option in ("-f", "--fake"):
                fakeMode = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-o", "--output"):
                output = value

        if len(args):
            swfs = args
        else:
            raise Usage(no_param)

    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        if err.msg != help_message:
            print >> sys.stderr, "\t for help use --help"
        return 2

    for swf in swfs:
        go(swf)


if __name__ == "__main__":
    sys.exit(main())
This is the error I keep getting:
Having a look at *
Looking through http://cdn.primarygames.com/*
Traceback (most recent call last):
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 129, in <module>
sys.exit(main())
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 125, in main
go(swf)
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 48, in go
newlist, url = scrapePage(url, filename)
File "C:\Users\Terrii\Desktop\VB Extra's\PrimarySchoolGames Swf Downloader.py"
, line 67, in scrapePage
html = urllib2.urlopen(url).read()
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 418, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1177, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 11004] getaddrinfo failed>
A failed getaddrinfo normally suggests that something is wrong with the URL you're providing. Since I am able to resolve the address, are you sure you aren't behind a proxy server? That could result in a failed DNS lookup, which produces exactly this message.
How Python determines which proxy to use on Windows:
In a Windows environment, if no proxy environment variables are set, proxy settings are obtained from the registry’s Internet Settings section.
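As a quick sanity check (just a sketch, not part of the original answer), you can ask urllib which proxies Python has picked up and then retry the request with an opener that ignores them:

import urllib
import urllib2

# On Windows this reflects the registry's Internet Settings when no
# *_proxy environment variables are set.
print urllib.getproxies()

# An empty ProxyHandler makes urllib2 ignore any detected proxy, so the
# request below resolves the host directly; the root URL is only a
# placeholder for whatever request was failing.
opener = urllib2.build_opener(urllib2.ProxyHandler({}))
html = opener.open('http://cdn.primarygames.com/').read()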
For more help I concur with @MikeHunter. I tried to fix your code, but since I had to implement your exception class just to get the code running at all, I think you should re-indent your code and provide more information. Sorry.
I'm trying to make an HTTPS GET request from behind a Squid proxy with CAC card authentication. Loading the OpenSC engine and grabbing the cert and private key seem to work fine. Below are the traceback and the code.
Any help is greatly appreciated.
Traceback
Traceback (most recent call last):
File "testM2Crypto.py", line 64, in <module>
res = m2urllib2.urlopen(req)
File "c:\Python27\lib\urllib2.py", line 135, in urlopen
return _opener.open(url, data, timeout)
File "c:\Python27\lib\urllib2.py", line 415, in open
response = self._open(req, data)
File "c:\Python27\lib\urllib2.py", line 433, in _open
'_open', req)
File "c:\Python27\lib\urllib2.py", line 387, in _call_chain
result = func(*args)
File "c:\Python27\lib\site-packages\M2Crypto\m2urllib2.py", line 94, in https_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "c:\Python27\lib\httplib.py", line 963, in request
self._send_request(method, url, body, headers)
File "c:\Python27\lib\httplib.py", line 994, in _send_request
self.putrequest(method, url, **skips)
File "c:\Python27\lib\site-packages\M2Crypto\httpslib.py", line 140, in putrequest
raise ValueError, "unknown URL type: %s" % url
ValueError: unknown URL type: /index.asp?site=SomeSite
Code
from M2Crypto import httpslib, m2urllib2, m2, SSL, Engine
import urllib2
url = 'https://some.domain.com/index.asp?site=SomeSite'
e = Engine.load_dynamic_engine("pkcs11", "c:/windows/syswow64/engine_pkcs11.dll")
pk = Engine.Engine("pkcs11")
pk.ctrl_cmd_string("MODULE_PATH", "c:/windows/syswow64/opensc-pkcs11.dll")
m2.engine_init(m2.engine_by_id("pkcs11"))
cert = e.load_certificate("slot_01-id_01")
privatekey = e.load_private_key("slot_01-id_01")
ctx = SSL.Context("sslv23")
ctx.set_cipher_list("HIGH:!aNULL:!eNULL:@STRENGTH")
ctx.set_session_id_ctx("foobar")
m2.ssl_ctx_use_x509(ctx.ctx, cert.x509)
m2.ssl_ctx_use_pkey_privkey(ctx.ctx, privatekey.pkey)
proxy_support=urllib2.ProxyHandler({'https':'https://proxy:3128'})
opener = m2urllib2.build_opener(ctx, proxy_support)
m2urllib2.install_opener(opener)
req = m2urllib2.Request(url)
res = m2urllib2.urlopen(req)
I finally was able to solve the problem yesterday. I had to make a few modifications to the code. I also had to patch a bug in the M2Crypto library that was preventing HTTPS through a proxy (thanks to Miloslav Trmač from Red Hat for the patch). The solution is below for anyone else who might be running into a similar problem. Hope this helps.
Code
from M2Crypto import httpslib, m2urllib2, m2, SSL, Engine, X509
import urllib2

userPin = "123456"
rootCertPath = 'd:/path/to/rootCert.pem'
url = 'https://some.domain.com/index.asp?site=SomeSite'

e = Engine.load_dynamic_engine("pkcs11", "c:/windows/syswow64/engine_pkcs11.dll")
pk = Engine.Engine("pkcs11")
pk.ctrl_cmd_string("MODULE_PATH", "c:/windows/syswow64/opensc-pkcs11.dll")
if len(userPin) > 0: pk.ctrl_cmd_string("PIN", userPin)
m2.engine_init(m2.engine_by_id("pkcs11"))

rootcert = X509.load_cert(rootCertPath)
cert = e.load_certificate("slot_01-id_01")
privatekey = e.load_private_key("slot_01-id_01")

ctx = SSL.Context("sslv23")
ctx.set_cipher_list("HIGH:!aNULL:!eNULL:@STRENGTH")
ctx.set_session_id_ctx("foobar")
ctx.load_verify_locations(cafile=rootcert)
m2.ssl_ctx_use_x509(ctx.ctx, cert.x509)
m2.ssl_ctx_use_pkey_privkey(ctx.ctx, privatekey.pkey)

proxy_support = urllib2.ProxyHandler({'https': 'https://proxy:3128'})
opener = m2urllib2.build_opener(ctx, proxy_support)
m2urllib2.install_opener(opener)
req = m2urllib2.Request(url)

try:
    res = m2urllib2.urlopen(req)
    print '\nsuccess'
except urllib2.HTTPError, err:
    print '\nerror'
    print 'err.code: ' + str(err.code)
    print 'err.reason: ' + str(err.reason)
    print 'err.read(): ' + str(err.read())
Thanks to Miloslav Trmač from Red Hat for the patch. I found the patch at the following URL: http://arm.koji.fedoraproject.org/koji/buildinfo?buildID=61225
M2Crypto Patch
diff -urN M2Crypto/M2Crypto/httpslib.py M2Crypto-0.21.1/M2Crypto/httpslib.py
--- M2Crypto/M2Crypto/httpslib.py 2012-03-15 03:27:22.181524406 +0100
+++ M2Crypto-0.21.1/M2Crypto/httpslib.py 2012-03-15 03:27:40.467485033 +0100
@@ -182,14 +182,14 @@
         else:
             HTTPSConnection.putheader(self, header, value)

-    def endheaders(self):
+    def endheaders(self, *args, **kwargs):
         # We've recieved all of hte headers. Use the supplied username
         # and password for authorization, possibly overriding the authstring
         # supplied in the headers.
         if not self._proxy_auth:
             self._proxy_auth = self._encode_auth()
-        HTTPSConnection.endheaders(self)
+        HTTPSConnection.endheaders(self, *args, **kwargs)

     def connect(self):
         HTTPConnection.connect(self)
diff -urN M2Crypto/M2Crypto/m2urllib2.py M2Crypto-0.21.1/M2Crypto/m2urllib2.py
--- M2Crypto/M2Crypto/m2urllib2.py 2011-01-15 20:10:05.000000000 +0100
+++ M2Crypto-0.21.1/M2Crypto/m2urllib2.py 2012-03-15 03:27:40.467485033 +0100
@@ -64,8 +64,10 @@
         target_host = urlparse.urlparse(full_url)[1]

         if (target_host != host):
+            request_uri = urlparse.urldefrag(full_url)[0]
             h = httpslib.ProxyHTTPSConnection(host = host, ssl_context = self.ctx)
         else:
+            request_uri = req.get_selector()
             h = httpslib.HTTPSConnection(host = host, ssl_context = self.ctx)
         # End our change
         h.set_debuglevel(self._debuglevel)
@@ -80,7 +82,7 @@
         # request.
         headers["Connection"] = "close"
         try:
-            h.request(req.get_method(), req.get_selector(), req.data, headers)
+            h.request(req.get_method(), request_uri, req.data, headers)
             r = h.getresponse()
         except socket.error, err: # XXX what error?
             raise URLError(err)
I need a Python script that gets the Google AdSense earnings, and I found adsense_scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape the Google AdSense earnings data. When I use it I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there even a better way to get the data via Python? Thanks.
There are several errors with the package; you mentioned only the first one.
1) The twill package does not handle Google's redirects correctly. Adding
newurl = newurl.strip( "'" )
to twill/other_packages/_mechanize_dist/_http.py:108 before
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that (see the sketch after this list).
2) You have to have the correct language set in AdSense (English).
3) There are several problems in the original adsense_scraper.
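For item 1, here is a minimal sketch of where that line sits; the surrounding redirect-handler code in twill 0.9 is abbreviated and may differ slightly from what is shown:

# twill/other_packages/_mechanize_dist/_http.py, in the redirect handler
# (around line 108); surrounding code omitted.
newurl = newurl.strip( "'" )  # added: strips the stray quote behind <urlopen error unknown url type: 'http>
newurl = _rfc3986.clean_url(newurl, "latin-1")  # existing line

The full adsense_scraper module follows.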
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master

Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"

OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number
    """
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
        celltext = filter(lambda x: x != "", celltext)
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))
    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.

    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data


if __name__ == '__main__':
    data = main()