Stop at exception in my code, not library code - Python

I'm developing an app using the Python library urllib, and it sometimes raises exceptions because it cannot access a URL.
However, the exception is raised about 6 levels down into the standard library stack:
/home/user/Workspace/application/main.py in call(path)
11 headers={'content-type': 'application/json'},
12 data=b'')
---> 13 resp = urllib.request.urlopen(req) ####### THIS IS MY CODE
14 return json.loads(resp.read().decode('utf-8'))
/usr/lib/python3.4/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
159 else:
160 opener = _opener
--> 161 return opener.open(url, data, timeout)
162
163 def install_opener(opener):
/usr/lib/python3.4/urllib/request.py in open(self, fullurl, data, timeout)
461 req = meth(req)
462
--> 463 response = self._open(req, data)
464
465 # post-process response
/usr/lib/python3.4/urllib/request.py in _open(self, req, data)
479 protocol = req.type
480 result = self._call_chain(self.handle_open, protocol, protocol +
--> 481 '_open', req)
482 if result:
483 return result
/usr/lib/python3.4/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
439 for handler in handlers:
440 func = getattr(handler, meth_name)
--> 441 result = func(*args)
442 if result is not None:
443 return result
/usr/lib/python3.4/urllib/request.py in http_open(self, req)
1208
1209 def http_open(self, req):
-> 1210 return self.do_open(http.client.HTTPConnection, req)
1211
1212 http_request = AbstractHTTPHandler.do_request_
/usr/lib/python3.4/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1182 h.request(req.get_method(), req.selector, req.data, headers)
1183 except OSError as err: # timeout error
-> 1184 raise URLError(err)
1185 r = h.getresponse()
1186 except:
URLError: <urlopen error [Errno 111] Connection refused>
I usually run the code in ipython3 with the %pdb magic turned on, so that when there is an exception I can inspect it immediately. However, for this I have to traverse 6 stack frames to get to my code.
Is there a way to make my app crash pointing directly at my code?

I would go with modifying the code:
try:
    resp = urllib.request.urlopen(req)
except Exception as e:
    raise RuntimeError(e)
That way:
%pdb moves you to your code, and
the original exception is preserved as an argument of the "secondary" exception.
You may also monkeypatch the urllib.request.urlopen() function:
class MonkeyPatchUrllib(object):
    def __enter__(self):
        self.__urlopen = urllib.request.urlopen
        urllib.request.urlopen = self

    def __exit__(self, exception_type, exception_value, traceback):
        urllib.request.urlopen = self.__urlopen

    def __call__(self, *args, **kwargs):
        try:
            return self.__urlopen(*args, **kwargs)
        except Exception as e:
            raise RuntimeError(e)
Any time an exception is raised in a urlopen() call within the context manager's scope:
with MonkeyPatchUrllib():
    # your code here
%pdb will move you only 1 level away from your code.
[EDIT]
With sys.exc_info() it is possible to preserve a richer context of the original exception (such as its traceback).
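On Python 3 you get much of this for free via exception chaining: raising with "from e" records the original exception, traceback included, on the new exception's __cause__ attribute. A minimal sketch, reusing the req object from the question:
import urllib.request

try:
    resp = urllib.request.urlopen(req)
except Exception as e:
    # the original exception (with its traceback) stays reachable
    # through the new exception's __cause__ attribute
    raise RuntimeError('urlopen failed') from e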

pdb has only incremental frame positioning (moving up or down the list of frames).
To get the feature you want, you can try trepan (github repository). It has an IPython extension here. You then use the command frame -1 once the exception shows up:
Frame (absolute frame positioning)
frame [thread-Name*|*thread-number] [frame-number]
Change the current frame to frame frame-number if specified, or the current frame, 0, if no frame number specified.
If a thread name or thread number is given, change the current frame to a frame in that thread. Dot (.) can be used to indicate the name of the current frame the debugger is stopped in.
A negative number indicates the position from the other or least-recently-entered end. So frame -1 moves to the oldest frame, and frame 0 moves to the newest frame. Any variable or expression that evaluates to a number can be used as a position; however, due to parsing limitations, the position expression has to be seen as a single blank-delimited parameter. That is, the expression (5*3)-1 is okay while (5 * 3) - 1 isn't.
Once you are in the desired frame, you can use edit to modify your code.
You may find the command backtrace useful too as it gives a stack trace with the less recent call at the bottom.
trepan depends on uncompyle6 available here.
pydb provided a similar feature but was unfortunately never ported to Python 3.
Otherwise, you may decide to be patient and wait for improvements. In IPython/core/debugger.py:
"""
Pdb debugger class.
Modified from the standard pdb.Pdb class to avoid including readline, so that
the command line completion of other programs which include this isn't damaged.
In the future, this class will be expanded with improvements over the standard pdb.
[...]
"""

It can be done with some hacking. These docs show how you can turn on post-mortem debugging with the following code in the entry point:
import sys
from IPython.core import ultratb

sys.excepthook = ultratb.FormattedTB(mode='Verbose',
                                     color_scheme='Linux', call_pdb=1)
Stepping through this hook after an exception is raised shows that we need to tinker with the debugger method. Unfortunately I can see no better way to do this than to copy the entire method and modify it where needed (I tried modifying self.tb, but traceback objects are read-only and can't be used with copy.deepcopy). Here's a demo:
import json
import sys
from IPython.core import debugger, ultratb
from IPython.core.display_trap import DisplayTrap

class CustomTB(ultratb.FormattedTB):
    def debugger(self, force=False):
        if force or self.call_pdb:
            if self.pdb is None:
                self.pdb = debugger.Pdb(
                    self.color_scheme_table.active_scheme_name)
            # the system displayhook may have changed, restore the original
            # for pdb
            display_trap = DisplayTrap(hook=sys.__displayhook__)
            with display_trap:
                self.pdb.reset()
                # Find the right frame so we don't pop up inside ipython itself
                if hasattr(self, 'tb') and self.tb is not None:
                    etb = self.tb
                else:
                    etb = self.tb = sys.last_traceback

                # only modification is here -----+
                #                                |
                #                                V
                while self.tb is not None and '/lib/python3' not in self.tb.tb_next.tb_frame.f_code.co_filename:
                    self.tb = self.tb.tb_next

                if etb and etb.tb_next:
                    etb = etb.tb_next
                self.pdb.botframe = etb.tb_frame
                self.pdb.interaction(self.tb.tb_frame, self.tb)

        if hasattr(self, 'tb'):
            del self.tb

sys.excepthook = CustomTB(mode='Verbose',
                          color_scheme='Linux', call_pdb=1)

def foo():
    bar()

def bar():
    json.dumps(json)

foo()
As you can see it stops searching through the traceback when it's about to reach library code. Here's the result:
TypeErrorTraceback (most recent call last)
/Users/alexhall/Dropbox/python/sandbox3/sandbox.py in <module>()
40 json.dumps(json)
41
---> 42 foo()
global foo = <function foo at 0x1031358c8>
/Users/alexhall/Dropbox/python/sandbox3/sandbox.py in foo()
35
36 def foo():
---> 37 bar()
global bar = <function bar at 0x103135950>
38
39 def bar():
/Users/alexhall/Dropbox/python/sandbox3/sandbox.py in bar()
38
39 def bar():
---> 40 json.dumps(json)
global json.dumps = <function dumps at 0x10168b268>
global json = <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>
41
42 foo()
/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py in dumps(obj=<module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw={})
228 cls is None and indent is None and separators is None and
229 default is None and not sort_keys and not kw):
--> 230 return _default_encoder.encode(obj)
global _default_encoder.encode = <bound method JSONEncoder.encode of <json.encoder.JSONEncoder object at 0x10166e8d0>>
obj = <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>
231 if cls is None:
232 cls = JSONEncoder
/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/encoder.py in encode(self=<json.encoder.JSONEncoder object>, o=<module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
chunks = undefined
self.iterencode = <bound method JSONEncoder.iterencode of <json.encoder.JSONEncoder object at 0x10166e8d0>>
o = <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>
global _one_shot = undefined
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/encoder.py in iterencode(self=<json.encoder.JSONEncoder object>, o=<module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>, _one_shot=True)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
_iterencode = <_json.Encoder object at 0x1031296d8>
o = <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>
258
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/encoder.py in default(self=<json.encoder.JSONEncoder object>, o=<module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>)
178
179 ""
--> 180 raise TypeError(repr(o) + " is not JSON serializable")
global TypeError = undefined
global repr = undefined
o = <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'>
181
182 def encode(self, o):
TypeError: <module 'json' from '/Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py'> is not JSON serializable
> /Users/alexhall/Dropbox/python/sandbox3/sandbox.py(40)bar()
38
39 def bar():
---> 40 json.dumps(json)
41
42 foo()
ipdb> down
> /Users/alexhall/.pyenv/versions/3.5.0/lib/python3.5/json/__init__.py(230)dumps()
228 cls is None and indent is None and separators is None and
229 default is None and not sort_keys and not kw):
--> 230 return _default_encoder.encode(obj)
231 if cls is None:
232 cls = JSONEncoder
ipdb>
Basically the full traceback is still printed out but ipdb starts at your own code. If you enter the down command you find yourself in a library frame.

I think the answer is no.
pdb stops at the exception and shows you the stack.
Why would it be useful to hide the real source of the exception?
If it worked as you seem to be requesting and hid the 6 layers of stack, how would you work out what to fix?
If this is still not on topic, please add to your question.

urllib can raise a lot of exceptions.
You need to put a try block around the call into urllib and figure out how to handle the exceptions, for example:
try:
    resp = urllib.request.urlopen(req)
except URLError as e:
    # analyse e to figure out the detail
    ...
Certainly under Python 2's urllib lots of other exceptions are thrown.
I'm not sure about Python 3's urllib.
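For instance, a minimal sketch of such handling on Python 3 (HTTPError is a subclass of URLError, so it has to be caught first; req is the request object from the question):
import urllib.request
from urllib.error import URLError, HTTPError

try:
    resp = urllib.request.urlopen(req)
except HTTPError as e:
    # the server answered, but with an error status
    print('HTTP error:', e.code, e.reason)
except URLError as e:
    # the connection itself failed (DNS failure, connection refused, ...)
    print('failed to reach the server:', e.reason)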

Related

Using Dask throws ImportError when SageMath code is run in python

This question is very similar to my earlier question and was prompted by one of the comments.
Recently, I have been trying to parallelize some code using Dask. The code involves computations in SageMath, but whenever I use Sage code in a function I am trying to parallelize, it throws an ImportError, even though Sage has been loaded successfully. I want to know why I am getting an ImportError despite Sage seeming to load fine and, more importantly, how to fix it.
Here is a basic example of what I am running into. When I run this:
import time
from sage.all import *
from dask import delayed
from dask.distributed import Client

client = Client(n_workers=4)

# I can add Sage integers with no problem,
# so Sage seems to be loaded
Integer(1) + Integer(1)

def Hello():
    Integer(1) + Integer(1)  # if I remove this line the code runs fine
    return 'Hello World'

z = delayed(Hello)()
z.compute()
I get this error:
ImportError Traceback (most recent call last)
<timed eval> in <module>
~/.sage/local/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
284 dask.base.compute
285 """
--> 286 (result,) = compute(self, traverse=False, **kwargs)
287 return result
288
~/.sage/local/lib/python3.9/site-packages/dask/base.py in compute(*args, **kwargs)
566 postcomputes.append(x.__dask_postcompute__())
567
--> 568 results = schedule(dsk, keys, **kwargs)
569 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
570
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2669 should_rejoin = False
2670 try:
-> 2671 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2672 finally:
2673 for f in futures.values():
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1946 else:
1947 local_worker = None
-> 1948 return self.sync(
1949 self._gather,
1950 futures,
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
843 return future
844 else:
--> 845 return sync(
846 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
847 )
~/.sage/local/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
324 if error[0]:
325 typ, exc, tb = error[0]
--> 326 raise exc.with_traceback(tb)
327 else:
328 return result[0]
~/.sage/local/lib/python3.9/site-packages/distributed/utils.py in f()
307 if callback_timeout is not None:
308 future = asyncio.wait_for(future, callback_timeout)
--> 309 result[0] = yield future
310 except Exception:
311 error[0] = sys.exc_info()
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1811 exc = CancelledError(key)
1812 else:
-> 1813 raise exception.with_traceback(traceback)
1814 raise exc
1815 if errors == "skip":
~/.sage/local/lib/python3.9/site-packages/distributed/protocol/pickle.py in loads()
73 return pickle.loads(x, buffers=buffers)
74 else:
---> 75 return pickle.loads(x)
76 except Exception:
77 logger.info("Failed to deserialize %s", x[:10000], exc_info=True)
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/integer.pyx in init sage.rings.integer (build/cythonized/sage/rings/integer.c:54201)()
----> 1 r"""
2 Elements of the ring `\ZZ` of integers
3
4 Sage has highly optimized and extensive functionality for arithmetic with integers
5 and the ring of integers.
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/rational.pyx in init sage.rings.rational (build/cythonized/sage/rings/rational.cpp:40442)()
98
99
--> 100 import sage.rings.real_mpfr
101 import sage.rings.real_double
102 from libc.stdint cimport uint64_t
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/real_mpfr.pyx in init sage.rings.real_mpfr (build/cythonized/sage/rings/real_mpfr.c:46795)()
----> 1 r"""
2 Arbitrary Precision Real Numbers
3
4 AUTHORS:
5
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/libs/mpmath/utils.pyx in init sage.libs.mpmath.utils (build/cythonized/sage/libs/mpmath/utils.c:9062)()
----> 1 """
2 Utilities for Sage-mpmath interaction
3
4 Also patches some mpmath functions for speed
5 """
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/complex_mpfr.pyx in init sage.rings.complex_mpfr (build/cythonized/sage/rings/complex_mpfr.c:34594)()
----> 1 """
2 Arbitrary Precision Floating Point Complex Numbers
3
4 AUTHORS:
5
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/complex_double.pyx in init sage.rings.complex_double (build/cythonized/sage/rings/complex_double.c:25284)()
96 from cypari2.convert cimport new_gen_from_double, new_t_COMPLEX_from_double
97
---> 98 from . import complex_mpfr
99
100 from .complex_mpfr import ComplexField
ImportError: cannot import name complex_mpfr
Perhaps this has something to do with Dask not importing Sage when it goes to parallelize things.
Unfortunately, you might be out of luck here (somewhat). It looks like sage is not developed with threaded execution driven by another language in mind: its root-level modules modify key elements of the Python environment and really try to take control of low-level functionality by default. For example, sage.__init__ modifies the way that both inspect and sqlite work (gross!).
The specific issue you're running into is that importing sage invokes the signal module, which cannot be run from a thread other than the main one. The issue isn't in sage operations, but simply the import statement:
In [8]: def hello_sage():
   ...:     from sage.all import Integer
   ...:     return 'Hello World'
   ...:
In [9]: futures = client.submit(hello_sage)
In [10]: distributed.worker - WARNING - Compute Failed
Function: hello_sage
args: ()
kwargs: {}
Exception: ValueError('signal only works in main thread of the main interpreter')
Unfortunately, this is fairly incompatible with dask, which runs all delayed jobs within threads. It's not that dask can't import modules locally to a remote function (it definitely can), it's that those functions can't use signal to control execution.
Because of the way sage is written, as far as multithreading goes I think your only choice is to go with the parallelization options their developers have provided. That said, you can trick sage into thinking it's in a world of its own by having threads start their own subprocesses:
In [1]: import dask.distributed as dd
In [2]: from subprocess import Popen, PIPE
In [3]: def invoke_sage_cli():
   ...:     cmd = ["sage", "-c", "print(factor(35))"]
   ...:     p = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
   ...:     o, e = p.communicate()
   ...:
   ...:     if e:
   ...:         raise SystemError(e)
   ...:
   ...:     return o
   ...:
In [4]: client = dd.Client(n_workers=4)
In [5]: future = client.submit(invoke_sage_cli)
In [6]: print(future.result())
5 * 7
This is a pretty hacky way of getting around this issue, and I think it's unlikely to offer any performance benefits over the native sage parallelization options as long as you're working on a single machine. If you're using dask to scale up a Kubernetes cluster or work with nodes on an HPC or something, then you could definitely use this route to schedule distributed jobs and then have sage manage multithreading within each node.

'MyClass' object has no attribute 'type': problem loading libraries inside a class

I'm not sure how I should embed external libraries inside a class to make it self-contained, so I opted for:
from urllib.request import urlopen

class MyClass:
    from urllib.request import urlopen as LOCALurlopen
    LOC_baseUrl = 'https://www.someurl.com/api/'

    def resolve_API(self, myValue):
        LOC_fullRequest = self.LOC_baseUrl + '/' + myValue
        print(LOC_fullRequest)
        LOC_response = urlopen(LOC_fullRequest, timeout=5).read()  # works, but external dependency
        LOC_response = self.LOCALurlopen(LOC_fullRequest, timeout=5).read()  # doesn't work
        print(LOC_response)
I did the same with other libraries and it works OK, but not with urlopen; it gives me this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-99-425bb23495d5> in <module>()
---> 13 printINFO(MyClass.resolve_API('anyValue'))
<ipython-input-98-dee69a279550> in resolve_API(self, myValue)
---> 55 LOC_response = self.LOCALurlopen(LOC_fullRequest, timeout = 5).read()
/usr/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
/usr/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
515
516 req.timeout = timeout
--> 517 protocol = req.type
518
519 # pre-process request
AttributeError: 'MyClass' object has no attribute 'type'
How should I resolve this dependency?
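The traceback hints at the likely cause: a plain function stored in a class body becomes a method, so self.LOCALurlopen(url) silently passes the instance as the first argument, and urlopen then tries to treat the MyClass object as the request (hence protocol = req.type failing). A minimal sketch of one way around that, wrapping the import in staticmethod (names kept from the question):
from urllib.request import urlopen

class MyClass:
    # staticmethod stops Python from binding the function as a method,
    # so self is no longer injected as the url argument
    LOCALurlopen = staticmethod(urlopen)
    LOC_baseUrl = 'https://www.someurl.com/api'

    def resolve_API(self, myValue):
        LOC_fullRequest = self.LOC_baseUrl + '/' + myValue
        LOC_response = self.LOCALurlopen(LOC_fullRequest, timeout=5).read()
        print(LOC_response)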

UnicodeEncodeError in urllib2

I ran into a UnicodeEncodeError while crawling Wikipedia dump JSON files.
Here are my code snippet and the error message.
It seems like the character 'é' causes this problem. However, I do not know how to solve this issue.
import urllib2
import json

# List of philosophers' names: mergel
# print mergel
i = 0
for name in mergel:
    # Use the API to get the page content in a format that we like.
    # https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
    # set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i + 1
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles=" + titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"
    # construct the query
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
    # print wikisource
    wikijson = json.loads(wikisource)
    jsonfilename = './json/' + titlename + '.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)
Error message:
Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
21 query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
22 print query
---> 23 wikiresponse = urllib2.urlopen(query)
24 wikisource = wikiresponse.read()
25 # print wikisource
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
1238 def https_open(self, req):
1239 return self.do_open(httplib.HTTPSConnection, req,
-> 1240 context=self._context)
1241
1242 https_request = AbstractHTTPHandler.do_request_
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
867 datablock = data.read(blocksize)
868 else:
--> 869 self.sock.sendall(data)
870
871 def _output(self, s):
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
719 count = 0
720 while (count < amount):
--> 721 v = self.send(data[count:])
722 count += v
723 return amount
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
685 self.__class__)
686 try:
--> 687 v = self._sslobj.write(data)
688 except SSLError as x:
689 if x.args[0] == SSL_ERROR_WANT_READ:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)
However, the simple and direct code below, which doesn't take the title from a list, works without any issues:
import urllib2
import json

query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/' + 'Claude_Lévi-Strauss' + '.json'
with open(jsonfilename, 'w') as outfile:
    json.dump(wikijson, outfile)
Don't mix Unicode and bytestrings: use Unicode strings to work with text in Python.
Don't create urls by hand; use urllib functions such as quote() and urlencode(). Also, consider functions from the urlparse module such as urljoin() and urlunsplit().
You've already requested json format; there is no need to parse it only to dump it back immediately in the same format. You could use shutil.copyfileobj() to copy the file-like objects, and check the result file later to make sure it has been downloaded correctly.
Putting it all together, here's how to save a wiki-page with a given title to a file in JSON format:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj

def urlretrieve(url, filename, chunksize=8096):
    with closing(urlopen(url)) as response, open(filename, 'wb') as file:
        copyfileobj(response, file, chunksize)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  #NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
Note:
you don't need to .replace(' ', '_') in this case
the os.path.join('json', name + '.json') line mixes bytestrings ('json', '.json') and Unicode (type(name) == unicode). It is OK here because both 'json' and '.json' are ascii-only literals in the source code
the # -*- coding: utf-8 -*- encoding declaration affects only characters that appear literally in your Python source code; it is accidental that the query string also uses the same encoding in this particular case. The encoding of your source code has no relation to the character encoding that might be used for filenames, to transfer data over http, or to write Unicode text to a terminal (all these encodings may differ from one another).
In principle, you could have used urllib.urlretrieve(url, filename) here instead of urlopen() + copyfileobj(), but urllib.urlretrieve() behaves differently from urllib2.urlopen() on Python 2.
Here's the same code using requests:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from urllib import quote

import requests  # $ pip install requests

def urlretrieve(url, filename, chunksize=8096):
    r = requests.get(url, stream=True)
    r.raise_for_status()  # raise on http error
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunksize):
            f.write(chunk)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  #NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
"However, the simple and direct code below, which doesn't take the title from a list, works without any issues."
Your code uses non-ascii bytestring literals (which are illegal in Python 3). There is no encoding error because all the data is bytes already. The issue with using bytestrings is that it breaks if different environments use different character encodings, and they do (you can't expect everything to use utf-8, however desirable that might be). Also, the query part of the url should be properly percent-encoded, e.g. é should be sent as '%C3%A9'.
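A quick sketch of that encoding step on Python 2: quote() percent-encodes each byte of the utf-8 encoded text:
# -*- coding: utf-8 -*-
from urllib import quote

print quote(u'é'.encode('utf-8'))  # -> '%C3%A9'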
Unrelated: to download several web-pages at once, you could use a thread pool:
from multiprocessing.dummy import Pool  # use threads

def download(name):
    urlretrieve("https://en.wikipedia.org/w/api.php?"
                "action=query&prop=revisions&rvprop=content&format=json&"
                "titles=" + quote(name.encode('utf-8')),
                os.path.join('json', name + '.json'))

pool = Pool(4)  # download 4 titles concurrently
for _ in pool.imap_unordered(download, mergel, chunksize=100):
    pass
It is polite to set the maxlag query parameter and respect the Retry-After http header. There are several wrappers for the Wikipedia API that might do it for you.
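A rough sketch of what respecting both might look like with requests: maxlag=5 asks MediaWiki to reject the request while replica lag is high, and the documented rejection carries a Retry-After header (treat the exact retry policy here as an assumption, not a recommendation):
import time
import requests

API = "https://en.wikipedia.org/w/api.php"

def polite_get(params):
    # ask the API to refuse our request when the site is overloaded
    params = dict(params, maxlag=5)
    while True:
        r = requests.get(API, params=params)
        # on a maxlag rejection MediaWiki sets Retry-After; sleep and retry
        if 'Retry-After' in r.headers:
            time.sleep(int(r.headers['Retry-After']))
            continue
        r.raise_for_status()
        return r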

Cannot pickle a Python Class instance

Here I have this class definition. When I run the code below, it raises the following errors.
sm = SaliencyMaskSlic()
operations = [('img_resize', img_resize), ('sal_mask', sm.transform)]
args_list = [{'h_size': 258}, {'cropped': True}]
pre_pipeline = Pipeline(ops=operations, arg_list=args_list)
ch = ColorHist('RGB', [6, 6, 6], [2, 2], center=True, pre_pipeline=pre_pipeline)
dill.dump(ch, open('erogol.pkl', 'wb'))
...
dill.loads('erogol.pkl')
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-11-c8a5937780b5> in <module>()
----> 1 dill.loads('erogol.pkl')
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in loads(str)
158 """unpickle an object from a string"""
159 file = StringIO(str)
--> 160 return load(file)
161
162 # def dumpzs(obj, protocol=None):
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in load(file)
148 pik = Unpickler(file)
149 pik._main_module = _main_module
--> 150 obj = pik.load()
151 if type(obj).__module__ == _main_module.__name__: # point obj class to main
152 try: obj.__class__ == getattr(pik._main_module, type(obj).__name__)
/usr/lib/python2.7/pickle.pyc in load(self)
856 while 1:
857 key = read(1)
--> 858 dispatch[key](self)
859 except _Stop, stopinst:
860 return stopinst.value
/usr/lib/python2.7/pickle.pyc in load_appends(self)
1185 def load_appends(self):
1186 stack = self.stack
-> 1187 mark = self.marker()
1188 list = stack[mark - 1]
1189 list.extend(stack[mark + 1:])
/usr/lib/python2.7/pickle.pyc in marker(self)
872 mark = self.mark
873 k = len(stack)-1
--> 874 while stack[k] is not mark: k = k-1
875 return k
876
IndexError: list index out of range
Basically I have one class instance using another class instance inside. I also used cPickle, but it raises this as I dump:
TypeError: can't pickle instancemethod objects
Any idea for the solution?
This isn't a pickling error. You can't pickle class instances with pickle or cPickle, but you can with dill. Your code has a bug somewhere that's giving you an IndexError.
Also, rather than giving your class its own dump and load methods, you might just use dump and load from dill directly... then, if you are doing something complicated, you can still add __getstate__ and __setstate__ methods.
Also, your loading from a pickled file has a bug. You are doing this:
self = dill.loads(in_path)
while you should (1) be using dill.load instead, and (2) load to _self, and then replace the relevant state:
_self = dill.load(in_path)
self.nbins = _self.nbins
self.mask = _self.mask
# and so on... (or update all at once using `__dict__`)
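A minimal sketch of what that advice might look like on a simplified class (the attribute names are placeholders, not the real ColorHist interface):
import dill

class ColorHist(object):
    def __init__(self, nbins=6, mask=None):
        self.nbins = nbins
        self.mask = mask

    def dump(self, path):
        # dill.dump expects a file object, so open the path ourselves
        with open(path, 'wb') as f:
            dill.dump(self, f)

    def load(self, path):
        # load into a temporary object, then copy its state onto self
        with open(path, 'rb') as f:
            _self = dill.load(f)
        self.__dict__.update(_self.__dict__)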

ProxyError Raised with Requests module

I am still new to Python and can't figure out how to handle this error, or how to avoid it, even after trying to understand the different methods of the Requests module and reading around.
Here's the simple request I use, where line iterates over a text file with the different URLs I'm trying to access, and d is a deque of dictionaries containing the many URLs I'm using as proxies.
import requests
import collections
# [...]
d = collections.deque(proxies)
with requests.session() as r:
    d.rotate(-1)
    page = r.get(line.rstrip(), proxies=d[0])
It works perfectly until one of the proxies from the list times out for some reason, forcing the script to raise this error:
ProxyError Traceback (most recent call last)
C:\Python27\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
C:\Users\Christopher Fargere\desktop\python\quick_scraper.py in <module>()
72 with requests.session() as r:
73 d.rotate(-1)
---> 74 page = r.get(line.rstrip(), proxies=d[0])
75 print d[0]
76 print page.status_code
C:\Python27\lib\site-packages\requests\sessions.pyc in get(self, url, **kwargs)
393
394 kwargs.setdefault('allow_redirects', True)
--> 395 return self.request('GET', url, **kwargs)
396
397 def options(self, url, **kwargs):
C:\Python27\lib\site-packages\requests\sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert)
381 'allow_redirects': allow_redirects,
382 }
--> 383 resp = self.send(prep, **send_kwargs)
384
385 return resp
C:\Python27\lib\site-packages\requests\sessions.pyc in send(self, request, **kwargs)
484 start = datetime.utcnow()
485 # Send the request
--> 486 r = adapter.send(request, **kwargs)
487 # Total elapsed time of the request (approximately)
488 r.elapsed = datetime.utcnow() - start
C:\Python27\lib\site-packages\requests\adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
379
380 except _ProxyError as e:
--> 381 raise ProxyError(e)
382
383 except (_SSLError, _HTTPError) as e:
ProxyError: Cannot connect to proxy. Socket error: [Errno 11001] getaddrinfo failed.
I would love to implement an if condition so that when this error is raised, the proxy is popped out of the d list and the same URL is retried. I'm sure it's very simple, but I can't understand how errors are raised and caught in Python. :(
To catch an exception, use exception handling; catch the ProxyError thrown:
from requests.exceptions import ProxyError

with requests.session() as r:
    page = None
    for _ in range(len(d)):
        d.rotate(-1)
        try:
            page = r.get(line.rstrip(), proxies=d[0])
        except ProxyError:
            # ignore proxy exception, move to next proxy
            pass
        else:
            # success, break loop
            break
    if page is None:
        # none of the proxies worked
        raise ProxyError
This tries, at most, all your proxies in d, one by one. If none of them worked, we raise the ProxyError again, as you probably want to know that all your proxies failed at that time.
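If you also want failed proxies dropped from the pool for good, as the question suggests, here is a small variant (a sketch only; d is the deque of proxy dicts from the question, and get_with_proxies is a hypothetical helper):
import requests
from requests.exceptions import ProxyError

def get_with_proxies(url, d):
    while d:
        try:
            with requests.Session() as s:
                return s.get(url, proxies=d[0])
        except ProxyError:
            d.popleft()  # discard the dead proxy and try the next one
    raise ProxyError('all proxies failed')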
