How to get around error 304 in urllib2, Python - python

For the openers, opener = urllib2.build_opener(), if I try to add an header:
request.add_header('if-modified-since',request.headers.get('last-nodified'))
I get the error code:
Traceback (most recent call last):
File "<pyshell#19>", line 1, in <module>
feeddata = opener.open(request)
File "C:\Python27\lib\urllib2.py", line 391, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 409, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1173, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1142, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "C:\Python27\lib\httplib.py", line 946, in request
self._send_request(method, url, body, headers)
File "C:\Python27\lib\httplib.py", line 986, in _send_request
self.putheader(hdr, value)
File "C:\Python27\lib\httplib.py", line 924, in putheader
str = '%s: %s' % (header, '\r\n\t'.join(values))
TypeError: sequence item 0: expected string, NoneType found
How do you get around this?
I tried building a class from urllib2.BaseHandler and it doesn't work.

Your traceback says: expected string, NoneType found from which I deduce that you've stored a None value as a header. Did you really write 'last-nodified'? The header you mean was probably 'last-modified', but even then you should check that it existed and not re-use it as a header if request.headers.get() returns None.

Related

Disable ssl check in python with headers [duplicate]

I'm trying to pull some JSON data from an API using urllib in Python 3.6. It requires header information to be passed for authorization. Here is my code:
import urllib.request, json
headers = {"authorization" : "Bearer {authorization_token}"}
with urllib.request.urlopen("{api_url}", data=headers) as url:
data = json.loads(url.read().decode())
print(data)
And the error message I get:
Traceback (most recent call last):
File "getter.py", line 5, in <module>
with urllib.request.urlopen("{url}", data=headers) as url:
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 544, in _open
'_open', req)
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1064, in _send_output
+ b'\r\n'
TypeError: can't concat bytes to str
Process finished with exit code 1
Not too sure what's going wrong here, I'm not inputting any bytes so I'm not sure why I'm getting an error telling me I can't concat bytes to str.
The data argument is the request body (it turns the request into a POST) and is expected to be a bytes-like object — that is why you get the bytes/str error. But headers should not be passed as data at all; pass them to a Request object instead:
req = urllib.request.Request(api_url, headers=headers)
with urllib.request.urlopen(req) as url:
    data = json.loads(url.read().decode())
If you ever do need to send a JSON body, encode it explicitly: data=bytes(json.dumps(payload), encoding="utf-8").

Go to next item in list on error

I am pulling websites from a list and want to test whether they are up or down. The code below works fine as long as they are up, but as soon as something is wrong with one of these URLs, I get an error message and the whole script stops.
What I want to achieve: Error message == website not working therefore print down and move to next item in list.
import urllib2
from urllib2 import Request, urlopen, HTTPError, URLError
def checkurl(z):
user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent':user_agent }
link = "http://"+z
req = Request(link, headers = headers)
try:
page_open = urlopen(req)
except HTTPError, e:
print "down"
else:
print 'up'
#print urllib2.urlopen('http://'+z).read()
Traceback (most recent call last):
File "/home/user/Videos/python/onion/qweqweqweq.py", line 48, in <module>
checkurl(x)
File "/home/user/Videos/python/onion/qweqweqweq.py", line 23, in checkurl
page_open = urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 401, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 419, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 379, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1211, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1178, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 962, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 996, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 958, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 818, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 780, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 761, in connect
self.timeout, self.source_address)
File "/home/user/Videos/python/onion/qweqweqweq.py", line 5, in create_connection
sock.connect(address)
File "/usr/lib/python2.7/dist-packages/socks.py", line 369, in connect
self.__negotiatesocks5(destpair[0],destpair[1])
File "/usr/lib/python2.7/dist-packages/socks.py", line 236, in __negotiatesocks5
raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])])
TypeError: __init__() takes exactly 2 arguments (3 given)
You are catching HTTPError, but what is thrown is Socks5Error.
You're missing Socks5Error in your except clause. Look at the traceback:
raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])])
Note that this wouldn't have happened if you used requests instead of urllib2. The interface is a lot clearer, the documentation better.
In answer to "would it be possible to assume that the website is down regardless of the error", then this will do it (note that a bare except also catches things like KeyboardInterrupt; catching Exception instead is usually safer):
req = Request(link, headers = headers)
try:
page_open = urlopen(req)
except:
print "down"
else:
print 'up'

urllib error: cannot fetch html data. Python, Beagle Bone Black

I was making my project on mac and I tried to do the same things by Beagle Bone Black(BBB).
However, I couldn't use urllib on the BBB, so I am stuck and cannot go forward (it works fine on my Mac).
I tried this simple code as an example:
import urllib
conn = urllib.urlopen('http://stackoverflow.com/questions/8479736/using-python-urllib-how-to-avoid-non-html-content')
then this Error occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/urllib.py", line 86, in urlopen
return opener.open(url)
File "/usr/lib/python2.7/urllib.py", line 207, in open
return getattr(self, name)(url)
File "/usr/lib/python2.7/urllib.py", line 351, in open_http
'got a bad status line', None)
IOError: ('http protocol error', 0, 'got a bad status line', None)
I need to fetch a html data for my project.
How can I solve this problem? Do you have any ideas ?
Thank you.
When I tried urllib2
I got this:
>>> import urllib2
>>> conn = urllib2.urlopen('http://stackoverflow.com/questions/8479736/using-python-urllib-how-to-avoid-non-html-content')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1180, in do_open
r = h.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1030, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 407, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 371, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
Also I tried this:
curl http://stackoverflow.com/questions/8479736/using-python-urllib-how-to-avoid-non-html-content
curl: (52) Empty reply from server
and this:
wget http://stackoverflow.com/questions/8479736/using-python-urllib-how-to-avoid-non-html-content
Connecting to stackoverflow.com (198.252.206.16:80)
wget: error getting response
but they didn't work
At home, I also tried and failed, but it returned a different error:
conn = urllib2.urlopen('http://stackoverflow.com/questions/8479736/using-python-urllib-how-to-avoid-non-html-content')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1215, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1177, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno -2] Name or service not known>
environment
BBB: Linux beaglebone 3.8.13 #1 SMP Tue Jun 18 02:11:09 EDT 2013 armv7l GNU/Linux
python version: 2.7.3
I really want to recommend the requests library to you:
>>> r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
>>> r.status_code
200
>>> r.headers['content-type']
'application/json; charset=utf8'
>>> r.encoding
'utf-8'
>>> r.text
u'{"type":"User"...'
http://www.python-requests.org/en/latest/
How to install:
sudo pip install requests

Python urllib2 exception nonnumeric port: 'port/'

I wanted to execute this code I found on the python website:
#!/usr/bin/python
import urllib2
f = urllib2.urlopen('http://www.python.org/')
print f.read(100)
but the result was this:
Traceback (most recent call last):
File "WebServiceAusführen.py", line 5, in <module>
f = urllib2.urlopen('http://www.python.org/')
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1146, in do_open
h = http_class(host, timeout=req.timeout) # will parse host:port
File "/usr/lib/python2.7/httplib.py", line 693, in __init__
self._set_hostport(host, port)
File "/usr/lib/python2.7/httplib.py", line 721, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: 'port/'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/apport_python_hook.py", line 70, in apport_excepthook
binary = os.path.realpath(os.path.join(os.getcwdu(), sys.argv[0]))
File "/usr/lib/python2.7/posixpath.py", line 71, in join
path += '/' + b
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 15: ordinal not in range(128)
Original exception was:
Traceback (most recent call last):
File "WebServiceAusführen.py", line 5, in <module>
f = urllib2.urlopen('http://www.python.org/')
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1146, in do_open
h = http_class(host, timeout=req.timeout) # will parse host:port
File "/usr/lib/python2.7/httplib.py", line 693, in __init__
self._set_hostport(host, port)
File "/usr/lib/python2.7/httplib.py", line 721, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: 'port/'
help would be pretty awesome because I really don't know how to make such a big error in such a small program!
The exception is thrown because you have a faulty proxy configuration.
On POSIX systems, the library looks for *_proxy environment variables (upper and lowercase). For a HTTP URL, that's the http_proxy environment variable.
You can verify this by looking for proxy variables:
import os
print {k: v for k, v in os.environ.items() if k.lower().endswith('_proxy')}
Whatever value you have configured on your system is not a valid hostname and port combination and Python is choking on that.

getting http.client.BadStatusLine with urlopen(IP).read()

The data I am trying to read is in xml format. There is a single space before the xml declaration. I can not edit this part as it is hard coded into the data source. I can only read from it. When the url is entered in IE the data comes up. When entered in Chrome/Firefox, an error is shown but data can be viewed from view source.
Is there a way with python to either strip this space off or ignore it as IE seems to do?
(tried to add strip() in many places)
Or is there a way to default to the page source (I think urlopen does this already)?
Here is the line giving the error:
html = urlopen(address).read()
Here is the error:
Traceback (most recent call last):
File "C:\Users\212311674\Desktop\Python Work\M10url.py", line 27, in <module>
html = urlopen(address).read()
File "C:\Python33\lib\urllib\request.py", line 160, in urlopen
return opener.open(url, data, timeout)
File "C:\Python33\lib\urllib\request.py", line 473, in open
response = self._open(req, data)
File "C:\Python33\lib\urllib\request.py", line 491, in _open
'_open', req)
File "C:\Python33\lib\urllib\request.py", line 451, in _call_chain
result = func(*args)
File "C:\Python33\lib\urllib\request.py", line 1272, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Python33\lib\urllib\request.py", line 1257, in do_open
r = h.getresponse()
File "C:\Python33\lib\http\client.py", line 1131, in getresponse
response.begin()
File "C:\Python33\lib\http\client.py", line 354, in begin
version, status, reason = self._read_status()
File "C:\Python33\lib\http\client.py", line 336, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: <?xml version="1.0"?><controller_history_cnd>

Categories