Python urlparse.parse_qs unicode url - python

urlparse.parse_qs is useful for parsing URL parameters, and it works fine with a simple ASCII URL represented by a str. So I can parse a query and then reconstruct the same path from the parsed data using urllib.urlencode:
>>> import urlparse
>>> import urllib
>>>
>>> path = '/?key=value' #path is str
>>> query = urlparse.urlparse(path).query
>>> query
'key=value'
>>> query_dict = urlparse.parse_qs(query)
>>> query_dict
{'key': ['value']}
>>> '/?' + urllib.urlencode(query_dict, doseq=True)
'/?key=value' # <-- path is the same here
It also works fine when the URL contains a percent-encoded non-ASCII parameter:
>>> value = urllib.quote(u'значение'.encode('utf8'))
>>> value
'%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5'
>>> path = '/?key=%s' % value
>>> path
'/?key=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5'
>>> query = urlparse.urlparse(path).query
>>> query
'key=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5'
>>> query_dict = urlparse.parse_qs(query)
>>> query_dict
{'key': ['\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0\xb5']}
>>> '/?' + urllib.urlencode(query_dict, doseq=True)
'/?key=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5' # <-- path is the same here
But when using Django, I get the URL from request.get_full_path(), and it returns the path as a unicode string:
>>> path = request.get_full_path()
>>> path
u'/?key=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5' # path is unicode
Look what will happen now:
>>> query = urlparse.urlparse(path).query
>>> query
u'key=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5'
>>> query_dict = urlparse.parse_qs(query)
>>> query_dict
{u'key': [u'\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0\xb5']}
>>>
query_dict now contains a unicode string whose characters are the raw UTF-8 byte values, not the decoded Unicode code points!
And of course I get a UnicodeEncodeError when trying to urlencode that string:
>>> urllib.urlencode(query_dict, doseq=True)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\Lib\urllib.py", line 1337, in urlencode
l.append(k + '=' + quote_plus(str(elt)))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-15: ordinal not in range(128)
Currently I have a workaround:
# just convert path, returned by request.get_full_path(), to `str` explicitly:
path = str(request.get_full_path())
So the questions are:
Why does parse_qs return such a strange string (a unicode object that contains bytes)?
Is it safe to convert the URL to str?

Encode back to bytes before passing it to .parse_qs(), using ASCII:
query_dict = urlparse.parse_qs(query.encode('ASCII'))
This does the same thing as str() but with an explicit encoding. Yes, this is safe, the URL encoding uses ASCII codepoints only.
parse_qs was handed a unicode value, so it returned a unicode value too; it is not its job to decode bytes.

Related

Implementing SHA1-HMAC with Python

I am implementing SHA1-HMAC generation for python (v 3.7) to be able to create HMAC code.
I have used an online generator to create SHA1-HMAC with the following data:
string: '123'
Secret Key: 'secret'
Digest algorithm: SHA1
I am getting this result:
b14e92eb17f6b78ec5a205ee0e1ab220fb7f86d7
However when I try to do this same with Python I am getting different results which are wrong.
import hashlib
import hmac
import base64
def make_digest(message, key):
    """Return the URL-safe Base64 text of the HMAC-SHA1 of *message* under *key*.

    Both arguments are text; each is encoded as UTF-8 before hashing.  The
    return value is the Base64 encoding of the raw digest bytes, which is a
    different textual form from the hexadecimal digest.
    """
    key = bytes(key, 'UTF-8')
    message = bytes(message, 'UTF-8')
    digester = hmac.new(key, message, hashlib.sha1)
    # digest() yields the raw bytes; hexdigest() would yield the hex text.
    signature1 = digester.digest()
    signature2 = base64.urlsafe_b64encode(signature1)
    return str(signature2, 'UTF-8')
result = make_digest('123', 'secret')
print(result)
This code gives result:
sU6S6xf2t47FogXuDhqyIPt_htc=
What could be wrong with this code?
You should not use Base64 here. The site you link to gives you the hex values of the digest bytes. Use the HMAC.hexdigest() method to get the same value in hex in Python:
>>> key = b'secret'
>>> message = b'123'
>>> digester = hmac.new(key, message, hashlib.sha1)
>>> digester.hexdigest()
'b14e92eb17f6b78ec5a205ee0e1ab220fb7f86d7'
Put differently, your code outputs the correct value, but as Base64-encoded data:
>>> digester.digest()
b'\xb1N\x92\xeb\x17\xf6\xb7\x8e\xc5\xa2\x05\xee\x0e\x1a\xb2 \xfb\x7f\x86\xd7'
>>> base64.urlsafe_b64encode(digester.digest())
b'sU6S6xf2t47FogXuDhqyIPt_htc='
and the value you generated online contains the exact same bytes as the hex digest, so we can generate the same base64 output for that:
>>> bytes.fromhex('b14e92eb17f6b78ec5a205ee0e1ab220fb7f86d7')
b'\xb1N\x92\xeb\x17\xf6\xb7\x8e\xc5\xa2\x05\xee\x0e\x1a\xb2 \xfb\x7f\x86\xd7'
>>> base64.urlsafe_b64encode(bytes.fromhex('b14e92eb17f6b78ec5a205ee0e1ab220fb7f86d7'))
b'sU6S6xf2t47FogXuDhqyIPt_htc='

Python script won't work on Autokey

I'm trying to make a html entities encoder/decoder on Python that behaves similar to PHP's htmlentities and html_entity_decode, it works normally as a standalone script:
My input:
Lorem ÁÉÍÓÚÇÃOÁáéíóúção ##$%*()[]<>+ 0123456789
python decode.py
Output:
Lorem ÁÉÍÓÚÇÃOÁáéíóúção ##$%*()[]<>+ 0123456789
Now if I run it as an Autokey script I get this error:
Script name: 'html_entity_decode'
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/autokey/service.py", line 454, in execute
exec script.code in scope
File "<string>", line 40, in <module>
File "/usr/local/lib/python2.7/dist-packages/autokey/scripting.py", line 42, in send_keys
self.mediator.send_string(keyString.decode("utf-8"))
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 6-12: ordinal not in range(128)
What am I doing wrong? Here's the script:
import htmlentitydefs
import re
entity_re = re.compile(r'&(%s|#(\d{1,5}|[xX]([\da-fA-F]{1,4})));' % '|'.join(
htmlentitydefs.name2codepoint.keys()))
def html_entity_decode(s, encoding='utf-8'):
    """Replace the HTML entities matched by ``entity_re`` in *s* with characters.

    Named entities, decimal references (``&#NNN;``) and hexadecimal
    references (``&#xHH;``) are handled.  The result has the same string type
    as *s*: for a unicode *s* each entity becomes a unicode character; for a
    byte string each replacement character is encoded with *encoding*, falling
    back to an XML character reference when the codec cannot represent it.

    Raises TypeError when *s* is not a string.
    """
    if not isinstance(s, basestring):
        raise TypeError('argument 1: expected string, %s found' \
                        % s.__class__.__name__)

    def entity_2_unichr(matchobj):
        # g1: entity name or whole '#...' reference, g2: decimal digits or
        # 'x' + hex digits, g3: hex digits only.
        g1, g2, g3 = matchobj.groups()
        if g3 is not None:
            codepoint = int(g3, 16)
        elif g2 is not None:
            codepoint = int(g2)
        else:
            codepoint = htmlentitydefs.name2codepoint[g1]
        return unichr(codepoint)

    if isinstance(s, unicode):
        entity_2_chr = entity_2_unichr
    else:
        entity_2_chr = lambda o: entity_2_unichr(o).encode(encoding,
                                                           'xmlcharrefreplace')

    def silent_entity_replace(matchobj):
        # Leave the entity text untouched when the conversion raises ValueError.
        try:
            return entity_2_chr(matchobj)
        except ValueError:
            return matchobj.group(0)

    return entity_re.sub(silent_entity_replace, s)
text = clipboard.get_selection()
text = html_entity_decode(text)
keyboard.send_keys("%s" % text)
I found it on a Gist https://gist.github.com/607454, I'm not the author.
Looking at the backtrace, the likely problem is that you are passing a unicode string to keyboard.send_keys, which expects a UTF-8 encoded bytestring. Autokey then tries to decode your string, which fails because the input is unicode instead of UTF-8 bytes. This looks like a bug in Autokey: it should not try to decode strings unless they really are plain byte strings.
If this guess is correct, you should be able to work around this by making sure you pass a UTF-8 encoded bytestring (not a unicode instance) to send_keys. Try something like this:
text = clipboard.get_selection()
if isinstance(text, unicode):
    # send_keys() calls .decode("utf-8") on its argument, so give it bytes.
    text = text.encode('utf-8')
text = html_entity_decode(text)
assert isinstance(text, str)
keyboard.send_keys(text)
The assert is not needed but is a handy sanity check to make sure html_entity_decode does the right thing.
The problem is that the output of:
clipboard.get_selection()
is a unicode string.
To solve the problem, replace:
text = clipboard.get_selection()
by:
text = clipboard.get_selection().encode("utf8")

How to pass Unicode string as argument to urllib.urlencode()

I'm using Microsoft's free translation service to translate some Hindi characters to English. They don't provide an API for Python, but I borrowed code from: tinyurl.com/dxh6thr
I'm trying to use the 'Detect' method as described here: tinyurl.com/bxkt3we
The 'hindi.txt' file is saved in unicode charset.
>>> hindi_string = open('hindi.txt').read()
>>> data = { 'text' : hindi_string }
>>> token = msmt.get_access_token(MY_USERID, MY_TOKEN)
>>> request = urllib2.Request('http://api.microsofttranslator.com/v2/Http.svc/Detect?'+urllib.urlencode(data))
>>> request.add_header('Authorization', 'Bearer '+token)
>>> response = urllib2.urlopen(request)
>>> print response.read()
<string xmlns="http://schemas.microsoft.com/2003/10/Serialization/">en</string>
>>>
The response shows that the Translator detected 'en', instead of 'hi' (for Hindi). When I check the encoding, it shows as 'string':
>>> type(hindi_string)
<type 'str'>
For reference, here is content of 'hindi.txt':
हाय, कैसे आप आज कर रहे हैं। मैं अच्छी तरह से, आपको धन्यवाद कर रहा हूँ।
I'm not sure if using string.encode or string.decode applies here. If it does, what do I need to encode/decode from/to? What is the best method to pass a Unicode string as a urllib.urlencode argument? How can I ensure that the actual Hindi characters are passed as the argument?
Thank you.
** Additional Information **
I tried using codecs.open() as suggested, but I get the following error:
>>> hindi_new = codecs.open('hindi.txt', encoding='utf-8').read()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\lib\codecs.py", line 671, in read
return self.reader.read(size)
File "C:\Python27\lib\codecs.py", line 477, in read
newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
Here is the repr(hindi_string) output:
>>> repr(hindi_string)
"'\\xff\\xfe9\\t>\\t/\\t,\\x00 \\x00\\x15\\tH\\t8\\tG\\t \\x00\\x06\\t*\\t \\x00
\\x06\\t\\x1c\\t \\x00\\x15\\t0\\t \\x000\\t9\\tG\\t \\x009\\tH\\t\\x02\\td\\t \
\x00.\\tH\\t\\x02\\t \\x00\\x05\\t'"
Your file is utf-16, so you need to decode the content before sending it:
hindi_string = open('hindi.txt').read().decode('utf-16')
data = { 'text' : hindi_string.encode('utf-8') }
...
You could try opening the file using codecs.open and decode it with utf-8:
import codecs
# NOTE: the encoding given here must match the file's real encoding.
with codecs.open('hindi.txt', encoding='utf-8') as f:
    hindi_text = f.read()

How to fix this UnicodeDecodeError that appears when I try to remove accents in Python strings?

I'm trying to use this function:
import unicodedata
def remove_accents(input_str):
    """Return *input_str* as unicode with combining marks removed.

    The text is NFKD-normalized and every combining character is then
    dropped.
    """
    # unicode() is called without a codec here.
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
in the code below (which unzips and reads files with non-ASCII strings). But I'm getting this error, (from this library file C:\Python27\Lib\encodings\utf_8.py):
Message File Name Line Position
Traceback
<module> C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 64
getNameList C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 26
remove_accents C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 17
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 3: ordinal not in range(128)
Why am I getting this error? How to avoid it and make remove_accents work?
Thanks for any help!
Here's the entire code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import re
from zipfile import ZipFile
import csv
##def strip_accents(s):
## return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
import unicodedata
def remove_accents(input_str):
    """Return *input_str* as unicode with combining marks removed.

    The text is NFKD-normalized and every combining character is then
    dropped.
    """
    # unicode() is called without a codec here.
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getNameList():
    """Split the names from extractNamesDict() into male and female lists.

    Returns a tuple ``(maleNames, femaleNames)``.  Each list holds
    ``(name, counts[0], counts[1])`` tuples with the accent-stripped name.
    A name goes to the first list when ``counts[0]`` is larger, to the second
    when ``counts[1]`` is larger, and is omitted when the two are equal.
    """
    namesDict = extractNamesDict()
    maleNames = list()
    femaleNames = list()
    for name, counts in namesDict.items():
        print(name)
        # The counts are taken under the original key: the accent-stripped
        # name is not guaranteed to be a key of namesDict.
        name = remove_accents(name)
        entry = (name, counts[0], counts[1])
        if counts[0] > counts[1]:
            maleNames.append(entry)
        elif counts[1] > counts[0]:
            femaleNames.append(entry)
    names = (maleNames, femaleNames)
    return names
def extractNamesDict():
    """Read every CSV member of ``names.zip`` and total the counts per name.

    Each row is read as ``name, gender, count`` with gender ``'M'`` or ``'F'``.
    Returns a dict mapping the upper-cased name to a two-item list
    ``[count for 'M', count for 'F']``.
    """
    names = dict()
    genderMap = {'M': 0, 'F': 1}
    # The with-blocks close the archive and each member even when a row fails.
    with ZipFile('names.zip', 'r') as zf:
        for filename in zf.namelist():
            with zf.open(filename, 'r') as member:
                rows = csv.reader(member, delimiter=',')
                for row in rows:
                    name = row[0].upper()
                    gender = genderMap[row[1]]
                    count = int(row[2])
                    if name not in names:
                        names[name] = [0, 0]
                    names[name][gender] = names[name][gender] + count
    return names
# Run the loader only when this file is executed as a script.
if __name__ == "__main__":
    getNameList()
Best practice is to decode to Unicode when the data comes into your program:
for row in rows:
    # Decode first so that upper() works on unicode text rather than on the
    # encoded bytes.  'utf8' or whatever...you DO need to know the encoding.
    name = row[0].decode('utf8').upper()
Then remove_accents can just be:
def remove_accents(input_str):
    """Return the unicode string *input_str* with combining marks removed.

    The text is NFKD-normalized and every combining character is then
    dropped.  *input_str* must already be unicode; no decoding is done here.
    """
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u''.join(c for c in nkfd_form if not unicodedata.combining(c))
Encode data when leaving your program such as writing to a file, database, terminal, etc.
Why remove accents in the first place?
If you want to robustly convert unicode characters to ascii in a string, you should use the awesome unidecode module:
>>> import unidecode
>>> unidecode.unidecode(u'Björk')
'Bjork'
>>> unidecode.unidecode(u'András Sütő')
'Andras Suto'
>>> unidecode.unidecode(u'Ελλάς')
'Ellas'
You get it because you are decoding from a bytestring without specifying a codec:
unicode(input_str)
Add a codec there (here I assume your data is encoded in utf-8, 0xe1 would be the first of a 3-byte character):
unicode(input_str, 'utf8')

Best way to convert a Unicode URL to ASCII (UTF-8 percent-escaped) in Python?

I'm wondering what's the best way -- or if there's a simple way with the standard library -- to convert a URL with Unicode chars in the domain name and path to the equivalent ASCII URL, encoded with domain as IDNA and the path %-encoded, as per RFC 3986.
I get from the user a URL in UTF-8. So if they've typed in http://➡.ws/♥ I get 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' in Python. And what I want out is the ASCII version: 'http://xn--hgi.ws/%E2%99%A5'.
What I do at the moment is split the URL up into parts via a regex, and then manually IDNA-encode the domain, and separately encode the path and query string with different urllib.quote() calls.
# url is UTF-8 here, eg: url = u'http://➡.ws/㉌'.encode('utf-8')
match = re.match(r'([a-z]{3,5})://(.+\.[a-z0-9]{1,6})'
r'(:\d{1,5})?(/.*?)(\?.*)?$', url, flags=re.I)
if not match:
raise BadURLException(url)
protocol, domain, port, path, query = match.groups()
try:
domain = unicode(domain, 'utf-8')
except UnicodeDecodeError:
return '' # bad UTF-8 chars in domain
domain = domain.encode('idna')
if port is None:
port = ''
path = urllib.quote(path)
if query is None:
query = ''
else:
query = urllib.quote(query, safe='=&?/')
url = protocol + '://' + domain + port + path + query
# url is ASCII here, eg: url = 'http://xn--hgi.ws/%E3%89%8C'
Is this correct? Any better suggestions? Is there a simple standard-library function to do this?
Code:
import urlparse, urllib
def fixurl(url):
# turn string into unicode
if not isinstance(url,unicode):
url = url.decode('utf8')
# parse it
parsed = urlparse.urlsplit(url)
# divide the netloc further
userpass,at,hostport = parsed.netloc.rpartition('#')
user,colon1,pass_ = userpass.partition(':')
host,colon2,port = hostport.partition(':')
# encode each component
scheme = parsed.scheme.encode('utf8')
user = urllib.quote(user.encode('utf8'))
colon1 = colon1.encode('utf8')
pass_ = urllib.quote(pass_.encode('utf8'))
at = at.encode('utf8')
host = host.encode('idna')
colon2 = colon2.encode('utf8')
port = port.encode('utf8')
path = '/'.join( # could be encoded slashes!
urllib.quote(urllib.unquote(pce).encode('utf8'),'')
for pce in parsed.path.split('/')
)
query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'),'=&?/')
fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))
# put it back together
netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
return urlparse.urlunsplit((scheme,netloc,path,query,fragment))
print fixurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
print fixurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/%2F')
print fixurl(u'http://Åsa:abc123#➡.ws:81/admin')
print fixurl(u'http://➡.ws/admin')
Output:
http://xn--hgi.ws/%E2%99%A5
http://xn--hgi.ws/%E2%99%A5/%2F
http://%C3%85sa:abc123@xn--hgi.ws:81/admin
http://xn--hgi.ws/admin
Read more:
urllib.quote()
urlparse.urlparse()
urlparse.urlunparse()
urlparse.urlsplit()
urlparse.urlunsplit()
Edits:
Fixed the case of already quoted characters in the string.
Changed urlparse/urlunparse to urlsplit/urlunsplit.
Don't encode user and port information with the hostname. (Thanks Jehiah)
When "@" is missing, don't treat the host/port as user/pass! (Thanks hupf)
The code given by MizardX isn't 100% correct. This example won't work:
example.com/folder/?page=2
Check out django.utils.encoding.iri_to_uri() to convert Unicode URLs to ASCII URLs.
http://docs.djangoproject.com/en/dev/ref/unicode/
There's some RFC 3986 URL parsing work underway (e.g. as part of the Summer of Code), but nothing in the standard library yet AFAIK -- and nothing much on the URI encoding side of things either, again AFAIK. So you might as well go with MizardX's elegant approach.
Okay, with these comments and some bug-fixing in my own code (it didn't handle fragments at all), I've come up with the following canonurl() function -- returns a canonical, ASCII form of the URL:
import re
import urllib
import urlparse
def canonurl(url):
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
if the URL looks invalid.
>>> canonurl(' ')
''
>>> canonurl('www.google.com')
'http://www.google.com/'
>>> canonurl('bad-utf8.com/path\xff/file')
''
>>> canonurl('svn://blah.com/path/file')
'svn://blah.com/path/file'
>>> canonurl('1234://badscheme.com')
''
>>> canonurl('bad$scheme://google.com')
''
>>> canonurl('site.badtopleveldomain')
''
>>> canonurl('site.com:badport')
''
>>> canonurl('http://123.24.8.240/blah')
'http://123.24.8.240/blah'
>>> canonurl('http://123.24.8.240:1234/blah?q#f')
'http://123.24.8.240:1234/blah?q#f'
>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
'http://xn--hgi.ws/'
>>> canonurl(' http://www.google.com:80/path/file;params?query#fragment ')
'http://www.google.com:80/path/file;params?query#fragment'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
'http://xn--hgi.ws/%E2%99%A5'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth')
'http://xn--hgi.ws/%E2%99%A5/pa/th'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth;par%2Fams?que%2Fry=a&b=c')
'http://xn--hgi.ws/%E2%99%A5/pa/th;par/ams?que/ry=a&b=c'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5?\xe2\x99\xa5#\xe2\x99\xa5')
'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
>>> canonurl('http://\xe2\x9e\xa1.ws/%e2%99%a5?%E2%99%A5#%E2%99%A5')
'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
>>> canonurl('http://badutf8pcokay.com/%FF?%FE#%FF')
'http://badutf8pcokay.com/%FF?%FE#%FF'
>>> len(canonurl('google.com/' + 'a' * 16384))
4096
"""
# strip spaces at the ends and ensure it's prefixed with 'scheme://'
url = url.strip()
if not url:
return ''
if not urlparse.urlsplit(url).scheme:
url = 'http://' + url
# turn it into Unicode
try:
url = unicode(url, 'utf-8')
except UnicodeDecodeError:
return '' # bad UTF-8 chars in URL
# parse the URL into its components
parsed = urlparse.urlsplit(url)
scheme, netloc, path, query, fragment = parsed
# ensure scheme is a letter followed by letters, digits, and '+-.' chars
if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
return ''
scheme = str(scheme)
# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
if not match:
return ''
domain, port = match.groups()
netloc = domain + (port if port else '')
netloc = netloc.encode('idna')
# ensure path is valid and convert Unicode chars to %-encoded
if not path:
path = '/' # eg: 'http://google.com' -> 'http://google.com/'
path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
# ensure query is valid
query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
# ensure fragment is valid
fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
# piece it all back together, truncating it to a maximum of 4KB
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
return url[:4096]
if __name__ == '__main__':
import doctest
doctest.testmod()
You might use urlparse.urlsplit instead, but otherwise you seem to have a very straightforward solution, there.
protocol, domain, path, query, fragment = urlparse.urlsplit(url)
(You can access the domain and port separately by accessing the returned value's named properties, but as port syntax is always in ASCII it is unaffected by the IDNA encoding process.)

Categories