Python BaseHTTPRequestHandler throws Lookup error on anything but utf-8 - python

I need to write a server program in Python that serves web pages and handles other GET and POST requests from clients. I'm new to servers in Python, so I looked up some examples and after a while I had a basic request handler running, with some routing to my pages as a start. Routing worked in the browser and the pages were displayed, but I only got text: no styles, no pictures. Then I looked a bit further and realised that I also needed to handle the GET requests for the .css, .js and .jpg files. So I did that, and ended up with something like this:
class Serv(BaseHTTPRequestHandler):
    # Request handler that maps a few friendly URLs onto files below
    # my_site/ and answers GET requests based on the file extension.
    def do_GET(self):
        #route incoming path to correct page
        if self.path in("","/"):
            self.path = "/my_site/index.html"
        #TODO do same for every page in the site
        if self.path == "/foo":
            self.path = "/my_site/fooandstuff.html"
        if self.path == "/bar":
            self.path = "/my_site/subdir/barfly.html"
        try:
            # Choose the Content-type from the extension. Any other
            # extension leaves sendReply False and nothing is sent back.
            sendReply = False
            if self.path.endswith(".html"):
                mimetype = "text/html"
                sendReply = True
            if self.path.endswith(".jpg"):
                mimetype = "image/jpg"
                sendReply = True
            if self.path.endswith(".js"):
                mimetype = "application/javascript"
                sendReply = True
            if self.path.endswith(".css"):
                mimetype = "text/css"
                sendReply = True
            if sendReply == True:
                # self.path[1:] drops the leading "/" so the file is looked
                # up relative to the working directory. No mode is given,
                # so the file is read as text and f is a str.
                f = open(self.path[1:]).read()
                self.send_response(200)
                self.send_header('Content-type',mimetype)
                self.end_headers()
                # This is the line the traceback points at: encode() is
                # handed the MIME type where a codec name is expected, hence
                # "LookupError: unknown encoding: text/html".
                self.wfile.write(f.encode(mimetype))
                return
        except IOError:
            # A missing or unreadable file is reported to the client as 404.
            self.send_error(404, "File not found %s" % self.path)
When I run this and request a page, I get the following LookupError:
File "d:/somedir/myfile.py", line 47, in do_GET
self.wfile.write(f.encode(mimetype))
LookupError: unknown encoding: text/html
If I change text/html to utf-8, that seems to "solve" the problem, but then I run into the same LookupError, this time for image/jpg, and so on. It seems like wfile.write only accepts utf-8, although, when I look around, I see people passing file.read() straight to wfile.write, like this:
wfile.write(file.read())
and for them it seems to work. Yet, when I do that, what I get is
File "C:\Users\myuser\AppData\Local\Programs\Python\Python37\lib\socketserver.py", line 799, in write
self._sock.sendall(b)
TypeError: a bytes-like object is required, not 'str'
What could cause this to happen?

For serving web pages with Python, it is easier to use Flask.
Sample code looks like this:
from flask import Flask, render_template, url_for, request, redirect
import csv
# The application object that the route decorators below register with.
app = Flask(__name__)


@app.route('/')
def my_home():
    """Serve the landing page, rendered from the index.html template."""
    return render_template('index.html')


@app.route('/<string:page_name>')
def html_page(page_name):
    """Serve any other page by rendering the template named in the URL.

    page_name is the single URL path segment, e.g. "about.html".
    """
    return render_template(page_name)
Put all HTML files in a folder called `templates`, in the same folder as your server.py,
and put all CSS, JavaScript and other assets in a folder called `static`. Don't forget to update the paths in your CSS, JavaScript and HTML files.

The answer was in the opening of an image file, that needed an extra argument "rb" , like this:
# JPEG data must be read as raw bytes; every other file type is still
# opened in the default text mode.
if mimetype == "image/jpg":
    f = open(self.path[1:], "rb")
else:
    f = open(self.path[1:])
and then also:
# wfile needs bytes: image data already is, text has to be encoded first.
payload = f.read()
if mimetype != "image/jpg":
    payload = payload.encode("utf-8")
self.wfile.write(payload)

Related

Python HTTP Server Serves Two Paths Using Different Kinds of Handlers

From other SO posts, it's clear how to serve content from a specific directory, and how to map an incoming path to different do_GET handlers.
To expand on the second question in a way relating to the first, how do you map paths to different kinds of handlers? Specifically, I'd like to map one path to do_GET handler, and another to just serving the content from a specific directory.
If it is not possible, what's the easier way to serve the two different kinds of contents? I know the two could be run on the server in two threads each serving a different port, that's not very neat.
I've got an answer by tracking the code from the first reference question answered by Jaymon, and incorporating the code from the second reference question.
The sample follows. It serves content on the local machine from the directory web/ to the URL base path /file/, and handles requests with URL base path /api in the user-supplied method do_GET() itself. Initially the code was derived from a sample on the web by Dr. Axel Rauschmayer.
#!/usr/bin/env python
# https://2ality.com/2014/06/simple-http-server.html
# https://stackoverflow.com/questions/39801718/how-to-run-a-http-server-which-serves-a-specific-path
from SimpleHTTPServer import SimpleHTTPRequestHandler
from BaseHTTPServer import HTTPServer as BaseHTTPServer
import os
PORT = 8000
class HTTPHandler(SimpleHTTPRequestHandler):
    """Answer /api requests inline and serve server.base_url_path from disk.

    Static files are resolved against server.base_local_path instead of the
    current working directory.
    """

    def translate_path(self, path):
        """Map a URL path below server.base_url_path to a local file path.

        Returns None when path does not start with server.base_url_path;
        do_GET only delegates to the static-file code for paths that do.
        """
        base_url_path = self.server.base_url_path
        if not path.startswith(base_url_path):
            return None
        # Strip the URL prefix; a bare prefix means the directory root.
        path = path[len(base_url_path):] or '/'
        # The parent class resolves against the working directory, so take
        # the part relative to it and re-root that under base_local_path.
        path = SimpleHTTPRequestHandler.translate_path(self, path)
        relpath = os.path.relpath(path, os.getcwd())
        return os.path.join(self.server.base_local_path, relpath)

    def do_GET(self):
        """Dispatch a GET request on the prefix of self.path.

        /api...            -> 200 with a fixed HTML greeting
        base_url_path...   -> static file handling of the parent class
        any other /path    -> status 441
        anything else      -> status 442

        The output stream is left open: the server framework flushes and
        closes it once the handler returns.
        """
        path = self.path
        if path.startswith('/api'):
            # call local handler
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            # Send the html message (a bytes literal, valid on Python 2 and 3)
            self.wfile.write(b"<b> Hello World !</b>")
        elif path.startswith(self.server.base_url_path):
            SimpleHTTPRequestHandler.do_GET(self)
        elif path.startswith('/'):
            self.send_response(441)
            self.end_headers()
        else:
            self.send_response(442)
            self.end_headers()
# Alias for the handler class, plus the MIME type to send for .webapp files.
Handler = HTTPHandler
Handler.extensions_map['.webapp'] = 'application/x-web-app-manifest+json'
class HTTPServer(BaseHTTPServer):
    """HTTP server that carries the URL-prefix/local-directory pair.

    base_local_path is the directory files are read from, base_url_path the
    URL prefix it is published under; both are stored on the server object.
    """

    def __init__(self, base_local_path, base_url_path, server_address, RequestHandlerClass=HTTPHandler):
        # Remember the mapping before the base class sets up the socket.
        self.base_url_path = base_url_path
        self.base_local_path = base_local_path
        BaseHTTPServer.__init__(self, server_address, RequestHandlerClass)
# Publish the "web" directory next to this script under the /file prefix.
web_dir = os.path.join(os.path.dirname(__file__), 'web')
httpd = HTTPServer(web_dir, '/file', ("", PORT))
# Parenthesised %-formatting prints the same line on Python 2 and Python 3.
print("Serving at port %s" % PORT)
httpd.serve_forever()

Python server can't access files with escaped characters in the URL

I have a python server for mp3 streaming. Using Twisted Matrix library.
If I try to access the file a.mp3 it works normally.
But this file, for example 衝.mp3 doesn't work, it says "File not found".
This file name as URL escaped is %E8%A1%9D.mp3 but it can't access it.
If I try to access it using unicode instead of the symbol, like this \u885d.mp3 it still says "File not found".
Here is the code, notice that I had to put request.path = request.path.replace('%20', ' ') because that's the only way it can access a file that has spaces in the path. That shouldn't be the normal behaviour I believe.
class playMP3(Resource):
    # Resource that answers every GET itself (isLeaf) and hands .mp3
    # requests to a File resource rooted at the working directory.
    isLeaf = True
    def render_GET(self, request):
        this=urlparse.urlparse(request.path)#scheme,netloc,path,query
        root,ext=os.path.splitext(this.path)
        filename=os.path.basename(request.path)
        fileFolder=request.path.replace(filename,"")
        self.serverRoot=os.getcwd()
        print (request.path)
        if ext==".mp3":
            # Only "%20" is undone here. Every other percent-escape, such as
            # %E8%A1%9D, stays in the path, so the file is looked up under
            # its escaped name.
            # NOTE(review): presumably the path needs full URL-unquoting and
            # decoding from UTF-8 before it reaches the filesystem - confirm.
            request.path = request.path.replace('%20', ' ')
            thisFile=File(self.serverRoot+request.path)
            return File.render_GET(thisFile,request)
# Publish the resource on TCP port 8880 and run the event loop.
resource = playMP3()
factory = Site(resource)
reactor.listenTCP(8880, factory)
reactor.run()
I also tried to put request.path = urllib.unquote(request.path) but instead of decoding it to 衝.mp3 it becomes ÞíØ.mp3. Weird.

How to access header information sent to http.server from an AJAX client using python 3+?

I am trying to read data sent to python's http.server from a local javascript program posted with AJAX. Everything works in python 2.7 as in this example, but now in python 3+ I can't access the header anymore to get the file length.
# python 2.7 (works!)
class handler_class(SimpleHTTPServer.SimpleHTTPRequestHandler):
    # Python 2 version of the handler: reads the POST body sent to /data.
    def do_POST(self):
        if self.path == '/data':
            # The header value arrives as text; convert it to a byte count.
            length = int(self.headers.getheader('Content-Length'))
            # Read that many bytes of request body.
            NewData = self.rfile.read(length)
I've discovered I could use urllib.request, as I have mocked up below. However, I am running on a localhost and don't have a full url as I've seen in the examples, and I am starting to second guess if this is even the right way to go? Frustratingly, I can see the content-length printed out in the console, but I can't access it.
# python 3+ (url?)
import urllib.request
class handler_class(http.server.SimpleHTTPRequestHandler):
    # Python 3 attempt at the same handler; the header lookup is the open
    # question.
    def do_POST(self):
        if self.path == '/data':
            print(self.headers) # I can see the content length here but cannot access it!
            # urlopen() needs an absolute URL (a bare '/data' gives the
            # "unknown url type" error quoted below), and `url` is not
            # defined in this snippet.
            # NOTE(review): presumably the length should be read from
            # self.headers itself, e.g. self.headers['Content-Length'] -
            # confirm against the http.server documentation.
            d = urllib.request.urlopen(url) # what url?
            length = int(d.getheader('Content-Length'))
            NewData = self.rfile.read(length)
Various url's I have tried are:
self.path
http://localhost:8000/data
/data
and I generally get this error:
ValueError: unknown url type: '/data'
So why is 'urllib.request' failing me and more importantly, how does one access 'self.header' in this Python3 world?

Why does SimpleHTTPServer redirect to ?querystring/ when I request ?querystring?

I like to use Python's SimpleHTTPServer for local development of all kinds of web applications which require loading resources via Ajax calls etc.
When I use query strings in my URLs, the server always redirects to the same URL with a slash appended.
For example /folder/?id=1 redirects to /folder/?id=1/ using a HTTP 301 response.
I simply start the server using python -m SimpleHTTPServer.
Any idea how I could get rid of the redirecting behaviour? This is Python 2.7.2.
The right way to do this, to ensure that the query parameters remain as they should, is to make sure you do a request to the filename directly instead of letting SimpleHTTPServer redirect to your index.html
For example, http://localhost:8000/?param1=1 triggers a redirect (301) and changes the URL to http://localhost:8000/?param1=1/, which corrupts the query parameter.
However http://localhost:8000/index.html?param1=1 (making the index file explicit) loads correctly.
So just not letting SimpleHTTPServer do a url redirection solves the problem.
Okay. With the help of Morten I've come up with this, which seems to be all I need: Simply ignoring the query strings if they are there and serving the static files.
import SimpleHTTPServer
import SocketServer
PORT = 8000
class CustomHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Static-file handler that ignores the query string of GET requests.

    The inherited constructor is used unchanged, so the class is created
    with the same (request, client_address, server) arguments as before.
    """

    def do_GET(self):
        """Drop any ?query part from self.path, then serve the file as usual."""
        # cut off a query string; with maxsplit=1 a path without '?' comes
        # back unchanged, so no membership test is needed first
        self.path = self.path.split('?', 1)[0]
        SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
class MyTCPServer(SocketServer.ThreadingTCPServer):
    # Threading TCP server with address reuse switched on at class level.
    # NOTE(review): presumably this is what lets the port be re-bound right
    # after a restart - confirm against the socketserver documentation.
    allow_reuse_address = True
if __name__ == '__main__':
    # MyTCPServer already sets allow_reuse_address on the class. Setting it
    # on the instance after construction would be too late, because the
    # constructor has already bound the socket.
    httpd = MyTCPServer(('localhost', PORT), CustomHandler)
    # Parenthesised %-formatting prints the same line on Python 2 and 3.
    print("Serving at port %s" % PORT)
    httpd.serve_forever()
I'm not sure how the redirect is generated... I've tried implementing a very basic SimpleHTTPServer, and I don't get any redirects when using query string params.
Just do something like self.path.split("/") and process the path before handling the request?
This code does what you want I think:
import SocketServer
import SimpleHTTPServer
import os
class CustomHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Dispatch /<action>/... URLs to methods; serve everything else statically."""

    # (url_prefix, root_directory) pairs consulted by get_static_content().
    # Empty by default, so every path resolves below the working directory.
    ROUTES = ()

    def folder(self):
        """Return the reply text for a /folder/?id=<fid> request.

        Reads self.uri, the list of path segments that do_GET stores.
        """
        fid = self.uri[-1].split("?id=")[-1].rstrip()
        return "FOLDER ID: %s" % fid

    def get_static_content(self):
        """Return the local filesystem path that self.path refers to.

        The first entry of ROUTES whose prefix matches selects the root
        directory; otherwise the working directory is the root. Query
        string, fragment, percent-escapes and "."/".." segments are removed
        so the result cannot leave the chosen root.
        """
        # Imported here so the method works on both Python 2 and Python 3.
        try:
            from urllib import unquote  # Python 2
        except ImportError:
            from urllib.parse import unquote  # Python 3
        import posixpath

        path = self.path
        # set default root to cwd
        root = os.getcwd()
        # look up routes and set root directory accordingly
        for pattern, rootdir in self.ROUTES:
            if path.startswith(pattern):
                # found match! consume path up to pattern len
                path = path[len(pattern):]
                root = rootdir
                break
        # normalize path and prepend root directory
        path = path.split('?', 1)[0]
        path = path.split('#', 1)[0]
        path = posixpath.normpath(unquote(path))
        for word in filter(None, path.split('/')):
            # Keep only the final component of each segment, so a drive
            # letter or embedded separator cannot redirect the join.
            drive, word = os.path.splitdrive(word)
            head, word = os.path.split(word)
            if word in (os.curdir, os.pardir):
                continue
            root = os.path.join(root, word)
        return root

    def do_GET(self):
        """Run the action named by the first path segment, else serve a file.

        A known action is answered with 200 and the text it returns. Every
        other request, including "/", goes to the static-file handling of
        the parent class so the client always gets a response.
        """
        self.uri = self.path.split("/")[1:]
        actions = {
            "folder": self.folder,
        }
        # A request path without a leading "/" yields no segments at all.
        resource = self.uri[0] if self.uri else ""
        action = actions.get(resource)
        if action is None:
            SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
            return
        print("action from looking up '%s' is: %s" % (resource, action))
        body = action()
        # wfile takes bytes; Python 2 str already is, Python 3 str is not.
        if not isinstance(body, bytes):
            body = body.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
class MyTCPServer(SocketServer.ThreadingTCPServer):
    # Threading TCP server with address reuse switched on at class level.
    # NOTE(review): presumably this is what lets the port be re-bound right
    # after a restart - confirm against the socketserver documentation.
    allow_reuse_address = True
# MyTCPServer already sets allow_reuse_address on the class. Setting it on
# the instance after construction would be too late, because the constructor
# has already bound the socket.
httpd = MyTCPServer(('localhost', 8080), CustomHandler)
# Parenthesised %-formatting prints the same line on Python 2 and Python 3.
print("serving at port %s" % 8080)
httpd.serve_forever()
Try it out:
HTTP GET /folder/?id=500x -> "FOLDER ID: 500x"
EDIT:
Okay so if you haven't used the SimpleHTTPServer-stuff before, you basically implement the base request handler, implement do_GET(), do_PUT(), do_POST() etc.
What I usually do then is parse the request string (using re), pattern match and see if I can find a request-handler and if not, handle the request as a request for static content if possible.
You say you want to serve static content if at all possible, then you should flip this pattern matching around, and FIRST see if the request matches the file-store and if not, then match against handlers :)

Python test for a url and image type

In the following code how to test for if the type is url or if the type is an image
# Pseudocode: the two conditions are written in plain English and are not
# valid Python; they only describe the checks being asked about.
for dictionaries in d_dict:
    type = dictionaries.get('type')
    if (type starts with http or https):
        logging.debug("type is url")
    else if type ends with .jpg or .png or .gif
        logging.debug("type is image")
    else:
        logging.debug("invalid type")
You cannot tell what type a resource is purely from its URL. It is perfectly valid to have a GIF file at a URL without a .gif file extension, or with a misleading file extension like .txt. In fact it is quite likely, now that URL-rewriting is popular, that you'll get image URLs with no file extension at all.
It is the Content-Type HTTP response header that governs what type a resource on the web is, so the only way you can find out for sure is to fetch the resource and see what response you get. You can do this by looking at the headers returned by urllib.urlopen(url).headers, but that actually fetches the file itself. For efficiency you may prefer to make a HEAD request, which doesn't transfer the whole file:
import urllib2
class HeadRequest(urllib2.Request):
    # Request subclass whose HTTP method is always reported as HEAD.
    def get_method(self):
        return 'HEAD'
response = urllib2.urlopen(HeadRequest(url))
# A server may omit Content-Type altogether; treat that as an unknown type
# instead of letting the lookup raise KeyError. Parameters such as
# "; charset=..." and surrounding whitespace are dropped.
maintype = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
if maintype not in ('image/png', 'image/jpeg', 'image/gif'):
    logging.debug('invalid type')
If you must try to sniff type based on the file extension in a URL path part (eg because you don't have a net connection), you should parse the URL with urlparse first to remove any ?query or #fragment part, so that http://www.example.com/image.png?blah=blah&foo=.txt doesn't confuse it. Also you should consider using mimetypes to map the filename to a Content-Type, so you can take advantage of its knowledge of file extensions:
import urlparse, mimetypes
# Guess from the path component only, so a ?query or #fragment cannot pose
# as the file extension.
url_path = urlparse.urlparse(url).path
maintype, _encoding = mimetypes.guess_type(url_path)
if maintype not in ('image/png', 'image/jpeg', 'image/gif'):
    logging.debug('invalid type')
(eg. so that alternative extensions are also allowed. You should at the very least allow .jpeg for image/jpeg files, as well as the mutant three-letter Windows variant .jpg.)
Use regular expressions.
import re
# Compiled once, outside the loop.
r_url = re.compile(r"^https?:")
r_image = re.compile(r".*\.(jpg|png|gif)$")
for dictionaries in d_dict:
    # Trailing underscore avoids shadowing the builtin. A missing 'type' key
    # gives None, which the regex cannot match, so fall back to "".
    type_ = dictionaries.get('type') or ""
    if r_url.match(type_):
        logging.debug("type is url")
    elif r_image.match(type_):
        logging.debug("type is image")
    else:
        logging.debug("invalid type")
Two remarks: type is a builtin, and images could be loaded from an URL too.
Based on the previous comments, I wrote a Python script that first checks the Content-Type with a HEAD request and, if that fails, falls back to guessing the MIME type from the URL.
Hope this helps.
import mimetypes
import urllib2
class HeadRequest(urllib2.Request):
    # Request subclass whose HTTP method is always reported as HEAD.
    def get_method(self):
        return 'HEAD'
def get_contenttype(image_url):
    """Return the Content-Type a HEAD request reports for image_url.

    The value is lower-cased and stripped of parameters such as
    "; charset=...". Returns None when the request fails or the response
    carries no Content-Type header, so the caller can fall back to another
    way of guessing.
    """
    try:
        response = urllib2.urlopen(HeadRequest(image_url))
    except urllib2.URLError as e:
        # URLError is the base class of HTTPError, so this covers HTTP error
        # statuses as well as connection-level failures.
        print(e)
        return None
    try:
        content_type = response.headers.get('Content-Type')
    finally:
        response.close()
    if not content_type:
        return None
    return content_type.split(';')[0].strip().lower()
def get_mimetype(image_url):
    """Guess the MIME type of image_url from its file extension.

    Only the path component is examined, so a ?query or #fragment cannot
    hide or fake the extension. Returns None when no type can be guessed.
    """
    # Imported here so the function works on both Python 2 and Python 3.
    try:
        from urlparse import urlparse  # Python 2
    except ImportError:
        from urllib.parse import urlparse  # Python 3
    mimetype, _encoding = mimetypes.guess_type(urlparse(image_url).path)
    return mimetype
def get_extension_from_type(type_string):
    """Return the subtype part of a MIME type string, e.g. "png".

    "image/png" gives "png"; a string without "/" is returned unchanged.
    Anything that is not a string gives None.
    """
    # `unicode` only exists on Python 2; fall back to str alone elsewhere.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)
    if not isinstance(type_string, text_types):
        return None
    parts = type_string.split('/')
    # split() always yields at least one element, so parts[0] is safe.
    return parts[1] if len(parts) >= 2 else parts[0]
def get_type(image_url):
    """Return the image subtype for image_url, or None if it is no known image.

    The Content-Type from a HEAD request is tried first and the
    extension-based guess second; the first answer that is one of the
    accepted image types wins.
    """
    valid_types = ('image/png', 'image/jpeg', 'image/gif', 'image/jpg')
    # Evaluated lazily in order: the second lookup only runs when the first
    # one did not produce an accepted type.
    for lookup in (get_contenttype, get_mimetype):
        found = lookup(image_url)
        if found in valid_types:
            return get_extension_from_type(found)
    return None
If you are going to guess the type of a resource from its URL, then I suggest you use the mimetypes library. Realize, however, that you can really only make an educated guess this way.
As bobince suggests, you could also make a HEAD request and use the Content-Type header. This, however, assumes that the server is configured (or, in the case of a web application, programmed) to return the correct Content-Type. It might not be.
So, the only way to really tell is to download the file and use something like libmagic (although it is conceivable even that could fail). If you decide this degree of accuracy is necessary, you might be interested in this python binding for libmagic.

Categories