how can i find out the uploaded file name in python cgi - python

i made simple web server like below.
# NOTE: Python 2 code -- BaseHTTPServer was merged into http.server in Python 3.
import BaseHTTPServer, os, cgi
import cgitb; cgitb.enable()  # render tracebacks as HTML pages in the browser

# Minimal upload form; enctype=multipart/form-data is required for file uploads.
html = """
<html>
<body>
<form action="" method="POST" enctype="multipart/form-data">
File upload: <input type="file" name="upfile">
<input type="submit" value="upload">
</form>
</body>
</html>
"""

class Handler(BaseHTTPServer.BaseHTTPRequestHandler):
    def do_GET(self):
        # Serve the upload form for every GET request.
        self.send_response(200)
        self.send_header("content-type", "text/html;charset=utf-8")
        self.end_headers()
        self.wfile.write(html)

    def do_POST(self):
        # cgi.parse_multipart() returns only the field VALUES, not the
        # client-side file names -- which is exactly the questioner's problem.
        ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
        if ctype == 'multipart/form-data':
            query = cgi.parse_multipart(self.rfile, pdict)
            upfilecontent = query.get('upfile')
            if upfilecontent:
                # i don't know how to get the file name.. so i named it 'tmp.dat'
                fout = file(os.path.join('tmp', 'tmp.dat'), 'wb')
                fout.write (upfilecontent[0])
                fout.close()
        # Re-render the form after handling the upload.
        self.do_GET()

if __name__ == '__main__':
    server = BaseHTTPServer.HTTPServer(("127.0.0.1", 8080), Handler)
    print('web server on 8080..')
    server.serve_forever()
In the do_Post method of BaseHTTPRequestHandler, i got the uploaded file data successfully.
But i can't figure out how to get the original name of the uploaded file.
self.rfile.name is just a 'socket'
How can i get the uploaded file name?

Pretty broken code you're using there as a starting point (e.g. look at that global rootnode where name rootnode is used nowhere -- clearly half-edited source, and badly at that).
Anyway, what form are you using "client-side" for the POST? How does it set that upfile field?
Why aren't you using the normal FieldStorage approach, as documented in Python's docs? That way, you could use the .file attribute of the appropriate field to get a file-like object to read, or its .value attribute to read it all in memory and get it as a string, plus the .filename attribute of the field to know the uploaded file's name. More detailed, though concise, docs on FieldStorage, are here.
Edit: now that the OP has edited the Q to clarify, I see the problem: BaseHTTPServer does not set the environment according to the CGI specs, so the cgi module isn't very usable with it. Unfortunately the only simple approach to environment setting is to steal and hack a big piece of code from CGIHTTPServer.py (it wasn't intended for reuse, whence the need for, sigh, copy-and-paste coding), e.g....:
def populenv(self):
    """Populate os.environ with CGI-style variables for the current request.

    Hacked down from CGIHTTPServer.run_cgi() so that the cgi module
    (cgi.FieldStorage) can be used under a plain BaseHTTPServer handler.
    The 'ciao' placeholders stand in for script name/path, which are
    irrelevant here -- only the header-derived variables matter.
    """
    path = self.path
    dir, rest = '.', 'ciao'
    # find an explicit query string, if present.
    i = rest.rfind('?')
    if i >= 0:
        rest, query = rest[:i], rest[i+1:]
    else:
        query = ''
    # dissect the part after the directory name into a script name &
    # a possible additional path, to be stored in PATH_INFO.
    i = rest.find('/')
    if i >= 0:
        script, rest = rest[:i], rest[i:]
    else:
        script, rest = rest, ''
    # Reference: http://hoohoo.ncsa.uiuc.edu/cgi/env.html
    # XXX Much of the following could be prepared ahead of time!
    env = {}
    env['SERVER_SOFTWARE'] = self.version_string()
    env['SERVER_NAME'] = self.server.server_name
    env['GATEWAY_INTERFACE'] = 'CGI/1.1'
    env['SERVER_PROTOCOL'] = self.protocol_version
    env['SERVER_PORT'] = str(self.server.server_port)
    env['REQUEST_METHOD'] = self.command
    uqrest = urllib.unquote(rest)
    env['PATH_INFO'] = uqrest
    env['SCRIPT_NAME'] = 'ciao'
    if query:
        env['QUERY_STRING'] = query
    host = self.address_string()
    if host != self.client_address[0]:
        env['REMOTE_HOST'] = host
    env['REMOTE_ADDR'] = self.client_address[0]
    # HTTP basic-auth: decode the user name into REMOTE_USER when possible.
    authorization = self.headers.getheader("authorization")
    if authorization:
        authorization = authorization.split()
        if len(authorization) == 2:
            import base64, binascii
            env['AUTH_TYPE'] = authorization[0]
            if authorization[0].lower() == "basic":
                try:
                    authorization = base64.decodestring(authorization[1])
                except binascii.Error:
                    pass
                else:
                    authorization = authorization.split(':')
                    if len(authorization) == 2:
                        env['REMOTE_USER'] = authorization[0]
    # XXX REMOTE_IDENT
    if self.headers.typeheader is None:
        env['CONTENT_TYPE'] = self.headers.type
    else:
        env['CONTENT_TYPE'] = self.headers.typeheader
    length = self.headers.getheader('content-length')
    if length:
        env['CONTENT_LENGTH'] = length
    referer = self.headers.getheader('referer')
    if referer:
        env['HTTP_REFERER'] = referer
    # Fold continuation lines of the Accept header into a single value.
    accept = []
    for line in self.headers.getallmatchingheaders('accept'):
        if line[:1] in "\t\n\r ":
            accept.append(line.strip())
        else:
            accept = accept + line[7:].split(',')
    env['HTTP_ACCEPT'] = ','.join(accept)
    ua = self.headers.getheader('user-agent')
    if ua:
        env['HTTP_USER_AGENT'] = ua
    co = filter(None, self.headers.getheaders('cookie'))
    if co:
        env['HTTP_COOKIE'] = ', '.join(co)
    # XXX Other HTTP_* headers
    # Since we're setting the env in the parent, provide empty
    # values to override previously set values
    for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH',
              'HTTP_USER_AGENT', 'HTTP_COOKIE', 'HTTP_REFERER'):
        env.setdefault(k, "")
    os.environ.update(env)
This could be substantially simplified further, but not without spending some time and energy on that task:-(.
With this populenv function at hand, we can recode:
def do_POST(self):
    """Handle a multipart POST: save the uploaded file under its original name.

    Relies on populenv() having copied CGI-style variables into os.environ
    so that cgi.FieldStorage can parse the request body from self.rfile.
    """
    populenv(self)  # BUG FIX: was misspelled 'populen' -- NameError at runtime
    form = cgi.FieldStorage(fp=self.rfile)
    upfilecontent = form['upfile'].value
    if upfilecontent:
        # basename() guards against clients (old IE/Edge) that send a full
        # path as the filename, and against '../' path traversal.
        target = os.path.join('tmp', os.path.basename(form['upfile'].filename))
        # BUG FIX: use 'with' so the handle is closed even on write errors.
        with open(target, 'wb') as fout:
            fout.write(upfilecontent)
    self.do_GET()
...and live happily ever after;-). (Of course, using any decent WSGI server, or even the demo one, would be much easier, but this exercise is instructive about CGI and its internals;-).

By using cgi.FieldStorage you can easily extract the filename. Check the example below:
def do_POST(self):
    """Save an uploaded file (the 'upfile' form field) under its client name."""
    ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
    if ctype == 'multipart/form-data':
        # Minimal fake CGI environment so FieldStorage can parse self.rfile.
        form = cgi.FieldStorage(
            fp=self.rfile,
            headers=self.headers,
            environ={'REQUEST_METHOD': 'POST',
                     'CONTENT_TYPE': self.headers['Content-Type'], })
        # SECURITY FIX: basename() strips any path sent by old IE/Edge and
        # blocks '../' path-traversal from malicious clients.
        filename = os.path.basename(form['upfile'].filename)
        data = form['upfile'].file.read()
        # BUG FIX: the original open(...).write(...) leaked the file handle.
        with open("./%s" % filename, "wb") as fout:
            fout.write(data)
    self.do_GET()

...or use your own version of cgi.parse_multipart, especially fixing this:
# Fragment of a patched copy of cgi.parse_multipart's per-part header loop:
# my fix: prefer 'filename' over 'name' field!
if 'filename' in params:
    name = params['filename']
    name = os.path.basename(name) # Edge, IE return abs path!
elif 'name' in params:
    name = params['name']
else:
    # Part carries neither a name nor a filename -- skip it.
    continue

Related

http webserver simply stops responding

Earlier I wrote multi threaded web server, which at times would simply stop processing requests and also getting terminated at peak times.
I've implemented same opencv based processing in Python Websocket based server too which is working fine.
For very old browsers, I also need POST based processing using web server. I converted from multithreading to single but that also is stopping different times and not printing any log etc.
I checked syslog but not clue. More than a week has gone by without finding a solution. I suspect something related to Digital Ocean VPS or network.
I've this code and can't figure why it should stop responding:
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
import cgi
import tempfile
import resource
import base64
from common import *
from datetime import datetime
print( datetime.now());
gg_hashmap = getHash()
USE_HTTPS = True
def dump(obj):
    """Debug aid: print every attribute of *obj* together with its repr."""
    for name in dir(obj):
        print("obj.%s = %r" % (name, getattr(obj, name)))
class PostHandler(BaseHTTPRequestHandler):
    """Accepts base64-encoded images via POST and replies with decoded letters.

    GET serves as a liveness check; POST expects 'base64' and 'license'
    form fields and matches the decoded image's hashes against gg_hashmap.
    """

    def handle(self):
        # Contain per-connection errors (client disconnects, TLS handshake
        # failures) so one bad connection cannot kill the server loop.
        try:
            BaseHTTPRequestHandler.handle(self)
        except Exception as e:
            # BUG FIX: the original bare 'except: pass' also swallowed
            # SystemExit/KeyboardInterrupt and hid every failure silently.
            print("connection error:", e, flush=True)

    def do_POST(self):
        try:
            print("new req=" + str(datetime.now()), flush=True)
            form = cgi.FieldStorage(
                fp=self.rfile,
                headers=self.headers,
                environ={'REQUEST_METHOD': 'POST',
                         'CONTENT_TYPE': self.headers['Content-Type'],
                         })
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()
            for field in form.keys():
                field_item = form[field]
                if field_item.filename:
                    file_data = field_item.file.read()
                    file_len = len(file_data)
                    del file_data  # release the upload buffer immediately
                    # BUG FIX: wfile is a binary stream in Python 3; writing a
                    # str raised TypeError (silently eaten by the outer except).
                    self.wfile.write(('\tUploaded %s as "%s" (%d bytes)\n' %
                                      (field, field_item.filename, file_len)).encode('utf-8'))
            if 'base64' in form and 'license' in form:
                print("license=", form['license'].value)
                global gg_hashmap
                file_content = form['base64'].value
                try:
                    f, temp_file_path = tempfile.mkstemp(prefix='sand', suffix='jpg')
                    os.close(f)
                    try:
                        with open(temp_file_path, 'wb') as w:
                            w.write(base64.b64decode(file_content))
                        input_hashes = get_input_img(temp_file_path)
                        all_letters = ""
                        if input_hashes is not None:
                            # For each input hash pick the letter whose stored
                            # hash gives the lowest difference (strict <, so
                            # earlier letters win ties).
                            for inp_hash in input_hashes:
                                lowest = 1000
                                lowest_letter = ''
                                for letter, arr in gg_hashmap.items():
                                    for hashval in arr:
                                        if int(inp_hash - hashval) < lowest:
                                            lowest = int(inp_hash - hashval)
                                            lowest_letter = letter
                                all_letters += lowest_letter
                        self.wfile.write(bytes(all_letters, "utf8"))
                    finally:
                        # BUG FIX: mkstemp files are NOT auto-deleted; the
                        # original leaked one temp file per request, slowly
                        # filling the temp directory / disk.
                        os.remove(temp_file_path)
                except Exception as e:
                    print("exception3 caught")
                    print(e)
                    print(str(e))
                return
        except Exception as e:
            print("Caught unknown exception", e)

    def do_GET(self):
        """Liveness check: respond with the handling thread's name."""
        self.send_response(200)
        self.end_headers()
        message = threading.currentThread().getName()
        self.wfile.write(bytes(message, 'utf-8'))
        # BUG FIX: writing the str '\n' to the binary wfile raised TypeError.
        self.wfile.write(b'\n')
        # NOTE: the original had ~20 lines of unreachable do_POST-style code
        # after this return; it has been removed.
        return
def run():
    """Bind on 0.0.0.0:443 and serve forever (TLS-wrapped when USE_HTTPS)."""
    # resource.setrlimit(resource.RLIMIT_STACK, (2**29,-1))
    # threading.stack_size(24*1048576)
    server = HTTPServer(('0.0.0.0', 443), PostHandler)
    if USE_HTTPS:
        import ssl
        # FIX: ssl.wrap_socket() was deprecated in 3.7 and REMOVED in
        # Python 3.12 -- use an SSLContext, which is also the documented way
        # to load a cert chain plus CA bundle.
        context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        context.load_cert_chain(certfile='./ssl/public.pem', keyfile='./ssl/key.pem')
        context.load_verify_locations(cafile="./ssl/cap1_transactionfailed_com.ca-bundle")
        server.socket = context.wrap_socket(server.socket, server_side=True)
    server.serve_forever()

if __name__ == '__main__':
    run()
I don't think much anyone will want to read through all 157 lines of convoluted HTTP request handling code (of which some isn't even posted, from common import *) to try and decipher why it might stop at some given time.
It's likely not the answer you want to hear, but HTTPServer really isn't what anyone uses in production for Python.
You should look into rewriting your code with either (my recommendations at the time of writing)
FastAPI (or its underlying Starlette framework) on Uvicorn (Uvicorn will let you do the websocket stuff in the same process), or
Flask on Gunicorn or uWSGI
For instance, here's a rough estimation of what your code would look like with Starlette. (There may be bugs since it's dry-coded, and it's certainly not fully async, but that doesn't matter here.)
import tempfile
import base64
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import PlainTextResponse
app = Starlette()
def get_all_letters(input_hashes):
    """Map each input hash to the letter with the closest stored hash.

    For every hash, scan gg_hashmap and keep the letter whose stored hash
    yields the smallest int difference (strict <, so on ties the earlier
    letter wins).  Falsy input yields the empty string.
    """
    if not input_hashes:
        return ""
    decoded = []
    for candidate in input_hashes:
        best_score = 1000
        best_letter = ""
        for symbol, hash_list in gg_hashmap.items():
            for known in hash_list:
                score = int(candidate - known)
                if score < best_score:
                    best_score = score
                    best_letter = symbol
        decoded.append(best_letter)
    return "".join(decoded)
# BUG FIX: the decorator below was mangled to '#app.route(...)' -- i.e. a
# comment -- in the original paste, so the route was never registered.
@app.route("/", methods=["GET", "POST"])
async def handle(request: Request):
    """Starlette endpoint: GET returns a greeting; POST decodes a base64
    image field, hashes it, and returns the matched letters as plain text."""
    if request.method == "GET":
        return PlainTextResponse("Hello!")
    form = await request.form()
    if not ("base64" in form and "license" in form):
        return PlainTextResponse("Missing data!", status_code=400)
    # NamedTemporaryFile removes the file on exit of the 'with' block,
    # unlike mkstemp, so nothing leaks into the temp directory.
    with tempfile.NamedTemporaryFile(prefix="sand", suffix="jpg") as f:
        content = await form["base64"].read()
        f.write(base64.b64decode(content))
        f.flush()
        input_hashes = get_input_img(f)
        if not input_hashes:
            return PlainTextResponse("No input hashes!", status_code=400)
        all_letters = get_all_letters(input_hashes)
        return PlainTextResponse(all_letters)
You could then run this using Uvicorn (which will also handle all of that HTTPS stuff for you).
With mkstemp you must delete the tempfile. You probably run out of disk space or max out files in temp directory. As AKX mentioned though you should look into using a more robust http server. if the file thing isn't your problem there are numerous other issues that could come up when using a non production HTTP server.

Threads in python are using the same variable it seems

I built a class that watches for changes in a directory and upload them to a server, it is working fine for one dir. However, i had the idea to use the threading module from python to actually watch more than one directory. But, i am getting confused, since when i change a file in one location, it uploads just fine, but then the OTHER location starts uploading all it's files. I think it's because somehow the threads are sharing the same variable or something, but still it's impossible because each directory has it's own instance of the class working specifically for it.
Here's some code:
import os, ftplib, time
from threading import Thread
class FTPSync(Thread):
    """Watch a local directory tree and mirror changed files to an FTP server.

    NOTE(review): everything below is a CLASS attribute, shared by every
    instance.  The mutable ones ('content', 'ignore', 'files') are the
    likely cause of the cross-instance interference described in the
    question: `self.content['previous'] = ...` mutates the single dict
    shared by all threads.  Per-instance state belongs in __init__.
    """
    local_root = ''
    remote_root = ''
    host = ''
    user = ''
    password = ''
    content = {          # shared mutable dict -- see NOTE above
        'previous': [],
        'current': []
    }
    ignore = []          # directory names to skip while scanning
    rest = 0.5           # seconds between polling passes
    files = []           # snapshot accumulator filled by read_dir()
    cwd = ''             # current remote working directory
    watching = True      # set False to stop the watch loop

    def __init__(self, local_root='', remote_root='', config={}):
        # NOTE(review): config={} is a mutable default argument.
        Thread.__init__(self)
        self.local_root = local_root if local_root != '' else os.path.join(os.path.dirname(__file__), os.pardir)
        self.remote_root = remote_root
        self.ignore = config['ignore'] if 'ignore' in config else []
        self.rest = config['rest'] if 'rest' in config else 0.5
        self.host, self.user, self.password = config['host'], config['user'], config['password']
        # Mutates the shared class-level 'content' dict, not an instance copy.
        self.content['previous'] = self.read_dir(self.local_root)

    # Connect and reconnect to the server
    def connect(self, reconnect=False):
        print "Connecting..."
        self.ftp = ftplib.FTP(self.host)
        self.ftp.login(self.user, self.password)
        print "Welcome message from server:\n"
        print self.ftp.getwelcome()
        if not reconnect:
            self.cwd = self.remote_root
        self.ftp.cwd(self.cwd)

    # Start watching for local changes
    def watch(self):
        # Poll the tree every self.rest seconds; upload anything whose
        # snapshot (name + full content) differs from the previous pass.
        self.connect()
        while self.watching:
            self.files = []
            self.content['current'] = self.read_dir(self.local_root)
            diff = [f for f in self.content['current'] if f not in self.content['previous']]
            if len(diff) > 0:
                self.stor(diff)
            self.content['previous'] = self.content['current']
            diff = []
            time.sleep(self.rest)

    # Read a directory and its contents recursively
    def read_dir(self, dir_name, return_value=True):
        # Recursively snapshot dir_name into self.files; the list is only
        # returned for the top-level call (return_value=True).  Whole file
        # contents are held in memory for the change comparison.
        reading = os.listdir(dir_name)
        file_content = None
        for i in range(len(reading)):
            d = self._local_abspath(dir_name, reading[i])
            is_dir = os.path.isdir(d)
            file_content = open(d).read() if not is_dir else None
            offset = d.replace(self.local_root, '').replace(reading[i], '')
            if is_dir and reading[i] not in self.ignore:
                self.read_dir(d, return_value=False)
            elif not is_dir:
                info = {"name": reading[i], "content": file_content, "local_path": d, "offset": offset}
                self.files.append(info)
        if (return_value):
            return self.files
        pass

    # Here we go
    def run(self):
        # Thread entry point -- Thread.start() lands here.
        self.watch()

    # Store (STOR) the files in the server
    def stor(self, files):
        """Upload each file, creating remote directories on demand."""
        nav = ''
        try:
            for f in files:
                # Change remote directory only when the target differs.
                if self._server_abspath(f['offset']) != self.cwd:
                    nav = self._server_abspath(f['offset'])
                    self.ftp.cwd(nav)
                mode = ''
                # Images and files larger than 8190 bytes go as binary.
                if f['name'].split('.')[-1:][0] in ['jpg', 'png', 'gif'] or os.path.getsize(f['local_path']) > 8190:
                    mode = 'binary'
                    self.ftp.storbinary('STOR {!s}'.format(f['name']), open(f['local_path']))
                else:
                    mode = 'ascii'
                    self.ftp.storlines('STOR {!s}'.format(f['name']), open(f['local_path']))
                self.cwd = self._server_abspath(f['offset'])
                print "Stored %s in %s mode" % (f['name'], mode)
        # The connection has timed out
        except ftplib.error_temp:
            self.connect(reconnect=True)
            self.stor(files)
        # A new file has been created inside a folder that does not exist in the server
        except ftplib.error_perm:
            self.ftp.mkd(nav)
            self.stor(files)
        # A new folder has been created locally, but we'll wait to update this on the server
        # when there's some content inside of it and throw us a ftplib.error_perm error, so here it'll just pass
        except IOError:
            pass

    # Return the absolute path in the server
    def _server_abspath(self, path):
        return self.remote_root + '/' + path.replace('\\', '/')

    # Return the absolute path locally
    def _local_abspath(self, dn, fn):
        # Windows-style join: assumes backslash separators -- TODO confirm.
        return (dn +'\\'+ fn) if not dn[-1:]=='\\' else dn + fn
def start(local_root='', remote_root='', config=None):
    """Create an FTPSync watcher thread, start it, and return it.

    BUG FIX: config defaulted to the mutable {} -- a single dict shared
    across all calls; use None and create a fresh dict per call instead.
    """
    if config is None:
        config = {}
    instance = FTPSync(local_root, remote_root, config)
    instance.start()
    return instance
And this is how i use the class:
import ftpsync
config = {
'host': 'ftp.myhost.com',
'user': '****',
'password': '****',
'ignore': ['.git']
}
ftpsync.start(remote_root='/www/tst', config=config)
ftpsync.start(local_root='C:\\pygames', remote_root='/www/tst', config=config)
I would like to remember that it works fine for ONE directory.
After some time, I realized I had to use processes. I came back here in case someone finds it useful.
So basically, with threads you're just running two or more concurrent things at once, but they all share the same address space and memory, and can cause some unwanted things by having the same context and interacting with each other.
Now with processes, every process is independent from one another, so they all have resources reserved for each one of them. This won't let them share variables and stuff.

unable to detect request content_type

On a django server, I process uploaded zip files sent from a python script. But I am getting "" (a blank string) for file.content_type. What am I doing wrong?
# BUG FIX: the decorator was mangled to '#csrf_exempt' (a comment) in the
# original paste, which silently disabled the CSRF exemption.
@csrf_exempt
def Import( request ):
    """Django view: accept a POSTed zip, unpack it, and create a project.

    Returns JSON {"success": bool, "id": uid} for zip uploads; plain error
    responses for non-POST requests, missing files, or non-zip content.
    """
    if request.method != 'POST':
        return HttpResponseNotAllowed('Only POST here')
    if not request.FILES or not request.FILES.get( u'file' ):
        return HttpResponse('Must upload a file')
    file = request.FILES[u'file']
    # NOTE(review): content_type is client-supplied and optional; a plain
    # `requests.post(files=...)` upload sends none, so this comparison fails
    # for script clients.  zipfile.is_zipfile() is the robust check.
    if file.content_type == 'application/zip':
        unzipped_dir = unzip_file( file )
        uid = create_project( unzipped_dir )
        shutil.rmtree( unzipped_dir )  # clean up the extracted tree
        py_ob = { }
        py_ob['success'] = uid is not None
        if uid is not None:
            py_ob['id'] = uid
        json_ob = simplejson.dumps(py_ob)
        return HttpResponse( json_ob, mimetype="application/json" )
    else:
        return HttpResponseNotAllowed( 'Only POST zip files here' )
This is the script which sends the zip file up:
# Python 2 client script: upload a file with HTTP basic auth via `requests`.
import sys
import os
import requests

if len (sys.argv) < 5:
    print "pass in url, username, password, file"
else:
    url = sys.argv[1]
    username = sys.argv[2]
    password = sys.argv[3]
    phile = sys.argv[4]
    if os.path.exists(phile):
        # NOTE: no explicit content type is attached to the file part, so
        # the receiving server may see an empty Content-Type for it.
        files = {'file': open( phile, 'rb' )}
        r = requests.post( url, files=files, auth=( username, password ) )
        if r.status_code == 200:
            json_response = r.json()
            if json_response['success']:
                print "id: " + str( json_response['id'] )
            else:
                print "failure in processing bundle"
        else:
            print "server problem: " + str(r.status_code)
            print r.text
    else:
        print "cannot find file to upload"
The Content-Type header is completely arbitrary (and optional) and not a good way to detect whether or not you're dealing with a valid ZIP file. Have you made sure your browser is supplying it?
Django's documentation tells us the same:
UploadedFile.content_type
The content-type header uploaded with the file (e.g. text/plain or application/pdf). Like any data supplied by the user, you shouldn’t
trust that the uploaded file is actually this type. You’ll still need
to validate that the file contains the content that the content-type
header claims – “trust but verify.”
You should be using zipfile.is_zipfile instead.

Python fast static file serving

What's the fastest way to serve static files in Python? I'm looking for something equal or close enough to Nginx's static file serving.
I know of SimpleHTTPServer but not sure if it can handle serving multiple files efficiently and reliably.
Also, I don't mind it being a part of a lib/framework of some sort as long as its lib/framework is lightweight.
EDIT: This project appears to be dead.
What about FAPWS3? One of the selling points:
Static file server
FAPWS can be used to serve a huge amount of static file requests. With the help of a async database in the backend, you can use FAPWS as your own Amazon S3.
If you look for a oneliner you can do the following:
$> python -m SimpleHTTPServer
This will not fulfill all the requirements, but it is worth mentioning as the simplest way :-)
I would highly recommend using a 3rd party HTTP server to serve static files.
Servers like nginx are heavily optimized for the task at hand, parallelized and written in fast languages.
Python is tied to one processor and interpreted.
The original SimpleHTTPServer from the Python standard library does NOT "handle serving multiple files efficiently and reliably". For instance, while you are downloading one file from it, another HTTP request to it will hang, since SimpleHTTPServer.py is a simple single-threaded HTTP server that can only support one connection at a time.
Fortunately, note that SimpleHTTPServer.py uses BaseHTTPServer.HTTPServer as its server class, which can be wrapped with SocketServer.ForkingMixIn or SocketServer.ThreadingMixIn (also from the Python standard library) to get multi-process or multi-thread mode, which greatly improves the simple HTTP server's efficiency and reliability.
According to this idea, a SimpleHTTPServer with multi-thread/multi-process support modified from original one is given as follows:
$ python2.7 ModifiedSimpleHTTPServer.py
usage: ModifiedSimpleHTTPServer.py [-h] [--pydoc] [--port PORT]
[--type {process,thread}] [--root ROOT]
[--run]
Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
optional arguments:
-h, --help show this help message and exit
--pydoc show this module's pydoc
run arguments:
--port PORT specify server port (default: 8000)
--type {process,thread}
specify server type (default: 'thread')
--root ROOT specify root directory (default: cwd '/home/vbem')
--run run http server foreground
NOTE: stdin for input, stdout for result, stderr for logging
For example, ModifiedSimpleHTTPServer.py --run --root /var/log --type process will run a multi-process HTTP static files server with '/var/log' as its root directory.
Modified codes are:
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
r"""Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
"""
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import os, sys, pwd, posixpath, BaseHTTPServer, urllib, cgi, shutil, mimetypes, socket, SocketServer, BaseHTTPServer
from cStringIO import StringIO
USERNAME = pwd.getpwuid(os.getuid()).pw_name
HOSTNAME = socket.gethostname()
PORT_DFT = 8000
class SimpleHTTPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Static-file request handler (a modified copy of SimpleHTTPServer)."""

    server_version = "SimpleHTTP/0.6"

    def do_GET(self):
        """Serve a GET request: headers plus file body."""
        f = self.send_head()
        if f:
            self.copyfile(f, self.wfile)
            f.close()

    def do_HEAD(self):
        """Serve a HEAD request: headers only."""
        f = self.send_head()
        if f:
            f.close()

    def send_head(self):
        """Send response headers and return an open file object for the
        body, or None after an error/redirect/listing was sent."""
        path = self.translate_path(self.path)
        f = None
        if os.path.isdir(path):
            if not self.path.endswith('/'):
                # Redirect directory URLs missing the trailing slash.
                self.send_response(301)
                self.send_header("Location", self.path + "/")
                self.end_headers()
                return None
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                # No index file: emit a generated directory listing.
                return self.list_directory(path)
        ctype = self.guess_type(path)
        try:
            f = open(path, 'rb')
        except IOError:
            self.send_error(404, "File not found")
            return None
        self.send_response(200)
        self.send_header("Content-type", ctype)
        fs = os.fstat(f.fileno())
        self.send_header("Content-Length", str(fs[6]))
        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        self.end_headers()
        return f

    def list_directory(self, path):
        """Build an HTML directory listing in a StringIO and send headers."""
        try:
            list = ['..'] + os.listdir(path) # includes a parent ('..') entry
        except os.error:
            self.send_error(404, "No permission to list directory")
            return None
        list.sort(key=lambda a: a.lower())
        f = StringIO()
        displaypath = cgi.escape(urllib.unquote(self.path))
        f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
        f.write("<html>\n<title>%s %s</title>\n<body>" % (HOSTNAME, displaypath))
        # NOTE(review): '%s#%s' looks like a mangled 'user@host' -- confirm.
        f.write("%s#%s:<strong>%s</strong>\n" % (USERNAME, HOSTNAME, path.rstrip('/')+'/'))
        f.write("<hr>\n<ul>\n")
        for name in list:
            fullname = os.path.join(path, name)
            displayname = linkname = name
            if os.path.isdir(fullname):
                displayname = name + "/"
                linkname = name + "/"
            if os.path.islink(fullname):
                displayname = name + "#"
            # NOTE(review): one '%s' but a 2-tuple of arguments -- this raises
            # TypeError at runtime; the paste apparently lost the original
            # '<li><a href="%s">%s</a>' anchor markup.
            f.write('<li>%s\n'
                    % (urllib.quote(linkname), cgi.escape(displayname)))
        f.write("</ul>\n<hr>\n<pre>%s</pre>\n</body>\n</html>\n" % __doc__)
        length = f.tell()
        f.seek(0)
        self.send_response(200)
        encoding = sys.getfilesystemencoding()
        self.send_header("Content-type", "text/html; charset=%s" % encoding)
        self.send_header("Content-Length", str(length))
        self.end_headers()
        return f

    def translate_path(self, path):
        """Map a URL path to a filesystem path under the current directory."""
        path = path.split('?',1)[0]  # strip query string
        path = path.split('#',1)[0]  # strip fragment
        path = posixpath.normpath(urllib.unquote(path))
        words = path.split('/')
        words = filter(None, words)
        path = os.getcwd()
        for word in words:
            # Drop drive letters and '.'/'..' components (traversal guard).
            drive, word = os.path.splitdrive(word)
            head, word = os.path.split(word)
            if word in (os.curdir, os.pardir): continue
            path = os.path.join(path, word)
        return path

    def copyfile(self, source, outputfile):
        """Stream the open file object to the client socket."""
        shutil.copyfileobj(source, outputfile)

    def guess_type(self, path):
        """Return a MIME type for path based on its extension."""
        base, ext = posixpath.splitext(path)
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        ext = ext.lower()
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        else:
            return self.extensions_map['']

    # Class-level MIME table, built once at class-creation time.
    if not mimetypes.inited:
        mimetypes.init()
    extensions_map = mimetypes.types_map.copy()
    extensions_map.update({'': 'text/plain'})
class ProcessedHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
    r"""Handle requests in multi process."""

class ThreadedHTTPServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
    r"""Handle requests in a separate thread."""

# Maps the --type command-line value to the matching server class.
SERVER_DICT = {
    'thread' : ThreadedHTTPServer,
    'process' : ProcessedHTTPServer,
}
SERVER_DFT = 'thread'
def run(sCwd=None, sServer=SERVER_DFT, nPort=PORT_DFT, *lArgs, **dArgs):
    r"""Chdir to sCwd (if given) and serve it on nPort with the chosen
    server type ('thread' or 'process').  Blocks forever in serve_forever().
    """
    # Log the call arguments; this reads the *local* frame, so the output
    # depends on the exact local variable names.
    sys.stderr.write('start with %r\n' % sys._getframe().f_locals)
    if sCwd is not None:
        os.chdir(sCwd)
    cServer = SERVER_DICT[sServer]
    # Bind to all interfaces on the requested port.
    oHttpd = cServer(("", nPort), SimpleHTTPRequestHandler)
    sys.stderr.write('http://%s:%s/\n' % (HOSTNAME, nPort))
    oHttpd.serve_forever()
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# main
def _main():
    r"""Command-line entry point: parse arguments, then run or show help."""
    import argparse
    oParser = argparse.ArgumentParser(
        description = __doc__,
        formatter_class = argparse.RawTextHelpFormatter,
        epilog = 'NOTE: stdin for input, stdout for result, stderr for logging',
    )
    oParser.add_argument('--pydoc', action='store_true',
        help = "show this module's pydoc",
    )
    oGroupR = oParser.add_argument_group(title='run arguments', description='')
    oGroupR.add_argument('--port', action='store', type=int, default=PORT_DFT,
        help = 'specify server port (default: %(default)r)',
    )
    oGroupR.add_argument('--type', action='store', default=SERVER_DFT, choices=SERVER_DICT.keys(),
        help = 'specify server type (default: %(default)r)',
    )
    oGroupR.add_argument('--root', action='store', default=os.getcwd(),
        help = 'specify root directory (default: cwd %(default)r)',
    )
    oGroupR.add_argument('--run', action='store_true',
        help = '\n'.join((
            'run http server foreground',
        )))
    oArgs = oParser.parse_args()
    if oArgs.pydoc:
        # --pydoc: show this module's documentation instead of serving.
        help(os.path.splitext(os.path.basename(__file__))[0])
    elif oArgs.run:
        return run(sCwd=oArgs.root, sServer=oArgs.type, nPort=oArgs.port)
    else:
        # No action chosen: print usage and exit non-zero.
        oParser.print_help()
        return 1
    return 0

if __name__ == "__main__":
    exit(_main())
Meanwhile, the single python file with only 200 lines may satisfy your "in Python" and "lightweight" demands.
Last but not least, this ModifiedSimpleHTTPServer.py may be a "killer app" by hand for temporary use, however, Nginx is advised for long term use.

Anyone know of a good Python based web crawler that I could use?

Locked. This question and its answers are locked because the question is off-topic but has historical significance. It is not currently accepting new answers or interactions.
I'm half-tempted to write my own, but I don't really have enough time right now. I've seen the Wikipedia list of open source crawlers but I'd prefer something written in Python. I realize that I could probably just use one of the tools on the Wikipedia page and wrap it in Python. I might end up doing that - if anyone has any advice about any of those tools, I'm open to hearing about them. I've used Heritrix via its web interface and I found it to be quite cumbersome. I definitely won't be using a browser API for my upcoming project.
Thanks in advance. Also, this is my first SO question!
Mechanize is my favorite; great high-level browsing capabilities (super-simple form filling and submission).
Twill is a simple scripting language built on top of Mechanize
BeautifulSoup + urllib2 also works quite nicely.
Scrapy looks like an extremely promising project; it's new.
Use Scrapy.
It is a twisted-based web crawler framework. Still under heavy development but it works already. Has many goodies:
Built-in support for parsing HTML, XML, CSV, and Javascript
A media pipeline for scraping items with images (or any other media) and download the image files as well
Support for extending Scrapy by plugging your own functionality using middlewares, extensions, and pipelines
Wide range of built-in middlewares and extensions for handling of compression, cache, cookies, authentication, user-agent spoofing, robots.txt handling, statistics, crawl depth restriction, etc
Interactive scraping shell console, very useful for developing and debugging
Web management console for monitoring and controlling your bot
Telnet console for low-level access to the Scrapy process
Example code to extract information about all torrent files added today in the mininova torrent site, by using a XPath selector on the HTML returned:
class Torrent(ScrapedItem):
    """Scraped-item type for torrent data; attributes are set dynamically."""
    pass
class MininovaSpider(CrawlSpider):
    """Scrapy spider: crawl mininova.org's 'today' listing and scrape torrents."""
    domain_name = 'mininova.org'
    start_urls = ['http://www.mininova.org/today']
    # Follow /tor/<id> detail links and hand each to parse_torrent.
    rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]

    def parse_torrent(self, response):
        """Extract url/name/description/size from one torrent detail page."""
        x = HtmlXPathSelector(response)
        torrent = Torrent()
        torrent.url = response.url
        torrent.name = x.x("//h1/text()").extract()
        # BUG FIX: the XPath attribute tests were mangled from '@id' to '#id'
        # (invalid XPath syntax) in the original paste.
        torrent.description = x.x("//div[@id='description']").extract()
        torrent.size = x.x("//div[@id='info-left']/p[2]/text()[2]").extract()
        return [torrent]
Check the HarvestMan, a multi-threaded web-crawler written in Python, also give a look to the spider.py module.
And here you can find code samples to build a simple web-crawler.
I've used Ruya and found it pretty good.
I hacked the above script to include a login page as I needed it to access a drupal site. Not pretty but may help someone out there.
#!/usr/bin/python
import httplib2
import urllib
import urllib2
from cookielib import CookieJar
import sys
import re
from HTMLParser import HTMLParser
class miniHTMLParser( HTMLParser ):
    """HTML parser that collects same-site links and manages a login session.

    NOTE(review): viewedQueue/instQueue/headers are class attributes, shared
    by every instance of this parser.
    """
    viewedQueue = []   # links already seen (visited or queued)
    instQueue = []     # links waiting to be crawled, FIFO order
    headers = {}
    opener = ""        # cookie-aware urllib2 opener, installed by loginSite()

    def get_next_link( self ):
        # Pop the next pending link; '' signals that the crawl is finished.
        if self.instQueue == []:
            return ''
        else:
            return self.instQueue.pop(0)

    def gethtmlfile( self, site, page ):
        """Fetch http://<site><page> and return its body ('' on error)."""
        try:
            url = 'http://'+site+''+page
            response = self.opener.open(url)
            return response.read()
        except Exception, err:
            print " Error retrieving: "+page
            sys.stderr.write('ERROR: %s\n' % str(err))
            return ""
        return resppage  # unreachable -- and 'resppage' is never defined

    def loginSite( self, site_url ):
        """Log in to the Drupal site, keeping the session cookie in opener."""
        try:
            cj = CookieJar()
            self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            url = 'http://'+site_url
            # Hard-coded Drupal login form fields (incl. the form_build_id).
            params = {'name': 'customer_admin', 'pass': 'customer_admin123', 'opt': 'Log in', 'form_build_id': 'form-3560fb42948a06b01d063de48aa216ab', 'form_id':'user_login_block'}
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.headers = { 'User-Agent' : user_agent }
            data = urllib.urlencode(params)
            response = self.opener.open(url, data)
            print "Logged in"
            return response.read()
        except Exception, err:
            print " Error logging in"
            sys.stderr.write('ERROR: %s\n' % str(err))
            return 1

    def handle_starttag( self, tag, attrs ):
        # Queue every not-yet-seen relative <a href>; skip absolute http
        # links, mailto links, and fragment-only (#) links.
        if tag == 'a':
            newstr = str(attrs[0][1])
            print newstr
            if re.search('http', newstr) == None:
                if re.search('mailto', newstr) == None:
                    if re.search('#', newstr) == None:
                        if (newstr in self.viewedQueue) == False:
                            print " adding", newstr
                            self.instQueue.append( newstr )
                            self.viewedQueue.append( newstr )
                        else:
                            print " ignoring", newstr
                    else:
                        print " ignoring", newstr
                else:
                    print " ignoring", newstr
def main():
    """Entry point: log in to the site, then crawl it breadth-first.

    Usage: ./minispider.py site link
      site -- host name without scheme, e.g. www.example.com
      link -- initial page path to start crawling from
    """
    if len(sys.argv)!=3:
        print "usage is ./minispider.py site link"
        sys.exit(2)
    mySpider = miniHTMLParser()
    site = sys.argv[1]
    link = sys.argv[2]
    # Drupal login entry point for this particular site.
    url_login_link = site+"/node?destination=node"
    print "\nLogging in", url_login_link
    x = mySpider.loginSite( url_login_link )
    # Breadth-first crawl: get_next_link() returns '' once the queue empties.
    while link != '':
        print "\nChecking link ", link
        # Get the file from the site and link
        retfile = mySpider.gethtmlfile( site, link )
        # Feed the file into the HTML parser
        mySpider.feed(retfile)
        # Search the retfile here
        # Get the next link in level traversal order
        link = mySpider.get_next_link()
    mySpider.close()
    print "\ndone\n"

if __name__ == "__main__":
    main()
Trust me, nothing is better than curl. The following code can crawl 10,000 URLs in parallel in less than 300 seconds on Amazon EC2.
CAUTION: Don't hit the same domain at such a high speed.
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
# concurrent connections>]
#
import sys
import pycurl
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# Ignore SIGPIPE so a peer closing its connection doesn't kill the process
# (required when using pycurl.NOSIGNAL; see the libcurl tutorial).
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

# Get args: URL list from stdin ("-") or from a file; optional second
# argument overrides the default concurrent-connection count.
num_conn = 10
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        urls = open(sys.argv[1]).readlines()
    if len(sys.argv) >= 3:
        num_conn = int(sys.argv[2])
except:
    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
    raise SystemExit

# Make a queue with (url, filename) tuples; blank lines and '#'-comment
# lines in the input are skipped.  Output files are numbered in order.
queue = []
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue) + 1)
    queue.append((url, filename))

# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)  # never more connections than URLs
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

# Pre-allocate a list of curl objects; each handle is reused for many
# transfers instead of being created per URL.
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None  # output file object, attached per transfer below
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

# Main loop: keep every free handle busy until all URLs are processed.
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
    # If there is an url to process and a free curl object, add to multi stack
    while queue and freelist:
        url, filename = queue.pop(0)
        c = freelist.pop()
        c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEDATA, c.fp)
        m.add_handle(c)
        # store some info
        c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            c.fp.close()
            c.fp = None
            m.remove_handle(c)
            print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
            freelist.append(c)
        for c, errno, errmsg in err_list:
            c.fp.close()
            c.fp = None
            m.remove_handle(c)
            print "Failed: ", c.filename, c.url, errno, errmsg
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)

# Cleanup: close any file still open, then free every curl handle.
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()
Another simple spider
Uses BeautifulSoup and urllib2. Nothing too sophisticated — it just reads all the a-href links, builds a list, and goes through it.
pyspider.py

Categories