""" open/DurusWorks/qp/http/request.py """ from durus.utils import as_bytes, join_bytes, empty_byte_string, byte_string from qp.fill.html import url_quote, url_unquote from qp.lib.util import StringIO, message_from_file from qp.pub.common import get_publisher from shutil import move import os import re import sys import tempfile CRNL = as_bytes("\r\n") # Various regexes for parsing specific bits of HTTP, all from RFC 2616. # These are used by _parse_pref_header(). # LWS is linear whitespace; the latter two assume that LWS has been removed. _http_lws_re = re.compile(r"(\r\n)?[ \t]+") _http_list_re = re.compile(r",+") _http_encoding_re = re.compile(r"([^;]+)(;q=([\d.]+))?$") def _decode_string(s, charset): if hasattr(s, 'decode'): return s.decode(charset) else: return s # assume already decoded def parse_header(line): """Parse a Content-type like header. Return the main content-type and a dictionary of options. """ plist = [x.strip() for x in line.split(';')] key = plist.pop(0).lower() pdict = {} for p in plist: i = p.find('=') if i >= 0: name = p[:i].strip().lower() value = p[i+1:].strip() if len(value) >= 2 and value[0] == value[-1] == '"': value = value[1:-1] pdict[name] = value return key, pdict def parse_query(qs, charset): """(qs: string) -> {key:string, string|[string]} Parse a query given as a string argument and return a dictionary. """ if isinstance(qs, byte_string): qs = qs.decode(charset) fields = {} if '&' in qs: ampersand = '&' else: ampersand = '&' for chunk in qs.split(ampersand): if not chunk: continue chunk = chunk.replace('+', ' ') chunk = url_unquote(chunk) if '=' in chunk: name, value = chunk.split('=', 1) else: name = chunk value = '' _add_field_value(fields, name, value) return fields def _add_field_value(fields, name, value): if name in fields: values = fields[name] if not isinstance(values, list): fields[name] = values = [values] values.append(value) else: fields[name] = value class HTTPRequest (object): """ Model a single HTTP request and all associated data: environment variables, form variables, cookies, etc. To access environment variables associated with the request, use get_environ(): eg. request.get_environ('SERVER_PORT', 80). To access form variables, use get_field(), eg. request.get_field("name"). To access cookies, use get_cookie(). Various bits and pieces of the requested URL can be accessed with get_url(), get_path(), get_server() """ # In qp, we will encode html pages using utf-8. # Unless the client specifies otherwise, we will assume that requests use # the same charset. DEFAULT_CHARSET = 'utf-8' def __init__(self, stdin, environ): self.stdin = stdin self.environ = environ self.fields = None self.cookies = parse_cookies(environ.get('HTTP_COOKIE', '')) if environ.get('HTTPS', 'off').lower() in ('on', '1', 'yes'): self.scheme = "https" else: self.scheme = "http" self.body = None def get_content_length(self): length = self.environ.get('CONTENT_LENGTH') or "0" try: return int(length) except ValueError: raise ValueError('invalid content-length header') def read_body(self): if self.body is not None: return self.body length = self.get_content_length() body = self.stdin.read(length) self.body = body if len(body) != length: raise ValueError( "read_body() read %s/%s bytes" % (len(body), length)) return body def get_content_type(self): content_type = self.environ.get("CONTENT_TYPE") if content_type: return parse_header(content_type) else: return None, None def get_fields(self): if self.fields is None: self.fields = dict() query = self.get_query() if query: self.fields.update(parse_query(query, self.DEFAULT_CHARSET)) if self.get_content_length() > 0: ctype, ctype_params = self.get_content_type() if ctype == 'application/x-www-form-urlencoded': self._process_urlencoded(ctype_params) elif ctype == 'multipart/form-data': self._process_multipart(ctype_params) return self.fields def _process_urlencoded(self, params): body = self.read_body() charset = params.get('charset', self.DEFAULT_CHARSET) self.fields.update(parse_query(body, charset)) def _process_multipart(self, params): boundary = params.get('boundary') if not boundary: raise ValueError('multipart/form-data missing boundary') charset = params.get('charset') length = self.get_content_length() mimeinput = MIMEInput(self.stdin, boundary, length) try: for line in mimeinput.readpart(): pass # discard lines up to first boundary while mimeinput.moreparts(): self._process_multipart_body(mimeinput, charset) except EOFError: raise ValueError('unexpected end of multipart/form-data') def _process_multipart_body(self, mimeinput, charset): headers = StringIO() lines = mimeinput.readpart() for line in lines: headers.write(line.decode('latin1')) if line == CRNL: break headers.seek(0) headers = message_from_file(headers) ctype, ctype_params = parse_header(headers.get('content-type', '')) if ctype and 'charset' in ctype_params: charset = ctype_params['charset'] cdisp, cdisp_params = parse_header( headers.get('content-disposition', '')) if not cdisp: raise ValueError('expected Content-Disposition header') name = cdisp_params.get('name') filename = cdisp_params.get('filename') if not (cdisp == 'form-data' and name): raise ValueError('expected Content-Disposition: form-data' 'with a "name" parameter: got %r' % headers.get('content-disposition', '')) # FIXME: should really to handle Content-Transfer-Encoding and other # MIME complexity here. See RFC2048 for the full horror story. if filename: # it might be large file upload so use a temporary file upload = Upload(filename, ctype, charset) upload.receive(lines) _add_field_value(self.fields, name, upload) else: value = _decode_string(join_bytes(lines), charset or self.DEFAULT_CHARSET) _add_field_value(self.fields, name, value) def get_header(self, name, default=None): """(name : str, default : str = None) -> str Return the named HTTP header, or an optional default argument (or None) if the header is not found. Note that both original and CGI-ified header names are recognized, e.g. 'Content-Type', 'CONTENT_TYPE' and 'HTTP_CONTENT_TYPE' should all return the Content-Type header, if available. """ environ = self.environ name = name.replace('-', '_').upper() val = environ.get(name) if val is not None: return val if name[:5] != 'HTTP_': name = 'HTTP_' + name return environ.get(name, default) def get_cookie(self, cookie_name, default=None): return self.cookies.get(cookie_name, default) def get_cookies(self): return self.cookies def get_field(self, name, default=None): return self.get_fields().get(name, default) def get_method(self): """Returns the HTTP method for this request """ return self.environ.get('REQUEST_METHOD', 'GET') def get_scheme(self): return self.scheme def get_server(self): """() -> str Return the server name with an optional port number, eg. "www.example.com" or "foo.bar.com:8000". """ http_host = self.environ.get("HTTP_HOST") if http_host: return http_host server_name = self.environ["SERVER_NAME"].strip() server_port = self.environ.get("SERVER_PORT") if (not server_port or (self.get_scheme() == "http" and server_port == "80") or (self.get_scheme() == "https" and server_port == "443")): return server_name else: return server_name + ':' + server_port def get_script_name(self): return self.environ.get('SCRIPT_NAME', '') def get_path_info(self): return self.environ.get('PATH_INFO', '') def get_path(self): """() -> str""" path = self.get_script_name() + self.get_path_info() if path[:1] != '/': path = '/' + path return path def get_query(self): """() -> string Return the query component of the URL. """ return self.environ.get('QUERY_STRING', '') def get_path_query(self): query = self.get_query() path = url_quote(self.get_path()) if query: return path + '?' + query else: return path def get_url(self): """() -> str Return the URL of the current request. """ return "%s://%s%s" % (self.get_scheme(), self.get_server(), self.get_path_query()) def get_environ(self, key, default=None): """(key : string) -> str Fetch a CGI environment variable from the request environment. See http://hoohoo.ncsa.uiuc.edu/cgi/env.html for the variables specified by the CGI standard. """ return self.environ.get(key, default) def get_remote_address(self): return self.get_environ('REMOTE_ADDR') def get_encoding(self, encodings): """(encodings : [string]) -> str Parse the "Accept-encoding" header. 'encodings' is a list of encodings supported by the server sorted in order of preference. The return value is one of 'encodings' or None if the client does not accept any of the encodings. """ accept_encoding = self.get_header("accept-encoding") or '' found_encodings = self._parse_pref_header(accept_encoding) if found_encodings: for encoding in encodings: if encoding in found_encodings: return encoding return None def accepts_gzip_encoding(self): return bool(self.get_encoding(['gzip'])) def get_range(self): """() -> None | ( int|None , int|None )""" range_header = self.get_header("range") if range_header is not None: try: units, range = range_header.split("=") # We don't support multiple ranges - just one. if units != 'bytes': return None if "," in range: return None first_byte, last_byte = range.split("-") if first_byte == "": start = None else: start = int(first_byte) if last_byte == "": end = None if start is None: return None else: end = int(last_byte) if start is None or end is None or start <= end: return (start, end) except ValueError: # If anything went wrong with parsing the Range header, pretend the client # didn't submit it. (according to spec) pass return None def get_accepted_types(self): """() -> {string:float} Return a dictionary mapping MIME types the client will accept to the corresponding quality value (1.0 if no value was specified). """ accept_types = self.environ.get('HTTP_ACCEPT', '') return self._parse_pref_header(accept_types) def _parse_pref_header(self, S): """(S:str) -> {str:float} Parse a list of HTTP preferences (content types, encodings) and return a dictionary mapping strings to the quality value. """ found = {} # remove all linear whitespace S = _http_lws_re.sub('', S) for coding in _http_list_re.split(S): m = _http_encoding_re.match(coding) if m: encoding = m.group(1).lower() q = m.group(3) or 1.0 try: q = float(q) except ValueError: continue if encoding == '*': continue # stupid, ignore it if q > 0: found[encoding] = q return found # See RFC 2109 for details. Note that this parser is more liberal. _COOKIE_RE = re.compile(r""" \s* (?P[^=;,\s]+) \s* ( = \s* ( (?P "(\\[\x00-\x7f] | [^"])*") | (?P [^";,\s]*) ) )? \s* [;,]? """, re.VERBOSE) def parse_cookies(text): result = {} for m in _COOKIE_RE.finditer(text): name = m.group('name') if name[0] == '$': # discard, we don't handle per cookie attributes (e.g. $Path) continue qvalue = m.group('qvalue') if qvalue: value = re.sub(r'\\(.)', r'\1', qvalue)[1:-1] else: value = m.group('value') or '' result[name] = value return result SAFE_CHARS = ('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' '0123456789-@&+=_., ') _safe_trans = None def make_safe_filename(s): global _safe_trans if _safe_trans is None: _safe_trans = ['_'] * 256 for c in SAFE_CHARS: _safe_trans[ord(c)] = c _safe_trans = ''.join(_safe_trans) return s.translate(_safe_trans) class Upload (object): """ Represents a single uploaded file. fp an open file containing the content of the upload. The file pointer points to the beginning of the file orig_filename the complete filename supplied by the user-agent in the request that uploaded this file. Depending on the browser, this might have the complete path of the original file on the client system, in the client system's syntax. base_filename the base component of orig_filename, shorn of MS-DOS, Mac OS, and Unix path components and with "unsafe" characters neutralized (see make_safe_filename()) content_type the content type provided by the user-agent in the request that uploaded this file. charset the charset provide by the user-agent """ def __init__(self, orig_filename, content_type=None, charset=None): if orig_filename: self.orig_filename = orig_filename separator_position = max( orig_filename.rfind('\\'), orig_filename.rfind(':'), orig_filename.rfind('/')) basename = orig_filename[separator_position + 1:] self.base_filename = make_safe_filename(basename) else: self.orig_filename = None self.base_filename = None self.content_type = content_type self.charset = charset self.fp = None def receive(self, lines): if get_publisher(): tmp_dir = get_publisher().get_site().get_directory_for_temporary_files() else: tmp_dir = None self.fp = tempfile.NamedTemporaryFile("w+b", dir=tmp_dir, prefix="up.") for line in lines: self.fp.write(line) self.fp.seek(0) def read(self, n): return self.fp.read(n) def readline(self): return self.fp.readline() def readlines(self): return self.fp.readlines() def seek(self, *args): return self.fp.seek(*args) def tell(self): return self.fp.tell() def __iter__(self): return iter(self.fp) def close(self): self.fp.close() def get_size(self): """Return the size of the file, in bytes. """ if self.fp is None: return 0 else: return os.fstat(self.fp.fileno()).st_size def get_full_path(self): return self.fp.name def get_base_filename(self): return self.base_filename def get_orig_filename(self): return self.orig_filename def get_content_type(self): return self.content_type def get_charset(self): return self.get_charset def get_fp(self): return self.fp def move(self, new_path): print('move("%s", "%s")' % (self.get_full_path(), new_path)) move(self.get_full_path(), new_path) if getattr(self.fp, 'delete', None): self.fp.delete = False # Already gone. self.fp = open(new_path) class LineInput (object): """ A wrapper for an input stream that has the following properties: * lines are terminated by \r\n * lines shorter than 'maxlength' are always returned unbroken * lines longer than 'maxlength' are broken but the pair of characters \r\n are never split * no more than 'length' characters are read from the underlying stream * if the underlying stream does not produce at least 'length' characters then EOFError is raised """ def __init__(self, fp, length): self.fp = fp self.length = length self.buf = empty_byte_string def readline(self, maxlength=4096): # fill buffer n = min(self.length, maxlength - len(self.buf)) chunks = [self.buf] if n > 0: self.length -= n assert self.length >= 0 chunk = self.fp.read(n) if len(chunk) != n: raise EOFError('unexpected end of input') chunks.append(chunk) self.buf = join_bytes(chunks) # split into lines buf = self.buf i = buf.find(CRNL) if i >= 0: i += 2 self.buf = buf[i:] return buf[:i] elif buf[-1:] == CRNL[:1]: # avoid splitting CR NL pairs self.buf = buf[-1:] return buf[:-1] else: self.buf = empty_byte_string return buf class MIMEInput (object): """ Split a MIME input stream into parts. Note that this class does not handle headers, transfer encoding, etc. """ def __init__(self, fp, boundary, length): self.lineinput = LineInput(fp, length) self.pat = re.compile( as_bytes(r'--%s(--)?' % re.escape(boundary))) self.done = False def moreparts(self): """Return true if there are more parts to be read.""" return not self.done def readpart(self): """Generate all the lines up to a MIME boundary. Note that you must exhaust the generator before calling this function again.""" assert not self.done last_line = empty_byte_string while 1: line = self.lineinput.readline() if not line: # Hit EOF -- nothing more to read. This should *not* happen # in a well-formed MIME message. raise EOFError('MIME boundary not found (end of input)') if last_line[-2:] == CRNL or last_line == empty_byte_string: m = self.pat.match(line) if m: # If we hit the boundary line, return now. Forget # the current line *and* the CRNL ending of the # previous line. if m.group(1): # hit final boundary self.done = True yield last_line[:-2] return if last_line: yield last_line last_line = line