"""
open/dulcinea/lib/util.py
"""
from datetime import datetime
from distutils.fancy_getopt import wrap_text
from durus.utils import byte_string
from formatter import AbstractFormatter, DumbWriter, AS_IS, NullFormatter
from os.path import isdir, dirname, abspath
from qp.lib.spec import unicode_string
from qpy import stringify, xml, xml_quote
import re
import time
import sys
# Python 2 / Python 3 compatibility shims.  Both branches bind the same
# names -- StringIO, HTMLParser, urlretrieve, urlopen, name2codepoint,
# unichr -- so the rest of the module can use them unconditionally.
# NOTE(review): this compares version *strings* ("2.7..." < "3" < "3.x");
# it works for CPython 2/3, but sys.version_info is the robust test.
if sys.version < "3":
    from __builtin__ import unichr
    from StringIO import StringIO
    from htmllib import HTMLParser
    from urllib import urlretrieve, urlopen
    from htmlentitydefs import name2codepoint
else:
    from io import StringIO
    from html.parser import HTMLParser as PlainHTMLParser
    from urllib.request import urlretrieve, urlopen
    # Patch bug in formatter module.
    # Presumably the formatter module needs filter() to return a list as
    # it did on Python 2, not an iterator -- TODO confirm.
    import formatter
    def list_filter(*args):
        # filter() materialized to a list, matching Python-2 behaviour.
        return list(filter(*args))
    formatter.filter = list_filter
    from html.entities import name2codepoint
    unichr = chr

    class HTMLParser (PlainHTMLParser):
        # Adapter that gives html.parser.HTMLParser the instance
        # attributes the Python-2 htmllib.HTMLParser exposed.
        # NOTE(review): methods that followed this __init__ appear to
        # have been lost when this copy of the file was garbled by
        # markup stripping -- confirm against upstream dulcinea.
        def __init__(self, formatter):
            PlainHTMLParser.__init__(self)
            self.formatter = formatter
            self.nofill = 0
            self.in_head = False
            self.list_stack = []
        # Overridable -- finish processing of start+end tag:
def _htmlescape_except(text, safe_tags):
    """(text:str|None, safe_tags:[str]) -> xml|None

    XML-quote text, then restore every tag listed in safe_tags (e.g.
    '<p>', '<br>', etc).  Both upper and lower case versions of the
    tags will be applied.

    NOTE(review): the def line and docstring head were destroyed by
    markup stripping in this copy of the file; the signature is fixed
    by the call site activate_links_and_tags() -- confirm upstream.
    """
    if text is None:
        return None
    text = stringify(xml_quote(text))
    for tag in safe_tags:
        # Find the quoted form of the tag (e.g. '&lt;p&gt;')
        # case-insensitively and put the raw tag back.
        tag_re = re.compile(stringify(xml_quote(tag)), re.IGNORECASE)
        text = tag_re.sub(tag, text)
    return xml(text)

# re to automatically hyperlink email addresses and URLs
_link_re = re.compile(r"""\b(
    # email address
    [\w.-]+                             # local part
    @
    [\w.-]+\.[\w]{2,4}                  # domain
    |
    # URL
    (?:https?://|www\.)                 # must start with http or www
    [\w.-]+\.[\w]{2,4}                  # domain
    (?::\d+)?                           # optional port
    (?:/[\w#$%&+,-./:;=?@\[\]^_|~]*)?   # optional path
    )""", re.VERBOSE)

def activate_links(text, links=None):
    """(text:str|None, links:[(url, htext)]) -> xml|None

    Returns an xml_quoted version of text, with things that look like
    email addresses and URLs turned into hyperlinks.  links is a list
    of two-tuples.  If the url in links appears in text it is replaced
    with htext.
    """
    def _link_replace(m):
        text = url = m.group(0)
        extra = ""
        if text.find("@") == -1:
            if text[-1] in ".,":
                # don't include as part of the URL (easier to handle here
                # than in the regex)
                extra = text[-1]
                url = text = text[:-1]
            if not text.startswith("http"):
                url = "http://" + text
        else:
            if not text.startswith("mailto"):
                url = "mailto:" + text
        if links:
            for known_url, known_text in links:
                if url == known_url:
                    text = known_text
        # NOTE(review): the anchor markup was stripped from this copy of
        # the source; reconstructed from the three-tuple of arguments.
        return '<a href="%s">%s</a>%s' % (url, text, extra)
    if text is None:
        return None
    return xml(_link_re.sub(_link_replace, stringify(xml_quote(text))))

# Tags that may pass through _htmlescape_except unescaped.
# NOTE(review): the original tag literals were deleted by markup
# stripping; this list is a reconstruction -- confirm against upstream.
SAFE_TAGS = ['<p>', '</p>', '<br>', '<b>', '</b>', '<i>', '</i>',
             '<ul>', '</ul>', '<ol>', '</ol>', '<li>', '</li>']
def activate_links_and_tags(text, links=None, safe_tags=SAFE_TAGS):
    """(text:str|None, links:[(url, htext)], safe_tags:[str]) -> xml|None

    Like activate_links(), but tags listed in safe_tags (<p>, <br>,
    etc).  These tags will not be escaped.

    NOTE(review): the def line and docstring head were destroyed by
    markup stripping in this copy of the file; the name and signature
    are reconstructed from the call below -- confirm upstream.
    """
    return activate_links(_htmlescape_except(text, safe_tags), links)

def sanitize_url(url):
    """(url:string) -> string
    Try to ensure a URL is well-formed, by adding http:// if it isn't
    present.
    """
    if url is None:
        return None
    if '@' in url and not url.startswith('mailto:'):
        # assume it's an e-mail address
        url = "mailto:" + url
    elif url.find(":") == -1:
        # assume http:// is missing
        url = "http://" + url
    return url

# Two or more consecutive newlines mark a paragraph boundary.
_paragraph_re = re.compile('\n\n+')

def split_paragraphs(text):
    """(text:str) -> [str]  Split text at blank lines."""
    return _paragraph_re.split(text)

def wrap_paragraphs(text):
    """(text) -> string

    Wrap a sequence of paragraphs for output as plain text.
    """
    if text is None:
        return ""
    return '\n\n'.join([wrap_paragraph(paragraph)
                        for paragraph in split_paragraphs(text)])

def wrap_paragraph(text):
    """(text) -> string  Wrap a single paragraph to 70 columns."""
    line_length = 70
    if isinstance(text, unicode_string):
        # wrap_text works on byte strings, so unicode text round-trips
        # through UTF-8.
        lines = [text.__class__(x, 'utf-8')
                 for x in wrap_text(text.encode('utf-8'), line_length)]
    else:
        lines = wrap_text(text, line_length)
    return '\n'.join(lines)

def insert_paragraph_tags(text):
    """(text:string|None) -> xml|None

    Prepare a text field for display as HTML.  Currently this just
    HTML quotes the string and then inserts <p> tags at blank lines.
    """
    if text is None:
        return None
    # NOTE(review): the paragraph-tag literals were stripped from this
    # copy of the source; reconstructed -- confirm upstream.
    return xml('<p>' +
               _paragraph_re.sub('</p>\n<p>', stringify(xml_quote(text))) +
               '</p>')

def datetime_to_int(date_time):
    """(date_time:datetime) -> int

    Returns the number of seconds since the epoch.
    """
    return int(time.mktime(date_time.timetuple()))

def beginning_of_next_month(date_time):
    """(date_time:datetime) -> datetime

    Return a datetime for the exact beginning of the month following
    the given date_time.
    """
    year = date_time.year
    month = date_time.month + 1
    if month == 13:
        # December rolls over to January of the next year.
        year += 1
        month = 1
    return datetime(year=year, month=month, day=1, hour=0, minute=0,
                    second=0, microsecond=0, tzinfo=date_time.tzinfo)

def is_new(persistent_object):
    """(persistent_object : durus.persistent.Persistent) -> boolean

    True when the object has no Durus connection yet.
    """
    return persistent_object._p_connection is None

def get_module_directory(module):
    """(module) -> str  Absolute path of the directory holding module."""
    result = abspath(module.__file__)
    if isdir(result):
        return result
    else:
        return dirname(result)

def static(module, path):
    """(module, path:str) -> str  Path of a file shipped with module."""
    return get_module_directory(module) + '/' + path

def get_id(x):
    """(x) -> whatever x.get_id() returns; handy as a key= function."""
    return x.get_id()

class HTMLSafetyParser (HTMLParser):
    """HTML parser that records disallowed elements/attributes as risks."""

    # Element names (lower case) that are considered safe.
    # NOTE: the original literal listed 'b' twice; a set deduplicates,
    # so dropping the repeat is behaviour-identical.
    allowed_tags = set(['p', 'b', 'i', 'ul', 'ol', 'li', 'br', 'pre',
                        'strong', 'dl', 'dd', 'dt', 'div', 'span', 'img',
                        'em', 'a', 'font', 'blockquote', 'hr', 'sup',
                        'sub', 'strike'])
    # Attribute names (lower case) that are considered safe.
    allowed_attrs = set(['style', 'class', 'src', 'href', 'width',
                         'height', 'size', 'face', 'title', 'alt'])

    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self, *args, **kwargs)
        self.risks = []    # accumulated complaint messages

    def get_risks(self):
        """() -> [str]  Complaints recorded while feeding markup."""
        return self.risks

    def unknown_starttag(self, tag, attrs):
        # Record any tag or attribute outside the allow-lists.
        if tag.lower() not in self.allowed_tags:
            self.risks.append('The "%s" element is not allowed.'
                              % tag.upper())
        for attr, val in attrs:
            if attr.lower() not in self.allowed_attrs:
                self.risks.append('The "%s" attribute is not allowed.'
                                  % attr.upper())

    def handle_starttag(self, tag, doer, attrs):
        # htmllib-style hook (tag, method, attrs); delegate the check.
        self.unknown_starttag(tag, attrs)

def get_html_risks(html):
    """(html:str) -> [str]  Safety complaints for the given markup."""
    parser = HTMLSafetyParser(NullFormatter())
    parser.feed(html)
    return parser.get_risks()