# -*- coding: utf-8 -*-
"""
open/dulcinea/lib/util.py

Assorted helpers: an HTML-to-text parser, HTML escaping and hyperlink
activation, paragraph wrapping, and a few date/module utilities.
"""
from datetime import datetime
from distutils.fancy_getopt import wrap_text
from durus.utils import byte_string
from formatter import AbstractFormatter, DumbWriter, AS_IS, NullFormatter
from os.path import isdir, dirname, abspath
from qp.lib.spec import unicode_string
from qpy import stringify, xml, xml_quote
import re
import time
import sys

# Compare the numeric major version rather than the version string.
if sys.version_info[0] < 3:
    from __builtin__ import unichr
    from StringIO import StringIO
    from htmllib import HTMLParser
    from urllib import urlretrieve, urlopen
    from htmlentitydefs import name2codepoint
else:
    from io import StringIO
    from html.parser import HTMLParser as PlainHTMLParser
    from urllib.request import urlretrieve, urlopen
    # Patch bug in formatter module.
    import formatter

    def list_filter(*args):
        return list(filter(*args))
    formatter.filter = list_filter
    from html.entities import name2codepoint
    unichr = chr

    class HTMLParser (PlainHTMLParser):
        """Formatter-driving HTML parser used when htmllib is unavailable.

        Translates start/end tags into calls on the formatter object given
        to the constructor (paragraph breaks, font and margin pushes/pops,
        list labels) and forwards character data to it.
        """

        def __init__(self, formatter):
            PlainHTMLParser.__init__(self)
            self.formatter = formatter
            self.nofill = 0          # depth of nested <pre> elements
            self.in_head = False     # True while inside <head>...</head>
            self.list_stack = []     # entries are [kind, label, counter]

        # Overridable -- finish processing of start+end tag: <tag.../>
        def handle_startendtag(self, tag, attrs):
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)

        # Overridable -- handle start tag
        def handle_starttag(self, tag, attrs):
            tag = tag.lower()
            if self.in_head:
                # Everything inside <head> is ignored.
                return
            if tag == 'head':
                self.in_head = True
            elif tag == 'p':
                self.formatter.end_paragraph(1)
            elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                self.formatter.end_paragraph(1)
                self.formatter.push_font((tag, 0, 1, 0))
            elif tag == 'pre':
                self.formatter.end_paragraph(1)
                self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
                self.nofill = self.nofill + 1
            elif tag == 'br':
                self.formatter.add_line_break()
            elif tag == 'hr':
                self.formatter.add_hor_rule()
            elif tag == 'blockquote':
                self.formatter.end_paragraph(1)
                self.formatter.push_margin('blockquote')
            elif tag == 'address':
                self.formatter.end_paragraph(0)
                self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
            elif tag == 'ul':
                self.formatter.end_paragraph(not self.list_stack)
                self.formatter.push_margin('ul')
                self.list_stack.append(['ul', '*', 0])
            elif tag == 'li':
                self.formatter.end_paragraph(0)
                if self.list_stack:
                    # Bump the counter stored in the innermost list entry.
                    [dummy, label, counter] = top = self.list_stack[-1]
                    top[2] = counter = counter+1
                else:
                    label, counter = '*', 0
                self.formatter.add_label_data(label, counter)
            elif tag == 'ol':
                self.formatter.end_paragraph(not self.list_stack)
                self.formatter.push_margin('ol')
                label = '1.'
                for a, v in attrs:
                    if a == 'type':
                        if len(v) == 1:
                            v = v + '.'
                        label = v
                self.list_stack.append(['ol', label, 0])
            elif tag == 'dl':
                self.formatter.end_paragraph(1)
                self.list_stack.append(['dl', '', 0])
            elif tag == 'dt':
                self.ddpop()
            elif tag == 'dd':
                self.ddpop()
                self.formatter.push_margin('dd')
                self.list_stack.append(['dd', '', 0])
            elif tag in ('i', 'em', 'cite'):
                self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
            elif tag in ('b', 'strong'):
                self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
            elif tag in ('tt', 'code'):
                self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
            #else: print('start', tag)

        def handle_endtag(self, tag):
            tag = tag.lower()
            if self.in_head:
                if tag == 'head':
                    self.in_head = False
                return
            elif tag == 'p':
                self.formatter.end_paragraph(1)
            elif tag == 'pre':
                self.formatter.end_paragraph(1)
                self.formatter.pop_font()
                self.nofill = max(0, self.nofill - 1)
            elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                self.formatter.end_paragraph(1)
                self.formatter.pop_font()
            elif tag == 'blockquote':
                self.formatter.end_paragraph(1)
                # Undo the push_margin() done by the start tag.  (This
                # previously pushed a second margin instead of popping.)
                self.formatter.pop_margin()
            elif tag == 'address':
                self.formatter.end_paragraph(0)
                self.formatter.pop_font()
            elif tag in ('ul', 'ol'):
                if self.list_stack:
                    del self.list_stack[-1]
                self.formatter.end_paragraph(not self.list_stack)
                self.formatter.pop_margin()
            elif tag == 'dl':
                self.ddpop(1)
                if self.list_stack:
                    del self.list_stack[-1]
            elif tag in ('i', 'em', 'tt', 'strong', 'b', 'cite', 'code'):
                self.formatter.pop_font()
            #else: print('end', tag)

        def ddpop(self, bl=0):
            """End the paragraph and close an open 'dd' entry, if any."""
            self.formatter.end_paragraph(bl)
            if self.list_stack:
                if self.list_stack[-1][0] == 'dd':
                    del self.list_stack[-1]
                    self.formatter.pop_margin()

        def handle_charref(self, name):
            # NOTE(review): emits '#' + name + ';' with no leading '&' and
            # without consulting in_head/nofill -- confirm this is intended.
            self.formatter.add_literal_data('#' + name + ';')

        def handle_entityref(self, name):
            # self.entitydefs is expected to be supplied by a subclass.
            if name in self.entitydefs:
                self.handle_data(self.entitydefs[name])
            else:
                self.handle_data('&' + name +';')

        def handle_data(self, data):
            if not self.in_head:
                if self.nofill:
                    self.formatter.add_literal_data(data)
                else:
                    self.formatter.add_flowing_data(data)


def as_unicode(s):
    """(s:any) -> unicode_string
    Return 's' as a unicode string, decoding byte strings as UTF-8.
    """
    if isinstance(s, unicode_string):
        return s
    elif isinstance(s, byte_string):
        return s.decode('utf-8')
    return unicode_string(s, 'utf-8')

join_unicode = as_unicode('').join

urlretrieve, urlopen, StringIO # quiet checker

class UnicodeStringIO (object):
    """A StringIO that forces all strings to Unicode
    """
    def __init__(self):
        self.buflist = []

    def getvalue(self):
        return join_unicode([as_unicode(line) for line in self.buflist])

    def write(self, s):
        self.buflist.append(s)

entitydefs_as_unichr = {}
for (name, codepoint) in name2codepoint.items():
    entitydefs_as_unichr[name] = unichr(codepoint)

class EntityCharHTMLParser (HTMLParser):
    """An HTMLParser that replaces HTML entities with their Unicode
    characters.
    """
    entitydefs = entitydefs_as_unichr


def html2txt(text):
    """(any) -> str
    Render 'text' (HTML) as plain text using a DumbWriter formatter.
    """
    sio = UnicodeStringIO()
    parser = EntityCharHTMLParser(AbstractFormatter(DumbWriter(sio)))
    parser.feed(stringify(text))
    return sio.getvalue()

def _htmlescape_except(text, safe_tags):
    """(text) -> xml
    Quotes 'text' for use in an HTML page, except for the tags
    listed in 'safe_tags' which are considered safe (e.g. <p>, <b>).
    Both upper and lower case versions of the tags will be applied
    """
    if text is None:
        return None
    text = stringify(xml_quote(text))
    for tag in safe_tags:
        # Find the quoted form of the tag and put the raw tag back.
        tag_re = re.compile(stringify(xml_quote(tag)), re.IGNORECASE)
        text = tag_re.sub(tag, text)
    return xml(text)

# re to automatically hyperlink email addresses and URLs
_link_re = re.compile(r"""\b(
    # email address
    [\w.-]+                 # local part
    @
    [\w.-]+\.[\w]{2,4}      # domain
  |
    # URL
    (?:https?://|www\.)     # must start with http or www
    [\w.-]+\.[\w]{2,4}      # domain
    (?::\d+)?               # optional port
    (?:/[\w#$%&+,-./:;=?@\[\]^_|~]*)? # optional path
    )""", re.VERBOSE)

def activate_links(text, links=None):
    """(text:str|None, links:[(url, htext)]) -> xml|None
    Returns an xml_quoted version of text, with things that look like
    email addresses and URLs turned into hyperlinks.

    links is a list of two-tuples. If the url in links appears in text
    it is replaced with htext.
    """
    def _link_replace(m):
        text = url = m.group(0)
        extra = ""
        if text.find("@") == -1:
            if text[-1] in ".,":
                # don't include as part of the URL (easier to handle here
                # than in the regex)
                extra = text[-1]
                url = text = text[:-1]
            if not text.startswith("http"):
                url = "http://" + text
        else:
            if not text.startswith("mailto"):
                url = "mailto:" + text
        if links:
            for known_url, known_text in links:
                if url == known_url:
                    text = known_text
        # NOTE(review): the anchor markup was missing from this literal
        # (two placeholders for three arguments, which always raised
        # TypeError); restored to match the documented "hyperlink" result.
        return '<a href="%s">%s</a>%s' % (url, text, extra)

    if text is None:
        return None
    return xml(_link_re.sub(_link_replace, stringify(xml_quote(text))))

# NOTE(review): the tag text in these literals appears to have been stripped
# from this copy of the file, leaving only whitespace/empty strings.  The
# values are kept exactly as found rather than guessed, because any wrong
# entry would be exempted from HTML escaping.  Restore the intended tag list
# from version control before relying on format_text()'s safe-tag handling.
SAFE_TAGS = ['\n\n', '\n\n', '', '', '', '', '', '\n    ', '\n', '\n  • ',
             '\n  • ', '\n    ', '\n    ', '\n    ',
             '\n    ', '', '', '\n    ', '\n    ', '\n    ', '\n    ',
             '\n    ', '\n    ']

def format_text(text, safe_tags=SAFE_TAGS, links=None):
    """(text : string, safe_tags : [str]) -> xml

    Convert an ASCII string containing text into HTML.  The
    resulting HTML has been reformatted in the following ways:
       * <, &, and > are escaped
       * Things that look like email addresses and URLs will be turned into
         hyperlinks.
       * If safe_tags is defined, it is expected to be a list of tags
         considered to be safe (e.g. <p>, <b>, etc).  These tags will not
         be escaped.
    """
    return activate_links(_htmlescape_except(text, safe_tags), links)

def sanitize_url(url):
    """(url:string) -> string
    Try to ensure a URL is well-formed, by adding http:// if it isn't
    present.
    """
    if url is None:
        return None
    if '@' in url and not url.startswith('mailto:'):
        # assume it's an e-mail address
        url = "mailto:" + url
    elif url.find(":") == -1:
        # assume http:// is missing
        url = "http://" + url
    return url

_paragraph_re = re.compile('\n\n+')

def split_paragraphs(text):
    """(text:string) -> [string]
    Split 'text' at runs of two or more newlines.
    """
    return _paragraph_re.split(text)

def wrap_paragraphs(text):
    """(text) -> string
    Wrap a sequence of paragraphs for output as plain text.
    """
    if text is None:
        return ""
    return '\n\n'.join([wrap_paragraph(paragraph)
                        for paragraph in split_paragraphs(text)])

def wrap_paragraph(text):
    """(text:string) -> string
    Wrap a single paragraph to 70 columns, joining lines with newlines.
    """
    line_length = 70
    if isinstance(text, unicode_string) and unicode_string is not str:
        # Unicode text on an interpreter where 'str' is the byte string
        # type: wrap the UTF-8 bytes and decode each line again.  When
        # unicode_string is 'str' itself, wrap_text() gets the text
        # directly, since it cannot process bytes there.
        lines = [text.__class__(x, 'utf-8')
                 for x in wrap_text(text.encode('utf-8'), line_length)]
    else:
        lines = wrap_text(text, line_length)
    return '\n'.join(lines)

def insert_paragraph_tags(text):
    """(text:string|None) -> xml|None
    Prepare a text field for display as HTML.
    Currently this just HTML quotes the string and then inserts <p> tags
    at blank lines.
    """
    if text is None:
        return None
    # NOTE(review): the paragraph markup was missing from these literals in
    # this copy of the file; restored per the function name and docstring.
    return xml(
        '<p>' +
        _paragraph_re.sub('</p><p>', stringify(xml_quote(text))) +
        '</p>')

def datetime_to_int(date_time):
    """(date_time:datetime) -> int
    Returns the number of seconds since the epoch.
    """
    return int(time.mktime(date_time.timetuple()))

def beginning_of_next_month(date_time):
    """(date_time:datetime) -> datetime
    Return a datetime for the exact beginning of the month following the
    given date_time.
    """
    year = date_time.year
    month = date_time.month + 1
    if month == 13:
        year += 1
        month = 1
    return datetime(year=year, month=month, day=1, hour=0, minute=0,
                    second=0, microsecond=0, tzinfo=date_time.tzinfo)

def is_new(persistent_object):
    """(persistent_object : durus.persistent.Persistent) -> boolean
    True when the object's _p_connection attribute is None.
    """
    return persistent_object._p_connection is None

def get_module_directory(module):
    """(module) -> string
    Return the absolute path of module.__file__ if that is a directory,
    otherwise the directory containing it.
    """
    result = abspath(module.__file__)
    if isdir(result):
        return result
    else:
        return dirname(result)

def static(module, path):
    """(module, path:string) -> string
    Return 'path' appended to the module's directory with a '/'.
    """
    return get_module_directory(module) + '/' + path

def get_id(x):
    return x.get_id()


class HTMLSafetyParser (HTMLParser):
    """Parser that records tags and attributes outside the allowed sets.
    """

    allowed_tags = set(['p', 'b', 'i', 'ul', 'ol', 'li', 'br', 'pre',
                        'strong', 'dl', 'dd', 'dt', 'div', 'span', 'img',
                        'em', 'b', 'a', 'font', 'blockquote', 'hr', 'sup',
                        'sub', 'strike'])

    allowed_attrs = set(['style', 'class', 'src', 'href', 'width', 'height',
                         'size', 'face', 'title', 'alt'])

    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self, *args, **kwargs)
        self.risks = []

    def get_risks(self):
        return self.risks

    def unknown_starttag(self, tag, attrs):
        if tag.lower() not in self.allowed_tags:
            self.risks.append('The "%s" element is not allowed.' %
                              tag.upper())
        for attr, val in attrs:
            if attr.lower() not in self.allowed_attrs:
                self.risks.append('The "%s" attribute is not allowed.' %
                                  attr.upper())

    def handle_starttag(self, tag, doer, attrs=None):
        # Accept both call shapes: (tag, method, attrs) and (tag, attrs).
        # With only two arguments the attribute list arrives as 'doer'.
        if attrs is None:
            attrs = doer
        self.unknown_starttag(tag, attrs)

def get_html_risks(html):
    """(html:string) -> [string]
    Return the list of risk messages found while parsing 'html'.
    """
    parser = HTMLSafetyParser(NullFormatter())
    parser.feed(html)
    return parser.get_risks()