[wxWidgets.git] / wxPython / samples / stxview / StructuredText / ClassicStructuredText.py

#! /usr/bin/env python -- # -*- python -*-
##############################################################################
# 
# Zope Public License (ZPL) Version 1.0
# -------------------------------------
# 
# Copyright (c) Digital Creations.  All rights reserved.
# 
# This license has been certified as Open Source(tm).
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 
# 1. Redistributions in source code must retain the above copyright
#    notice, this list of conditions, and the following disclaimer.
# 
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions, and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
# 
# 3. Digital Creations requests that attribution be given to Zope
#    in any manner possible. Zope includes a "Powered by Zope"
#    button that is installed by default. While it is not a license
#    violation to remove this button, it is requested that the
#    attribution remain. A significant investment has been put
#    into Zope, and this effort will continue if the Zope community
#    continues to grow. This is one way to assure that growth.
# 
# 4. All advertising materials and documentation mentioning
#    features derived from or use of this software must display
#    the following acknowledgement:
# 
#      "This product includes software developed by Digital Creations
#      for use in the Z Object Publishing Environment
#      (http://www.zope.org/)."
# 
#    In the event that the product being advertised includes an
#    intact Zope distribution (with copyright and license included)
#    then this clause is waived.
# 
# 5. Names associated with Zope or Digital Creations must not be used to
#    endorse or promote products derived from this software without
#    prior written permission from Digital Creations.
# 
# 6. Modified redistributions of any form whatsoever must retain
#    the following acknowledgment:
# 
#      "This product includes software developed by Digital Creations
#      for use in the Z Object Publishing Environment
#      (http://www.zope.org/)."
# 
#    Intact (re-)distributions of any official Zope release do not
#    require an external acknowledgement.
# 
# 7. Modifications are encouraged but must be packaged separately as
#    patches to official Zope releases.  Distributions that do not
#    clearly separate the patches from the original work must be clearly
#    labeled as unofficial distributions.  Modifications which do not
#    carry the name Zope may be packaged in any form, as long as they
#    conform to all of the clauses above.
# 
# 
# Disclaimer
# 
#   THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
#   EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
#   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
#   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
#   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#   SUCH DAMAGE.
# 
# 
# This software consists of contributions made by Digital Creations and
# many individuals on behalf of Digital Creations.  Specific
# attributions are listed in the accompanying credits file.
# 
##############################################################################
'''Structured Text Manipulation

Parse a structured text string into a form that can be used with 
structured formats, like html.

Structured text is text that uses indentation and simple
symbology to indicate the structure of a document.  

A structured string consists of a sequence of paragraphs separated by
one or more blank lines.  Each paragraph has a level which is defined
as the minimum indentation of the paragraph.  A paragraph is a
sub-paragraph of another paragraph if the other paragraph is the last
preceding paragraph that has a lower level.

Special symbology is used to indicate special constructs:

- A single-line paragraph whose immediately succeeding paragraphs are lower
  level is treated as a header.

- A paragraph that begins with a '-', '*', or 'o' is treated as an
  unordered list (bullet) element.

- A paragraph that begins with a sequence of digits followed by a
  white-space character is treated as an ordered list element.

- A paragraph that begins with a sequence of sequences, where each
  sequence is a sequence of digits or a sequence of letters followed
  by a period, is treated as an ordered list element.

- A paragraph with a first line that contains some text, followed by
  some white-space and '--' is treated as
  a descriptive list element. The leading text is treated as the
  element title.

- Sub-paragraphs of a paragraph that ends in the word 'example' or the
  word 'examples', or '::' is treated as example code and is output as is.

- Text enclosed in single quotes (with white-space to the left of the
  first quote and whitespace or punctuation to the right of the second quote)
  is treated as example code.

- Text surrounded by '*' characters (with white-space to the left of the
  first '*' and whitespace or punctuation to the right of the second '*')
  is emphasized.

- Text surrounded by '**' characters (with white-space to the left of the
  first '**' and whitespace or punctuation to the right of the second '**')
  is made strong.

- Text surrounded by '_' underscore characters (with whitespace to the left 
  and whitespace or punctuation to the right) is made underlined.

- Text enclosed by double quotes followed by a colon, a URL, and concluded
  by punctuation plus white space, *or* just white space, is treated as a
  hyper link. For example:

    "Zope":http://www.zope.org/ is ...

  Is interpreted as '<a href="http://www.zope.org/">Zope</a> is ....'
  Note: This works for relative as well as absolute URLs.

- Text enclosed by double quotes followed by a comma, one or more spaces,
  an absolute URL and concluded by punctuation plus white space, or just
  white space, is treated as a hyper link. For example: 

    "mail me", mailto:amos@digicool.com.

  Is interpreted as '<a href="mailto:amos@digicool.com">mail me</a>.' 

- Text enclosed in brackets which consists only of letters, digits,
  underscores and dashes is treated as hyper links within the document.
  For example:
    
    As demonstrated by Smith [12] this technique is quite effective.

  Is interpreted as '... by Smith <a href="#12">[12]</a> this ...'. Together
  with the next rule this allows easy coding of references or end notes.

- Text enclosed in brackets which is preceded by the start of a line, two
  periods and a space is treated as a named link. For example:

    .. [12] "Effective Techniques" Smith, Joe ... 

  Is interpreted as '<a name="12">[12]</a> "Effective Techniques" ...'.
  Together with the previous rule this allows easy coding of references or
  end notes. 


- A paragraph that has blocks of text enclosed in '||' is treated as a
  table. The text blocks correspond to table cells and table rows are
  denoted by newlines. By default the cells are center aligned. A cell
  can span more than one column by preceding a block of text with an
  equivalent number of cell separators '||'. Newlines and '|' cannot
  be a part of the cell text. For example:

      |||| **Ingredients** ||
      || *Name* || *Amount* ||
      ||Spam||10||
      ||Eggs||3||

  is interpreted as::

    <TABLE BORDER=1 CELLPADDING=2>
     <TR>
      <TD ALIGN=CENTER COLSPAN=2> <strong>Ingredients</strong> </TD>
     </TR>
     <TR>
      <TD ALIGN=CENTER COLSPAN=1> <em>Name</em> </TD>
      <TD ALIGN=CENTER COLSPAN=1> <em>Amount</em> </TD>
     </TR>
     <TR>
      <TD ALIGN=CENTER COLSPAN=1>Spam</TD>
      <TD ALIGN=CENTER COLSPAN=1>10</TD>
     </TR>
     <TR>
      <TD ALIGN=CENTER COLSPAN=1>Eggs</TD>
      <TD ALIGN=CENTER COLSPAN=1>3</TD>
     </TR>
    </TABLE>

'''

import ts_regex
import regex
from ts_regex import gsub
from string import split, join, strip, find
import string,re


def untabify(aString,
             indent_tab=ts_regex.compile('\(\n\|^\)\( *\)\t').search_group,
             ):
    '''\
    Replace tabs used for indentation with spaces.

    aString -- the text to convert.
    indent_tab -- search function; called as indent_tab(text, (1,2)) and
                  expected to return a false value when nothing is found,
                  otherwise a (start, groups) pair.

    Each tab found after a run of leading spaces is expanded so that the
    indentation reaches the next multiple of eight columns.
    '''
    done=''
    pending=aString
    while 1:
        found = indent_tab(pending, (1,2))
        if not found:
            # No indentation tab left: everything pending is final.
            return done+pending
        offset, groups = found
        newline_len=len(groups[0])
        width=len(groups[1])
        done=done+pending[:offset]
        padding=' ' * ((width/8+1)*8)
        # Skip the matched newline, the spaces and the tab itself.
        remainder=pending[offset+width+1+newline_len:]
        pending="\n%s%s" % (padding, remainder)

def indent(aString, indent=2):
    """Indent a string the given number of spaces.

    aString -- the text to indent; indentation tabs are expanded with
               untabify() first.
    indent -- number of spaces to put in front of every line.

    A single trailing empty line (text ending in a newline) is dropped
    before indenting, and the result always ends with one newline.
    """
    r=split(untabify(aString),'\n')
    if not r: return ''
    if not r[-1]: del r[-1]
    # The width comes from the `indent` argument; the previous code used
    # an undefined name `level` here and raised NameError on every call.
    tab=' '*indent
    return "%s%s\n" % (tab,join(r,'\n'+tab))

def reindent(aString, indent=2, already_untabified=0):
    """Shift a block of text so that its minimum indentation equals `indent`.

    aString -- the text to shift.
    indent -- the wanted minimum indentation, in spaces.
    already_untabified -- true when the caller has already run untabify()
                          on the text, so it is not run again.

    The text is returned unchanged when it is already at the wanted level.
    """
    if not already_untabified:
        aString=untabify(aString)

    current=indent_level(aString)[0]
    if current==indent:
        return aString

    # Positive delta: pad every line; negative delta: cut leading columns.
    delta=indent-current
    shifted=[]
    for line in split(aString,'\n'):
        if delta > 0:
            shifted.append(' '*delta + line)
        else:
            shifted.append(line[-delta:])

    return join(shifted,'\n')

def indent_level(aString,
                 indent_space=ts_regex.compile('\n\( *\)').search_group,
                 ):
    '''\
    Return (level, aString), where level is the smallest indentation of
    any non-blank line in aString.

    indent_space -- search function; called as
                    indent_space(text, (1,2), start) and expected to return
                    a false value when nothing is found, otherwise a
                    (start, groups) pair.

    When no non-blank line is found, level is the length of the text with
    one newline prepended.
    '''
    # A leading newline lets the first line be matched like all others.
    padded='\n'+aString
    size=len(padded)
    smallest=size
    pos=0
    while 1:
        found = indent_space(padded, (1,2), pos)
        if not found:
            return (smallest,aString)
        pos, groups = found
        width=len(groups[0])
        # Step over the newline and the run of spaces just matched.
        pos=pos+width+1
        if pos >= size or padded[pos] == '\n':
            continue                            # Skip blank lines
        if not width:
            # Nothing can be smaller than zero; stop early.
            return (0,aString)
        if width < smallest:
            smallest = width

def paragraphs(list,start):
    """Count the entries after list[start] that belong to it.

    list -- sequence whose items carry their level at index 0.
    start -- index of the entry whose followers are counted.

    Counting stops at the first following entry whose level is not
    greater than the level of list[start].
    """
    size=len(list)
    base=list[start][0]
    nxt=start+1
    while nxt < size:
        if not list[nxt][0] > base:
            break
        nxt=nxt+1
    return nxt-start-1

def structure(list):
    """Turn a flat sequence of (level, text) items into a nested list.

    Each result item is a (text, children) tuple, where children is the
    same kind of nested list built from the entries that paragraphs()
    attributes to that item.  An empty input gives an empty list.
    """
    if not list:
        return []
    tree=[]
    total=len(list)
    pos=0
    while pos < total:
        count=paragraphs(list,pos)
        first_child=pos+1
        children=list[first_child:first_child+count]
        tree.append((list[pos][1],structure(children)))
        # Continue with the first entry that is not a descendant.
        pos=first_child+count
    return tree


class Table:
    """Parse a '||'-delimited table paragraph and render it as HTML.

    create() fills self.table with one list of cell strings per input
    line; html() renders that list.  An empty cell string stands for an
    extra '||' separator and widens the COLSPAN of the next non-empty cell.
    """
    CELL='  <TD ALIGN=CENTER COLSPAN=%i>%s</TD>\n'
    ROW=' <TR>\n%s </TR>\n'
    TABLE='\n<TABLE BORDER=1 CELLPADDING=2>\n%s</TABLE>'

    def create(self,aPar,
        td_reg=re.compile(r'[ \t\n]*\|\|([^\x00|]*)')
        ):
        '''Parse a table paragraph into self.table.

        aPar -- the paragraph text, one table row per line.
        td_reg -- compiled pattern matching optional whitespace, a '||'
                  separator and the cell text after it (group 1).

        Returns 1 when every non-empty line consists only of '||'-led
        cells, otherwise 0 (self.table then holds the rows parsed so far).

        The cell class excludes only NUL and '|'.  The earlier pattern
        spelled it "\\0x00", which the re module reads as NUL followed by
        the literal characters 'x', '0', '0', so cells containing '0' or
        'x' (such as "10") were rejected.
        '''
        self.table=[]
        for line in aPar.split('\n'):
            if not line:
                continue        # empty lines do not make a row
            row=[]
            while 1:
                mo=td_reg.match(line)
                if not mo: return 0
                row.append(mo.group(1))
                pos=mo.end(1)
                if pos==len(line): break
                # Keep matching on the text after this cell.
                line=line[pos:]
            self.table.append(row)
        return 1

    def html(self):
        '''Return the HTML representation of the table in self.table.'''
        htmltable=[]
        for row in self.table:
            htmlrow=[]
            colspan=1
            for cell in row:
                if cell=='':
                    # Empty cell: the next real cell spans one more column.
                    colspan=colspan+1
                else:
                    htmlrow.append(self.CELL%(colspan,cell))
                    colspan=1
            htmltable.append(self.ROW % ''.join(htmlrow))
        return self.TABLE % ''.join(htmltable)

# Module-level Table instance; HTML._str calls table.create() to test
# whether a paragraph is a table and table.html() to render it.
table=Table()

class StructuredText:

    """Model text as a structured collection of paragraphs.

    The structure is implied by the indentation level of each paragraph.

    This class is intended as a base class for classes that do the
    actual text output formatting.
    """

    def __init__(self, aStructuredString, level=0,
                 paragraph_divider=regex.compile('\(\r?\n *\)+\r?\n'),
                 ):
        '''Convert a structured text string into a structured text object.

        Arguments:

          aStructuredString -- The string to be parsed.
          level -- The level of top level headings to be created.
          paragraph_divider -- Compiled pattern handed to ts_regex.split
                               to cut the text into paragraphs.

        The nested result is stored in self.structure and the heading
        level in self.level.
        '''
        letters = string.letters

        # The two hyperlink notations differ only in what separates the
        # quoted label from the target: a colon, or a comma and a space.
        label_parts = (
            ' \"([%s0-9-_,./?=@~&]*)\":' % letters,
            ' \"([%s0-9-_,./?=@~&]*)\", ' % letters,
            )
        target_part = '([-:%s0-9_,./?=@#~&]*?)' % letters + \
                      '([.:?;] )'

        for label_part in label_parts:
            link_reg = re.compile(label_part + target_part, re.M)
            aStructuredString = link_reg.sub(r'<a href="\2">\1</a>\3 ',
                                             aStructuredString)

        # Drop a colon left at the very start of an href value.
        if find(aStructuredString, '<a href=":') != -1:
            aStructuredString = re.sub('<a href=":', '<a href="',
                                       aStructuredString)

        self.level=level
        pieces=ts_regex.split(untabify(aStructuredString),
                              paragraph_divider)
        self.structure=structure(map(indent_level,pieces))


    def __str__(self):
        return str(self.structure)


# Building blocks for the inline-markup patterns used by ctag().
# prefix: a character up to and including space, a backslash or '(',
#         or the start of the string.
# suffix: a character up to and including space, one of , . : ; ! ?,
#         a backslash or ')', or the end of the string.
# middle/middl2: text wrapped in one (middle) or two (middl2) delimiter
#         characters; the delimiter is filled in with the % operator.
ctag_prefix=r'([\x00- \\(]|^)' 
ctag_suffix=r'([\x00- ,.:;!?\\)]|$)'         
ctag_middle=r'[%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s]' 
ctag_middl2=r'[%s][%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s][%s]'    

def ctag(s,
         em=re.compile(
             ctag_prefix+(ctag_middle % (("*",)*6) )+ctag_suffix),
         strong=re.compile(
             ctag_prefix+(ctag_middl2 % (("*",)*8))+ctag_suffix),
         under=re.compile(
             ctag_prefix+(ctag_middle % (("_",)*6) )+ctag_suffix),
         code=re.compile(
             ctag_prefix+(ctag_middle % (("\'",)*6))+ctag_suffix),
         ):
    """Replace inline markup in s with HTML character tags.

    s -- the text to convert; None is treated as the empty string.
    em, strong, under, code -- compiled patterns for *text*, **text**,
        _text_ and 'text' respectively.

    Returns the text with <strong>, <u>, <code> and <em> tags inserted,
    applied in that order.
    """
    text=s
    if text is None:
        text=''
    # '**' is handled before '*' so that strong markup is not consumed
    # as emphasis.
    rewrites=(
        (strong, r'\1<strong>\2</strong>\3'),
        (under,  r'\1<u>\2</u>\3'),
        (code,   r'\1<code>\2</code>\3'),
        (em,     r'\1<em>\2</em>\3'),
        )
    for pattern, replacement in rewrites:
        text=pattern.sub(replacement,text)
    return text

class HTML(StructuredText):

    '''\
    An HTML structured text formatter.

    str() of an instance renders self.structure as HTML, using
    self.level as the heading level of top-level headings.
    '''\

    def __str__(self,
                extra_dl=re.compile("</dl>\n<dl>"),
                extra_ul=re.compile("</ul>\n<ul>"),
                extra_ol=re.compile("</ol>\n<ol>"),
                ):
        '''\
        Return an HTML string representation of the structured text data.

        Each extra_* pattern matches a list closing tag followed, after a
        newline, by the opening tag of the same kind of list.  Every match
        is replaced by a bare newline, which joins list items that were
        emitted as separate one-item lists into a single list.
        '''
        s=self._str(self.structure,self.level)
        s=extra_dl.sub('\n',s)
        s=extra_ul.sub('\n',s)
        s=extra_ol.sub('\n',s)
        return s

    def ul(self, before, p, after):
        '''Return `before` followed by a one-item unordered list.

        p is the item text; when non-empty it is run through ctag,
        stripped and wrapped in <p>.  `after` is placed inside the item
        after the text.
        '''
        if p: p="<p>%s</p>" % strip(ctag(p))
        return ('%s<ul><li>%s\n%s\n</li></ul>\n'
                % (before,p,after))

    def ol(self, before, p, after):
        '''Return `before` followed by a one-item ordered list.

        Arguments are handled as in ul().
        '''
        if p: p="<p>%s</p>" % strip(ctag(p))
        return ('%s<ol><li>%s\n%s\n</li></ol>\n'
                % (before,p,after))

    def dl(self, before, t, d, after):
        '''Return `before` followed by a one-entry definition list.

        t is the term and d the description; both are run through ctag.
        `after` is placed inside the <dd> after the description.
        '''
        return ('%s<dl><dt>%s</dt><dd><p>%s</p>\n%s\n</dd></dl>\n'
                % (before,ctag(t),ctag(d),after))

    def head(self, before, t, level, d):
        '''Return `before` followed by heading t and its content d.

        For levels 1 to 5 an <hN> element is used.  For any other level
        the title is written in bold as the <dt> of a definition list and
        d becomes the <dd>.
        '''
        if level > 0 and level < 6:
            return ('%s<h%d>%s</h%d>\n%s\n'
                    % (before,level,strip(ctag(t)),level,d))
            
        t="<p><strong>%s</strong></p>" % strip(ctag(t))
        return ('%s<dl><dt>%s\n</dt><dd>%s\n</dd></dl>\n'
                % (before,t,d))

    def normal(self,before,p,after):
        '''Return `before`, paragraph p (run through ctag) in <p>, then `after`.'''
        return '%s<p>%s</p>\n%s\n' % (before,ctag(p),after)

    def pre(self,structure,tagged=0):
        '''Render a (text, children) structure as preformatted text.

        Every text is passed through html_quote and followed by a blank
        line; children are rendered recursively.  The outermost call
        (tagged false) wraps the result in <PRE> tags; recursive calls
        pass tagged=1 so the tags are written only once.  An empty
        structure gives the empty string.
        '''
        if not structure: return ''
        if tagged:
            r=''
        else:
            r='<PRE>\n'
        for s in structure:
            r="%s%s\n\n%s" % (r,html_quote(s[0]),self.pre(s[1],1))
        if not tagged: r=r+'</PRE>\n'
        return r
    
    def table(self,before,table,after):
        '''Return `before`, the table markup (run through ctag) in <p>, then `after`.

        Note that the `table` argument is the rendered table string and
        hides the module-level `table` object inside this method.
        '''
        return '%s<p>%s</p>\n%s\n' % (before,ctag(table),after)
    
    def _str(self,structure,level,
             # Static
             # NOTE(review): the patterns below are compiled by ts_regex,
             # which is not visible here; the descriptions assume the old
             # `regex` syntax where \( \) group and \| alternates.
             # Bullet item: 'o', '*' or '-' followed by whitespace.
             bullet=ts_regex.compile('[ \t\n]*[o*-][ \t\n]+\([^\0]*\)'
                                     ).match_group,
             # Paragraph ending in "example:" or "examples:".
             example=ts_regex.compile('[\0- ]examples?:[\0- ]*$'
                                      ).search,
             # Descriptive list item: "title -- description".
             dl=ts_regex.compile('\([^\n]+\)[ \t]+--[ \t\n]+\([^\0]*\)'
                                 ).match_group,
             nl=ts_regex.compile('\n').search,
             # Ordered item: digit or letter sequences each followed by
             # '.' or ')'.
             ol=ts_regex.compile(
                 '[ \t]*\(\([0-9]+\|[%s]+\)[.)]\)+[ \t\n]+\([^\0]*\|$\)' % string.letters
                 ).match_group,
             # Ordered item written as a number in parentheses.
             olp=ts_regex.compile('[ \t]*([0-9]+)[ \t\n]+\([^\0]*\|$\)'
                                  ).match_group,
             ):
        '''Render a list of (text, children) paragraphs as HTML.

        Each paragraph is tried against the recognizers in a fixed order:
        bullet item, ordered item, parenthesized ordered item, descriptive
        list item, example introduction, "::" introduction, table, and
        finally heading or normal paragraph.  The first one that applies
        produces the output, which is appended to the result so far.
        '''
        r=''
        for s in structure:

            ts_results = bullet(s[0], (1,))
            if ts_results:
                p = ts_results[1]
                # A list item ending in '::' shows its children verbatim.
                if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
                else: ps=self._str(s[1],level)
                r=self.ul(r,p,ps)
                continue
            ts_results = ol(s[0], (3,))
            if ts_results:
                p = ts_results[1]
                if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
                else: ps=self._str(s[1],level)
                r=self.ol(r,p,ps)
                continue
            ts_results = olp(s[0], (1,))
            if ts_results:
                p = ts_results[1]
                if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
                else: ps=self._str(s[1],level)
                r=self.ol(r,p,ps)
                continue
            ts_results = dl(s[0], (1,2))
            if ts_results:
                t,d = ts_results[1]
                r=self.dl(r,t,d,self._str(s[1],level))
                continue
            if example(s[0]) >= 0 and s[1]:
                # Introduce an example, using pre tags:
                r=self.normal(r,s[0],self.pre(s[1]))
                continue
            if s[0][-2:]=='::' and s[1]:
                # Introduce an example, using pre tags:
                # (the slice drops one of the two trailing colons)
                r=self.normal(r,s[0][:-1],self.pre(s[1]))
                continue
            if table.create(s[0]):
                ## table support.
                r=self.table(r,table.html(),self._str(s[1],level))
                continue
            else:

                # A one-line paragraph with children that does not end in
                # ':' is a heading.
                if nl(s[0]) < 0 and s[1] and s[0][-1:] != ':':
                    # Treat as a heading
                    t=s[0]
                    # "level and level+1": level 0 stays 0, any other
                    # level goes one deeper for the children.
                    r=self.head(r,t,level,
                                self._str(s[1],level and level+1))
                else:
                    r=self.normal(r,s[0],self._str(s[1],level))
        return r
        

def html_quote(v,
               character_entities=(
                       (re.compile('&'), '&amp;'),
                       (re.compile("<"), '&lt;' ),
                       (re.compile(">"), '&gt;' ),
                       (re.compile('"'), '&quot;')
                       )):
    """Return str(v) with HTML special characters replaced by entities.

    v -- any object; it is converted with str() first.
    character_entities -- sequence of (compiled pattern, replacement)
        pairs, applied in order.  '&' comes first so that the ampersands
        of entities inserted later are not quoted again.
    """
    quoted=str(v)
    for pattern,entity in character_entities:
        quoted=pattern.sub(entity,quoted)
    return quoted

def html_with_references(text, level=1):
    """Return an HTML object for text with bracketed references linked.

    text -- the structured text source.
    level -- heading level passed on to HTML.

    Three rewrites are applied, in order, before the text is parsed:
    ".. [name]" after a newline or NUL becomes a named anchor, "[name]"
    becomes a link to "#name", and "[file.html]" becomes a link to
    "file.html".
    """
    letters = string.letters
    rewrites = (
        (r'[\0\n]\.\. \[([0-9_%s-]+)\]' % letters,
         r'\n  <a name="\1">[\1]</a>'),
        (r'([\x00- ,])\[(?P<ref>[0-9_%s-]+)\]([\x00- ,.:])'   % letters,
         r'\1<a href="#\2">[\2]</a>\3'),
        (r'([\0- ,])\[([^]]+)\.html\]([\0- ,.:])',
         r'\1<a href="\2.html">[\2]</a>\3'),
        )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    return HTML(text,level=level)
    

def main():
    """Command-line entry point: convert structured text to HTML.

    The input is read from the single file named on the command line, or
    from standard input when no file is given, and the result is written
    to standard output.

    Options ('-t' is accepted but not acted on here):

      -w -- write a "Content-Type: text/html" header first.
      -l -- call locale.setlocale(locale.LC_ALL, "").

    When any option is given, a leading "#!" line and leading blank or
    control-character lines are removed, and output starting with an
    <h1> heading is wrapped in a full HTML page titled after it.
    """
    import sys, getopt

    opts,args=getopt.getopt(sys.argv[1:],'twl')

    if args:
        [infile]=args
        # Close the file explicitly instead of leaving it to the
        # garbage collector.
        f=open(infile,'r')
        try:
            s=f.read()
        finally:
            f.close()
    else:
        s=sys.stdin.read()

    # Output goes through sys.stdout.write with an explicit newline,
    # which produces the same bytes as a print statement.
    write=sys.stdout.write

    if opts:
        # Option names only, e.g. ['-w', '-l'].
        flags=[]
        for o in opts: flags.append(o[0])

        if '-w' in flags:
            write('Content-Type: text/html\n' + '\n')

        if '-l' in flags:
            import locale
            locale.setlocale(locale.LC_ALL,"")

        if s[:2]=='#!':
            s=re.sub('^#![^\n]+','',s)

        mo = re.compile('([\0-\n]*\n)').match(s)
        if mo is not None:
            s = s[len(mo.group(0)) :]
            
        s=str(html_with_references(s))
        if s[:4]=='<h1>':
            t=s[4:s.find('</h1>')]
            s='''<html><head><title>%s</title>
            </head><body>
            %s
            </body></html>
            ''' % (t,s)
        write(s + '\n')
    else:
        write(str(html_with_references(s)) + '\n')

# Run the command-line converter when this file is executed as a script.
if __name__=="__main__": main()
Commit	Line	Data
ddfc587a RD	1	#! /usr/bin/env python -- # -- python --
	2	##############################################################################
	3	#
	4	# Zope Public License (ZPL) Version 1.0
	5	# -------------------------------------
	6	#
	7	# Copyright (c) Digital Creations. All rights reserved.
	8	#
	9	# This license has been certified as Open Source(tm).
	10	#
	11	# Redistribution and use in source and binary forms, with or without
	12	# modification, are permitted provided that the following conditions are
	13	# met:
	14	#
	15	# 1. Redistributions in source code must retain the above copyright
	16	# notice, this list of conditions, and the following disclaimer.
	17	#
	18	# 2. Redistributions in binary form must reproduce the above copyright
	19	# notice, this list of conditions, and the following disclaimer in
	20	# the documentation and/or other materials provided with the
	21	# distribution.
	22	#
	23	# 3. Digital Creations requests that attribution be given to Zope
	24	# in any manner possible. Zope includes a "Powered by Zope"
	25	# button that is installed by default. While it is not a license
	26	# violation to remove this button, it is requested that the
	27	# attribution remain. A significant investment has been put
	28	# into Zope, and this effort will continue if the Zope community
	29	# continues to grow. This is one way to assure that growth.
	30	#
	31	# 4. All advertising materials and documentation mentioning
	32	# features derived from or use of this software must display
	33	# the following acknowledgement:
	34	#
	35	# "This product includes software developed by Digital Creations
	36	# for use in the Z Object Publishing Environment
	37	# (http://www.zope.org/)."
	38	#
	39	# In the event that the product being advertised includes an
	40	# intact Zope distribution (with copyright and license included)
	41	# then this clause is waived.
	42	#
	43	# 5. Names associated with Zope or Digital Creations must not be used to
	44	# endorse or promote products derived from this software without
	45	# prior written permission from Digital Creations.
	46	#
	47	# 6. Modified redistributions of any form whatsoever must retain
	48	# the following acknowledgment:
	49	#
	50	# "This product includes software developed by Digital Creations
	51	# for use in the Z Object Publishing Environment
	52	# (http://www.zope.org/)."
	53	#
	54	# Intact (re-)distributions of any official Zope release do not
	55	# require an external acknowledgement.
	56	#
	57	# 7. Modifications are encouraged but must be packaged separately as
	58	# patches to official Zope releases. Distributions that do not
	59	# clearly separate the patches from the original work must be clearly
	60	# labeled as unofficial distributions. Modifications which do not
	61	# carry the name Zope may be packaged in any form, as long as they
	62	# conform to all of the clauses above.
	63	#
	64	#
65	# Disclaimer
66	#
67	# THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
68	# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
69	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
70	# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
71	# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
72	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
73	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
74	# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
75	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
76	# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
77	# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
78	# SUCH DAMAGE.
79	#
80	#
81	# This software consists of contributions made by Digital Creations and
82	# many individuals on behalf of Digital Creations. Specific
83	# attributions are listed in the accompanying credits file.
84	#
85	##############################################################################
86	'''Structured Text Manipulation
87
88	Parse a structured text string into a form that can be used with
89	structured formats, like html.
90
91	Structured text is text that uses indentation and simple
92	symbology to indicate the structure of a document.
93
94	A structured string consists of a sequence of paragraphs separated by
95	one or more blank lines. Each paragraph has a level which is defined
96	as the minimum indentation of the paragraph. A paragraph is a
97	sub-paragraph of another paragraph if the other paragraph is the last
98	preceding paragraph that has a lower level.
99
100	Special symbology is used to indicate special constructs:
101
102	- A single-line paragraph whose immediately succeeding paragraphs are lower
103	level is treated as a header.
104
105	- A paragraph that begins with a '-', '*', or 'o' is treated as an
106	unordered list (bullet) element.
107
108	- A paragraph that begins with a sequence of digits followed by a
109	white-space character is treated as an ordered list element.
110
111	- A paragraph that begins with a sequence of sequences, where each
112	sequence is a sequence of digits or a sequence of letters followed
113	by a period, is treated as an ordered list element.
114
115	- A paragraph with a first line that contains some text, followed by
116	some white-space and '--' is treated as
117	a descriptive list element. The leading text is treated as the
118	element title.
119
120	- Sub-paragraphs of a paragraph that ends in the word 'example' or the
121	word 'examples', or '::' is treated as example code and is output as is.
122
123	- Text enclosed single quotes (with white-space to the left of the
124	first quote and whitespace or punctuation to the right of the second quote)
125	is treated as example code.
126
127	- Text surrounded by '*' characters (with white-space to the left of the
128	first '' and whitespace or punctuation to the right of the second '')
129	is emphasized.
130
131	- Text surrounded by '**' characters (with white-space to the left of the
132	first '' and whitespace or punctuation to the right of the second '')
133	is made strong.
134
135	- Text surrounded by '_' underscore characters (with whitespace to the left
136	and whitespace or punctuation to the right) is made underlined.
137
138	- Text encloded by double quotes followed by a colon, a URL, and concluded
139	by punctuation plus white space, or just white space, is treated as a
140	hyper link. For example:
141
142	"Zope":http://www.zope.org/ is ...
143
144	Is interpreted as '<a href="http://www.zope.org/">Zope</a> is ....'
145	Note: This works for relative as well as absolute URLs.
146
147	- Text enclosed by double quotes followed by a comma, one or more spaces,
148	an absolute URL and concluded by punctuation plus white space, or just
149	white space, is treated as a hyper link. For example:
150
151	"mail me", mailto:amos@digicool.com.
152
153	Is interpreted as '<a href="mailto:amos@digicool.com">mail me</a>.'
154
155	- Text enclosed in brackets which consists only of letters, digits,
156	underscores and dashes is treated as hyper links within the document.
157	For example:
158
159	As demonstrated by Smith [12] this technique is quite effective.
160
161	Is interpreted as '... by Smith <a href="#12">[12]</a> this ...'. Together
162	with the next rule this allows easy coding of references or end notes.
163
164	- Text enclosed in brackets which is preceded by the start of a line, two
165	periods and a space is treated as a named link. For example:
166
167	.. [12] "Effective Techniques" Smith, Joe ...
168
169	Is interpreted as '<a name="12">[12]</a> "Effective Techniques" ...'.
170	Together with the previous rule this allows easy coding of references or
171	end notes.
172
173
174	- A paragraph that has blocks of text enclosed in '\|\|' is treated as a
175	table. The text blocks correspond to table cells and table rows are
176	denoted by newlines. By default the cells are center aligned. A cell
177	can span more than one column by preceding a block of text with an
178	equivalent number of cell separators '\|\|'. Newlines and '\|' cannot
179	be a part of the cell text. For example:
180
181	\|\|\|\| Ingredients \|\|
182	\|\| Name \|\| Amount \|\|
183	\|\|Spam\|\|10\|\|
184	\|\|Eggs\|\|3\|\|
185
186	is interpreted as::
187
188	<TABLE BORDER=1 CELLPADDING=2>
189	<TR>
190	<TD ALIGN=CENTER COLSPAN=2> <strong>Ingredients</strong> </TD>
191	</TR>
192	<TR>
193	<TD ALIGN=CENTER COLSPAN=1> <em>Name</em> </TD>
194	<TD ALIGN=CENTER COLSPAN=1> <em>Amount</em> </TD>
195	</TR>
196	<TR>
197	<TD ALIGN=CENTER COLSPAN=1>Spam</TD>
198	<TD ALIGN=CENTER COLSPAN=1>10</TD>
199	</TR>
200	<TR>
201	<TD ALIGN=CENTER COLSPAN=1>Eggs</TD>
202	<TD ALIGN=CENTER COLSPAN=1>3</TD>
203	</TR>
204	</TABLE>
205
206	'''
207
208	import ts_regex
209	import regex
210	from ts_regex import gsub
211	from string import split, join, strip, find
212	import string,re
213
214
215	def untabify(aString,
216	indent_tab=ts_regex.compile('\(\n\\|^\)\( *\)\t').search_group,
217	):
218	'''\
219	Convert indentation tabs to spaces.
220	'''
221	result=''
222	rest=aString
223	while 1:
224	ts_results = indent_tab(rest, (1,2))
225	if ts_results:
226	start, grps = ts_results
227	lnl=len(grps[0])
228	indent=len(grps[1])
229	result=result+rest[:start]
230	rest="\n%s%s" % (' ' * ((indent/8+1)*8),
231	rest[start+indent+1+lnl:])
232	else:
233	return result+rest
234
235	def indent(aString, indent=2):
236	"""Indent a string the given number of spaces"""
237	r=split(untabify(aString),'\n')
238	if not r: return ''
239	if not r[-1]: del r[-1]
240	tab=' '*level
241	return "%s%s\n" % (tab,join(r,'\n'+tab))
242
243	def reindent(aString, indent=2, already_untabified=0):
244	"reindent a block of text, so that the minimum indent is as given"
245
246	if not already_untabified: aString=untabify(aString)
247
248	l=indent_level(aString)[0]
249	if indent==l: return aString
250
251	r=[]
252
253	append=r.append
254
255	if indent > l:
256	tab=' ' * (indent-l)
257	for s in split(aString,'\n'): append(tab+s)
258	else:
259	l=l-indent
260	for s in split(aString,'\n'): append(s[l:])
261
262	return join(r,'\n')
263
264	def indent_level(aString,
265	indent_space=ts_regex.compile('\n\( *\)').search_group,
266	):
267	'''\
268	Find the minimum indentation for a string, not counting blank lines.
269	'''
270	start=0
271	text='\n'+aString
272	indent=l=len(text)
273	while 1:
274
275	ts_results = indent_space(text, (1,2), start)
276	if ts_results:
277	start, grps = ts_results
278	i=len(grps[0])
279	start=start+i+1
280	if start < l and text[start] != '\n': # Skip blank lines
281	if not i: return (0,aString)
282	if i < indent: indent = i
283	else:
284	return (indent,aString)
285
286	def paragraphs(list,start):
287	l=len(list)
288	level=list[start][0]
289	i=start+1
290	while i < l and list[i][0] > level: i=i+1
291	return i-1-start
292
def structure(list):
    """Turn a flat list of (level, text) pairs into a nested structure.

    list -- sequence of (indentation level, paragraph text) pairs.

    Returns a list of (text, children) tuples, where children is the same
    kind of list built from the entries that paragraphs() reports as nested
    beneath that entry.
    """
    if not list:
        return []
    tree = []
    total = len(list)
    pos = 0
    while pos < total:
        first_child = pos + 1
        after_children = first_child + paragraphs(list, pos)
        children = structure(list[first_child:after_children])
        tree.append((list[pos][1], children))
        pos = after_children
    return tree
304
305
class Table:
    """Parser and HTML renderer for ``||cell||cell||`` style tables.

    create() fills self.table with one list of cell strings per text line;
    html() renders that list.  An empty cell string widens the COLSPAN of
    the next non-empty cell in the same row.
    """

    CELL=' <TD ALIGN=CENTER COLSPAN=%i>%s</TD>\n'
    ROW=' <TR>\n%s </TR>\n'
    TABLE='\n<TABLE BORDER=1 CELLPADDING=2>\n%s</TABLE>'

    def create(self, aPar,
               # Optional whitespace, the '||' delimiter, then the cell text
               # (anything up to the next '|').  The class excludes NUL and
               # '|' only; spelling NUL as '\0x00' would also exclude the
               # letters 'x' and '0' and reject ordinary cell text.
               td_reg=re.compile(r'[ \t\n]*\|\|([^\x00|]*)')
               ):
        """Parse a table and store it as a nested list in self.table.

        aPar -- the paragraph text, one table row per non-empty line.
        td_reg -- compiled cell pattern bound at definition time.

        Returns 1 if every non-empty line consists solely of '||'-delimited
        cells, otherwise 0 (self.table then holds only the rows parsed
        before the failure).
        """
        self.table = []
        for line in [ln for ln in aPar.split('\n') if ln]:
            row = []
            while 1:
                mo = td_reg.match(line)
                if not mo:
                    return 0
                row.append(mo.group(1))
                pos = mo.end(1)
                if pos == len(line):
                    break
                # Continue with the text that follows this cell.
                line = line[pos:]
            self.table.append(row)
        return 1

    def html(self):
        """Create an HTML representation of the table stored by create()."""
        htmltable = []
        for row in self.table:
            htmlrow = []
            colspan = 1
            for cell in row:
                if cell == '':
                    # An empty cell is absorbed by the next real cell.
                    colspan = colspan + 1
                    continue
                htmlrow.append(self.CELL % (colspan, cell))
                colspan = 1
            htmltable.append(self.ROW % ''.join(htmlrow))
        return self.TABLE % ''.join(htmltable)

# Module-level parser instance shared by the HTML formatter.
table = Table()
347
class StructuredText:

    """Model text as a structured collection of paragraphs.

    Structure is implied by the indentation level.

    This class is intended as a base class for classes that do the actual
    text output formatting.
    """

    def __init__(self, aStructuredString, level=0,
                 paragraph_divider=regex.compile('\(\r?\n *\)+\r?\n'),
                 ):
        """Convert a structured text string into a structured text object.

        Arguments:

          aStructuredString -- The string to be parsed.
          level -- The level of top level headings to be created.
          paragraph_divider -- compiled pattern used to split the text into
                               paragraphs; bound at definition time.
        """
        # Hyperlinks may be written as  "text":url  or  "text", url ; both
        # forms share the URL part and the trailing punctuation-plus-space.
        link_tail = ('([-:%s0-9_,./?=@#~&]*?)' % string.letters) + '([.:?;] )'
        link_heads = (
            ' \"([%s0-9-_,./?=@~&]*)\":',
            ' \"([%s0-9-_,./?=@~&]*)\", ',
        )
        for link_head in link_heads:
            link = re.compile((link_head % string.letters) + link_tail, re.M)
            aStructuredString = link.sub(r'<a href="\2">\1</a>\3 ',
                                         aStructuredString)

        # Drop the stray leading colon from hrefs generated above.
        if aStructuredString.find('<a href=":') != -1:
            aStructuredString = re.sub('<a href=":', '<a href="',
                                       aStructuredString)

        self.level = level
        blocks = ts_regex.split(untabify(aStructuredString),
                                paragraph_divider)
        self.structure = structure(map(indent_level, blocks))

    def __str__(self):
        """Return the string form of the parsed paragraph structure."""
        return str(self.structure)
402
403
# Building blocks for the inline character-markup patterns used by ctag().
# prefix: a control character/space, a backslash, '(' or the string start.
# suffix: a control character/space, common punctuation, a backslash, ')'
#         or the string end.
# middle: the marked-up text between single (ctag_middle) or doubled
#         (ctag_middl2) delimiter characters; the text may not start or end
#         with whitespace and may not contain the delimiter.
# '|' is the alternation operator of the ``re`` module and must not be
# escaped here ('\|' would only match a literal pipe character).
ctag_prefix=r'([\x00- \\(]|^)'
ctag_suffix=r'([\x00- ,.:;!?\\)]|$)'
ctag_middle=r'[%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s]'
ctag_middl2=r'[%s][%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s][%s]'

def ctag(s,
         em=re.compile(
             ctag_prefix+(ctag_middle % (("*",)*6) )+ctag_suffix),
         strong=re.compile(
             ctag_prefix+(ctag_middl2 % (("*",)*8))+ctag_suffix),
         under=re.compile(
             ctag_prefix+(ctag_middle % (("_",)*6) )+ctag_suffix),
         code=re.compile(
             ctag_prefix+(ctag_middle % (("\'",)*6))+ctag_suffix),
         ):
    """Replace inline character markup with HTML tags.

    s -- the text to convert; None is treated as the empty string.
    em, strong, under, code -- compiled patterns bound at definition time.

    Conversions: **text** -> <strong>, _text_ -> <u>, 'text' -> <code>,
    *text* -> <em>.  Returns the converted string.
    """
    if s is None: s=''
    # Doubled asterisks are handled first so the single-asterisk pattern
    # never sees them.
    s=strong.sub(r'\1<strong>\2</strong>\3',s)
    s=under.sub(r'\1<u>\2</u>\3',s)
    s=code.sub(r'\1<code>\2</code>\3',s)
    s=em.sub(r'\1<em>\2</em>\3',s)
    return s
425
class HTML(StructuredText):

    """An HTML structured text formatter.

    str() of an instance renders the parsed paragraph structure as HTML.
    Each formatting method receives the HTML produced so far (`before`)
    and returns it with the new element appended.
    """

    def __str__(self,
                extra_dl=re.compile("</dl>\n<dl>"),
                extra_ul=re.compile("</ul>\n<ul>"),
                extra_ol=re.compile("</ol>\n<ol>"),
                ):
        """Return an HTML string representation of the structured text data."""
        s=self._str(self.structure,self.level)
        # Every list item is emitted as its own list; join adjacent lists of
        # the same kind by removing the close/open pair between them.
        s=extra_dl.sub('\n',s)
        s=extra_ul.sub('\n',s)
        s=extra_ol.sub('\n',s)
        return s

    def ul(self, before, p, after):
        """Append a bullet-list item with text `p` and nested HTML `after`."""
        if p: p="<p>%s</p>" % ctag(p).strip()
        return ('%s<ul><li>%s\n%s\n</li></ul>\n'
                % (before,p,after))

    def ol(self, before, p, after):
        """Append a numbered-list item with text `p` and nested HTML `after`."""
        if p: p="<p>%s</p>" % ctag(p).strip()
        return ('%s<ol><li>%s\n%s\n</li></ol>\n'
                % (before,p,after))

    def dl(self, before, t, d, after):
        """Append a definition-list entry: term `t`, definition `d`."""
        return ('%s<dl><dt>%s</dt><dd><p>%s</p>\n%s\n</dd></dl>\n'
                % (before,ctag(t),ctag(d),after))

    def head(self, before, t, level, d):
        """Append heading `t` followed by its already rendered body `d`.

        Levels 1 to 5 become <hN> elements; any other level is rendered as
        a bold term in a definition list.
        """
        if level > 0 and level < 6:
            return ('%s<h%d>%s</h%d>\n%s\n'
                    % (before,level,ctag(t).strip(),level,d))

        t="<p><strong>%s</strong></p>" % ctag(t).strip()
        return ('%s<dl><dt>%s\n</dt><dd>%s\n</dd></dl>\n'
                % (before,t,d))

    def normal(self,before,p,after):
        """Append an ordinary paragraph `p` followed by nested HTML `after`."""
        return '%s<p>%s</p>\n%s\n' % (before,ctag(p),after)

    def pre(self,structure,tagged=0):
        """Render `structure` as literal, HTML-quoted text.

        The outermost call (tagged false) wraps the text in <PRE> tags; the
        recursive calls for nested paragraphs do not.
        """
        if not structure: return ''
        if tagged:
            r=''
        else:
            r='<PRE>\n'
        for s in structure:
            r="%s%s\n\n%s" % (r,html_quote(s[0]),self.pre(s[1],1))
        if not tagged: r=r+'</PRE>\n'
        return r

    def table(self,before,table,after):
        """Append already rendered table HTML followed by nested HTML `after`."""
        return '%s<p>%s</p>\n%s\n' % (before,ctag(table),after)

    def _nested(self,s,level):
        """Render the children of list item `s`.

        Children are rendered literally when the item text ends in '::',
        otherwise as ordinary structured text.
        """
        if s[0][-2:]=='::' and s[1]:
            return self.pre(s[1])
        return self._str(s[1],level)

    def _str(self,structure,level,
             # Static
             # Bullet item: optional whitespace, one of 'o', '*' or '-',
             # whitespace, then the item text.
             bullet=ts_regex.compile('[ \t\n]*[o*-][ \t\n]+\([^\0]*\)'
                                     ).match_group,
             example=ts_regex.compile('[\0- ]examples?:[\0- ]*$'
                                      ).search,
             dl=ts_regex.compile('\([^\n]+\)[ \t]+--[ \t\n]+\([^\0]*\)'
                                 ).match_group,
             nl=ts_regex.compile('\n').search,
             # Numbered item such as "1." or "a)"; group 3 is the item text.
             ol=ts_regex.compile(
                 '[ \t]*\(\([0-9]+\\|[%s]+\)[.)]\)+[ \t\n]+\([^\0]*\\|$\)' % string.letters
                 ).match_group,
             # Numbered item written as "(1)"; the unescaped parentheses are
             # literal in this pattern syntax.
             olp=ts_regex.compile('[ \t]*([0-9]+)[ \t\n]+\([^\0]*\\|$\)'
                                  ).match_group,
             ):
        """Render a list of (text, children) pairs and return the HTML.

        structure -- list of (paragraph text, nested structure) tuples.
        level -- heading level for headings found here; 0 selects the
                 definition-list style of head() at every depth.

        The remaining parameters are matchers bound at definition time.
        Each paragraph is classified in this order: bullet item, numbered
        item, definition, example/literal introduction, table, heading,
        ordinary paragraph.
        """
        r=''
        for s in structure:

            ts_results = bullet(s[0], (1,))
            if ts_results:
                r=self.ul(r,ts_results[1],self._nested(s,level))
                continue

            ts_results = ol(s[0], (3,))
            if ts_results:
                r=self.ol(r,ts_results[1],self._nested(s,level))
                continue

            ts_results = olp(s[0], (1,))
            if ts_results:
                r=self.ol(r,ts_results[1],self._nested(s,level))
                continue

            ts_results = dl(s[0], (1,2))
            if ts_results:
                t,d = ts_results[1]
                r=self.dl(r,t,d,self._str(s[1],level))
                continue

            if example(s[0]) >= 0 and s[1]:
                # Introduce an example, using pre tags:
                r=self.normal(r,s[0],self.pre(s[1]))
                continue

            if s[0][-2:]=='::' and s[1]:
                # Introduce an example, using pre tags; one of the two
                # colons is kept in the paragraph text.
                r=self.normal(r,s[0][:-1],self.pre(s[1]))
                continue

            if table.create(s[0]):
                ## table support.
                r=self.table(r,table.html(),self._str(s[1],level))
                continue

            if nl(s[0]) < 0 and s[1] and s[0][-1:] != ':':
                # A one-line paragraph with children: treat as a heading.
                r=self.head(r,s[0],level,
                            self._str(s[1],level and level+1))
            else:
                r=self.normal(r,s[0],self._str(s[1],level))
        return r
553
554
def html_quote(v,
               character_entities=(
                   # '&' must come first so the ampersands introduced by
                   # the later replacements are not quoted again.
                   (re.compile('&'), '&amp;'),
                   (re.compile("<"), '&lt;'),
                   (re.compile(">"), '&gt;'),
                   (re.compile('"'), '&quot;'),
               )):
    """Return str(v) with the HTML special characters replaced by entities.

    v -- any object; it is converted with str() first.
    character_entities -- sequence of (compiled pattern, replacement)
                          pairs applied in order.

    Replaces &, <, > and the double quote.
    """
    text = str(v)
    for pattern, entity in character_entities:
        text = pattern.sub(entity, text)
    return text
566
def html_with_references(text, level=1):
    """Build an HTML formatter for `text` with bracketed references linked.

    text -- the structured text source.
    level -- heading level passed on to HTML.

    Three rewrites are applied before parsing, in this order:
      ".. [name]" at the start of a line becomes a named anchor,
      "[name]" becomes a link to that anchor,
      "[something.html]" becomes a link to that file.

    Returns an HTML instance.
    """
    letters = string.letters
    rewrites = (
        (r'[\0\n]\.\. \[([0-9_%s-]+)\]' % letters,
         r'\n <a name="\1">[\1]</a>'),
        (r'([\x00- ,])\[(?P<ref>[0-9_%s-]+)\]([\x00- ,.:])' % letters,
         r'\1<a href="#\2">[\2]</a>\3'),
        (r'([\0- ,])\[([^]]+)\.html\]([\0- ,.:])',
         r'\1<a href="\2.html">[\2]</a>\3'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    return HTML(text, level=level)
584
585
def main():
    """Command-line entry point: convert structured text to HTML.

    Reads the single file named on the command line, or standard input when
    no file is given, and writes the HTML to standard output.

    Options:
      -w  write a 'Content-Type: text/html' header first
      -l  initialise the locale from the environment
      -t  accepted, but not acted upon here

    When at least one option is given, a leading '#!' line and leading
    blank lines are removed, and output that starts with an <h1> heading is
    wrapped in a complete HTML document titled after that heading.  Without
    options the conversion result is written as is.

    Raises ValueError if more than one file name is given.
    """
    import sys, getopt

    opts, args = getopt.getopt(sys.argv[1:], 'twl')

    if args:
        [infile] = args
        source = open(infile, 'r')
        # Close the file even if reading fails.
        try:
            s = source.read()
        finally:
            source.close()
    else:
        s = sys.stdin.read()

    write = sys.stdout.write

    if not opts:
        write('%s\n' % html_with_references(s))
        return

    flags = [opt[0] for opt in opts]

    if '-w' in flags:
        write('Content-Type: text/html\n\n')

    if '-l' in flags:
        import locale
        locale.setlocale(locale.LC_ALL, "")

    if s[:2] == '#!':
        s = re.sub('^#![^\n]+', '', s)

    # Drop leading blank lines.
    mo = re.compile('([\0-\n]*\n)').match(s)
    if mo is not None:
        s = s[len(mo.group(0)):]

    s = str(html_with_references(s))
    if s[:4] == '<h1>':
        # Use the first heading as the document title.
        t = s[4:s.find('</h1>')]
        s = ('<html><head><title>%s</title>\n'
             '</head><body>\n'
             '%s\n'
             '</body></html>\n') % (t, s)
    write(s + '\n')

if __name__=="__main__": main()