wxPython/samples/stxview/StructuredText/ClassicStructuredText.py

   1 #! /usr/bin/env python -- # -*- python -*-
   2 ##############################################################################
   3 #
   4 # Zope Public License (ZPL) Version 1.0
   5 # -------------------------------------
   6 #
   7 # Copyright (c) Digital Creations.  All rights reserved.
   8 #
   9 # This license has been certified as Open Source(tm).
  10 #
  11 # Redistribution and use in source and binary forms, with or without
  12 # modification, are permitted provided that the following conditions are
  13 # met:
  14 #
  15 # 1. Redistributions in source code must retain the above copyright
  16 #    notice, this list of conditions, and the following disclaimer.
  17 #
  18 # 2. Redistributions in binary form must reproduce the above copyright
  19 #    notice, this list of conditions, and the following disclaimer in
  20 #    the documentation and/or other materials provided with the
  21 #    distribution.
  22 #
  23 # 3. Digital Creations requests that attribution be given to Zope
  24 #    in any manner possible. Zope includes a "Powered by Zope"
  25 #    button that is installed by default. While it is not a license
  26 #    violation to remove this button, it is requested that the
  27 #    attribution remain. A significant investment has been put
  28 #    into Zope, and this effort will continue if the Zope community
  29 #    continues to grow. This is one way to assure that growth.
  30 #
  31 # 4. All advertising materials and documentation mentioning
  32 #    features derived from or use of this software must display
  33 #    the following acknowledgement:
  34 #
  35 #      "This product includes software developed by Digital Creations
  36 #      for use in the Z Object Publishing Environment
  37 #      (http://www.zope.org/)."
  38 #
  39 #    In the event that the product being advertised includes an
  40 #    intact Zope distribution (with copyright and license included)
  41 #    then this clause is waived.
  42 #
  43 # 5. Names associated with Zope or Digital Creations must not be used to
  44 #    endorse or promote products derived from this software without
  45 #    prior written permission from Digital Creations.
  46 #
  47 # 6. Modified redistributions of any form whatsoever must retain
  48 #    the following acknowledgment:
  49 #
  50 #      "This product includes software developed by Digital Creations
  51 #      for use in the Z Object Publishing Environment
  52 #      (http://www.zope.org/)."
  53 #
  54 #    Intact (re-)distributions of any official Zope release do not
  55 #    require an external acknowledgement.
  56 #
  57 # 7. Modifications are encouraged but must be packaged separately as
  58 #    patches to official Zope releases.  Distributions that do not
  59 #    clearly separate the patches from the original work must be clearly
  60 #    labeled as unofficial distributions.  Modifications which do not
  61 #    carry the name Zope may be packaged in any form, as long as they
  62 #    conform to all of the clauses above.
  63 #
  64 #
  65 # Disclaimer
  66 #
  67 #   THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
  68 #   EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  69 #   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  70 #   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
  71 #   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  72 #   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  73 #   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  74 #   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  75 #   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  76 #   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  77 #   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  78 #   SUCH DAMAGE.
  79 #
  80 #
  81 # This software consists of contributions made by Digital Creations and
  82 # many individuals on behalf of Digital Creations.  Specific
  83 # attributions are listed in the accompanying credits file.
  84 #
  85 ##############################################################################
  86 '''Structured Text Manipulation
  87
  88 Parse a structured text string into a form that can be used with
  89 structured formats, like html.
  90
  91 Structured text is text that uses indentation and simple
  92 symbology to indicate the structure of a document.
  93
  94 A structured string consists of a sequence of paragraphs separated by
  95 one or more blank lines.  Each paragraph has a level which is defined
  96 as the minimum indentation of the paragraph.  A paragraph is a
  97 sub-paragraph of another paragraph if the other paragraph is the last
  98 preceding paragraph that has a lower level.
  99
 100 Special symbology is used to indicate special constructs:
 101
 102 - A single-line paragraph whose immediately succeeding paragraphs are lower
 103   level is treated as a header.
 104
 105 - A paragraph that begins with a '-', '*', or 'o' is treated as an
 106   unordered list (bullet) element.
 107
 108 - A paragraph that begins with a sequence of digits followed by a
 109   white-space character is treated as an ordered list element.
 110
 111 - A paragraph that begins with a sequence of sequences, where each
 112   sequence is a sequence of digits or a sequence of letters followed
 113   by a period, is treated as an ordered list element.
 114
 115 - A paragraph with a first line that contains some text, followed by
 116   some white-space and '--' is treated as
 117   a descriptive list element. The leading text is treated as the
 118   element title.
 119
  120 - Sub-paragraphs of a paragraph that ends in the word 'example' or the
  121   word 'examples', or '::' are treated as example code and are output as is.
 122
  123 - Text enclosed in single quotes (with white-space to the left of the
 124   first quote and whitespace or punctuation to the right of the second quote)
 125   is treated as example code.
 126
 127 - Text surrounded by '*' characters (with white-space to the left of the
 128   first '*' and whitespace or punctuation to the right of the second '*')
 129   is emphasized.
 130
 131 - Text surrounded by '**' characters (with white-space to the left of the
 132   first '**' and whitespace or punctuation to the right of the second '**')
 133   is made strong.
 134
 135 - Text surrounded by '_' underscore characters (with whitespace to the left
 136   and whitespace or punctuation to the right) is made underlined.
 137
  138 - Text enclosed by double quotes followed by a colon, a URL, and concluded
 139   by punctuation plus white space, *or* just white space, is treated as a
 140   hyper link. For example:
 141
 142     "Zope":http://www.zope.org/ is ...
 143
 144   Is interpreted as '<a href="http://www.zope.org/">Zope</a> is ....'
 145   Note: This works for relative as well as absolute URLs.
 146
 147 - Text enclosed by double quotes followed by a comma, one or more spaces,
 148   an absolute URL and concluded by punctuation plus white space, or just
 149   white space, is treated as a hyper link. For example:
 150
 151     "mail me", mailto:amos@digicool.com.
 152
 153   Is interpreted as '<a href="mailto:amos@digicool.com">mail me</a>.'
 154
 155 - Text enclosed in brackets which consists only of letters, digits,
 156   underscores and dashes is treated as hyper links within the document.
 157   For example:
 158
 159     As demonstrated by Smith [12] this technique is quite effective.
 160
 161   Is interpreted as '... by Smith <a href="#12">[12]</a> this ...'. Together
 162   with the next rule this allows easy coding of references or end notes.
 163
 164 - Text enclosed in brackets which is preceded by the start of a line, two
 165   periods and a space is treated as a named link. For example:
 166
 167     .. [12] "Effective Techniques" Smith, Joe ...
 168
 169   Is interpreted as '<a name="12">[12]</a> "Effective Techniques" ...'.
 170   Together with the previous rule this allows easy coding of references or
 171   end notes.
 172
 173
 174 - A paragraph that has blocks of text enclosed in '||' is treated as a
 175   table. The text blocks correspond to table cells and table rows are
 176   denoted by newlines. By default the cells are center aligned. A cell
 177   can span more than one column by preceding a block of text with an
 178   equivalent number of cell separators '||'. Newlines and '|' cannot
 179   be a part of the cell text. For example:
 180
 181       |||| **Ingredients** ||
 182       || *Name* || *Amount* ||
 183       ||Spam||10||
 184       ||Eggs||3||
 185
 186   is interpreted as::
 187
 188     <TABLE BORDER=1 CELLPADDING=2>
 189      <TR>
 190       <TD ALIGN=CENTER COLSPAN=2> <strong>Ingredients</strong> </TD>
 191      </TR>
 192      <TR>
 193       <TD ALIGN=CENTER COLSPAN=1> <em>Name</em> </TD>
 194       <TD ALIGN=CENTER COLSPAN=1> <em>Amount</em> </TD>
 195      </TR>
 196      <TR>
 197       <TD ALIGN=CENTER COLSPAN=1>Spam</TD>
 198       <TD ALIGN=CENTER COLSPAN=1>10</TD>
 199      </TR>
 200      <TR>
 201       <TD ALIGN=CENTER COLSPAN=1>Eggs</TD>
 202       <TD ALIGN=CENTER COLSPAN=1>3</TD>
 203      </TR>
 204     </TABLE>
 205
 206 '''
 207
 208 import ts_regex
 209 import regex
 210 from ts_regex import gsub
 211 from string import split, join, strip, find
 212 import string,re
 213
 214
 215 def untabify(aString,
 216              indent_tab=ts_regex.compile('\(\n\|^\)\( *\)\t').search_group,
 217              ):
 218     '''\
 219     Convert indentation tabs to spaces.
 220     '''
 221     result=''
 222     rest=aString
 223     while 1:
 224         ts_results = indent_tab(rest, (1,2))
 225         if ts_results:
 226             start, grps = ts_results
 227             lnl=len(grps[0])
 228             indent=len(grps[1])
 229             result=result+rest[:start]
 230             rest="\n%s%s" % (' ' * ((indent/8+1)*8),
 231                              rest[start+indent+1+lnl:])
 232         else:
 233             return result+rest
 234
 235 def indent(aString, indent=2):
 236     """Indent a string the given number of spaces"""
 237     r=split(untabify(aString),'\n')
 238     if not r: return ''
 239     if not r[-1]: del r[-1]
 240     tab=' '*level
 241     return "%s%s\n" % (tab,join(r,'\n'+tab))
 242
 243 def reindent(aString, indent=2, already_untabified=0):
 244     "reindent a block of text, so that the minimum indent is as given"
 245
 246     if not already_untabified: aString=untabify(aString)
 247
 248     l=indent_level(aString)[0]
 249     if indent==l: return aString
 250
 251     r=[]
 252
 253     append=r.append
 254
 255     if indent > l:
 256         tab=' ' * (indent-l)
 257         for s in split(aString,'\n'): append(tab+s)
 258     else:
 259         l=l-indent
 260         for s in split(aString,'\n'): append(s[l:])
 261
 262     return join(r,'\n')
 263
 264 def indent_level(aString,
 265                  indent_space=ts_regex.compile('\n\( *\)').search_group,
 266                  ):
 267     '''\
 268     Find the minimum indentation for a string, not counting blank lines.
 269     '''
 270     start=0
 271     text='\n'+aString
 272     indent=l=len(text)
 273     while 1:
 274
 275         ts_results = indent_space(text, (1,2), start)
 276         if ts_results:
 277             start, grps = ts_results
 278             i=len(grps[0])
 279             start=start+i+1
 280             if start < l and text[start] != '\n':       # Skip blank lines
 281                 if not i: return (0,aString)
 282                 if i < indent: indent = i
 283         else:
 284             return (indent,aString)
 285
 286 def paragraphs(list,start):
 287     l=len(list)
 288     level=list[start][0]
 289     i=start+1
 290     while i < l and list[i][0] > level: i=i+1
 291     return i-1-start
 292
 293 def structure(list):
 294     if not list: return []
 295     i=0
 296     l=len(list)
 297     r=[]
 298     while i < l:
 299         sublen=paragraphs(list,i)
 300         i=i+1
 301         r.append((list[i-1][1],structure(list[i:i+sublen])))
 302         i=i+sublen
 303     return r
 304
 305
 306 class Table:
 307     CELL='  <TD ALIGN=CENTER COLSPAN=%i>%s</TD>\n'
 308     ROW=' <TR>\n%s </TR>\n'
 309     TABLE='\n<TABLE BORDER=1 CELLPADDING=2>\n%s</TABLE>'
 310
 311     def create(self,aPar,
 312         td_reg=re.compile(r'[ \t\n]*\|\|([^\0x00|]*)')
 313         ):
 314         '''parses a table and returns nested list representing the
 315         table'''
 316         self.table=[]
 317         text=filter(None,split(aPar,'\n'))
 318         for line in text:
 319             row=[]
 320             while 1:
 321                 mo =  td_reg.match(line)
 322                 if not mo: return 0
 323                 pos = mo.end(1)
 324                 row.append(mo.group(1))
 325                 if pos==len(line):break
 326                 line=line[pos:]
 327             self.table.append(row)
 328         return 1
 329
 330     def html(self):
 331         '''Creates an HTML representation of table'''
 332         htmltable=[]
 333         for row in self.table:
 334             htmlrow=[]
 335             colspan=1
 336             for cell in row:
 337                 if cell=='':
 338                     colspan=colspan+1
 339                     continue
 340                 else:
 341                     htmlrow.append(self.CELL%(colspan,cell))
 342                     colspan=1
 343             htmltable.append(self.ROW%join(htmlrow,''))
 344         return self.TABLE%join(htmltable,'')
 345
 346 table=Table()
 347
 348 class StructuredText:
 349
 350     """Model text as structured collection of paragraphs.
 351
 352     Structure is implied by the indentation level.
 353
 354     This class is intended as a base classes that do actual text
 355     output formatting.
 356     """
 357
 358     def __init__(self, aStructuredString, level=0,
 359                  paragraph_divider=regex.compile('\(\r?\n *\)+\r?\n'),
 360                  ):
 361         '''Convert a structured text string into a structured text object.
 362
 363         Aguments:
 364
 365           aStructuredString -- The string to be parsed.
 366           level -- The level of top level headings to be created.
 367         '''
 368
 369
 370         pat = ' \"([%s0-9-_,./?=@~&]*)\":' % string.letters+ \
 371               '([-:%s0-9_,./?=@#~&]*?)' % string.letters + \
 372               '([.:?;] )'
 373
 374         p_reg = re.compile(pat,re.M)
 375
 376         aStructuredString = p_reg.sub(r'<a href="\2">\1</a>\3 ' , aStructuredString)
 377
 378         pat = ' \"([%s0-9-_,./?=@~&]*)\", ' % string.letters+ \
 379               '([-:%s0-9_,./?=@#~&]*?)' % string.letters + \
 380               '([.:?;] )'
 381
 382         p_reg = re.compile(pat,re.M)
 383
 384         aStructuredString = p_reg.sub(r'<a href="\2">\1</a>\3 ' , aStructuredString)
 385
 386
 387         protoless = find(aStructuredString, '<a href=":')
 388         if protoless != -1:
 389             aStructuredString = re.sub('<a href=":', '<a href="',
 390                                      aStructuredString)
 391
 392         self.level=level
 393         paragraphs=ts_regex.split(untabify(aStructuredString),
 394                                   paragraph_divider)
 395         paragraphs=map(indent_level,paragraphs)
 396
 397         self.structure=structure(paragraphs)
 398
 399
 400     def __str__(self):
 401         return str(self.structure)
 402
 403
 404 ctag_prefix=r'([\x00- \\(]|^)'
 405 ctag_suffix=r'([\x00- ,.:;!?\\)]|$)'
 406 ctag_middle=r'[%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s]'
 407 ctag_middl2=r'[%s][%s]([^\x00- %s][^%s]*[^\x00- %s]|[^%s])[%s][%s]'
 408
 409 def ctag(s,
 410          em=re.compile(
 411              ctag_prefix+(ctag_middle % (("*",)*6) )+ctag_suffix),
 412          strong=re.compile(
 413              ctag_prefix+(ctag_middl2 % (("*",)*8))+ctag_suffix),
 414          under=re.compile(
 415              ctag_prefix+(ctag_middle % (("_",)*6) )+ctag_suffix),
 416          code=re.compile(
 417              ctag_prefix+(ctag_middle % (("\'",)*6))+ctag_suffix),
 418          ):
 419     if s is None: s=''
 420     s=strong.sub(r'\1<strong>\2</strong>\3',s)
 421     s=under.sub( r'\1<u>\2</u>\3',s)
 422     s=code.sub(  r'\1<code>\2</code>\3',s)
 423     s=em.sub(    r'\1<em>\2</em>\3',s)
 424     return s
 425
 426 class HTML(StructuredText):
 427
 428     '''\
 429     An HTML structured text formatter.
 430     '''\
 431
 432     def __str__(self,
 433                 extra_dl=re.compile("</dl>\n<dl>"),
 434                 extra_ul=re.compile("</ul>\n<ul>"),
 435                 extra_ol=re.compile("</ol>\n<ol>"),
 436                 ):
 437         '''\
 438         Return an HTML string representation of the structured text data.
 439
 440         '''
 441         s=self._str(self.structure,self.level)
 442         s=extra_dl.sub('\n',s)
 443         s=extra_ul.sub('\n',s)
 444         s=extra_ol.sub('\n',s)
 445         return s
 446
 447     def ul(self, before, p, after):
 448         if p: p="<p>%s</p>" % strip(ctag(p))
 449         return ('%s<ul><li>%s\n%s\n</li></ul>\n'
 450                 % (before,p,after))
 451
 452     def ol(self, before, p, after):
 453         if p: p="<p>%s</p>" % strip(ctag(p))
 454         return ('%s<ol><li>%s\n%s\n</li></ol>\n'
 455                 % (before,p,after))
 456
 457     def dl(self, before, t, d, after):
 458         return ('%s<dl><dt>%s</dt><dd><p>%s</p>\n%s\n</dd></dl>\n'
 459                 % (before,ctag(t),ctag(d),after))
 460
 461     def head(self, before, t, level, d):
 462         if level > 0 and level < 6:
 463             return ('%s<h%d>%s</h%d>\n%s\n'
 464                     % (before,level,strip(ctag(t)),level,d))
 465
 466         t="<p><strong>%s</strong></p>" % strip(ctag(t))
 467         return ('%s<dl><dt>%s\n</dt><dd>%s\n</dd></dl>\n'
 468                 % (before,t,d))
 469
 470     def normal(self,before,p,after):
 471         return '%s<p>%s</p>\n%s\n' % (before,ctag(p),after)
 472
 473     def pre(self,structure,tagged=0):
 474         if not structure: return ''
 475         if tagged:
 476             r=''
 477         else:
 478             r='<PRE>\n'
 479         for s in structure:
 480             r="%s%s\n\n%s" % (r,html_quote(s[0]),self.pre(s[1],1))
 481         if not tagged: r=r+'</PRE>\n'
 482         return r
 483
 484     def table(self,before,table,after):
 485         return '%s<p>%s</p>\n%s\n' % (before,ctag(table),after)
 486
 487     def _str(self,structure,level,
 488              # Static
 489              bullet=ts_regex.compile('[ \t\n]*[o*-][ \t\n]+\([^\0]*\)'
 490                                      ).match_group,
 491              example=ts_regex.compile('[\0- ]examples?:[\0- ]*$'
 492                                       ).search,
 493              dl=ts_regex.compile('\([^\n]+\)[ \t]+--[ \t\n]+\([^\0]*\)'
 494                                  ).match_group,
 495              nl=ts_regex.compile('\n').search,
 496              ol=ts_regex.compile(
 497                  '[ \t]*\(\([0-9]+\|[%s]+\)[.)]\)+[ \t\n]+\([^\0]*\|$\)' % string.letters
 498                  ).match_group,
 499              olp=ts_regex.compile('[ \t]*([0-9]+)[ \t\n]+\([^\0]*\|$\)'
 500                                   ).match_group,
 501              ):
 502         r=''
 503         for s in structure:
 504
 505             ts_results = bullet(s[0], (1,))
 506             if ts_results:
 507                 p = ts_results[1]
 508                 if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
 509                 else: ps=self._str(s[1],level)
 510                 r=self.ul(r,p,ps)
 511                 continue
 512             ts_results = ol(s[0], (3,))
 513             if ts_results:
 514                 p = ts_results[1]
 515                 if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
 516                 else: ps=self._str(s[1],level)
 517                 r=self.ol(r,p,ps)
 518                 continue
 519             ts_results = olp(s[0], (1,))
 520             if ts_results:
 521                 p = ts_results[1]
 522                 if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1])
 523                 else: ps=self._str(s[1],level)
 524                 r=self.ol(r,p,ps)
 525                 continue
 526             ts_results = dl(s[0], (1,2))
 527             if ts_results:
 528                 t,d = ts_results[1]
 529                 r=self.dl(r,t,d,self._str(s[1],level))
 530                 continue
 531             if example(s[0]) >= 0 and s[1]:
 532                 # Introduce an example, using pre tags:
 533                 r=self.normal(r,s[0],self.pre(s[1]))
 534                 continue
 535             if s[0][-2:]=='::' and s[1]:
 536                 # Introduce an example, using pre tags:
 537                 r=self.normal(r,s[0][:-1],self.pre(s[1]))
 538                 continue
 539             if table.create(s[0]):
 540                 ## table support.
 541                 r=self.table(r,table.html(),self._str(s[1],level))
 542                 continue
 543             else:
 544
 545                 if nl(s[0]) < 0 and s[1] and s[0][-1:] != ':':
 546                     # Treat as a heading
 547                     t=s[0]
 548                     r=self.head(r,t,level,
 549                                 self._str(s[1],level and level+1))
 550                 else:
 551                     r=self.normal(r,s[0],self._str(s[1],level))
 552         return r
 553
 554
 555 def html_quote(v,
 556                character_entities=(
 557                        (re.compile('&'), '&amp;'),
 558                        (re.compile("<"), '&lt;' ),
 559                        (re.compile(">"), '&gt;' ),
 560                        (re.compile('"'), '&quot;')
 561                        )): #"
 562         text=str(v)
 563         for re,name in character_entities:
 564             text=re.sub(name,text)
 565         return text
 566
 567 def html_with_references(text, level=1):
 568     text = re.sub(
 569         r'[\0\n]\.\. \[([0-9_%s-]+)\]' % string.letters,
 570         r'\n  <a name="\1">[\1]</a>',
 571         text)
 572
 573     text = re.sub(
 574         r'([\x00- ,])\[(?P<ref>[0-9_%s-]+)\]([\x00- ,.:])'   % string.letters,
 575         r'\1<a href="#\2">[\2]</a>\3',
 576         text)
 577
 578     text = re.sub(
 579         r'([\0- ,])\[([^]]+)\.html\]([\0- ,.:])',
 580         r'\1<a href="\2.html">[\2]</a>\3',
 581         text)
 582
 583     return HTML(text,level=level)
 584
 585
 586 def main():
 587     import sys, getopt
 588
 589     opts,args=getopt.getopt(sys.argv[1:],'twl')
 590
 591     if args:
 592         [infile]=args
 593         s=open(infile,'r').read()
 594     else:
 595         s=sys.stdin.read()
 596
 597     if opts:
 598
 599         if filter(lambda o: o[0]=='-w', opts):
 600             print 'Content-Type: text/html\n'
 601
 602         if filter(lambda o: o[0]=='-l', opts):
 603             import locale
 604             locale.setlocale(locale.LC_ALL,"")
 605
 606         if s[:2]=='#!':
 607             s=re.sub('^#![^\n]+','',s)
 608
 609         mo = re.compile('([\0-\n]*\n)').match(s)
 610         if mo is not None:
 611             s = s[len(mo.group(0)) :]
 612
 613         s=str(html_with_references(s))
 614         if s[:4]=='<h1>':
 615             t=s[4:find(s,'</h1>')]
 616             s='''<html><head><title>%s</title>
 617             </head><body>
 618             %s
 619             </body></html>
 620             ''' % (t,s)
 621         print s
 622     else:
 623         print html_with_references(s)
 624
 625 if __name__=="__main__": main()