wxPython/docs/bin/docparser/wxhtmlparse.py

   1 import sys, os, string, glob
   2 import re
   3 from docparser.wxclasses import *
   4 import wx
   5
   6
   7 outputdir = "output"
   8
   9 #
  10 # Class REs
  11 #
  12
  13 class_desc_re = """<H2>.*?</H2>(.*?)<B><FONT COLOR="#FF0000">"""
  14 win_styles_re = """<B><FONT COLOR="#FF0000">Window styles</FONT></B><P>(.*?)<B><FONT COLOR="#FF0000">"""
  15 win_styles_extra_re = """<B><FONT COLOR="#FF0000">Extra window styles</FONT></B><P>(.*?)<B><FONT COLOR="#FF0000">"""
  16 win_style_re = """<TR><TD VALIGN=TOP WIDTH=.*?>\s*?<FONT FACE=".*?">\s*?<B>(.*?)</B>\s*?</FONT></TD>\s*?<TD VALIGN=TOP>\s*?<FONT FACE=".*?">(.*?)</FONT></TD></TR>"""
  17 derived_re = """<B><FONT COLOR="#FF0000">Derived from</FONT></B><P>(.*?)<P>"""
  18 derived_class_re = """<A HREF=".*?">(.*?)</A>"""
  19
  20 #
  21 # Method REs
  22 #
  23
  24 # groups - header, description
  25 method_re = "<H3>(.*?)</H3>\s*?<P>(.*?)<HR>"
  26 lastmethod_re = "<H3>(.*?)</H3>\s*?<P>(.*?)\s*?<P>\s*?</FONT>"
  27 headings_re = "<B><FONT COLOR=\"#FF0000\">(.*?)</FONT></B><P>(.*?)"
  28 # groups = param name, param value
  29 param_re = "<I>(.*?)</I><UL><UL>(.*?)</UL></UL>"
  30 # groups - return type, method name, arguments
  31 proto_re = "<B>(.*?)</B>.*?<B>(.*?)</B>\s*?\((.*?)\)"
  32 # groups - arg type, arg name
  33 args_re = "<B>(.*?)</B>.*?<I>(.*?)</I>"
  34 code_re = "<PRE>(.*?)</PRE>"
  35 link_re = "<A href=\"(.*?)\"><B>(.*?)</B></A><BR>"
  36
  37 #
  38 # wxPython/wxPerl note REs
  39 #
  40
  41 wx_re = "wx[A-Z]\S+"
  42 wxperl_overload_re = "<B><FONT COLOR=\"#0000C8\">wxPerl note:</FONT></B> In wxPerl there are two methods instead of a single overloaded method:<P>\s*?<UL><UL>(.*?)</UL></UL>"
  43 wxperl_re = "<B><FONT COLOR=\"#0000C8\">wxPerl note:</FONT></B>(.*?)<P>"
  44
  45 wxpython_constructors_re = """<B><FONT COLOR="#0000C8">wxPython note:</FONT></B> Constructors supported by wxPython are:<P>\s*?<UL><UL>(.*?)</UL></UL>"""
  46 wxpython_overload_re = """<TR><TD VALIGN=TOP.*?>\s*?<FONT FACE=".*?">\s*?<B>(.*?)</B>\s*?</FONT></TD>\s*?<TD VALIGN=TOP>\s*?<FONT FACE=".*?">(.*?)</FONT></TD></TR>"""
  47
  48 wxpython_overloads_re = "<B><FONT COLOR=\"#0000C8\">wxPython note:</FONT></B> In place of a single overloaded method name, wxPython\s*?implements the following methods:<P>\s*?<UL><UL>(.*?)</UL></UL>"
  49 wxpython_re = "<B><FONT COLOR=\"#0000C8\">wxPython note:</FONT></B>(.*?)<P>"
  50
  51
  52 # convert wxWhatever to wx.Whatever
  53 def namespacify_wxClasses(contents):
  54     wx_regex = re.compile(wx_re, re.MULTILINE | re.DOTALL)
  55
  56     result = wx_regex.sub(wxReplaceFunc, contents)
  57     return result
  58
  59 def wxReplaceFunc(match):
  60     text = match.group()
  61     if text.find("wxWidgets") == -1 and text.find("wxPython") == -1 and text.find("wxPerl") == -1:
  62         text = text.replace("wx", "wx.")
  63     return text
  64
  65
  66
  67 # Methods to de-C++itize data.
  68 def pythonize_text(contents):
  69     """
  70     Remove C++isms that definitely shouldn't be in any text.
  71     """
  72     contents = contents.replace("false", "False")
  73     contents = contents.replace("true", "True")
  74     contents = contents.replace("non-NULL", "not None")
  75     contents = contents.replace("NULL", "None")
  76     contents = contents.replace("const ", "")
  77     contents = contents.replace("::", ".")
  78     contents = contents.replace("\r\n", "\n")
  79     contents = contents.replace("\r", "\n")
  80     contents = contents.replace("''", "\"")
  81     return namespacify_wxClasses(contents)
  82
  83 def pythonize_args(contents):
  84     """
  85     Remove C++isms from arguments (some of these terms may be used in other
  86     contexts in actual documentation, so we don't remove them there).
  87     """
  88     contents = contents.replace("static", "")
  89     contents = contents.replace("virtual void", "")
  90     contents = contents.replace("virtual", "")
  91     contents = contents.replace("void*", "int")
  92     contents = contents.replace("void", "")
  93
  94     contents = contents.replace("off_t", "long")
  95     contents = contents.replace("size_t", "long")
  96     contents = contents.replace("*", "")
  97     contents = contents.replace("&amp;", "")
  98     contents = contents.replace("&", "")
  99     contents = contents.replace("char", "string")
 100     contents = contents.replace("wxChar", "string")
 101     contents = contents.replace("wxCoord", "int")
 102     contents = contents.replace("<A HREF=\"wx_wxstring.html#wxstring\">wxString</A>", "string")
 103
 104     return pythonize_text(contents)
 105
 106 def formatMethodProtos(protos):
 107     """
 108     Remove C++isms in the method prototypes.
 109     """
 110     for proto in protos:
 111         proto[0] = pythonize_args(proto[0])
 112         proto[0] = proto[0].strip()
 113
 114         proto[1] = namespacify_wxClasses(proto[1])
 115         for arg in proto[2]:
 116             arg[0] = pythonize_args(arg[0])
 117             arg[0].strip()
 118
 119             # for arg names, we should be more careful about what we replace
 120             arg[1] = pythonize_text(arg[1])
 121             arg[1] = arg[1].replace("*", "")
 122             arg[1] = arg[1].replace("&", "")
 123
 124     return protos
 125
 126
 127
 128 # functions for getting data from methods
 129 def getMethodWxPythonOverrides(text, isConstructor=False):
 130     overloads_re = wxpython_overloads_re
 131     if isConstructor:
 132         overloads_re = wxpython_constructors_re
 133     overload_regex = re.compile(overloads_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 134     match = overload_regex.search(text, 0)
 135     note = ""
 136     start = -1
 137     end = -1
 138     overrides = []
 139     if match:
 140         def getWxPythonOverridesFromMatch(match):
 141             return [namespacify_wxClasses(match.group(1)), pythonize_text(match.group(2))]
 142
 143         start = match.start()
 144         end = match.end()
 145         overrides, returntext = findAllMatches(wxpython_overload_re, match.group(1), getWxPythonOverridesFromMatch)
 146
 147     returntext = text
 148
 149     if start != -1 and end != -1:
 150         #print "note is: " + text[start:end]
 151         returntext = text.replace(text[start:end], "")
 152
 153     return overrides, returntext
 154
 155 def getMethodWxPythonNote(text):
 156     python_regex = re.compile(wxpython_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 157     match = python_regex.search(text)
 158     start = -1
 159     end = -1
 160     note = ""
 161     if match:
 162         start = match.start()
 163         end = match.end()
 164         note = match.group(1)
 165
 166     returntext = text
 167
 168     if start != -1 and end != -1:
 169         #print "note is: " + text[start:end]
 170         returntext = text.replace(text[start:end], "")
 171
 172     return note, returntext
 173
 174 def findAllMatches(re_string, text, handler, start=0):
 175     """
 176     findAllMatches finds matches for a given regex, then runs the handler function
 177     on each match, and returns a list of objects, along with a version of the
 178     text with the area matches were found stripped.
 179     Note the stripping of text is not generally usable yet, it assumes matches
 180     are in continuous blocks, which is true of the wx docs.
 181     """
 182     regex = re.compile(re_string, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 183     match = regex.search(text, start)
 184     results = []
 185
 186     startpoint = -1
 187     endpoint = -1
 188
 189     if match:
 190         startpoint = match.start()
 191
 192     while match:
 193         start = match.end()
 194         results.append(handler(match))
 195         endpoint = match.end()
 196         match = regex.search(text, start)
 197
 198     returntext = text
 199     if startpoint != -1 and endpoint != -1:
 200         returntext = text.replace(text[startpoint:endpoint], "")
 201
 202     return results, returntext
 203
 204 def getMethodParams(text):
 205     paramstart = text.find("<B><FONT COLOR=\"#FF0000\">Parameters</FONT></B><P>")
 206     params, returntext = findAllMatches(param_re, text, getMethodParamsFromMatch, paramstart)
 207
 208     return params, returntext
 209
 210 def getMethodParamsFromMatch(match):
 211     return [match.group(1).strip(), pythonize_text(match.group(2)).strip()]
 212
 213 def getPrototypeFromMatch(match):
 214     return [match.group(1), match.group(2), getProtoArgs(match.group(3))]
 215
 216 def getProtoArgsFromMatch(match):
 217     return [match.group(1), match.group(2)]
 218
 219
 220
 221 # These methods parse the docs, finding matches and then using the FromMatch
 222 # functions to parse the data. After that, the results are "Pythonized"
 223 # by removing C++isms.
 224 def getMethodProtos(text):
 225     protos, returntext = findAllMatches(proto_re, text, getPrototypeFromMatch)
 226     return formatMethodProtos(protos), returntext
 227
 228 def getProtoArgs(text):
 229     args, returntext = findAllMatches(args_re, text, getProtoArgsFromMatch)
 230     return args
 231
 232 def getMethodDesc(text):
 233     heading_text = "<B><FONT COLOR=\"#FF0000\">"
 234     return_text = text
 235     end = text.find(heading_text)
 236     if end != -1:
 237         return_text = text[0:end]
 238
 239     return pythonize_text(return_text)
 240
 241
 242 def removeWxPerlNotes(text):
 243     perl_overload_regex = re.compile(wxperl_overload_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 244     result = perl_overload_regex.sub("", text)
 245
 246     perl_regex = re.compile(wxperl_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 247     result = perl_regex.sub("", result)
 248
 249     return result
 250
 251 def removeCPPCode(text):
 252     code_regex = re.compile(code_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 253
 254     result = code_regex.sub("", text)
 255     return result
 256
 257
 258 def getMethod(match, parent):
 259     name = match.group(1)
 260     if name.find("::") != -1:
 261         name = name.split("::")[1]
 262     name = namespacify_wxClasses(name).strip()
 263     start = match.end()
 264     protos, remainder = getMethodProtos(match.group(2))
 265
 266     isConstructor = False
 267     #print "name: %s, parent name: %s" % (name, parent.name)
 268     if name == parent.name.replace("wx", "wx."):
 269         isConstructor = True
 270     overrides, remainder = getMethodWxPythonOverrides(remainder, isConstructor)
 271
 272     note, remainder = getMethodWxPythonNote(remainder)
 273     params, remainder = getMethodParams(remainder)
 274     desc = getMethodDesc(remainder)
 275     method = wxMethod(name, parent, protos, params, desc)
 276     method.pythonNote = note
 277     method.pythonOverrides = overrides
 278     if len(method.pythonOverrides) > 0:
 279         print "has overrides!\n\n\n\n"
 280     return method
 281
 282 def getClassDerivedFrom(text):
 283
 284     def getDerivedClassesFromMatch(match):
 285         return namespacify_wxClasses(match.group(1))
 286
 287     derived_classes = []
 288     derived_regex = re.compile(derived_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 289     match = derived_regex.search(text)
 290     if match:
 291         derived_classes, returntext = findAllMatches(derived_class_re, match.group(1), getDerivedClassesFromMatch)
 292
 293     return derived_classes
 294
 295 def getClassDescription(text):
 296
 297     def getClassDescriptionFromMatch(match):
 298         return match.group(1)
 299
 300     desc, returntext = findAllMatches(class_desc_re, text, getClassDescriptionFromMatch)
 301
 302     return pythonize_text(desc[0])
 303
 304 def getClassStyles(text, extraStyles=False):
 305     styles_re = win_styles_re
 306     if extraStyles:
 307         styles_re = win_styles_extra_re
 308     styles_regex = re.compile(styles_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 309     match = styles_regex.search(text)
 310
 311     styles = []
 312     if match:
 313         def getClassStyleFromMatch(match):
 314             return [namespacify_wxClasses(match.group(1)), pythonize_text(match.group(2))]
 315
 316         styles, remainder = findAllMatches(win_style_re, match.group(1), getClassStyleFromMatch)
 317
 318     return styles
 319
 320 # Main functions - these drive the process.
 321 def getClassMethods(doc, parent):
 322     contents = open(doc, "rb").read()
 323
 324     # get rid of some particularly tricky parts before parsing
 325     contents = contents.replace("<B>const</B>", "")
 326     contents = removeWxPerlNotes(contents)
 327     contents = removeCPPCode(contents)
 328
 329     method_regex = re.compile(method_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 330     match = method_regex.search(contents)
 331     start = 0
 332     methods = {}
 333     while match:
 334         start = match.end()
 335         newmethod = getMethod(match, parent)
 336         basename = parent.name.replace("wx", "")
 337         isConstructor = (basename == newmethod.name.replace("wx.", ""))
 338         if isConstructor or eval("newmethod.name in dir(wx.%s)" % basename):
 339             print "Adding %s.%s" % (parent.name, newmethod.name)
 340             methods[newmethod.name] = newmethod
 341         match = method_regex.search(contents, start)
 342
 343     lastmethod_regex = re.compile(lastmethod_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 344     match = lastmethod_regex.search(contents, start)
 345     if match:
 346         newmethod = getMethod(match, parent)
 347         basename = parent.name.replace("wx", "")
 348         isConstructor = (basename == newmethod.name.replace("wx.", ""))
 349         if isConstructor or eval("newmethod.name in dir(wx.%s)" % basename):
 350             print "Adding %s.%s" % (parent.name, newmethod.name)
 351             methods[newmethod.name] = newmethod
 352
 353     for name in methods:
 354         if name[0:3] == "Get":
 355             propname = name[3:]
 356             basename = parent.name.replace("wx", "")
 357             if not propname in eval("dir(wx.%s)" % basename):
 358                 parent.props.append(propname)
 359             else:
 360                 parent.propConflicts.append(parent.name + "." + propname)
 361     # get rid of the destructor and operator methods
 362     ignore_methods = ["~" + namespacify_wxClasses(parent.name), "operator ==",
 363                         "operator &lt;&lt;", "operator &gt;&gt;", "operator =",
 364                         "operator !=", "operator*", "operator++" ]
 365     for method in ignore_methods:
 366         if method in methods:
 367             methods.pop(method)
 368
 369     return methods
 370
 371 def getClasses(doc):
 372     global docspath
 373     contents = open(doc, "rb").read()
 374     link_regex = re.compile(link_re, re.MULTILINE | re.DOTALL | re.IGNORECASE)
 375     start = contents.find("<H2>Alphabetical class reference</H2>")
 376     result = link_regex.search(contents, start)
 377     classes = {}
 378     while result:
 379         start = result.end()
 380         name = result.group(2).strip()
 381         classpage = result.group(1).split("#")[0]
 382         basename = name.replace("wx", "")
 383         if basename in dir(wx):
 384             classfile = os.path.join(os.path.dirname(doc), classpage)
 385             classtext = open(classfile, "rb").read()
 386             derivedClasses = getClassDerivedFrom(classtext)
 387             description = getClassDescription(classtext)
 388             styles = getClassStyles(classtext)
 389             extra_styles = getClassStyles(classtext, extraStyles=True)
 390             classes[name] = wxClass(name, description, derivedClasses, styles, extra_styles)
 391             classes[name].methods = getClassMethods(classfile, classes[name])
 392         result = link_regex.search(contents, start)
 393
 394     return classes