clang/docs/tools/dump_ast_matchers.py

   1 #!/usr/bin/env python3
   2 # A tool to parse ASTMatchers.h and update the documentation in
   3 # ../LibASTMatchersReference.html automatically. Run from the
   4 # directory in which this file is located to update the docs.
   5
   6 import collections
   7 import re
   8
   9 try:
  10     from urllib.request import urlopen
  11 except ImportError:
  12     from urllib2 import urlopen
  13
# Download the doxygen class index once at start-up. esc() searches this page
# to decide whether a node type has a class page worth linking to.
CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
try:
    CLASS_INDEX_PAGE = urlopen(CLASS_INDEX_PAGE_URL).read().decode("utf-8")
except Exception as e:
    # Best effort: keep going without the index. When CLASS_INDEX_PAGE is None,
    # esc() assumes every candidate class page exists and links to it.
    CLASS_INDEX_PAGE = None
    print("Unable to get %s: %s" % (CLASS_INDEX_PAGE_URL, e))
  20
# Header that is parsed for matcher definitions. The path is relative, so the
# script has to be run from the directory in which this file is located.
MATCHERS_FILE = "../../include/clang/ASTMatchers/ASTMatchers.h"

# Each matcher is documented in one row of the form:
#   result | name | args
# The subsequent row contains the documentation and is hidden by default,
# becoming visible via javascript when the user clicks the matcher name.
# The template is filled in by add_matcher() with the keys result, id, name,
# args and comment.
TD_TEMPLATE = """
<tr><td>%(result)s</td><td class="name" onclick="toggle('%(id)s')"><a name="%(id)sAnchor">%(name)s</a></td><td>%(args)s</td></tr>
<tr><td colspan="4" class="doc" id="%(id)s"><pre>%(comment)s</pre></td></tr>
"""

# We categorize the matchers into these three categories in the reference.
# Each dict maps a lookup key built by add_matcher() to the rendered html rows
# for that matcher; sort_table() emits the rows ordered by key.
node_matchers = {}
narrowing_matchers = {}
traversal_matchers = {}

# We output multiple rows per matcher if the matcher can be used on multiple
# node types. Thus, we need a new id per row to control the documentation
# pop-up. ids[name] holds the next unused numeric suffix for that matcher
# name (starting at 0).
ids = collections.defaultdict(int)

# Cache for doxygen urls we have already verified: maps a class page url to
# True/False depending on whether esc() decided the page exists.
doxygen_probes = {}
  44
  45
  46 def esc(text):
  47     """Escape any html in the given text."""
  48     text = re.sub(r"&", "&amp;", text)
  49     text = re.sub(r"<", "&lt;", text)
  50     text = re.sub(r">", "&gt;", text)
  51
  52     def link_if_exists(m):
  53         """Wrap a likely AST node name in a link to its clang docs.
  54
  55         We want to do this only if the page exists, in which case it will be
  56         referenced from the class index page.
  57         """
  58         name = m.group(1)
  59         url = "https://clang.llvm.org/doxygen/classclang_1_1%s.html" % name
  60         if url not in doxygen_probes:
  61             search_str = 'href="classclang_1_1%s.html"' % name
  62             if CLASS_INDEX_PAGE is not None:
  63                 doxygen_probes[url] = search_str in CLASS_INDEX_PAGE
  64             else:
  65                 doxygen_probes[url] = True
  66             if not doxygen_probes[url]:
  67                 print("Did not find %s in class index page" % name)
  68         if doxygen_probes[url]:
  69             return r'Matcher&lt;<a href="%s">%s</a>&gt;' % (url, name)
  70         else:
  71             return m.group(0)
  72
  73     text = re.sub(r"Matcher&lt;([^\*&]+)&gt;", link_if_exists, text)
  74     return text
  75
  76
  77 def extract_result_types(comment):
  78     """Extracts a list of result types from the given comment.
  79
  80     We allow annotations in the comment of the matcher to specify what
  81     nodes a matcher can match on. Those comments have the form:
  82       Usable as: Any Matcher | (Matcher<T1>[, Matcher<t2>[, ...]])
  83
  84     Returns ['*'] in case of 'Any Matcher', or ['T1', 'T2', ...].
  85     Returns the empty list if no 'Usable as' specification could be
  86     parsed.
  87     """
  88     result_types = []
  89     m = re.search(r"Usable as: Any Matcher[\s\n]*$", comment, re.S)
  90     if m:
  91         return ["*"]
  92     while True:
  93         m = re.match(r"^(.*)Matcher<([^>]+)>\s*,?[\s\n]*$", comment, re.S)
  94         if not m:
  95             if re.search(r"Usable as:\s*$", comment):
  96                 return result_types
  97             else:
  98                 return None
  99         result_types += [m.group(2)]
 100         comment = m.group(1)
 101
 102
 103 def strip_doxygen(comment):
 104     """Returns the given comment without \-escaped words."""
 105     # If there is only a doxygen keyword in the line, delete the whole line.
 106     comment = re.sub(r"^\\[^\s]+\n", r"", comment, flags=re.M)
 107
 108     # If there is a doxygen \see command, change the \see prefix into "See also:".
 109     # FIXME: it would be better to turn this into a link to the target instead.
 110     comment = re.sub(r"\\see", r"See also:", comment)
 111
 112     # Delete the doxygen command and the following whitespace.
 113     comment = re.sub(r"\\[^\s]+\s+", r"", comment)
 114     return comment
 115
 116
 117 def unify_arguments(args):
 118     """Gets rid of anything the user doesn't care about in the argument list."""
 119     args = re.sub(r"internal::", r"", args)
 120     args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
 121     args = re.sub(r"&", r" ", args)
 122     args = re.sub(r"(^|\s)M\d?(\s)", r"\1Matcher<*>\2", args)
 123     args = re.sub(r"BindableMatcher", r"Matcher", args)
 124     args = re.sub(r"const Matcher", r"Matcher", args)
 125     return args
 126
 127
 128 def unify_type(result_type):
 129     """Gets rid of anything the user doesn't care about in the type name."""
 130     result_type = re.sub(
 131         r"^internal::(Bindable)?Matcher<([a-zA-Z_][a-zA-Z0-9_]*)>$", r"\2", result_type
 132     )
 133     return result_type
 134
 135
 136 def add_matcher(result_type, name, args, comment, is_dyncast=False):
 137     """Adds a matcher to one of our categories."""
 138     if name == "id":
 139         # FIXME: Figure out whether we want to support the 'id' matcher.
 140         return
 141     matcher_id = "%s%d" % (name, ids[name])
 142     ids[name] += 1
 143     args = unify_arguments(args)
 144     result_type = unify_type(result_type)
 145
 146     docs_result_type = esc("Matcher<%s>" % result_type)
 147
 148     if name == "mapAnyOf":
 149         args = "nodeMatcherFunction..."
 150         docs_result_type = "<em>unspecified</em>"
 151
 152     matcher_html = TD_TEMPLATE % {
 153         "result": docs_result_type,
 154         "name": name,
 155         "args": esc(args),
 156         "comment": esc(strip_doxygen(comment)),
 157         "id": matcher_id,
 158     }
 159     if is_dyncast:
 160         dict = node_matchers
 161         lookup = result_type + name
 162     # Use a heuristic to figure out whether a matcher is a narrowing or
 163     # traversal matcher. By default, matchers that take other matchers as
 164     # arguments (and are not node matchers) do traversal. We specifically
 165     # exclude known narrowing matchers that also take other matchers as
 166     # arguments.
 167     elif "Matcher<" not in args or name in [
 168         "allOf",
 169         "anyOf",
 170         "anything",
 171         "unless",
 172         "mapAnyOf",
 173     ]:
 174         dict = narrowing_matchers
 175         lookup = result_type + name + esc(args)
 176     else:
 177         dict = traversal_matchers
 178         lookup = result_type + name + esc(args)
 179
 180     if dict.get(lookup) is None or len(dict.get(lookup)) < len(matcher_html):
 181         dict[lookup] = matcher_html
 182
 183
 184 def act_on_decl(declaration, comment, allowed_types):
 185     """Parse the matcher out of the given declaration and comment.
 186
 187     If 'allowed_types' is set, it contains a list of node types the matcher
 188     can match on, as extracted from the static type asserts in the matcher
 189     definition.
 190     """
 191     if declaration.strip():
 192
 193         if re.match(r"^\s?(#|namespace|using|template <typename NodeType> using|})", declaration):
 194             return
 195
 196         # Node matchers are defined by writing:
 197         #   VariadicDynCastAllOfMatcher<ResultType, ArgumentType> name;
 198         m = re.match(
 199             r""".*Variadic(?:DynCast)?AllOfMatcher\s*<
 200                        \s*([^\s,]+)\s*(?:,
 201                        \s*([^\s>]+)\s*)?>
 202                        \s*([^\s;]+)\s*;\s*$""",
 203             declaration,
 204             flags=re.X,
 205         )
 206         if m:
 207             result, inner, name = m.groups()
 208             if not inner:
 209                 inner = result
 210             add_matcher(
 211                 result, name, "Matcher<%s>..." % inner, comment, is_dyncast=True
 212             )
 213             return
 214
 215         # Special case of type matchers:
 216         #   AstTypeMatcher<ArgumentType> name
 217         m = re.match(
 218             r""".*AstTypeMatcher\s*<
 219                        \s*([^\s>]+)\s*>
 220                        \s*([^\s;]+)\s*;\s*$""",
 221             declaration,
 222             flags=re.X,
 223         )
 224         if m:
 225             inner, name = m.groups()
 226             add_matcher(
 227                 "Type", name, "Matcher<%s>..." % inner, comment, is_dyncast=True
 228             )
 229             # FIXME: re-enable once we have implemented casting on the TypeLoc
 230             # hierarchy.
 231             # add_matcher('TypeLoc', '%sLoc' % name, 'Matcher<%sLoc>...' % inner,
 232             #             comment, is_dyncast=True)
 233             return
 234
 235         # Parse the various matcher definition macros.
 236         m = re.match(
 237             """.*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\(
 238                        \s*([^\s,]+\s*),
 239                        \s*(?:[^\s,]+\s*),
 240                        \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
 241                      \)\s*;\s*$""",
 242             declaration,
 243             flags=re.X,
 244         )
 245         if m:
 246             loc, name, results = m.groups()[0:3]
 247             result_types = [r.strip() for r in results.split(",")]
 248
 249             comment_result_types = extract_result_types(comment)
 250             if comment_result_types and sorted(result_types) != sorted(
 251                 comment_result_types
 252             ):
 253                 raise Exception("Inconsistent documentation for: %s" % name)
 254             for result_type in result_types:
 255                 add_matcher(result_type, name, "Matcher<Type>", comment)
 256                 # if loc:
 257                 #   add_matcher('%sLoc' % result_type, '%sLoc' % name, 'Matcher<TypeLoc>',
 258                 #               comment)
 259             return
 260
 261         m = re.match(
 262             r"""^\s*AST_POLYMORPHIC_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
 263                           \s*([^\s,]+)\s*,
 264                           \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
 265                        (?:,\s*([^\s,]+)\s*
 266                           ,\s*([^\s,]+)\s*)?
 267                        (?:,\s*([^\s,]+)\s*
 268                           ,\s*([^\s,]+)\s*)?
 269                        (?:,\s*\d+\s*)?
 270                       \)\s*{\s*$""",
 271             declaration,
 272             flags=re.X,
 273         )
 274
 275         if m:
 276             p, n, name, results = m.groups()[0:4]
 277             args = m.groups()[4:]
 278             result_types = [r.strip() for r in results.split(",")]
 279             if allowed_types and allowed_types != result_types:
 280                 raise Exception("Inconsistent documentation for: %s" % name)
 281             if n not in ["", "2"]:
 282                 raise Exception('Cannot parse "%s"' % declaration)
 283             args = ", ".join(
 284                 "%s %s" % (args[i], args[i + 1])
 285                 for i in range(0, len(args), 2)
 286                 if args[i]
 287             )
 288             for result_type in result_types:
 289                 add_matcher(result_type, name, args, comment)
 290             return
 291
 292         m = re.match(
 293             r"""^\s*AST_POLYMORPHIC_MATCHER_REGEX(?:_OVERLOAD)?\(
 294                           \s*([^\s,]+)\s*,
 295                           \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),
 296                           \s*([^\s,]+)\s*
 297                        (?:,\s*\d+\s*)?
 298                       \)\s*{\s*$""",
 299             declaration,
 300             flags=re.X,
 301         )
 302
 303         if m:
 304             name, results, arg_name = m.groups()[0:3]
 305             result_types = [r.strip() for r in results.split(",")]
 306             if allowed_types and allowed_types != result_types:
 307                 raise Exception("Inconsistent documentation for: %s" % name)
 308             arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
 309             comment += """
 310 If the matcher is used in clang-query, RegexFlags parameter
 311 should be passed as a quoted string. e.g: "NoFlags".
 312 Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
 313 """
 314             for result_type in result_types:
 315                 add_matcher(result_type, name, arg, comment)
 316             return
 317
 318         m = re.match(
 319             r"""^\s*AST_MATCHER_FUNCTION(_P)?(.?)(?:_OVERLOAD)?\(
 320                        (?:\s*([^\s,]+)\s*,)?
 321                           \s*([^\s,]+)\s*
 322                        (?:,\s*([^\s,]+)\s*
 323                           ,\s*([^\s,]+)\s*)?
 324                        (?:,\s*([^\s,]+)\s*
 325                           ,\s*([^\s,]+)\s*)?
 326                        (?:,\s*\d+\s*)?
 327                       \)\s*{\s*$""",
 328             declaration,
 329             flags=re.X,
 330         )
 331         if m:
 332             p, n, result, name = m.groups()[0:4]
 333             args = m.groups()[4:]
 334             if n not in ["", "2"]:
 335                 raise Exception('Cannot parse "%s"' % declaration)
 336             args = ", ".join(
 337                 "%s %s" % (args[i], args[i + 1])
 338                 for i in range(0, len(args), 2)
 339                 if args[i]
 340             )
 341             add_matcher(result, name, args, comment)
 342             return
 343
 344         m = re.match(
 345             r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
 346                        (?:\s*([^\s,]+)\s*,)?
 347                           \s*([^\s,]+)\s*
 348                        (?:,\s*([^,]+)\s*
 349                           ,\s*([^\s,]+)\s*)?
 350                        (?:,\s*([^\s,]+)\s*
 351                           ,\s*([^\s,]+)\s*)?
 352                        (?:,\s*\d+\s*)?
 353                       \)\s*{""",
 354             declaration,
 355             flags=re.X,
 356         )
 357         if m:
 358             p, n, result, name = m.groups()[0:4]
 359             args = m.groups()[4:]
 360             if not result:
 361                 if not allowed_types:
 362                     raise Exception("Did not find allowed result types for: %s" % name)
 363                 result_types = allowed_types
 364             else:
 365                 result_types = [result]
 366             if n not in ["", "2"]:
 367                 raise Exception('Cannot parse "%s"' % declaration)
 368             args = ", ".join(
 369                 "%s %s" % (args[i], args[i + 1])
 370                 for i in range(0, len(args), 2)
 371                 if args[i]
 372             )
 373             for result_type in result_types:
 374                 add_matcher(result_type, name, args, comment)
 375             return
 376
 377         m = re.match(
 378             r"""^\s*AST_MATCHER_REGEX(?:_OVERLOAD)?\(
 379                        \s*([^\s,]+)\s*,
 380                        \s*([^\s,]+)\s*,
 381                        \s*([^\s,]+)\s*
 382                        (?:,\s*\d+\s*)?
 383                       \)\s*{""",
 384             declaration,
 385             flags=re.X,
 386         )
 387         if m:
 388             result, name, arg_name = m.groups()[0:3]
 389             if not result:
 390                 if not allowed_types:
 391                     raise Exception("Did not find allowed result types for: %s" % name)
 392                 result_types = allowed_types
 393             else:
 394                 result_types = [result]
 395             arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
 396             comment += """
 397 If the matcher is used in clang-query, RegexFlags parameter
 398 should be passed as a quoted string. e.g: "NoFlags".
 399 Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
 400 """
 401
 402             for result_type in result_types:
 403                 add_matcher(result_type, name, arg, comment)
 404             return
 405
 406         # Parse ArgumentAdapting matchers.
 407         m = re.match(
 408             r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*
 409               ([a-zA-Z]*);$""",
 410             declaration,
 411             flags=re.X,
 412         )
 413         if m:
 414             name = m.groups()[0]
 415             add_matcher("*", name, "Matcher<*>", comment)
 416             return
 417
 418         # Parse Variadic functions.
 419         m = re.match(
 420             r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
 421               ([a-zA-Z]*);$""",
 422             declaration,
 423             flags=re.X,
 424         )
 425         if m:
 426             result, arg, name = m.groups()[:3]
 427             add_matcher(result, name, "%s, ..., %s" % (arg, arg), comment)
 428             return
 429
 430         m = re.match(
 431             r"""^.*internal::VariadicFunction\s*<\s*
 432               internal::PolymorphicMatcher<[\S\s]+
 433               AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),\s*(.*);$""",
 434             declaration,
 435             flags=re.X,
 436         )
 437
 438         if m:
 439             results, trailing = m.groups()
 440             trailing, name = trailing.rsplit(">", 1)
 441             name = name.strip()
 442             trailing, _ = trailing.rsplit(",", 1)
 443             _, arg = trailing.rsplit(",", 1)
 444             arg = arg.strip()
 445
 446             result_types = [r.strip() for r in results.split(",")]
 447             for result_type in result_types:
 448                 add_matcher(result_type, name, "%s, ..., %s" % (arg, arg), comment)
 449             return
 450
 451         # Parse Variadic operator matchers.
 452         m = re.match(
 453             r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s*
 454               ([a-zA-Z]*);$""",
 455             declaration,
 456             flags=re.X,
 457         )
 458         if m:
 459             min_args, max_args, name = m.groups()[:3]
 460             if max_args == "1":
 461                 add_matcher("*", name, "Matcher<*>", comment)
 462                 return
 463             elif max_args == "std::numeric_limits<unsigned>::max()":
 464                 add_matcher("*", name, "Matcher<*>, ..., Matcher<*>", comment)
 465                 return
 466
 467         m = re.match(
 468             r"""^.*MapAnyOfMatcher<.*>\s*
 469               ([a-zA-Z]*);$""",
 470             declaration,
 471             flags=re.X,
 472         )
 473         if m:
 474             name = m.groups()[0]
 475             add_matcher("*", name, "Matcher<*>...Matcher<*>", comment)
 476             return
 477
 478         # Parse free standing matcher functions, like:
 479         #   Matcher<ResultType> Name(Matcher<ArgumentType> InnerMatcher) {
 480         m = re.match(
 481             r"""^\s*(?:template\s+<\s*(?:class|typename)\s+(.+)\s*>\s+)?
 482                      (.*)\s+
 483                      ([^\s\(]+)\s*\(
 484                      (.*)
 485                      \)\s*{""",
 486             declaration,
 487             re.X,
 488         )
 489         if m:
 490             template_name, result, name, args = m.groups()
 491             if template_name:
 492                 matcherTemplateArgs = re.findall(
 493                     r"Matcher<\s*(%s)\s*>" % template_name, args
 494                 )
 495                 templateArgs = re.findall(
 496                     r"(?:^|[\s,<])(%s)(?:$|[\s,>])" % template_name, args
 497                 )
 498                 if len(matcherTemplateArgs) < len(templateArgs):
 499                     # The template name is used naked, so don't replace with `*`` later on
 500                     template_name = None
 501                 else:
 502                     args = re.sub(
 503                         r"(^|[\s,<])%s($|[\s,>])" % template_name, r"\1*\2", args
 504                     )
 505             args = ", ".join(p.strip() for p in args.split(","))
 506             m = re.match(r"(?:^|.*\s+)internal::(?:Bindable)?Matcher<([^>]+)>$", result)
 507             if m:
 508                 result_types = [m.group(1)]
 509                 if (
 510                     template_name
 511                     and len(result_types) == 1
 512                     and result_types[0] == template_name
 513                 ):
 514                     result_types = ["*"]
 515             else:
 516                 result_types = extract_result_types(comment)
 517             if not result_types:
 518                 if not comment:
 519                     # Only overloads don't have their own doxygen comments; ignore those.
 520                     print('Ignoring "%s"' % name)
 521                 else:
 522                     print('Cannot determine result type for "%s"' % name)
 523             else:
 524                 for result_type in result_types:
 525                     add_matcher(result_type, name, args, comment)
 526         else:
 527             print('*** Unparsable: "' + declaration + '" ***')
 528
 529
 530 def sort_table(matcher_type, matcher_map):
 531     """Returns the sorted html table for the given row map."""
 532     table = ""
 533     for key in sorted(matcher_map.keys()):
 534         table += matcher_map[key] + "\n"
 535     return (
 536         "<!-- START_%(type)s_MATCHERS -->\n"
 537         + "%(table)s"
 538         + "<!--END_%(type)s_MATCHERS -->"
 539     ) % {
 540         "type": matcher_type,
 541         "table": table,
 542     }
 543
 544
 545 # Parse the ast matchers.
 546 # We alternate between two modes:
 547 # body = True: We parse the definition of a matcher. We need
 548 #   to parse the full definition before adding a matcher, as the
 549 #   definition might contain static asserts that specify the result
 550 #   type.
 551 # body = False: We parse the comments and declaration of the matcher.
 552 comment = ""
 553 declaration = ""
 554 allowed_types = []
 555 body = False
 556 for line in open(MATCHERS_FILE).read().splitlines():
 557     if body:
 558         if line.strip() and line[0] == "}":
 559             if declaration:
 560                 act_on_decl(declaration, comment, allowed_types)
 561                 comment = ""
 562                 declaration = ""
 563                 allowed_types = []
 564             body = False
 565         else:
 566             m = re.search(r"is_base_of<([^,]+), NodeType>", line)
 567             if m and m.group(1):
 568                 allowed_types += [m.group(1)]
 569         continue
 570     if line.strip() and line.lstrip()[0] == "/":
 571         comment += re.sub(r"^/+\s?", "", line) + "\n"
 572     else:
 573         declaration += " " + line
 574         if (
 575             (not line.strip())
 576             or line.rstrip()[-1] == ";"
 577             or (line.rstrip()[-1] == "{" and line.rstrip()[-3:] != "= {")
 578         ):
 579             if line.strip() and line.rstrip()[-1] == "{":
 580                 body = True
 581             else:
 582                 act_on_decl(declaration, comment, allowed_types)
 583                 comment = ""
 584                 declaration = ""
 585                 allowed_types = []
 586
 587 node_matcher_table = sort_table("DECL", node_matchers)
 588 narrowing_matcher_table = sort_table("NARROWING", narrowing_matchers)
 589 traversal_matcher_table = sort_table("TRAVERSAL", traversal_matchers)
 590
 591 reference = open("../LibASTMatchersReference.html").read()
 592 reference = re.sub(
 593     r"<!-- START_DECL_MATCHERS.*END_DECL_MATCHERS -->",
 594     node_matcher_table,
 595     reference,
 596     flags=re.S,
 597 )
 598 reference = re.sub(
 599     r"<!-- START_NARROWING_MATCHERS.*END_NARROWING_MATCHERS -->",
 600     narrowing_matcher_table,
 601     reference,
 602     flags=re.S,
 603 )
 604 reference = re.sub(
 605     r"<!-- START_TRAVERSAL_MATCHERS.*END_TRAVERSAL_MATCHERS -->",
 606     traversal_matcher_table,
 607     reference,
 608     flags=re.S,
 609 )
 610
 611 with open("../LibASTMatchersReference.html", "w", newline="\n") as output:
 612     output.write(reference)