tools/lint/fluent-lint/__init__.py

   1 # This Source Code Form is subject to the terms of the Mozilla Public
   2 # License, v. 2.0. If a copy of the MPL was not distributed with this
   3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   4 import bisect
   5 import os
   6 import re
   7 from html.parser import HTMLParser
   8
   9 import mozpack.path as mozpath
  10 import yaml
  11 from fluent.syntax import ast, parse, visitor
  12 from mozlint import result
  13 from mozlint.pathutils import expand_exclusions
  14
  15
  16 class TextElementHTMLParser(HTMLParser):
  17     """HTML Parser for TextElement.
  18
  19     TextElements may contain embedded html tags, which can include
  20     quotes in attributes. We only want to check the actual text.
  21     """
  22
  23     def __init__(self):
  24         super().__init__()
  25         self.extracted_text = []
  26
  27     def handle_data(self, data):
  28         self.extracted_text.append(data)
  29
  30
  31 class Linter(visitor.Visitor):
  32     """Fluent linter implementation.
  33
  34     This subclasses the Fluent AST visitor. Methods are called corresponding
  35     to each type of node in the Fluent AST. It is possible to control
  36     whether a node is recursed into by calling the generic_visit method on
  37     the superclass.
  38
  39     See the documentation here:
  40     https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
  41     """
  42
  43     def __init__(
  44         self, path, config, exclusions, contents, offsets_and_lines, brand_names=[]
  45     ):
  46         super().__init__()
  47         self.path = path
  48         self.config = config
  49         self.exclusions = exclusions
  50         self.contents = contents
  51         self.offsets_and_lines = offsets_and_lines
  52
  53         self.results = []
  54         self.identifier_re = re.compile(r"[a-z0-9-]+")
  55         self.apostrophe_re = re.compile(r"\w'")
  56         self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
  57         self.single_quote_re = re.compile(r"'(.+)'")
  58         self.double_quote_re = re.compile(r"\".+\"")
  59         self.ellipsis_re = re.compile(r"\.\.\.")
  60
  61         self.brand_names = brand_names
  62         self.minimum_id_length = 9
  63
  64         self.state = {
  65             # The resource comment should be at the top of the page after the license.
  66             "node_can_be_resource_comment": True,
  67             # Group comments must be followed by a message. Two group comments are not
  68             # allowed in a row.
  69             "can_have_group_comment": True,
  70             # Comment bound to the current message
  71             "comment": "",
  72             # The current group comment
  73             "group_comment": "",
  74             # Variables in the current message
  75             "variables": [],
  76         }
  77
  78         attributes = [
  79             "label",
  80             "value",
  81             "accesskey",
  82             "alt",
  83             "title",
  84             "tooltiptext",
  85             "placeholder",
  86             "aria-label",
  87             "aria-description",
  88             "aria-valuetext",
  89             "style",
  90             # For XUL key/command setup.
  91             "key",
  92             "keycode",
  93             # For download filenames:
  94             "download",
  95             # Used in the Firefox prefs
  96             "searchkeywords",
  97             # Used by search-textbox.js
  98             "searchbuttonlabel",
  99             # Used in toolbar customization.
 100             "toolbarname",
 101             # Used in moz-message-bar.
 102             "message",
 103             # Used in dialogs (should be moved to using fluent IDs though)
 104             "buttonlabelaccept",
 105             "buttonaccesskeyaccept",
 106             "buttonlabelcancel",
 107             "buttonaccesskeycancel",
 108             "buttonlabelextra2",
 109             "buttonaccesskeyextra2",
 110             # Used in app menu notifications (should be moved to use fluent IDs)
 111             "buttonlabel",
 112             "buttonaccesskey",
 113             "secondarybuttonlabel",
 114             "secondarybuttonaccesskey",
 115             # Commonly used in Lit-based web components
 116             "heading",
 117             "description",
 118         ]
 119         self.known_attribute_list = [a.lower() for a in attributes]
 120
 121         # Set this to true to debug print the root node's json. This is useful for
 122         # writing new lint rules, or debugging existing ones.
 123         self.debug_print_json = False
 124
 125     def generic_visit(self, node):
 126         node_name = type(node).__name__
 127         self.state["node_can_be_resource_comment"] = self.state[
 128             "node_can_be_resource_comment"
 129         ] and (
 130             # This is the root node.
 131             node_name == "Resource"
 132             # Empty space is allowed.
 133             or node_name == "Span"
 134             # Comments are allowed
 135             or node_name == "Comment"
 136         )
 137
 138         if self.debug_print_json:
 139             import json
 140
 141             print(json.dumps(node.to_json(), indent=2))
 142             # Only debug print the root node.
 143             self.debug_print_json = False
 144
 145         super(Linter, self).generic_visit(node)
 146
 147     def visit_Attribute(self, node):
 148         # Only visit values for Attribute nodes, the identifier comes from dom.
 149         super().generic_visit(node.value)
 150
 151     def visit_FunctionReference(self, node):
 152         # We don't recurse into function references, the identifiers there are
 153         # allowed to be free form.
 154         pass
 155
 156     def visit_Message(self, node):
 157         # There must be at least one message or term between group comments.
 158         self.state["can_have_group_comment"] = True
 159         self.last_message_id = node.id.name
 160
 161         super().generic_visit(node)
 162
 163         # Do this here instead as visit_Attribute doesn't have access to the
 164         # message's comment.
 165         for attr in node.attributes:
 166             if not attr.id.name.lower() in self.known_attribute_list:
 167                 comment = self.state["comment"] + self.state["group_comment"]
 168                 if not f".{attr.id.name}" in comment:
 169                     self.add_error(
 170                         attr,
 171                         "VA01",
 172                         "Use attributes designed for localized content directly."
 173                         " If script-based processing is necessary, add a comment"
 174                         f" explaining why. The linter didn't recognize: .{attr.id.name}",
 175                         "warning",
 176                     )
 177
 178         # Check if variables are referenced in comments
 179         if self.state["variables"]:
 180             comments = self.state["comment"] + self.state["group_comment"]
 181             missing_references = [
 182                 v for v in self.state["variables"] if f"${v}" not in comments
 183             ]
 184             if missing_references:
 185                 self.add_error(
 186                     node,
 187                     "VC01",
 188                     "Messages including variables should have a comment "
 189                     "explaining what will replace the variable. "
 190                     "Missing references: "
 191                     + ", ".join([f"${m}" for m in missing_references]),
 192                 )
 193
 194         # Reset current comment and variable references after reading the
 195         # message.
 196         self.state["comment"] = ""
 197         self.state["variables"] = []
 198
 199     def visit_Term(self, node):
 200         # There must be at least one message or term between group comments.
 201         self.state["can_have_group_comment"] = True
 202         self.last_message_id = None
 203
 204         super().generic_visit(node)
 205
 206         # Reset current comment and variable references after reading the term.
 207         self.state["comment"] = ""
 208         self.state["variables"] = []
 209
 210     def visit_MessageReference(self, node):
 211         # We don't recurse into message references, the identifiers are either
 212         # checked elsewhere or are attributes and come from DOM.
 213         pass
 214
 215     def visit_Identifier(self, node):
 216         if (
 217             self.path not in self.exclusions["ID01"]["files"]
 218             and node.name not in self.exclusions["ID01"]["messages"]
 219             and not self.identifier_re.fullmatch(node.name)
 220         ):
 221             self.add_error(
 222                 node,
 223                 "ID01",
 224                 "Identifiers may only contain lowercase characters and -",
 225             )
 226         if (
 227             len(node.name) < self.minimum_id_length
 228             and self.path not in self.exclusions["ID02"]["files"]
 229             and node.name not in self.exclusions["ID02"]["messages"]
 230         ):
 231             self.add_error(
 232                 node,
 233                 "ID02",
 234                 f"Identifiers must be at least {self.minimum_id_length} characters long",
 235             )
 236
 237     def visit_TextElement(self, node):
 238         parser = TextElementHTMLParser()
 239         parser.feed(node.value)
 240         for text in parser.extracted_text:
 241             # To check for apostrophes, first remove pairs of straight quotes
 242             # used as delimiters.
 243             cleaned_str = re.sub(self.single_quote_re, "\1", node.value)
 244             if self.apostrophe_re.search(cleaned_str):
 245                 self.add_error(
 246                     node,
 247                     "TE01",
 248                     "Strings with apostrophes should use foo\u2019s instead of foo's.",
 249                 )
 250             if self.incorrect_apostrophe_re.search(text):
 251                 self.add_error(
 252                     node,
 253                     "TE02",
 254                     "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
 255                 )
 256             if self.single_quote_re.search(text):
 257                 self.add_error(
 258                     node,
 259                     "TE03",
 260                     "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
 261                 )
 262             if self.double_quote_re.search(text):
 263                 self.add_error(
 264                     node,
 265                     "TE04",
 266                     'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
 267                 )
 268             if self.ellipsis_re.search(text):
 269                 self.add_error(
 270                     node,
 271                     "TE05",
 272                     "Strings with an ellipsis should use the Unicode \u2026 character"
 273                     " instead of three periods",
 274                 )
 275
 276             # If part of a message, check for brand names
 277             if (
 278                 self.last_message_id is not None
 279                 and self.path not in self.exclusions["CO01"]["files"]
 280                 and self.last_message_id not in self.exclusions["CO01"]["messages"]
 281             ):
 282                 found_brands = []
 283                 for brand in self.brand_names:
 284                     if brand in text:
 285                         found_brands.append(brand)
 286                 if found_brands:
 287                     self.add_error(
 288                         node,
 289                         "CO01",
 290                         "Strings should use the corresponding terms instead of"
 291                         f" hard-coded brand names ({', '.join(found_brands)})",
 292                     )
 293
 294     def visit_ResourceComment(self, node):
 295         # This node is a comment with: "###"
 296         if not self.state["node_can_be_resource_comment"]:
 297             self.add_error(
 298                 node,
 299                 "RC01",
 300                 "Resource comments (###) should be placed at the top of the file, just "
 301                 "after the license header. There should only be one resource comment "
 302                 "per file.",
 303             )
 304             return
 305
 306         lines_after = get_newlines_count_after(node.span, self.contents)
 307         lines_before = get_newlines_count_before(node.span, self.contents)
 308
 309         if node.span.end == len(self.contents) - 1:
 310             # This file only contains a resource comment.
 311             return
 312
 313         if lines_after != 2:
 314             self.add_error(
 315                 node,
 316                 "RC02",
 317                 "Resource comments (###) should be followed by one empty line.",
 318             )
 319             return
 320
 321         if lines_before != 2:
 322             self.add_error(
 323                 node,
 324                 "RC03",
 325                 "Resource comments (###) should have one empty line above them.",
 326             )
 327             return
 328
 329     def visit_SelectExpression(self, node):
 330         # We only want to visit the variant values, the identifiers in selectors
 331         # and keys are allowed to be free form.
 332         for variant in node.variants:
 333             super().generic_visit(variant.value)
 334
 335         # Store the variable used for the SelectExpression, excluding functions
 336         # like PLATFORM()
 337         if (
 338             type(node.selector) is ast.VariableReference
 339             and node.selector.id.name not in self.state["variables"]
 340         ):
 341             self.state["variables"].append(node.selector.id.name)
 342
 343     def visit_Comment(self, node):
 344         # This node is a comment with: "#"
 345
 346         # Store the comment
 347         self.state["comment"] = node.content
 348
 349     def visit_GroupComment(self, node):
 350         # This node is a comment with: "##"
 351
 352         # Store the group comment
 353         self.state["group_comment"] = node.content
 354
 355         if not self.state["can_have_group_comment"]:
 356             self.add_error(
 357                 node,
 358                 "GC04",
 359                 "Group comments (##) must be followed by at least one message "
 360                 "or term. Make sure that a single group comment with multiple "
 361                 "paragraphs is not separated by whitespace, as it will be "
 362                 "interpreted as two different comments.",
 363             )
 364             return
 365
 366         self.state["can_have_group_comment"] = False
 367
 368         lines_after = get_newlines_count_after(node.span, self.contents)
 369         lines_before = get_newlines_count_before(node.span, self.contents)
 370
 371         if node.span.end == len(self.contents) - 1:
 372             # The group comment is the last thing in the file.
 373
 374             if node.content == "":
 375                 # Empty comments are allowed at the end of the file.
 376                 return
 377
 378             self.add_error(
 379                 node,
 380                 "GC01",
 381                 "Group comments (##) should not be at the end of the file, they should "
 382                 "always be above a message. Only an empty group comment is allowed at "
 383                 "the end of a file.",
 384             )
 385             return
 386
 387         if lines_after != 2:
 388             self.add_error(
 389                 node,
 390                 "GC02",
 391                 "Group comments (##) should be followed by one empty line.",
 392             )
 393             return
 394
 395         if lines_before != 2:
 396             self.add_error(
 397                 node,
 398                 "GC03",
 399                 "Group comments (##) should have an empty line before them.",
 400             )
 401             return
 402
 403     def visit_VariableReference(self, node):
 404         # Identifiers are allowed to be free form, but need to store them
 405         # for comment checks.
 406
 407         if node.id.name not in self.state["variables"]:
 408             self.state["variables"].append(node.id.name)
 409
 410     def add_error(self, node, rule, msg, level=None):
 411         (col, line) = self.span_to_line_and_col(node.span)
 412         res = {
 413             "path": self.path,
 414             "lineno": line,
 415             "column": col,
 416             "rule": rule,
 417             "message": msg,
 418         }
 419         if level:
 420             res["level"] = level
 421
 422         self.results.append(result.from_config(self.config, **res))
 423
 424     def span_to_line_and_col(self, span):
 425         i = bisect.bisect_left(self.offsets_and_lines, (span.start, 0))
 426         if i > 0:
 427             col = span.start - self.offsets_and_lines[i - 1][0]
 428         else:
 429             col = 1 + span.start
 430         return (col, self.offsets_and_lines[i][1])
 431
 432
 433 def get_offsets_and_lines(contents):
 434     """Return a list consisting of tuples of (offset, line).
 435
 436     The Fluent AST contains spans of start and end offsets in the file.
 437     This function returns a list of offsets and line numbers so that errors
 438     can be reported using line and column.
 439     """
 440     line = 1
 441     result = []
 442     for m in re.finditer(r"\n", contents):
 443         result.append((m.start(), line))
 444         line += 1
 445     return result
 446
 447
 448 def get_newlines_count_after(span, contents):
 449     # Determine the number of newlines.
 450     count = 0
 451     for i in range(span.end, len(contents)):
 452         assert contents[i] != "\r", "This linter does not handle \\r characters."
 453         if contents[i] != "\n":
 454             break
 455         count += 1
 456
 457     return count
 458
 459
 460 def get_newlines_count_before(span, contents):
 461     # Determine the range of newline characters.
 462     count = 0
 463     for i in range(span.start - 1, 0, -1):
 464         assert contents[i] != "\r", "This linter does not handle \\r characters."
 465         if contents[i] != "\n":
 466             break
 467         count += 1
 468
 469     return count
 470
 471
 472 def get_exclusions(root):
 473     with open(
 474         mozpath.join(root, "tools", "lint", "fluent-lint", "exclusions.yml")
 475     ) as f:
 476         exclusions = list(yaml.safe_load_all(f))[0]
 477         for error_type in exclusions:
 478             exclusions[error_type]["files"] = set(
 479                 [mozpath.join(root, x) for x in exclusions[error_type]["files"]]
 480             )
 481         return exclusions
 482
 483
 484 def get_branding_list(root, brand_files):
 485     class MessageExtractor(visitor.Visitor):
 486         def __init__(self):
 487             self.brands = []
 488             self.last_message_id = None
 489
 490         def visit_Term(self, node):
 491             self.last_message_id = node.id.name
 492             self.generic_visit(node)
 493
 494         def visit_TextElement(self, node):
 495             if self.last_message_id:
 496                 self.brands += [node.value]
 497                 self.last_message_id = None
 498             self.generic_visit(node)
 499
 500     extractor = MessageExtractor()
 501
 502     for brand_path in brand_files:
 503         brand_file = mozpath.join(root, brand_path)
 504         if os.path.exists(brand_file):
 505             with open(brand_file, encoding="utf-8") as f:
 506                 messages = parse(f.read())
 507                 extractor.visit(messages)
 508
 509     return list(set(extractor.brands))
 510
 511
 512 def lint(paths, config, fix=None, **lintargs):
 513     root = lintargs["root"]
 514     files = list(expand_exclusions(paths, config, root))
 515     exclusions = get_exclusions(root)
 516     brand_files = config.get("brand-files")
 517     brand_names = get_branding_list(root, brand_files)
 518     results = []
 519     for path in files:
 520         contents = open(path, "r", encoding="utf-8").read()
 521         linter = Linter(
 522             path,
 523             config,
 524             exclusions,
 525             contents,
 526             get_offsets_and_lines(contents),
 527             brand_names,
 528         )
 529         linter.visit(parse(contents))
 530         results.extend(linter.results)
 531     return results