python/samba/tests/source_chars.py

   1 #!/usr/bin/env python3
   2 # Unix SMB/CIFS implementation.
   3 #
   4 # Copyright (C) Catalyst.Net Ltd. 2021
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 import os
  20 import sys
  21
  22 sys.path.insert(0, 'bin/python')
  23 os.environ['PYTHONUNBUFFERED'] = '1'
  24
  25 import subprocess
  26 from collections import Counter
  27 from samba.colour import c_RED, c_GREEN, c_DARK_YELLOW, switch_colour_off
  28 import re
  29 import unicodedata as u
  30 from samba.tests import TestCase, SkipTest
  31
  32 if not sys.stdout.isatty():
  33     switch_colour_off()
  34
  35
  36 def _find_root():
  37     try:
  38         p = subprocess.run(['git', 'rev-parse', '--show-toplevel'],
  39                            stdout=subprocess.PIPE,
  40                            stderr=subprocess.PIPE,
  41                            timeout=10)
  42     except subprocess.CalledProcessError as err:
  43         print(c_RED("Error running git (is this a git tree?): %s" % (err)))
  44
  45         SkipTest("This test is only useful in a git working tree")
  46         sys.exit(0)
  47
  48     if p.returncode != 0:
  49         raise SkipTest("This test is only useful in a git working tree")
  50         sys.exit(0)
  51
  52     root = p.stdout.decode().strip()
  53
  54     should_be_roots = (
  55         os.path.abspath(os.path.join(os.path.dirname(__file__),
  56                                      "../../..")),
  57         os.path.abspath(os.path.join(os.path.dirname(__file__),
  58                                      "../../../..")),
  59     )
  60     if root not in should_be_roots:
  61         print(c_RED("It looks like we have found the wrong git tree!"))
  62         sys.exit(1)
  63     return root
  64
  65
  66 ROOT = None
  67
  68 IGNORED_FILES = (
  69     'source3/selftest/ktest-krb5_ccache-2',
  70     'source3/selftest/ktest-krb5_ccache-3',
  71     'testdata/source-chars-bad.c',
  72 )
  73
  74 IGNORED_RE = (
  75     r'^third_party/heimdal/lib/hcrypto/passwd_dialog',
  76     r'^third_party/heimdal/lib/hx509/data/',
  77     r'^third_party/heimdal/po',
  78     r'^third_party/heimdal/tests/kdc/hdb-mitdb',
  79     r'^testdata/compression/',
  80     r'^third_party/heimdal/lib/asn1/fuzz-inputs/',
  81 )
  82
  83 IGNORED_EXTENSIONS = {
  84     'bmp',
  85     'cer',
  86     'corrupt',
  87     'crl',
  88     'crt',
  89     'dat',
  90     'der',
  91     'dump',
  92     'gpg',
  93     'gz',
  94     'ico',
  95     'keytab',
  96     'ldb',
  97     'p12',
  98     'pdf',
  99     'pem',
 100     'png',
 101     'SAMBABACKUP',
 102     'sxd',
 103     'tdb',
 104     'tif',
 105     'reg',
 106     'req'
 107 }
 108
 109
 110 # This list is by no means exhaustive -- these are just the format
 111 # characters we actually use.
 112 SAFE_FORMAT_CHARS = {
 113     '\u200b',
 114     '\ufeff'
 115 }
 116
 117 # These files legitimately mix left-to-right and right-to-left text.
 118 # In the real world mixing directions would be normal in bilingual
 119 # documents, but it is rare in Samba source code.
 120 BIDI_FILES = {
 121     'third_party/heimdal/lib/base/test_base.c',
 122     'third_party/heimdal/lib/wind/NormalizationTest.txt',
 123     'testdata/source-chars-bidi.py',
 124 }
 125
 126
 127 def get_git_files():
 128     try:
 129         p = subprocess.run(['git',
 130                             '-C', ROOT,
 131                             'ls-files',
 132                             '-z'],
 133                            stdout=subprocess.PIPE,
 134                            stderr=subprocess.PIPE,
 135                            timeout=10)
 136     except subprocess.SubprocessError as e:
 137         print(c_RED(f"Error running git (is this a git tree?): {e}"))
 138         print("This test is only useful in a git working tree")
 139         return []
 140
 141     filenames = p.stdout.split(b'\x00')
 142     return [x.decode() for x in filenames[:-1]]
 143
 144
 145 def iter_source_files():
 146     filenames = get_git_files()
 147
 148     for name in filenames:
 149         ignore = False
 150         if name in IGNORED_FILES:
 151             print(c_DARK_YELLOW(f"ignoring (exact) {name}"))
 152             continue
 153
 154         for ignored in IGNORED_RE:
 155             ignore = (re.match(ignored, name))
 156             if ignore:
 157                 break
 158
 159         if ignore:
 160             print(c_DARK_YELLOW(f"ignoring (via RE) {name}"))
 161             continue
 162
 163         if '.' in name:
 164             ext = name.rsplit('.', 1)[1]
 165             if ext in IGNORED_EXTENSIONS:
 166                 print(c_DARK_YELLOW(f"ignoring {name}"))
 167                 continue
 168
 169         yield name
 170
 171
 172 def is_latin1_file(name):
 173     for pattern in (
 174             r'^source4/setup/ad-schema/\w+.ldf$',
 175             r'^source4/setup/display-specifiers/D[\w-]+.txt$',
 176             r'^third_party/heimdal/cf/pkg.m4$',
 177             r'^third_party/heimdal/doc/standardisation/',
 178     ):
 179         if re.match(pattern, name):
 180             return True
 181     return False
 182
 183
 184 def is_bad_latin1_file(fullname):
 185     # In practice, the few latin-1 files we have have single non-ASCII
 186     # byte islands in a sea of ASCII. The utf-8 sequences we are
 187     # concerned about involve sequences of 3 high bytes. We can say a
 188     # file is safe latin-1 if it has only individual high bytes.
 189     with open(fullname, 'rb') as f:
 190         b = f.read()
 191     in_seq = False
 192     for c in b:
 193         if c > 0x7f:
 194             if in_seq:
 195                 return True
 196             in_seq = True
 197         else:
 198             in_seq = False
 199     return False
 200
 201
 202 def is_bad_char(c):
 203     if u.category(c) != 'Cf':
 204         return False
 205     if c in SAFE_FORMAT_CHARS:
 206         return False
 207     return True
 208
 209
 210 class CharacterTests(TestCase):
 211     def setUp(self):
 212         global ROOT
 213         if not ROOT:
 214             ROOT = _find_root()
 215
 216     def test_no_unexpected_format_chars(self):
 217         """This test tries to ensure that no source file has unicode control
 218         characters that can change the apparent order of other
 219         characters. These characters could make code appear to have
 220         different semantic meaning it really does.
 221
 222         This issue is sometimes called "Trojan Source", "CVE-2021-42574",
 223         or "CVE-2021-42694".
 224         """
 225         for name in iter_source_files():
 226             fullname = os.path.join(ROOT, name)
 227             try:
 228                 with open(fullname) as f:
 229                     s = f.read()
 230             except UnicodeDecodeError as e:
 231                 # probably a latin-1 encoding, which we tolerate in a few
 232                 # files for historical reasons, though we check that there
 233                 # are not long sequences of high bytes.
 234                 if is_latin1_file(name):
 235                     if is_bad_latin1_file(fullname):
 236                         self.fail(f"latin-1 file {name} has long sequences "
 237                                   "of high bytes")
 238                 else:
 239                     self.fail(f"could not decode {name}: {e}")
 240
 241             dirs = set()
 242             for c in set(s):
 243                 if is_bad_char(c):
 244                     self.fail(f"{name} has potentially bad format character {ord(c[0])}!")
 245                 dirs.add(u.bidirectional(c))
 246
 247             if 'L' in dirs and 'R' in dirs:
 248                 if name not in BIDI_FILES:
 249                     self.fail(f"{name} has LTR and RTL text ({dirs})")
 250
 251     def test_unexpected_format_chars_do_fail(self):
 252         """Test the test"""
 253         for name, n_bad in [
 254                 ('testdata/source-chars-bad.c', 3)
 255         ]:
 256             fullname = os.path.join(ROOT, name)
 257             with open(fullname) as f:
 258                 s = f.read()
 259             chars = set(s)
 260             bad_chars = [c for c in chars if is_bad_char(c)]
 261             self.assertEqual(len(bad_chars), n_bad)
 262
 263     def test_unexpected_bidi_fails(self):
 264         """Test the test"""
 265         for name in [
 266                 'testdata/source-chars-bidi.py'
 267         ]:
 268             fullname = os.path.join(ROOT, name)
 269             with open(fullname) as f:
 270                 s = f.read()
 271
 272             dirs = set()
 273             for c in set(s):
 274                 dirs.add(u.bidirectional(c))
 275             self.assertIn('L', dirs)
 276             self.assertIn('R', dirs)
 277
 278
 279 def check_file_text():
 280     """If called directly as a script, count the found characters."""
 281     global ROOT
 282     if not ROOT:
 283         ROOT = _find_root()
 284
 285     counts = Counter()
 286     for name in iter_source_files():
 287         fullname = os.path.join(ROOT, name)
 288         try:
 289             with open(fullname) as f:
 290                 s = f.read()
 291         except UnicodeDecodeError as e:
 292             if is_latin1_file(name):
 293                 if is_bad_latin1_file(fullname):
 294                     print(c_RED(f"latin-1 file {name} has long sequences "
 295                                 "of high bytes"))
 296                 else:
 297                     print(c_GREEN(f"latin-1 file {name} is fine"))
 298             else:
 299                 print(c_RED(f"can't read {name}: {e}"))
 300
 301         counts.update(s)
 302         chars = set(s)
 303         for c in chars:
 304             if u.category(c) == 'Cf':
 305                 print(c_GREEN(f"{name} has {u.name(c)}"))
 306
 307     print(len(counts))
 308     controls = []
 309     formats = []
 310     others = []
 311     for x in counts:
 312         c = u.category(x)
 313         if c == 'Cc':
 314             controls.append(x)
 315         elif c == 'Cf':
 316             formats.append(x)
 317         elif c[0] == 'C':
 318             others.append(x)
 319
 320     print(f"normal control characters {controls}")
 321     print(f"format characters {formats}")
 322     print(f"other control characters {others}")
 323
 324
# When this file is run directly as a script, print a report of the
# characters found in the tracked files.
if __name__ == '__main__':
    check_file_text()