ctdb-scripts: Improve update and listing code
[samba4-gss.git] / python / samba / tests / source_chars.py
blobdea7fcc98192161abf38618c823679efb1000779
1 #!/usr/bin/env python3
2 # Unix SMB/CIFS implementation.
4 # Copyright (C) Catalyst.Net Ltd. 2021
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 import os
20 import sys
22 sys.path.insert(0, 'bin/python')
23 os.environ['PYTHONUNBUFFERED'] = '1'
25 import subprocess
26 from collections import Counter
27 from samba.colour import c_RED, c_GREEN, c_DARK_YELLOW, switch_colour_off
28 import re
29 import unicodedata as u
30 from samba.tests import TestCase, SkipTest
32 if not sys.stdout.isatty():
33 switch_colour_off()
def _find_root():
    """Return the top-level directory of the enclosing git working tree.

    Runs ``git rev-parse --show-toplevel`` and checks that the answer is
    one of the directories we expect to be the root, given where this
    file lives.

    Raises:
        SkipTest: if git cannot be run (missing binary, timeout) or
            reports that we are not inside a git working tree.

    Exits the process with status 1 if git reports a tree that is not
    one of the expected ancestors of this file.
    """
    try:
        p = subprocess.run(['git', 'rev-parse', '--show-toplevel'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (subprocess.SubprocessError, OSError) as err:
        # subprocess.run() without check=True never raises
        # CalledProcessError. What can go wrong here is a timeout
        # (TimeoutExpired, a SubprocessError) or git not being
        # installed (FileNotFoundError, an OSError).
        print(c_RED("Error running git (is this a git tree?): %s" % (err)))
        raise SkipTest("This test is only useful in a git working tree")

    if p.returncode != 0:
        raise SkipTest("This test is only useful in a git working tree")

    root = p.stdout.decode().strip()

    # The root must be three or four directory levels above the
    # directory holding this file; anything else means git has found
    # some unrelated enclosing repository.
    here = os.path.dirname(__file__)
    should_be_roots = (
        os.path.abspath(os.path.join(here, "../../..")),
        os.path.abspath(os.path.join(here, "../../../..")),
    )
    if root not in should_be_roots:
        print(c_RED("It looks like we have found the wrong git tree!"))
        sys.exit(1)
    return root
# Top of the git working tree; filled in lazily by _find_root().
ROOT = None

# Paths (relative to ROOT) that are skipped by exact name.
IGNORED_FILES = (
    'source3/selftest/ktest-krb5_ccache-2',
    'source3/selftest/ktest-krb5_ccache-3',
    'testdata/source-chars-bad.c',
)
# Regular expressions, matched against the start of each path relative
# to ROOT; any file matching one of them is skipped.
IGNORED_RE = (
    r'^third_party/heimdal/lib/hcrypto/passwd_dialog',
    r'^third_party/heimdal/lib/hx509/data/',
    r'^third_party/heimdal/po',
    r'^third_party/heimdal/tests/kdc/hdb-mitdb',
    r'^testdata/compression/',
    r'^third_party/heimdal/lib/asn1/fuzz-inputs/',
)
# Files whose name ends in '.' plus one of these suffixes are skipped
# without being read.
IGNORED_EXTENSIONS = {
    'bmp',
    'cer',
    'corrupt',
    'crl',
    'crt',
    'dat',
    'der',
    'dump',
    'gpg',
    'gz',
    'ico',
    'keytab',
    'ldb',
    'p12',
    'pdf',
    'pem',
    'png',
    'reg',
    'req',
    'SAMBABACKUP',
    'sxd',
    'tdb',
    'tif',
}
# Format (category 'Cf') characters that are tolerated. The list is by
# no means exhaustive -- it only holds the format characters we
# actually use.
SAFE_FORMAT_CHARS = {
    '\u200b',
    '\ufeff',
}
# Files that are allowed to mix left-to-right and right-to-left text.
# Mixed directions are normal in real-world bilingual documents, but
# they are rare in Samba source code, so every such file is listed.
BIDI_FILES = {
    'third_party/heimdal/lib/base/test_base.c',
    'third_party/heimdal/lib/wind/NormalizationTest.txt',
    'testdata/source-chars-bidi.py',
}
def get_git_files():
    """Return the names of all files tracked by git under ROOT.

    The names are decoded strings, relative to ROOT, as reported by
    ``git ls-files -z``.

    Returns an empty list (after printing a message) if git cannot be
    run, times out, or exits with a non-zero status.
    """
    try:
        p = subprocess.run(['git',
                            '-C', ROOT,
                            'ls-files',
                            '-z'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing git binary (FileNotFoundError), which
        # is not a SubprocessError and would otherwise escape.
        print(c_RED(f"Error running git (is this a git tree?): {e}"))
        print("This test is only useful in a git working tree")
        return []

    if p.returncode != 0:
        # Do not trust the output of a failed listing.
        details = p.stderr.decode(errors='replace').strip()
        print(c_RED(f"git ls-files failed ({p.returncode}): {details}"))
        print("This test is only useful in a git working tree")
        return []

    # With -z every name is NUL-terminated, so splitting leaves one
    # empty trailing element; dropping empty pieces handles that and
    # the no-output case alike.
    return [x.decode() for x in p.stdout.split(b'\x00') if x]
def iter_source_files():
    """Yield the git-tracked file names that should be checked.

    A name is dropped (with a note printed) when it is listed in
    IGNORED_FILES, matches one of the IGNORED_RE patterns, or ends in
    a suffix from IGNORED_EXTENSIONS.
    """
    for path in get_git_files():
        if path in IGNORED_FILES:
            print(c_DARK_YELLOW(f"ignoring (exact) {path}"))
            continue

        if any(re.match(pattern, path) for pattern in IGNORED_RE):
            print(c_DARK_YELLOW(f"ignoring (via RE) {path}"))
            continue

        # The suffix is whatever follows the last '.' anywhere in the
        # path; `dot` is empty when the path contains no '.' at all.
        _, dot, suffix = path.rpartition('.')
        if dot and suffix in IGNORED_EXTENSIONS:
            print(c_DARK_YELLOW(f"ignoring {path}"))
            continue

        yield path
def is_latin1_file(name):
    """Return True if *name* is a file we tolerate being latin-1 encoded.

    *name* is a path relative to the top of the tree. The dots that
    introduce file extensions are escaped so that they match only a
    literal '.', keeping this whitelist as narrow as intended.
    """
    patterns = (
        r'^source4/setup/ad-schema/\w+\.ldf$',
        r'^source4/setup/display-specifiers/D[\w-]+\.txt$',
        r'^third_party/heimdal/cf/pkg\.m4$',
        r'^third_party/heimdal/doc/standardisation/',
    )
    return any(re.match(pattern, name) for pattern in patterns)
def is_bad_latin1_file(fullname):
    """Return True if the file has two or more adjacent non-ASCII bytes.

    The few latin-1 files we carry have isolated non-ASCII bytes in a
    sea of ASCII, whereas the utf-8 sequences we worry about are runs
    of several high bytes. A file is therefore considered safe latin-1
    only if every high byte stands alone.
    """
    with open(fullname, 'rb') as f:
        data = f.read()

    previous_was_high = False
    for byte in data:
        is_high = byte > 0x7f
        if is_high and previous_was_high:
            return True
        previous_was_high = is_high
    return False
def is_bad_char(c):
    """Return True if *c* is a format character that is not whitelisted.

    That is, its Unicode general category is 'Cf' and it is not in
    SAFE_FORMAT_CHARS.
    """
    return u.category(c) == 'Cf' and c not in SAFE_FORMAT_CHARS
class CharacterTests(TestCase):
    """Tests that tracked source files contain no deceptive characters."""

    def setUp(self):
        # Find the git tree once and cache it in the module-level ROOT,
        # which get_git_files() also reads.
        global ROOT
        if not ROOT:
            ROOT = _find_root()

    def test_no_unexpected_format_chars(self):
        """This test tries to ensure that no source file has unicode control
        characters that can change the apparent order of other
        characters. These characters could make code appear to have
        different semantic meaning than it really does.

        This issue is sometimes called "Trojan Source", "CVE-2021-42574",
        or "CVE-2021-42694".
        """
        for name in iter_source_files():
            fullname = os.path.join(ROOT, name)
            try:
                # Decode as utf-8 explicitly so the result does not
                # depend on the locale the test happens to run under.
                with open(fullname, encoding='utf-8') as f:
                    s = f.read()
            except UnicodeDecodeError as e:
                # probably a latin-1 encoding, which we tolerate in a few
                # files for historical reasons, though we check that there
                # are not long sequences of high bytes.
                if is_latin1_file(name):
                    if is_bad_latin1_file(fullname):
                        self.fail(f"latin-1 file {name} has long sequences "
                                  "of high bytes")
                else:
                    self.fail(f"could not decode {name}: {e}")
                # There is no decoded text for this file. Without this
                # we would go on to examine `s` left over from the
                # previous file (or hit a NameError on the first one).
                continue

            dirs = set()
            for c in set(s):
                if is_bad_char(c):
                    self.fail(f"{name} has potentially bad format character {ord(c[0])}!")
                dirs.add(u.bidirectional(c))

            # Strong left-to-right and strong right-to-left characters
            # in one file are only accepted for the listed files.
            if 'L' in dirs and 'R' in dirs:
                if name not in BIDI_FILES:
                    self.fail(f"{name} has LTR and RTL text ({dirs})")

    def test_unexpected_format_chars_do_fail(self):
        """Test the test: the known-bad sample must trip is_bad_char()."""
        for name, n_bad in [
                ('testdata/source-chars-bad.c', 3)
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
            chars = set(s)
            bad_chars = [c for c in chars if is_bad_char(c)]
            self.assertEqual(len(bad_chars), n_bad)

    def test_unexpected_bidi_fails(self):
        """Test the test: the bidi sample must contain both directions."""
        for name in [
                'testdata/source-chars-bidi.py'
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()

            dirs = set()
            for c in set(s):
                dirs.add(u.bidirectional(c))
            self.assertIn('L', dirs)
            self.assertIn('R', dirs)
def check_file_text():
    """If called directly as a script, count the found characters.

    Reads every file yielded by iter_source_files(), reports files that
    are not valid utf-8 and files containing format ('Cf') characters,
    then prints the number of distinct characters seen and the control,
    format and other 'C*' category characters among them.
    """
    global ROOT
    if not ROOT:
        ROOT = _find_root()

    counts = Counter()
    for name in iter_source_files():
        fullname = os.path.join(ROOT, name)
        try:
            # Decode as utf-8 explicitly rather than relying on the
            # locale's preferred encoding.
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
        except UnicodeDecodeError as e:
            if is_latin1_file(name):
                if is_bad_latin1_file(fullname):
                    print(c_RED(f"latin-1 file {name} has long sequences "
                                "of high bytes"))
                else:
                    print(c_GREEN(f"latin-1 file {name} is fine"))
            else:
                print(c_RED(f"can't read {name}: {e}"))
            # Nothing was decoded for this file; do not count the
            # previous file's text again (or fail with a NameError if
            # this is the first file).
            continue

        counts.update(s)
        chars = set(s)
        for c in chars:
            if u.category(c) == 'Cf':
                print(c_GREEN(f"{name} has {u.name(c)}"))

    print(len(counts))
    controls = []
    formats = []
    others = []
    for x in counts:
        c = u.category(x)
        if c == 'Cc':
            controls.append(x)
        elif c == 'Cf':
            formats.append(x)
        elif c[0] == 'C':
            others.append(x)

    print(f"normal control characters {controls}")
    print(f"format characters {formats}")
    print(f"other control characters {others}")


if __name__ == '__main__':
    check_file_text()