2 * Copyright (c) 2017-2020, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
26 #define PCRE2_CODE_UNIT_WIDTH 8
31 #include "unicode-transforms.h"
34 * We need a way to get codepoints out of UTF-8 strings and if
35 * wchar_t stored codepoint values, that would be great. That's
36 * __STDC_ISO_10646__, though. You can remove this check and cross
37 * your fingers, since rb79 will do a quick check on startup, but
38 * please check why the C implementation doesn't define
39 * __STDC_ISO_10646__ first.
41 #ifndef __STDC_ISO_10646__
42 #error We really want __STD_ISO_10646__
46 * A wordfilter consists of a pcre2 regex and a replacement string
51 const char *replacement
;
52 size_t replacement_len
;
56 * A forbidden consists of a pcre2 regex only
62 const char *ban_reason
;
65 /* These are constructed in setup_sanitize_comment() */
66 static struct wordfilter
*wordfilters
;
67 static size_t wordfilters_num
;
68 static struct forbidden
*forbiddens
;
69 static size_t forbiddens_num
;
71 /* Special matcher for quoting, newlines, linkifying, etc. */
72 static pcre2_code
*format_replacements
;
75 * Comparison function for struct translate.
79 * - *key_v is a wchar_t.
81 * - *tr_v is a struct translate object.
85 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
86 * than] *tr_v's starting range.
89 match_translate(const void *key_v
, const void *tr_v
)
91 const wchar_t *key
= key_v
;
92 const struct translate
*tr
= tr_v
;
94 if (*key
< tr
->from_s
) {
96 } else if (*key
> tr
->from_t
) {
104 * Add a UTF-8 sequence str onto *buf
108 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
111 * - str is a valid ASCII (not just UTF-8) string of length str_len.
113 * Postconditions (success):
115 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
118 * - The contents of str have been appended to *buf (and *idx
122 append_str(char **buf
, size_t *idx
, size_t *sz
, const char *str
, size_t str_len
)
124 if (str_len
+ *idx
>= *sz
) {
126 size_t new_sz
= str_len
+ *idx
+ (1 << 9);
128 if (str_len
+ *idx
< str_len
||
129 str_len
+ *idx
+ (1 << 9) < str_len
+ *idx
) {
130 ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
136 if (!(newmem
= realloc(*buf
, new_sz
))) {
137 PERROR_MESSAGE("realloc");
146 strncpy(*buf
+ *idx
, str
, str_len
);
147 *(*buf
+ *idx
+ str_len
) = '\0';
153 /* Dummy function for when I can't be bothered to strlen(). */
155 append_const_str(char **buf
, size_t *idx
, size_t *len
, const char *str
)
157 return append_str(buf
, idx
, len
, str
, strlen(str
));
161 * Add a single character onto *buf
165 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
168 * - c is an ASCII character.
170 * Postconditions (success):
172 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
175 * - c has been appended to *buf (and *idx includes this).
178 append_char(char **buf
, size_t *idx
, size_t *len
, char c
)
180 if (1 + *idx
>= *len
) {
182 size_t new_len
= 1 + *idx
+ (1 << 9);
184 if (*idx
+ 1 < *idx
||
185 *idx
+ 1 + (1 << 9) < *idx
+ 1) {
186 ERROR_MESSAGE("overflow (*idx = %zu)", *idx
);
191 if (!(newmem
= realloc(*buf
, new_len
))) {
192 PERROR_MESSAGE("realloc");
202 *(*buf
+ *idx
+ 1) = '\0';
209 * Add a Unicode codepoint onto *buf
213 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
216 * - wc is a valid Unicode codepoint.
218 * Postconditions (success):
220 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
223 * - An HTML-escaped sequence like &#123; has been appended to
224 * *buf (and *idx includes this).
227 append_wchar_escaped(char **buf
, size_t *idx
, size_t *sz
, wchar_t wc
)
229 size_t l
= snprintf(0, 0, "&#%ld;", (long) wc
);
231 if (l
+ *idx
>= *sz
) {
233 size_t new_sz
= l
+ *idx
+ (1 << 9);
235 if (*idx
+ l
< *idx
||
236 *idx
+ l
+ (1 << 9) < *idx
+ l
) {
237 ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx
,
243 if (!(newmem
= realloc(*buf
, new_sz
))) {
244 PERROR_MESSAGE("realloc");
253 sprintf(*buf
+ *idx
, "&#%ld;", (long) wc
);
260 * Ensure that (*map)[j] = k, fixing up length as appropriate.
264 * - *map is memory of length len.
266 * Postconditions (success):
268 * - *map is memory of length len.
273 set_position_mapping(size_t **map
, size_t *len
, size_t j
, size_t k
)
279 ((j
+ 2) * sizeof **map
) / (j
+ 2) != sizeof **map
) {
280 ERROR_MESSAGE("overflow (j = %zu)", j
);
285 if (!(newmem
= realloc(*map
, (j
+ 2) * sizeof **map
))) {
286 PERROR_MESSAGE("realloc");
293 for (size_t l
= *len
; l
< j
+ 2; ++l
) {
294 (*map
)[l
] = ((size_t) -1);
306 * HTML-escape in to *out.
310 * - in is memory of at least length in_len, valid UTF-8
313 * - *out is memory of at least length *out_len (if *out_len = 0,
314 * *out may be 0), valid UTF-8 text.
316 * - Overwriting *out and *out_len shall not cause a memory leak.
318 * - out, out_len, and out_idx are not 0.
320 * Postconditions (success):
322 * - *out is memory of at least length *out_len, valid UTF-8 text.
324 * - A stretch of HTML-escaped ASCII text representing in has been
325 * added to *out at the position that was *out_idx.
327 * - *out_idx has been updated to point to the end of this stretch.
329 * - If necessary, *out_len has been updated.
332 to_html(const char *in
, const size_t in_len
, size_t in_idx
, char **out
,
333 size_t *out_len
, size_t *out_idx
)
339 size_t initial_out_idx
= *out_idx
;
342 if (!(*out
= malloc(1))) {
343 PERROR_MESSAGE("malloc");
353 * XXX: If you make this multithreaded, be sure to use
356 while (in_idx
< in_len
&&
358 /* Extract next character */
359 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
362 PERROR_MESSAGE("mbtowc");
367 ret
= append_str(out
, out_idx
, &out_sz
, "&", 5);
368 } else if (wc
== L
'"') {
369 ret
= append_str(out
, out_idx
, &out_sz
, """, 6);
370 } else if (wc
== L
'\'') {
371 ret
= append_str(out
, out_idx
, &out_sz
, "'", 6);
372 } else if (wc
== L
'<') {
373 ret
= append_str(out
, out_idx
, &out_sz
, "<", 4);
374 } else if (wc
== L
'>') {
375 ret
= append_str(out
, out_idx
, &out_sz
, ">", 4);
376 } else if (mbret
== 1 &&
379 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
380 } else if (mbret
== 1 &&
381 in
[in_idx
] == '\r') {
383 } else if (mbret
== 1 &&
384 in
[in_idx
] == '\n') {
385 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
387 ret
= append_wchar_escaped(out
, out_idx
, &out_sz
, wc
);
397 *out_len
= *out_len
+ (*out_idx
- initial_out_idx
);
405 * From in construct *out, which is a codepoint-for-codepoint
406 * translation following the rules of unicode-transforms.h. The
407 * result is that *out can be matched with normal regex, even if
408 * in contains obfuscatory Unicode bullshit.
412 * - setup_sanitize_comment() has been invoked more recently than
413 * clean_sanitize_comment().
415 * - in is memory of at least length in_len, valid UTF-8 text.
417 * - Overwriting *out and *out_position_map shall not cause a
420 * - out, out_len, out_position_map, and out_position_map_len are
423 * Postconditions (success):
425 * - *out is valid, UTF-8 text of length *out_len.
427 * - For every j in [0, *out_len) such that (*out)[j] starts a
428 * codepoint, in[*(position_map)[j]] is the start of the
429 * corresponding codepoint.
431 * - (*position_map)[*out_len] = in_len.
434 to_scannable(const char *in
, size_t in_len
, char **out
, size_t *out_len
,
435 size_t **out_position_map
, size_t *out_position_map_len
)
442 struct translate
*tr
= 0;
446 if (!(*out
= malloc(1))) {
447 PERROR_MESSAGE("malloc");
457 * Position_map is here to make wordfiltering work. Suppose in is
459 * I think Nina Purpleton did
462 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
463 * in effect. Then *out will be
465 * I think Nina Purpleton did nothing wrong
467 * The message should, of course, be filtered to
469 * I think worst girl did nothing
472 * In order to do that, it would be necessary to have a map
473 * from in to *out on the byte level, since the wordfilter
474 * will only be run against *out.
476 * position_map[j] = k means that out[j] and in[k] mean the
479 while (in_idx
< in_len
) {
480 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
483 PERROR_MESSAGE("mbtowc");
487 /* We pre-suppose that the insert will go as planned */
488 if (set_position_mapping(out_position_map
, out_position_map_len
,
489 out_idx
, in_idx
) < 0) {
496 if (append_str(out
, &out_idx
, &out_sz
, in
+ in_idx
, 1) <
501 if ((tr
= bsearch(&wc
, translates
, NUM_OF(translates
),
504 if (append_str(out
, &out_idx
, &out_sz
, tr
->to
,
505 strlen(tr
->to
)) < 0) {
509 if (append_str(out
, &out_idx
, &out_sz
, in
+
510 in_idx
, mbret
) < 0) {
519 if (set_position_mapping(out_position_map
, out_position_map_len
,
520 out_idx
, in_len
) < 0) {
524 (*out
)[out_idx
] = '\0';
533 * Read through scannable, checking it against every forbidden
534 * regex. If any match is detected, set *out_is_forbidden to 1.
538 * - setup_sanitize_comment() has been invoked more recently than
539 * clean_sanitize_comment().
541 * - scannable is memory of length at least scannable_len.
543 * - out_is_forbidden, out_ban_duration, out_ban_reason are not 0.
545 * Postconditions (success):
547 * - if any regex specified by the forbidden array matches scannable,
548 * then *out_is_forbidden has been set to 1, with relevant
549 * *out_ban_duration, *out_ban_reason.
552 check_forbidden_filters(const char *scannable
, const size_t scannable_len
,
553 uint_fast8_t *out_is_forbidden
, int *out_ban_duration
,
555 char **out_ban_reason
)
559 /* These hold the match locations from pcre2 */
561 pcre2_match_data
*match_data
= 0;
563 for (size_t j
= 0; j
< forbiddens_num
; ++j
) {
564 if (!(match_data
= pcre2_match_data_create_from_pattern(
565 forbiddens
[j
].code
, 0))) {
566 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
570 num_matches
= pcre2_match(forbiddens
[j
].code
,
571 (PCRE2_SPTR
) scannable
, scannable_len
,
572 0, 0, match_data
, 0);
574 if (num_matches
> 0) {
575 *out_is_forbidden
= 1;
576 *out_ban_duration
= forbiddens
[j
].ban_duration
;
577 *out_ban_reason
= forbiddens
[j
].ban_reason
;
581 pcre2_match_data_free(match_data
);
592 * Read through raw and scannable, checking all wordfilters in
593 * scannable. Where a match is detected, the corresponding position
594 * (via position_map) in raw is replaced by the replacement specified
595 * by the matching wordfilter.
599 * - setup_sanitize_comment() has been invoked more recently than
600 * clean_sanitize_comment().
602 * - raw is memory of length at least raw_len, valid UTF-8 text.
604 * - scannable is memory of length at least scannable_len.
606 * - For any j in [0, scannable_len), position_map[j] is a valid
607 * index into raw, or is (size_t) -1.
609 * - position_map[scannable_len] = raw_len.
611 * - For any j in [0, scannable_len) such that k = position_map[j]
612 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
613 * the same for wordfiltering.
615 * - Overwriting *out shall not cause a memory leak.
617 * - out and out_len are not 0.
619 * Postconditions (success):
621 * - *out is valid, UTF-8 text of length *out_len such that all
622 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
625 * - *out represents raw, except in those sections of scannable
626 * where a wordfilter matched.
629 wordfilter_to_html(const char *raw
, const size_t raw_len
, const char *scannable
,
630 const size_t scannable_len
, size_t *position_map
, char **out
,
635 /* These hold the match locations from pcre2 */
636 uint32_t *ov_counts
= 0;
637 PCRE2_SIZE
**ov_ps
= 0;
638 int *num_matches
= 0;
639 pcre2_match_data
**match_data
= 0;
641 size_t scannable_idx
= 0;
643 size_t best_match_pos
= 0;
644 size_t best_match_idx
= 0;
648 if (!(ov_counts
= calloc(wordfilters_num
, sizeof *ov_counts
))) {
649 PERROR_MESSAGE("calloc");
653 if (!(ov_ps
= calloc(wordfilters_num
, sizeof *ov_ps
))) {
654 PERROR_MESSAGE("calloc");
658 if (!(num_matches
= calloc(wordfilters_num
, sizeof *num_matches
))) {
659 PERROR_MESSAGE("calloc");
663 if (!(match_data
= calloc(wordfilters_num
, sizeof *match_data
))) {
664 PERROR_MESSAGE("calloc");
668 /* First scan, before the loop */
669 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
670 if (!(match_data
[j
] = pcre2_match_data_create_from_pattern(
671 wordfilters
[j
].code
, 0))) {
672 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
676 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
677 (PCRE2_SPTR
) scannable
,
678 scannable_len
, scannable_idx
, 0,
683 best_match_pos
= (size_t) -1;
684 best_match_idx
= (size_t) -1;
686 /* We've run pcre2_match() on everything. Find the soonest match */
687 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
688 if (num_matches
[j
] <= 0) {
692 ov_ps
[j
] = pcre2_get_ovector_pointer(match_data
[j
]);
694 if (ov_ps
[j
][0] >= scannable_idx
&&
695 ov_ps
[j
][0] < best_match_pos
) {
696 best_match_pos
= ov_ps
[j
][0];
701 if (best_match_idx
== (size_t) -1) {
702 /* No matches. Turn the rest to html boring-like */
703 ret
= to_html(raw
, raw_len
, raw_idx
, out
, out_len
, &out_idx
);
707 /* Figure out where in raw this match starts */
710 while (l
!= (size_t) -1 &&
711 position_map
[l
] == (size_t) -1) {
715 if (l
== (size_t) -1) {
716 ERROR_MESSAGE("Impossible condition in "
717 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
724 * Now position_map[l] points to the first character in raw
725 * that should be replaced. Fill up to that point.
727 if (position_map
[l
] &&
728 position_map
[l
] > raw_idx
) {
729 if (to_html(raw
, position_map
[l
], raw_idx
, out
, out_len
,
735 /* Put the substituted text in */
736 if (to_html(wordfilters
[best_match_idx
].replacement
,
737 wordfilters
[best_match_idx
].replacement_len
, 0, out
,
744 * Figure out where we should advance to in inputs. Naively,
745 * we want to set scannable_idx to ov_ps[best_match_idx][1]
746 * (the first character in scannable beyond the match).
747 * However, we have to consider the case of
751 * where "foo" -> "baz" is the only transformation. Since
752 * some characters, like "!", are completely ignored by
753 * the scannable transformation, the naive method would
754 * start our scanning at the "b", skipping information.
756 * So, instead, we carefully find the last character in
757 * "foo", then jump one past it. This (unfortunately)
758 * requires a bit more manual fiddling with wide character
762 if (ov_ps
[best_match_idx
][1] <= scannable_idx
) {
764 * This should never happen, but let's make sure
765 * we always keep advancing.
769 scannable_idx
= ov_ps
[best_match_idx
][1] - 1;
774 while (position_map
[l
] == (size_t) -1) {
778 raw_idx
= position_map
[l
];
780 /* This is the "jump one past it" part */
783 mbret
= mbrlen(raw
+ raw_idx
, MB_CUR_MAX
, 0);
788 PERROR_MESSAGE("mbrlen");
795 * Now re-check all our matches and figure out which ones
798 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
799 if ((num_matches
[j
] <= 0) ||
800 ov_ps
[j
][0] >= scannable_idx
) {
804 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
805 (PCRE2_SPTR
) scannable
,
806 scannable_len
, scannable_idx
, 0,
810 goto handle_next_match
;
813 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
814 pcre2_match_data_free(match_data
[j
]);
827 * Read through in. Each time a match for format_replacements
828 * (something like a newline or a quote) is found, replace
829 * it with some HTML markup. The result is placed in out.
833 * - setup_sanitize_comment() has been invoked more recently than
834 * clean_sanitize_comment().
836 * - in is memory of length at least in_len, valid UTF-8 text.
838 * - Overwriting *out shall not cause a memory leak.
840 * - out and out_len are not 0.
842 * Postconditions (success):
844 * - *out is valid, UTF-8 text of length *out_len with sane HTML
845 * markup (and HTML escaped), suitable for outputting into an
849 insert_html_tags(const char *in
, size_t in_len
, const char *board
, char **out
,
854 size_t match_pos
= 0;
855 size_t after_match_pos
= 0;
857 pcre2_match_data
*match_data
= 0;
859 PCRE2_UCHAR
*tmp_1
= 0;
860 PCRE2_SIZE tmp_1_len
= 0;
861 PCRE2_UCHAR
*tmp_2
= 0;
862 PCRE2_SIZE tmp_2_len
= 0;
863 PCRE2_UCHAR
*tmp_3
= 0;
864 PCRE2_SIZE tmp_3_len
= 0;
865 uint_fast8_t last_was_newline
= 1;
866 char *link_target
= 0;
867 size_t link_target_len
= 0;
869 if (!(match_data
= pcre2_match_data_create_from_pattern(
870 format_replacements
, 0))) {
871 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
877 if (in_idx
>= in_len
) {
881 nret
= pcre2_match(format_replacements
, (PCRE2_SPTR
) in
, in_len
, in_idx
,
884 if (nret
== PCRE2_ERROR_NOMATCH
) {
885 ret
= append_str(out
, &out_idx
, out_len
, in
+ in_idx
, in_len
-
891 PCRE2_UCHAR8 err_buf
[120];
893 pcre2_get_error_message(nret
, err_buf
, 120);
894 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
895 " (PCRE2 %d)", (int) (in_len
- in_idx
), in
+
901 pcre2_substring_free(tmp_1
);
902 pcre2_substring_free(tmp_2
);
903 pcre2_substring_free(tmp_3
);
910 /* We have match, stuff everything up to it in *out */
911 match_pos
= pcre2_get_ovector_pointer(match_data
)[0];
912 after_match_pos
= pcre2_get_ovector_pointer(match_data
)[1];
914 if (match_pos
> in_idx
) {
915 if (append_str(out
, &out_idx
, out_len
, in
+ in_idx
, match_pos
-
920 last_was_newline
= 0;
924 /* Figure out what type of match. */
925 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "newline",
926 &tmp_1
, &tmp_1_len
)) {
927 if (last_was_newline
) {
928 if (append_const_str(out
, &out_idx
, out_len
,
929 " <br />") < 0) {
933 if (append_const_str(out
, &out_idx
, out_len
, "<br />") <
939 last_was_newline
= 1;
940 in_idx
= after_match_pos
;
944 last_was_newline
= 0;
946 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "quote",
947 &tmp_1
, &tmp_1_len
)) {
948 if (append_const_str(out
, &out_idx
, out_len
,
949 "<span class=\"quote\">") < 0) {
953 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
954 (size_t) tmp_1_len
) < 0) {
958 if (append_const_str(out
, &out_idx
, out_len
, "</span>") < 0) {
962 in_idx
= after_match_pos
;
966 if (!pcre2_substring_get_byname(match_data
,
967 (PCRE2_SPTR
) "intra_postlink", &tmp_1
,
969 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "a_num",
970 &tmp_2
, &tmp_2_len
)) {
971 goto problem_with_match
;
976 if (db_construct_post_link(board
, strlen(board
), (const
978 tmp_2_len
, &found
, &link_target
,
979 &link_target_len
) < 0) {
984 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
985 after_match_pos
- match_pos
) < 0) {
989 in_idx
= after_match_pos
;
993 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
998 if (append_str(out
, &out_idx
, out_len
, link_target
,
999 link_target_len
) < 0) {
1003 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
1007 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
1008 (size_t) tmp_1_len
) < 0) {
1012 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
1016 in_idx
= after_match_pos
;
1020 if (!pcre2_substring_get_byname(match_data
,
1021 (PCRE2_SPTR
) "inter_postlink", &tmp_1
,
1023 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "e_num",
1024 &tmp_2
, &tmp_2_len
)) {
1025 goto problem_with_match
;
1028 if (pcre2_substring_get_byname(match_data
,
1029 (PCRE2_SPTR
) "e_board", &tmp_3
,
1031 goto problem_with_match
;
1036 if (db_construct_post_link((const char *) tmp_3
, tmp_3_len
,
1037 (const char *) tmp_2
, tmp_2_len
,
1038 &found
, &link_target
,
1039 &link_target_len
) < 0) {
1044 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
1045 after_match_pos
- match_pos
) < 0) {
1049 in_idx
= after_match_pos
;
1053 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
1058 if (append_str(out
, &out_idx
, out_len
, link_target
,
1059 link_target_len
) < 0) {
1063 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
1067 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
1068 (size_t) tmp_1_len
) < 0) {
1072 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
1076 in_idx
= after_match_pos
;
1082 /* There was some kind of match, but it went wrong. */
1089 pcre2_substring_free(tmp_1
);
1090 pcre2_substring_free(tmp_2
);
1091 pcre2_substring_free(tmp_3
);
1092 pcre2_match_data_free(match_data
);
1098 * Make sure that the contents of *pc are ready for safe injection
1099 * into the board, including HTML escaping, wordfiltering, general
1100 * formatting, and adding links.
1104 * - setup_sanitize_comment() has been invoked more recently than
1105 * clean_sanitize_comment().
1107 * - *pc has been filled out (fields like action, board, etc. have
1108 * been populated) from the POST data.
1110 * Postconditions (success):
1112 * - The prepared_XYZ fields of *pc have been filled out, and each
1113 * is valid ASCII text, with Unicode codepoints.
1116 st_sanitize_text(struct post_cmd
*pc
, int *our_fault
,
1117 uint_fast8_t *is_forbidden
, int *ban_duration
, const
1122 char *html_escaped_comment
= 0;
1123 size_t html_escaped_comment_len
= 0;
1125 /* Flush out lurking double-free bugs */
1126 free(pc
->prepared
.name
);
1127 pc
->prepared
.name
= 0;
1128 pc
->prepared
.name_len
= 0;
1129 free(pc
->prepared
.email
);
1130 pc
->prepared
.email
= 0;
1131 pc
->prepared
.email_len
= 0;
1132 free(pc
->prepared
.subject
);
1133 pc
->prepared
.subject
= 0;
1134 pc
->prepared
.subject_len
= 0;
1135 free(pc
->prepared
.comment
);
1136 pc
->prepared
.comment
= 0;
1137 pc
->prepared
.comment_len
= 0;
1138 free(pc
->prepared
.file_name
);
1139 pc
->prepared
.file_name
= 0;
1140 pc
->prepared
.file_name_len
= 0;
1141 free(pc
->scannable_comment
);
1142 pc
->scannable_comment
= 0;
1143 pc
->scannable_comment_len
= 0;
1144 free(pc
->comment_position_map
);
1145 pc
->comment_position_map
= 0;
1146 pc
->comment_position_map_len
= 0;
1147 free(pc
->scannable_name
);
1148 pc
->scannable_name
= 0;
1149 pc
->scannable_name_len
= 0;
1150 free(pc
->name_position_map
);
1151 pc
->name_position_map
= 0;
1152 pc
->name_position_map_len
= 0;
1153 free(pc
->scannable_email
);
1154 pc
->scannable_email
= 0;
1155 pc
->scannable_email_len
= 0;
1156 free(pc
->email_position_map
);
1157 pc
->email_position_map
= 0;
1158 pc
->email_position_map_len
= 0;
1159 free(pc
->scannable_subject
);
1160 pc
->scannable_subject
= 0;
1161 pc
->scannable_subject_len
= 0;
1162 free(pc
->subject_position_map
);
1163 pc
->subject_position_map
= 0;
1164 pc
->subject_position_map_len
= 0;
1165 free(pc
->scannable_filename
);
1166 pc
->scannable_filename
= 0;
1167 pc
->scannable_filename_len
= 0;
1168 free(pc
->filename_position_map
);
1169 pc
->filename_position_map
= 0;
1170 pc
->filename_position_map_len
= 0;
1173 if (!pc
->raw
.name_len
) {
1176 if (!(pc
->raw
.name
= strdup("Anonymous"))) {
1177 PERROR_MESSAGE("strdup");
1182 pc
->raw
.name_len
= strlen(pc
->raw
.name
);
1185 if (pc
->raw
.name_len
) {
1186 if (to_html(pc
->raw
.name
, pc
->raw
.name_len
, 0,
1187 &pc
->prepared
.name
, &pc
->prepared
.name_len
,
1196 if (pc
->raw
.email_len
) {
1197 if (to_html(pc
->raw
.email
, pc
->raw
.email_len
, 0,
1198 &pc
->prepared
.email
, &pc
->prepared
.email_len
,
1207 if (pc
->raw
.tripcode_len
) {
1208 if (to_html(pc
->raw
.tripcode
, pc
->raw
.tripcode_len
, 0,
1209 &pc
->prepared
.tripcode
, &pc
->prepared
.tripcode_len
,
1219 if (pc
->raw
.subject_len
) {
1220 if (to_html(pc
->raw
.subject
, pc
->raw
.subject_len
, 0,
1221 &pc
->prepared
.subject
, &pc
->prepared
.subject_len
,
1231 if (pc
->raw
.file_name_len
) {
1232 if (to_html(pc
->raw
.file_name
, pc
->raw
.file_name_len
, 0,
1233 &pc
->prepared
.file_name
,
1234 &pc
->prepared
.file_name_len
,
1241 if (to_scannable(pc
->raw
.comment
, pc
->raw
.comment_len
,
1242 &pc
->scannable_comment
, &pc
->scannable_comment_len
,
1243 &pc
->comment_position_map
,
1244 &pc
->comment_position_map_len
)) {
1249 if (to_scannable(pc
->raw
.name
, pc
->raw
.name_len
, &pc
->scannable_name
,
1250 &pc
->scannable_name_len
, &pc
->name_position_map
,
1251 &pc
->name_position_map_len
)) {
1256 if (to_scannable(pc
->raw
.email
, pc
->raw
.email_len
, &pc
->scannable_email
,
1257 &pc
->scannable_email_len
, &pc
->email_position_map
,
1258 &pc
->email_position_map_len
)) {
1263 if (to_scannable(pc
->raw
.subject
, pc
->raw
.subject_len
,
1264 &pc
->scannable_subject
, &pc
->scannable_subject_len
,
1265 &pc
->subject_position_map
,
1266 &pc
->subject_position_map_len
)) {
1271 if (to_scannable(pc
->raw
.file_name
, pc
->raw
.file_name_len
,
1272 &pc
->scannable_filename
, &pc
->scannable_filename_len
,
1273 &pc
->filename_position_map
,
1274 &pc
->filename_position_map_len
)) {
1280 * Are they a spambot?
1282 if (check_forbidden_filters(pc
->scannable_comment
,
1283 pc
->scannable_comment_len
, is_forbidden
,
1284 ban_duration
, ban_reason
) <
1290 if (*is_forbidden
) {
1294 if (check_forbidden_filters(pc
->scannable_name
, pc
->scannable_name_len
,
1295 is_forbidden
, ban_duration
, ban_reason
) <
1301 if (*is_forbidden
) {
1305 if (check_forbidden_filters(pc
->scannable_email
,
1306 pc
->scannable_email_len
, is_forbidden
,
1307 ban_duration
, ban_reason
) < 0) {
1312 if (*is_forbidden
) {
1316 if (check_forbidden_filters(pc
->scannable_subject
,
1317 pc
->scannable_subject_len
, is_forbidden
,
1318 ban_duration
, ban_reason
) <
1324 if (*is_forbidden
) {
1328 if (check_forbidden_filters(pc
->scannable_filename
,
1329 pc
->scannable_filename_len
, is_forbidden
,
1330 ban_duration
, ban_reason
) <
1336 if (*is_forbidden
) {
1342 * Now we do the fancy thing. Match scannable, build prepared
1345 if (wordfilter_to_html(pc
->raw
.comment
, pc
->raw
.comment_len
,
1346 pc
->scannable_comment
, pc
->scannable_comment_len
,
1347 pc
->comment_position_map
, &html_escaped_comment
,
1348 &html_escaped_comment_len
) < 0) {
1354 * Everything's in &#123; form, but now take care of >>123,
1357 if (insert_html_tags(html_escaped_comment
, html_escaped_comment_len
,
1358 pc
->raw
.board
, &pc
->prepared
.comment
,
1359 &pc
->prepared
.comment_len
) < 0) {
1366 free(html_escaped_comment
);
1372 * Initialize any static elements needed for this file.
1376 * - setup_sanitize_comment() was not invoked more recently than
1377 * clean_sanitize_comment().
1379 * Postconditions (success):
1381 * - Any other function in this file may be safely called.
1384 setup_sanitize_comment(const struct configuration
*conf
)
1387 * Check that the locale/libc/whatever is set up so that
1388 * UTF-8 handling can work.
1392 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1393 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1394 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1395 "\u2468\u0294!\u0ce2!!";
1396 const char *correct_html
=
1397 "<script>alert(1)</script> , 𝔑𝔦"
1398 "𝔫𝔞 𝔓𝔲𝔯"
1399 "𝔭𝔩𝔢𝔱𝔬𝔫 &"
1400 " ⑨ʔ!ೢ!!";
1401 const char *correct_scannable
=
1402 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1404 size_t html_len
= 0;
1405 char *scannable
= 0;
1406 size_t scannable_len
= 0;
1407 size_t *position_map
= 0;
1408 size_t position_map_len
= 0;
1411 /* For pcre2_get_error_message */
1413 PCRE2_SIZE err_offset
= 0;
1414 PCRE2_UCHAR8 err_buf
[120];
1416 if (to_html(raw
, strlen(raw
), 0, &html
, &html_len
, &out_idx
) < 0) {
1420 if (strcmp(html
, correct_html
)) {
1421 ERROR_MESSAGE("Was expecting html conversion to yield "
1422 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1423 "\n\n\u00ab%s\u00bb\n\n",
1424 correct_html
, html
);
1428 if (to_scannable(raw
, strlen(raw
), &scannable
, &scannable_len
,
1429 &position_map
, &position_map_len
) < 0) {
1433 if (strcmp(scannable
, correct_scannable
)) {
1434 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1435 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1436 "\n\n\u00ab%s\u00bb\n\n",
1437 correct_scannable
, scannable
);
1441 if (!(wordfilters
= calloc(conf
->wordfilter_inputs_num
,
1442 sizeof *wordfilters
))) {
1443 PERROR_MESSAGE("calloc");
1447 wordfilters_num
= conf
->wordfilter_inputs_num
;
1449 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1450 wordfilters
[j
].replacement
=
1451 conf
->wordfilter_inputs
[j
].replacement
;
1452 wordfilters
[j
].replacement_len
= strlen(
1453 conf
->wordfilter_inputs
[j
].replacement
);
1455 if ((wordfilters
[j
].code
= pcre2_compile(
1456 (PCRE2_SPTR8
) conf
->wordfilter_inputs
[j
].pattern
,
1457 PCRE2_ZERO_TERMINATED
, PCRE2_UTF
, &err_code
,
1462 pcre2_get_error_message(err_code
, err_buf
, 120);
1463 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1464 conf
->wordfilter_inputs
[j
].pattern
, err_buf
);
1468 if (!(forbiddens
= calloc(conf
->forbidden_inputs_num
,
1469 sizeof *forbiddens
))) {
1470 PERROR_MESSAGE("calloc");
1474 forbiddens_num
= conf
->forbidden_inputs_num
;
1476 for (size_t j
= 0; j
< forbiddens_num
; ++j
) {
1477 forbiddens
[j
].ban_duration
=
1478 conf
->forbidden_inputs
[j
].ban_duration
;
1479 forbiddens
[j
].ban_reason
= conf
->forbidden_inputs
[j
].ban_reason
;
1481 if ((forbiddens
[j
].code
= pcre2_compile(
1482 (PCRE2_SPTR8
) conf
->forbidden_inputs
[j
].pattern
,
1483 PCRE2_ZERO_TERMINATED
, PCRE2_UTF
, &err_code
,
1488 pcre2_get_error_message(err_code
, err_buf
, 120);
1489 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1490 conf
->forbidden_inputs
[j
].pattern
, err_buf
);
1494 const char *format_match_str
=
1497 "(?<newline>\\n)" /* */
1498 "|(?<intra_postlink>>>(?<a_num>[0-9]+))" /* */
1499 "|(?<inter_postlink>>>>/" /* */
1500 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1501 "|(?<quote>(?<![^\n])>[^\n]*)"; /* */
1503 if (!(format_replacements
= pcre2_compile(
1504 (PCRE2_SPTR8
) format_match_str
, PCRE2_ZERO_TERMINATED
,
1506 &err_code
, &err_offset
, 0))) {
1507 pcre2_get_error_message(err_code
, err_buf
, 120);
1508 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1509 format_match_str
, err_buf
);
1523 * Clean up any memory from this file
1525 * Postconditions (success):
1527 * - Valgrind won't report any memory leaks from this file.
1529 * - setup_sanitize_comment() can be safely called again.
1532 clean_sanitize_comment(void)
1534 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1535 pcre2_code_free(wordfilters
[j
].code
);
1536 wordfilters
[j
] = (struct wordfilter
) { 0 };
1539 for (size_t j
= 0; j
< forbiddens_num
; ++j
) {
1540 pcre2_code_free(forbiddens
[j
].code
);
1541 forbiddens
[j
] = (struct forbidden
) { 0 };
1544 pcre2_code_free(format_replacements
);
1545 format_replacements
= 0;
1548 wordfilters_num
= 0;