1 /*-------------------------------------------------------------------------
3 * SASLprep normalization, for SCRAM authentication
5 * The SASLprep algorithm is used to process a user-supplied password into
6 * canonical form. For more details, see:
8 * [RFC3454] Preparation of Internationalized Strings ("stringprep"),
9 * http://www.ietf.org/rfc/rfc3454.txt
11 * [RFC4013] SASLprep: Stringprep Profile for User Names and Passwords
12 * http://www.ietf.org/rfc/rfc4013.txt
15 * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group
18 * src/common/saslprep.c
20 *-------------------------------------------------------------------------
25 #include "postgres_fe.h"
28 #include "common/saslprep.h"
29 #include "common/string.h"
30 #include "common/unicode_norm.h"
31 #include "mb/pg_wchar.h"
34 * In backend, we will use palloc/pfree. In frontend, use malloc, and
35 * return SASLPREP_OOM on out-of-memory.
38 #define STRDUP(s) pstrdup(s)
39 #define ALLOC(size) palloc(size)
40 #define FREE(size) pfree(size)
42 #define STRDUP(s) strdup(s)
43 #define ALLOC(size) malloc(size)
44 #define FREE(size) free(size)
47 /* Prototypes for local functions */
48 static int codepoint_range_cmp(const void *a
, const void *b
);
49 static bool is_code_in_table(pg_wchar code
, const pg_wchar
*map
, int mapsize
);
50 static int pg_utf8_string_len(const char *source
);
53 * Stringprep Mapping Tables.
55 * The stringprep specification includes a number of tables of Unicode
56 * codepoints, used in different parts of the algorithm. They are below,
57 * as arrays of codepoint ranges. Each range is a pair of codepoints,
58 * for the first and last codepoint included in the range (inclusive!).
62 * C.1.2 Non-ASCII space characters
64 * These are all mapped to the ASCII space character (U+0020).
66 static const pg_wchar non_ascii_space_ranges
[] =
77 * B.1 Commonly mapped to nothing
79 * If any of these appear in the input, they are removed.
81 static const pg_wchar commonly_mapped_to_nothing_ranges
[] =
94 * prohibited_output_ranges is a union of all the characters from
95 * the following tables:
97 * C.1.2 Non-ASCII space characters
98 * C.2.1 ASCII control characters
99 * C.2.2 Non-ASCII control characters
100 * C.3 Private Use characters
101 * C.4 Non-character code points
102 * C.5 Surrogate code points
103 * C.6 Inappropriate for plain text characters
104 * C.7 Inappropriate for canonical representation characters
105 * C.8 Change display properties or deprecated characters
106 * C.9 Tagging characters
108 * These are the tables that are listed as "prohibited output"
109 * characters in the SASLprep profile.
111 * The comment after each code range indicates which source table
112 * the code came from. Note that there is some overlap in the source
113 * tables, so one code might originate from multiple source tables.
114 * Adjacent ranges have also been merged together, to save space.
116 static const pg_wchar prohibited_output_ranges
[] =
118 0x0000, 0x001F, /* C.2.1 */
119 0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
120 0x0340, 0x0341, /* C.8 */
121 0x06DD, 0x06DD, /* C.2.2 */
122 0x070F, 0x070F, /* C.2.2 */
123 0x1680, 0x1680, /* C.1.2 */
124 0x180E, 0x180E, /* C.2.2 */
125 0x2000, 0x200F, /* C.1.2, C.2.2, C.8 */
126 0x2028, 0x202F, /* C.1.2, C.2.2, C.8 */
127 0x205F, 0x2063, /* C.1.2, C.2.2 */
128 0x206A, 0x206F, /* C.2.2, C.8 */
129 0x2FF0, 0x2FFB, /* C.7 */
130 0x3000, 0x3000, /* C.1.2 */
131 0xD800, 0xF8FF, /* C.3, C.5 */
132 0xFDD0, 0xFDEF, /* C.4 */
133 0xFEFF, 0xFEFF, /* C.2.2 */
134 0xFFF9, 0xFFFF, /* C.2.2, C.4, C.6 */
135 0x1D173, 0x1D17A, /* C.2.2 */
136 0x1FFFE, 0x1FFFF, /* C.4 */
137 0x2FFFE, 0x2FFFF, /* C.4 */
138 0x3FFFE, 0x3FFFF, /* C.4 */
139 0x4FFFE, 0x4FFFF, /* C.4 */
140 0x5FFFE, 0x5FFFF, /* C.4 */
141 0x6FFFE, 0x6FFFF, /* C.4 */
142 0x7FFFE, 0x7FFFF, /* C.4 */
143 0x8FFFE, 0x8FFFF, /* C.4 */
144 0x9FFFE, 0x9FFFF, /* C.4 */
145 0xAFFFE, 0xAFFFF, /* C.4 */
146 0xBFFFE, 0xBFFFF, /* C.4 */
147 0xCFFFE, 0xCFFFF, /* C.4 */
148 0xDFFFE, 0xDFFFF, /* C.4 */
149 0xE0001, 0xE0001, /* C.9 */
150 0xE0020, 0xE007F, /* C.9 */
151 0xEFFFE, 0xEFFFF, /* C.4 */
152 0xF0000, 0xFFFFF, /* C.3, C.4 */
153 0x100000, 0x10FFFF /* C.3, C.4 */
156 /* A.1 Unassigned code points in Unicode 3.2 */
157 static const pg_wchar unassigned_codepoint_ranges
[] =
557 /* D.1 Characters with bidirectional property "R" or "AL" */
558 static const pg_wchar RandALCat_codepoint_ranges
[] =
596 /* D.2 Characters with bidirectional property "L" */
597 static const pg_wchar LCat_codepoint_ranges
[] =
961 /* End of stringprep tables */
964 /* Is the given Unicode codepoint in the given table of ranges? */
965 #define IS_CODE_IN_TABLE(code, map) is_code_in_table(code, map, lengthof(map))
968 codepoint_range_cmp(const void *a
, const void *b
)
970 const pg_wchar
*key
= (const pg_wchar
*) a
;
971 const pg_wchar
*range
= (const pg_wchar
*) b
;
974 return -1; /* less than lower bound */
976 return 1; /* greater than upper bound */
978 return 0; /* within range */
982 is_code_in_table(pg_wchar code
, const pg_wchar
*map
, int mapsize
)
984 Assert(mapsize
% 2 == 0);
986 if (code
< map
[0] || code
> map
[mapsize
- 1])
989 if (bsearch(&code
, map
, mapsize
/ 2, sizeof(pg_wchar
) * 2,
990 codepoint_range_cmp
))
/*
 * Calculate the length in characters of a null-terminated UTF-8 string.
 *
 * Returns -1 if the input is not valid UTF-8.  That includes a multibyte
 * sequence that is cut short by the string terminator.
 */
static int
pg_utf8_string_len(const char *source)
{
	const unsigned char *p = (const unsigned char *) source;
	int			num_chars = 0;

	while (*p)
	{
		int			l = pg_utf_mblen(p);
		int			k;

		/*
		 * The lead byte only announces how long the sequence is supposed to
		 * be, and pg_utf8_islegal() is handed just that length, not the end
		 * of the string.  Verify that the announced bytes really exist
		 * before the terminator, so that validation can never look at
		 * memory past the end of the input.  A NUL is not a valid
		 * continuation byte, so such a sequence is invalid in any case.
		 */
		for (k = 1; k < l; k++)
		{
			if (p[k] == '\0')
				return -1;
		}

		if (!pg_utf8_islegal(p, l))
			return -1;

		p += l;
		num_chars++;
	}

	return num_chars;
}
1024 * pg_saslprep - Normalize a password with SASLprep.
1026 * SASLprep requires the input to be in UTF-8 encoding, but PostgreSQL
1027 * supports many encodings, so we don't blindly assume that. pg_saslprep
1028 * will check if the input looks like valid UTF-8, and returns
1029 * SASLPREP_INVALID_UTF8 if not.
1031 * If the string contains prohibited characters (or more precisely, if the
1032 * output string would contain prohibited characters after normalization),
1033 * returns SASLPREP_PROHIBITED.
1035 * On success, returns SASLPREP_SUCCESS, and the normalized string in *output.
1038 * In frontend, the normalized string is malloc'd, and the caller is
1039 * responsible for freeing it. If an allocation fails, returns
1040 * SASLPREP_OOM. In backend, the normalized string is palloc'd instead,
1041 * and a failed allocation leads to ereport(ERROR).
/*
 * pg_saslprep: apply the stringprep steps (map, normalize, prohibit, bidi
 * check) to a password and hand back the result as a new UTF-8 string.
 *
 * input:  NUL-terminated string.  Pure-ASCII input is duplicated as is; any
 *         other input is first validated as UTF-8.
 * output: receives the newly allocated result (STRDUP/ALLOC) on success.
 *
 * Return values seen in this function: SASLPREP_SUCCESS,
 * SASLPREP_INVALID_UTF8 (input failed UTF-8 validation), SASLPREP_PROHIBITED
 * (empty after mapping, prohibited or unassigned codepoint, or bidi rule
 * violated) and SASLPREP_OOM.
 *
 * NOTE(review): several statements of this function (local declarations,
 * allocation-failure checks, exit labels and cleanup) are not visible in
 * this listing; the notes below are limited to what is visible.
 */
1044 pg_saslprep(const char *input
, char **output
)
1046 pg_wchar
*input_chars
= NULL
;
1047 pg_wchar
*output_chars
= NULL
;
1053 bool contains_RandALCat
;
1057 /* Ensure we return *output as NULL on failure */
1061 * Quick check if the input is pure ASCII. An ASCII string requires no
1062 * further processing.
1064 if (pg_is_ascii(input
))
1066 *output
= STRDUP(input
);
1069 return SASLPREP_SUCCESS
;
1073 * Convert the input from UTF-8 to an array of Unicode codepoints.
1075 * This also checks that the input is a legal UTF-8 string.
1077 input_size
= pg_utf8_string_len(input
);
1079 return SASLPREP_INVALID_UTF8
;
1081 input_chars
= ALLOC((input_size
+ 1) * sizeof(pg_wchar
));
1085 p
= (unsigned char *) input
;
1086 for (i
= 0; i
< input_size
; i
++)
1088 input_chars
[i
] = utf8_to_unicode(p
);
1089 p
+= pg_utf_mblen(p
);
1091 input_chars
[i
] = (pg_wchar
) '\0';
1094 * The steps below correspond to the steps listed in [RFC3454], Section
1095 * "2. Preparation Overview"
1099 * 1) Map -- For each character in the input, check if it has a mapping
1100 * and, if so, replace it with its mapping.
/*
 * The mapping is done in place: i is the read position and count the write
 * position in input_chars.  Each iteration writes at most one element, so
 * the write position cannot overtake the read position (presumably count
 * starts at 0; its initialization is not visible here).
 */
1103 for (i
= 0; i
< input_size
; i
++)
1105 pg_wchar code
= input_chars
[i
];
1107 if (IS_CODE_IN_TABLE(code
, non_ascii_space_ranges
))
1108 input_chars
[count
++] = 0x0020;
1109 else if (IS_CODE_IN_TABLE(code
, commonly_mapped_to_nothing_ranges
))
1111 /* map to nothing */
1114 input_chars
[count
++] = code
;
1116 input_chars
[count
] = (pg_wchar
) '\0';
/*
 * NOTE(review): for this test and the loops below to describe the mapped
 * string, input_size has to be updated to count after the mapping loop.
 * That assignment is not visible in this listing; confirm it exists.
 */
1119 if (input_size
== 0)
1120 goto prohibited
; /* don't allow empty password */
1123 * 2) Normalize -- Normalize the result of step 1 using Unicode
1126 output_chars
= unicode_normalize(UNICODE_NFKC
, input_chars
);
1131 * 3) Prohibit -- Check for any characters that are not allowed in the
1132 * output. If any are found, return an error.
/*
 * NOTE(review): this loop, like the bidi checks further down, walks
 * input_chars (the mapped, not yet normalized codepoints), while the
 * description speaks of the output.  In the visible code output_chars is
 * only read by the final UTF-8 conversion.  Confirm this is intended.
 */
1134 for (i
= 0; i
< input_size
; i
++)
1136 pg_wchar code
= input_chars
[i
];
1138 if (IS_CODE_IN_TABLE(code
, prohibited_output_ranges
))
1140 if (IS_CODE_IN_TABLE(code
, unassigned_codepoint_ranges
))
1145 * 4) Check bidi -- Possibly check for right-to-left characters, and if
1146 * any are found, make sure that the whole string satisfies the
1147 * requirements for bidirectional strings. If the string does not satisfy
1148 * the requirements for bidirectional strings, return an error.
1150 * [RFC3454], Section "6. Bidirectional Characters" explains in more
1151 * detail what that means:
1153 * "In any profile that specifies bidirectional character handling, all
1154 * three of the following requirements MUST be met:
1156 * 1) The characters in section 5.8 MUST be prohibited.
1158 * 2) If a string contains any RandALCat character, the string MUST NOT
1159 * contain any LCat character.
1161 * 3) If a string contains any RandALCat character, a RandALCat character
1162 * MUST be the first character of the string, and a RandALCat character
1163 * MUST be the last character of the string."
1165 contains_RandALCat
= false;
1166 for (i
= 0; i
< input_size
; i
++)
1168 pg_wchar code
= input_chars
[i
];
1170 if (IS_CODE_IN_TABLE(code
, RandALCat_codepoint_ranges
))
1172 contains_RandALCat
= true;
1177 if (contains_RandALCat
)
1179 pg_wchar first
= input_chars
[0];
1180 pg_wchar last
= input_chars
[input_size
- 1];
1182 for (i
= 0; i
< input_size
; i
++)
1184 pg_wchar code
= input_chars
[i
];
1186 if (IS_CODE_IN_TABLE(code
, LCat_codepoint_ranges
))
1190 if (!IS_CODE_IN_TABLE(first
, RandALCat_codepoint_ranges
) ||
1191 !IS_CODE_IN_TABLE(last
, RandALCat_codepoint_ranges
))
1196 * Finally, convert the result back to UTF-8.
1199 for (wp
= output_chars
; *wp
; wp
++)
1201 unsigned char buf
[4];
1203 unicode_to_utf8(*wp
, buf
);
1204 result_size
+= pg_utf_mblen(buf
);
1207 result
= ALLOC(result_size
+ 1);
1212 * There are no error exits below here, so the error exit paths don't need
1213 * to worry about possibly freeing "result".
1215 p
= (unsigned char *) result
;
1216 for (wp
= output_chars
; *wp
; wp
++)
1218 unicode_to_utf8(*wp
, p
);
1219 p
+= pg_utf_mblen(p
);
1221 Assert((char *) p
== result
+ result_size
);
1228 return SASLPREP_SUCCESS
;
1236 return SASLPREP_PROHIBITED
;
1244 return SASLPREP_OOM
;