1 /*-------------------------------------------------------------------------
3 * SASLprep normalization, for SCRAM authentication
5 * The SASLprep algorithm is used to process a user-supplied password into
6 * canonical form. For more details, see:
8 * [RFC3454] Preparation of Internationalized Strings ("stringprep"),
9 * http://www.ietf.org/rfc/rfc3454.txt
11 * [RFC4013] SASLprep: Stringprep Profile for User Names and Passwords
12 * http://www.ietf.org/rfc/rfc4013.txt
15 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
18 * src/common/saslprep.c
20 *-------------------------------------------------------------------------
24 #include "utils/memutils.h"
26 #include "postgres_fe.h"
29 #include "common/saslprep.h"
30 #include "common/string.h"
31 #include "common/unicode_norm.h"
32 #include "mb/pg_wchar.h"
35 * In backend, we will use palloc/pfree. In frontend, use malloc, and
36 * return SASLPREP_OOM on out-of-memory.
39 #define STRDUP(s) pstrdup(s)
40 #define ALLOC(size) palloc(size)
41 #define FREE(size) pfree(size)
43 #define STRDUP(s) strdup(s)
44 #define ALLOC(size) malloc(size)
45 #define FREE(size) free(size)
48 /* Prototypes for local functions */
49 static int codepoint_range_cmp(const void *a
, const void *b
);
50 static bool is_code_in_table(pg_wchar code
, const pg_wchar
*map
, int mapsize
);
51 static int pg_utf8_string_len(const char *source
);
54 * Stringprep Mapping Tables.
56 * The stringprep specification includes a number of tables of Unicode
57 * codepoints, used in different parts of the algorithm. They are below,
58 * as arrays of codepoint ranges. Each range is a pair of codepoints,
59 * for the first and last codepoint included in the range (inclusive!).
63 * C.1.2 Non-ASCII space characters
65 * These are all mapped to the ASCII space character (U+0020).
67 static const pg_wchar non_ascii_space_ranges
[] =
78 * B.1 Commonly mapped to nothing
80 * If any of these appear in the input, they are removed.
82 static const pg_wchar commonly_mapped_to_nothing_ranges
[] =
95 * prohibited_output_ranges is a union of all the characters from
96 * the following tables:
98 * C.1.2 Non-ASCII space characters
99 * C.2.1 ASCII control characters
100 * C.2.2 Non-ASCII control characters
101 * C.3 Private Use characters
102 * C.4 Non-character code points
103 * C.5 Surrogate code points
104 * C.6 Inappropriate for plain text characters
105 * C.7 Inappropriate for canonical representation characters
106 * C.7 Change display properties or deprecated characters
107 * C.8 Tagging characters
109 * These are the tables that are listed as "prohibited output"
110 * characters in the SASLprep profile.
112 * The comment after each code range indicates which source table
113 * the code came from. Note that there is some overlap in the source
114 * tables, so one code might originate from multiple source tables.
115 * Adjacent ranges have also been merged together, to save space.
117 static const pg_wchar prohibited_output_ranges
[] =
119 0x0000, 0x001F, /* C.2.1 */
120 0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
121 0x0340, 0x0341, /* C.8 */
122 0x06DD, 0x06DD, /* C.2.2 */
123 0x070F, 0x070F, /* C.2.2 */
124 0x1680, 0x1680, /* C.1.2 */
125 0x180E, 0x180E, /* C.2.2 */
126 0x2000, 0x200F, /* C.1.2, C.2.2, C.8 */
127 0x2028, 0x202F, /* C.1.2, C.2.2, C.8 */
128 0x205F, 0x2063, /* C.1.2, C.2.2 */
129 0x206A, 0x206F, /* C.2.2, C.8 */
130 0x2FF0, 0x2FFB, /* C.7 */
131 0x3000, 0x3000, /* C.1.2 */
132 0xD800, 0xF8FF, /* C.3, C.5 */
133 0xFDD0, 0xFDEF, /* C.4 */
134 0xFEFF, 0xFEFF, /* C.2.2 */
135 0xFFF9, 0xFFFF, /* C.2.2, C.4, C.6 */
136 0x1D173, 0x1D17A, /* C.2.2 */
137 0x1FFFE, 0x1FFFF, /* C.4 */
138 0x2FFFE, 0x2FFFF, /* C.4 */
139 0x3FFFE, 0x3FFFF, /* C.4 */
140 0x4FFFE, 0x4FFFF, /* C.4 */
141 0x5FFFE, 0x5FFFF, /* C.4 */
142 0x6FFFE, 0x6FFFF, /* C.4 */
143 0x7FFFE, 0x7FFFF, /* C.4 */
144 0x8FFFE, 0x8FFFF, /* C.4 */
145 0x9FFFE, 0x9FFFF, /* C.4 */
146 0xAFFFE, 0xAFFFF, /* C.4 */
147 0xBFFFE, 0xBFFFF, /* C.4 */
148 0xCFFFE, 0xCFFFF, /* C.4 */
149 0xDFFFE, 0xDFFFF, /* C.4 */
150 0xE0001, 0xE0001, /* C.9 */
151 0xE0020, 0xE007F, /* C.9 */
152 0xEFFFE, 0xEFFFF, /* C.4 */
153 0xF0000, 0xFFFFF, /* C.3, C.4 */
154 0x100000, 0x10FFFF /* C.3, C.4 */
157 /* A.1 Unassigned code points in Unicode 3.2 */
158 static const pg_wchar unassigned_codepoint_ranges
[] =
558 /* D.1 Characters with bidirectional property "R" or "AL" */
559 static const pg_wchar RandALCat_codepoint_ranges
[] =
597 /* D.2 Characters with bidirectional property "L" */
598 static const pg_wchar LCat_codepoint_ranges
[] =
962 /* End of stringprep tables */
965 /* Is the given Unicode codepoint in the given table of ranges? */
966 #define IS_CODE_IN_TABLE(code, map) is_code_in_table(code, map, lengthof(map))
969 codepoint_range_cmp(const void *a
, const void *b
)
971 const pg_wchar
*key
= (const pg_wchar
*) a
;
972 const pg_wchar
*range
= (const pg_wchar
*) b
;
975 return -1; /* less than lower bound */
977 return 1; /* greater than upper bound */
979 return 0; /* within range */
983 is_code_in_table(pg_wchar code
, const pg_wchar
*map
, int mapsize
)
985 Assert(mapsize
% 2 == 0);
987 if (code
< map
[0] || code
> map
[mapsize
- 1])
990 if (bsearch(&code
, map
, mapsize
/ 2, sizeof(pg_wchar
) * 2,
991 codepoint_range_cmp
))
/*
 * Calculate the length in characters of a null-terminated UTF-8 string.
 *
 * Returns -1 if the input is not valid UTF-8.
 */
static int
pg_utf8_string_len(const char *source)
{
	const unsigned char *p = (const unsigned char *) source;
	int			l;
	int			num_chars = 0;
	size_t		len = strlen(source);

	while (len)
	{
		/* byte length announced by the lead byte of the next character */
		l = pg_utf_mblen(p);

		/*
		 * Fail if the sequence would extend past the remaining bytes, or if
		 * pg_utf8_islegal() rejects it.
		 */
		if (len < l || !pg_utf8_islegal(p, l))
			return -1;

		p += l;
		len -= l;
		num_chars++;
	}

	return num_chars;
}
1027 * pg_saslprep - Normalize a password with SASLprep.
1029 * SASLprep requires the input to be in UTF-8 encoding, but PostgreSQL
1030 * supports many encodings, so we don't blindly assume that. pg_saslprep
1031 * will check if the input looks like valid UTF-8, and returns
1032 * SASLPREP_INVALID_UTF8 if not.
1034 * If the string contains prohibited characters (or more precisely, if the
1035 * output string would contain prohibited characters after normalization),
1036 * returns SASLPREP_PROHIBITED.
1038 * On success, returns SASLPREP_SUCCESS, and the normalized string in *output.
1041 * In frontend, the normalized string is malloc'd, and the caller is
1042 * responsible for freeing it. If an allocation fails, returns
1043 * SASLPREP_OOM. In backend, the normalized string is palloc'd instead,
1044 * and a failed allocation leads to ereport(ERROR).
1047 pg_saslprep(const char *input
, char **output
)
1049 pg_wchar
*input_chars
= NULL
;
1050 pg_wchar
*output_chars
= NULL
;
1056 bool contains_RandALCat
;
1060 /* Ensure we return *output as NULL on failure */
1064 * Quick check if the input is pure ASCII. An ASCII string requires no
1065 * further processing.
1067 if (pg_is_ascii(input
))
1069 *output
= STRDUP(input
);
1072 return SASLPREP_SUCCESS
;
1076 * Convert the input from UTF-8 to an array of Unicode codepoints.
1078 * This also checks that the input is a legal UTF-8 string.
1080 input_size
= pg_utf8_string_len(input
);
1082 return SASLPREP_INVALID_UTF8
;
1083 if (input_size
>= MaxAllocSize
/ sizeof(pg_wchar
))
1086 input_chars
= ALLOC((input_size
+ 1) * sizeof(pg_wchar
));
1090 p
= (unsigned char *) input
;
1091 for (i
= 0; i
< input_size
; i
++)
1093 input_chars
[i
] = utf8_to_unicode(p
);
1094 p
+= pg_utf_mblen(p
);
1096 input_chars
[i
] = (pg_wchar
) '\0';
1099 * The steps below correspond to the steps listed in [RFC3454], Section
1100 * "2. Preparation Overview"
1104 * 1) Map -- For each character in the input, check if it has a mapping
1105 * and, if so, replace it with its mapping.
1108 for (i
= 0; i
< input_size
; i
++)
1110 pg_wchar code
= input_chars
[i
];
1112 if (IS_CODE_IN_TABLE(code
, non_ascii_space_ranges
))
1113 input_chars
[count
++] = 0x0020;
1114 else if (IS_CODE_IN_TABLE(code
, commonly_mapped_to_nothing_ranges
))
1116 /* map to nothing */
1119 input_chars
[count
++] = code
;
1121 input_chars
[count
] = (pg_wchar
) '\0';
1124 if (input_size
== 0)
1125 goto prohibited
; /* don't allow empty password */
1128 * 2) Normalize -- Normalize the result of step 1 using Unicode normalization (form NFKC).
1131 output_chars
= unicode_normalize(UNICODE_NFKC
, input_chars
);
1136 * 3) Prohibit -- Check for any characters that are not allowed in the
1137 * output. If any are found, return an error.
1139 for (i
= 0; i
< input_size
; i
++)
1141 pg_wchar code
= input_chars
[i
];
1143 if (IS_CODE_IN_TABLE(code
, prohibited_output_ranges
))
1145 if (IS_CODE_IN_TABLE(code
, unassigned_codepoint_ranges
))
1150 * 4) Check bidi -- Possibly check for right-to-left characters, and if
1151 * any are found, make sure that the whole string satisfies the
1152 * requirements for bidirectional strings. If the string does not satisfy
1153 * the requirements for bidirectional strings, return an error.
1155 * [RFC3454], Section "6. Bidirectional Characters" explains in more
1156 * detail what that means:
1158 * "In any profile that specifies bidirectional character handling, all
1159 * three of the following requirements MUST be met:
1161 * 1) The characters in section 5.8 MUST be prohibited.
1163 * 2) If a string contains any RandALCat character, the string MUST NOT
1164 * contain any LCat character.
1166 * 3) If a string contains any RandALCat character, a RandALCat character
1167 * MUST be the first character of the string, and a RandALCat character
1168 * MUST be the last character of the string."
1170 contains_RandALCat
= false;
1171 for (i
= 0; i
< input_size
; i
++)
1173 pg_wchar code
= input_chars
[i
];
1175 if (IS_CODE_IN_TABLE(code
, RandALCat_codepoint_ranges
))
1177 contains_RandALCat
= true;
1182 if (contains_RandALCat
)
1184 pg_wchar first
= input_chars
[0];
1185 pg_wchar last
= input_chars
[input_size
- 1];
1187 for (i
= 0; i
< input_size
; i
++)
1189 pg_wchar code
= input_chars
[i
];
1191 if (IS_CODE_IN_TABLE(code
, LCat_codepoint_ranges
))
1195 if (!IS_CODE_IN_TABLE(first
, RandALCat_codepoint_ranges
) ||
1196 !IS_CODE_IN_TABLE(last
, RandALCat_codepoint_ranges
))
1201 * Finally, convert the result back to UTF-8.
1204 for (wp
= output_chars
; *wp
; wp
++)
1206 unsigned char buf
[4];
1208 unicode_to_utf8(*wp
, buf
);
1209 result_size
+= pg_utf_mblen(buf
);
1212 result
= ALLOC(result_size
+ 1);
1217 * There are no error exits below here, so the error exit paths don't need
1218 * to worry about possibly freeing "result".
1220 p
= (unsigned char *) result
;
1221 for (wp
= output_chars
; *wp
; wp
++)
1223 unicode_to_utf8(*wp
, p
);
1224 p
+= pg_utf_mblen(p
);
1226 Assert((char *) p
== result
+ result_size
);
1233 return SASLPREP_SUCCESS
;
1241 return SASLPREP_PROHIBITED
;
1249 return SASLPREP_OOM
;