Merge remote-tracking branch 'refs/remotes/origin/master'
[ghsmtp.git] / mbx.cpp
blobb16bf6cc410166a7c08b81c7132d5d095d21bf14
1 // Email address parsing and validating.
3 #include <cassert>
4 #include <string>
6 #include <tao/pegtl.hpp>
7 #include <tao/pegtl/contrib/abnf.hpp>
9 using namespace tao::pegtl;
10 using namespace tao::pegtl::abnf;
// The two halves of a parsed mailbox, split at the '@'.
// Both fields are written by the grammar actions while parsing.
struct Address {
  std::string local_part; // text matched by the local-part rule
  std::string domain;     // text matched by the domain / address-literal rule
};
17 namespace RFC3629 {
18 // clang-format off
20 // 4. Syntax of UTF-8 Byte Sequences
22 struct UTF8_tail : range<'\x80', '\xBF'> {};
24 struct UTF8_1 : range<0x00, 0x7F> {};
26 struct UTF8_2 : seq<range<'\xC2', '\xDF'>, UTF8_tail> {};
28 struct UTF8_3 : sor<seq<one<'\xE0'>, range<'\xA0', '\xBF'>, UTF8_tail>,
29 seq<range<'\xE1', '\xEC'>, rep<2, UTF8_tail>>,
30 seq<one<'\xED'>, range<'\x80', '\x9F'>, UTF8_tail>,
31 seq<range<'\xEE', '\xEF'>, rep<2, UTF8_tail>>> {};
33 struct UTF8_4 : sor<seq<one<'\xF0'>, range<'\x90', '\xBF'>, rep<2, UTF8_tail>>,
34 seq<range<'\xF1', '\xF3'>, rep<3, UTF8_tail>>,
35 seq<one<'\xF4'>, range<'\x80', '\x8F'>, rep<2, UTF8_tail>>> {};
37 struct non_ascii : sor<UTF8_2, UTF8_3, UTF8_4> {};
39 } // namespace RFC3629
41 namespace Chars {
42 struct VUCHAR : sor<VCHAR, RFC3629::non_ascii> {};
44 // excluded from atext: "(),.@[]"
45 struct atext : sor<ALPHA, DIGIT,
46 one<'!', '#',
47 '$', '%',
48 '&', '\'',
49 '*', '+',
50 '-', '/',
51 '=', '?',
52 '^', '_',
53 '`', '{',
54 '|', '}',
55 '~'>,
56 RFC3629::non_ascii> {};
58 } // namespace Chars
60 namespace RFC5321 {
61 // <https://tools.ietf.org/html/rfc5321>
63 using dot = one<'.'>;
64 using colon = one<':'>;
66 struct u_let_dig : sor<ALPHA, DIGIT, RFC3629::non_ascii> {};
68 struct u_ldh_tail : star<sor<seq<plus<one<'-'>>, u_let_dig>, u_let_dig>> {};
70 struct u_label : seq<u_let_dig, u_ldh_tail> {};
72 struct let_dig : sor<ALPHA, DIGIT> {};
74 struct ldh_tail : star<sor<seq<plus<one<'-'>>, let_dig>, let_dig>> {};
76 struct ldh_str : seq<let_dig, ldh_tail> {};
78 struct label : ldh_str {};
80 struct sub_domain : sor<label, u_label> {};
82 struct domain : list<sub_domain, dot> {};
84 struct dec_octet : sor<DIGIT,
85 seq<range<'1', '9'>, DIGIT>,
86 seq<one<'1'>, rep<2, DIGIT>>,
87 seq<one<'2'>, range<'0','4'>, DIGIT>,
88 seq<string<'2','5'>, range<'0','5'>>> {};
90 struct IPv4_address_literal : seq<dec_octet, dot, dec_octet, dot, dec_octet, dot, dec_octet> {};
92 struct h16 : rep_min_max<1, 4, HEXDIG> {};
94 struct ls32 : sor<seq<h16, colon, h16>, IPv4_address_literal> {};
96 struct dcolon : two<':'> {};
98 struct IPv6address : sor<seq< rep<6, h16, colon>, ls32>,
99 seq< dcolon, rep<5, h16, colon>, ls32>,
100 seq<opt<h16 >, dcolon, rep<4, h16, colon>, ls32>,
101 seq<opt<h16, opt< colon, h16>>, dcolon, rep<3, h16, colon>, ls32>,
102 seq<opt<h16, rep_opt<2, colon, h16>>, dcolon, rep<2, h16, colon>, ls32>,
103 seq<opt<h16, rep_opt<3, colon, h16>>, dcolon, h16, colon, ls32>,
104 seq<opt<h16, rep_opt<4, colon, h16>>, dcolon, ls32>,
105 seq<opt<h16, rep_opt<5, colon, h16>>, dcolon, h16>,
106 seq<opt<h16, rep_opt<6, colon, h16>>, dcolon >> {};
108 struct IPv6_address_literal : seq<TAO_PEGTL_ISTRING("IPv6:"), IPv6address> {};
110 struct dcontent : ranges<33, 90, 94, 126> {};
112 struct standardized_tag : ldh_str {};
114 struct general_address_literal : seq<standardized_tag, colon, plus<dcontent>> {};
116 // 4.1.3. Address Literals
117 struct address_literal : seq<one<'['>,
118 sor<IPv4_address_literal,
119 IPv6_address_literal,
120 general_address_literal>,
121 one<']'>> {};
124 struct qtextSMTP : sor<ranges<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii> {};
125 struct graphic : range<32, 126> {};
126 struct quoted_pairSMTP : seq<one<'\\'>, graphic> {};
127 struct qcontentSMTP : sor<qtextSMTP, quoted_pairSMTP> {};
129 struct atom : plus<Chars::atext> {};
130 struct dot_string : list<atom, dot> {};
131 struct quoted_string : seq<one<'"'>, star<qcontentSMTP>, one<'"'>> {};
132 struct local_part : sor<dot_string, quoted_string> {};
133 struct non_local_part : sor<domain, address_literal> {};
134 struct mailbox : seq<local_part, one<'@'>, non_local_part> {};
135 struct mailbox_only : seq<mailbox, eof> {};
137 // clang-format on
138 // Actions
140 template <typename Rule>
141 struct action : nothing<Rule> {
144 template <>
145 struct action<local_part> {
146 template <typename Input>
147 static void apply(Input const& in, Address& addr)
149 addr.local_part = in.string();
153 template <>
154 struct action<non_local_part> {
155 template <typename Input>
156 static void apply(Input const& in, Address& addr)
158 addr.domain = in.string();
161 } // namespace RFC5321
163 namespace RFC5322 {
164 // <https://tools.ietf.org/html/rfc5322>
165 // clang-format off
167 using dot = one<'.'>;
169 struct quoted_pair : seq<one<'\\'>, sor<Chars::VUCHAR, WSP>> {};
171 // 3.2.2. Folding White Space and Comments
173 struct FWS : seq<opt<seq<star<WSP>, eol>>, plus<WSP>> {};
175 // ctext is ASCII but not '(' or ')' or '\\', plus non-ASCII
176 struct ctext : sor<ranges<33, 39, 42, 91, 93, 126>, RFC3629::non_ascii> {};
178 struct comment;
180 struct ccontent : sor<ctext, quoted_pair, comment> {};
182 struct comment : seq<one<'('>, star<seq<opt<FWS>, ccontent>>, opt<FWS>, one<')'>> {};
184 struct CFWS : sor<seq<plus<seq<opt<FWS>, comment>, opt<FWS>>>, FWS> {};
186 // 3.2.3. Atom
188 struct atom : seq<opt<CFWS>, plus<Chars::atext>, opt<CFWS>> {};
189 struct dot_atom_text : list<plus<Chars::atext>, dot> {};
190 struct dot_atom : seq<opt<CFWS>, dot_atom_text, opt<CFWS>> {};
192 // 3.2.4. Quoted Strings
194 struct qtext : sor<one<33>, ranges<35, 91, 93, 126>, RFC3629::non_ascii> {};
195 struct qcontent : sor<qtext, quoted_pair> {};
197 // Corrected in errata ID: 3135
198 struct quoted_string
199 : seq<opt<CFWS>,
200 DQUOTE,
201 sor<seq<star<seq<opt<FWS>, qcontent>>, opt<FWS>>, FWS>,
202 DQUOTE,
203 opt<CFWS>> {};
205 // 3.2.5. Miscellaneous Tokens
207 struct word : sor<atom, quoted_string> {};
208 struct phrase : plus<word> {};
210 // 3.4.1. Addr-Spec Specification
212 struct dtext : ranges<33, 90, 94, 126> {};
213 struct domain_literal : seq<opt<CFWS>,
214 one<'['>, star<seq<opt<FWS>, dtext>>, opt<FWS>, one<']'>,
215 opt<CFWS>> {};
216 struct domain : sor<dot_atom, domain_literal> {};
217 struct local_part : sor<dot_atom, quoted_string> {};
218 struct addr_spec : seq<local_part, one<'@'>, domain> {};
220 // 3.4 Address Specification
222 struct group_list;
223 struct display_name : phrase {};
224 struct group : seq<display_name, one<':'>, opt<group_list>, one<';'>, opt<CFWS>> {};
225 struct angle_addr : seq<opt<CFWS>, one<'<'>, addr_spec, one<'>'>, opt<CFWS>> {};
226 struct name_addr : seq<opt<display_name>, angle_addr> {};
227 struct mailbox : sor<name_addr, addr_spec> {};
228 struct mailbox_list : list<mailbox, one<','>> {};
229 struct group_list : sor<mailbox_list, CFWS> {};
230 struct address : sor<mailbox, group> {};
231 struct address_only : seq<address, eof> {};
233 // clang-format on
234 // Actions
236 template <typename Rule>
237 struct action : nothing<Rule> {
240 template <>
241 struct action<local_part> {
242 template <typename Input>
243 static void apply(Input const& in, Address& addr)
245 addr.local_part = in.string();
249 template <>
250 struct action<domain> {
251 template <typename Input>
252 static void apply(Input const& in, Address& addr)
254 addr.domain = in.string();
257 } // namespace RFC5322
259 bool validate_mailbox(std::string_view value)
261 Address addr;
263 memory_input<> address_in(value, "address");
264 if (!parse<RFC5321::mailbox_only, RFC5321::action>(address_in, addr)) {
265 return false;
268 // RFC-5321 section 4.5.3.1. Size Limits and Minimums
270 if (addr.local_part.length() > 64) { // Section 4.5.3.1.1. Local-part
271 return false;
273 if (addr.domain.length() > 255) { // Section 4.5.3.1.2.
274 // Also RFC 2181 section 11. Name syntax
275 return false;
278 // FIXME
279 // each label is limited to between 1 and 63 octets
281 return true;
284 bool validate_address(std::string_view value)
286 Address addr;
288 memory_input<> address_in(value, "address");
289 if (!parse<RFC5322::address_only, RFC5322::action>(address_in, addr)) {
290 return false;
293 return true;
296 int main()
298 // <https://en.wikipedia.org/wiki/Email_address#Examples>
300 // Valid email addresses
302 assert(validate_mailbox("simple@example.com"));
303 assert(validate_mailbox("very.common@example.com"));
304 assert(validate_mailbox("disposable.style.email.with+symbol@example.com"));
305 assert(validate_mailbox("other.email-with-hyphen@example.com"));
306 assert(validate_mailbox("fully-qualified-domain@example.com"));
308 // (may go to user.name@example.com inbox depending on mail server)
309 assert(validate_mailbox("user.name+tag+sorting@example.com"));
311 assert(validate_mailbox("x@example.com"));
312 assert(validate_mailbox("example-indeed@strange-example.com"));
314 // (local domain name with no TLD, although ICANN highly discourages
315 // dotless email addresses)
316 assert(validate_mailbox("admin@mailserver1"));
318 // (see the List of Internet top-level domains)
319 assert(validate_mailbox("example@s.example"));
321 // (space between the quotes)
322 assert(validate_mailbox("\" \"@example.org"));
324 // (quoted double dot)
325 assert(validate_mailbox("\"john..doe\"@example.org"));
327 // (bangified host route used for uucp mailers)
328 assert(validate_mailbox("mailhost!username@example.org"));
330 // (% escaped mail route to user@example.com via example.org)
331 assert(validate_mailbox("user%example.com@example.org"));
333 // Invalid email addresses
335 assert(!validate_mailbox("Abc.example.com")); // (no @ character)
337 assert(!validate_mailbox("A@b@c@example.com")); // (only one @ is allowed)
339 // (none of the special characters in this local-part are allowed
340 // outside quotation marks)
341 assert(!validate_mailbox("a\"b(c)d,e:f;g<h>i[j\\k]l@example.com"));
343 // (quoted strings must be dot separated or the only element making
344 // up the local-part)
345 assert(!validate_mailbox("just\"not\"right@example.com"));
347 // (spaces, quotes, and backslashes may only exist when within
348 // quoted strings and preceded by a backslash)
349 assert(!validate_mailbox("this is\"not\\allowed@example.com"));
351 // (even if escaped (preceded by a backslash), spaces, quotes, and
352 // backslashes must still be contained by quotes)
353 assert(!validate_mailbox("this\\ still\\\"not\\\\allowed@example.com"));
355 // (local part is longer than 64 characters)
356 assert(!validate_mailbox(
357 "1234567890123456789012345678901234567890123456789012345"
358 "678901234+x@example.com"));
360 assert(!validate_address("foo bar@digilicious.com"));
361 assert(validate_address("gene@digilicious.com"));
362 assert(validate_address("Gene Hightower <gene@digilicious.com>"));
363 assert(validate_address("gene@[127.999.0.1]"));
364 assert(validate_address("madness!@example.org"));
365 assert(validate_address("(comment)mailbox@example.com"));
367 assert(validate_mailbox("gene@digilicious.com"));
368 assert(validate_mailbox("gene@[127.0.0.1]"));
369 assert(!validate_mailbox("gene@[127.999.0.1]"));
370 assert(!validate_mailbox("allen@bad_d0main.com"));
372 assert(!validate_mailbox("2962"));
373 assert(validate_mailbox("실례@실례.테스트"));
375 // <https://docs.microsoft.com/en-us/archive/blogs/testing123/email-address-test-cases>
377 // Valid email addresses:
378 assert(validate_mailbox("email@domain.com"));
380 // Email contains dot in the local part, a dot-atom-string.
381 assert(validate_mailbox("firstname.lastname@domain.com"));
383 // Multiple lables in domain.
384 assert(validate_mailbox("email@subdomain.domain.com"));
386 // Plus sign is a valid character.
387 assert(validate_mailbox("firstname+lastname@domain.com"));
389 // Domain is valid IP address, but this is matched as a domain.
390 assert(validate_mailbox("email@123.123.123.123"));
392 // Square bracket around IP address is a "address literal."
393 assert(validate_mailbox("email@[123.123.123.123]"));
395 // Quotes around local part is valid.
396 assert(validate_mailbox("\"email\"@domain.com"));
398 // Digits in address are valid.
399 assert(validate_mailbox("1234567890@domain.com"));
401 // Dash in domain name is valid.
402 assert(validate_mailbox("email@domain-one.com"));
404 // Underscore in the address field is valid.
405 assert(validate_mailbox("_______@domain.com"));
407 assert(validate_mailbox("email@domain.name"));
408 assert(validate_mailbox("email@domain.co.jp"));
410 // Dash in local part is valid.
411 assert(validate_mailbox("firstname-lastname@domain.com"));
413 assert(!validate_mailbox("plainaddress")); // Missing @ sign and domain
414 assert(!validate_mailbox("#@%^%#$@#$@#.com")); // Garbage
415 assert(!validate_mailbox("@domain.com")); // Missing username
417 assert(!validate_mailbox("Joe Smith <email@domain.com>"));
418 assert(validate_address("Joe Smith <email@domain.com>"));
420 assert(!validate_mailbox("email.domain.com")); // Missing @
421 assert(!validate_mailbox("email@domain@domain.com")); // Two @ sign
423 // Leading dot in address is not allowed
424 assert(!validate_mailbox(".email@domain.com"));
426 // Trailing dot in address is not allowed
427 assert(!validate_mailbox("email.@domain.com"));
429 // Multiple dots
430 assert(!validate_mailbox("email..email@domain.com"));
432 // OK! Unicode char as address
433 assert(validate_mailbox("あいうえお@domain.com"));
435 // Comment not allowed in 5321 mailbox.
436 assert(!validate_mailbox("email@domain.com (Joe Smith)"));
438 // Comment fine in 5322 address.
439 assert(validate_address("email@domain.com (Joe Smith)"));
441 // Missing top level domain (.com/.net/.org/etc).
442 assert(validate_mailbox("email@domain"));
444 // Leading dash in front of domain is invalid.
445 assert(!validate_mailbox("email@-domain.com"));
447 // .web is not a valid top level domain, oh yeah? says who?
448 assert(validate_mailbox("email@domain.web"));
450 // Invalid IP address.
451 assert(!validate_mailbox("email@[111.222.333.44444]"));
453 // Invalid IP address, but valid domain name as it turns out.
454 assert(validate_mailbox("email@111.222.333.44444"));
456 // Not a valid domain name.
457 assert(!validate_mailbox("email@domain..com"));
459 // general_address_literal
460 assert(validate_mailbox("email@[x:~Foo_Bar_Baz<\?\?>]"));