1 // Email address parsing and validating.
6 #include <tao/pegtl.hpp>
7 #include <tao/pegtl/contrib/abnf.hpp>
9 using namespace tao::pegtl
;
10 using namespace tao::pegtl::abnf
;
13 std::string local_part
;
20 // 4. Syntax of UTF-8 Byte Sequences
22 struct UTF8_tail
: range
<'\x80', '\xBF'> {};
24 struct UTF8_1
: range
<0x00, 0x7F> {};
26 struct UTF8_2
: seq
<range
<'\xC2', '\xDF'>, UTF8_tail
> {};
28 struct UTF8_3
: sor
<seq
<one
<'\xE0'>, range
<'\xA0', '\xBF'>, UTF8_tail
>,
29 seq
<range
<'\xE1', '\xEC'>, rep
<2, UTF8_tail
>>,
30 seq
<one
<'\xED'>, range
<'\x80', '\x9F'>, UTF8_tail
>,
31 seq
<range
<'\xEE', '\xEF'>, rep
<2, UTF8_tail
>>> {};
33 struct UTF8_4
: sor
<seq
<one
<'\xF0'>, range
<'\x90', '\xBF'>, rep
<2, UTF8_tail
>>,
34 seq
<range
<'\xF1', '\xF3'>, rep
<3, UTF8_tail
>>,
35 seq
<one
<'\xF4'>, range
<'\x80', '\x8F'>, rep
<2, UTF8_tail
>>> {};
37 struct non_ascii
: sor
<UTF8_2
, UTF8_3
, UTF8_4
> {};
39 } // namespace RFC3629
42 struct VUCHAR
: sor
<VCHAR
, RFC3629::non_ascii
> {};
44 // excluded from atext: "(),.@[]"
45 struct atext
: sor
<ALPHA
, DIGIT
,
56 RFC3629::non_ascii
> {};
61 // <https://tools.ietf.org/html/rfc5321>
64 using colon
= one
<':'>;
66 struct u_let_dig
: sor
<ALPHA
, DIGIT
, RFC3629::non_ascii
> {};
68 struct u_ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, u_let_dig
>, u_let_dig
>> {};
70 struct u_label
: seq
<u_let_dig
, u_ldh_tail
> {};
72 struct let_dig
: sor
<ALPHA
, DIGIT
> {};
74 struct ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, let_dig
>, let_dig
>> {};
76 struct ldh_str
: seq
<let_dig
, ldh_tail
> {};
78 struct label
: ldh_str
{};
80 struct sub_domain
: sor
<label
, u_label
> {};
82 struct domain
: list
<sub_domain
, dot
> {};
84 struct dec_octet
: sor
<DIGIT
,
85 seq
<range
<'1', '9'>, DIGIT
>,
86 seq
<one
<'1'>, rep
<2, DIGIT
>>,
87 seq
<one
<'2'>, range
<'0','4'>, DIGIT
>,
88 seq
<string
<'2','5'>, range
<'0','5'>>> {};
90 struct IPv4_address_literal
: seq
<dec_octet
, dot
, dec_octet
, dot
, dec_octet
, dot
, dec_octet
> {};
92 struct h16
: rep_min_max
<1, 4, HEXDIG
> {};
94 struct ls32
: sor
<seq
<h16
, colon
, h16
>, IPv4_address_literal
> {};
96 struct dcolon
: two
<':'> {};
98 struct IPv6address
: sor
<seq
< rep
<6, h16
, colon
>, ls32
>,
99 seq
< dcolon
, rep
<5, h16
, colon
>, ls32
>,
100 seq
<opt
<h16
>, dcolon
, rep
<4, h16
, colon
>, ls32
>,
101 seq
<opt
<h16
, opt
< colon
, h16
>>, dcolon
, rep
<3, h16
, colon
>, ls32
>,
102 seq
<opt
<h16
, rep_opt
<2, colon
, h16
>>, dcolon
, rep
<2, h16
, colon
>, ls32
>,
103 seq
<opt
<h16
, rep_opt
<3, colon
, h16
>>, dcolon
, h16
, colon
, ls32
>,
104 seq
<opt
<h16
, rep_opt
<4, colon
, h16
>>, dcolon
, ls32
>,
105 seq
<opt
<h16
, rep_opt
<5, colon
, h16
>>, dcolon
, h16
>,
106 seq
<opt
<h16
, rep_opt
<6, colon
, h16
>>, dcolon
>> {};
108 struct IPv6_address_literal
: seq
<TAO_PEGTL_ISTRING("IPv6:"), IPv6address
> {};
110 struct dcontent
: ranges
<33, 90, 94, 126> {};
112 struct standardized_tag
: ldh_str
{};
114 struct general_address_literal
: seq
<standardized_tag
, colon
, plus
<dcontent
>> {};
116 // 4.1.3. Address Literals
117 struct address_literal
: seq
<one
<'['>,
118 sor
<IPv4_address_literal
,
119 IPv6_address_literal
,
120 general_address_literal
>,
124 struct qtextSMTP
: sor
<ranges
<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii
> {};
125 struct graphic
: range
<32, 126> {};
126 struct quoted_pairSMTP
: seq
<one
<'\\'>, graphic
> {};
127 struct qcontentSMTP
: sor
<qtextSMTP
, quoted_pairSMTP
> {};
129 struct atom
: plus
<Chars::atext
> {};
130 struct dot_string
: list
<atom
, dot
> {};
131 struct quoted_string
: seq
<one
<'"'>, star
<qcontentSMTP
>, one
<'"'>> {};
132 struct local_part
: sor
<dot_string
, quoted_string
> {};
133 struct non_local_part
: sor
<domain
, address_literal
> {};
134 struct mailbox
: seq
<local_part
, one
<'@'>, non_local_part
> {};
135 struct mailbox_only
: seq
<mailbox
, eof
> {};
140 template <typename Rule
>
141 struct action
: nothing
<Rule
> {
145 struct action
<local_part
> {
146 template <typename Input
>
147 static void apply(Input
const& in
, Address
& addr
)
149 addr
.local_part
= in
.string();
154 struct action
<non_local_part
> {
155 template <typename Input
>
156 static void apply(Input
const& in
, Address
& addr
)
158 addr
.domain
= in
.string();
161 } // namespace RFC5321
164 // <https://tools.ietf.org/html/rfc5322>
167 using dot
= one
<'.'>;
169 struct quoted_pair
: seq
<one
<'\\'>, sor
<Chars::VUCHAR
, WSP
>> {};
171 // 3.2.2. Folding White Space and Comments
173 struct FWS
: seq
<opt
<seq
<star
<WSP
>, eol
>>, plus
<WSP
>> {};
175 // ctext is ASCII but not '(' or ')' or '\\', plus non-ASCII
176 struct ctext
: sor
<ranges
<33, 39, 42, 91, 93, 126>, RFC3629::non_ascii
> {};
180 struct ccontent
: sor
<ctext
, quoted_pair
, comment
> {};
182 struct comment
: seq
<one
<'('>, star
<seq
<opt
<FWS
>, ccontent
>>, opt
<FWS
>, one
<')'>> {};
184 struct CFWS
: sor
<seq
<plus
<seq
<opt
<FWS
>, comment
>, opt
<FWS
>>>, FWS
> {};
188 struct atom
: seq
<opt
<CFWS
>, plus
<Chars::atext
>, opt
<CFWS
>> {};
189 struct dot_atom_text
: list
<plus
<Chars::atext
>, dot
> {};
190 struct dot_atom
: seq
<opt
<CFWS
>, dot_atom_text
, opt
<CFWS
>> {};
192 // 3.2.4. Quoted Strings
194 struct qtext
: sor
<one
<33>, ranges
<35, 91, 93, 126>, RFC3629::non_ascii
> {};
195 struct qcontent
: sor
<qtext
, quoted_pair
> {};
197 // Corrected in errata ID: 3135
201 sor
<seq
<star
<seq
<opt
<FWS
>, qcontent
>>, opt
<FWS
>>, FWS
>,
205 // 3.2.5. Miscellaneous Tokens
207 struct word
: sor
<atom
, quoted_string
> {};
208 struct phrase
: plus
<word
> {};
210 // 3.4.1. Addr-Spec Specification
212 struct dtext
: ranges
<33, 90, 94, 126> {};
213 struct domain_literal
: seq
<opt
<CFWS
>,
214 one
<'['>, star
<seq
<opt
<FWS
>, dtext
>>, opt
<FWS
>, one
<']'>,
216 struct domain
: sor
<dot_atom
, domain_literal
> {};
217 struct local_part
: sor
<dot_atom
, quoted_string
> {};
218 struct addr_spec
: seq
<local_part
, one
<'@'>, domain
> {};
220 // 3.4 Address Specification
223 struct display_name
: phrase
{};
224 struct group
: seq
<display_name
, one
<':'>, opt
<group_list
>, one
<';'>, opt
<CFWS
>> {};
225 struct angle_addr
: seq
<opt
<CFWS
>, one
<'<'>, addr_spec
, one
<'>'>, opt
<CFWS
>> {};
226 struct name_addr
: seq
<opt
<display_name
>, angle_addr
> {};
227 struct mailbox
: sor
<name_addr
, addr_spec
> {};
228 struct mailbox_list
: list
<mailbox
, one
<','>> {};
229 struct group_list
: sor
<mailbox_list
, CFWS
> {};
230 struct address
: sor
<mailbox
, group
> {};
231 struct address_only
: seq
<address
, eof
> {};
236 template <typename Rule
>
237 struct action
: nothing
<Rule
> {
241 struct action
<local_part
> {
242 template <typename Input
>
243 static void apply(Input
const& in
, Address
& addr
)
245 addr
.local_part
= in
.string();
250 struct action
<domain
> {
251 template <typename Input
>
252 static void apply(Input
const& in
, Address
& addr
)
254 addr
.domain
= in
.string();
257 } // namespace RFC5322
259 bool validate_mailbox(std::string_view value
)
263 memory_input
<> address_in(value
, "address");
264 if (!parse
<RFC5321::mailbox_only
, RFC5321::action
>(address_in
, addr
)) {
268 // RFC-5321 section 4.5.3.1. Size Limits and Minimums
270 if (addr
.local_part
.length() > 64) { // Section 4.5.3.1.1. Local-part
273 if (addr
.domain
.length() > 255) { // Section 4.5.3.1.2.
274 // Also RFC 2181 section 11. Name syntax
279 // each label is limited to between 1 and 63 octets
284 bool validate_address(std::string_view value
)
288 memory_input
<> address_in(value
, "address");
289 if (!parse
<RFC5322::address_only
, RFC5322::action
>(address_in
, addr
)) {
298 // <https://en.wikipedia.org/wiki/Email_address#Examples>
300 // Valid email addresses
302 assert(validate_mailbox("simple@example.com"));
303 assert(validate_mailbox("very.common@example.com"));
304 assert(validate_mailbox("disposable.style.email.with+symbol@example.com"));
305 assert(validate_mailbox("other.email-with-hyphen@example.com"));
306 assert(validate_mailbox("fully-qualified-domain@example.com"));
308 // (may go to user.name@example.com inbox depending on mail server)
309 assert(validate_mailbox("user.name+tag+sorting@example.com"));
311 assert(validate_mailbox("x@example.com"));
312 assert(validate_mailbox("example-indeed@strange-example.com"));
314 // (local domain name with no TLD, although ICANN highly discourages
315 // dotless email addresses)
316 assert(validate_mailbox("admin@mailserver1"));
318 // (see the List of Internet top-level domains)
319 assert(validate_mailbox("example@s.example"));
321 // (space between the quotes)
322 assert(validate_mailbox("\" \"@example.org"));
324 // (quoted double dot)
325 assert(validate_mailbox("\"john..doe\"@example.org"));
327 // (bangified host route used for uucp mailers)
328 assert(validate_mailbox("mailhost!username@example.org"));
330 // (% escaped mail route to user@example.com via example.org)
331 assert(validate_mailbox("user%example.com@example.org"));
333 // Invalid email addresses
335 assert(!validate_mailbox("Abc.example.com")); // (no @ character)
337 assert(!validate_mailbox("A@b@c@example.com")); // (only one @ is allowed)
339 // (none of the special characters in this local-part are allowed
340 // outside quotation marks)
341 assert(!validate_mailbox("a\"b(c)d,e:f;g<h>i[j\\k]l@example.com"));
343 // (quoted strings must be dot separated or the only element making
344 // up the local-part)
345 assert(!validate_mailbox("just\"not\"right@example.com"));
347 // (spaces, quotes, and backslashes may only exist when within
348 // quoted strings and preceded by a backslash)
349 assert(!validate_mailbox("this is\"not\\allowed@example.com"));
351 // (even if escaped (preceded by a backslash), spaces, quotes, and
352 // backslashes must still be contained by quotes)
353 assert(!validate_mailbox("this\\ still\\\"not\\\\allowed@example.com"));
355 // (local part is longer than 64 characters)
356 assert(!validate_mailbox(
357 "1234567890123456789012345678901234567890123456789012345"
358 "678901234+x@example.com"));
360 assert(!validate_address("foo bar@digilicious.com"));
361 assert(validate_address("gene@digilicious.com"));
362 assert(validate_address("Gene Hightower <gene@digilicious.com>"));
363 assert(validate_address("gene@[127.999.0.1]"));
364 assert(validate_address("madness!@example.org"));
365 assert(validate_address("(comment)mailbox@example.com"));
367 assert(validate_mailbox("gene@digilicious.com"));
368 assert(validate_mailbox("gene@[127.0.0.1]"));
369 assert(!validate_mailbox("gene@[127.999.0.1]"));
370 assert(!validate_mailbox("allen@bad_d0main.com"));
372 assert(!validate_mailbox("2962"));
373 assert(validate_mailbox("실례@실례.테스트"));
375 // <https://docs.microsoft.com/en-us/archive/blogs/testing123/email-address-test-cases>
377 // Valid email addresses:
378 assert(validate_mailbox("email@domain.com"));
380 // Email contains dot in the local part, a dot-atom-string.
381 assert(validate_mailbox("firstname.lastname@domain.com"));
383 // Multiple labels in domain.
384 assert(validate_mailbox("email@subdomain.domain.com"));
386 // Plus sign is a valid character.
387 assert(validate_mailbox("firstname+lastname@domain.com"));
389 // Domain is valid IP address, but this is matched as a domain.
390 assert(validate_mailbox("email@123.123.123.123"));
392 // Square bracket around IP address is an "address literal."
393 assert(validate_mailbox("email@[123.123.123.123]"));
395 // Quotes around local part is valid.
396 assert(validate_mailbox("\"email\"@domain.com"));
398 // Digits in address are valid.
399 assert(validate_mailbox("1234567890@domain.com"));
401 // Dash in domain name is valid.
402 assert(validate_mailbox("email@domain-one.com"));
404 // Underscore in the address field is valid.
405 assert(validate_mailbox("_______@domain.com"));
407 assert(validate_mailbox("email@domain.name"));
408 assert(validate_mailbox("email@domain.co.jp"));
410 // Dash in local part is valid.
411 assert(validate_mailbox("firstname-lastname@domain.com"));
413 assert(!validate_mailbox("plainaddress")); // Missing @ sign and domain
414 assert(!validate_mailbox("#@%^%#$@#$@#.com")); // Garbage
415 assert(!validate_mailbox("@domain.com")); // Missing username
417 assert(!validate_mailbox("Joe Smith <email@domain.com>"));
418 assert(validate_address("Joe Smith <email@domain.com>"));
420 assert(!validate_mailbox("email.domain.com")); // Missing @
421 assert(!validate_mailbox("email@domain@domain.com")); // Two @ sign
423 // Leading dot in address is not allowed
424 assert(!validate_mailbox(".email@domain.com"));
426 // Trailing dot in address is not allowed
427 assert(!validate_mailbox("email.@domain.com"));
430 assert(!validate_mailbox("email..email@domain.com"));
432 // OK! Unicode char as address
433 assert(validate_mailbox("あいうえお@domain.com"));
435 // Comment not allowed in 5321 mailbox.
436 assert(!validate_mailbox("email@domain.com (Joe Smith)"));
438 // Comment fine in 5322 address.
439 assert(validate_address("email@domain.com (Joe Smith)"));
441 // Missing top level domain (.com/.net/.org/etc).
442 assert(validate_mailbox("email@domain"));
444 // Leading dash in front of domain is invalid.
445 assert(!validate_mailbox("email@-domain.com"));
447 // .web is not a valid top level domain, oh yeah? says who?
448 assert(validate_mailbox("email@domain.web"));
450 // Invalid IP address.
451 assert(!validate_mailbox("email@[111.222.333.44444]"));
453 // Invalid IP address, but valid domain name as it turns out.
454 assert(validate_mailbox("email@111.222.333.44444"));
456 // Not a valid domain name.
457 assert(!validate_mailbox("email@domain..com"));
459 // general_address_literal
460 assert(validate_mailbox("email@[x:~Foo_Bar_Baz<\?\?>]"));