1 // Email address parsing and validating.
6 #include <tao/pegtl.hpp>
7 #include <tao/pegtl/contrib/abnf.hpp>
9 using namespace tao::pegtl
;
10 using namespace tao::pegtl::abnf
;
// Result of a successful parse: the two halves of an address, exactly as they
// appeared in the input.  Filled in by the RFC5321/RFC5322 parse actions.
struct Address {
  std::string local_part; // text matched by the grammar's local_part rule
  std::string domain;     // text matched by the domain / non_local_part rule
};
20 // 4. Syntax of UTF-8 Byte Sequences
22 struct UTF8_tail
: range
<'\x80', '\xBF'> {};
24 struct UTF8_1
: range
<0x00, 0x7F> {};
26 struct UTF8_2
: seq
<range
<'\xC2', '\xDF'>, UTF8_tail
> {};
28 struct UTF8_3
: sor
<seq
<one
<'\xE0'>, range
<'\xA0', '\xBF'>, UTF8_tail
>,
29 seq
<range
<'\xE1', '\xEC'>, rep
<2, UTF8_tail
>>,
30 seq
<one
<'\xED'>, range
<'\x80', '\x9F'>, UTF8_tail
>,
31 seq
<range
<'\xEE', '\xEF'>, rep
<2, UTF8_tail
>>> {};
33 struct UTF8_4
: sor
<seq
<one
<'\xF0'>, range
<'\x90', '\xBF'>, rep
<2, UTF8_tail
>>,
34 seq
<range
<'\xF1', '\xF3'>, rep
<3, UTF8_tail
>>,
35 seq
<one
<'\xF4'>, range
<'\x80', '\x8F'>, rep
<2, UTF8_tail
>>> {};
37 struct non_ascii
: sor
<UTF8_2
, UTF8_3
, UTF8_4
> {};
39 } // namespace RFC3629
42 struct VUCHAR
: sor
<VCHAR
, RFC3629::non_ascii
> {};
44 // excluded from atext: "(),.@[]"
45 struct atext
: sor
<ALPHA
, DIGIT
,
56 RFC3629::non_ascii
> {};
61 // <https://tools.ietf.org/html/rfc5321>
64 using colon
= one
<':'>;
66 struct u_let_dig
: sor
<ALPHA
, DIGIT
, RFC3629::non_ascii
> {};
68 struct u_ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, u_let_dig
>, u_let_dig
>> {};
70 struct u_label
: seq
<u_let_dig
, u_ldh_tail
> {};
72 struct let_dig
: sor
<ALPHA
, DIGIT
> {};
74 struct ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, let_dig
>, let_dig
>> {};
76 struct ldh_str
: seq
<let_dig
, ldh_tail
> {};
78 struct label
: ldh_str
{};
80 struct sub_domain
: sor
<label
, u_label
> {};
82 struct domain
: list
<sub_domain
, dot
> {};
84 struct dec_octet
: sor
<seq
<string
<'2','5'>, range
<'0','5'>>,
85 seq
<one
<'2'>, range
<'0','4'>, DIGIT
>,
86 seq
<range
<'0', '1'>, rep
<2, DIGIT
>>,
87 rep_min_max
<1, 2, DIGIT
>> {};
89 struct IPv4_address_literal
: seq
<dec_octet
, dot
, dec_octet
, dot
, dec_octet
, dot
, dec_octet
> {};
91 struct h16
: rep_min_max
<1, 4, HEXDIG
> {};
93 struct ls32
: sor
<seq
<h16
, colon
, h16
>, IPv4_address_literal
> {};
95 struct dcolon
: two
<':'> {};
97 struct IPv6address
: sor
<seq
< rep
<6, h16
, colon
>, ls32
>,
98 seq
< dcolon
, rep
<5, h16
, colon
>, ls32
>,
99 seq
<opt
<h16
>, dcolon
, rep
<4, h16
, colon
>, ls32
>,
100 seq
<opt
<h16
, opt
< colon
, h16
>>, dcolon
, rep
<3, h16
, colon
>, ls32
>,
101 seq
<opt
<h16
, rep_opt
<2, colon
, h16
>>, dcolon
, rep
<2, h16
, colon
>, ls32
>,
102 seq
<opt
<h16
, rep_opt
<3, colon
, h16
>>, dcolon
, h16
, colon
, ls32
>,
103 seq
<opt
<h16
, rep_opt
<4, colon
, h16
>>, dcolon
, ls32
>,
104 seq
<opt
<h16
, rep_opt
<5, colon
, h16
>>, dcolon
, h16
>,
105 seq
<opt
<h16
, rep_opt
<6, colon
, h16
>>, dcolon
>> {};
107 struct IPv6_address_literal
: seq
<TAO_PEGTL_ISTRING("IPv6:"), IPv6address
> {};
109 struct dcontent
: ranges
<33, 90, 94, 126> {};
111 struct standardized_tag
: ldh_str
{};
113 struct general_address_literal
: seq
<standardized_tag
, colon
, plus
<dcontent
>> {};
115 // 4.1.3. Address Literals
116 struct address_literal
: seq
<one
<'['>,
117 sor
<IPv4_address_literal
,
118 IPv6_address_literal
,
119 general_address_literal
>,
123 struct qtextSMTP
: sor
<ranges
<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii
> {};
124 struct graphic
: range
<32, 126> {};
125 struct quoted_pairSMTP
: seq
<one
<'\\'>, graphic
> {};
126 struct qcontentSMTP
: sor
<qtextSMTP
, quoted_pairSMTP
> {};
128 struct atom
: plus
<Chars::atext
> {};
129 struct dot_string
: list
<atom
, dot
> {};
130 struct quoted_string
: seq
<one
<'"'>, star
<qcontentSMTP
>, one
<'"'>> {};
131 struct local_part
: sor
<dot_string
, quoted_string
> {};
132 struct non_local_part
: sor
<domain
, address_literal
> {};
133 struct mailbox
: seq
<local_part
, one
<'@'>, non_local_part
> {};
134 struct mailbox_only
: seq
<mailbox
, eof
> {};
139 template <typename Rule
>
140 struct action
: nothing
<Rule
> {
144 struct action
<local_part
> {
145 template <typename Input
>
146 static void apply(Input
const& in
, Address
& addr
)
148 addr
.local_part
= in
.string();
153 struct action
<non_local_part
> {
154 template <typename Input
>
155 static void apply(Input
const& in
, Address
& addr
)
157 addr
.domain
= in
.string();
160 } // namespace RFC5321
163 // <https://tools.ietf.org/html/rfc5322>
166 using dot
= one
<'.'>;
168 struct quoted_pair
: seq
<one
<'\\'>, sor
<Chars::VUCHAR
, WSP
>> {};
170 // 3.2.2. Folding White Space and Comments
172 struct FWS
: seq
<opt
<seq
<star
<WSP
>, eol
>>, plus
<WSP
>> {};
174 // ctext is ASCII but not '(' or ')' or '\\', plus non-ASCII
175 struct ctext
: sor
<ranges
<33, 39, 42, 91, 93, 126>, RFC3629::non_ascii
> {};
179 struct ccontent
: sor
<ctext
, quoted_pair
, comment
> {};
181 struct comment
: seq
<one
<'('>, star
<seq
<opt
<FWS
>, ccontent
>>, opt
<FWS
>, one
<')'>> {};
183 struct CFWS
: sor
<seq
<plus
<seq
<opt
<FWS
>, comment
>, opt
<FWS
>>>, FWS
> {};
187 struct atom
: seq
<opt
<CFWS
>, plus
<Chars::atext
>, opt
<CFWS
>> {};
188 struct dot_atom_text
: list
<plus
<Chars::atext
>, dot
> {};
189 struct dot_atom
: seq
<opt
<CFWS
>, dot_atom_text
, opt
<CFWS
>> {};
191 // 3.2.4. Quoted Strings
193 struct qtext
: sor
<one
<33>, ranges
<35, 91, 93, 126>, RFC3629::non_ascii
> {};
194 struct qcontent
: sor
<qtext
, quoted_pair
> {};
196 // Corrected in errata ID: 3135
200 sor
<seq
<star
<seq
<opt
<FWS
>, qcontent
>>, opt
<FWS
>>, FWS
>,
204 // 3.2.5. Miscellaneous Tokens
206 struct word
: sor
<atom
, quoted_string
> {};
207 struct phrase
: plus
<word
> {};
209 // 3.4.1. Addr-Spec Specification
211 struct dtext
: ranges
<33, 90, 94, 126> {};
212 struct domain_literal
: seq
<opt
<CFWS
>,
213 one
<'['>, star
<seq
<opt
<FWS
>, dtext
>>, opt
<FWS
>, one
<']'>,
215 struct domain
: sor
<dot_atom
, domain_literal
> {};
216 struct local_part
: sor
<dot_atom
, quoted_string
> {};
217 struct addr_spec
: seq
<local_part
, one
<'@'>, domain
> {};
219 // 3.4 Address Specification
222 struct display_name
: phrase
{};
223 struct group
: seq
<display_name
, one
<':'>, opt
<group_list
>, one
<';'>, opt
<CFWS
>> {};
224 struct angle_addr
: seq
<opt
<CFWS
>, one
<'<'>, addr_spec
, one
<'>'>, opt
<CFWS
>> {};
225 struct name_addr
: seq
<opt
<display_name
>, angle_addr
> {};
226 struct mailbox
: sor
<name_addr
, addr_spec
> {};
227 struct mailbox_list
: list
<mailbox
, one
<','>> {};
228 struct group_list
: sor
<mailbox_list
, CFWS
> {};
229 struct address
: sor
<mailbox
, group
> {};
230 struct address_only
: seq
<address
, eof
> {};
235 template <typename Rule
>
236 struct action
: nothing
<Rule
> {
240 struct action
<local_part
> {
241 template <typename Input
>
242 static void apply(Input
const& in
, Address
& addr
)
244 addr
.local_part
= in
.string();
249 struct action
<domain
> {
250 template <typename Input
>
251 static void apply(Input
const& in
, Address
& addr
)
253 addr
.domain
= in
.string();
256 } // namespace RFC5322
258 bool validate_mailbox(std::string_view value
)
262 memory_input
<> address_in(value
, "address");
263 if (!parse
<RFC5321::mailbox_only
, RFC5321::action
>(address_in
, addr
)) {
267 // RFC-5321 section 4.5.3.1. Size Limits and Minimums
269 if (addr
.local_part
.length() > 64) { // Section 4.5.3.1.1. Local-part
272 if (addr
.domain
.length() > 255) { // Section 4.5.3.1.2.
273 // Also RFC 2181 section 11. Name syntax
278 // each label is limited to between 1 and 63 octets
283 bool validate_address(std::string_view value
)
287 memory_input
<> address_in(value
, "address");
288 if (!parse
<RFC5322::address_only
, RFC5322::action
>(address_in
, addr
)) {
297 // <https://en.wikipedia.org/wiki/Email_address#Examples>
299 // Valid email addresses
301 assert(validate_mailbox("simple@example.com"));
302 assert(validate_mailbox("very.common@example.com"));
303 assert(validate_mailbox("disposable.style.email.with+symbol@example.com"));
304 assert(validate_mailbox("other.email-with-hyphen@example.com"));
305 assert(validate_mailbox("fully-qualified-domain@example.com"));
307 // (may go to user.name@example.com inbox depending on mail server)
308 assert(validate_mailbox("user.name+tag+sorting@example.com"));
310 assert(validate_mailbox("x@example.com"));
311 assert(validate_mailbox("example-indeed@strange-example.com"));
313 // (local domain name with no TLD, although ICANN highly discourages
314 // dotless email addresses)
315 assert(validate_mailbox("admin@mailserver1"));
317 // (see the List of Internet top-level domains)
318 assert(validate_mailbox("example@s.example"));
320 // (space between the quotes)
321 assert(validate_mailbox("\" \"@example.org"));
323 // (quoted double dot)
324 assert(validate_mailbox("\"john..doe\"@example.org"));
326 // (bangified host route used for uucp mailers)
327 assert(validate_mailbox("mailhost!username@example.org"));
329 // (% escaped mail route to user@example.com via example.org)
330 assert(validate_mailbox("user%example.com@example.org"));
332 // Invalid email addresses
334 assert(!validate_mailbox("Abc.example.com")); // (no @ character)
336 assert(!validate_mailbox("A@b@c@example.com")); // (only one @ is allowed)
338 // (none of the special characters in this local-part are allowed
339 // outside quotation marks)
340 assert(!validate_mailbox("a\"b(c)d,e:f;g<h>i[j\\k]l@example.com"));
342 // (quoted strings must be dot separated or the only element making
343 // up the local-part)
344 assert(!validate_mailbox("just\"not\"right@example.com"));
346 // (spaces, quotes, and backslashes may only exist when within
347 // quoted strings and preceded by a backslash)
348 assert(!validate_mailbox("this is\"not\\allowed@example.com"));
350 // (even if escaped (preceded by a backslash), spaces, quotes, and
351 // backslashes must still be contained by quotes)
352 assert(!validate_mailbox("this\\ still\\\"not\\\\allowed@example.com"));
354 // (local part is longer than 64 characters)
355 assert(!validate_mailbox(
356 "1234567890123456789012345678901234567890123456789012345"
357 "678901234+x@example.com"));
359 assert(!validate_address("foo bar@digilicious.com"));
360 assert(validate_address("gene@digilicious.com"));
361 assert(validate_address("Gene Hightower <gene@digilicious.com>"));
362 assert(validate_address("gene@[127.999.0.1]"));
363 assert(validate_address("madness!@example.org"));
364 assert(validate_address("(comment)mailbox@example.com"));
366 assert(validate_mailbox("gene@digilicious.com"));
367 assert(validate_mailbox("gene@[127.0.0.1]"));
368 assert(!validate_mailbox("gene@[127.999.0.1]"));
369 assert(!validate_mailbox("allen@bad_d0main.com"));
371 assert(!validate_mailbox("2962"));
372 assert(validate_mailbox("실례@실례.테스트"));
374 // <https://docs.microsoft.com/en-us/archive/blogs/testing123/email-address-test-cases>
376 // Valid email addresses:
377 assert(validate_mailbox("email@domain.com"));
379 // Email contains dot in the local part, a dot-atom-string.
380 assert(validate_mailbox("firstname.lastname@domain.com"));
382 // Multiple lables in domain.
383 assert(validate_mailbox("email@subdomain.domain.com"));
385 // Plus sign is a valid character.
386 assert(validate_mailbox("firstname+lastname@domain.com"));
388 // Domain is valid IP address, but this is matched as a domain.
389 assert(validate_mailbox("email@123.123.123.123"));
391 // Square bracket around IP address is a "address literal."
392 assert(validate_mailbox("email@[123.123.123.123]"));
394 // Quotes around local part is valid.
395 assert(validate_mailbox("\"email\"@domain.com"));
397 // Digits in address are valid.
398 assert(validate_mailbox("1234567890@domain.com"));
400 // Dash in domain name is valid.
401 assert(validate_mailbox("email@domain-one.com"));
403 // Underscore in the address field is valid.
404 assert(validate_mailbox("_______@domain.com"));
406 assert(validate_mailbox("email@domain.name"));
407 assert(validate_mailbox("email@domain.co.jp"));
409 // Dash in local part is valid.
410 assert(validate_mailbox("firstname-lastname@domain.com"));
412 assert(!validate_mailbox("plainaddress")); // Missing @ sign and domain
413 assert(!validate_mailbox("#@%^%#$@#$@#.com")); // Garbage
414 assert(!validate_mailbox("@domain.com")); // Missing username
416 assert(!validate_mailbox("Joe Smith <email@domain.com>"));
417 assert(validate_address("Joe Smith <email@domain.com>"));
419 assert(!validate_mailbox("email.domain.com")); // Missing @
420 assert(!validate_mailbox("email@domain@domain.com")); // Two @ sign
422 // Leading dot in address is not allowed
423 assert(!validate_mailbox(".email@domain.com"));
425 // Trailing dot in address is not allowed
426 assert(!validate_mailbox("email.@domain.com"));
429 assert(!validate_mailbox("email..email@domain.com"));
431 // OK! Unicode char as address
432 assert(validate_mailbox("あいうえお@domain.com"));
434 // Comment not allowed in 5321 mailbox.
435 assert(!validate_mailbox("email@domain.com (Joe Smith)"));
437 // Comment fine in 5322 address.
438 assert(validate_address("email@domain.com (Joe Smith)"));
440 // Missing top level domain (.com/.net/.org/etc).
441 assert(validate_mailbox("email@domain"));
443 // Leading dash in front of domain is invalid.
444 assert(!validate_mailbox("email@-domain.com"));
446 // .web is not a valid top level domain, oh yeah? says who?
447 assert(validate_mailbox("email@domain.web"));
449 // Invalid IP address.
450 assert(!validate_mailbox("email@[111.222.333.44444]"));
452 // Invalid IP address, but valid domain name as it turns out.
453 assert(validate_mailbox("email@111.222.333.44444"));
455 // Not a valid domain name.
456 assert(!validate_mailbox("email@domain..com"));
458 // general_address_literal
459 assert(validate_mailbox("email@[x:~Foo_Bar_Baz<\?\?>]"));