more Spamhaus checking and EHLO id lookup
[ghsmtp.git] / mbx.cpp
blob08c546112a09fb96ddd179a32bb77d5ed83522fe
1 // Email address parsing and validating.
#include <cassert>
#include <string>
#include <string_view>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/abnf.hpp>

using namespace tao::pegtl;
using namespace tao::pegtl::abnf;
// The two halves of an email address, as captured by the grammar actions
// while parsing.  Both members hold the matched input text verbatim.
struct Address {
  std::string local_part; // text matched by the local_part rule
  std::string domain;     // text matched by the domain / non_local_part rule
};
17 namespace RFC3629 {
18 // clang-format off
20 // 4. Syntax of UTF-8 Byte Sequences
22 struct UTF8_tail : range<'\x80', '\xBF'> {};
24 struct UTF8_1 : range<0x00, 0x7F> {};
26 struct UTF8_2 : seq<range<'\xC2', '\xDF'>, UTF8_tail> {};
28 struct UTF8_3 : sor<seq<one<'\xE0'>, range<'\xA0', '\xBF'>, UTF8_tail>,
29 seq<range<'\xE1', '\xEC'>, rep<2, UTF8_tail>>,
30 seq<one<'\xED'>, range<'\x80', '\x9F'>, UTF8_tail>,
31 seq<range<'\xEE', '\xEF'>, rep<2, UTF8_tail>>> {};
33 struct UTF8_4 : sor<seq<one<'\xF0'>, range<'\x90', '\xBF'>, rep<2, UTF8_tail>>,
34 seq<range<'\xF1', '\xF3'>, rep<3, UTF8_tail>>,
35 seq<one<'\xF4'>, range<'\x80', '\x8F'>, rep<2, UTF8_tail>>> {};
37 struct non_ascii : sor<UTF8_2, UTF8_3, UTF8_4> {};
39 } // namespace RFC3629
41 namespace Chars {
42 struct VUCHAR : sor<VCHAR, RFC3629::non_ascii> {};
44 // excluded from atext: "(),.@[]"
45 struct atext : sor<ALPHA, DIGIT,
46 one<'!', '#',
47 '$', '%',
48 '&', '\'',
49 '*', '+',
50 '-', '/',
51 '=', '?',
52 '^', '_',
53 '`', '{',
54 '|', '}',
55 '~'>,
56 RFC3629::non_ascii> {};
58 } // namespace Chars
60 namespace RFC5321 {
61 // <https://tools.ietf.org/html/rfc5321>
63 using dot = one<'.'>;
64 using colon = one<':'>;
66 struct u_let_dig : sor<ALPHA, DIGIT, RFC3629::non_ascii> {};
68 struct u_ldh_tail : star<sor<seq<plus<one<'-'>>, u_let_dig>, u_let_dig>> {};
70 struct u_label : seq<u_let_dig, u_ldh_tail> {};
72 struct let_dig : sor<ALPHA, DIGIT> {};
74 struct ldh_tail : star<sor<seq<plus<one<'-'>>, let_dig>, let_dig>> {};
76 struct ldh_str : seq<let_dig, ldh_tail> {};
78 struct label : ldh_str {};
80 struct sub_domain : sor<label, u_label> {};
82 struct domain : list<sub_domain, dot> {};
84 struct dec_octet : sor<seq<string<'2','5'>, range<'0','5'>>,
85 seq<one<'2'>, range<'0','4'>, DIGIT>,
86 seq<range<'0', '1'>, rep<2, DIGIT>>,
87 rep_min_max<1, 2, DIGIT>> {};
89 struct IPv4_address_literal : seq<dec_octet, dot, dec_octet, dot, dec_octet, dot, dec_octet> {};
91 struct h16 : rep_min_max<1, 4, HEXDIG> {};
93 struct ls32 : sor<seq<h16, colon, h16>, IPv4_address_literal> {};
95 struct dcolon : two<':'> {};
97 struct IPv6address : sor<seq< rep<6, h16, colon>, ls32>,
98 seq< dcolon, rep<5, h16, colon>, ls32>,
99 seq<opt<h16 >, dcolon, rep<4, h16, colon>, ls32>,
100 seq<opt<h16, opt< colon, h16>>, dcolon, rep<3, h16, colon>, ls32>,
101 seq<opt<h16, rep_opt<2, colon, h16>>, dcolon, rep<2, h16, colon>, ls32>,
102 seq<opt<h16, rep_opt<3, colon, h16>>, dcolon, h16, colon, ls32>,
103 seq<opt<h16, rep_opt<4, colon, h16>>, dcolon, ls32>,
104 seq<opt<h16, rep_opt<5, colon, h16>>, dcolon, h16>,
105 seq<opt<h16, rep_opt<6, colon, h16>>, dcolon >> {};
107 struct IPv6_address_literal : seq<TAO_PEGTL_ISTRING("IPv6:"), IPv6address> {};
109 struct dcontent : ranges<33, 90, 94, 126> {};
111 struct standardized_tag : ldh_str {};
113 struct general_address_literal : seq<standardized_tag, colon, plus<dcontent>> {};
115 // 4.1.3. Address Literals
116 struct address_literal : seq<one<'['>,
117 sor<IPv4_address_literal,
118 IPv6_address_literal,
119 general_address_literal>,
120 one<']'>> {};
123 struct qtextSMTP : sor<ranges<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii> {};
124 struct graphic : range<32, 126> {};
125 struct quoted_pairSMTP : seq<one<'\\'>, graphic> {};
126 struct qcontentSMTP : sor<qtextSMTP, quoted_pairSMTP> {};
128 struct atom : plus<Chars::atext> {};
129 struct dot_string : list<atom, dot> {};
130 struct quoted_string : seq<one<'"'>, star<qcontentSMTP>, one<'"'>> {};
131 struct local_part : sor<dot_string, quoted_string> {};
132 struct non_local_part : sor<domain, address_literal> {};
133 struct mailbox : seq<local_part, one<'@'>, non_local_part> {};
134 struct mailbox_only : seq<mailbox, eof> {};
136 // clang-format on
137 // Actions
139 template <typename Rule>
140 struct action : nothing<Rule> {
143 template <>
144 struct action<local_part> {
145 template <typename Input>
146 static void apply(Input const& in, Address& addr)
148 addr.local_part = in.string();
152 template <>
153 struct action<non_local_part> {
154 template <typename Input>
155 static void apply(Input const& in, Address& addr)
157 addr.domain = in.string();
160 } // namespace RFC5321
162 namespace RFC5322 {
163 // <https://tools.ietf.org/html/rfc5322>
164 // clang-format off
166 using dot = one<'.'>;
168 struct quoted_pair : seq<one<'\\'>, sor<Chars::VUCHAR, WSP>> {};
170 // 3.2.2. Folding White Space and Comments
172 struct FWS : seq<opt<seq<star<WSP>, eol>>, plus<WSP>> {};
174 // ctext is ASCII but not '(' or ')' or '\\', plus non-ASCII
175 struct ctext : sor<ranges<33, 39, 42, 91, 93, 126>, RFC3629::non_ascii> {};
177 struct comment;
179 struct ccontent : sor<ctext, quoted_pair, comment> {};
181 struct comment : seq<one<'('>, star<seq<opt<FWS>, ccontent>>, opt<FWS>, one<')'>> {};
183 struct CFWS : sor<seq<plus<seq<opt<FWS>, comment>, opt<FWS>>>, FWS> {};
185 // 3.2.3. Atom
187 struct atom : seq<opt<CFWS>, plus<Chars::atext>, opt<CFWS>> {};
188 struct dot_atom_text : list<plus<Chars::atext>, dot> {};
189 struct dot_atom : seq<opt<CFWS>, dot_atom_text, opt<CFWS>> {};
191 // 3.2.4. Quoted Strings
193 struct qtext : sor<one<33>, ranges<35, 91, 93, 126>, RFC3629::non_ascii> {};
194 struct qcontent : sor<qtext, quoted_pair> {};
196 // Corrected in errata ID: 3135
197 struct quoted_string
198 : seq<opt<CFWS>,
199 DQUOTE,
200 sor<seq<star<seq<opt<FWS>, qcontent>>, opt<FWS>>, FWS>,
201 DQUOTE,
202 opt<CFWS>> {};
204 // 3.2.5. Miscellaneous Tokens
206 struct word : sor<atom, quoted_string> {};
207 struct phrase : plus<word> {};
209 // 3.4.1. Addr-Spec Specification
211 struct dtext : ranges<33, 90, 94, 126> {};
212 struct domain_literal : seq<opt<CFWS>,
213 one<'['>, star<seq<opt<FWS>, dtext>>, opt<FWS>, one<']'>,
214 opt<CFWS>> {};
215 struct domain : sor<dot_atom, domain_literal> {};
216 struct local_part : sor<dot_atom, quoted_string> {};
217 struct addr_spec : seq<local_part, one<'@'>, domain> {};
219 // 3.4 Address Specification
221 struct group_list;
222 struct display_name : phrase {};
223 struct group : seq<display_name, one<':'>, opt<group_list>, one<';'>, opt<CFWS>> {};
224 struct angle_addr : seq<opt<CFWS>, one<'<'>, addr_spec, one<'>'>, opt<CFWS>> {};
225 struct name_addr : seq<opt<display_name>, angle_addr> {};
226 struct mailbox : sor<name_addr, addr_spec> {};
227 struct mailbox_list : list<mailbox, one<','>> {};
228 struct group_list : sor<mailbox_list, CFWS> {};
229 struct address : sor<mailbox, group> {};
230 struct address_only : seq<address, eof> {};
232 // clang-format on
233 // Actions
235 template <typename Rule>
236 struct action : nothing<Rule> {
239 template <>
240 struct action<local_part> {
241 template <typename Input>
242 static void apply(Input const& in, Address& addr)
244 addr.local_part = in.string();
248 template <>
249 struct action<domain> {
250 template <typename Input>
251 static void apply(Input const& in, Address& addr)
253 addr.domain = in.string();
256 } // namespace RFC5322
258 bool validate_mailbox(std::string_view value)
260 Address addr;
262 memory_input<> address_in(value, "address");
263 if (!parse<RFC5321::mailbox_only, RFC5321::action>(address_in, addr)) {
264 return false;
267 // RFC-5321 section 4.5.3.1. Size Limits and Minimums
269 if (addr.local_part.length() > 64) { // Section 4.5.3.1.1. Local-part
270 return false;
272 if (addr.domain.length() > 255) { // Section 4.5.3.1.2.
273 // Also RFC 2181 section 11. Name syntax
274 return false;
277 // FIXME
278 // each label is limited to between 1 and 63 octets
280 return true;
283 bool validate_address(std::string_view value)
285 Address addr;
287 memory_input<> address_in(value, "address");
288 if (!parse<RFC5322::address_only, RFC5322::action>(address_in, addr)) {
289 return false;
292 return true;
295 int main()
297 // <https://en.wikipedia.org/wiki/Email_address#Examples>
299 // Valid email addresses
301 assert(validate_mailbox("simple@example.com"));
302 assert(validate_mailbox("very.common@example.com"));
303 assert(validate_mailbox("disposable.style.email.with+symbol@example.com"));
304 assert(validate_mailbox("other.email-with-hyphen@example.com"));
305 assert(validate_mailbox("fully-qualified-domain@example.com"));
307 // (may go to user.name@example.com inbox depending on mail server)
308 assert(validate_mailbox("user.name+tag+sorting@example.com"));
310 assert(validate_mailbox("x@example.com"));
311 assert(validate_mailbox("example-indeed@strange-example.com"));
313 // (local domain name with no TLD, although ICANN highly discourages
314 // dotless email addresses)
315 assert(validate_mailbox("admin@mailserver1"));
317 // (see the List of Internet top-level domains)
318 assert(validate_mailbox("example@s.example"));
320 // (space between the quotes)
321 assert(validate_mailbox("\" \"@example.org"));
323 // (quoted double dot)
324 assert(validate_mailbox("\"john..doe\"@example.org"));
326 // (bangified host route used for uucp mailers)
327 assert(validate_mailbox("mailhost!username@example.org"));
329 // (% escaped mail route to user@example.com via example.org)
330 assert(validate_mailbox("user%example.com@example.org"));
332 // Invalid email addresses
334 assert(!validate_mailbox("Abc.example.com")); // (no @ character)
336 assert(!validate_mailbox("A@b@c@example.com")); // (only one @ is allowed)
338 // (none of the special characters in this local-part are allowed
339 // outside quotation marks)
340 assert(!validate_mailbox("a\"b(c)d,e:f;g<h>i[j\\k]l@example.com"));
342 // (quoted strings must be dot separated or the only element making
343 // up the local-part)
344 assert(!validate_mailbox("just\"not\"right@example.com"));
346 // (spaces, quotes, and backslashes may only exist when within
347 // quoted strings and preceded by a backslash)
348 assert(!validate_mailbox("this is\"not\\allowed@example.com"));
350 // (even if escaped (preceded by a backslash), spaces, quotes, and
351 // backslashes must still be contained by quotes)
352 assert(!validate_mailbox("this\\ still\\\"not\\\\allowed@example.com"));
354 // (local part is longer than 64 characters)
355 assert(!validate_mailbox(
356 "1234567890123456789012345678901234567890123456789012345"
357 "678901234+x@example.com"));
359 assert(!validate_address("foo bar@digilicious.com"));
360 assert(validate_address("gene@digilicious.com"));
361 assert(validate_address("Gene Hightower <gene@digilicious.com>"));
362 assert(validate_address("gene@[127.999.0.1]"));
363 assert(validate_address("madness!@example.org"));
364 assert(validate_address("(comment)mailbox@example.com"));
366 assert(validate_mailbox("gene@digilicious.com"));
367 assert(validate_mailbox("gene@[127.0.0.1]"));
368 assert(!validate_mailbox("gene@[127.999.0.1]"));
369 assert(!validate_mailbox("allen@bad_d0main.com"));
371 assert(!validate_mailbox("2962"));
372 assert(validate_mailbox("실례@실례.테스트"));
374 // <https://docs.microsoft.com/en-us/archive/blogs/testing123/email-address-test-cases>
376 // Valid email addresses:
377 assert(validate_mailbox("email@domain.com"));
379 // Email contains dot in the local part, a dot-atom-string.
380 assert(validate_mailbox("firstname.lastname@domain.com"));
382 // Multiple labels in domain.
383 assert(validate_mailbox("email@subdomain.domain.com"));
385 // Plus sign is a valid character.
386 assert(validate_mailbox("firstname+lastname@domain.com"));
388 // Domain is valid IP address, but this is matched as a domain.
389 assert(validate_mailbox("email@123.123.123.123"));
391 // Square brackets around an IP address make it an "address literal."
392 assert(validate_mailbox("email@[123.123.123.123]"));
394 // Quotes around local part is valid.
395 assert(validate_mailbox("\"email\"@domain.com"));
397 // Digits in address are valid.
398 assert(validate_mailbox("1234567890@domain.com"));
400 // Dash in domain name is valid.
401 assert(validate_mailbox("email@domain-one.com"));
403 // Underscore in the address field is valid.
404 assert(validate_mailbox("_______@domain.com"));
406 assert(validate_mailbox("email@domain.name"));
407 assert(validate_mailbox("email@domain.co.jp"));
409 // Dash in local part is valid.
410 assert(validate_mailbox("firstname-lastname@domain.com"));
412 assert(!validate_mailbox("plainaddress")); // Missing @ sign and domain
413 assert(!validate_mailbox("#@%^%#$@#$@#.com")); // Garbage
414 assert(!validate_mailbox("@domain.com")); // Missing username
416 assert(!validate_mailbox("Joe Smith <email@domain.com>"));
417 assert(validate_address("Joe Smith <email@domain.com>"));
419 assert(!validate_mailbox("email.domain.com")); // Missing @
420 assert(!validate_mailbox("email@domain@domain.com")); // Two @ sign
422 // Leading dot in address is not allowed
423 assert(!validate_mailbox(".email@domain.com"));
425 // Trailing dot in address is not allowed
426 assert(!validate_mailbox("email.@domain.com"));
428 // Multiple dots
429 assert(!validate_mailbox("email..email@domain.com"));
431 // OK! Unicode char as address
432 assert(validate_mailbox("あいうえお@domain.com"));
434 // Comment not allowed in 5321 mailbox.
435 assert(!validate_mailbox("email@domain.com (Joe Smith)"));
437 // Comment fine in 5322 address.
438 assert(validate_address("email@domain.com (Joe Smith)"));
440 // Missing top level domain (.com/.net/.org/etc).
441 assert(validate_mailbox("email@domain"));
443 // Leading dash in front of domain is invalid.
444 assert(!validate_mailbox("email@-domain.com"));
446 // .web is not a valid top level domain, oh yeah? says who?
447 assert(validate_mailbox("email@domain.web"));
449 // Invalid IP address.
450 assert(!validate_mailbox("email@[111.222.333.44444]"));
452 // Invalid IP address, but valid domain name as it turns out.
453 assert(validate_mailbox("email@111.222.333.44444"));
455 // Not a valid domain name.
456 assert(!validate_mailbox("email@domain..com"));
458 // general_address_literal
459 assert(validate_mailbox("email@[x:~Foo_Bar_Baz<\?\?>]"));