call out non-ASCII
[ghsmtp.git] / Mailbox.cpp
blob7fabe11efdff9f6c98ab819726de2d440cad21e5
1 #include "Mailbox.hpp"
3 #include <string>
5 #include <boost/algorithm/string/classification.hpp>
6 #include <boost/algorithm/string/split.hpp>
8 #include <tao/pegtl.hpp>
9 #include <tao/pegtl/contrib/abnf.hpp>
11 #include <glog/logging.h>
13 #include <boost/algorithm/string/classification.hpp>
14 #include <boost/algorithm/string/split.hpp>
16 using namespace tao::pegtl;
17 using namespace tao::pegtl::abnf;
19 namespace RFC3629 {
20 // clang-format off
22 // 4. Syntax of UTF-8 Byte Sequences
24 struct UTF8_tail : range<'\x80', '\xBF'> {};
26 struct UTF8_1 : range<0x00, 0x7F> {};
28 struct UTF8_2 : seq<range<'\xC2', '\xDF'>, UTF8_tail> {};
30 struct UTF8_3 : sor<seq<one<'\xE0'>, range<'\xA0', '\xBF'>, UTF8_tail>,
31 seq<range<'\xE1', '\xEC'>, rep<2, UTF8_tail>>,
32 seq<one<'\xED'>, range<'\x80', '\x9F'>, UTF8_tail>,
33 seq<range<'\xEE', '\xEF'>, rep<2, UTF8_tail>>> {};
35 struct UTF8_4 : sor<seq<one<'\xF0'>, range<'\x90', '\xBF'>, rep<2, UTF8_tail>>,
36 seq<range<'\xF1', '\xF3'>, rep<3, UTF8_tail>>,
37 seq<one<'\xF4'>, range<'\x80', '\x8F'>, rep<2, UTF8_tail>>> {};
39 struct non_ascii : sor<UTF8_2, UTF8_3, UTF8_4> {};
41 } // namespace RFC3629
43 namespace Chars {
44 struct VUCHAR : sor<VCHAR, RFC3629::non_ascii> {};
46 // excluded from atext: "(),.@[]"
47 struct atext : sor<ALPHA, DIGIT,
48 one<'!', '#',
49 '$', '%',
50 '&', '\'',
51 '*', '+',
52 '-', '/',
53 '=', '?',
54 '^', '_',
55 '`', '{',
56 '|', '}',
57 '~'>,
58 RFC3629::non_ascii> {};
60 } // namespace Chars
62 namespace RFC5321 {
63 // <https://tools.ietf.org/html/rfc5321>
65 using dot = one<'.'>;
66 using colon = one<':'>;
68 struct u_let_dig : sor<ALPHA, DIGIT, RFC3629::non_ascii> {};
70 struct u_ldh_tail : star<sor<seq<plus<one<'-'>>, u_let_dig>, u_let_dig>> {};
72 struct u_label : seq<u_let_dig, u_ldh_tail> {};
74 struct let_dig : sor<ALPHA, DIGIT> {};
76 struct ldh_tail : star<sor<seq<plus<one<'-'>>, let_dig>, let_dig>> {};
78 struct ldh_str : seq<let_dig, ldh_tail> {};
80 struct label : ldh_str {};
82 struct sub_domain : sor<label, u_label> {};
84 struct domain : list<sub_domain, dot> {};
86 struct dec_octet : sor<seq<string<'2','5'>, range<'0','5'>>,
87 seq<one<'2'>, range<'0','4'>, DIGIT>,
88 seq<range<'0', '1'>, rep<2, DIGIT>>,
89 rep_min_max<1, 2, DIGIT>> {};
91 struct IPv4_address_literal : seq<dec_octet, dot, dec_octet, dot, dec_octet, dot, dec_octet> {};
93 struct h16 : rep_min_max<1, 4, HEXDIG> {};
95 struct ls32 : sor<seq<h16, colon, h16>, IPv4_address_literal> {};
97 struct dcolon : two<':'> {};
99 struct IPv6address : sor<seq< rep<6, h16, colon>, ls32>,
100 seq< dcolon, rep<5, h16, colon>, ls32>,
101 seq<opt<h16 >, dcolon, rep<4, h16, colon>, ls32>,
102 seq<opt<h16, opt< colon, h16>>, dcolon, rep<3, h16, colon>, ls32>,
103 seq<opt<h16, rep_opt<2, colon, h16>>, dcolon, rep<2, h16, colon>, ls32>,
104 seq<opt<h16, rep_opt<3, colon, h16>>, dcolon, h16, colon, ls32>,
105 seq<opt<h16, rep_opt<4, colon, h16>>, dcolon, ls32>,
106 seq<opt<h16, rep_opt<5, colon, h16>>, dcolon, h16>,
107 seq<opt<h16, rep_opt<6, colon, h16>>, dcolon >> {};
109 struct IPv6_address_literal : seq<TAO_PEGTL_ISTRING("IPv6:"), IPv6address> {};
111 struct dcontent : ranges<33, 90, 94, 126> {};
113 struct standardized_tag : ldh_str {};
115 struct general_address_literal : seq<standardized_tag, colon, plus<dcontent>> {};
117 // 4.1.3. Address Literals
118 struct address_literal : seq<one<'['>,
119 sor<IPv4_address_literal,
120 IPv6_address_literal,
121 general_address_literal>,
122 one<']'>> {};
125 struct qtextSMTP : sor<ranges<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii> {};
126 struct graphic : range<32, 126> {};
127 struct quoted_pairSMTP : seq<one<'\\'>, graphic> {};
128 struct qcontentSMTP : sor<qtextSMTP, quoted_pairSMTP> {};
130 struct atom : plus<Chars::atext> {};
131 struct dot_string : list<atom, dot> {};
132 struct quoted_string : seq<one<'"'>, star<qcontentSMTP>, one<'"'>> {};
133 struct local_part : sor<dot_string, quoted_string> {};
134 struct non_local_part : sor<domain, address_literal> {};
135 struct mailbox : seq<local_part, one<'@'>, non_local_part> {};
136 struct mailbox_only : seq<mailbox, eof> {};
138 // clang-format on
139 // Actions
141 template <typename Input>
142 static std::string_view make_view(Input const& in)
144 return std::string_view(in.begin(), std::distance(in.begin(), in.end()));
147 template <typename Rule>
148 struct action : nothing<Rule> {
151 template <>
152 struct action<dot_string> {
153 template <typename Input>
154 static void apply(Input const& in, Mailbox::parse_results& results)
156 results.local_type = Mailbox::local_types::dot_string;
160 template <>
161 struct action<quoted_string> {
162 template <typename Input>
163 static void apply(Input const& in, Mailbox::parse_results& results)
165 results.local_type = Mailbox::local_types::quoted_string;
169 template <>
170 struct action<domain> {
171 template <typename Input>
172 static void apply(Input const& in, Mailbox::parse_results& results)
174 results.domain_type = Mailbox::domain_types::domain;
178 template <>
179 struct action<address_literal> {
180 template <typename Input>
181 static void apply(Input const& in, Mailbox::parse_results& results)
183 results.domain_type = Mailbox::domain_types::address_literal;
187 template <>
188 struct action<local_part> {
189 template <typename Input>
190 static void apply(Input const& in, Mailbox::parse_results& results)
192 results.local = make_view(in);
196 template <>
197 struct action<non_local_part> {
198 template <typename Input>
199 static void apply(Input const& in, Mailbox::parse_results& results)
201 results.domain = make_view(in);
204 } // namespace RFC5321
206 std::optional<Mailbox::parse_results> Mailbox::parse(std::string_view mailbox)
208 if (mailbox.empty())
209 return {};
211 parse_results results;
212 memory_input<> mbx_in(mailbox, "mailbox");
213 if (tao::pegtl::parse<RFC5321::mailbox_only, RFC5321::action>(mbx_in,
214 results)) {
215 return results;
217 return {};
220 Mailbox::Mailbox(std::string_view mailbox)
222 if (mailbox.empty()) {
223 LOG(ERROR) << "empty mailbox string";
224 throw std::invalid_argument("empty mailbox string");
227 parse_results results;
228 memory_input<> mbx_in(mailbox, "mailbox");
229 if (!tao::pegtl::parse<RFC5321::mailbox_only, RFC5321::action>(mbx_in,
230 results)) {
231 LOG(ERROR) << "invalid mailbox syntax «" << mailbox << "»";
232 throw std::invalid_argument("invalid mailbox syntax");
235 // "Impossible" errors; if the parse succeeded, the types must not
236 // be unknown.
237 CHECK(results.local_type != local_types::unknown);
238 CHECK(results.domain_type != domain_types::unknown);
240 // RFC-5321 4.5.3.1. Size Limits and Minimums
242 // “To the maximum extent possible, implementation techniques that
243 // impose no limits on the length of these objects should be used.”
245 // In practice, long local-parts are used and work fine. DNS imposes
246 // length limits, so we check those.
248 if (results.domain.length() > 255) { // Section 4.5.3.1.2.
249 // Also RFC 2181 section 11. Name syntax
250 LOG(ERROR) << "domain > 255 octets in «" << mailbox << "»";
251 throw std::invalid_argument("mailbox domain too long");
254 std::string dom{results.domain.begin(), results.domain.end()};
255 std::vector<boost::iterator_range<std::string::iterator>> labels;
256 boost::algorithm::split(labels, dom, boost::algorithm::is_any_of("."));
258 // Checks for DNS style domains, not address literals.
259 if (results.domain_type == domain_types::domain) {
260 if (labels.size() < 2) {
261 LOG(ERROR) << "domain not fully qualified in «" << mailbox << "»";
262 throw std::invalid_argument("mailbox domain not fully qualified");
265 if (labels[labels.size() - 1].size() < 2) {
266 LOG(ERROR) << "single octet TLD in «" << mailbox << "»";
267 throw std::invalid_argument("mailbox TLD must be two or more octets");
270 for (auto label : labels) {
271 if (label.size() > 63) {
272 LOG(ERROR) << "label > 63 octets in «" << mailbox << "»";
273 throw std::invalid_argument(
274 "mailbox domain label greater than 63 octets");
279 set_local(results.local);
280 set_domain(results.domain);
283 size_t Mailbox::length(domain_encoding enc) const
285 if (enc == domain_encoding::ascii) {
286 for (auto ch : local_part_) {
287 if (!isascii(static_cast<unsigned char>(ch))) {
288 LOG(WARNING) << "non ascii chars in local part:" << local_part_;
289 // throw std::range_error("non ascii chars in local part of mailbox");
293 auto const& d
294 = (enc == domain_encoding::utf8) ? domain().utf8() : domain().ascii();
295 return local_part_.length() + (d.length() ? (d.length() + 1) : 0);
298 std::string Mailbox::as_string(domain_encoding enc) const
300 if (enc == domain_encoding::ascii) {
301 for (auto ch : local_part_) {
302 if (!isascii(static_cast<unsigned char>(ch))) {
303 LOG(WARNING) << "non ascii chars in local part:" << local_part_;
304 // throw std::range_error("non ascii chars in local part of mailbox");
308 std::string s;
309 s.reserve(length(enc));
310 s = local_part();
311 auto const& d
312 = (enc == domain_encoding::utf8) ? domain().utf8() : domain().ascii();
313 if (!d.empty()) {
314 s += '@';
315 s += d;
317 return s;