5 #include <boost/algorithm/string/classification.hpp>
6 #include <boost/algorithm/string/split.hpp>
8 #include <tao/pegtl.hpp>
9 #include <tao/pegtl/contrib/abnf.hpp>
11 #include <glog/logging.h>
13 #include <boost/algorithm/string/classification.hpp>
14 #include <boost/algorithm/string/split.hpp>
16 using namespace tao::pegtl
;
17 using namespace tao::pegtl::abnf
;
22 // 4. Syntax of UTF-8 Byte Sequences
24 struct UTF8_tail
: range
<'\x80', '\xBF'> {};
26 struct UTF8_1
: range
<0x00, 0x7F> {};
28 struct UTF8_2
: seq
<range
<'\xC2', '\xDF'>, UTF8_tail
> {};
30 struct UTF8_3
: sor
<seq
<one
<'\xE0'>, range
<'\xA0', '\xBF'>, UTF8_tail
>,
31 seq
<range
<'\xE1', '\xEC'>, rep
<2, UTF8_tail
>>,
32 seq
<one
<'\xED'>, range
<'\x80', '\x9F'>, UTF8_tail
>,
33 seq
<range
<'\xEE', '\xEF'>, rep
<2, UTF8_tail
>>> {};
35 struct UTF8_4
: sor
<seq
<one
<'\xF0'>, range
<'\x90', '\xBF'>, rep
<2, UTF8_tail
>>,
36 seq
<range
<'\xF1', '\xF3'>, rep
<3, UTF8_tail
>>,
37 seq
<one
<'\xF4'>, range
<'\x80', '\x8F'>, rep
<2, UTF8_tail
>>> {};
39 struct non_ascii
: sor
<UTF8_2
, UTF8_3
, UTF8_4
> {};
41 } // namespace RFC3629
44 struct VUCHAR
: sor
<VCHAR
, RFC3629::non_ascii
> {};
46 // excluded from atext: "(),.@[]"
47 struct atext
: sor
<ALPHA
, DIGIT
,
58 RFC3629::non_ascii
> {};
63 // <https://tools.ietf.org/html/rfc5321>
66 using colon
= one
<':'>;
68 struct u_let_dig
: sor
<ALPHA
, DIGIT
, RFC3629::non_ascii
> {};
70 struct u_ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, u_let_dig
>, u_let_dig
>> {};
72 struct u_label
: seq
<u_let_dig
, u_ldh_tail
> {};
74 struct let_dig
: sor
<ALPHA
, DIGIT
> {};
76 struct ldh_tail
: star
<sor
<seq
<plus
<one
<'-'>>, let_dig
>, let_dig
>> {};
78 struct ldh_str
: seq
<let_dig
, ldh_tail
> {};
80 struct label
: ldh_str
{};
82 struct sub_domain
: sor
<label
, u_label
> {};
84 struct domain
: list
<sub_domain
, dot
> {};
86 struct dec_octet
: sor
<seq
<string
<'2','5'>, range
<'0','5'>>,
87 seq
<one
<'2'>, range
<'0','4'>, DIGIT
>,
88 seq
<range
<'0', '1'>, rep
<2, DIGIT
>>,
89 rep_min_max
<1, 2, DIGIT
>> {};
91 struct IPv4_address_literal
: seq
<dec_octet
, dot
, dec_octet
, dot
, dec_octet
, dot
, dec_octet
> {};
93 struct h16
: rep_min_max
<1, 4, HEXDIG
> {};
95 struct ls32
: sor
<seq
<h16
, colon
, h16
>, IPv4_address_literal
> {};
97 struct dcolon
: two
<':'> {};
99 struct IPv6address
: sor
<seq
< rep
<6, h16
, colon
>, ls32
>,
100 seq
< dcolon
, rep
<5, h16
, colon
>, ls32
>,
101 seq
<opt
<h16
>, dcolon
, rep
<4, h16
, colon
>, ls32
>,
102 seq
<opt
<h16
, opt
< colon
, h16
>>, dcolon
, rep
<3, h16
, colon
>, ls32
>,
103 seq
<opt
<h16
, rep_opt
<2, colon
, h16
>>, dcolon
, rep
<2, h16
, colon
>, ls32
>,
104 seq
<opt
<h16
, rep_opt
<3, colon
, h16
>>, dcolon
, h16
, colon
, ls32
>,
105 seq
<opt
<h16
, rep_opt
<4, colon
, h16
>>, dcolon
, ls32
>,
106 seq
<opt
<h16
, rep_opt
<5, colon
, h16
>>, dcolon
, h16
>,
107 seq
<opt
<h16
, rep_opt
<6, colon
, h16
>>, dcolon
>> {};
109 struct IPv6_address_literal
: seq
<TAO_PEGTL_ISTRING("IPv6:"), IPv6address
> {};
111 struct dcontent
: ranges
<33, 90, 94, 126> {};
113 struct standardized_tag
: ldh_str
{};
115 struct general_address_literal
: seq
<standardized_tag
, colon
, plus
<dcontent
>> {};
117 // 4.1.3. Address Literals
118 struct address_literal
: seq
<one
<'['>,
119 sor
<IPv4_address_literal
,
120 IPv6_address_literal
,
121 general_address_literal
>,
125 struct qtextSMTP
: sor
<ranges
<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii
> {};
126 struct graphic
: range
<32, 126> {};
127 struct quoted_pairSMTP
: seq
<one
<'\\'>, graphic
> {};
128 struct qcontentSMTP
: sor
<qtextSMTP
, quoted_pairSMTP
> {};
130 struct atom
: plus
<Chars::atext
> {};
131 struct dot_string
: list
<atom
, dot
> {};
132 struct quoted_string
: seq
<one
<'"'>, star
<qcontentSMTP
>, one
<'"'>> {};
133 struct local_part
: sor
<dot_string
, quoted_string
> {};
134 struct non_local_part
: sor
<domain
, address_literal
> {};
135 struct mailbox
: seq
<local_part
, one
<'@'>, non_local_part
> {};
136 struct mailbox_only
: seq
<mailbox
, eof
> {};
// Build a non-owning view over the characters in [in.begin(), in.end()).
// The view refers to the input's storage; nothing is copied.
template <typename Input>
static std::string_view make_view(Input const& in)
{
  auto const first = in.begin();
  auto const count = std::distance(first, in.end());
  return std::string_view(first, count);
}
147 template <typename Rule
>
148 struct action
: nothing
<Rule
> {
152 struct action
<dot_string
> {
153 template <typename Input
>
154 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
156 results
.local_type
= Mailbox::local_types::dot_string
;
161 struct action
<quoted_string
> {
162 template <typename Input
>
163 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
165 results
.local_type
= Mailbox::local_types::quoted_string
;
170 struct action
<domain
> {
171 template <typename Input
>
172 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
174 results
.domain_type
= Mailbox::domain_types::domain
;
179 struct action
<address_literal
> {
180 template <typename Input
>
181 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
183 results
.domain_type
= Mailbox::domain_types::address_literal
;
188 struct action
<local_part
> {
189 template <typename Input
>
190 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
192 results
.local
= make_view(in
);
197 struct action
<non_local_part
> {
198 template <typename Input
>
199 static void apply(Input
const& in
, Mailbox::parse_results
& results
)
201 results
.domain
= make_view(in
);
204 } // namespace RFC5321
206 std::optional
<Mailbox::parse_results
> Mailbox::parse(std::string_view mailbox
)
211 parse_results results
;
212 memory_input
<> mbx_in(mailbox
, "mailbox");
213 if (tao::pegtl::parse
<RFC5321::mailbox_only
, RFC5321::action
>(mbx_in
,
220 Mailbox::Mailbox(std::string_view mailbox
)
222 if (mailbox
.empty()) {
223 LOG(ERROR
) << "empty mailbox string";
224 throw std::invalid_argument("empty mailbox string");
227 parse_results results
;
228 memory_input
<> mbx_in(mailbox
, "mailbox");
229 if (!tao::pegtl::parse
<RFC5321::mailbox_only
, RFC5321::action
>(mbx_in
,
231 LOG(ERROR
) << "invalid mailbox syntax «" << mailbox
<< "»";
232 throw std::invalid_argument("invalid mailbox syntax");
235 // "Impossible" errors; if the parse succeeded, the types must not
237 CHECK(results
.local_type
!= local_types::unknown
);
238 CHECK(results
.domain_type
!= domain_types::unknown
);
240 // RFC-5321 4.5.3.1. Size Limits and Minimums
242 // “To the maximum extent possible, implementation techniques that
243 // impose no limits on the length of these objects should be used.”
245 // In practice, long local-parts are used and work fine. DNS imposes
246 // length limits, so we check those.
248 if (results
.domain
.length() > 255) { // Section 4.5.3.1.2.
249 // Also RFC 2181 section 11. Name syntax
250 LOG(ERROR
) << "domain > 255 octets in «" << mailbox
<< "»";
251 throw std::invalid_argument("mailbox domain too long");
254 std::string dom
{results
.domain
.begin(), results
.domain
.end()};
255 std::vector
<boost::iterator_range
<std::string::iterator
>> labels
;
256 boost::algorithm::split(labels
, dom
, boost::algorithm::is_any_of("."));
258 // Checks for DNS style domains, not address literals.
259 if (results
.domain_type
== domain_types::domain
) {
260 if (labels
.size() < 2) {
261 LOG(ERROR
) << "domain not fully qualified in «" << mailbox
<< "»";
262 throw std::invalid_argument("mailbox domain not fully qualified");
265 if (labels
[labels
.size() - 1].size() < 2) {
266 LOG(ERROR
) << "single octet TLD in «" << mailbox
<< "»";
267 throw std::invalid_argument("mailbox TLD must be two or more octets");
270 for (auto label
: labels
) {
271 if (label
.size() > 63) {
272 LOG(ERROR
) << "label > 63 octets in «" << mailbox
<< "»";
273 throw std::invalid_argument(
274 "mailbox domain label greater than 63 octets");
279 set_local(results
.local
);
280 set_domain(results
.domain
);
283 size_t Mailbox::length(domain_encoding enc
) const
285 if (enc
== domain_encoding::ascii
) {
286 for (auto ch
: local_part_
) {
287 if (!isascii(static_cast<unsigned char>(ch
))) {
288 LOG(WARNING
) << "non ascii chars in local part:" << local_part_
;
289 // throw std::range_error("non ascii chars in local part of mailbox");
294 = (enc
== domain_encoding::utf8
) ? domain().utf8() : domain().ascii();
295 return local_part_
.length() + (d
.length() ? (d
.length() + 1) : 0);
298 std::string
Mailbox::as_string(domain_encoding enc
) const
300 if (enc
== domain_encoding::ascii
) {
301 for (auto ch
: local_part_
) {
302 if (!isascii(static_cast<unsigned char>(ch
))) {
303 LOG(WARNING
) << "non ascii chars in local part:" << local_part_
;
304 // throw std::range_error("non ascii chars in local part of mailbox");
309 s
.reserve(length(enc
));
312 = (enc
== domain_encoding::utf8
) ? domain().utf8() : domain().ascii();