6 #include <glog/logging.h>
8 #include <fmt/format.h>
9 #include <fmt/ostream.h>
14 #include <boost/algorithm/string/join.hpp>
15 #include <boost/algorithm/string/split.hpp>
17 #include <tao/pegtl.hpp>
18 #include <tao/pegtl/contrib/abnf.hpp>
19 // #include <tao/pegtl/contrib/tracer.hpp>
21 using namespace tao::pegtl
;
22 using namespace tao::pegtl::abnf
;
25 class category_impl
: public std::error_category
{
27 category_impl() = default;
28 virtual ~category_impl() {}
29 virtual char const* name() const noexcept
;
30 virtual std::string
message(int ev
) const;
33 char const* category_impl::name() const noexcept
35 static const char name
[] = "uri_error";
39 std::string
category_impl::message(int ev
) const
41 switch (static_cast<error
>(ev
)) {
42 case error::invalid_syntax
:
43 return "unable to parse URI";
45 return "unknown URI error";
48 const std::error_category
& category()
50 static category_impl category
;
54 std::error_code
make_error_code(error e
)
56 return std::error_code(static_cast<int>(e
), category());
59 syntax_error::syntax_error()
60 : std::system_error(make_error_code(error::invalid_syntax
))
// Out-of-line destructor for syntax_error; the body is empty and it is
// declared noexcept.
64 syntax_error::~syntax_error() noexcept
{}
68 namespace uri_internal
{
70 // Rules are from <https://tools.ietf.org/html/rfc3986#appendix-A>
72 // The order of the rules is mostly reversed here, since we need to
73 // define them before use.
75 // UTF-8 is from RFC-3987
// One UTF-8 continuation octet: any single octet in the range 0x80-0xBF.
77 struct UTF8_tail
: range
<'\x80', '\xBF'> {};
// A one-octet (7-bit) character: any single octet in the range 0x00-0x7F.
79 struct UTF8_1
: range
<'\x00', '\x7F'> {};
// A two-octet UTF-8 sequence: a lead octet in 0xC2-0xDF followed by one
// continuation octet.
81 struct UTF8_2
: seq
<range
<'\xC2', '\xDF'>, UTF8_tail
> {};
// A three-octet UTF-8 sequence.  Four alternatives, selected by lead octet:
//   0xE0        then 0xA0-0xBF, then one continuation octet
//   0xE1-0xEC   then two continuation octets
//   0xED        then 0x80-0x9F, then one continuation octet
//   0xEE-0xEF   then two continuation octets
83 struct UTF8_3
: sor
<seq
<one
<'\xE0'>, range
<'\xA0', '\xBF'>, UTF8_tail
>,
84 seq
<range
<'\xE1', '\xEC'>, rep
<2, UTF8_tail
>>,
85 seq
<one
<'\xED'>, range
<'\x80', '\x9F'>, UTF8_tail
>,
86 seq
<range
<'\xEE', '\xEF'>, rep
<2, UTF8_tail
>>> {};
// A four-octet UTF-8 sequence.  Three alternatives, selected by lead octet:
//   0xF0        then 0x90-0xBF, then two continuation octets
//   0xF1-0xF3   then three continuation octets
//   0xF4        then 0x80-0x8F, then two continuation octets
88 struct UTF8_4
: sor
<seq
<one
<'\xF0'>, range
<'\x90', '\xBF'>, rep
<2, UTF8_tail
>>,
89 seq
<range
<'\xF1', '\xF3'>, rep
<3, UTF8_tail
>>,
90 seq
<one
<'\xF4'>, range
<'\x80', '\x8F'>, rep
<2, UTF8_tail
>>> {};
// Any multi-octet UTF-8 sequence: a two-, three- or four-octet form.  The
// one-octet form (UTF8_1) is not one of the alternatives.
92 struct UTF8_non_ascii
: sor
<UTF8_2
, UTF8_3
, UTF8_4
> {};
94 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
95 // / "*" / "+" / "," / ";" / "="
// Matches exactly one of the eleven sub-delimiter characters listed below.
96 struct sub_delims
: one
<'!', '$', '&', '\'', '(', ')',
97 '*', '+', ',', ';', '='> {};
99 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// Matches exactly one of the seven general-delimiter characters listed below.
100 struct gen_delims
: one
<':', '/', '?', '#', '[', ']', '@'> {};
102 // reserved = gen-delims / sub-delims
// Matches one reserved character: a general delimiter, else a sub-delimiter.
103 struct reserved
: sor
<gen_delims
, sub_delims
> {};
105 // Allowing UTF-8 in the unreserved rule isn't strictly RFC-3987 since we
106 // make no attempt to limit the code points to exclude the private use
107 // areas. See <https://tools.ietf.org/html/rfc3987>
109 // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
110 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
// Matches one unreserved character: a letter, a digit, one of "-._~", or --
// beyond the plain RFC 3986 set -- any multi-octet UTF-8 sequence.
111 struct unreserved
: sor
<ALPHA
, DIGIT
, one
<'-', '.', '_', '~'>, UTF8_non_ascii
> {};
113 // pct-encoded = "%" HEXDIG HEXDIG
// A percent-encoded octet: '%' followed by exactly two hexadecimal digits.
114 struct pct_encoded
: seq
<one
<'%'>, HEXDIG
, HEXDIG
> {};
116 // pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
// One path character: unreserved, percent-encoded, a sub-delimiter, ':' or
// '@', tried in that order.
117 struct pchar
: sor
<unreserved
, pct_encoded
, sub_delims
, one
<':', '@'>> {};
119 // fragment = *( pchar / "/" / "?" )
// Fragment component: zero or more of pchar, '/' or '?' (may be empty).
120 struct fragment
: star
<sor
<pchar
, one
<'/', '?'>>> {};
122 // query = *( pchar / "/" / "?" )
// Query component: zero or more of pchar, '/' or '?' (may be empty).  The
// rule body is the same as fragment's; it is a distinct type so it can be
// referenced separately.
123 struct query
: star
<sor
<pchar
, one
<'/', '?'>>> {};
125 // segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
126 // ; non-zero-length segment without any colon ":"
// A non-empty path segment that cannot contain ':' -- one or more of
// unreserved, percent-encoded, sub-delimiter or '@'.
127 struct segment_nz_nc
: plus
<sor
<unreserved
, pct_encoded
, sub_delims
, one
<'@'>>> {};
129 // segment-nz = 1*pchar
// A non-empty path segment: one or more pchar.
130 struct segment_nz
: plus
<pchar
> {};
// A possibly-empty path segment: zero or more pchar.
133 struct segment
: star
<pchar
> {};
135 // Updated by Errata ID: 2033
// The zero-length path.  Derives from PEGTL's `success`, which always
// matches and consumes no input.
137 struct path_empty
: success
{};
139 // path-rootless = segment-nz *( "/" segment )
// A path that starts with a non-empty segment (no leading '/'), followed by
// zero or more "/segment" parts.
140 struct path_rootless
: seq
<segment_nz
, star
<seq
<one
<'/'>, segment
>>> {};
142 // path-noscheme = segment-nz-nc *( "/" segment )
// Like path_rootless, but the first segment is segment_nz_nc and therefore
// cannot contain ':'; followed by zero or more "/segment" parts.
143 struct path_noscheme
: seq
<segment_nz_nc
, star
<seq
<one
<'/'>, segment
>>> {};
145 // path-absolute = "/" [ segment-nz *( "/" segment ) ]
// A path beginning with a single '/', optionally followed by a non-empty
// segment and zero or more "/segment" parts.
146 struct path_absolute
: seq
<one
<'/'>, opt
<seq
<segment_nz
, star
<seq
<one
<'/'>, segment
>>>>> {};
148 // path-abempty = *( "/" segment )
// Zero or more "/segment" parts: either empty or beginning with '/'.
149 struct path_abempty
: star
<seq
<one
<'/'>, segment
>> {};
151 // path = path-abempty ; begins with "/" or is empty
152 // / path-absolute ; begins with "/" but not "//"
153 // / path-noscheme ; begins with a non-colon segment
154 // / path-rootless ; begins with a segment
155 // / path-empty ; zero characters
156 // struct path : sor<path_abempty,
162 /////////////////////////////////////////////////////////////////////////////
164 // The definition of reg-name is where I stray from the (very loose)
165 // grammar of RFC-3986 and apply the stricter rules of RFC-1123 plus
166 // the UTF-8 of RFC-3987.
168 // We allow a very limited set of percent encoded characters in the
169 // reg_name part: just letter, digit, hyphen, and dot. If you want
170 // Unicode in your host part, use UTF-8 or punycode: you can't percent
173 struct pct_let_dig
: seq
<one
<'%'>,
174 sor
<// ALPHA x41 -> x5A
175 seq
<one
<'4'>, range
<'1','9'>>,
176 seq
<one
<'4'>, range
<'A','F'>>,
177 seq
<one
<'4'>, range
<'a','f'>>,
178 seq
<one
<'5'>, range
<'0','9'>>,
179 seq
<one
<'5'>, one
<'A'>>,
180 seq
<one
<'5'>, one
<'a'>>,
182 seq
<one
<'3'>, range
<'0','9'>>
// One "letter or digit" of a host label: an ASCII letter, an ASCII digit, a
// multi-octet UTF-8 sequence, or a percent-encoded letter/digit
// (pct_let_dig).
186 struct u_let_dig
: sor
<ALPHA
, DIGIT
, UTF8_non_ascii
, pct_let_dig
> {};
// A hyphen: either a literal '-' or its percent-encoded form "%2D" (the
// ISTRING macro matches the hex digit case-insensitively).
188 struct dash
: sor
<one
<'-'>, TAOCPP_PEGTL_ISTRING("%2D")> {};
// The remainder of a host label: zero or more items, each being either a run
// of one or more dashes followed by a letter/digit, or a bare letter/digit.
// Every dash run must be followed by a letter/digit, so the tail never ends
// on a dash.
190 struct u_ldh_tail
: star
<sor
<seq
<plus
<dash
>, u_let_dig
>, u_let_dig
>> {};
// One host label: a leading letter/digit followed by u_ldh_tail.
192 struct u_label
: seq
<u_let_dig
, u_ldh_tail
> {};
// A label separator: either a literal '.' or its percent-encoded form "%2E"
// (the ISTRING macro matches the hex digit case-insensitively).
194 struct dot
: sor
<one
<'.'>, TAOCPP_PEGTL_ISTRING("%2E")> {};
196 // An Internet (RFC-1123) style hostname:
// One or more labels separated by `dot`.  PEGTL's list_tail also permits a
// single trailing separator after the last label.
197 struct reg_name
: list_tail
<u_label
, dot
> {};
199 // All that is required for 3986 (as updated by Errata ID: 4942) is the following:
201 // reg-name = *( unreserved / pct-encoded / "-" / "." )
202 //struct reg_name : star<sor<unreserved, pct_encoded, one<'-'>, one<'.'>>> {};
204 /////////////////////////////////////////////////////////////////////////////
206 // dec-octet = DIGIT ; 0-9
207 // / %x31-39 DIGIT ; 10-99
208 // / "1" 2DIGIT ; 100-199
209 // / "2" %x30-34 DIGIT ; 200-249
210 // / "25" %x30-35 ; 250-255
211 struct dec_octet
: sor
<seq
<string
<'2','5'>, range
<'0','5'>>,
212 seq
<one
<'2'>, range
<'0','4'>, DIGIT
>,
213 seq
<one
<'1'>, DIGIT
, DIGIT
>,
214 seq
<range
<'1','9'>, DIGIT
>,
217 // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
// Dotted-quad IPv4 address: four dec_octet values separated by '.'.
218 struct IPv4address
: seq
<dec_octet
, one
<'.'>, dec_octet
, one
<'.'>, dec_octet
, one
<'.'>, dec_octet
> {};
// An IPv4address that must be followed by end of input, i.e. the whole input
// has to be the address.
219 struct IPv4address_eof
: seq
<IPv4address
, eof
> {};
222 // ; 16 bits of address represented in hexadecimal
// One 16-bit group of an IPv6 address: between one and four hex digits.
223 struct h16
: rep_min_max
<1, 4, HEXDIG
> {};
225 // ls32 = ( h16 ":" h16 ) / IPv4address
226 // ; least-significant 32 bits of address
// The last 32 bits of an IPv6 address: either "h16:h16" or an embedded
// IPv4address.  The h16 pair is tried first.
227 struct ls32
: sor
<seq
<h16
, one
<':'>, h16
>, IPv4address
> {};
229 // IPv6address = 6( h16 ":" ) ls32
230 // / "::" 5( h16 ":" ) ls32
231 // / [ h16 ] "::" 4( h16 ":" ) ls32
232 // / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
233 // / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
234 // / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
235 // / [ *4( h16 ":" ) h16 ] "::" ls32
236 // / [ *5( h16 ":" ) h16 ] "::" h16
237 // / [ *6( h16 ":" ) h16 ] "::"
// Ordered choice over nine forms of an IPv6 address.  The first alternative
// is the full form with no "::" (six "h16:" groups plus ls32).  The remaining
// eight each contain "::" (two<':'>), allowing progressively more optional
// h16 groups before it and requiring progressively fewer groups after it,
// down to a bare trailing "::".
239 struct IPv6address
: sor
<seq
< rep
<6, h16
, one
<':'>>, ls32
>,
240 seq
< two
<':'>, rep
<5, h16
, one
<':'>>, ls32
>,
241 seq
<opt
<h16
>, two
<':'>, rep
<4, h16
, one
<':'>>, ls32
>,
242 seq
<opt
<h16
, opt
< one
<':'>, h16
>>, two
<':'>, rep
<3, h16
, one
<':'>>, ls32
>,
243 seq
<opt
<h16
, rep_opt
<2, one
<':'>, h16
>>, two
<':'>, rep
<2, h16
, one
<':'>>, ls32
>,
244 seq
<opt
<h16
, rep_opt
<3, one
<':'>, h16
>>, two
<':'>, h16
, one
<':'>, ls32
>,
245 seq
<opt
<h16
, rep_opt
<4, one
<':'>, h16
>>, two
<':'>, ls32
>,
246 seq
<opt
<h16
, rep_opt
<5, one
<':'>, h16
>>, two
<':'>, h16
>,
247 seq
<opt
<h16
, rep_opt
<6, one
<':'>, h16
>>, two
<':'>>> {};
249 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
// Future IP literal form: a lowercase 'v', one or more hex digits, a '.',
// then one or more of unreserved / sub-delimiter / ':'.
250 struct IPvFuture
: seq
<one
<'v'>, plus
<HEXDIG
>, one
<'.'>, plus
<sor
<unreserved
, sub_delims
, one
<':'>>>> {};
252 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
253 //struct IP_literal : seq<one<'['>, sor<IPv6address, IPvFuture>, one<']'>> {};
255 // RFC 6874 replaced the above rule with:
257 // ZoneID = 1*( unreserved / pct-encoded )
// IPv6 zone identifier: one or more unreserved or percent-encoded characters.
258 struct ZoneID
: plus
<sor
<unreserved
, pct_encoded
>> {};
260 // IPv6addrz = IPv6address "%25" ZoneID
// An IPv6 address followed by '%' and a ZoneID.
// NOTE(review): the delimiter matched here is a bare '%' (one<'%'>), while
// the RFC 6874 ABNF spells it "%25".  "%25zone" is still accepted because
// "25zone" matches ZoneID, but so is "%zone" -- confirm this leniency is
// intended.
261 struct IPv6addrz
: seq
<IPv6address
, one
<'%'>, ZoneID
> {};
263 // IP-literal = "[" ( IPv6address / IPv6addrz / IPvFuture ) "]"
// A bracketed IP literal: '[', then a zoned IPv6 address, a plain IPv6
// address or an IPvFuture, then ']'.  The zoned form is listed first;
// presumably because it begins with IPv6address, so trying the plain form
// first would stop at the '%' and then fail to find ']'.
264 struct IP_literal
: seq
<one
<'['>, sor
<IPv6addrz
, IPv6address
, IPvFuture
>, one
<']'>> {};
// An IP_literal that must be followed by end of input, i.e. the whole input
// has to be the bracketed literal.
266 struct IP_literal_eof
: seq
<IP_literal
, eof
> {};
// Port: zero or more decimal digits, so an empty port matches too.
269 struct port
: star
<DIGIT
> {};
271 // host = IP-literal / IPv4address / reg-name
// Host: tried in order as a bracketed IP literal, a dotted-quad IPv4
// address, then a registered name.
272 struct host
: sor
<IP_literal
, IPv4address
, reg_name
> {};
274 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
// User information: zero or more of unreserved, percent-encoded,
// sub-delimiter or ':' (may be empty; '@' is not part of it).
275 struct userinfo
: star
<sor
<unreserved
, pct_encoded
, sub_delims
, one
<':'>>> {};
277 // Use userinfo_at rule to trigger setting userinfo field only after '@' char is found.
// userinfo immediately followed by '@'.  The rule matches only when the '@'
// is present, and the matched text includes that trailing '@'.
278 struct userinfo_at
: seq
<userinfo
, one
<'@'>> {};
280 // authority = [ userinfo "@" ] host [ ":" port ]
// Authority: an optional "userinfo@", a host, and an optional ":port".
281 struct authority
: seq
<opt
<userinfo_at
>, host
, opt
<seq
<one
<':'>, port
>>> {};
283 // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
// Scheme name: a letter followed by zero or more letters, digits, '+', '-'
// or '.'.
284 struct scheme
: seq
<ALPHA
, star
<sor
<ALPHA
, DIGIT
, one
<'+', '-', '.'>>>> {};
286 // Use scheme_colon rule to trigger setting scheme field only after ':' char is found.
// scheme immediately followed by ':'.  The rule matches only when the ':' is
// present, and the matched text includes that trailing ':'.
287 struct scheme_colon
: seq
<scheme
, one
<':'>> {};
289 // relative-part = "//" authority path-abempty
292 // / path-abempty ; this was added in Errata ID: 5428
294 struct relative_part
: sor
<seq
<two
<'/'>, authority
, path_abempty
>,
300 // relative-ref = relative-part [ "?" query ] [ "#" fragment ]
// A relative reference: relative_part, then an optional "?query", then an
// optional "#fragment".
301 struct relative_ref
: seq
<relative_part
, opt
<seq
<one
<'?'>, query
>>, opt
<seq
<one
<'#'>, fragment
>>> {};
// A relative_ref that must be followed by end of input.
302 struct relative_ref_eof
: seq
<relative_ref
, eof
> {};
304 // hier-part = "//" authority path-abempty
308 struct hier_part
: sor
<seq
<two
<'/'>, authority
, path_abempty
>,
313 // absolute-URI = scheme ":" hier-part [ "?" query ]
// An absolute URI: "scheme:", hier_part, then an optional "?query".  No
// fragment is accepted by this rule.
314 struct absolute_URI
: seq
<scheme_colon
, hier_part
, opt
<seq
<one
<'?'>, query
>>> {};
// An absolute_URI that must be followed by end of input.
315 struct absolute_URI_eof
: seq
<absolute_URI
, eof
> {};
317 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
// A full URI: "scheme:", hier_part, then an optional "?query", then an
// optional "#fragment".
318 struct URI
: seq
<scheme_colon
, hier_part
, opt
<seq
<one
<'?'>, query
>>, opt
<seq
<one
<'#'>, fragment
>>> {};
// A URI that must be followed by end of input.
319 struct URI_eof
: seq
<URI
, eof
> {};
321 // URI-reference = URI / relative-ref
// Either a full URI or, failing that, a relative reference.
322 struct URI_reference
: sor
<URI
, relative_ref
> {};
// A URI_reference that must be followed by end of input.
323 struct URI_reference_eof
: seq
<URI_reference
, eof
> {};
// One leading path segment: an optional '/', followed by every character up
// to (but not including) the next '/' or the end of input.  The match can be
// empty when the input is empty.
325 struct path_segment
: seq
<opt
<one
<'/'>>, seq
<star
<not_at
<one
<'/'>>, not_at
<eof
>, any
>>> {};
329 template <typename Rule
> struct action
: nothing
<Rule
> {
332 template <> struct action
<scheme_colon
> {
333 template <typename Input
>
334 static void apply(Input
const& in
, uri::components
& parts
)
336 auto sc
= in
.string();
337 CHECK((size(sc
) >= 1) && (sc
.back() == ':'));
343 template <> struct action
<authority
> {
344 template <typename Input
>
345 static void apply(Input
const& in
, uri::components
& parts
)
347 parts
.authority
= in
.string();
351 template <> struct action
<path_abempty
> {
352 template <typename Input
>
353 static void apply(Input
const& in
, uri::components
& parts
)
355 parts
.path
= in
.string();
359 template <> struct action
<path_empty
> {
360 template <typename Input
>
361 static void apply(Input
const& in
, uri::components
& parts
)
363 parts
.path
= std::string
{};
367 template <> struct action
<path_absolute
> {
368 template <typename Input
>
369 static void apply(Input
const& in
, uri::components
& parts
)
371 parts
.path
= in
.string();
375 template <> struct action
<path_rootless
> {
376 template <typename Input
>
377 static void apply(Input
const& in
, uri::components
& parts
)
379 parts
.path
= in
.string();
383 template <> struct action
<path_noscheme
> {
384 template <typename Input
>
385 static void apply(Input
const& in
, uri::components
& parts
)
387 parts
.path
= in
.string();
391 template <> struct action
<query
> {
392 template <typename Input
>
393 static void apply(Input
const& in
, uri::components
& parts
)
395 parts
.query
= in
.string();
399 template <> struct action
<fragment
> {
400 template <typename Input
>
401 static void apply(Input
const& in
, uri::components
& parts
)
403 parts
.fragment
= in
.string();
407 // The _at rule gives us userinfo + '@', so remove the at.
409 template <> struct action
<userinfo_at
> {
410 template <typename Input
>
411 static void apply(Input
const& in
, uri::components
& parts
)
413 auto ui
= in
.string();
414 CHECK((size(ui
) >= 1) && (ui
.back() == '@'));
420 template <> struct action
<host
> {
421 template <typename Input
>
422 static void apply(Input
const& in
, uri::components
& parts
)
424 parts
.host
= in
.string();
428 template <> struct action
<port
> {
429 template <typename Input
>
430 static void apply(Input
const& in
, uri::components
& parts
)
432 parts
.port
= in
.string();
436 template <> struct action
<path_segment
> {
437 template <typename Input
>
438 static void apply(Input
const& in
, std::string
& path_seg
)
440 path_seg
= in
.string();
443 } // namespace uri_internal
446 DLL_PUBLIC
bool parse_generic(std::string_view uri
, components
& parts
)
448 auto in
{memory_input
<>{uri
.data(), uri
.size(), "uri"}};
449 if (tao::pegtl::parse
<uri_internal::URI_eof
, uri_internal::action
>(in
,
456 DLL_PUBLIC
bool parse_relative_ref(std::string_view uri
, components
& parts
)
458 auto in
{memory_input
<>{uri
.data(), uri
.size(), "uri"}};
459 if (tao::pegtl::parse
<uri_internal::relative_ref_eof
, uri_internal::action
>(
466 DLL_PUBLIC
bool parse_reference(std::string_view uri
, components
& parts
)
468 auto in
{memory_input
<>{uri
.data(), uri
.size(), "uri"}};
469 if (tao::pegtl::parse
<uri_internal::URI_reference_eof
, uri_internal::action
>(
476 DLL_PUBLIC
bool parse_absolute(std::string_view uri
, components
& parts
)
478 auto in
{memory_input
<>{uri
.data(), uri
.size(), "uri"}};
479 if (tao::pegtl::parse
<uri_internal::absolute_URI_eof
, uri_internal::action
>(
// Renders a uri as a string by forwarding uri_in.parts() to the
// to_string overload that takes components.
486 std::string
to_string(uri
const& uri_in
) { return to_string(uri_in
.parts()); }
488 std::string
to_string(components
const& uri
)
490 std::ostringstream os
;
498 bool constexpr isunreserved(unsigned char in
)
501 case '0': case '1': case '2': case '3': case '4':
502 case '5': case '6': case '7': case '8': case '9':
503 case 'a': case 'b': case 'c': case 'd': case 'e':
504 case 'f': case 'g': case 'h': case 'i': case 'j':
505 case 'k': case 'l': case 'm': case 'n': case 'o':
506 case 'p': case 'q': case 'r': case 's': case 't':
507 case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
508 case 'A': case 'B': case 'C': case 'D': case 'E':
509 case 'F': case 'G': case 'H': case 'I': case 'J':
510 case 'K': case 'L': case 'M': case 'N': case 'O':
511 case 'P': case 'Q': case 'R': case 'S': case 'T':
512 case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
513 case '-': case '.': case '_': case '~':
521 bool constexpr ishexdigit(unsigned char in
)
524 case '0': case '1': case '2': case '3': case '4':
525 case '5': case '6': case '7': case '8': case '9':
526 case 'a': case 'b': case 'c': case 'd': case 'e':
528 case 'A': case 'B': case 'C': case 'D': case 'E':
537 unsigned char constexpr hexdigit2bin(unsigned char in
)
540 case '0': case '1': case '2': case '3': case '4':
541 case '5': case '6': case '7': case '8': case '9':
543 case 'a': case 'b': case 'c': case 'd': case 'e':
545 return 10 + (in
- 'a');
546 case 'A': case 'B': case 'C': case 'D': case 'E':
550 return 10 + (in
- 'A');
554 std::string
normalize_pct_encoded(std::string_view string
)
556 fmt::memory_buffer out
;
558 for (auto s
= begin(string
); s
< end(string
); ++s
) {
561 if ((s
+ 3 <= end(string
)) && ishexdigit(s
[1]) && ishexdigit(s
[2])) {
562 auto pct_ch
= 0x10 * hexdigit2bin(s
[1]) + hexdigit2bin(s
[2]);
563 if (isunreserved(pct_ch
)) {
564 fmt::format_to(out
, "{}", char(pct_ch
));
567 fmt::format_to(out
, "%{:02X}", pct_ch
);
573 fmt::format_to(out
, "{}", ch
);
576 return fmt::to_string(out
);
579 bool starts_with(std::string_view str
, std::string_view prefix
)
581 if (str
.size() >= prefix
.size())
582 return str
.compare(0, prefix
.size(), prefix
) == 0;
586 bool ends_with(std::string_view str
, std::string_view suffix
)
588 if (str
.size() >= suffix
.size())
589 return str
.compare(str
.length() - suffix
.length(), suffix
.length(), suffix
)
594 std::string
all_but_the_last(std::string_view path
)
597 // excluding any characters after the right-most "/" in the base URI
598 // path, or excluding the entire base URI path if it does not contain
599 // any "/" characters).
601 auto x
= path
.rfind('/');
602 if (x
== std::string_view::npos
)
603 return std::string
{};
604 return std::string(path
.data(), x
+ 1);
607 // <https://tools.ietf.org/html/rfc3986#section-5.2.3>
609 // 5.2.3. Merge Paths
611 std::string
merge(components
const& base_parts
, components
const& ref_parts
)
614 // Updated by Errata ID: 4789
616 // o If the base URI has a defined authority component and an empty
617 // path, or if the base URI's path is ending with "/..", then return
618 // a string consisting of base's path concatenated with "/" and then
619 // concatenated with the reference's path; otherwise,
621 if ((base_parts
.authority
&& base_parts
.path
->empty())
622 || ends_with(*base_parts
.path
, "/..")) {
623 return "/" + *ref_parts
.path
;
626 // o return a string consisting of the reference's path component
627 // appended to all but the last segment of the base URI's path…
629 return all_but_the_last(*base_parts
.path
) + *ref_parts
.path
;
632 // <https://tools.ietf.org/html/rfc3986#section-5.2.4>
634 // 5.2.4. Remove Dot Segments
636 std::string
remove_dot_segments(std::string input
)
639 output
.reserve(input
.length());
641 while (!input
.empty()) {
643 if (starts_with(input
, "../")) {
647 if (starts_with(input
, "./")) {
653 if (starts_with(input
, "/./")) {
655 input
.insert(0, "/");
660 input
.insert(0, "/");
665 if (starts_with(input
, "/../")) {
667 input
.insert(0, "/");
668 // remove last segment from output
669 auto last
= output
.rfind("/");
670 if (last
!= std::string::npos
) {
671 output
.erase(output
.begin() + last
, output
.end());
675 if (input
== "/..") {
677 input
.insert(0, "/");
678 // remove last segment from output
679 auto last
= output
.rfind("/");
680 if (last
!= std::string::npos
) {
681 output
.erase(output
.begin() + last
, output
.end());
696 auto in
{memory_input
<>{input
.data(), input
.size(), "path-segment"}};
698 std::string path_seg
;
699 if (tao::pegtl::parse
<uri_internal::path_segment
, uri_internal::action
>(
702 input
.erase(0, path_seg
.length());
705 LOG(FATAL
) << "no match, we'll be looping forever";
// Size limit of 255 used by nfkc(): inputs longer than this are rejected as
// "hostname too long", and the normalization output buffer has this size.
712 size_t constexpr max_length
= 255;
714 std::string_view
remove_trailing_dot(std::string_view a
)
716 if (a
.length() && ('.' == a
.back())) {
722 // Normalization Form KC (NFKC) Compatibility Decomposition, followed
723 // by Canonical Composition, see <http://unicode.org/reports/tr15/>
725 std::string
nfkc(std::string_view str
)
727 size_t length
= max_length
;
728 char bfr
[max_length
];
729 if (str
.length() > max_length
) {
730 throw std::runtime_error("hostname too long");
732 auto udata
= reinterpret_cast<uint8_t const*>(str
.data());
733 auto ubfr
= reinterpret_cast<uint8_t*>(bfr
);
734 if (u8_normalize(UNINORM_NFKC
, udata
, str
.size(), ubfr
, &length
) == nullptr) {
735 throw std::runtime_error("u8_normalize failure");
737 return std::string
{bfr
, length
};
740 bool is_IPv4address(std::string_view x
)
742 auto in
{memory_input
<>{x
.data(), x
.size(), "maybe-IPv4address"}};
743 if (tao::pegtl::parse
<uri_internal::IPv4address_eof
, uri_internal::action
>(
750 bool is_IP_literal(std::string_view x
)
752 auto in
{memory_input
<>{x
.data(), x
.size(), "maybe-IP_literal"}};
753 if (tao::pegtl::parse
<uri_internal::IP_literal_eof
, uri_internal::action
>(
760 std::string
normalize_host(std::string_view host
)
762 host
= remove_trailing_dot(host
);
764 auto norm_host
= normalize_pct_encoded(host
);
766 norm_host
= nfkc(norm_host
);
769 auto code
= idn2_to_ascii_8z(norm_host
.data(), &ptr
, IDN2_TRANSITIONAL
);
770 if (code
!= IDN2_OK
) {
771 throw std::runtime_error(idn2_strerror(code
));
776 // At this point, we have a (normalized) ascii norm_host. Continue
777 // on to get the UTF-8 version.
779 //#ifdef PREFER_UNICODE_HOSTNAME
781 code
= idn2_to_unicode_8z8z(norm_host
.c_str(), &ptr
, IDN2_TRANSITIONAL
);
782 if (code
!= IDN2_OK
) {
783 throw std::runtime_error(idn2_strerror(code
));
793 DLL_PUBLIC
std::string
normalize(components uri
)
795 // Normalize the scheme.
797 std::transform(begin(*uri
.scheme
), end(*uri
.scheme
), begin(*uri
.scheme
),
798 [](unsigned char c
) { return std::tolower(c
); });
801 // Normalize the host name.
803 if (!(is_IPv4address(*uri
.host
) || is_IP_literal(*uri
.host
))) {
804 uri
.host
= normalize_host(*uri
.host
);
808 // we'll want to remove default port numbers
810 // Rebuild authority from user@host:port triple.
811 std::stringstream authstream
;
813 authstream
<< *uri
.userinfo
<< '@';
816 authstream
<< *uri
.host
;
819 authstream
<< ':' << *uri
.port
;
821 if (uri
.userinfo
|| uri
.host
|| uri
.port
) {
822 uri
.authority
= authstream
.str();
825 // Normalize the path.
827 uri
.path
= remove_dot_segments(normalize_pct_encoded(*uri
.path
));
830 return to_string(uri
);
833 DLL_PUBLIC uri
resolve_ref(absolute
const& base
, reference
const& ref
)
835 // 5.2. Relative Resolution
841 components
const& base_parts
= base
.parts();
842 components
const& ref_parts
= ref
.parts();
844 components target_parts
;
846 // if defined(R.scheme) then
848 if (ref_parts
.scheme
) {
850 // T.scheme = R.scheme;
851 target_parts
.scheme
= *ref_parts
.scheme
;
853 // T.authority = R.authority;
854 if (ref_parts
.authority
) {
855 target_parts
.authority
= *ref_parts
.authority
;
858 if (ref_parts
.path
) {
859 target_parts
.path
= remove_dot_segments(*ref_parts
.path
);
862 if (ref_parts
.query
) {
863 target_parts
.query
= *ref_parts
.query
;
867 if (ref_parts
.authority
) {
868 target_parts
.authority
= *ref_parts
.authority
;
869 if (ref_parts
.path
) {
870 target_parts
.path
= remove_dot_segments(*ref_parts
.path
);
872 target_parts
.query
= ref_parts
.query
;
876 if (ref_parts
.path
== "") {
877 target_parts
.path
= base_parts
.path
;
878 if (ref_parts
.query
) {
879 target_parts
.query
= ref_parts
.query
;
882 target_parts
.query
= base_parts
.query
;
886 if (starts_with(*ref_parts
.path
, "/")) {
887 if (ref_parts
.path
) {
888 target_parts
.path
= remove_dot_segments(*ref_parts
.path
);
892 // T.path = merge(Base.path, R.path);
893 // T.path = remove_dot_segments(T.path);
894 target_parts
.path
= remove_dot_segments(merge(base_parts
, ref_parts
));
897 // T.query = R.query;
898 target_parts
.query
= ref_parts
.query
;
901 // T.authority = Base.authority;
902 target_parts
.authority
= base_parts
.authority
;
905 // T.scheme = Base.scheme;
906 target_parts
.scheme
= base_parts
.scheme
;
909 // T.fragment = R.fragment;
910 if (ref_parts
.fragment
) {
911 target_parts
.fragment
= *ref_parts
.fragment
;
914 return generic(target_parts
);
919 // <https://tools.ietf.org/html/rfc3986#section-5.3>
921 // 5.3. Component Recomposition
// Streams the components of a URI to `os` (component recomposition, RFC 3986
// section 5.3): the scheme followed by ':', then the authority, then the
// query prefixed with '?' and the fragment prefixed with '#'.
//
// When any of userinfo, host or port is set, the authority is written from
// those individual parts ("userinfo@" ... ":port"); otherwise the pre-built
// authority string is written after "//".
923 DLL_PUBLIC
std::ostream
& operator<<(std::ostream
& os
,
924 uri::components
const& uri
)
927 os
<< *uri
.scheme
<< ':';
930 // The individual parts take precedence over the single authority.
932 if (uri
.userinfo
|| uri
.host
|| uri
.port
) {
936 os
<< *uri
.userinfo
<< '@';
938 // Host is never undefined, but perhaps zero length.
// The ':' separator after the host introduces the port.  This previously
// streamed *uri.userinfo, which printed the wrong component and
// dereferenced userinfo even when only the port was set.
943 os
<< ':' << *uri
.port
;
945 else if (uri
.authority
) {
946 os
<< "//" << *uri
.authority
;
954 os
<< '?' << *uri
.query
;
958 os
<< '#' << *uri
.fragment
;
964 DLL_PUBLIC
std::ostream
& operator<<(std::ostream
& os
, uri::uri
const& uri_in
)
966 return os
<< uri_in
.parts();