1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
34 #include "rtl/textenc.h"
35 #include "rtl/ustring.h"
36 #include "sal/types.h"
38 #if defined __cplusplus
40 #endif /* __cplusplus */
42 /** Various predefined URI 'char classes.'
45 A 'char class' defines which (ASCII) characters can be written 'as they
46 are' in a part of a Uri, and which characters have to be written using
47 escape sequences ('%' followed by two hex digits). Characters outside
48 the ASCII range are always written using escape sequences.
51 If there are other frequently used char classes, they can be added to
52 this enumeration; the function rtl_getUriCharClass() has to be adapted accordingly.
57 /** The empty char class.
60 All characters are written using escape sequences.
64 /** The RFC 2732 <uric> char class.
67 The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and letters.
72 /** The RFC 2396 <uric_no_slash> char class.
75 The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
77 rtl_UriCharClassUricNoSlash
,
79 /** The RFC 2396 <rel_segment> char class.
82 The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
84 rtl_UriCharClassRelSegment
,
86 /** The RFC 2396 <reg_name> char class.
89 The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
91 rtl_UriCharClassRegName
,
93 /** The RFC 2396 <userinfo> char class.
96 The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
98 rtl_UriCharClassUserinfo
,
100 /** The RFC 2396 <pchar> char class.
103 The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
105 rtl_UriCharClassPchar
,
107 /** The char class for the values of uno URL parameters.
110 The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
112 rtl_UriCharClassUnoParamValue
,
114 rtl_UriCharClass_FORCE_EQUAL_SIZE
= SAL_MAX_ENUM
118 /** The mechanism describing how escape sequences in the input of
119 rtl_uriEncode() are handled.
123 /** The special meaning of '%' is ignored (i.e., there are by definition
124 no escape sequences in the input).
127 This mechanism is useful to encode user input as part of a URI (e.g.,
128 the user-supplied password in an ftp URL---'%20abcde' is a valid
129 password, so do not assume that the '%20' is an escaped space).
131 rtl_UriEncodeIgnoreEscapes
,
133 /** All escape sequences ('%' followed by two hex digits) are kept intact,
134 even if they represent characters that need not be escaped or if they
135 do not even map to characters in the given charset.
138 This mechanism is useful when passing on complete URIs more or less
139 unmodified (e.g., within an HTTP proxy): missing escape sequences are
140 added, but existing escape sequences are not touched (except that any
141 lower case hex digits are replaced by upper case hex digits).
143 rtl_UriEncodeKeepEscapes
,
145 /** All escape sequences ('%' followed by two hex digits) are resolved in
146 a first step; only those that represent characters that need to be
147 escaped are kept intact.
150 This mechanism is useful to properly encode complete URIs entered by
151 the user: the URI is brought into a 'canonic form,' but care is taken
152 not to damage (valid) escape sequences the (careful) user already entered.
155 rtl_UriEncodeCheckEscapes
,
157 /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
158 unmappable characters.
164 /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
165 unmappable characters.
169 rtl_UriEncodeStrictKeepEscapes
,
171 rtl_UriEncode_FORCE_EQUAL_SIZE
= SAL_MAX_ENUM
173 rtl_UriEncodeMechanism
;
175 /** The mechanism describing how rtl_uriDecode() translates (part of) a URI
176 into a Unicode string.
180 /** The text is returned completely unmodified.
184 /** The text is returned in the form of an IURI (cf.
185 draft-masinter-url-i18n-05.txt).
188 All escape sequences representing ASCII characters (%00--%7F) are
189 kept, all other escape sequences are interpreted as UTF-8 characters
190 and translated to Unicode, if possible.
194 /** The text is decoded.
197 All escape sequences representing characters from the given charset
198 are decoded and translated to Unicode, if possible.
200 rtl_UriDecodeWithCharset
,
202 /** Like rtl_UriDecodeWithCharset, but indicating failure when converting
203 unmappable characters.
209 rtl_UriDecode_FORCE_EQUAL_SIZE
= SAL_MAX_ENUM
211 rtl_UriDecodeMechanism
;
213 /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
216 The function rtl_uriEncode() expects an array of 128 booleans, and this
217 function maps rtl_UriCharClass enumeration members to such arrays.
220 Any valid member of rtl_UriCharClass.
223 An array of 128 booleans, to be used in calls to rtl_uriEncode().
/* Parameter summary for rtl_getUriCharClass():
     eCharClass  any valid member of rtl_UriCharClass.
   Returns an array of 128 booleans, to be used in calls to
   rtl_uriEncode(). */
225 sal_Bool
const * SAL_CALL
rtl_getUriCharClass(rtl_UriCharClass eCharClass
)
226 SAL_THROW_EXTERN_C();
228 /** Encode a text as (part of) a URI.
231 Any Unicode string. Must not be null.
234 A char class, represented as an array of 128 booleans (true means keep the
235 corresponding ASCII character unencoded, false means encode it). Must not
236 be null, and the boolean corresponding to the percent sign (0x25) must be
237 false. (See rtl_getUriCharClass() for a function mapping from
238 rtl_UriCharClass to such arrays.)
241 The mechanism describing how escape sequences in the input text are handled.
245 When Unicode characters from the input text have to be written using
246 escape sequences (because they are either outside the ASCII range or do
247 not belong to the given char class), they are first translated into this
248 charset before being encoded using escape sequences.
250 Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
251 sequences already present in the input text are interpreted as characters from this charset.
255 Returns an encoded representation of the input text. Must itself not be
256 null, and must point to either null or a valid string.
258 If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be
259 converted to eCharset because it contains unmappable characters (which
260 implies that pText is not empty), then an empty string is returned.
/* Parameter summary for rtl_uriEncode():
     pText       any Unicode string; must not be null.
     pCharClass  array of 128 booleans (true = keep the corresponding ASCII
                 character unencoded, false = encode it); must not be null,
                 and the entry for the percent sign (0x25) must be false.
                 rtl_getUriCharClass() maps rtl_UriCharClass members to
                 such arrays.
     eMechanism  how escape sequences in the input text are treated.
     eCharset    charset into which characters are translated before they
                 are written as escape sequences.
     pResult     receives the encoded text; must itself not be null, and
                 must point to either null or a valid string.
   The function returns no value.  If eMechanism is rtl_UriEncodeStrict and
   pText contains characters that cannot be mapped to eCharset, an empty
   string is returned as the result. */
262 void SAL_CALL
rtl_uriEncode(rtl_uString
* pText
,
263 sal_Bool
const * pCharClass
,
264 rtl_UriEncodeMechanism eMechanism
,
265 rtl_TextEncoding eCharset
,
266 rtl_uString
** pResult
)
267 SAL_THROW_EXTERN_C();
269 /** Decode (a part of) a URI.
272 Any Unicode string. Must not be null. (If the input is indeed part of a
273 valid URI, this string will only contain a subset of the ASCII characters,
274 but this function also handles other Unicode characters properly.)
277 The mechanism describing how the input text is translated into a Unicode string.
281 When the decode mechanism is rtl_UriDecodeWithCharset, all escape
282 sequences in the input text are interpreted as characters from this
283 charset. Those characters are translated to Unicode characters in the
284 resulting output, if possible.
286 When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
287 this parameter is ignored (and is best specified as
288 RTL_TEXTENCODING_UTF8).
291 Returns a decoded representation of the input text. Must itself not be
292 null, and must point to either null or a valid string.
294 If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
295 converted to eCharset because it contains (encodings of) unmappable
296 characters (which implies that pText is not empty), then an empty string is returned.
/* Parameter summary for rtl_uriDecode():
     pText       any Unicode string; must not be null.
     eMechanism  how the input text is translated into a Unicode string.
     eCharset    charset used to interpret escape sequences when eMechanism
                 is rtl_UriDecodeWithCharset; ignored for rtl_UriDecodeNone
                 and rtl_UriDecodeToIuri (RTL_TEXTENCODING_UTF8 is the
                 suggested value in that case).
     pResult     receives the decoded text; must itself not be null, and
                 must point to either null or a valid string.
   The function returns no value.  If eMechanism is rtl_UriDecodeStrict and
   pText contains (encodings of) characters that cannot be mapped to
   eCharset, the result is an empty string. */
299 void SAL_CALL
rtl_uriDecode(rtl_uString
* pText
,
300 rtl_UriDecodeMechanism eMechanism
,
301 rtl_TextEncoding eCharset
,
302 rtl_uString
** pResult
)
303 SAL_THROW_EXTERN_C();
305 /** Convert a relative URI reference into an absolute one.
307 A URI reference is a URI plus an optional <"#" fragment> part.
309 This function uses the algorithm described in RFC 2396, section 5.2, with
310 the following clarifications: (1) Backwards-compatible relative URIs
311 starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
312 supported. (2) Segments "." and ".." within the path of the base URI are
313 not considered special; RFC 2396 seems a bit unclear about that point.
314 (3) Erroneous excess segments ".." within the path of the relative URI (if
315 it is indeed relative) are left intact, as the examples in RFC 2396,
316 section C.2, suggest. (4) If the relative URI is a reference to the
317 "current document," the "current document" is taken to be the base URI.
319 This function signals exceptions by returning false and letting pException
320 point to a message explaining the exception.
323 An absolute, hierarchical URI reference that serves as the base URI. If it
324 has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
325 if it either is not an absolute URI (i.e., does not begin with a
326 <scheme ":"> part) or has a path that is non-empty but does not start
327 with "/", an exception will be signaled.
330 A URI reference that may be either absolute or relative. If it is
331 absolute, it will be returned unmodified (and it need not be hierarchical).
335 Returns an absolute URI reference. Must itself not be null, and must point
336 to either null or a valid string. If an exception is signalled, it is left unchanged.
340 Returns an explanatory message in case an exception is signalled. Must
341 itself not be null, and must point to either null or a valid string. If no
342 exception is signalled, it is left unchanged.
345 True if no exception is signalled, otherwise false.
/* Parameter summary for rtl_uriConvertRelToAbs():
     pBaseUriRef  absolute, hierarchical URI reference that serves as the
                  base URI; only inspected when pRelUriRef is not already
                  an absolute URI.
     pRelUriRef   URI reference, either absolute or relative; an absolute
                  one is returned unmodified.
     pResult      receives the absolute URI reference; must itself not be
                  null, and must point to either null or a valid string.
     pException   receives an explanatory message when an exception is
                  signalled, and is left unchanged otherwise; must itself
                  not be null, and must point to either null or a valid
                  string.
   Returns true if no exception is signalled, otherwise false. */
347 sal_Bool SAL_CALL
rtl_uriConvertRelToAbs(rtl_uString
* pBaseUriRef
,
348 rtl_uString
* pRelUriRef
,
349 rtl_uString
** pResult
,
350 rtl_uString
** pException
)
351 SAL_THROW_EXTERN_C();
353 #if defined __cplusplus
355 #endif /* __cplusplus */
357 #endif /* _RTL_URI_H_ */