1 /* strfuncs.cc: string functions
3 This file is part of Cygwin.
5 This software is a copyrighted work licensed under the terms of the
6 Cygwin license. Please consult the file "CYGWIN_LICENSE" for
11 #include <sys/param.h>
20 /* Transform characters invalid for Windows filenames to the Unicode private
21 use area in the U+f0XX range. The affected characters are all control
22 chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash
23 is affected as well, but we can't transform it as long as we accept Win32
25 static const WCHAR tfx_chars
[] = {
26 0xf000 | 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3,
27 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7,
28 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11,
29 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15,
30 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19,
31 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23,
32 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27,
33 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31,
34 ' ', '!', 0xf000 | '"', '#',
36 '(', ')', 0xf000 | '*', '+',
40 '8', '9', 0xf000 | ':', ';',
41 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?',
57 0xf000 | '|', '}', '~', 127
60 /* This is the table for the reverse functionality in sys_wcstombs.
61 It differs deliberately in two code places (space and dot) to allow
62 converting back space and dot on filesystems only supporting DOS
64 static const WCHAR tfx_rev_chars
[] = {
65 0xf000 | 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3,
66 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7,
67 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11,
68 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15,
69 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19,
70 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23,
71 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27,
72 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31,
73 0xf000 | ' ', '!', 0xf000 | '"', '#',
75 '(', ')', 0xf000 | '*', '+',
76 ',', '-', 0xf000 | '.', '\\',
79 '8', '9', 0xf000 | ':', ';',
80 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?',
96 0xf000 | '|', '}', '~', 127
100 transform_chars (PWCHAR path
, PWCHAR path_end
)
102 for (; path
<= path_end
; ++path
)
104 *path
= tfx_chars
[*path
];
108 transform_chars_af_unix (PWCHAR out
, const char *path
, __socklen_t len
)
110 len
-= sizeof (__sa_family_t
);
111 for (const unsigned char *p
= (const unsigned char *) path
; len
-- > 0; ++p
)
112 *out
++ = (*p
<= 0x7f) ? tfx_chars
[*p
] : *p
;
/* Convert a wint_t string to a wchar_t string.

   Conversion stops at the first zero element of SRC or after LEN source
   characters, whichever comes first.  Source values above 0xffff are
   written as a surrogate pair (high surrogate first), all other values are
   copied unchanged.  DEST is always terminated with a wchar_t NUL.

   Make sure DEST has room for at least twice as many characters as are
   converted, to account for surrogate pairs, plus the wchar_t NUL. */
void
wcintowcs (wchar_t *dest, wint_t *src, size_t len)
{
  while (*src && len-- > 0)
    {
      if (*src > 0xffff)
	{
	  /* Value does not fit into 16 bits: split the 20 bit offset from
	     0x10000 into a high (0xd800..) and a low (0xdc00..) half. */
	  *dest++ = ((*src - 0x10000) >> 10) + 0xd800;
	  *dest++ = ((*src++ - 0x10000) & 0x3ff) + 0xdc00;
	}
      else
	*dest++ = *src++;
    }
  *dest = L'\0';
}
133 /* replacement function for wcrtomb, converting a UTF-32 char to a
136 c32rtomb (char *s
, char32_t wc
, mbstate_t *ps
)
140 _REENT_CHECK_MISC(_REENT
);
141 ps
= &(_REENT_C32RTOMB_STATE(_REENT
));
144 /* If s is NULL, behave as if s pointed to an internal buffer and wc
145 was a null wide character (L'\0'). wcrtomb will do that for us. */
146 if (wc
<= 0xffff || !s
)
147 return wcrtomb (s
, (wchar_t) wc
, ps
);
150 const wchar_t *wcp
= wc_arr
;
153 wc_arr
[0] = (wc
>> 10) + 0xd800;
154 wc_arr
[1] = (wc
& 0x3ff) + 0xdc00;
155 return wcsnrtombs (s
, &wcp
, 2, SIZE_MAX
, ps
);
159 c16rtomb (char *s
, char16_t wc
, mbstate_t *ps
)
163 _REENT_CHECK_MISC(_REENT
);
164 ps
= &(_REENT_C16RTOMB_STATE(_REENT
));
167 return wcrtomb (s
, (wchar_t) wc
, ps
);
171 c8rtomb (char *s
, char8_t c8
, mbstate_t *ps
)
173 struct _reent
*reent
= _REENT
;
178 _REENT_CHECK_MISC(reent
);
179 ps
= &(_REENT_C8RTOMB_STATE(reent
));
187 if ((ps
->__count
& 0xff00) != 0xc800)
191 case 0 ... 0x7f: /* single octet */
195 case 0xc2 ... 0xf4: /* valid lead byte */
196 ps
->__count
= 0xc801;
197 ps
->__value
.__wchb
[0] = c8
;
205 /* We already collected something... */
206 int idx
= ps
->__count
& 0x3;
207 char8_t
&c1
= ps
->__value
.__wchb
[0];
208 char8_t
&c2
= ps
->__value
.__wchb
[1];
209 char8_t
&c3
= ps
->__value
.__wchb
[2];
214 /* Annoyingly complex check for validity for 2nd octet. */
215 if (c8
<= 0x7f || c8
>= 0xc0)
217 if (c1
== 0xe0 && c8
<= 0x9f)
219 if (c1
== 0xed && c8
>= 0xa0)
221 if (c1
== 0xf0 && c8
<= 0x8f)
223 if (c1
== 0xf4 && c8
>= 0x90)
227 ps
->__count
= 0xc802;
231 wc
= ((c1
& 0x1f) << 6)
235 if (c8
<= 0x7f || c8
>= 0xc0)
239 ps
->__count
= 0xc803;
243 wc
= ((c1
& 0x0f) << 12)
248 if (c8
<= 0x7f || c8
>= 0xc0)
250 wc
= ((c1
& 0x07) << 18)
251 | ((c2
& 0x3f) << 12)
255 default: /* Shouldn't happen */
260 return c32rtomb (s
, wc
, ps
);
263 _REENT_ERRNO(reent
) = EILSEQ
;
268 mbrtoc32 (char32_t
*pwc
, const char *s
, size_t n
, mbstate_t *ps
)
275 _REENT_CHECK_MISC(_REENT
);
276 ps
= &(_REENT_MBRTOC32_STATE(_REENT
));
279 len
= mbrtowc (&w1
, s
, n
, ps
);
280 if (len
== (size_t) -1 || len
== (size_t) -2)
284 /* Convert surrogate pair to wint_t value */
285 if (len
> 0 && w1
>= 0xd800 && w1
<= 0xdbff)
289 len2
= mbrtowc (&w2
, s
, n
, ps
);
290 if (len2
> 0 && w2
>= 0xdc00 && w2
<= 0xdfff)
294 *pwc
= (((w1
& 0x3ff) << 10) | (w2
& 0x3ff)) + 0x10000;
305 /* Like mbrtowc, but we already defined how to return a surrogate, and
306 the definition of mbrtoc16 differs from that.
307 Return the high surrogate with a return value representing the length
308 of the entire multibyte sequence, and in the next call return the low
309 surrogate with a return value of -3. */
311 mbrtoc16 (char16_t
*pwc
, const char *s
, size_t n
, mbstate_t *ps
)
314 struct _reent
*reent
= _REENT
;
319 _REENT_CHECK_MISC(reent
);
320 ps
= &(_REENT_MBRTOC16_STATE(reent
));
324 retval
= __MBTOWC (reent
, NULL
, "", 1, ps
);
325 else if (ps
->__count
== 0xdc00)
327 /* Return stored second half of the surrogate. */
329 *pwc
= ps
->__value
.__wch
;
334 retval
= __MBTOWC (reent
, &wc
, s
, n
, ps
);
341 /* Did we catch the first half of a surrogate? */
342 if (wc
>= 0xd800 && wc
<= 0xdbff)
344 if (n
<= (size_t) retval
)
346 int r2
= __MBTOWC (reent
, &wc
, s
+ retval
, n
, ps
);
349 /* Store second half of the surrogate in state, and return the
350 length of the entire multibyte sequence. */
351 ps
->__count
= 0xdc00;
352 ps
->__value
.__wch
= wc
;
355 return (size_t)retval
;
359 _REENT_ERRNO(reent
) = EILSEQ
;
364 mbrtoc8 (char8_t
*pc8
, const char *s
, size_t n
, mbstate_t *ps
)
366 struct _reent
*reent
= _REENT
;
372 _REENT_CHECK_MISC(reent
);
373 ps
= &(_REENT_MBRTOC8_STATE(reent
));
382 else if ((ps
->__count
& 0xff00) == 0xc800)
384 /* Return next utf-8 octet in line. */
385 int idx
= ps
->__count
& 0x3;
388 *pc8
= ps
->__value
.__wchb
[--idx
];
393 len
= mbrtoc32 (&wc
, s
, n
, ps
);
396 /* octets stored back to front for easier indexing */
400 ps
->__value
.__wchb
[0] = wc
;
404 ps
->__value
.__wchb
[1] = 0xc0 | ((wc
& 0x7c0) >> 6);
405 ps
->__value
.__wchb
[0] = 0x80 | (wc
& 0x3f);
406 ps
->__count
= 0xc800 | 1;
408 case 0x800 ... 0xffff:
409 ps
->__value
.__wchb
[2] = 0xe0 | ((wc
& 0xf000) >> 12);
410 ps
->__value
.__wchb
[1] = 0x80 | ((wc
& 0xfc0) >> 6);
411 ps
->__value
.__wchb
[0] = 0x80 | (wc
& 0x3f);
412 ps
->__count
= 0xc800 | 2;
414 case 0x10000 ... 0x10ffff:
415 ps
->__value
.__wchb
[3] = 0xf0 | ((wc
& 0x1c0000) >> 18);
416 ps
->__value
.__wchb
[2] = 0x80 | ((wc
& 0x3f000) >> 12);
417 ps
->__value
.__wchb
[1] = 0x80 | ((wc
& 0xfc0) >> 6);
418 ps
->__value
.__wchb
[0] = 0x80 | (wc
& 0x3f);
419 ps
->__count
= 0xc800 | 3;
423 _REENT_ERRNO(reent
) = EILSEQ
;
427 *pc8
= ps
->__value
.__wchb
[ps
->__count
& 0x3];
433 mbsnrtowci(wint_t *dst
, const char **src
, size_t nms
, size_t len
, mbstate_t *ps
)
443 /* Ignore original len value and do not alter src pointer if the
444 dst pointer is NULL. */
452 bytes
= mbrtowi (ptr
, *src
, MB_CUR_MAX
, ps
);
458 ptr
= (dst
== NULL
) ? NULL
: ptr
+ 1;
468 /* Deviation from standard: If the input is broken, the output
469 will be broken. I. e., we just copy the current byte over
470 into the wint_t destination and try to pick up on the next
471 byte. This is in line with the way fnmatch works. */
475 *ptr
++ = (const wint_t) *(*src
)++;
485 /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
486 wchar_t character representation. That's unfortunate for us since
487 we require UTF for the OS. What we do here is to have our own
488 implementation of the base functions for the conversion using
489 the MultiByteToWideChar/WideCharToMultiByte functions. */
491 /* FIXME: We can't support JIS (ISO-2022-JP) at all right now. It's a
492 stateful charset encoding. The translation from mbtowc to
493 MultiByteToWideChar is quite complex. Given that we support SJIS and
494 eucJP, the two most widely used Japanese charset encodings, this shouldn't
495 be such a big problem. */
497 /* GBK, GB18030, eucKR, and Big5 conversions are not available so far
501 __db_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, UINT cp
)
512 BOOL def_used
= false;
513 int ret
= WideCharToMultiByte (cp
, WC_NO_BEST_FIT_CHARS
, &wchar
, 1, s
,
515 if (ret
> 0 && !def_used
)
518 _REENT_ERRNO(r
) = EILSEQ
;
/* wctomb backend for the SJIS charset: forwards r, s and wchar to
   __db_wctomb with 932 as the code page argument (presumably the Windows
   Shift-JIS code page -- confirm against the code page identifier list)
   and returns its result unchanged.  The state parameter is not
   referenced by the visible body. */
523 __sjis_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
525 return __db_wctomb (r
,s
, wchar
, 932);
529 __eucjp_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
531 /* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
532 compatible to eucJP. It's a cute approximation which makes it a
534 The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
535 into two byte codes as follows: The 0x8f is stripped, the next byte is
536 taken as is, the third byte is mapped into the lower 7-bit area by
537 masking it with 0x7f. So, for instance, the eucJP code 0x8f,0xdd,0xf8
538 becomes 0xdd,0x78 in CP 20932.
540 To be really eucJP compatible, we have to map the JIS-X-0212 characters
541 between CP 20932 and eucJP ourselves. */
551 BOOL def_used
= false;
552 int ret
= WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS
, &wchar
, 1, s
,
554 if (ret
> 0 && !def_used
)
556 /* CP20932 representation of JIS-X-0212 character? */
557 if (ret
== 2 && (unsigned char) s
[1] <= 0x7f)
559 /* Yes, convert to eucJP three byte sequence */
568 _REENT_ERRNO(r
) = EILSEQ
;
/* wctomb backend for the GBK charset: forwards r, s and wchar to
   __db_wctomb with 936 as the code page argument (presumably the Windows
   GBK code page -- confirm against the code page identifier list) and
   returns its result unchanged.  The state parameter is not referenced by
   the visible body. */
573 __gbk_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
575 return __db_wctomb (r
,s
, wchar
, 936);
579 __gb18030_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
587 if (state
->__count
== 0)
595 if (wchar
>= 0xd800 && wchar
<= 0xdbff)
597 /* First half of a surrogate pair */
598 state
->__count
= 18030;
599 state
->__value
.__wch
= wchar
;
602 ret
= WideCharToMultiByte (54936, WC_ERR_INVALID_CHARS
, &wchar
, 1, s
,
608 else if (state
->__count
== 18030 && state
->__value
.__wch
>= 0xd800
609 && state
->__value
.__wch
<= 0xdbff)
611 if (wchar
>= 0xdc00 && wchar
<= 0xdfff)
613 /* Create multibyte sequence from full surrogate pair. */
614 wres
[0] = state
->__value
.__wch
;
616 ret
= WideCharToMultiByte (54936, WC_ERR_INVALID_CHARS
, wres
, 2, s
, 4,
625 _REENT_ERRNO(r
) = EILSEQ
;
628 _REENT_ERRNO(r
) = EINVAL
;
/* wctomb backend for the Korean charset: forwards r, s and wchar to
   __db_wctomb with 949 as the code page argument (presumably the Windows
   Korean code page -- confirm against the code page identifier list) and
   returns its result unchanged.  The state parameter is not referenced by
   the visible body. */
633 __kr_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
635 return __db_wctomb (r
,s
, wchar
, 949);
/* wctomb backend for the Big5 charset: forwards r, s and wchar to
   __db_wctomb with 950 as the code page argument (presumably the Windows
   Big5 code page -- confirm against the code page identifier list) and
   returns its result unchanged.  The state parameter is not referenced by
   the visible body. */
639 __big5_wctomb (struct _reent
*r
, char *s
, wchar_t wchar
, mbstate_t *state
)
641 return __db_wctomb (r
,s
, wchar
, 950);
645 __db_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
, UINT cp
,
652 return 0; /* not state-dependent */
660 if (state
->__count
== 0)
662 if (*(unsigned char *) s
< 0x80)
664 *pwc
= *(unsigned char *) s
;
667 size_t cnt
= MIN (n
, 2);
668 ret
= MultiByteToWideChar (cp
, MB_ERR_INVALID_CHARS
, s
, cnt
, pwc
, 1);
674 state
->__value
.__wchb
[0] = *s
;
677 /* These Win32 functions are really crappy. Assuming n is 2 but the
678 first byte is a single-byte character code, the function does not
679 convert that byte and return 1; rather, it just returns 0. So, what we do
680 here is to check if the first byte returns a valid value... */
681 else if (MultiByteToWideChar (cp
, MB_ERR_INVALID_CHARS
, s
, 1, pwc
, 1))
683 _REENT_ERRNO(r
) = EILSEQ
;
686 state
->__value
.__wchb
[state
->__count
] = *s
;
687 ret
= MultiByteToWideChar (cp
, MB_ERR_INVALID_CHARS
,
688 (const char *) state
->__value
.__wchb
, 2, pwc
, 1);
691 _REENT_ERRNO(r
) = EILSEQ
;
699 __sjis_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
702 return __db_mbtowc (r
, pwc
, s
, n
, 932, state
);
706 __eucjp_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
709 /* See comment in __eucjp_wctomb above. */
714 return 0; /* not state-dependent */
722 if (state
->__count
== 0)
724 if (*(unsigned char *) s
< 0x80)
726 *pwc
= *(unsigned char *) s
;
729 if (*(unsigned char *) s
== 0x8f) /* JIS-X-0212 lead byte? */
731 /* Yes. Store sequence in mbstate and handle in the __count != 0
732 case at the end of the function. */
734 for (i
= 0; i
< 3 && i
< n
; i
++)
735 state
->__value
.__wchb
[i
] = s
[i
];
736 if ((state
->__count
= i
) < 3) /* Incomplete sequence? */
741 size_t cnt
= MIN (n
, 2);
742 if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS
, s
, cnt
, pwc
, 1))
747 state
->__value
.__wchb
[0] = *s
;
750 else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS
, s
, 1, pwc
, 1))
752 _REENT_ERRNO(r
) = EILSEQ
;
755 state
->__value
.__wchb
[state
->__count
++] = *s
;
758 if (state
->__value
.__wchb
[0] == 0x8f)
760 if (state
->__count
== 2)
764 state
->__value
.__wchb
[state
->__count
] = s
[1];
767 /* Ok, we have a full JIS-X-0212 sequence in mbstate. Convert it
768 to the CP 20932 representation and feed it to MultiByteToWideChar. */
769 state
->__value
.__wchb
[0] = state
->__value
.__wchb
[1];
770 state
->__value
.__wchb
[1] = state
->__value
.__wchb
[2] & 0x7f;
772 if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS
,
773 (const char *) state
->__value
.__wchb
, 2, pwc
, 1))
775 _REENT_ERRNO(r
) = EILSEQ
;
783 __gbk_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
786 return __db_mbtowc (r
, pwc
, s
, n
, 936, state
);
790 __gb18030_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
793 wchar_t wres
[2], dummy
;
795 int ret
, len
, ocount
;
798 if (state
->__count
< 0 || (state
->__count
> (int) sizeof state
->__value
.__wchb
799 && state
->__count
!= 18030))
812 if (state
->__count
== 18030)
814 /* Return second half of the surrogate pair */
815 *pwc
= state
->__value
.__wch
;
820 ncopy
= MIN (MIN (n
, MB_CUR_MAX
),
821 sizeof state
->__value
.__wchb
- state
->__count
);
822 memcpy (state
->__value
.__wchb
+ state
->__count
, s
, ncopy
);
823 ocount
= state
->__count
;
824 state
->__count
+= ncopy
;
825 s
= (char *) state
->__value
.__wchb
;
828 if (n
== 0) /* Incomplete multibyte sequence */
834 /* Check if input is a valid GB18030 char (per FreeBSD):
835 * Single byte: [00-7f]
836 * Two byte: [81-fe][40-7e,80-fe]
837 * Four byte: [81-fe][30-39][81-fe][30-39]
839 ch
= *(unsigned char *) s
;
846 if (ch
>= 0x81 && ch
<= 0xfe)
850 ch
= (unsigned char) s
[1];
851 if ((ch
>= 0x40 && ch
<= 0x7e) || (ch
>= 0x80 && ch
<= 0xfe))
853 else if (ch
>= 0x30 && ch
<= 0x39)
857 ch
= (unsigned char) s
[2];
858 if (ch
< 0x81 || ch
> 0xfe)
862 ch
= (unsigned char) s
[3];
863 if (ch
< 0x30 || ch
> 0x39)
872 ret
= MultiByteToWideChar (54936, MB_ERR_INVALID_CHARS
, s
, len
, wres
, 2);
878 /* Surrogate pair. Store second half for later and return
879 first half. Return real count - 1, return 1 when the second
880 half of the pair is returned in the next run. */
881 state
->__count
= 18030;
882 state
->__value
.__wch
= wres
[1];
890 _REENT_ERRNO(r
) = EILSEQ
;
895 __kr_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
898 return __db_mbtowc (r
, pwc
, s
, n
, 949, state
);
902 __big5_mbtowc (struct _reent
*r
, wchar_t *pwc
, const char *s
, size_t n
,
905 return __db_mbtowc (r
, pwc
, s
, n
, 950, state
);
908 /* Our own sys_wcstombs/sys_mbstowcs functions differ from the
909 wcstombs/mbstowcs API in three ways:
911 - The UNICODE private use area is used in filenames to specify
912 characters not allowed in Windows filenames ('*', '?', etc).
913 The sys_wcstombs converts characters in the private use area
914 back to the corresponding ASCII chars.
916 - If a wide character in a filename has no representation in the current
917 multibyte charset, then usually you wouldn't be able to access the
918 file. To fix this problem, sys_wcstombs creates a replacement multibyte
919 sequence for the non-representable wide-char. The sequence starts with
920 an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
921 character. The sys_(cp_)mbstowcs function detects ASCII CAN characters
922 in the input multibyte string and converts the following multibyte
923 sequence by treating it as a UTF-8 char. If that fails, the ASCII
924 CAN was probably standalone and it gets just copied over as ASCII CAN.
926 - Three cases have to be distinguished for the return value:
928 - dst == NULL; len is ignored, the return value is the number of bytes
929 required for the string without the trailing NUL, just like the return
930 value of the wcstombs function.
932 - dst != NULL, len == (size_t) -1; the return value is the size in bytes
933 of the destination string without the trailing NUL. If the incoming
934 wide char string was not NUL-terminated, the target string won't be
935 NUL-terminated either.
937 - dst != NULL; len != (size_t) -1; the return value is the size in bytes
938 of the destination string without the trailing NUL. The target string
939 will be NUL-terminated, no matter what. If the result is truncated due
940 to buffer size, it's a bug in Cygwin and the buffer size in the calling
941 function should be increased.
944 _sys_wcstombs (char *dst
, size_t len
, const wchar_t *src
, size_t nwc
,
949 wchar_t *pwcs
= (wchar_t *) src
;
953 wctomb_p f_wctomb
= __WCTOMB
;
955 if (f_wctomb
== __ascii_wctomb
)
956 f_wctomb
= __utf8_wctomb
;
957 memset (&ps
, 0, sizeof ps
);
960 while (n
< len
&& nwc
-- > 0)
966 /* Convert UNICODE private use area. Reverse functionality for the
967 ASCII area <= 0x7f (only for path names) is transform_chars above.
968 Reverse functionality for invalid bytes in a multibyte sequence is
969 in _sys_mbstowcs below. */
970 if (is_path
&& (pw
& 0xff00) == 0xf000
971 && (((cwc
= (pw
& 0xff)) <= 0x7f && tfx_rev_chars
[cwc
] >= 0xf000)
972 || (cwc
>= 0x80 && MB_CUR_MAX
> 1)))
979 bytes
= f_wctomb (_REENT
, buf
, pw
, &ps
);
980 if (bytes
== -1 && f_wctomb
!= __utf8_wctomb
)
982 /* Convert chars invalid in the current codepage to the sequence
983 ASCII CAN followed by the UTF-8 representation of the invalid char. */
984 buf
[0] = 0x18; /* ASCII CAN */
985 bytes
= __utf8_wctomb (_REENT
, buf
+ 1, pw
, &ps
);
992 ++bytes
; /* Add the ASCII CAN to the byte count. */
993 if (ps
.__count
== -4 && nwc
> 0)
995 /* First half of a surrogate pair. */
997 if ((*pwcs
& 0xfc00) != 0xdc00) /* Invalid second half. */
1003 bytes
+= __utf8_wctomb (_REENT
, buf
+ bytes
, *pwcs
, &ps
);
1008 if (n
+ bytes
<= len
)
1012 for (int i
= 0; i
< bytes
; ++i
)
1015 if (*pwcs
++ == 0x00)
1022 if (n
&& dst
&& len
!= (size_t) -1)
1024 n
= (n
< len
) ? n
: len
- 1;
1031 /* Allocate a buffer big enough for the string, always including the
1032 terminating '\0'. The buffer pointer is returned in *dst_p, the return
1033 value is the number of bytes written to the buffer, as usual.
1034 The "type" argument determines where the resulting buffer is stored.
1035 It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
1036 In the latter case the allocation uses simple calloc.
1038 Note that this code is shared by cygserver (which requires it via
1039 __small_vsprintf) and so when built there plain calloc is the
1042 _sys_wcstombs_alloc (char **dst_p
, int type
, const wchar_t *src
, size_t nwc
,
1047 ret
= _sys_wcstombs (NULL
, (size_t) -1, src
, nwc
, is_path
);
1050 size_t dlen
= ret
+ 1;
1052 if (type
== HEAP_NOTHEAP
)
1053 *dst_p
= (char *) calloc (dlen
, sizeof (char));
1055 *dst_p
= (char *) ccalloc ((cygheap_types
) type
, dlen
, sizeof (char));
1058 ret
= _sys_wcstombs (*dst_p
, dlen
, src
, nwc
, is_path
);
1063 /* _sys_mbstowcs is actually most of the time called as sys_mbstowcs with
1064 a 0 codepage. If cp is not 0, the codepage is evaluated and used for the
1065 conversion. This is so that fhandler_console can switch to an alternate
1066 charset, which is the charset returned by GetConsoleCP (). Most of the
1067 time this is used for box and line drawing characters. */
1069 _sys_mbstowcs (mbtowc_p f_mbtowc
, wchar_t *dst
, size_t dlen
, const char *src
,
1073 unsigned const char *pmbs
= (unsigned const char *) src
;
1080 memset (&ps
, 0, sizeof ps
);
1083 while (len
> 0 && nms
> 0)
1085 /* ASCII CAN handling. */
1088 /* Sanity check: If this is a lead CAN byte for a following UTF-8
1089 sequence, there must be at least two more bytes left, and the
1090 next byte must be a valid UTF-8 start byte. If the charset
1091 isn't UTF-8 anyway, try to convert the following bytes as UTF-8
1093 if (nms
> 2 && pmbs
[1] >= 0xc2 && pmbs
[1] <= 0xf4
1094 && f_mbtowc
!= __utf8_mbtowc
)
1096 bytes
= __utf8_mbtowc (_REENT
, ptr
, (const char *) pmbs
+ 1,
1100 /* Invalid UTF-8 sequence? Treat the ASCII CAN character as
1101 stand-alone ASCII CAN char. */
1105 memset (&ps
, 0, sizeof ps
);
1109 ++bytes
; /* Count CAN byte */
1110 if (bytes
> 1 && ps
.__count
== 4)
1112 /* First half of a surrogate. */
1113 wchar_t *ptr2
= dst
? ptr
+ 1 : NULL
;
1114 int bytes2
= __utf8_mbtowc (_REENT
, ptr2
,
1115 (const char *) pmbs
+ bytes
,
1118 memset (&ps
, 0, sizeof ps
);
1123 ptr
= dst
? ptr
+ 1 : NULL
;
1129 /* Otherwise it's just a simple ASCII CAN. */
1137 else if ((bytes
= f_mbtowc (_REENT
, ptr
, (const char *) pmbs
, nms
,
1140 /* The technique is based on a discussion here:
1141 http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
1143 Invalid bytes in a multibyte sequence are converted to
1144 the private use area which is already used to store ASCII
1145 chars invalid in Windows filenames. This technique allows
1146 storing them in a symmetric way. */
1149 *ptr
= L
'\xf000' | *pmbs
;
1150 memset (&ps
, 0, sizeof ps
);
1158 ptr
= dst
? ptr
+ 1 : NULL
;
1171 count
= (count
< dlen
) ? count
: dlen
- 1;
1178 /* Same as sys_wcstombs_alloc, just backwards. */
1180 sys_mbstowcs_alloc (wchar_t **dst_p
, int type
, const char *src
, size_t nms
)
1184 ret
= sys_mbstowcs (NULL
, (size_t) -1, src
, nms
);
1187 size_t dlen
= ret
+ 1;
1189 if (type
== HEAP_NOTHEAP
)
1190 *dst_p
= (wchar_t *) calloc (dlen
, sizeof (wchar_t));
1192 *dst_p
= (wchar_t *) ccalloc ((cygheap_types
) type
, dlen
,
1196 ret
= sys_mbstowcs (*dst_p
, dlen
, src
, nms
);
/* Copy characters from *s2 to s1 until the delimiter c or the terminating
   <nul> of the source is encountered.  The delimiter itself is not copied.

   *s2 is advanced while copying, so on return it points at the character
   that stopped the copy (either c or the <nul>).  The destination string
   s1 is always NUL-terminated; the caller must provide enough room.

   Return pointer to terminating byte in dst string. */
char *
strccpy (char *__restrict s1, const char **__restrict s2, char c)
{
  while (**s2 && **s2 != c)
    *s1++ = *((*s2)++);
  *s1 = '\0';

  return s1;
}
1214 const unsigned char case_folded_lower
[] = {
1215 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1216 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1217 32, '!', '"', '#', '$', '%', '&', 39, '(', ')', '*', '+', ',', '-', '.', '/',
1218 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
1219 '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
1220 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 92, ']', '^', '_',
1221 '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
1222 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 127,
1223 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
1224 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
1225 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
1226 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
1227 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
1228 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
1229 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
1230 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
1233 const unsigned char case_folded_upper
[] = {
1234 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1235 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1236 32, '!', '"', '#', '$', '%', '&', 39, '(', ')', '*', '+', ',', '-', '.', '/',
1237 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
1238 '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
1239 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', 92, ']', '^', '_',
1240 '`', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
1241 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '{', '|', '}', '~', 127,
1242 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
1243 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
1244 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
1245 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
1246 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
1247 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
1248 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
1249 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
1252 const char isalpha_array
[] = {
1253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1255 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1256 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1257 0,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
1258 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, 0, 0, 0, 0, 0,
1259 0,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
1260 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, 0, 0, 0, 0, 0,
1261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1263 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1267 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* Compare the wide strings ws and wt using the NT runtime library.
   Each string is wrapped in a UNICODE_STRING descriptor via
   RtlInitUnicodeString, and the result of RtlCompareUnicodeString is
   returned unchanged. */
1272 cygwin_wcscasecmp (const wchar_t *ws
, const wchar_t *wt
)
1274 UNICODE_STRING us
, ut
;
1276 RtlInitUnicodeString (&us
, ws
);
1277 RtlInitUnicodeString (&ut
, wt
);
/* The third argument TRUE is presumably the case-insensitivity flag of
   RtlCompareUnicodeString, matching the "casecmp" in this function's
   name -- confirm against the NT API documentation. */
1278 return RtlCompareUnicodeString (&us
, &ut
, TRUE
);
1282 cygwin_wcsncasecmp (const wchar_t *ws
, const wchar_t *wt
, size_t n
)
1284 UNICODE_STRING us
, ut
;
1285 size_t ls
= 0, lt
= 0;
1287 while (ws
[ls
] && ls
< n
)
1289 RtlInitCountedUnicodeString (&us
, ws
, ls
* sizeof (WCHAR
));
1290 while (wt
[lt
] && lt
< n
)
1292 RtlInitCountedUnicodeString (&ut
, wt
, lt
* sizeof (WCHAR
));
1293 return RtlCompareUnicodeString (&us
, &ut
, TRUE
);
1297 cygwin_strcasecmp (const char *cs
, const char *ct
)
1299 UNICODE_STRING us
, ut
;
1302 len
= strlen (cs
) + 1;
1303 ulen
= len
* sizeof (WCHAR
);
1304 RtlInitEmptyUnicodeString (&us
, (PWCHAR
) alloca (ulen
), ulen
);
1305 us
.Length
= sys_mbstowcs (us
.Buffer
, len
, cs
) * sizeof (WCHAR
);
1307 len
= strlen (ct
) + 1;
1308 ulen
= len
* sizeof (WCHAR
);
1309 RtlInitEmptyUnicodeString (&ut
, (PWCHAR
) alloca (ulen
), ulen
);
1310 ut
.Length
= sys_mbstowcs (ut
.Buffer
, len
, ct
) * sizeof (WCHAR
);
1312 return RtlCompareUnicodeString (&us
, &ut
, TRUE
);
1316 cygwin_strncasecmp (const char *cs
, const char *ct
, size_t n
)
1318 UNICODE_STRING us
, ut
;
1320 size_t ls
= 0, lt
= 0;
1322 while (cs
[ls
] && ls
< n
)
1324 ulen
= (ls
+ 1) * sizeof (WCHAR
);
1325 RtlInitEmptyUnicodeString (&us
, (PWCHAR
) alloca (ulen
), ulen
);
1326 us
.Length
= sys_mbstowcs (us
.Buffer
, ls
+ 1, cs
, ls
) * sizeof (WCHAR
);
1328 while (ct
[lt
] && lt
< n
)
1330 ulen
= (lt
+ 1) * sizeof (WCHAR
);
1331 RtlInitEmptyUnicodeString (&ut
, (PWCHAR
) alloca (ulen
), ulen
);
1332 ut
.Length
= sys_mbstowcs (ut
.Buffer
, lt
+ 1, ct
, lt
) * sizeof (WCHAR
);
1334 return RtlCompareUnicodeString (&us
, &ut
, TRUE
);
1338 strlwr (char *string
)
1341 size_t len
= (strlen (string
) + 1) * sizeof (WCHAR
);
1343 us
.MaximumLength
= len
; us
.Buffer
= (PWCHAR
) alloca (len
);
1344 us
.Length
= sys_mbstowcs (us
.Buffer
, len
, string
) * sizeof (WCHAR
)
1346 RtlDowncaseUnicodeString (&us
, &us
, FALSE
);
1347 sys_wcstombs (string
, len
/ sizeof (WCHAR
), us
.Buffer
);
1352 strupr (char *string
)
1355 size_t len
= (strlen (string
) + 1) * sizeof (WCHAR
);
1357 us
.MaximumLength
= len
; us
.Buffer
= (PWCHAR
) alloca (len
);
1358 us
.Length
= sys_mbstowcs (us
.Buffer
, len
, string
) * sizeof (WCHAR
)
1360 RtlUpcaseUnicodeString (&us
, &us
, FALSE
);
1361 sys_wcstombs (string
, len
/ sizeof (WCHAR
), us
.Buffer
);
1365 /* backslashify: Convert all forward slashes in src path to back slashes
1366 in dst path. Add a trailing slash to dst when trailing_slash_p arg
1370 backslashify (const char *src
, char *dst
, bool trailing_slash_p
)
1372 const char *start
= src
;
1382 if (trailing_slash_p
1384 && !isdirsep (src
[-1]))
1389 /* slashify: Convert all back slashes in src path to forward slashes
1390 in dst path. Add a trailing slash to dst when trailing_slash_p arg
1394 slashify (const char *src
, char *dst
, bool trailing_slash_p
)
1396 const char *start
= src
;
1406 if (trailing_slash_p
1408 && !isdirsep (src
[-1]))
1413 static WCHAR hex_wchars
[] = L
"0123456789abcdef";
1416 RtlInt64ToHexUnicodeString (ULONGLONG value
, PUNICODE_STRING dest
,
1419 USHORT len
= append
? dest
->Length
: 0;
1420 if (dest
->MaximumLength
- len
< 16 * (int) sizeof (WCHAR
))
1421 return STATUS_BUFFER_OVERFLOW
;
1422 wchar_t *end
= (PWCHAR
) ((PBYTE
) dest
->Buffer
+ len
);
1423 PWCHAR p
= end
+ 16;
1426 *p
= hex_wchars
[value
& 0xf];
1429 dest
->Length
+= 16 * sizeof (WCHAR
);
1430 return STATUS_SUCCESS
;