# HG changeset patch # Parent 4587a24030e347df0461be19ec3b5814bf3993f5 Fix MinGW-Issue #39687; reimplement wcrtomb() and wcsrtombs(). * include/wchar.h [__MSVCRT_VERSION__ < __MSVCR80_DLL] (wcrtomb, wcsrtombs): Implement them as static inline redirects to... (__mingw_wcrtomb, __mingw_wcsrtombs): ...these; declare them. * include/limits.h (MB_LEN_MAX): Update value; was 2, but should be 5. * mingwex/wcsrtombs.c: New file; it implements... (__mingw_wcsrtombs): ...this new function, which replaces... (wcsrtombs): ...this; it was originally implemented... * mingwex/wcrtomb.c: ...here; rewritten as new, it now implements... (__mingw_wcrtomb): ...only this new function, which replaces... (wcrtomb): ...this. * mingwex/wcharmap.h: New private header; it declares the API for... * mingwex/wcharmap.c: ...this new file, which implements... (__mingw_wchar_to_mbcs_map): ...this new function, required by... (__mingw_wcrtomb, __mingw_wcsrtombs): ...both of these. * mingwex/codeset.c: New file; it implements... (__mb_codeset_for_locale, __mb_len_max_for_codeset): ...this pair of new helper functions; they identify the codeset, and respectively, its MB_CUR_MAX for the effective process locale, which are required by... (__mingw_wchar_to_mbcs_map): ...this. * Makefile.in (libmingwex.a): Add dependency references for... (codeset.$OBJEXT, wcharmap.$OBJEXT, wcsrtombs.$OBJEXT): ...these. * msvcrt.def.in (wcrtomb, wcsrtombs): Require dlsym look-up for MSVCRT.DLL entry point addresses. 
diff --git a/mingwrt/Makefile.in b/mingwrt/Makefile.in --- a/mingwrt/Makefile.in +++ b/mingwrt/Makefile.in @@ -466,13 +466,13 @@ libmingwex.a: $(addsuffix .$(OBJEXT), di libmingwex.a: $(addsuffix .$(OBJEXT), mkstemp mkdtemp cryptnam setenv) libmingwex.a: $(addsuffix .$(OBJEXT), getdelim gettimeofday) vpath %.s ${mingwrt_srcdir}/mingwex vpath %.sx ${mingwrt_srcdir}/mingwex -libmingwex.a: $(addsuffix .$(OBJEXT), fwide mbrtowc mbsinit strnlen wcrtomb \ - wcsnlen wcstof wcstold wctob wctrans wctype wmemchr wmemcmp wmemcpy wmemmove \ - wmemset) +libmingwex.a: $(addsuffix .$(OBJEXT), codeset fwide mbrtowc mbsinit strnlen \ + wcharmap wcrtomb wcsrtombs wcsnlen wcstof wcstold wctob wctrans wctype wmemchr \ + wmemcmp wmemcpy wmemmove wmemset) # The wcsnlen() function, enumerated above, is an adaptation of strnlen(); # we need a specific rule to compile it, from shared source. # wcsnlen.$(OBJEXT): strnlen.sx diff --git a/mingwrt/include/limits.h b/mingwrt/include/limits.h --- a/mingwrt/include/limits.h +++ b/mingwrt/include/limits.h @@ -4,11 +4,11 @@ * Manifest constants defining the sizes of integral types. * * $Id$ * * Written by Colin Peters <colin@bird.fu.is.saga-u.ac.jp> - * Copyright (C) 1997, 1999-2001, 2004, 2005, 2010, 2012, 2017, + * Copyright (C) 1997, 1999-2001, 2004, 2005, 2010, 2012, 2017, 2019, * MinGW.org Project * * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -51,15 +51,17 @@ # define PATH_MAX 260 #endif /* Characteristics of the char data type. * - * FIXME: Is MB_LEN_MAX correct? Probably yes, for Microsoft MBCS, which - * effectively seem to all be DBCS. + * FIXME: Is MB_LEN_MAX correct? Earlier Microsoft documentation specified + * it as two, (which would probably have been okay, in the case of only DBCS + * encodings); today (2019), Microsoft's documentation says that five is the + * appropriate value. 
*/ #define CHAR_BIT 8 -#define MB_LEN_MAX 2 +#define MB_LEN_MAX 5 #define SCHAR_MIN (-128) #define SCHAR_MAX 127 #define UCHAR_MAX 255 diff --git a/mingwrt/include/wchar.h b/mingwrt/include/wchar.h --- a/mingwrt/include/wchar.h +++ b/mingwrt/include/wchar.h @@ -6,11 +6,12 @@ * * $Id$ * * Unattributed original source. * Adapted by Rob Savoye <rob@cygnus.com> - * Copyright (C) 1997, 1999-2009, 2011, 2015, 2016, 2018, MinGW.org Project. + * Copyright (C) 1997, 1999-2009, 2011, 2015, 2016, 2018, 2019, + * MinGW.org Project. * * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation @@ -526,32 +527,63 @@ extern size_t __mingw_wcsnlen (const wch typedef wchar_t _Wint_t; #endif typedef int mbstate_t; -/* The following multi-byte character conversion functions are - * implemented in libmingwex.a, (and maybe also in some non-free - * Microsoft libraries, such as MSVCP60.DLL and later). +/* The following multi-byte character conversion functions have been + * implemented by Microsoft, in non-free MSVCR80.DLL and later, (and + * maybe also in some earlier non-free DLLs, such as MSVCP60.DLL and + * later); they are also available in MSVCRT.DLL, from Vista onward, + * but to provide continuing support for earlier Windows versions, + * we invoke them via MinGW specific wrappers, defined below. 
*/ __cdecl __MINGW_NOTHROW wint_t btowc (int); __cdecl __MINGW_NOTHROW int wctob (wint_t); -__cdecl __MINGW_NOTHROW -size_t mbrlen (const char *__restrict__, size_t, mbstate_t *__restrict__); +__cdecl __MINGW_NOTHROW size_t mbrlen +(const char *__restrict__, size_t, mbstate_t *__restrict__); __cdecl __MINGW_NOTHROW size_t mbrtowc (wchar_t *__restrict__, const char *__restrict__, size_t, mbstate_t *__restrict__); __cdecl __MINGW_NOTHROW size_t mbsrtowcs (wchar_t *__restrict__, const char **__restrict__, size_t, mbstate_t *__restrict__); -__cdecl __MINGW_NOTHROW -size_t wcrtomb (char * __restrict__, wchar_t, mbstate_t *__restrict__); +__cdecl __MINGW_NOTHROW size_t wcrtomb +(char * __restrict__, wchar_t, mbstate_t *__restrict__); __cdecl __MINGW_NOTHROW size_t wcsrtombs (char *__restrict__, const wchar_t **__restrict__, size_t, mbstate_t *__restrict__); +/* To provide support for the above, on legacy Windows versions, + * we implement fall back wrappers in libmingwex.a; each of these + * will delegate to the corresponding Microsoft implementation, if + * it exists in the process address space; otherwise, execution + * will fall back to a MinGW implementation. + */ +__cdecl __MINGW_NOTHROW size_t __mingw_wcrtomb +(char * __restrict__, wchar_t, mbstate_t *__restrict__); + +__cdecl __MINGW_NOTHROW size_t __mingw_wcsrtombs +(char *__restrict__, const wchar_t **__restrict__, size_t, mbstate_t *__restrict__); + +#if __MSVCRT_VERSION__ < __MSVCR80_DLL +/* For linking with all versions of MSVCRT.DLL, and with non-free + * alternatives predating MSVCR80.DLL, we enforce inline mapping to + * the libmingwex.a implementations, (which will delegate the calls + * to the Microsoft DLL implementations, when they are available). 
+ */ +__CRT_ALIAS __cdecl __MINGW_NOTHROW size_t wcrtomb +(char * __mbc, wchar_t __wc, mbstate_t *__ps) +{ return __mingw_wcrtomb(__mbc, __wc, __ps); } + +__CRT_ALIAS __cdecl __MINGW_NOTHROW size_t wcsrtombs +(char *__mbs, const wchar_t **__wcs, size_t __len, mbstate_t *__ps) +{ return __mingw_wcsrtombs(__mbs, __wcs, __len, __ps); } + +#endif /* ! MSVCR80.DLL or later */ + #if defined _ISOC99_SOURCE || defined __cplusplus /* These ISO-C99 functions are implemented in libmingwex.a, * or, in some cases, as inline stubs; while provided as MinGW * extensions to support ISO-C99, they are also required by * GNU C++. diff --git a/mingwrt/mingwex/codeset.c b/mingwrt/mingwex/codeset.c new file mode 100644 --- /dev/null +++ b/mingwrt/mingwex/codeset.c @@ -0,0 +1,111 @@ +/* + * codeset.c + * + * Provides implementation-private helper functions, to identify the + * code page which is associated with the active process locale, and to + * establish the effective MB_CUR_MAX value for this code page. + * + * $Id$ + * + * Written by Keith Marshall <keith@users.osdn.me> + * Copyright (C) 2019, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <winnls.h> + +unsigned int __mb_codeset_for_locale( void ); +unsigned int __mb_len_max_for_codeset( unsigned int ); + +unsigned int __mb_codeset_for_locale( void ) +{ + /* Extract the code page identification string (if any) from the LC_CTYPE + * identification string, as returned in "language[_region[.codeset]]", or + * ".codeset" format, by a setlocale() query on the current locale. + */ + char *default_locale_specification, *codeset_string; + if( (default_locale_specification = setlocale( LC_CTYPE, NULL )) != NULL ) + { + /* An unfortunate -- albeit documented -- limitation of Microsoft's + * setlocale() implementation is that it cannot correctly process any + * locale specification which refers to a MBCS codeset which may use + * more than two bytes for any single code point; to mitigate this, + * when the active locale matches the system default... + */ + char string_buffer[1 + strlen( default_locale_specification )]; + codeset_string = strcpy( string_buffer, default_locale_specification ); + if( strcmp( codeset_string, setlocale( LC_CTYPE, "" )) == 0 ) + { + /* ...although Microsoft's setlocale() doesn't support it, (and + * is neither expected to, nor required to), we may adopt POSIX.1 + * convention, in this particular case, to acquire a preferred + * default locale specification from the environment... 
+ */ + if( ((default_locale_specification = getenv( "LC_ALL" )) != NULL) + || ((default_locale_specification = getenv( "LC_CTYPE" )) != NULL) + || ((default_locale_specification = getenv( "LANG" )) != NULL) ) + + /* ...and use that in place of Microsoft's setlocale() notion + * of the current effective LC_CTYPE locale category. + */ + codeset_string = default_locale_specification; + } + else + { /* The originally active locale does NOT match the system default, + * but we made it do so, by checking, so restore the original. + */ + setlocale( LC_CTYPE, codeset_string ); + } + /* Regardless of how we established the effective LC_CTYPE category + * for the active locale, we may extract its codeset element... + */ + if( (codeset_string = strchr( codeset_string, '.' )) != NULL ) + { + /* ...interpreting the resultant string as its equivalent integer + * value, for validation and return. + */ + unsigned int retval = (unsigned int)(atoi( codeset_string + 1 )); + if( __mb_len_max_for_codeset( retval ) > 0 ) return retval; + } + } + /* In the event that LC_CTYPE doesn't include a codeset identification, + * return an effective value of zero, which we may later interpret as a + * default representation for the "C" locale. + */ + return 0; +} + +unsigned int __mb_len_max_for_codeset( unsigned int codeset ) +{ + /* Identify the length of the longest valid multibyte character encoding + * sequence, used within the specified MS-Windows code page, by consulting + * the relevant Win32 API database. Returns the appropriate byte count, + * or zero if the codeset identifier is not valid. + */ + CPINFO codeset_info; + return (GetCPInfo( codeset, &codeset_info )) ? 
codeset_info.MaxCharSize : 0; +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcharmap.c b/mingwrt/mingwex/wcharmap.c new file mode 100644 --- /dev/null +++ b/mingwrt/mingwex/wcharmap.c @@ -0,0 +1,172 @@ +/* + * wcharmap.c + * + * Provides an implementation-private helper function, to facilitate + * conversion from UTF-16LE wchar_t data, of arbitrary length, to an + * equivalent multi-byte character encoding sequence. + * + * $Id$ + * + * Written by Keith Marshall <keith@users.osdn.me> + * Copyright (C) 2019, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + */ +#include "wcharmap.h" + +#include <limits.h> + +size_t __mingw_wchar_to_mbcs_map +( unsigned cp, char *mbs, int mblen, const wchar_t *wcs, int wclen ) +{ + /* Helper function to map a sequence of wchars to their corresponding + * sequence of multibyte characters, encoded as is appropriate for the + * specified code page, (which is nominally the code page associated + * with the current locale). + * + * Inputs: + * cp The code page for which encoding is to be performed. + * + * mbs Buffer in which the encoded multibyte sequence may be + * returned, or NULL, if only the sequence length is to + * be determined, discarding the encoded data. + * + * mblen Number of bytes available in mbs; ignored if mbs is + * passed as NULL. + * + * wcs The sequence of wchars which is to be encoded. + * + * wclen The number of wchars in wcs; if passed as (size_t)(-1), + * scan until (wchar_t)(0), or until a wchar with no valid + * encoding, or space in the encoding buffer is exhausted. + * + * Returns: + * The number of encoded bytes (which would be) stored into mbs, if + * mbs is not NULL, and all specified wchars in wcs are successfully + * encoded; otherwise, returns (size_t)(-1), and sets errno to: + * + * EILSEQ If encoding is interrupted by a wchar with no valid + * encoding within the specified code page. + * + * ENOMEM The mbs pointer isn't NULL, but there is insufficient + * space in the designated buffer to store the encoded + * multibyte character sequence. + */ + size_t retval; int eilseq_flag = 0; + + if( cp == 0 ) + { /* Code page zero is assumed to represent the encoding which applies + * within the "C" locale; this is a single-byte encoding, with wchar + * values in the range L'\0'..L'\255' mapped to their identical byte + * values, and all greater wchar values considered to be invalid. + * + * Simply scan, count, and optionally store valid byte values, + * starting from an initial count of zero. 
+ */ + retval = 0; + + if( (size_t)(wclen) == (size_t)(-1) ) + do { /* This is an unbounded scan; simply check that each + * successive wchar lies in the valid range... + */ + if( (unsigned)(*wcs) > UCHAR_MAX ) + /* ...otherwise, report an invalid encoding, and + * bail out. + */ + return errout( EILSEQ, wclen ); + + /* We got a valid input wchar... + */ + if( mbs != NULL ) + { /* ...which we are now expected to store... + */ + if( mblen-- > 0 ) *mbs++ = (unsigned char)(*wcs); + + /* ...but, we must bail out, if there is no + * space left in the encoding buffer. + */ + else return errout( ENOMEM, (size_t)(-1) ); + } + + /* We've accepted the current input wchar; count + * it, and then, provided it isn't the terminating + * NUL, move on to the next. + */ + ++retval; + } while( *wcs++ != L'\0' ); + + else while( wclen-- > 0 ) + { /* This is a bounded scan; as in the unbounded case, take + * each input wchar in turn, and verify that each lies in + * the valid encoding range. + */ + if( (unsigned)(*wcs) > UCHAR_MAX ) + return errout( EILSEQ, (size_t)(-1) ); + + /* We got a valid input wchar... + */ + if( mbs != NULL ) + { /* ...which we are now expected to store... + */ + if( mblen-- > 0 ) *mbs++ = (unsigned char)(*wcs); + + /* ...but, we must bail out, if there is no + * space left in the encoding buffer. + */ + else return errout( ENOMEM, (size_t)(-1) ); + } + + /* Ensure that we don't scan beyond a terminating NUL + * wchar, even if this lies within the bounded count. + */ + if( *wcs++ == L'\0' ) wclen = 0; + + /* In any case, count the current encoded byte. + */ + ++retval; + } + + /* We now have the final count, for a code page zero encoding; + * we are done. 
+ */ + return retval; + } + + /* For any code page other than zero, we delegate both encoding + * and byte counting to the Windows API; note that for code pages + * other than CP_UTF7 or CP_UTF8, (and CP_UTF8 is the only code + * page with an identifier greater than that for CP_UTF7), there + * may be unrepresentable UTF-16 code points, and we must pass a + * flag reference to detect their presence in the UTF-16LE input + * sequence; OTOH, any valid UTF-16 code point is representable + * in both CP_UTF7 and CP_UTF8, so no such flag is required, and + * WideCharToMultiByte() will choke, if the flag reference is + * not passed as NULL. + */ + retval = WideCharToMultiByte( cp, 0, wcs, wclen, mbs, mblen, NULL, + (CP_UTF7 > cp) ? &eilseq_flag : NULL + ); + return (eilseq_flag || (retval == 0)) ? errout( EILSEQ, (size_t)(-1) ) + : retval; +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcharmap.h b/mingwrt/mingwex/wcharmap.h new file mode 100644 --- /dev/null +++ b/mingwrt/mingwex/wcharmap.h @@ -0,0 +1,62 @@ +/* + * wcharmap.h + * + * Private header file, declaring common components of the MinGW.org + * fallback implementations of wide to multi-byte (and complementary) + * character set conversion API functions. + * + * $Id$ + * + * Written by Keith Marshall <keith@users.osdn.me> + * Copyright (C) 2019, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include <wchar.h> +#include <winnls.h> +#include <stdlib.h> +#include <errno.h> + +/* Define a pair of inline helper functions, to facilitate preservation + * of the "errno" state on entry, such that it may be restored or modified, + * as necessary for ISO-C99 conformance, on function return. + * + * First, a helper to save, and clear, error state on entry... + */ +static __inline__ __attribute__((__always_inline__)) +int save_error_status_and_clear (int state, int clear) +{ errno = clear; return state; } + +/* ...and the complementary helper, which may be used to either restore + * the saved state, or to report a new error condition, on return. 
+ */ +static __inline__ __attribute__((__always_inline__)) +size_t errout (int errcode, size_t status){ errno = errcode; return status; } + +unsigned int __mb_codeset_for_locale (void); +unsigned int __mb_len_max_for_codeset (unsigned int); +size_t __mingw_wchar_to_mbcs_map (unsigned, char *, int, const wchar_t *, int); + +static __inline__ __attribute__((__always_inline__)) +unsigned int get_codepage(){ return __mb_codeset_for_locale(); } + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcrtomb.c b/mingwrt/mingwex/wcrtomb.c --- a/mingwrt/mingwex/wcrtomb.c +++ b/mingwrt/mingwex/wcrtomb.c @@ -1,94 +1,113 @@ -#include "mb_wc_common.h" -#include <wchar.h> -#include <stdlib.h> -#include <errno.h> -#include <limits.h> -#define WIN32_LEAN_AND_MEAN -#include <windows.h> +/* + * wcrtomb.c + * + * MinGW.org replacement for the wcrtomb() function; delegates to the + * Microsoft implementation, if available in the C runtime DLL, otherwise + * handles the call locally. + * + * $Id$ + * + * Written by Keith Marshall <keith@users.osdn.me> + * Copyright (C) 2019, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include "wcharmap.h" +/* For runtime delegation, we need a mechanism for detection of an + * implementation, within the default C runtime DLL; we may use the + * MinGW dlfcn emulation, to facilitate this. + */ +#include <dlfcn.h> -static int __MINGW_ATTRIB_NONNULL(1) - __wcrtomb_cp (char *dst, wchar_t wc, const unsigned int cp, - const unsigned int mb_max) +/* We need to look up the effective working codeset, before choosing + * between MSVCRT.DLL and MinGW fallback implementations; to avoid a + * need to look it up again, within the MinGW fallback, we store the + * result of the initial look up in this file-global variable. + */ +static unsigned int codeset; + +static size_t __mingw_wcrtomb_fallback +( char *restrict mb, wchar_t wc, mbstate_t *__UNUSED_PARAM(ps) ) +# define mbcs_map __mingw_wchar_to_mbcs_map { - if (cp == 0) - { - if (wc > 255) - { - errno = EILSEQ; - return -1; - } - *dst = (char) wc; - return 1; - } - else - { - int invalid_char = 0; + /* Fallback function, providing an implementation of the wcrtomb() + * function, when none is available within the Microsoft runtime. + * + * When mb is a NULL pointer, ISO-C99 decrees that the call shall + * be interpreted as the equivalent of: + * + * wcrtomb( internal_buffer, L'\0', ps ); + * + * with the encoding of the NUL wchar, preceded by any sequence + * of bytes needed to restore ps to the initial shift state, being + * stored in the internal buffer, (and thus, inaccessible to the + * caller). Since Microsoft's MBCS encodings do not use shift + * states, and the encoding for NUL is always a single NUL byte, + * this becomes the equivalent of returning (size_t)(1). 
+ */ + if( mb == NULL ) return (size_t)(1); - int size = WideCharToMultiByte (cp, 0 /* Is this correct flag? */, - &wc, 1, dst, mb_max, - NULL, &invalid_char); - if (size == 0 || invalid_char) - { - errno = EILSEQ; - return -1; - } - return size; - } + /* Otherwise, encode the single wchar passed by value in wc, bounded by + * the effective codeset's longest sequence, (MB_CUR_MAX may be less). + */ + return mbcs_map( codeset, mb, __mb_len_max_for_codeset( codeset ), &wc, 1 ); } -size_t -wcrtomb (char *dst, wchar_t wc, mbstate_t * __UNUSED_PARAM (ps)) +size_t __mingw_wcrtomb( char *restrict mb, wchar_t wc, mbstate_t *restrict ps ) { - char byte_bucket [MB_LEN_MAX]; - char* tmp_dst = dst ? dst : byte_bucket; - return (size_t)__wcrtomb_cp (tmp_dst, wc, get_codepage (), - MB_CUR_MAX); + /* Wrapper for the wcrtomb() function; it will initially attempt + * to delegate the call to a Microsoft-provided implementation, but + * if no such implementation can be found, fall back to the MinGW + * substitute (defined above). + */ + typedef size_t (*redirect_t)( char *restrict, wchar_t, mbstate_t *restrict ); + static redirect_t redirector_hook = NULL; + + /* MSVCRT.DLL's setlocale() cannot reliably handle code pages with + * more than two bytes per code point, (e.g. UTF-7 and UTF-8); thus, + * Microsoft's wcrtomb() is likely to be similarly unreliable, so + * always use the MinGW fallback with such code pages. + */ + if( __mb_len_max_for_codeset( codeset = __mb_codeset_for_locale() ) > 2 ) + return __mingw_wcrtomb_fallback( mb, wc, ps ); + + /* On first time call, we don't know which implementation is to be + * selected; look for a Microsoft implementation, which, if available, + * may be registered for immediate use on this, and any subsequent, + * calls to this function wrapper... + */ + if( (redirector_hook == NULL) + && ((redirector_hook = dlsym( RTLD_DEFAULT, "wcrtomb" )) == NULL) ) + + /* ...but when no Microsoft implementation can be found, register + * the MinGW fall back in its stead. 
+ */ + redirector_hook = __mingw_wcrtomb_fallback; + + /* Finally, delegate the call to whichever implementation has been + * registered on first-time call. + */ + return redirector_hook( mb, wc, ps ); } -size_t wcsrtombs (char *dst, const wchar_t **src, size_t len, - mbstate_t * __UNUSED_PARAM (ps)) -{ - int ret = 0; - size_t n = 0; - const unsigned int cp = get_codepage(); - const unsigned int mb_max = MB_CUR_MAX; - const wchar_t *pwc = *src; - - if (src == NULL || *src == NULL) /* undefined behavior */ - return 0; - - if (dst != NULL) - { - while (n < len) - { - if ((ret = __wcrtomb_cp (dst, *pwc, cp, mb_max)) <= 0) - return (size_t) -1; - n += ret; - dst += ret; - if (*(dst - 1) == '\0') - { - *src = (wchar_t*) NULL;; - return (n - 1); - } - pwc++; - } - *src = pwc; - } - else - { - char byte_bucket [MB_LEN_MAX]; - while (n < len) - { - if ((ret = __wcrtomb_cp (byte_bucket, *pwc, cp, mb_max)) - <= 0) - return (size_t) -1; - n += ret; - if (byte_bucket [ret - 1] == '\0') - return (n - 1); - pwc++; - } - } - - return n; -} +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcsrtombs.c b/mingwrt/mingwex/wcsrtombs.c new file mode 100644 --- /dev/null +++ b/mingwrt/mingwex/wcsrtombs.c @@ -0,0 +1,167 @@ +/* + * wcsrtombs.c + * + * MinGW.org replacement for the wcsrtombs() function; delegates to the + * Microsoft implementation, if available in the C runtime DLL, otherwise + * handles the call locally. 
+ * + * $Id$ + * + * Written by Keith Marshall <keith@users.osdn.me> + * Copyright (C) 2019, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include "wcharmap.h" + +/* For runtime delegation, we need a mechanism for detection of an + * implementation, within the default C runtime DLL; we may use the + * MinGW dlfcn emulation, to facilitate this. + */ +#include <dlfcn.h> + +/* We need to look up the effective working codeset, before choosing + * between MSVCRT.DLL and MinGW fallback implementations; to avoid a + * need to look it up again, within the MinGW fallback, we store the + * result of the initial look up in this file-global variable. 
+ */ +static unsigned int codeset; + +static size_t __mingw_wcsrtombs_fallback +( char *restrict mbs, const wchar_t **restrict wcs, size_t len, mbstate_t *__UNUSED_PARAM(ps) ) +# define mbcs_map __mingw_wchar_to_mbcs_map +{ + /* Fallback function, providing an implementation of the wcsrtombs() + * function, when none is available within the Microsoft runtime. + * + * Initially, identify the code page for which the multibyte encoding + * is required, save the current errno state, so that we may restore + * it on return, clear it to zero for internal checking, and compute + * the size of buffer required to accommodate the conversion. + */ + int errno_reset = save_error_status_and_clear( errno, 0 ); + size_t wanted = mbcs_map( codeset, NULL, 0, *wcs, -1 ); + + if( mbs == NULL ) + /* There is no buffer designated to store the encoded multibyte + * character sequence; we are only interested in the size of the + * buffer which would otherwise be required, and we've already + * determined that, so simply return it. + */ + return (errno == 0) ? errout( errno_reset, wanted - 1 ) : wanted; + + if( (errno == 0) && (len >= wanted) ) + { /* There is an encoding buffer designated, its size is sufficient + * to accommodate the encoding of the entire NUL terminated input + * sequence, and no incipient encoding error was detected; encode + * the entire input sequence, bounded by the wanted size, (since len + * may not fit the int bound argument), and clean up the input state. + */ + len = mbcs_map( codeset, mbs, wanted, *wcs, -1 ) - 1; + *wcs = NULL; + } + + else + { /* There is an encoding buffer designated, but either it is too + * small, or an incipient encoding error has been detected; rescan + * the input sequence, encoding one code point at a time, until we + * either exhaust the encoding buffer space, or we encounter the + * encoding error previously identified. 
+ */ + size_t count = 0, step; errno = 0; + while( (len >= (step = mbcs_map( codeset, NULL, 0, *wcs, 1 ))) && (errno == 0) ) + { + /* There is still sufficient space to store the encoding of one + * more input code point, and no incipient encoding error has yet + * been met; store it, passing its own size as the bound, (len may + * exceed INT_MAX, and must not be narrowed), and adjust for the next. + */ + step = mbcs_map( codeset, mbs, step, (*wcs)++, 1 ); + count += step; len -= step; mbs += step; + } + + /* Check that we didn't fall foul of any incipient encoding error; + * if we did, then we must bail out. + */ + if( errno != 0 ) return (size_t)(-1); + + /* If we're still here, then we've encoded as much of the input + * sequence as we can accommodate; the input pointer has already + * been adjusted, as required, but we must preserve the count of + * cumulatively encoded bytes, for return. + */ + len = count; + } + + /* We have now successfully encoded as much of the input sequence + * as possible, without encountering any encoding error; restore + * the saved errno state, and return the encoded byte count. + */ + return errout( errno_reset, len ); +} + +size_t __mingw_wcsrtombs +( char *mbs, const wchar_t **wcs, size_t len, mbstate_t *ps ) +{ + /* Wrapper for the wcsrtombs() function; it will initially attempt + * to delegate the call to a Microsoft-provided implementation, but + * if no such implementation can be found, fall back to the MinGW + * substitute (defined above). + */ + typedef size_t (*redirect_t)(char *, const wchar_t **, size_t, mbstate_t *); + static redirect_t redirector_hook = NULL; + + /* Neither wcs, nor the pointer to which it refers, may be NULL. + * ISO C doesn't specify any particular outcome for this condition, + * (so a segmentation fault would conform); it makes more sense to + * catch the abnormality, and bail out. 
+ */ + if( (wcs == NULL) || (*wcs == NULL) ) return errout( EINVAL, (size_t)(-1) ); + + /* MSVCRT.DLL's setlocale() cannot reliably handle code pages with + * more than two bytes per code point, (e.g. UTF-7 and UTF-8); thus, + * Microsoft's wcsrtombs() is likely to be similarly unreliable, so + * always use the MinGW fallback with such code pages. + */ + if( __mb_len_max_for_codeset( codeset = __mb_codeset_for_locale() ) > 2 ) + return __mingw_wcsrtombs_fallback( mbs, wcs, len, ps ); + + /* On first time call, we don't know which implementation is to be + * selected; look for a Microsoft implementation, which, if available, + * may be registered for immediate use on this, and any subsequent, + * calls to this function wrapper... + */ + if( (redirector_hook == NULL) + && ((redirector_hook = dlsym( RTLD_DEFAULT, "wcsrtombs" )) == NULL) ) + { + /* ...but when no Microsoft implementation can be found, register + * the MinGW fallback in its stead. + */ + redirector_hook = __mingw_wcsrtombs_fallback; + } + /* Finally, delegate the call to whichever implementation has been + * registered on first-time call. + */ + return redirector_hook( mbs, wcs, len, ps ); +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/msvcrt-xref/msvcrt.def.in b/mingwrt/msvcrt-xref/msvcrt.def.in --- a/mingwrt/msvcrt-xref/msvcrt.def.in +++ b/mingwrt/msvcrt-xref/msvcrt.def.in @@ -3563,11 +3563,11 @@ vwprintf vwprintf_s # if __MSVCRT_VERSION__ >= 0x12000000UL vwscanf vwscanf_s # endif -wcrtomb +__MINGW_DLSYM(wcrtomb) wcrtomb_s # endif #endif wcscat #if __MSVCRT_VERSION__ >= 0x0600UL @@ -3603,11 +3603,11 @@ wcsnlen #endif wcspbrk wcsrchr #if __MSVCRT_VERSION__ >= 0x0600UL # if __MSVCRT_VERSION__ < 0x07000000UL || __MSVCRT_VERSION__ >= 0x08000000UL -wcsrtombs +__MINGW_DLSYM(wcsrtombs) wcsrtombs_s # endif #endif wcsspn wcsstr