/*
 * $Id: unicode_helper.c 9700 2007-07-04 14:12:58Z mjevans $
 *
 * Helpers converting between Perl strings (UTF-8) and UTF-16 buffers.
 * On WIN32 the Win32 code-page API is used; elsewhere the conversion is
 * done with the ConvertUTF16toUTF8()/ConvertUTF8toUTF16() routines
 * declared in ConvertUTF.h.
 */
#ifdef WITH_UNICODE

#include "ODBC.h"
/*
 * NOTE(review): the header name of this #include was missing in the copy
 * under review. <string.h> is assumed because strlen() is used below --
 * confirm against the upstream file.
 */
#include <string.h>
#include "ConvertUTF.h"

/* How _dosvwv() delivers the converted string. */
typedef enum {
    do_new = 1,     /* create a new SV                */
    do_cat,         /* append to the SV passed in     */
    do_set          /* overwrite the SV passed in     */
} new_cat_set_t;

/* static prototypes */
static STRLEN utf16_len(const UTF16 *wp);
static void utf16_copy(UTF16 *d, const UTF16 *s);
static SV *_dosvwv(SV *sv, UTF16 *wp, STRLEN len, new_cat_set_t mode);

/*
 * Convert a UTF-16 buffer to UTF-8 and store it in an SV.
 *
 * If len != (STRLEN)-1, wp is an array of len wide characters without a
 * termination character.
 * If len == (STRLEN)-1, wp is a null-terminated wide string.
 *
 * mode selects whether a new SV is created (do_new), or whether the
 * converted text is appended to (do_cat) or assigned to (do_set) sv.
 * The UTF8 flag is switched on whenever converted text is non-empty.
 * For an empty result it is switched off, except in do_cat mode where
 * the existing flag is left alone.
 *
 * Returns the SV that received the text. Croaks on conversion failure.
 */
static SV *_dosvwv(SV *sv, UTF16 *wp, STRLEN len, new_cat_set_t mode)
{
    char *p = NULL;
    STRLEN svlen;

#ifdef WIN32
    int bytes;

    /* First call only computes the number of bytes required. */
    bytes = WideCharToMultiByte(CP_UTF8, 0, wp, (int)len, NULL, 0, NULL, NULL);
    Newz(0, p, 1 + bytes, char);    /* zero-filled, one spare byte for NUL */
    if (bytes != 0) {
        if (!WideCharToMultiByte(CP_UTF8, 0, wp, (int)len, p, bytes, NULL, NULL)) {
            int err = GetLastError();

            Safefree(p);            /* croak() does not return */
            switch (err) {
            case ERROR_INSUFFICIENT_BUFFER:
                croak("_dosvwv: WideCharToMultiByte() failed: insufficient buffer");
            case ERROR_INVALID_FLAGS:
                croak("_dosvwv: WideCharToMultiByte() failed: invalid flags");
            case ERROR_INVALID_PARAMETER:
                croak("_dosvwv: WideCharToMultiByte() failed: invalid parameter");
            default:
                croak("_dosvwv: WideCharToMultiByte() failed: error code %i", err);
            }
        }
    }
    /* With len == -1 the byte count includes the terminator, so measure. */
    svlen = (len == (STRLEN)-1 ? strlen(p) : (STRLEN)bytes);
#else
    unsigned int bytes;

    if (len == (STRLEN)-1) {
        len = utf16_len(wp);
    }

    if (len > 0) {
        ConversionResult ret;
        UTF16 *source_start = wp;
        UTF16 *source_end = source_start + len;
        UTF8 *target_start;
        UTF8 *target_end;

        /* Test conversion and find size of buffer we need */
        ret = ConvertUTF16toUTF8((const UTF16 **)&source_start, source_end,
                                 NULL, NULL, strictConversion, &bytes);
        if (ret != conversionOK) {
            if (ret == sourceExhausted) {
                croak("_dosvwc: Partial character in input");
            } else if (ret == targetExhausted) {
                croak("_dosvwc: target buffer exhausted");
            } else if (ret == sourceIllegal) {
                croak("_dosvwc: malformed/illegal source sequence");
            } else {
                croak("_dosvwc: unknown ConvertUTF16toUTF8 error");
            }
        }

        Newz(0, p, bytes + 1, char);

        /* convert UTF16 to UTF8 */
        target_start = (UTF8 *)p;
        target_end = target_start + bytes;
        source_start = wp;
        source_end = source_start + len;
        ret = ConvertUTF16toUTF8((const UTF16 **)&source_start, source_end,
                                 &target_start, target_end,
                                 strictConversion, &bytes);
        if (ret != conversionOK) {
            Safefree(p);            /* croak() does not return */
            croak("_dosvwc: second call to ConvertUTF16toUTF8 failed (%d)", ret);
        }
        svlen = bytes;
    } else {
        /*
         * Empty input: still hand out a valid (empty) buffer so the code
         * below never sees a NULL pointer, matching the WIN32 branch.
         */
        Newz(0, p, 1, char);
        svlen = 0;
    }
#endif

    switch (mode) {
    case do_new:
        sv = newSVpvn(p, svlen);
        break;
    case do_cat:
        sv_catpvn(sv, p, svlen);
        break;
    case do_set:
        sv_setpvn(sv, p, svlen);
        break;
    default:
        Safefree(p);
        croak("_dosvwv called with bad mode value");
    }

    /*
     * Decide on the length, not on the first byte: a counted buffer may
     * legitimately start with U+0000 and still carry non-ASCII text.
     */
    if (svlen > 0) {
        SvUTF8_on(sv);
    } else if (mode != do_cat) {
        /* Don't switch off UTF8 just because we *APPENDED* an empty
           string! sv may still be UTF8. */
        SvUTF8_off(sv);
    }

    Safefree(p);
    return sv;
}

/*
 * Set the string value of an SV* to a representation of a UTF16 * value,
 * similar to sv_setpvn() and sv_setpv().
 * SV contains UTF-8 representation of wp, has UTF8-Flag on except for
 * empty strings.
 *
 * wp is an array of len wide characters without a termination character.
 * A NULL wp is passed straight to sv_setpvn(sv, NULL, len).
 */
void sv_setwvn(SV *sv, UTF16 *wp, STRLEN len)
{
    if (wp == NULL) {
        sv_setpvn(sv, NULL, len);
        return;
    }
    if (len == 0) {
        sv_setpvn(sv, "", 0);
        return;
    }
    _dosvwv(sv, wp, len, do_set);
}

/*
 * Get a UTF16 * representation of a char *.
 * The representation is a converted copy, so the result needs to be freed
 * using WVfree().
 * s == NULL is handled properly (NULL is returned).
 *
 * Does not handle byte arrays, only null-terminated strings.
 * Croaks if the non-WIN32 conversion reports an error.
 */
UTF16 *WValloc(char *s)
{
    UTF16 *buf = NULL;

    if (NULL != s) {
#ifdef WIN32
        /* Length -1: the count returned includes the terminator. */
        int widechars = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);

        Newz(0, buf, widechars + 1, UTF16);
        if (widechars != 0) {
            MultiByteToWideChar(CP_UTF8, 0, s, -1, buf, widechars);
        }
#else
        unsigned int widechrs, bytes;
        size_t slen;
        ConversionResult ret;
        UTF8 *source_start, *source_end;
        UTF16 *target_start, *target_end;

        slen = strlen(s);

        /* First pass: no target buffer, only obtain the size in bytes. */
        source_start = (UTF8 *)s;
        source_end = source_start + slen + 1;   /* include NUL terminator */
        ret = ConvertUTF8toUTF16((const UTF8 **)&source_start, source_end,
                                 NULL, NULL, strictConversion, &bytes);
        if (ret != conversionOK) {
            if (ret == sourceExhausted) {
                croak("WValloc: Partial character in input");
            } else if (ret == targetExhausted) {
                croak("WValloc: target buffer exhausted");
            } else if (ret == sourceIllegal) {
                croak("WValloc: malformed/illegal source sequence");
            } else {
                croak("WValloc: unknown ConvertUTF8toUTF16 error");
            }
        }

        widechrs = bytes / sizeof(UTF16);
        Newz(0, buf, widechrs + 1, UTF16);
        if (widechrs != 0) {
            /* Second pass: the real conversion into buf. */
            source_start = (UTF8 *)s;
            source_end = source_start + slen + 1;
            target_start = buf;
            target_end = buf + widechrs + 1;
            ret = ConvertUTF8toUTF16((const UTF8 **)&source_start, source_end,
                                     &target_start, target_end,
                                     strictConversion, &bytes);
            if (ret != conversionOK) {
                Safefree(buf);      /* croak() does not return */
                croak("WValloc: second call to ConvertUTF8toUTF16 failed (%d)", ret);
            }
        }
#endif
    }
    return buf;
}

/*
 * Free a UTF16 * representation of a char *.
 * Used to free the return values of WValloc().
 * wp == NULL is handled properly.
 */
void WVfree(UTF16 *wp)
{
    if (wp != NULL) {
        Safefree(wp);
    }
}

/*
 * Get a char * (UTF-8) representation of a UTF16 *.
 * The representation is a converted copy, so the result needs to be freed
 * using PVfreeW().
 * wp == NULL is handled properly (NULL is returned).
 *
 * Does not handle byte arrays, only null-terminated strings.
 * Croaks on conversion failure.
 */
char *PVallocW(UTF16 *wp)
{
    char *p = NULL;

    if (wp != NULL) {
#ifdef WIN32
        /* Length -1: the byte count returned includes the terminator. */
        int bytes = WideCharToMultiByte(CP_UTF8, 0, wp, -1, NULL, 0, NULL, NULL);

        Newz(0, p, bytes + 1, char);
        if (!WideCharToMultiByte(CP_UTF8, 0, wp, -1, p, bytes, NULL, NULL)) {
            Safefree(p);            /* croak() does not return */
            croak("WideCharToMultiByte() failed");
        }
#else
        ConversionResult ret;
        UTF16 *source_start;
        UTF16 *source_end;
        UTF8 *target_start;
        UTF8 *target_end;
        unsigned int bytes;
        STRLEN len = utf16_len(wp);     /* code units, terminator excluded */

        /* First pass: no target buffer, only obtain the size in bytes. */
        source_start = wp;
        source_end = source_start + len;
        ret = ConvertUTF16toUTF8((const UTF16 **)&source_start, source_end,
                                 NULL, NULL, strictConversion, &bytes);
        if (ret != conversionOK) {
            if (ret == sourceExhausted) {
                croak("PVallocW: Partial character in input");
            } else if (ret == targetExhausted) {
                croak("PVallocW: target buffer exhausted");
            } else if (ret == sourceIllegal) {
                croak("PVallocW: malformed/illegal source sequence");
            } else {
                croak("PVallocW: unknown ConvertUTF16toUTF8 error");
            }
        }

        /*
         * The terminator is not part of the converted range, so reserve
         * one extra zero-filled byte to keep the result NUL-terminated.
         */
        Newz(0, p, bytes + 1, char);

        target_start = (UTF8 *)p;
        target_end = target_start + bytes;
        source_start = wp;
        source_end = source_start + len;
        ret = ConvertUTF16toUTF8((const UTF16 **)&source_start, source_end,
                                 &target_start, target_end,
                                 strictConversion, &bytes);
        if (ret != conversionOK) {
            Safefree(p);            /* croak() does not return */
            croak("PVallocW: second call to ConvertUTF16toUTF8 failed (%d)", ret);
        }
#endif
    }
    return p;
}

/*
 * Free a char * representation of a UTF16 *.
 * Used to free the return value of PVallocW().
 * s == NULL is handled properly.
 */
void PVfreeW(char *s)
{
    if (s != NULL) {
        Safefree(s);
    }
}

/*
 * Mutate an SV's PV INPLACE to contain UTF-16. Does not handle byte arrays,
 * only null-terminated strings.
 * Turns the UTF8 flag OFF unconditionally, because SV becomes a byte array
 * (for Perl).
 * An SV that is not SvOK is left untouched.
 */
void SV_toWCHAR(SV *sv)
{
    STRLEN len;
    UTF16 *wp;
    char *p;

    if (!SvOK(sv)) {
        /* warn("SV_toWCHAR called for undef"); */
        return;
    }

    /* _force makes sure SV is only a string */
    p = SvPVutf8_force(sv, len);

    wp = WValloc(p);
    len = utf16_len(wp);

    /* Room for len code units plus the terminator utf16_copy() writes. */
    p = SvGROW(sv, sizeof(UTF16) * (1 + len));
    utf16_copy((UTF16 *)p, wp);
    SvCUR_set(sv, sizeof(UTF16) * len);

    WVfree(wp);

    /* sv is nothing but a non-UTF8 string -- for Perl ;-) */
    SvPOK_only(sv);
}

/*
 * Number of UTF-16 code units before the terminating 0.
 * Returns 0 for a NULL pointer. The result is a STRLEN so that long
 * strings are not truncated.
 */
static STRLEN utf16_len(const UTF16 *wp)
{
    STRLEN len = 0;

    if (!wp) {
        return 0;
    }
    while (wp[len] != 0) {
        len++;
    }
    return len;
}

/*
 * Copy the null-terminated UTF-16 string s to d, terminator included.
 * d must have room for utf16_len(s) + 1 code units.
 */
static void utf16_copy(UTF16 *d, const UTF16 *s)
{
    while (*s) {
        *d++ = *s++;
    }
    *d = 0;
}

#endif /* WITH_UNICODE */