赞
踩
Linux string conversion from UTF-8 to UNICODE, UCS-2LE, and UCS-4LE.
Windows developers may be surprised that Linux differs from Windows in two distinct ways when it comes to character sets.
1. A standard char * string is UTF-8 encoded by default, so strlen may be greater than 2*(number of Chinese characters) + (number of English characters).
2. Each wchar_t in a UNICODE (wide) string is 4 bytes long.
See the example below (Ubuntu 16.04):
#include <iostream>
using namespace std;
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iconv.h>
#include <wchar.h> //wprintf is here!!!!!!!!
/*
 * Dump a byte buffer to stdout as hexadecimal.
 *
 * str  - first byte of the buffer to dump.
 * iLen - number of bytes to print; nothing but the newline is printed
 *        when it is zero or negative.
 *
 * Every byte is written as two lowercase hex digits followed by a single
 * space, and the whole dump is terminated by one newline.
 */
void PrintHexData( unsigned char *str, int iLen )
{
    const unsigned char *cursor = str;
    int remaining = iLen;

    while ( remaining > 0 )
    {
        printf( "%02x ", *cursor );
        ++cursor;
        --remaining;
    }
    putchar( '\n' );
}
/*
 * Demo: convert a UTF-8 string to several wide encodings with iconv and
 * hex-dump each result, then hex-dump the compiler-generated wchar_t
 * literal of the same text for comparison.
 *
 * Returns 0. Failures of iconv_open()/iconv() are reported on stderr via
 * perror() and do not abort the remaining conversions.
 */
int main()
{
    char szName[128] = "Linux-7字符串转换测试";
    wchar_t wzName[128] = L"Linux-7字符串转换测试";
    wchar_t szTransName[128];   // raw output buffer for iconv, used as bytes
    const char * strCharSet[] = { "UCS-2LE", "UCS-4LE", "UNICODE" };
    const size_t nCharSets = sizeof( strCharSet ) / sizeof( strCharSet[0] );

    std::cout << "char size : " << sizeof(char) << " Bytes" << std::endl;
    std::cout << "wchar_t size : " << sizeof(wchar_t) << " Bytes" << std::endl;
    std::cout << "RAND_MAX : " << RAND_MAX << std::endl;

    // strlen counts bytes, not characters: for UTF-8 text containing
    // non-ASCII characters it is larger than the character count.
    size_t iNameByteLen = strlen( szName );
    std::cout << "String: " << szName << std::endl;
    std::cout << " length of strlen: " << iNameByteLen << std::endl;

    // wcslen counts wchar_t elements; the element size is printed above.
    printf( " length wcslen of wchar_t %zu \n", wcslen( wzName ) );

    std::cout << "string: " << szName << " from utf-8 conversion result:" << std::endl;
    for ( size_t iset = 0; iset < nCharSets; iset++ )
    {
        std::cout << "dst char set: " << strCharSet[iset] << std::endl;

        iconv_t ct = iconv_open( strCharSet[iset], "utf-8" );
        if ( ct == (iconv_t)-1 )
        {
            // Unsupported conversion or resource failure: say so instead of
            // silently skipping this character set.
            perror( "iconv_open" );
            continue;
        }

        // iconv advances the pointers and decrements the remaining-byte
        // counters, so keep the starting sizes for the report below.
        char * s_in = szName;
        char * s_out = (char *)szTransName;
        size_t iInLen = strlen( szName );
        size_t iOutLen = sizeof( szTransName );
        const size_t iInLen1 = iInLen;
        const size_t iOutLen1 = iOutLen;

        // Fill with 0xff so bytes iconv did not write stand out in a dump.
        memset( szTransName, 0xff, sizeof( szTransName ) );

        // iconv signals failure by returning (size_t)-1 and setting errno;
        // it never throws, so the result must be checked explicitly.
        size_t iconved = iconv( ct, &s_in, &iInLen, &s_out, &iOutLen );
        if ( iconved == (size_t)-1 )
        {
            perror( "iconv" );
        }
        iconv_close( ct );

        // Number of bytes iconv produced (possibly partial after an error).
        size_t iConvBytes = iOutLen1 - iOutLen;
        long lConvRet = ( iconved == (size_t)-1 ) ? -1L : (long)iconved;
        printf( "ICONV in len: %zu=>%zu out len %zu=>%zu (%zu) conv ret: %ld\n",
                iInLen1, iInLen, iOutLen1, iOutLen, iConvBytes, lConvRet );

        PrintHexData( (unsigned char *)szTransName, (int)iConvBytes );
    }

    // Reference: the in-memory bytes of the wchar_t literal itself.
    printf( "expected Unicode linux string : \n" );
    PrintHexData( (unsigned char *)wzName, (int)( wcslen( wzName ) * sizeof(wchar_t) ) );

    getchar();   // wait for a key press before exiting
    return 0;
}
running result:
char size : 1 Bytes
wchar_t size : 4 Bytes
RAND_MAX : 2147483647
String: Linux-7字符串转换测试
length of strlen: 28
length wcslen of wchar_t 14
string: Linux-7字符串转换测试 from utf-8 conversion result:
dst char set: UCS-2LE
ICONV in len: 28=>0 out len 512=>484 (28) conv ret: 0
4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b
dst char set: UCS-4LE
ICONV in len: 28=>0 out len 512=>456 (56) conv ret: 0
4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00
dst char set: UNICODE
ICONV in len: 28=>0 out len 512=>482 (30) conv ret: 0
ff fe 4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b
expected Unicode linux string :
4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。