感谢老师分享的源码和思路,我在实际使用过程中发现有一处小BUG:
[C] 纯文本查看 复制代码 if (len + 2 < gbk_size)
{
*_pOut++ = code1;
*_pOut++ = code2;
len += 2;
}
应改为:
[C] 纯文本查看 复制代码 if (len + 2 <= gbk_size)
{
*_pOut++ = code1;
*_pOut++ = code2;
len += 2;
}
否则,当最后一个字符转换后的两个字节恰好填满缓冲区(即 len + 2 == gbk_size)时,判断条件不成立,会直接跳出,导致该字符丢失。
同时根据实际项目需求,参照老师的思路,写了一个GBK转UTF-8的函数,有需要的小伙伴可以参考一下:
[C] 纯文本查看 复制代码 /*
*********************************************************************************************************
* 函 数 名: StrGBKToUTF8
* 功能说明: 将GBK字符串转换UTF8字符串
* 形 参: utf8 - 输出字符串
* gbk - 输入字符串
* wGBKSize - GBK字符串长度
* 返 回 值: 字符串
*********************************************************************************************************
*/
char *StrGBKToUTF8(char *utf8, char *gbk, U16 wGBKSize)
{
U16 wGBKCode = 0U;
U32 dwUnicode = 0U;
char *_ptr;
char *_pOut;
U16 wLen = 0;
_ptr = gbk;
_pOut = utf8;
/* 开始循环处理字符 */
while((*_ptr != 0) && (wLen < wGBKSize))
{
/* 读取字符串数据, 该数据可能是ascii代码,也可能汉字代码的高字节 */
wGBKCode = *_ptr & 0xFFU;
if(IsDBCS1(wGBKCode))
{
/* 汉字,解析两个字节 */
wGBKCode = (wGBKCode << 8U) | *(++_ptr);
wLen++;
}
dwUnicode = ff_convert(wGBKCode, 1U);
if (dwUnicode <= 0x0000007F)
{
/* U-00000000 - U-0000007F: 0xxxxxxx */
*_pOut = (dwUnicode & 0x7F);
_pOut += 1U;
}
else if (dwUnicode >= 0x00000080 && dwUnicode <= 0x000007FF)
{
/* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
*(_pOut + 1) = (dwUnicode & 0x3F) | 0x80;
*_pOut = ((dwUnicode >> 6) & 0x1F) | 0xC0;
_pOut += 2U;
}
else if (dwUnicode >= 0x00000800 && dwUnicode <= 0x0000FFFF)
{
/* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
*(_pOut + 2) = (dwUnicode & 0x3F) | 0x80;
*(_pOut + 1) = ((dwUnicode >> 6) & 0x3F) | 0x80;
*_pOut = ((dwUnicode >> 12) & 0x0F) | 0xE0;
_pOut += 3U;
}
else if (dwUnicode >= 0x00010000 && dwUnicode <= 0x001FFFFF)
{
/* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
*(_pOut + 3) = (dwUnicode & 0x3F) | 0x80;
*(_pOut + 2) = ((dwUnicode >> 6) & 0x3F) | 0x80;
*(_pOut + 1) = ((dwUnicode >> 12) & 0x3F) | 0x80;
*_pOut = ((dwUnicode >> 18) & 0x07) | 0xF0;
_pOut += 4U;
}
else if (dwUnicode >= 0x00200000 && dwUnicode <= 0x03FFFFFF)
{
/* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
*(_pOut + 4) = (dwUnicode & 0x3F) | 0x80;
*(_pOut + 3) = ((dwUnicode >> 6) & 0x3F) | 0x80;
*(_pOut + 2) = ((dwUnicode >> 12) & 0x3F) | 0x80;
*(_pOut + 1) = ((dwUnicode >> 18) & 0x3F) | 0x80;
*_pOut = ((dwUnicode >> 24) & 0x03) | 0xF8;
_pOut += 5U;
}
else if (dwUnicode >= 0x04000000 && dwUnicode <= 0x7FFFFFFF)
{
/* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
*(_pOut + 5) = (dwUnicode & 0x3F) | 0x80;
*(_pOut + 4) = ((dwUnicode >> 6) & 0x3F) | 0x80;
*(_pOut + 3) = ((dwUnicode >> 12) & 0x3F) | 0x80;
*(_pOut + 2) = ((dwUnicode >> 18) & 0x3F) | 0x80;
*(_pOut + 1) = ((dwUnicode >> 24) & 0x3F) | 0x80;
*_pOut = ((dwUnicode >> 30) & 0x01) | 0xFC;
_pOut += 6U;
}
_ptr++;
wLen++;
}
*_pOut = 0;
return utf8;
} |