std::string 和 std::wstring、UTF-8 和 UTF-16 之间的转换
如何安全地在 Unicode 格式之间转换 STL 字符串。
引言
我需要在 UTF-8 编码的 std::string 和 UTF-16 编码的 std::wstring 之间相互转换。我找到了一些针对原生 C 字符串的转换函数,但这些函数把内存管理留给了调用者,这在现代编程中并不可取。
最好的转换器可能是 unicode.org 提供的那个。本文为它提供了一个封装,用于转换 STL 字符串。
与其他文章不同,本文没有其他依赖项,也没有引入新的字符串类,只转换 STL 字符串,仅此而已。而且它比广泛流传的...
std::wstring widestring(sourcestring.begin(), sourcestring.end());
... 更好,后者只是把每个字节逐一扩展成一个宽字符,因此只能正确处理 ASCII 文本。
源代码
头文件如下所示
#ifndef UTFCONVERTER_H
#define UTFCONVERTER_H

// The declarations below use std::string / std::wstring, so the header
// includes <string> itself instead of relying on the includer.
#include <string>

// Conversions between UTF-8 encoded std::string and wide std::wstring.
// The wide encoding follows the platform's wchar_t: UTF-16 when wchar_t is
// 2 bytes wide, UTF-32 when it is 4 bytes wide.
namespace UtfConverter
{
    // Decodes a UTF-8 byte string into a wide string.
    // Throws an exception derived from std::exception when the conversion
    // fails or the size of wchar_t is not supported.
    std::wstring FromUtf8(const std::string& utf8string);

    // Encodes a wide string as UTF-8.
    // Throws an exception derived from std::exception when the conversion
    // fails or the size of wchar_t is not supported.
    std::string ToUtf8(const std::wstring& widestring);
}

#endif // UTFCONVERTER_H
我认为它简单易用。
这是源代码
#include "stdafx.h"
#include "UtfConverter.h"
#include <stdexcept>
#include <string>
#include <vector>
#include "ConvertUTF.h"

// Buffered implementation: the converter writes into a temporary
// std::vector, and the result string is built from exactly the code units
// that were produced.
namespace UtfConverter
{
    // Decodes a UTF-8 byte string into a wide string (UTF-16 when wchar_t is
    // 2 bytes wide, UTF-32 when it is 4 bytes wide).
    //
    // utf8string: UTF-8 input; it may be empty and may contain embedded NULs.
    // Returns the converted text; its size() is the number of code units
    // produced by the converter.
    // Throws std::runtime_error when the converter reports anything other
    // than conversionOK, or when wchar_t does not match a supported
    // code-unit size.
    std::wstring FromUtf8(const std::string& utf8string)
    {
        const std::string::size_type sourcesize = utf8string.size();
        const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.data());
        const UTF8* const sourceend = sourcestart + sourcesize;

        // Every UTF-16 or UTF-32 code unit is produced from at least one
        // UTF-8 byte, so sourcesize elements always suffice. The extra
        // element keeps &buffer[0] valid for empty input.
        std::vector<wchar_t> buffer(sourcesize + 1);

        // The buffer is reinterpreted as the converter's code-unit type, so
        // the branch is only taken when that type really has wchar_t's size.
        // NOTE(review): some ConvertUTF.h revisions are believed to typedef
        // UTF32 as unsigned long (8 bytes on LP64 systems) - confirm against
        // the header in use.
        if (sizeof(wchar_t) == 2 && sizeof(UTF16) == 2)
        {
            UTF16* const bufferstart = reinterpret_cast<UTF16*>(&buffer[0]);
            UTF16* targetstart = bufferstart;
            UTF16* const targetend = bufferstart + buffer.size();
            const ConversionResult res = ConvertUTF8toUTF16(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            // Pointer + length keeps embedded NULs and needs no terminator.
            return std::wstring(
                &buffer[0],
                static_cast<std::wstring::size_type>(targetstart - bufferstart));
        }
        else if (sizeof(wchar_t) == 4 && sizeof(UTF32) == 4)
        {
            UTF32* const bufferstart = reinterpret_cast<UTF32*>(&buffer[0]);
            UTF32* targetstart = bufferstart;
            UTF32* const targetend = bufferstart + buffer.size();
            const ConversionResult res = ConvertUTF8toUTF32(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            return std::wstring(
                &buffer[0],
                static_cast<std::wstring::size_type>(targetstart - bufferstart));
        }
        else
        {
            // Unsupported wchar_t / code-unit size combination.
            throw std::runtime_error("La falla!");
        }
    }

    // Encodes a wide string (UTF-16 when wchar_t is 2 bytes wide, UTF-32
    // when it is 4 bytes wide) as UTF-8.
    //
    // widestring: wide input; it may be empty and may contain embedded NULs.
    // Returns the UTF-8 bytes; size() is the number of bytes produced by the
    // converter.
    // Throws std::runtime_error when the converter reports anything other
    // than conversionOK, or when wchar_t does not match a supported
    // code-unit size.
    std::string ToUtf8(const std::wstring& widestring)
    {
        const std::wstring::size_type widesize = widestring.size();

        if (sizeof(wchar_t) == 2 && sizeof(UTF16) == 2)
        {
            // One UTF-16 code unit encodes to at most 3 UTF-8 bytes (a
            // surrogate pair is 2 units -> 4 bytes); +1 keeps &buffer[0]
            // valid for empty input.
            std::vector<char> buffer(3 * widesize + 1);
            const UTF16* sourcestart =
                reinterpret_cast<const UTF16*>(widestring.data());
            const UTF16* const sourceend = sourcestart + widesize;
            UTF8* const bufferstart = reinterpret_cast<UTF8*>(&buffer[0]);
            UTF8* targetstart = bufferstart;
            UTF8* const targetend = bufferstart + buffer.size();
            const ConversionResult res = ConvertUTF16toUTF8(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            return std::string(
                &buffer[0],
                static_cast<std::string::size_type>(targetstart - bufferstart));
        }
        else if (sizeof(wchar_t) == 4 && sizeof(UTF32) == 4)
        {
            // One UTF-32 code unit encodes to at most 4 UTF-8 bytes.
            std::vector<char> buffer(4 * widesize + 1);
            const UTF32* sourcestart =
                reinterpret_cast<const UTF32*>(widestring.data());
            const UTF32* const sourceend = sourcestart + widesize;
            UTF8* const bufferstart = reinterpret_cast<UTF8*>(&buffer[0]);
            UTF8* targetstart = bufferstart;
            UTF8* const targetend = bufferstart + buffer.size();
            const ConversionResult res = ConvertUTF32toUTF8(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            return std::string(
                &buffer[0],
                static_cast<std::string::size_type>(targetstart - bufferstart));
        }
        else
        {
            // Unsupported wchar_t / code-unit size combination.
            throw std::runtime_error("La falla!");
        }
    }
}
更好的做法
这是另一个版本,它直接写入 string 的缓冲区,从而避免使用 new 和 delete。有人知道这样做是否可行吗?(自 C++11 起,标准保证 std::basic_string 的存储是连续的,因此通过 &str[0] 写入缓冲区是合法的。)
#include "stdafx.h"
#include "UtfConverter.h"
#include <stdexcept>
#include <string>
#include "ConvertUTF.h"

// In-place implementation: the converter writes straight into the result
// string's own storage, which is then shrunk to the number of code units
// actually produced.
namespace UtfConverter
{
    // Decodes a UTF-8 byte string into a wide string (UTF-16 when wchar_t is
    // 2 bytes wide, UTF-32 when it is 4 bytes wide).
    //
    // utf8string: UTF-8 input; it may be empty and may contain embedded NULs.
    // Returns the converted text; its size() is the number of code units
    // produced by the converter (no trailing padding).
    // Throws std::runtime_error when the converter reports anything other
    // than conversionOK, or when wchar_t does not match a supported
    // code-unit size.
    std::wstring FromUtf8(const std::string& utf8string)
    {
        const std::string::size_type sourcesize = utf8string.size();
        const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.data());
        const UTF8* const sourceend = sourcestart + sourcesize;

        // Every UTF-16 or UTF-32 code unit is produced from at least one
        // UTF-8 byte, so sourcesize elements always suffice. The extra
        // element keeps &resultstring[0] valid for empty input.
        std::wstring resultstring(sourcesize + 1, L'\0');

        // The string storage is reinterpreted as the converter's code-unit
        // type, so the branch is only taken when that type really has
        // wchar_t's size.
        // NOTE(review): some ConvertUTF.h revisions are believed to typedef
        // UTF32 as unsigned long (8 bytes on LP64 systems) - confirm against
        // the header in use.
        if (sizeof(wchar_t) == 2 && sizeof(UTF16) == 2)
        {
            UTF16* const bufferstart = reinterpret_cast<UTF16*>(&resultstring[0]);
            UTF16* targetstart = bufferstart;
            UTF16* const targetend = bufferstart + sourcesize;
            const ConversionResult res = ConvertUTF8toUTF16(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            // Drop the unused tail so size() reflects the converted text.
            resultstring.resize(
                static_cast<std::wstring::size_type>(targetstart - bufferstart));
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4 && sizeof(UTF32) == 4)
        {
            UTF32* const bufferstart = reinterpret_cast<UTF32*>(&resultstring[0]);
            UTF32* targetstart = bufferstart;
            UTF32* const targetend = bufferstart + sourcesize;
            const ConversionResult res = ConvertUTF8toUTF32(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            resultstring.resize(
                static_cast<std::wstring::size_type>(targetstart - bufferstart));
            return resultstring;
        }
        else
        {
            // Unsupported wchar_t / code-unit size combination.
            throw std::runtime_error("La falla!");
        }
    }

    // Encodes a wide string (UTF-16 when wchar_t is 2 bytes wide, UTF-32
    // when it is 4 bytes wide) as UTF-8.
    //
    // widestring: wide input; it may be empty and may contain embedded NULs.
    // Returns the UTF-8 bytes; size() is the number of bytes produced by the
    // converter (no trailing padding).
    // Throws std::runtime_error when the converter reports anything other
    // than conversionOK, or when wchar_t does not match a supported
    // code-unit size.
    std::string ToUtf8(const std::wstring& widestring)
    {
        const std::wstring::size_type widesize = widestring.size();

        if (sizeof(wchar_t) == 2 && sizeof(UTF16) == 2)
        {
            // One UTF-16 code unit encodes to at most 3 UTF-8 bytes (a
            // surrogate pair is 2 units -> 4 bytes); +1 keeps
            // &resultstring[0] valid for empty input.
            const std::string::size_type utf8size = 3 * widesize + 1;
            std::string resultstring(utf8size, '\0');
            const UTF16* sourcestart =
                reinterpret_cast<const UTF16*>(widestring.data());
            const UTF16* const sourceend = sourcestart + widesize;
            UTF8* const bufferstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetstart = bufferstart;
            UTF8* const targetend = bufferstart + utf8size;
            const ConversionResult res = ConvertUTF16toUTF8(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            // Drop the unused tail so size() reflects the converted text.
            resultstring.resize(
                static_cast<std::string::size_type>(targetstart - bufferstart));
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4 && sizeof(UTF32) == 4)
        {
            // One UTF-32 code unit encodes to at most 4 UTF-8 bytes.
            const std::string::size_type utf8size = 4 * widesize + 1;
            std::string resultstring(utf8size, '\0');
            const UTF32* sourcestart =
                reinterpret_cast<const UTF32*>(widestring.data());
            const UTF32* const sourceend = sourcestart + widesize;
            UTF8* const bufferstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetstart = bufferstart;
            UTF8* const targetend = bufferstart + utf8size;
            const ConversionResult res = ConvertUTF32toUTF8(
                &sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::runtime_error("La falla!");
            }
            resultstring.resize(
                static_cast<std::string::size_type>(targetstart - bufferstart));
            return resultstring;
        }
        else
        {
            // Unsupported wchar_t / code-unit size combination.
            throw std::runtime_error("La falla!");
        }
    }
}
如何使用它
只需将其添加到您的项目中即可。同时还需要从 unicode.org 下载 Unicode 转换器(ConvertUTF.h 及其对应的实现文件),并一并添加到项目中。这样它就可以正常工作了。
当然,您可以在失败时抛出任何您喜欢的异常。
我承认我只在 wchar_t 为 2 字节的平台上进行过测试。
欢迎提出评论。