65.9K
CodeProject 正在变化。 阅读更多。
Home

std::string 和 std::wstring、UTF-8 和 UTF-16 之间的转换

starIconstarIconstarIcon
emptyStarIcon
starIcon
emptyStarIcon

3.60/5 (15 票)

2007年2月9日

CPOL
viewsIcon

338220

downloadIcon

2862

如何安全地在 Unicode 格式之间转换 STL 字符串。

引言

我需要将 UTF-8 编码的 std::string 和 UTF-16 编码的 std::wstring 相互转换。我找到了一些用于原生 C string 的转换函数,但这些函数将内存管理留给调用者。在现代编程中,这不太好。

最好的转换器可能是来自 unicode.org 的那个。这里提供一个围绕它的封装器,用于转换 STL string。

与其他文章不同,这篇文章没有其他依赖项,没有引入另一个 string 类,只转换 STL string,仅此而已。而且它比广泛流传的...

// Widens each char of sourcestring independently, so it is only correct for ASCII text.
std::wstring widestring(sourcestring.begin(), sourcestring.end()); 

... 更好,后者只能处理 ASCII 文本。

源代码

头文件如下所示

#ifndef UTFCONVERTER__H__
#define UTFCONVERTER__H__

// Needed for std::string / std::wstring so this header compiles on its own,
// independent of what the including file happened to include before it.
#include <string>

/// Conversions between UTF-8 encoded std::string and wide std::wstring.
namespace UtfConverter
{
    /// Converts a UTF-8 encoded string to a wide string.
    ///
    /// The result is encoded as UTF-16 when wchar_t is 2 bytes wide and as
    /// UTF-32 when wchar_t is 4 bytes wide.
    ///
    /// @param utf8string  UTF-8 encoded input.
    /// @return The converted wide string.
    /// @throws std::exception if the input cannot be converted or wchar_t has
    ///         an unsupported size.
    std::wstring FromUtf8(const std::string& utf8string);

    /// Converts a wide string to a UTF-8 encoded string.
    ///
    /// The input is read as UTF-16 when wchar_t is 2 bytes wide and as UTF-32
    /// when wchar_t is 4 bytes wide.
    ///
    /// @param widestring  Wide-character input.
    /// @return The UTF-8 encoded result.
    /// @throws std::exception if the input cannot be converted or wchar_t has
    ///         an unsupported size.
    std::string ToUtf8(const std::wstring& widestring);
}

#endif

我认为它简单易用。

这是源代码

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"

#include <vector>

namespace UtfConverter
{
    std::wstring FromUtf8(const std::string& utf8string)
    {
        size_t widesize = utf8string.length();
        if (sizeof(wchar_t) == 2)
        {
            wchar_t* widestringnative = new wchar_t[widesize+1];
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF16* targetstart = reinterpret_cast<UTF16*>(widestringnative);
            UTF16* targetend = targetstart + widesize+1;
            ConversionResult res = ConvertUTF8toUTF16
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] widestringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::wstring resultstring(widestringnative);
            delete [] widestringnative;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            wchar_t* widestringnative = new wchar_t[widesize];
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF32* targetstart = reinterpret_cast<UTF32*>(widestringnative);
            UTF32* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF32
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] widestringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::wstring resultstring(widestringnative);
            delete [] widestringnative;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return L"";
    }

    std::string ToUtf8(const std::wstring& widestring)
    {
        size_t widesize = widestring.length();

        if (sizeof(wchar_t) == 2)
        {
            size_t utf8size = 3 * widesize + 1;
            char* utf8stringnative = new char[utf8size];
            const UTF16* sourcestart = 
		reinterpret_cast<const UTF16*>(widestring.c_str());
            const UTF16* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF16toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] utf8stringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::string resultstring(utf8stringnative);
            delete [] utf8stringnative;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            size_t utf8size = 4 * widesize + 1;
            char* utf8stringnative = new char[utf8size];
            const UTF32* sourcestart = 
		reinterpret_cast<const UTF32*>(widestring.c_str());
            const UTF32* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF32toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] utf8stringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::string resultstring(utf8stringnative);
            delete [] utf8stringnative;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return "";
    }
} 

更好的做法

这是另一个版本,它通过直接写入 string 缓冲区来避免使用 new 和 delete。有人知道这样可以吗?

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"

namespace UtfConverter
{
    std::wstring FromUtf8(const std::string& utf8string)
    {
        size_t widesize = utf8string.length();
        if (sizeof(wchar_t) == 2)
        {
            std::wstring resultstring;
            resultstring.resize(widesize+1, L'\0');
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF16* targetstart = reinterpret_cast<UTF16*>(&resultstring[0]);
            UTF16* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF16
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            std::wstring resultstring;
            resultstring.resize(widesize+1, L'\0');
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF32* targetstart = reinterpret_cast<UTF32*>(&resultstring[0]);
            UTF32* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF32
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return L"";
    }

    std::string ToUtf8(const std::wstring& widestring)
    {
        size_t widesize = widestring.length();

        if (sizeof(wchar_t) == 2)
        {
            size_t utf8size = 3 * widesize + 1;
            std::string resultstring;
            resultstring.resize(utf8size, '\0');
            const UTF16* sourcestart = 
		reinterpret_cast<const UTF16*>(widestring.c_str());
            const UTF16* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF16toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            size_t utf8size = 4 * widesize + 1;
            std::string resultstring;
            resultstring.resize(utf8size, '\0');
            const UTF32* sourcestart = 
		reinterpret_cast<const UTF32*>(widestring.c_str());
            const UTF32* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF32toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return "";
    }
}

如何使用它

只需将其添加到您的项目中即可。同时从 这里 下载 Unicode 转换器并将其添加到项目中。它应该可以正常工作。

当然,您可以在失败时抛出任何您喜欢的异常。

我承认我只用 2 字节的 wchar_t 进行了测试。

欢迎提出评论。

© . All rights reserved.