C# 中原生处理 UTF8 的自定义字符串

Pascal Ganaye

5.00/5（2 票）

2011年12月15日

CPOL

14636

一个自定义的 C# 字符串实现，它将数据存储在 UTF8 字节数组中。

我需要做大量的 UTF8 处理，并且发现大部分处理器时间都浪费在字节数组与字符串之间的编码和解码上。因此我在尝试一种思路：不把全部数据都转换为字符串。这样应该运行得更快，并且占用更少的内存。这个思路并不是要比框架编码得更快，而是先把数据加载到字节数组中，只在确实需要时才把相应的部分从 UTF8 转换为字符，或从字符转换为 UTF8。我觉得这可能对其他人也有用。

    /// <summary>
    /// An immutable string-like value that keeps its content as a UTF-8 byte array,
    /// decoding to <see cref="char"/> only when a caller enumerates it or calls
    /// <see cref="ToString"/>.
    /// </summary>
    /// <remarks>
    /// A "character" in this type is one entry of the lead-byte table: sequences of
    /// one, two or three bytes. Lead bytes of longer sequences (0xF0 and above) and
    /// stray continuation bytes are each counted as a one-byte character, so a code
    /// point outside the Basic Multilingual Plane is seen as four separate characters.
    /// </remarks>
    public struct Utf8String : IEnumerable<char>, IEquatable<Utf8String>
    {
        // Shared zero-length buffer. Declared first so it is initialized before Empty.
        private static readonly Byte[] mEmptyBytes = new Byte[0];

        /// <summary>The empty string (zero bytes).</summary>
        public static readonly Utf8String Empty = new Utf8String(mEmptyBytes);

        // Null only for default(Utf8String); always read through the Bytes property.
        private readonly Byte[] m_bytes;

        // Maps a lead byte to the number of bytes this type consumes for that character.
        private static readonly Byte[] mCharLength;

        static Utf8String()
        {
            mCharLength = new Byte[256];
            int i = 0;
            while (/* i>=0x00 && */ i <= 0x7F) mCharLength[i++] = 1;
            while (/* i>=0x80 && */ i <= 0xBF) mCharLength[i++] = 1; // invalid as a lead byte
            while (/* i>=0xC0 && */ i <= 0xDF) mCharLength[i++] = 2;
            while (/* i>=0xE0 && */ i <= 0xEF) mCharLength[i++] = 3;
            while (/* i>=0xF0 && */ i <= 0xF7) mCharLength[i++] = 1; // 4 in UTF-8, deliberately not supported here
            while (/* i>=0xF8 && */ i <= 0xFB) mCharLength[i++] = 1; // 5 in legacy UTF-8, not supported
            while (/* i>=0xFC && */ i <= 0xFD) mCharLength[i++] = 1; // 6 in legacy UTF-8, not supported
            mCharLength[0xFE] = 1; // invalid
            mCharLength[0xFF] = 1; // invalid
        }

        /// <summary>
        /// Creates a value holding the UTF-8 encoding of <paramref name="value"/>.
        /// </summary>
        /// <param name="value">The text to encode.</param>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
        public Utf8String(string value)
        {
            if (value == null) throw new ArgumentNullException("value");
            m_bytes = Encoding.UTF8.GetBytes(value);
        }

        /// <summary>
        /// Wraps an existing buffer without copying it. Kept private because a caller
        /// holding the array could otherwise mutate the content of this value.
        /// </summary>
        /// <param name="value">The buffer to adopt.</param>
        private Utf8String(Byte[] value)
        {
            m_bytes = value;
        }

        /// <summary>
        /// The backing bytes, never null: default(Utf8String) has a null array and is
        /// treated as the empty string.
        /// </summary>
        private Byte[] Bytes
        {
            get { return m_bytes ?? mEmptyBytes; }
        }

        /// <summary>Encodes a <see cref="string"/> as a <see cref="Utf8String"/>.</summary>
        public static implicit operator Utf8String(string value)
        {
            return new Utf8String(value);
        }

        /// <summary>Concatenates the bytes of two values into a new value.</summary>
        public static Utf8String operator +(Utf8String a, Utf8String b)
        {
            Byte[] left = a.Bytes;
            Byte[] right = b.Bytes;
            var newBytes = new Byte[left.Length + right.Length];
            left.CopyTo(newBytes, 0);
            right.CopyTo(newBytes, left.Length);
            return new Utf8String(newBytes);
        }

        /// <summary>
        /// Decodes the whole value and appends its byte count. The suffix is intentional:
        /// this method defeats the purpose of the type, so it is made unattractive for
        /// anything other than diagnostics.
        /// </summary>
        public override string ToString()
        {
            Byte[] bytes = Bytes;
            return string.Format("{0} ({1} bytes)", System.Text.Encoding.UTF8.GetString(bytes), bytes.Length);
        }

        /// <summary>Byte-wise equality.</summary>
        public static bool operator ==(Utf8String a, Utf8String b)
        {
            return CompareArrays(a.Bytes, b.Bytes);
        }

        /// <summary>Byte-wise inequality.</summary>
        public static bool operator !=(Utf8String a, Utf8String b)
        {
            return !CompareArrays(a.Bytes, b.Bytes);
        }

        /// <summary>Byte-wise equality, consistent with <c>==</c>.</summary>
        public bool Equals(Utf8String other)
        {
            return CompareArrays(Bytes, other.Bytes);
        }

        /// <summary>
        /// Returns true when <paramref name="obj"/> is a <see cref="Utf8String"/> with the same bytes.
        /// </summary>
        public override bool Equals(object obj)
        {
            return obj is Utf8String && Equals((Utf8String)obj);
        }

        /// <summary>
        /// Hash of the byte content (FNV-1a), so that values equal under <c>==</c> hash equally.
        /// </summary>
        public override int GetHashCode()
        {
            unchecked
            {
                uint hash = 2166136261;
                foreach (byte b in Bytes)
                {
                    hash = (hash ^ b) * 16777619;
                }
                return (int)hash;
            }
        }

        /// <summary>Returns true when both arrays have the same length and content.</summary>
        private static bool CompareArrays(byte[] a, byte[] b)
        {
            if (a.Length != b.Length)
            {
                return false;
            }
            int len = a.Length;
            for (int i = 0; i < len; i++)
            {
                if (a[i] != b[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Number of characters (not bytes). Computed by walking the lead bytes, so
        /// each read costs a pass over the whole buffer.
        /// </summary>
        public int length
        {
            get
            {
                Byte[] bytes = Bytes;
                int result = 0;
                int len = bytes.Length;
                int mbytesIndex = 0;
                while (mbytesIndex < len)
                {
                    mbytesIndex += mCharLength[bytes[mbytesIndex]];
                    result++;
                }
                return result;
            }
        }

        /// <summary>
        /// Returns <paramref name="length"/> characters starting at character
        /// <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="startIndex">Zero-based character index of the first character.</param>
        /// <param name="length">Number of characters to take.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Either argument is negative or reaches past the end of the value.
        /// </exception>
        internal Utf8String SubString(int startIndex, int length)
        {
            if (startIndex < 0) throw new ArgumentOutOfRangeException("startIndex");
            var startmBytesIndex = GetmBytesIndex(startIndex);
            if (startmBytesIndex < 0) throw new ArgumentOutOfRangeException("startIndex");

            if (length == 0) return Utf8String.Empty;
            if (length < 0) throw new ArgumentOutOfRangeException("length");

            var endmBytesIndex = GetmBytesIndex(length, startmBytesIndex);
            if (endmBytesIndex < 0) throw new ArgumentOutOfRangeException("length");

            Byte[] bytes = Bytes;

            // A character count equal to the byte count can only be reached when every
            // character is one byte long and the whole buffer was requested.
            if (startIndex == 0 && length == bytes.Length) return this;

            var newBytes = new Byte[endmBytesIndex - startmBytesIndex];
            Array.Copy(bytes, startmBytesIndex, newBytes, 0, endmBytesIndex - startmBytesIndex);
            return new Utf8String(newBytes);
        }

        /// <summary>
        /// Returns everything from character <paramref name="startIndex"/> to the end.
        /// </summary>
        /// <param name="startIndex">Zero-based character index of the first character.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="startIndex"/> is negative or past the end of the value.
        /// </exception>
        internal Utf8String SubString(int startIndex)
        {
            if (startIndex == 0) return this;

            if (startIndex < 0) throw new ArgumentOutOfRangeException("startIndex");
            var startmBytesIndex = GetmBytesIndex(startIndex);
            if (startmBytesIndex < 0) throw new ArgumentOutOfRangeException("startIndex");

            Byte[] bytes = Bytes;
            var newBytes = new Byte[bytes.Length - startmBytesIndex];
            Array.Copy(bytes, startmBytesIndex, newBytes, 0, bytes.Length - startmBytesIndex);
            return new Utf8String(newBytes);
        }

        /// <summary>
        /// Advances <paramref name="charCount"/> characters from byte offset
        /// <paramref name="mbytesIndex"/> and returns the resulting byte offset,
        /// or -1 when the buffer ends before that many characters were skipped.
        /// </summary>
        private int GetmBytesIndex(int charCount, int mbytesIndex = 0)
        {
            if (charCount == 0) return mbytesIndex;
            Byte[] bytes = Bytes;
            int len = bytes.Length;
            while (mbytesIndex < len)
            {
                mbytesIndex += mCharLength[bytes[mbytesIndex]];
                charCount--;
                if (charCount == 0) return mbytesIndex;
            }
            return -1;
        }

        /// <summary>Returns an enumerator that decodes one character at a time.</summary>
        public IEnumerator<char> GetEnumerator()
        {
            return new Utf8StringEnumerator(this);
        }

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return new Utf8StringEnumerator(this);
        }

        /// <summary>
        /// Walks the byte buffer and decodes each character on demand.
        /// </summary>
        private sealed class Utf8StringEnumerator : IEnumerator<char>
        {
            private Byte[] m_bytes;
            private int mBytesIndex;
            private char mCurChar;
            private int mLength;

            public char Current
            {
                get
                {
                    return mCurChar;
                }
            }

            object System.Collections.IEnumerator.Current
            {
                get { return mCurChar; }
            }

            internal Utf8StringEnumerator(Utf8String source)
            {
                m_bytes = source.Bytes;
                mLength = m_bytes.Length;
                Reset();
            }

            /// <summary>
            /// Decodes the next character. Returns false at the end of the buffer, or
            /// when the remaining bytes are too few for the sequence the lead byte announces.
            /// </summary>
            public bool MoveNext()
            {
                if (mBytesIndex >= mLength)
                {
                    return false;
                }

                int curCharLength = Utf8String.mCharLength[m_bytes[mBytesIndex]];

                // "<=" rather than "<": a sequence that ends exactly at the end of the
                // buffer is the last character and must still be yielded.
                if (mBytesIndex + curCharLength <= mLength)
                {
                    mCurChar = System.Text.Encoding.UTF8.GetString(m_bytes, mBytesIndex, curCharLength)[0];
                    mBytesIndex += curCharLength;
                    return true;
                }

                return false;
            }

            public void Reset()
            {
                this.mCurChar = '\0';
                this.mBytesIndex = 0;
            }

            public void Dispose()
            {
            }

        }
    }

下面是在 C# 中手动实现 UTF8 字节序列与 char 相互转换的一个尝试：

/// <summary>
/// Decodes a single UTF-8 sequence of one, two or three bytes into a <see cref="char"/>.
/// </summary>
/// <param name="bytes">
/// Exactly the bytes of one sequence. The bytes are not validated; they are assumed
/// to be a well-formed lead byte followed by continuation bytes.
/// </param>
/// <returns>
/// The decoded character, or '\0' when the array length is not 1, 2 or 3.
/// </returns>
public static char Utf8ToChar(byte[] bytes)
{
    switch (bytes.Length)
    {
        case 1:
            // Single byte: the byte value is the code point.
            return (char)bytes[0];

        case 2:
        {
            // 110xxxxx 10xxxxxx: strip the markers, then combine 5 + 6 payload bits.
            int high = bytes[0] - 0xC0;
            int low = bytes[1] - 0x80;
            return (char)((high << 6) + low);
        }

        case 3:
        {
            // 1110xxxx 10xxxxxx 10xxxxxx: combine 4 + 6 + 6 payload bits.
            int high = bytes[0] - 0xE0;
            int mid = bytes[1] - 0x80;
            int low = bytes[2] - 0x80;
            return (char)((high << 12) + (mid << 6) + low);
        }

        default:
            return '\0';
    }
}

/// <summary>
/// Encodes a single UTF-16 code unit as UTF-8.
/// </summary>
/// <param name="c">The character to encode.</param>
/// <returns>
/// One byte for values below 0x80, two bytes for values below 0x800, and three
/// bytes otherwise. A surrogate code unit (0xD800-0xDFFF) cannot be encoded on its
/// own and yields EF BF BD, the encoding of U+FFFD.
/// </returns>
public static byte[] CharToUtf8(char c)
{
    if (c < 0x80)
    {
        return new byte[] { (byte)c };
    }
    else
    {
        // Payload split into 6-bit groups, least significant first.
        byte c0 = (byte)(c & 0x3f);
        byte c1 = (byte)((c >> 6) & 0x3f);
        byte c2 = (byte)((c >> 12) & 0x3f);

        if (c < 0x800)
        {
            // Below 0x800 the second group holds at most 5 bits, so it fits the 110xxxxx lead byte.
            return new byte[] { (byte)(0xC0 + c1), (byte)(0x80 + c0) };
        }
        else if (c < 0xd800 || c >= 0xe000)
        {
            return new byte[] { (byte)(0xE0 + c2), (byte)(0x80 + c1), (byte)(0x80 + c0) };
        }
        else
        {
            // Lone surrogate: emit the replacement character U+FFFD.
            return new byte[] { 0xEF, 0xBF, 0xBD };
        }
    }
}