C# - 使用 x86 汇编的快速内存复制方法
如何在C#.NET中使用汇编以及最快的内存复制方法
引言
我是 Oleksandr Karpov,这是我在这里发表的第一篇文章,感谢您的阅读。
在这里,我将展示并解释如何非常快速地复制数据,以及如何在C#和.NET中使用汇编。 在我的例子中,我将其用于从图像、视频和声音创建视频的应用程序中。
此外,如果您有一个需要在C#中使用的汇编方法或函数,它将向您展示如何以快速而简单的方式完成它。
背景
为了完全理解它,最好您了解汇编语言、内存对齐以及一些C#、Windows和.NET高级技术。
为了能够非常快速地复制数据,内存地址需要按16字节对齐;否则速度将与标准方法几乎相同(在我的例子中,仅快约1.02倍)。
该代码使用SSE指令,这些指令受Pentium III+ (KNI/MMX2)、AMD Athlon (AMD EMMX) 等处理器支持。
我已在我的Pentium Dual-Core E5800 3.2GHz(4GB RAM)双通道模式下进行了测试。
在我的测试中,当内存地址按16字节对齐时,快速复制方法比标准方法快1.5倍;
对于未对齐的内存地址,速度则与标准方法几乎相同(仅快1.02倍)。
为了能够在Windows下的C#中分配16字节对齐的内存,我们有三种方法:
a) 目前看来,Bitmap
对象(实际上是Windows本身内部)会分配16字节对齐的内存地址,因此我们可以使用Bitmap
来轻松快速地分配对齐的内存;
b) 作为托管数组,增加8个字节(因为Windows堆是8字节对齐的),并在已分配的内存中计算16字节对齐的内存点
int dataLength = 4096;
// 15 extra bytes cover the worst case (an address one byte past a 16-byte boundary);
// 8 extra bytes are only enough when the array data is already 8-byte aligned.
byte[] buffer = new byte[dataLength + 15];
// Pin the array first: UnsafeAddrOfPinnedArrayElement is only valid for a pinned array,
// and the GC must not move the buffer while its raw address is in use.
GCHandle bufferHandle = GCHandle.Alloc(buffer, GCHandleType.Pinned);
IntPtr addr = Marshal.UnsafeAddrOfPinnedArrayElement(buffer, 0);
// Round the address up to the next multiple of 16 and keep the distance from the
// start of the array as an offset (always in the range 0..15).
long rawAddress = addr.ToInt64();
int bufferAlignedOffset = (int)((rawAddress + 15) / 16 * 16 - rawAddress);
// Use buffer starting at bufferAlignedOffset; call bufferHandle.Free() once the
// native code no longer touches the memory.
c) 通过使用VirtualAlloc
API 分配内存
// Reserve and commit the buffer in one call. VirtualAlloc hands out whole pages, so the
// returned base address starts on a page boundary and is therefore already 16-byte aligned.
IntPtr baseAddr = VirtualAlloc(
    IntPtr.Zero,
    // UIntPtr has no constructor taking int, so the size is cast explicitly.
    new UIntPtr((uint)(dataLength + 8)),
    AllocationTypes.Commit | AllocationTypes.Reserve,
    // A plain data buffer only needs read/write access, not executable pages.
    MemoryProtections.ReadWrite);
// Keep baseAddr untouched: it is the address that must later be passed to VirtualFree.
// addr is the 16-byte aligned pointer handed to the copy routine.
IntPtr addr = new IntPtr((baseAddr.ToInt64() + 15) / 16 * 16);
使用代码
这是一个完整的性能测试,将向您展示性能测量结果以及如何使用所有这些。
FastMemCopy
类包含快速内存复制逻辑所需的所有内容。
首先,您需要创建一个默认的Windows Forms应用程序项目,并在窗体上放置两个按钮和PictureBox
控件,因为我们将在图像上进行测试。
让我们声明一些字段
// Path of the image file chosen in the open-file dialog.
string bitmapPath;
// bmp: image loaded from bitmapPath; bmp2: same-size 32bpp bitmap created over `buffer`.
Bitmap bmp, bmp2;
// LockBits results for bmp and bmp2; set in UnlockBitmap() and released in LockBitmap().
BitmapData bmpd, bmpd2;
// Pixel storage backing bmp2 (Width * 4 * Height bytes); allocated in OpenImage().
byte[] buffer = null;
现在,我们将创建两种方法来处理按钮的点击事件。
对于标准方法
/// <summary>
/// Click handler of the "standard" button: lets the user pick an image, copies its
/// pixel data with the standard copy routine and displays the second bitmap.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStandard_Click(object sender, EventArgs e)
{
    // Ask for the image file; leave without doing anything if the dialog is not confirmed.
    using (OpenFileDialog picker = new OpenFileDialog())
    {
        bool confirmed = picker.ShowDialog() == System.Windows.Forms.DialogResult.OK;
        if (!confirmed)
        {
            return;
        }

        bitmapPath = picker.FileName;
    }

    // Load the chosen image and create an empty image of the same size.
    OpenImage();

    // Despite its name, UnlockBitmap() calls LockBits so the pixel data can be read and written.
    UnlockBitmap();

    // Timed copy using the standard method.
    CopyImage();

    // LockBitmap() calls UnlockBits, which makes the bitmaps displayable again.
    LockBitmap();

    // Show the second bitmap.
    pictureBox1.Image = bmp2;
}
和快速方法
/// <summary>
/// Click handler of the "fast" button: lets the user pick an image, copies its pixel
/// data with the fast copy routine and displays the second bitmap.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnFast_Click(object sender, EventArgs e)
{
    // Ask for the image file; a cancelled dialog ends the handler immediately.
    using (OpenFileDialog fileDialog = new OpenFileDialog())
    {
        System.Windows.Forms.DialogResult answer = fileDialog.ShowDialog();
        if (answer != System.Windows.Forms.DialogResult.OK)
        {
            return;
        }

        bitmapPath = fileDialog.FileName;
    }

    // Load the chosen image and create an empty image of the same size.
    OpenImage();

    // Despite its name, UnlockBitmap() calls LockBits so the pixel data can be read and written.
    UnlockBitmap();

    // Timed copy from the first image to the second using the fast method.
    FastCopyImage();

    // LockBitmap() calls UnlockBits, which makes the bitmaps displayable again.
    LockBitmap();

    // Show the second bitmap.
    pictureBox1.Image = bmp2;
}
好的,现在我们有了按钮和事件处理程序,让我们实现用于打开图像、锁定和解锁图像以及执行标准复制的方法。
打开一张图片
// Pin handle that keeps `buffer` at a fixed address for as long as bmp2 uses it as pixel storage.
GCHandle bufferHandle;

/// <summary>
/// Loads the image at <c>bitmapPath</c> into <c>bmp</c> and creates <c>bmp2</c>, a 32bpp ARGB
/// bitmap of the same size whose pixel data lives in the managed array <c>buffer</c>.
/// Any bitmaps left over from a previous call are disposed first.
/// </summary>
void OpenImage()
{
    pictureBox1.Image = null;

    if (bmp != null)
    {
        bmp.Dispose();
        bmp = null;
    }
    if (bmp2 != null)
    {
        bmp2.Dispose();
        bmp2 = null;
    }

    // Unpin the previous buffer only after bmp2, which pointed into it, has been disposed.
    if (bufferHandle.IsAllocated)
    {
        bufferHandle.Free();
    }
    buffer = null;

    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);

    bmp = (Bitmap)Bitmap.FromFile(bitmapPath);

    // 4 bytes per pixel (Format32bppArgb), rows stored back to back.
    buffer = new byte[bmp.Width * 4 * bmp.Height];

    // The Bitmap constructor keeps the raw scan0 pointer, so the array must stay pinned
    // for the whole lifetime of bmp2; otherwise the GC may move it under the bitmap.
    bufferHandle = GCHandle.Alloc(buffer, GCHandleType.Pinned);
    bmp2 = new Bitmap(bmp.Width, bmp.Height, bmp.Width * 4, PixelFormat.Format32bppArgb,
        bufferHandle.AddrOfPinnedObject());
}
锁定和解锁位图
/// <summary>
/// Locks the full pixel area of <c>bmp</c> and <c>bmp2</c> for read/write access as 32bpp ARGB
/// and stores the results in <c>bmpd</c> and <c>bmpd2</c>.
/// Note: despite its name, this method calls LockBits.
/// </summary>
void UnlockBitmap()
{
    // Both bitmaps are locked over the area given by bmp's dimensions.
    Rectangle wholeImage = new Rectangle(0, 0, bmp.Width, bmp.Height);

    bmpd = bmp.LockBits(wholeImage, ImageLockMode.ReadWrite, PixelFormat.Format32bppArgb);
    bmpd2 = bmp2.LockBits(wholeImage, ImageLockMode.ReadWrite, PixelFormat.Format32bppArgb);
}
/// <summary>
/// Releases the locks held in <c>bmpd</c> and <c>bmpd2</c> by calling UnlockBits on both bitmaps.
/// Note: despite its name, this method calls UnlockBits.
/// </summary>
void LockBitmap()
{
bmp.UnlockBits(bmpd);
bmp2.UnlockBits(bmpd2);
}
并将数据从一个图像复制到另一个图像,并显示测量时间
/// <summary>
/// Times ten runs of Marshal.Copy from the locked source bitmap (<c>bmpd.Scan0</c>) into
/// <c>buffer</c> and shows the elapsed stopwatch ticks in a message box.
/// </summary>
void CopyImage()
{
    Stopwatch timer = new Stopwatch();
    timer.Start();

    // Repeat the same full-buffer copy ten times.
    int run = 0;
    while (run < 10)
    {
        System.Runtime.InteropServices.Marshal.Copy(bmpd.Scan0, buffer, 0, buffer.Length);
        run++;
    }

    timer.Stop();

    // Report the measured time.
    long elapsedTicks = timer.ElapsedTicks;
    MessageBox.Show(elapsedTicks.ToString());
}
这就是标准复制粘贴方法。 实际上,没有太复杂的内容,我们使用众所周知的System.Runtime.InteropServices.Marshal.Copy
方法。
还有一种用于快速复制逻辑的“中间方法”
/// <summary>
/// Passes the locked pixel data of <c>bmp</c> (source) and <c>bmp2</c> (destination) to
/// FastMemCopy.FastMemoryCopy, copying <c>buffer.Length</c> bytes.
/// </summary>
void FastCopyImage()
{
    IntPtr sourcePixels = bmpd.Scan0;
    IntPtr targetPixels = bmpd2.Scan0;
    int byteCount = buffer.Length;

    FastMemCopy.FastMemoryCopy(sourcePixels, targetPixels, byteCount);
}
现在,让我们实现FastMemCopy
类。 这是类的声明和我们将在其中使用的一些类型
internal static class FastMemCopy
{
/// <summary>
/// Allocation-type flags passed as <c>flAllocationType</c> to NativeMethods.VirtualAlloc.
/// The values match the Win32 MEM_* constants documented for VirtualAlloc.
/// </summary>
[Flags]
private enum AllocationTypes : uint
{
Commit = 0x1000, Reserve = 0x2000,
Reset = 0x80000, LargePages = 0x20000000,
Physical = 0x400000, TopDown = 0x100000,
WriteWatch = 0x200000
}
/// <summary>
/// Page-protection flags passed as <c>flProtect</c> to NativeMethods.VirtualAlloc.
/// The values match the Win32 PAGE_* memory protection constants.
/// </summary>
[Flags]
private enum MemoryProtections : uint
{
Execute = 0x10, ExecuteRead = 0x20,
ExecuteReadWrite = 0x40, ExecuteWriteCopy = 0x80,
NoAccess = 0x01, ReadOnly = 0x02,
ReadWrite = 0x04, WriteCopy = 0x08,
// NOTE(review): "Guart" looks like a misspelling of "Guard" (0x100 is PAGE_GUARD);
// the identifier is left as is so existing references keep compiling.
GuartModifierflag = 0x100, NoCacheModifierflag = 0x200,
WriteCombineModifierflag = 0x400
}
/// <summary>
/// Free-operation flags passed as <c>flFreeType</c> to NativeMethods.VirtualFree.
/// The values match the Win32 MEM_DECOMMIT and MEM_RELEASE constants.
/// </summary>
[Flags]
private enum FreeTypes : uint
{
Decommit = 0x4000, Release = 0x8000
}
/// <summary>
/// Delegate type used to call the machine code copied into executable memory.
/// It takes no arguments: FastMemoryCopy writes the source address, destination address
/// and byte count into the slots at the end of the code buffer before invoking it.
/// </summary>
[UnmanagedFunctionPointerAttribute(CallingConvention.Cdecl)]
private unsafe delegate void FastMemCopyDelegate();
/// <summary>
/// P/Invoke declarations for the kernel32 virtual-memory functions used by FastMemCopy.
/// </summary>
private static class NativeMethods
{
// Allocates pages in the calling process; the result is used by FastMemoryCopy as the
// executable buffer for the machine code.
[DllImport("kernel32.dll", SetLastError = true)]
internal static extern IntPtr VirtualAlloc(
IntPtr lpAddress,
UIntPtr dwSize,
AllocationTypes flAllocationType,
MemoryProtections flProtect);
// Frees pages obtained from VirtualAlloc.
// NOTE(review): the native dwSize parameter is SIZE_T; it is declared as uint here,
// which matches only a 32-bit process — confirm before using this from 64-bit code.
[DllImport("kernel32")]
[return: MarshalAs(UnmanagedType.Bool)]
internal static extern bool VirtualFree(
IntPtr lpAddress,
uint dwSize,
FreeTypes flFreeType);
}
现在让我们声明该方法本身
/// <summary>
/// Copies <paramref name="nBytes"/> bytes from <paramref name="src"/> to <paramref name="dst"/>
/// by running the x86 machine code stored in <c>x86_FastMemCopy_New</c>. The copy is repeated
/// 10 times and the elapsed stopwatch ticks are shown in a message box.
/// </summary>
/// <param name="src">Address of the first source byte.</param>
/// <param name="dst">Address of the first destination byte.</param>
/// <param name="nBytes">Number of bytes to copy.</param>
/// <exception cref="ApplicationException">Thrown when running as a 64-bit process.</exception>
/// <remarks>
/// On the 32-bit path any exception is caught and its message is shown in a message box
/// instead of being propagated to the caller.
/// </remarks>
public static unsafe void FastMemoryCopy(IntPtr src, IntPtr dst, int nBytes)
{
    if (IntPtr.Size == 4)
    {
        // 32-bit mode: allocate executable memory for the machine code.
        int codeLength = x86_FastMemCopy_New.Length;
        IntPtr codeBase = NativeMethods.VirtualAlloc(
            IntPtr.Zero,
            new UIntPtr((uint)codeLength),
            AllocationTypes.Commit | AllocationTypes.Reserve,
            MemoryProtections.ExecuteReadWrite);
        try
        {
            // VirtualAlloc returns NULL on failure; report it through the same
            // message-box path as every other error instead of writing to address 0.
            if (codeBase == IntPtr.Zero)
            {
                throw new InvalidOperationException(
                    "VirtualAlloc failed, Win32 error " + Marshal.GetLastWin32Error());
            }

            // Copy the machine code into the executable buffer.
            Marshal.Copy(x86_FastMemCopy_New, 0, codeBase, codeLength);

            // Wrap the buffer's entry point in a delegate.
            FastMemCopyDelegate _fastmemcopy =
                (FastMemCopyDelegate)Marshal.GetDelegateForFunctionPointer(codeBase,
                    typeof(FastMemCopyDelegate));

            // The last 24 bytes of the buffer are three 8-byte parameter slots; only the
            // low 4 bytes of each are written. A separate cursor is used so that codeBase
            // stays intact for VirtualFree.
            IntPtr slot = codeBase + codeLength;

            // Byte count.
            slot -= 8;
            Marshal.Copy(BitConverter.GetBytes((long)nBytes), 0, slot, 4);

            // Destination address.
            slot -= 8;
            Marshal.Copy(BitConverter.GetBytes((long)dst), 0, slot, 4);

            // Source address.
            slot -= 8;
            Marshal.Copy(BitConverter.GetBytes((long)src), 0, slot, 4);

            // Time ten runs of the copy.
            Stopwatch sw = new Stopwatch();
            sw.Start();
            for (int i = 0; i < 10; i++)
            {
                _fastmemcopy();
            }
            sw.Stop();

            // Show the measured time.
            System.Windows.Forms.MessageBox.Show(sw.ElapsedTicks.ToString());
        }
        catch (Exception ex)
        {
            // Any failure is reported to the user rather than rethrown.
            System.Windows.Forms.MessageBox.Show(ex.Message);
        }
        finally
        {
            // MEM_RELEASE requires the exact base address returned by VirtualAlloc and a
            // size of 0; any other combination fails and leaks the allocation.
            if (codeBase != IntPtr.Zero)
            {
                NativeMethods.VirtualFree(codeBase, 0, FreeTypes.Release);
            }
            GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
        }
    }
    else if (IntPtr.Size == 8)
    {
        throw new ApplicationException("x64 is not supported yet!");
    }
}
以及汇编代码,它表示为一个字节数组,并附带解释
// 32-bit x86 machine code for the copy routine, followed by its parameter area.
// Layout: the instructions come first; the final 24 bytes are three 8-byte slots
// (source address, destination address, byte count) that FastMemoryCopy fills in
// before calling the code. The code reads them through ebp-relative loads
// (offsets 0x15A, 0x162 and 0x16A from the start of the buffer).
// Per the annotations below, the main loops move 128 bytes per iteration through
// xmm0-xmm7 and the remaining (length & 0x7F) bytes are copied with rep movsb.
private static byte[] x86_FastMemCopy_New = new byte[]
{
0x90, //nop do nothing
0x60, //pushad save all general-purpose registers on the stack
0x95, //xchg ebp, eax assumes eax holds the address of this code on entry - TODO confirm, cdecl does not guarantee it
0x8B, 0xB5, 0x5A, 0x01, 0x00, 0x00, //mov esi,[ebp][00000015A] get source buffer address
0x89, 0xF0, //mov eax,esi
0x83, 0xE0, 0x0F, //and eax,00F will check if it is 16 byte aligned
0x8B, 0xBD, 0x62, 0x01, 0x00, 0x00, //mov edi,[ebp][000000162] get destination address
0x89, 0xFB, //mov ebx,edi
0x83, 0xE3, 0x0F, //and ebx,00F will check if it is 16 byte aligned
0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A] get number of bytes to copy
0xC1, 0xE9, 0x07, //shr ecx,7 divide length by 128
0x85, 0xC9, //test ecx,ecx check if zero
0x0F, 0x84, 0x1C, 0x01, 0x00, 0x00, //jz 000000146 ? copy the rest
0x0F, 0x18, 0x06, //prefetchnta [esi] pre-fetch non-temporal source data for reading
0x85, 0xC0, //test eax,eax check if source address is 16 byte aligned
0x0F, 0x84, 0x8B, 0x00, 0x00, 0x00, //jz 0000000C0 ? go to copy if aligned
0x0F, 0x18, 0x86, 0x80, 0x02, 0x00, 0x00, //prefetchnta [esi][000000280] pre-fetch more source data
0x0F, 0x10, 0x06, //movups xmm0,[esi] copy 16 bytes of source data
0x0F, 0x10, 0x4E, 0x10, //movups xmm1,[esi][010] copy more 16 bytes
0x0F, 0x10, 0x56, 0x20, //movups xmm2,[esi][020] copy more
0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0] pre-fetch more
0x0F, 0x10, 0x5E, 0x30, //movups xmm3,[esi][030]
0x0F, 0x10, 0x66, 0x40, //movups xmm4,[esi][040]
0x0F, 0x10, 0x6E, 0x50, //movups xmm5,[esi][050]
0x0F, 0x10, 0x76, 0x60, //movups xmm6,[esi][060]
0x0F, 0x10, 0x7E, 0x70, //movups xmm7,[esi][070] we've copied 128 bytes of source data
0x85, 0xDB, //test ebx,ebx check if destination address is 16 byte aligned
0x74, 0x21, //jz 000000087 ? go to past if aligned
0x0F, 0x11, 0x07, //movups [edi],xmm0 past first 16 bytes to non-aligned destination address
0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1 past more
0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2
0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3
0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4
0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5
0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6
0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7 we've pasted 128 bytes of source data
0xEB, 0x1F, //jmps 0000000A6 ? continue
0x0F, 0x2B, 0x07, //movntps [edi],xmm0 past first 16 bytes to aligned destination address
0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1 past more
0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2
0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3
0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4
0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5
0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6
0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7 we've pasted 128 bytes of source data
0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080 increment source address by 128
0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080 increment destination address by 128
0x83, 0xE9, 0x01, //sub ecx,1 decrement counter
0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 000000035 ? continue if not zero
0xE9, 0x86, 0x00, 0x00, 0x00, //jmp 000000146 ? go to copy the rest of data
0x0F, 0x18, 0x86, 0x80, 0x02, 0x00, 0x00, //prefetchnta [esi][000000280] pre-fetch source data
0x0F, 0x28, 0x06, //movaps xmm0,[esi] copy 128 bytes from aligned source address
0x0F, 0x28, 0x4E, 0x10, //movaps xmm1,[esi][010] copy more
0x0F, 0x28, 0x56, 0x20, //movaps xmm2,[esi][020]
0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0] pre-fetch more data
0x0F, 0x28, 0x5E, 0x30, //movaps xmm3,[esi][030]
0x0F, 0x28, 0x66, 0x40, //movaps xmm4,[esi][040]
0x0F, 0x28, 0x6E, 0x50, //movaps xmm5,[esi][050]
0x0F, 0x28, 0x76, 0x60, //movaps xmm6,[esi][060]
0x0F, 0x28, 0x7E, 0x70, //movaps xmm7,[esi][070] we've copied 128 bytes of source data
0x85, 0xDB, //test ebx,ebx check if destination address is 16 byte aligned
0x74, 0x21, //jz 000000112 ? go to past if aligned
0x0F, 0x11, 0x07, //movups [edi],xmm0 past 16 bytes to non-aligned destination address
0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1 past more
0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2
0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3
0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4
0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5
0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6
0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7 we've pasted 128 bytes of data
0xEB, 0x1F, //jmps 000000131 ? continue copy-past
0x0F, 0x2B, 0x07, //movntps [edi],xmm0 past 16 bytes to aligned destination address
0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1 past more
0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2
0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3
0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4
0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5
0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6
0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7 we've pasted 128 bytes of data
0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080 increment source address by 128
0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080 increment destination address by 128
0x83, 0xE9, 0x01, //sub ecx,1 decrement counter
0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 0000000C0 ? continue copy-past if non-zero
0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A] get number of bytes to copy
0x83, 0xE1, 0x7F, //and ecx,07F get rest number of bytes
0x85, 0xC9, //test ecx,ecx check if there are bytes
0x74, 0x02, //jz 000000155 ? exit if there are no more bytes
0xF3, 0xA4, //rep movsb copy rest of bytes
0x0F, 0xAE, 0xF8, //sfence performs a serializing operation on all store-to-memory instructions
0x61, //popad restore all general-purpose registers saved by pushad
0xC3, //retn return from our method to C#
0x00, 0x00, 0x00, 0x00, //source buffer address
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, //destination buffer address
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, //number of bytes to copy-past
0x00, 0x00, 0x00, 0x00
};
我们将通过之前创建的委托来调用此汇编方法。
此方法目前在32位模式下工作,我稍后将实现64位模式。
如果有人感兴趣,我会添加源代码(几乎所有代码都在文章中)。
请注意,如果在Visual Studio下运行汇编代码,则会抛出异常,而且我仍然不明白为什么。
关注点
在实现和测试此方法期间,我发现即使是英特尔规范对prefetchnta
命令的描述也不是很清楚,因此我尝试自己并通过Google来理解它。
另外,请注意movntps
和movaps
指令,因为它们仅适用于16字节内存对齐的地址。
历史
- 已添加源代码以及内存对齐示例(Bitmap 和16字节内存对齐)
- 第一个版本 - 2014年12月19日