如果你有 1000 万个粒子要模拟






4.76/5 (18投票s)
C++与C#中数据结构的性能比较
引言
假设你正在编写一个粒子模拟物理程序。你需要模拟大量的粒子,比如1000万个。为了表示每个粒子,你需要用到位置、速度和加速度。在程序的每次迭代中,你需要用速度更新位置,用加速度更新速度,为了本文的讨论,我们假设加速度是恒定的。会有x、y和z三个维度,但我们只关注一维。
为了在计算机程序中模拟你的数据,你可以使用原始数组来表示每个粒子的位置、速度和加速度,或者你可以创建一个结构体——C++或C#——并将位置、速度和加速度放在其中,或者你可以创建一个类似结构体的类。
看起来原始数组的性能最好,然后是C++结构体,然后是C++类,然后是C#结构体,最后是C#类。我这么说,是因为C++结构体非常轻量级,C++类在内存中与C++结构体相同,C#结构体也很精简,但C#类会带来另一层性能开销,即C#结构体的数组具有数据局部性,而C#类则分散在各处。
测试程序
Using the Code
code.zip 包含解决方案。plusplusms
是C++测试工具,sharpms
是C#测试工具。
plusplusms
包含一个types.h文件,其中包含C++数据结构。
#pragma once
// UPDATE: adding a dummy field for cache alignment (RE: Daniel Pfeffer) did not pay off
struct BareStruct
{
double p, v, a, dummy;
};
class HeaderClass
{
public:
HeaderClass()
{
set(0, 0, 0);
}
void set(double p, double v, double a)
{
m_p = p;
m_v = v;
m_a = a;
}
// UPDATE: Separating out updating position from updating velocity did not pay off
void updatep()
{
m_p += m_v;
}
void updatev()
{
m_v += m_a;
}
// NOTE: Having one update function that calls the other two performed very well,
// inlining all around.
void update()
{
updatep();
updatev();
}
private:
double m_p, m_v, m_a, dummy;
};
class SourceClass
{
public:
SourceClass();
void set(double p, double v, double a);
// UPDATE: Separating out...did not pay off
//void updatep();
//void updatev();
void update();
private:
double m_p, m_v, m_a, dummy;
};
types.cpp包含SourceClass
的实现,代码与HeaderClass
相同,只是分开。这种分离是为了测试链接时代码生成 (LTCG) 的性能,看看它是否会产生与仅包含头文件的实现相同的性能。**更新:** LTCG 的效果确实如宣传的那样。
C++的main.cpp包含测试规模和计时结果的宏,以及执行数据处理的函数。
#include "types.h"
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#define SAMPLE_SIZE 10 * 1024 * 1024
#define TEST_RUNS 4
double parr[SAMPLE_SIZE];
double varr[SAMPLE_SIZE];
double aarr[SAMPLE_SIZE];
BareStruct structs[SAMPLE_SIZE];
HeaderClass headers[SAMPLE_SIZE];
SourceClass sources[SAMPLE_SIZE];
typedef std::chrono::high_resolution_clock timer;
typedef std::chrono::milliseconds ms;
typedef std::chrono::duration<int> fsec;
#define START_RUN(name) { timer t; printf("%s: ", name); auto start = t.now();
#define END_RUN() \
auto end = t.now(); \
int elapsedMs = (int)std::chrono::duration_cast<ms>(end - start).count(); \
printf("%d ms\n", elapsedMs); \
}
void updateArrays1()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
parr[i] += varr[i];
}
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
varr[i] += aarr[i];
}
}
void updateArrays2()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
parr[i] += varr[i];
varr[i] += aarr[i];
}
}
void updateStructs1()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].p += structs[i].v;
}
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].v += structs[i].a;
}
}
void updateStructs2()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].p += structs[i].v;
structs[i].v += structs[i].a;
}
}
void updateHeaders1()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
headers[i].updatep();
}
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
headers[i].updatev();
}
}
void updateHeaders2()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
headers[i].update();
}
}
void updateSources1()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
sources[i].updatep();
}
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
sources[i].updatev();
}
}
void updateSources2()
{
for (size_t i = 0; i < SAMPLE_SIZE; ++i)
{
sources[i].update();
}
}
int main()
{
printf("Setting up...\n");
srand(0);
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].p = parr[i] = rand();
structs[i].v = varr[i] = rand();
structs[i].a = aarr[i] = rand();
headers[i].set(structs[i].p, structs[i].v, structs[i].a);
sources[i].set(structs[i].p, structs[i].v, structs[i].a);
}
for (int o = 1; o <= 3; ++o)
{
START_RUN("arrays separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays1();
}
END_RUN();
START_RUN("arrays together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays2();
}
END_RUN();
START_RUN("structs separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateStructs1();
}
END_RUN();
START_RUN("structs together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateStructs2();
}
END_RUN();
START_RUN("header classes separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateHeaders1();
}
END_RUN();
START_RUN("header classes together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateHeaders2();
}
END_RUN();
START_RUN("source classes separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateSources1();
}
END_RUN();
START_RUN("source classes together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateSources2();
}
END_RUN();
printf("\n");
}
}
C#代码更简洁,这是意料之中的。
using System;
using System.Diagnostics;
namespace sharpms
{
// UPDATE: Adding the dummy field did not pay off in C# either
struct Struct
{
public double p, v, a, dummy;
}
// UPDATE: marked class sealed (RE: Andre_Prellwitz) did not pay off
sealed class Class
{
public void updatep()
{
p += v;
}
public void updatev()
{
v += a;
}
public void update()
{
p += v;
v += a;
}
public double p, v, a, dummy;
}
class Program
{
const int SAMPLE_SIZE = 10 * 1024 * 1024;
const int TEST_RUNS = 4;
static double[] parr = new double[SAMPLE_SIZE];
static double[] varr = new double[SAMPLE_SIZE];
static double[] aarr = new double[SAMPLE_SIZE];
static Struct[] structs = new Struct[SAMPLE_SIZE];
static Class[] classes = new Class[SAMPLE_SIZE];
static void updateArrays1()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
parr[i] += varr[i];
}
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
varr[i] += aarr[i];
}
}
static void updateArrays2()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
parr[i] += varr[i];
varr[i] += aarr[i];
}
}
// UPDATE: Resorting to unsafe code (// RE: igrulya) performs well,
// but is not necessary as the next solution shows
static unsafe void updateArrays3()
{
fixed (double* pparr0 = &parr[0])
fixed (double* pvarr0 = &varr[0])
fixed (double* paarr0 = &aarr[0])
{
double* pparr = pparr0;
double* pvarr = pvarr0;
double* paarr = paarr0;
int i = SAMPLE_SIZE;
while (i-- > 0)
{
*pparr++ += *pvarr;
*pvarr++ += *paarr++;
}
}
}
// RE: <a href="https://codeproject.org.cn/script/Membership/View.aspx?mid=15342246">OracPrime</a>
static void updateArrays4()
{
var pref = parr;
var vref = varr;
var aref = aarr;
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
pref[i] += vref[i];
vref[i] += aref[i];
}
}
static void updateStructs1()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].p += structs[i].v;
}
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].v += structs[i].a;
}
}
static void updateStructs2()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i].p += structs[i].v;
structs[i].v += structs[i].a;
}
}
static void updateClasses1()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
classes[i].updatep();
}
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
classes[i].updatev();
}
}
static void updateClasses2()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
classes[i].update();
}
}
static void updateClasses3()
{
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
classes[i].p += classes[i].v;
classes[i].v += classes[i].a;
}
}
static Stopwatch sw = new Stopwatch();
static void Start(string phase)
{
Console.Write(phase + ": ");
sw.Restart();
}
static void Finish()
{
Console.WriteLine(sw.ElapsedMilliseconds + " ms");
}
static void Main(string[] args)
{
Random rnd = new Random(0);
for (int i = 0; i < SAMPLE_SIZE; ++i)
{
structs[i] = new Struct();
classes[i] = new Class();
classes[i].p = structs[i].p = parr[i] = rnd.Next();
classes[i].v = structs[i].v = varr[i] = rnd.Next();
classes[i].a = structs[i].a = aarr[i] = rnd.Next();
}
for (int run = 1; run <= 3; ++run)
{
Start("arrays separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays1();
}
Finish();
Start("arrays together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays2(); // bug fix RE: Kermel
}
Finish();
Start("arrays unsafe");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays3();
}
Finish();
Start("arrays with refs");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateArrays4();
}
Finish();
Start("structs separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateStructs1();
}
Finish();
Start("structs together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateStructs2();
}
Finish();
Start("classes separate");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateClasses1();
}
Finish();
Start("classes together");
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateClasses2();
}
Finish();
Start("classes inlined"); // RE: Andre_Prellwitz
for (int r = 1; r <= TEST_RUNS; ++r)
{
updateClasses3();
}
Finish();
}
}
}
}
C++结果
arrays separate: 75 ms
arrays together: 76 ms
structs separate: 182 ms
structs together: 100 ms
header classes separate: 182 ms
header classes together: 98 ms
source classes separate: 189 ms
source classes together: 100 ms
你可以看到,使用原始数组的速度快得多,比使用结构体或类的单独技术快2倍。注意,LTCG 完美地内联了 Source 类的代码,并且类的速度比结构体略快。有趣。
C#结果
arrays separate: 129 ms
arrays together: 112 ms
arrays unsafe: 91 ms
arrays with refs: 90 ms
structs separate: 196 ms
structs together: 104 ms
classes separate: 384 ms
classes together: 191 ms
classes inlined: 204 ms
C#中的数组并没有像C++中那样比其他方法有很大的性能提升。请注意,C#结构体的性能与C++类相当。并且C#结构体比C#数组快。奇怪。不出所料,C#类的性能不佳,尤其是在更新分离的情况下。
结论和关注点
- 如果你想要最佳的数据处理性能,C/C++中的并行数组是最佳选择。你一开始就应该知道这一点。
- 如果这种代码模式不可接受,那么使用C++类或C#结构体是不错的选择。代码更易于管理,并且性能良好。
- 应该避免使用C#类。密封它们并内联它们的代码并不能使其物有所值。
- 对类或结构体中的不同字段使用单独的循环来处理并不会带来任何好处,并且应该避免,因为它会对代码质量产生负面影响。在这个用例中,数据缓存似乎比代码缓存重要得多。
- 在C#中,对于并行数组,你可以通过使用局部变量来“固定”数据来获得与不安全代码相同的性能,同时处理它。
历史
- 2021年8月13日:初始版本
- 2021年8月26日:采纳了Andre_Prellwitz的评论,并将其应用于代码和文章