C 语言优化问题：内存写耗时测试

/*
 * Timing variant: step between rows is strideD.
 *
 * For each column w in [0, width) and each row h in [0, height), adds the
 * element immediately to its left into pDst[w + h * strideD]. Columns are
 * handled left to right, so a column accumulates the already-updated values
 * of the column before it.
 *
 * pDst    - points at column 0 of row 0. The element at index -1 of every
 *           visited row is read, so it must be valid memory.
 * strideD - distance between consecutive rows, in uint32_t elements.
 * width   - number of columns to process.
 * height  - number of rows to process.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t rowsLeft = height; rowsLeft > 0; --rowsLeft) {
            *cell += *(cell - 1);
            cell += strideD;    /* move down one row */
        }
    }
}

按上面的代码写，Release 模式下耗时 0.072 ms，pDstTmp 每次移动 strideD（1000 以上的大数）个元素；

/*
 * Timing variant: the destination pointer is never advanced.
 *
 * For each column w in [0, width), adds pDst[w - 1] into pDst[w] exactly
 * `height` times, so every inner iteration reads and writes the same element.
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant.
 * width   - number of columns to process.
 * height  - number of repeated accumulations per column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
        }
    }
}

但是如果这么写，即反复对同一块内存进行写操作，耗时是 0.15 ms：虽然少了一行代码，耗时却是前面的两倍；

/*
 * Timing variant: fixed step of 2 elements between inner iterations.
 *
 * For each column w in [0, width) and each h in [0, height), adds
 * pDst[w + 2 * h - 1] into pDst[w + 2 * h].
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant (the step is hard-coded).
 * width   - number of starting columns.
 * height  - number of steps taken from each starting column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
            cell += 2;
        }
    }
}

这么写，耗时0.03ms;

/*
 * Timing variant: fixed step of 1 element between inner iterations.
 *
 * For each column w in [0, width) and each h in [0, height), adds
 * pDst[w + h - 1] into pDst[w + h]. Each inner iteration therefore reads the
 * element the previous iteration has just written.
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant (the step is hard-coded).
 * width   - number of starting columns.
 * height  - number of steps taken from each starting column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
            ++cell;
        }
    }
}

但是这么写，耗时是0.148ms；

此外还测了步长为 +3、+4、+5……的实验。暂时的结果是：对同一块内存反复写非常耗时；每次偏移 1 个单位时仍然耗时；当每次偏移 2 个单位时，耗时急剧下降；但随着偏移步长（即 strideD 的取值）变大（位数的变化），耗时又会慢慢变大，如下：

/*
 * Timing variant: fixed step of 254 elements between inner iterations.
 *
 * For each column w in [0, width) and each h in [0, height), adds
 * pDst[w + 254 * h - 1] into pDst[w + 254 * h].
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant (the step is hard-coded).
 * width   - number of starting columns.
 * height  - number of steps taken from each starting column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
            cell += 254;
        }
    }
}

耗时0.035ms;

/*
 * Timing variant: fixed step of 300 elements between inner iterations.
 *
 * For each column w in [0, width) and each h in [0, height), adds
 * pDst[w + 300 * h - 1] into pDst[w + 300 * h].
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant (the step is hard-coded).
 * width   - number of starting columns.
 * height  - number of steps taken from each starting column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
            cell += 300;
        }
    }
}

耗时0.055ms

/*
 * Timing variant: fixed step of 700 elements between inner iterations.
 *
 * For each column w in [0, width) and each h in [0, height), adds
 * pDst[w + 700 * h - 1] into pDst[w + 700 * h].
 *
 * pDst    - points at column 0; pDst[-1] is read, so it must be valid memory.
 * strideD - not used by this variant (the step is hard-coded).
 * width   - number of starting columns.
 * height  - number of steps taken from each starting column.
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
    (void)strideD;    /* kept only so the signature matches the other variants */

    for (int32_t col = 0; col < width; ++col) {
        uint32_t *cell = pDst + col;

        for (int32_t remaining = height; remaining > 0; --remaining) {
            *cell += *(cell - 1);
            cell += 700;
        }
    }
}

耗时0.086ms...

结论：当下一次的计算依赖上一次的结果时，速度会变慢，因为上一次的结果还在计算中，本次计算需要等待，例如：

/* Excerpt of an inner loop; pDstTmp and height are declared outside this excerpt.
 * pDstTmp is not advanced, so every iteration reads and writes the same
 * element pDstTmp[0], adding pDstTmp[-1] to it each time. */
for (int32_t h = 0; h < height; h++)
{
pDstTmp[0] += pDstTmp[-1];
}

这个循环反复对 pDstTmp[0] 这块内存进行读写，第二次写操作可能要等第一次写操作完成之后才能开始；

而，

/* Excerpt of an inner loop; pDstTmp and height are declared outside this excerpt.
 * The pointer advances by one element per iteration, so the pDstTmp[-1] read
 * in one iteration is the pDstTmp[0] written by the previous iteration. */
for (int32_t h = 0; h < height; h++)
{
pDstTmp[0] += pDstTmp[-1];
pDstTmp += 1 ;
}

这个程序和前者的区别是：第二次迭代读取的 pDstTmp[-1] 就是第一次迭代写入的 pDstTmp[0]。同样，在第一次对 pDstTmp[0] 的写入结束之前，第二次迭代无法把 pDstTmp[-1] 累加到新的 pDstTmp[0] 上，需要等待。

总之：

1，写内存的速度相对于比较操作、算术操作等要慢得多；

2，

/* Excerpt of an inner loop; pDstTmp and height are declared outside this excerpt.
 * Each iteration adds pDstTmp[-1] into pDstTmp[0] and then moves the pointer
 * forward by one element, so the next iteration's pDstTmp[-1] is the value
 * just written. */
for (int32_t h = 0; h < height; h++)
{
pDstTmp[0] += pDstTmp[-1];
pDstTmp += 1 ;
}

在 pDstTmp[0] += pDstTmp[-1]; 的计算结果出来之前，pDstTmp += 1;、h < height、h++ 这些指令就已经执行完毕，很快又执行到下一次的 pDstTmp[0] += pDstTmp[-1]; 指令，但是此时上一次的计算还没有结束，只能等待。

3，为了提高程序的运行速度，应尽量避免编写具有内存依赖的程序；或者让具有内存依赖的指令相隔远一些，中间可以插入别的指令。这和写汇编是一个道理，比如应尽量避免将多个 paddw 指令连续放在一起，而是最好和别的指令交叉使用（在不影响功能的情况下）。

c 语言优化问题-内存写耗时测试

浏览过的版块