编辑:出于参考目的(如果有人偶然发现这个问题),Igor Ostrovsky 写了一篇关于缓存未命中的精彩文章。它讨论了几个不同的问题并显示了示例编号。 结束编辑
我做了一些测试<long story goes here>,想知道性能差异是否是由于内存缓存未命中造成的。以下代码演示了该问题并将其归结为关键时序部分。以下代码有几个循环以随机顺序访问内存,然后以升序地址顺序访问内存。  
我在 XP 机器(用 VS2005 编译:cl /O2)和 Linux 机器(gcc –Os)上运行它。两者都产生了相似的时间。这些时间以毫秒为单位。我相信所有循环都在运行并且没有优化(否则它会“立即”运行)。
*** 测试 20000 个节点 总订购时间:888.822899 总随机时间:2155.846268
这些数字有意义吗?差异主要是由于 L1 缓存未命中还是还有其他原因?有 20,000^2 次内存访问,如果每个都是缓存未命中,则每次未命中大约为 3.2 纳秒。我测试的 XP (P4) 机器是 3.2GHz,我怀疑(但不知道)有 32KB L1 缓存和 512KB L2。对于 20,000 个条目 (80KB),我假设没有大量的 L2 未命中。所以这将是(3.2*10^9 cycles/second) * 3.2*10^-9 seconds/miss) = 10.1 cycles/miss。这对我来说似乎很高。也许不是,或者我的数学不好。我尝试用 VTune 测量缓存未命中,但我得到了一个 BSOD。现在我无法让它连接到许可证服务器(grrrr)。  
typedef struct stItem
{
   long     lData;
   //char     acPad[20];
} LIST_NODE;
#if defined( WIN32 )
void StartTimer( LONGLONG *pt1 )
{
   QueryPerformanceCounter( (LARGE_INTEGER*)pt1 );
}
void StopTimer( LONGLONG t1, double *pdMS )
{
   LONGLONG t2, llFreq;
   QueryPerformanceCounter( (LARGE_INTEGER*)&t2 );
   QueryPerformanceFrequency( (LARGE_INTEGER*)&llFreq );
   *pdMS = ((double)( t2 - t1 ) / (double)llFreq) * 1000.0;
}
#else
// doesn't need 64-bit integer in this case
void StartTimer( LONGLONG *pt1 )
{
   // Just use clock(), this test doesn't need higher resolution
   *pt1 = clock();
}
void StopTimer( LONGLONG t1, double *pdMS )
{
   LONGLONG t2 = clock();
   *pdMS = (double)( t2 - t1 ) / ( CLOCKS_PER_SEC / 1000 );
}
#endif
long longrand()
{
   #if defined( WIN32 )
   // Stupid cheesy way to make sure it is not just a 16-bit rand value
   return ( rand() << 16 ) | rand();
   #else
   return rand();
   #endif
}
// get random value in the given range
int randint( int m, int n )
{
   int ret = longrand() % ( n - m + 1 );
   return ret + m;
}
// I think I got this out of Programming Pearls (Bentley).
void ShuffleArray
(
   long *plShuffle,  // (O) return array of "randomly" ordered integers
   long lNumItems    // (I) length of array
)
{
   long i;
   long j;
   long t;
   for ( i = 0; i < lNumItems; i++ )
      plShuffle[i] = i;
   for ( i = 0; i < lNumItems; i++ )
      {
      j = randint( i, lNumItems - 1 );
      t = plShuffle[i];
      plShuffle[i] = plShuffle[j];
      plShuffle[j] = t;
      }
}
int main( int argc, char* argv[] )
{
   long          *plDataValues;
   LIST_NODE     *pstNodes;
   long          lNumItems = 20000;
   long          i, j;
   LONGLONG      t1;  // for timing
   double dms;
   if ( argc > 1 && atoi(argv[1]) > 0 )
      lNumItems = atoi( argv[1] );
   printf( "\n\n*** Testing %u nodes\n", lNumItems );
   srand( (unsigned int)time( 0 ));
   // allocate the nodes as one single chunk of memory
   pstNodes = (LIST_NODE*)malloc( lNumItems * sizeof( LIST_NODE ));
   assert( pstNodes != NULL );
   // Create an array that gives the access order for the nodes
   plDataValues = (long*)malloc( lNumItems * sizeof( long ));
   assert( plDataValues != NULL );
   // Access the data in order
   for ( i = 0; i < lNumItems; i++ )
      plDataValues[i] = i;
   StartTimer( &t1 );
   // Loop through and access the memory a bunch of times
   for ( j = 0; j < lNumItems; j++ )
      {
      for ( i = 0; i < lNumItems; i++ )
         {
         pstNodes[plDataValues[i]].lData = i * j;
         }
      }
   StopTimer( t1, &dms );
   printf( "Total Ordered Time: %f\n", dms );
   // now access the array positions in a "random" order
   ShuffleArray( plDataValues, lNumItems );
   StartTimer( &t1 );
   for ( j = 0; j < lNumItems; j++ )
      {
      for ( i = 0; i < lNumItems; i++ )
         {
         pstNodes[plDataValues[i]].lData = i * j;
         }
      }
   StopTimer( t1, &dms );
   printf( "Total Random Time: %f\n", dms );
}
    