0

在学习 CUDA 时,我正在做一个计算移动平均线的小项目。虽然我的简单移动平均线 (SMA) 工作正常(尽管速度慢且未优化),但我的指数移动平均线 (EMA) 总是导致错误的数字。

我发现问题在于它*(ema + i - 1)始终为 0。同样的数组访问概念在测试 C++ 文件中效果很好,但在我的 CUDA 应用程序中却不行。我想我只是不知道一些关于指针或 CUDA 的概念。


using namespace std;

// simple_ma not included

void __global__ exponential_ma(int n, int period, float *data, float *ema){
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if(i == 0){
        *ema = *data;
    }else if(i < n){

        float k = 2.0f/(period+1);
        *(ema + i) = *(data + i)*k + *(ema + i - 1) * (1.0f-k);
        // PROBLEM OCCURS ON THE LINE ABOVE, neither does ema[i-1] work
    }
}

int main(){
    /**
     * Function that computes a moving average on a vector
     */
    int N = 1<<5; // data size
    cout << "N = " << N << " bytes = " << N*sizeof(float) << endl;
    int period = 10; // moving average period

    // malloc'ed for stack usage instead of small heap size
    float *data = (float*)malloc(N*sizeof(float));
    float *sma = (float*)malloc(N*sizeof(float));
    float *ema = (float*)malloc(N*sizeof(float));

    float *d_data; // device pointer for data
    float *d_sma; // device pointer for simple moving average
    float *d_ema; // device pointer for exponential moving average

    // CUDA allocate memory for data, SMA, and EMA
    cudaMalloc(&d_data, N*sizeof(float));
    cudaMalloc(&d_sma, N*sizeof(float));
    cudaMalloc(&d_ema, N*sizeof(float));

    // initialize data
    srand(time(0));
    data[0] = rand() % 100 + 50;
    for(int i = 1; i < N; i++){
        data[i] = data[i-1] + rand() % 11 - 5;
    }

    // copy data from host to device
    cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sma, sma, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_ema, ema, N*sizeof(float), cudaMemcpyHostToDevice);

    // call device function
    simple_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_sma);
    exponential_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_ema);

    cudaMemcpy(sma, d_sma, N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(ema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i += 1){
        cout << "i = " << i << " data = "<< data[i] << " ---sma---> " << sma[i] << " ---ema---> " << ema[i] << endl;
    }

    cudaFree(d_data);
    cudaFree(d_sma);
    cudaFree(d_ema);

    return 0;
}
4

1 回答 1

3

CUDA 中的线程可以按任何顺序执行。当您尝试在另一个线程中计算时(这取决于计算是否完成),计算ema[i-1]可能尚未开始。您用于此算法的简单串行实现的方法不会以线程并行方式工作ema[i]ema[i-1]

考虑到这一点,这是一种可能的方法。

首先,重铸您的递归 ema 计算:

     ema[0] = data[0]
i>0: ema[i] = k*data[i]+(1-k)*ema[i-1]

非递归形式:

     ema[0] = data[0]
i>0: ema[i] = ((1-k)^i)*data[0] + ∑(((1-k)^x)*k*data[i-x])  
                                  x=0..i-1

这将告诉我们如何编写我们的 CUDA 内核代码。如果这种转换对您来说似乎很模糊,您可能希望创建一个包含序列前几个条目的表,类似于此答案中描述的方法。

它可以工作,但是每个线程都在遍历整个输入数组直到它的索引。将有一个线程块(具有最高数组索引)比所有其他线程块花费更长的时间。最坏情况下的线程所做的工作与串行版本大致相同,因此不是一个非常有趣的并行实现。

为了解决这个问题,我们可以对非递归形式方程进行观察。根据您的代码,该术语(1.0 - k)始终小于 1,因为k2 除以某个大于 2 的正整数(即我们假设period为 2 或更大)。因此,(1.0 - k)^x随着求和的进行,该项最终会变得非常小。我们还将假设您的数据在范围内,大致如您所示。在这种情况下,随着求和的进行,最终求和的项对float总和数量没有明显影响。有了这些假设,当我们的(1.0 - k)^x项变得足够小以至于对结果没有显着影响时,我们将减少循环处理。

通过这些假设和修改,我们可以创建一个运行速度比原始串行 CPU 版本更快的 CUDA 代码,同时保持较小的误差范围。

$ cat t1444.cu
#include <iostream>
#include <cstdio>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}


__global__ void gpu_ema(const int n, const float k, const float * __restrict__ data, float * __restrict__ ema, const float tol){
  int i = blockIdx.x*blockDim.x+threadIdx.x;
  if (i == 0) ema[0] = data[0];
  else if (i < n){
    float sum = 0;
    float fac = 1.0f - k;
    float m = 1.0f;
    int j;
    for (j = 0; j < i; j++){
      sum += m*k*data[i-j];
      m *= fac;
      if (m < tol) break;  // early exit validity depends on a variety of assumptions
      }
    if (j == i) sum += m*data[0];
    ema[i] = sum;
    }
}

void cpu_ema(int n, int period, float *data, float *ema){
  ema[0] = data[0];
  float k = 2.0f/(period+1);
  for (int i = 1; i < n; i++)
    ema[i] = data[i]*k + ema[i-1]*(1.0f-k);
}
int main(){
    /**
     * Function that computes a moving average on a vector
     */
    int N = 1<<20; // data size
    std::cout << "N = " << N << " bytes = " << N*sizeof(float) << std::endl;
    int period = 10; // moving average period

    // malloc'ed for stack usage instead of small heap size
    float *data = (float*)malloc(N*sizeof(float));
    float *ema = (float*)malloc(N*sizeof(float));
    float *gema = (float*)malloc(N*sizeof(float));

    float *d_data; // device pointer for data
    float *d_ema; // device pointer for exponential moving average

    // CUDA allocate memory for data, SMA, and EMA
    cudaMalloc(&d_data, N*sizeof(float));
    cudaMalloc(&d_ema, N*sizeof(float));

    // initialize data
    srand(time(0));
    data[0] = rand() % 100 + 50;
    for(int i = 1; i < N; i++){
        data[i] = data[i-1] + rand() % 11 - 5;
    }

    // copy data from host to device
    cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);

    // call device function
    long long gpu_t = dtime_usec(0);
    gpu_ema<<<(N+255)/256, 256>>>(N, 2.0f/(period+1), d_data, d_ema, 1e-7);
    cudaDeviceSynchronize();
    gpu_t = dtime_usec(gpu_t);
    long long cpu_t = dtime_usec(0);
    cpu_ema(N, period, data, ema);
    cpu_t = dtime_usec(cpu_t);
    if (N < 33)
      for (int i = 0; i < N; i++)
        std::cout << ema[i] << ",";
    std::cout << std::endl;

    cudaMemcpy(gema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("some CUDA error");
    if (N < 33)
      for(int i = 0; i < N; i += 1)
          std::cout << gema[i] << ",";
    std::cout << std::endl;
    float max_err = fabs(gema[0] - ema[0])/ema[0];
    for (int i = 1; i < N; i++)
      max_err = max(max_err, fabs(gema[i] - ema[i])/ema[0]);
    std::cout << "max err: " << max_err*100.0 << "% final gpu: " << gema[N-1] << " final cpu: " << ema[N-1] << std::endl;
    std::cout << "cpu time: " << cpu_t/(float)USECPSEC << "s gpu time: " << gpu_t/(float)USECPSEC  << "s" << std::endl;
    cudaFree(d_data);
    cudaFree(d_ema);

    return 0;
}
$ nvcc -o t1444 t1444.cu
$ ./t1444
N = 1048576 bytes = 4194304


max err: 0.00218633% final gpu: 1311.38 final cpu: 1311.38
cpu time: 0.006346s gpu time: 0.000214s
$

特斯拉 V100,CUDA 10.1

重复一遍,上述代码的有效性与性能增强提前退出取决于范围有限的输入数据。我不会尝试仔细地涵盖统计信息,但是如果您不了解输入数据的统计信息,那么上述方法可能无效。

于 2019-07-06T21:16:49.253 回答