在学习 CUDA 时,我正在做一个计算移动平均线的小项目。虽然我的简单移动平均线 (SMA) 工作正常(尽管速度慢且未优化),但我的指数移动平均线 (EMA) 总是导致错误的数字。
我发现问题在于它*(ema + i - 1)始终为 0。同样的数组访问概念在测试 C++ 文件中效果很好,但在我的 CUDA 应用程序中却不行。我想我只是不知道一些关于指针或 CUDA 的概念。
using namespace std;
// simple_ma not included
void __global__ exponential_ma(int n, int period, float *data, float *ema){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i == 0){
*ema = *data;
}else if(i < n){
float k = 2.0f/(period+1);
*(ema + i) = *(data + i)*k + *(ema + i - 1) * (1.0f-k);
// PROBLEM OCCURS ON THE LINE ABOVE, neither does ema[i-1] work
}
}
int main(){
/**
* Function that computes a moving average on a vector
*/
int N = 1<<5; // data size
cout << "N = " << N << " bytes = " << N*sizeof(float) << endl;
int period = 10; // moving average period
// malloc'ed for stack usage instead of small heap size
float *data = (float*)malloc(N*sizeof(float));
float *sma = (float*)malloc(N*sizeof(float));
float *ema = (float*)malloc(N*sizeof(float));
float *d_data; // device pointer for data
float *d_sma; // device pointer for simple moving average
float *d_ema; // device pointer for exponential moving average
// CUDA allocate memory for data, SMA, and EMA
cudaMalloc(&d_data, N*sizeof(float));
cudaMalloc(&d_sma, N*sizeof(float));
cudaMalloc(&d_ema, N*sizeof(float));
// initialize data
srand(time(0));
data[0] = rand() % 100 + 50;
for(int i = 1; i < N; i++){
data[i] = data[i-1] + rand() % 11 - 5;
}
// copy data from host to device
cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_sma, sma, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_ema, ema, N*sizeof(float), cudaMemcpyHostToDevice);
// call device function
simple_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_sma);
exponential_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_ema);
cudaMemcpy(sma, d_sma, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(ema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < N; i += 1){
cout << "i = " << i << " data = "<< data[i] << " ---sma---> " << sma[i] << " ---ema---> " << ema[i] << endl;
}
cudaFree(d_data);
cudaFree(d_sma);
cudaFree(d_ema);
return 0;
}