我正在尝试进行 FFT 加内核计算。FFT : managedCUDA 库内核 calc : 自己的内核
C# 代码
public void cuFFTreconstruct() {
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");
float[] fData = new float[Resolution * Resolution * 2];
float[] result = new float[Resolution * Resolution * 2];
CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j) * 2] = i + j * 2;
fData[(i * Resolution + j) * 2 + 1] = 0.0f;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
}
}
ctx.FreeMemory(devData.DevicePointer);
ctx.FreeMemory(copy_devData.DevicePointer);
}
内核代码
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
__global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
}
}
但是,这个程序效果不佳。发生以下错误:
ErrorLaunchFailed:执行内核时设备发生异常。常见原因包括取消引用无效的设备指针和访问越界共享内存。上下文不能被使用,所以它必须被销毁(并且应该创建一个新的)。此上下文中的所有现有设备内存分配都是无效的,如果程序要继续使用 CUDA,则必须重新构建。