我正在实时处理我尝试使用 GeForce GTX 960M 处理的视频流。(Windows 10、VS 2013、CUDA 8.0)
每一帧都必须被捕捉,轻微模糊,只要有可能,我需要对最新的 10 帧进行一些艰苦的计算。所以我需要以 30 fps 的速度捕获所有帧,并且我希望以 5 fps 的速度获得艰苦的工作结果。
我的问题是我无法保持捕获以正确的速度运行:无论是在 CPU 级别还是在 GPU 级别,艰苦的计算似乎都会减慢帧的捕获速度。我错过了一些帧...
我尝试了很多解决方案。没有工作:
- 我尝试在 2 个流上设置作业(下图):
- 主机得到一个框架
- 第一个流(称为 Stream2):cudaMemcpyAsync 将帧复制到设备上。然后,第一个内核进行基本的模糊计算。(在附图中,模糊显示为 3.07 秒和 3.085 秒处的一个短槽。然后什么都没有……直到大部分完成)
- 由于 CudaEvent,主机检查第二个流是否“可用”,并在可能的情况下启动它。实际上,流在 1/2 的尝试中可用。
- 第二个流(称为 Stream4):在内核(kernelCalcul_W2)中开始艰苦的计算,输出结果,并记录一个事件。
实际上,我写道:
cudaStream_t sHigh, sLow;
cudaStreamCreateWithPriority(&sHigh, cudaStreamNonBlocking, priority_high);
cudaStreamCreateWithPriority(&sLow, cudaStreamNonBlocking, priority_low);
cudaEvent_t event_1;
cudaEventCreate(&event_1);
if (frame has arrived)
{
cudaMemcpyAsync(..., sHigh); // HtoD, to upload images in the GPU
blur_Image <<<... , sHigh>>> (...)
if (cudaEventQuery(event_1)==cudaSuccess)) hard_work(sLow);
else printf("Event 2 not ready\n");
}
void hard_work( cudaStream_t sLow_)
{
kernelCalcul_W2<<<... , sLow_>>> (...);
cudaMemcpyAsync(... the result..., sLow_); //DtoH
cudaEventRecord(event_1, sLow_);
}
- 我试图只使用一个流。与上面的代码相同,但在启动 hard_work 时更改了 1 个参数。
- 主机得到一个框架
- 流:cudaMemcpyAsync 将帧复制到设备上。然后,内核进行基本的模糊计算。然后,如果 CudaEvent Event_1 没问题,我会努力工作,并添加一个 Event_1 以获得下一轮的状态。实际上,流始终可用:我从不属于“其他”部分。
这样,在努力工作的同时,我希望“缓冲”所有要复制的帧,而不会丢失任何帧。但我确实失去了一些:事实证明,每次我得到一个帧并复制它时,Event_1 似乎还可以,所以我开始努力工作,直到很晚才得到下一帧。
- 我试图将两个流放在两个不同的线程中(在 C 中)。没有更好(甚至更糟)。
所以问题是:如何确保第一个流捕获所有帧?我真的感觉不同的流会阻塞 CPU。
我用 OpenGL 显示图像。会不会干扰?
有什么改进方法的想法吗?非常感谢!
编辑: 根据要求,我在这里放了一个 MCVE。
您可以调整一个参数 (#define ADJUST) 以查看发生了什么。基本上,主程序以异步模式发送 CUDA 请求,但它似乎阻塞了主线程。正如您将在图像中看到的那样,我每 30 毫秒进行一次“内存访问”(即捕获的图像),除非正在运行艰苦的工作(然后,我只是没有得到图像)。
最后一个细节:我正在使用 CUDA 7.5 来运行它。我试图安装 8.0 但显然编译器仍然是 7.5
#define _USE_MATH_DEFINES 1
#define _CRT_SECURE_NO_WARNINGS 1
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <Windows.h>
#define ADJUST 400
// adjusting this paramter may make the problem occur.
// Too high => probably watchdog will stop the kernel
// too low => probably the kernel will run smothly
unsigned short * images_as_Unsigned_in_Host;
unsigned short * Images_as_Unsigned_in_Device;
unsigned short * camera;
float * images_as_Output_in_Host;
float * Images_as_Float_in_Device;
float * imageOutput_in_Device;
unsigned short imageWidth, imageHeight, totNbOfImages, imageSlot;
unsigned long imagePixelSize;
unsigned short lastImageFromCamera;
cudaStream_t s1, s2;
cudaEvent_t event_2;
clock_t timeRef;
// Basically, in the middle of the image, I average the values. I removed the logic behind to make it simpler.
// This kernel runs fast, and that's the point.
__global__ void blurImage(unsigned short * Images_as_Unsigned_in_Device_, float * Images_as_Float_in_Device_, unsigned short imageWidth_,
unsigned long imagePixelSize_, short blur_distance)
{
// we start from 'blur_distance' from the edge
// p0 is the point we will calculate. p is a pointer which will move around for average
unsigned long p0 = (threadIdx.x + blur_distance) + (blockIdx.x + blur_distance) * imageWidth_;
unsigned long p = p0;
unsigned short * us;
if (p >= imagePixelSize_) return;
unsigned long tot = 0;
short a, b, n, k;
k = 0;
// p starts from the top edge and will move to the right-bottom
p -= blur_distance + blur_distance * imageWidth_;
us = Images_as_Unsigned_in_Device_ + p;
for (a = 2 * blur_distance; a >= 0; a--)
{
for (b = 2 * blur_distance; b >= 0; b--)
{
n = *us;
if (n > 0) { tot += n; k++; }
us++;
}
us += imageWidth_ - 2 * blur_distance - 1;
}
if (k > 0) Images_as_Float_in_Device_[p0] = (float)tot / (float)k;
else Images_as_Float_in_Device_[p0] = 128.f;
}
__global__ void kernelCalcul_W2(float *inputImage, float *outputImage, unsigned long imagePixelSize_, unsigned short imageWidth_, unsigned short slot, unsigned short totImages)
{
// point the pixel and crunch it
unsigned long p = threadIdx.x + blockIdx.x * imageWidth_;
if (p >= imagePixelSize_) { return; }
float result;
long a, b, n, n0;
float input;
b = 3;
// this is not the right algorithm (which is pretty complex).
// I know this is not optimal in terms of memory management. Still, I want a "long" calculation here so I don't care...
for (n = 0; n < 10; n++)
{
n0 = slot - n;
if (n0 < 0) n0 += totImages;
input = inputImage[p + n0 * imagePixelSize_];
for (a = 0; a < ADJUST ; a++)
result += pow(input, inputImage[a + n0 * imagePixelSize_]) * cos(input);
}
outputImage[p] = result;
}
void hard_work( cudaStream_t s){
cudaError err;
// launch the hard work
printf("Hard work is launched after image %d is captured ==> ", imageSlot);
kernelCalcul_W2 << <340, 500, 0, s >> >(Images_as_Float_in_Device, imageOutput_in_Device, imagePixelSize, imageWidth, imageSlot, totNbOfImages);
err = cudaPeekAtLastError();
if (err != cudaSuccess) printf( "running error: %s \n", cudaGetErrorString(err));
else printf("running ok\n");
// copy the result back to Host
//printf(" %p %p \n", images_as_Output_in_Host, imageOutput_in_Device);
cudaMemcpyAsync(images_as_Output_in_Host, imageOutput_in_Device, sizeof(float) * imagePixelSize, cudaMemcpyDeviceToHost, s);
cudaEventRecord(event_2, s);
}
void createStorageSpace()
{
imageWidth = 640;
imageHeight = 480;
totNbOfImages = 300;
imageSlot = 0;
imagePixelSize = 640 * 480;
lastImageFromCamera = 0;
camera = (unsigned short *)malloc(imagePixelSize * sizeof(unsigned short));
for (int i = 0; i < imagePixelSize; i++) camera[i] = rand() % 255;
// storing the images in the Host memory. I know I could optimize with cudaHostAllocate.
images_as_Unsigned_in_Host = (unsigned short *) malloc(imagePixelSize * sizeof(unsigned short) * totNbOfImages);
images_as_Output_in_Host = (float *)malloc(imagePixelSize * sizeof(float));
cudaMalloc(&Images_as_Unsigned_in_Device, imagePixelSize * sizeof(unsigned short) * totNbOfImages);
cudaMalloc(&Images_as_Float_in_Device, imagePixelSize * sizeof(float) * totNbOfImages);
cudaMalloc(&imageOutput_in_Device, imagePixelSize * sizeof(float));
int priority_high, priority_low;
cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high);
cudaStreamCreateWithPriority(&s1, cudaStreamNonBlocking, priority_high);
cudaStreamCreateWithPriority(&s2, cudaStreamNonBlocking, priority_low);
cudaEventCreate(&event_2);
}
void releaseMapFile()
{
cudaFree(Images_as_Unsigned_in_Device);
cudaFree(Images_as_Float_in_Device);
cudaFree(imageOutput_in_Device);
free(images_as_Output_in_Host);
free(camera);
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
cudaEventDestroy(event_2);
}
void putImageCUDA(const void * data)
{
// We put the image in a round-robin. The slot to put the image is imageSlot
printf("\nDealing with image %d\n", imageSlot);
// Copy the image in the Round Robin
cudaMemcpyAsync(Images_as_Unsigned_in_Device + imageSlot * imagePixelSize, data, sizeof(unsigned short) * imagePixelSize, cudaMemcpyHostToDevice, s1);
// We will blur the image. Let's prepare the memory to get the results as floats
cudaMemsetAsync(Images_as_Float_in_Device + imageSlot * imagePixelSize, 0., sizeof(float) * imagePixelSize, s1);
// blur image
blurImage << <imageHeight - 140, imageWidth - 140, 0, s1 >> > (Images_as_Unsigned_in_Device + imageSlot * imagePixelSize,
Images_as_Float_in_Device + imageSlot * imagePixelSize,
imageWidth, imagePixelSize, 3);
// launches the hard-work
if (cudaEventQuery(event_2) == cudaSuccess) hard_work(s2);
else printf("Hard_work still running, so unable to process after image %d\n", imageSlot);
imageSlot++;
if (imageSlot >= totNbOfImages) {
imageSlot = 0;
}
}
int main()
{
createStorageSpace();
printf("The following loop is supposed to push images in the GPU and do calculations in Async mode, and to wait 30 ms before the next image, so we should have the output on the screen in 10 x 30 ms. But it's far slower...\nYou may adjust a #define ADJUST parameter to see what's happening.");
for (int i = 0; i < 10; i++)
{
putImageCUDA(camera); // Puts an image in the GPU, does the bluring, and tries to do the hard-work
Sleep(30); // to simulate Camera
}
releaseMapFile();
getchar();
}