我在 Ada 中有一个代码,必须使用 CUDA 而不使用 Ada 绑定。所以我做了一个接口,允许Ada程序调用C代码。现在我想编译它。
如何告诉 gprbuild 不要使用 gcc.cu
通过 nvcc 编译文件?如果不可能,也许我必须使用 nvcc 生成对象,然后将它们与 ada 代码链接?你会怎么做?
编辑:使用 Simon Wright 提供的链接,我制作了这个 gpr 文件:
project Cuda_Interface is
for Languages use ("Ada", "Cuda");
for Source_Dirs use ("src");
for Object_Dir use "obj";
for Exec_Dir use ".";
for Main use ("cuda_interface.adb");
for Create_Missing_Dirs use "True";
package Naming is
for Body_Suffix("Cuda") use ".cu";
for Spec_Suffix("Cuda") use ".cuh";
end Naming;
package Compiler is
for Driver("Cuda") use "nvcc";
for Leading_Required_Switches("Cuda") use ("-c");
end Compiler;
package Linker is
for Default_Switches("Ada") use ("-L/usr/local/cuda/lib64", "-lcuda", "-lcudart", "-lm");
end Linker;
end Cuda_Interface;
编译运行良好,但链接器返回此错误:
/usr/bin/ld : cuda_interface.o : in the function « _ada_cuda_interface » :
cuda_interface.adb:(.text+0x3a5) : undefined reference to « inter_add_two »
collect2: error: ld returned 1 exit status
gprbuild: link of cuda_interface.adb failed
cuda_interface.adb:
with Ada.Text_IO; use Ada.Text_IO;
procedure Cuda_Interface is
type Index is range 1 .. 5;
type Element_Type is new Natural;
type Array_Type is array (Index) of Element_Type;
procedure Inter_Add_Two(Arr : in out Array_Type; Length : Index)
with
Import => True,
Convention => C,
External_Name => "inter_add_two";
A : Array_Type := (1, 2, 3, 4, 5);
begin
for I in Index loop
Put_Line("Value at "
& Index'Image(I)
& " is "
& Element_Type'Image(A(I)));
end loop;
New_Line;
Inter_Add_Two(A, Index'Last);
for I in Index loop
Put_Line("Value at "
& Index'Image(I)
& " is "
& Element_Type'Image(A(I)));
end loop;
end Cuda_Interface;
内核.cuh
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__
#include <cuda.h>
__global__ void kernel_add_two(unsigned int *a, unsigned int length);
void inter_add_two(unsigned int *a, unsigned int length);
#endif // __KERNEL_CUH__
内核.cu
#include "kernel.cuh"
#include <math.h>
#define THREADS_PER_BLOCK (1024)
__global__ void kernel_add_two(unsigned int *a, unsigned int length)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < length) a[tid] += 2;
}
void inter_add_two(unsigned int *a, unsigned int length)
{
unsigned int block_number = ceil(((float)length) / THREADS_PER_BLOCK);
unsigned int *d_a;
cudaMalloc((void**)&d_a, sizeof(unsigned int) * length);
cudaMemcpy(d_a, a, sizeof(unsigned int) * length, cudaMemcpyHostToDevice);
kernel_add_two<<<block_number, THREADS_PER_BLOCK>>>(d_a, length);
cudaMemcpy(a, d_a, sizeof(unsigned int) * length, cudaMemcpyDeviceToHost);
cudaFree(d_a);
}