#include <stdio.h>
#include <stdlib.h>

#include "math_functions.h"
#include "device_functions.h"
/*
 * Computes the sine and cosine of one input element per thread.
 *
 * Launch layout: 1D grid of 1D blocks; thread i handles element i, where
 * i = blockDim.x * blockIdx.x + threadIdx.x.
 *
 * a    device pointer to the input angles (sincos interprets them as radians)
 * SIN  device pointer receiving sin(a[i]) at index i
 * COS  device pointer receiving cos(a[i]) at index i
 * n    optional element count; threads with i >= n do nothing. The default
 *      of -1 disables the guard, in which case all three buffers must hold
 *      at least gridDim.x * blockDim.x elements.
 */
__global__ void VecAdd(double* a, double* SIN, double* COS, int n = -1)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Skip tail threads when the caller supplied the element count.
    if (n >= 0 && i >= n)
        return;

    // sincos takes the angle by value; the original passed the pointer itself.
    sincos(a[i], SIN + i, COS + i);
}
/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: fills a small host array with angles, evaluates sin/cos for
 * every element on the GPU through the VecAdd kernel, copies the results
 * back and prints one line per element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if an allocation or a CUDA
 * call fails.
 */
int main()
{
    const int NUM = 5;
    const size_t SIZE = sizeof(double) * NUM;

    // Launch configuration: never launch more threads per block than there
    // are elements, and round the block count up.
    const int maxthreadperblock = 512;
    const int threadsPerBlock = NUM < maxthreadperblock ? NUM : maxthreadperblock;
    const int blochpergird = (NUM - 1) / threadsPerBlock + 1;

    // Device buffers are sized for every launched thread, so threads past
    // NUM (if any) still read and write inside the allocations.
    const size_t PADDED_SIZE = sizeof(double) * (size_t)blochpergird * (size_t)threadsPerBlock;

    double* ah = (double*)malloc(SIZE);
    double* sinHost = (double*)malloc(SIZE);
    double* cosHost = (double*)malloc(SIZE);
    if (ah == NULL || sinHost == NULL || cosHost == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(ah);
        free(sinHost);
        free(cosHost);
        return EXIT_FAILURE;
    }

    ah[0] = 0;
    ah[1] = 90;          // NOTE: sincos works in radians, so this is 90 rad, not 90 degrees
    ah[2] = 3.1415926;
    ah[3] = 1.247155;
    ah[4] = 3.109967;

    double* ad = NULL;
    double* sind = NULL;
    double* cosd = NULL;
    checkCuda(cudaMalloc((void**)&ad, PADDED_SIZE), "cudaMalloc(ad)");
    checkCuda(cudaMalloc((void**)&sind, PADDED_SIZE), "cudaMalloc(sind)");
    checkCuda(cudaMalloc((void**)&cosd, PADDED_SIZE), "cudaMalloc(cosd)");

    // All-zero bytes are 0.0 for doubles: gives the padding a defined value.
    checkCuda(cudaMemset(ad, 0, PADDED_SIZE), "cudaMemset(ad)");
    checkCuda(cudaMemcpy(ad, ah, SIZE, cudaMemcpyHostToDevice), "copy input to device");

    VecAdd<<<blochpergird, threadsPerBlock>>>(ad, sind, cosd);
    checkCuda(cudaGetLastError(), "VecAdd launch");         // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "VecAdd execution"); // faults inside the kernel

    checkCuda(cudaMemcpy(sinHost, sind, SIZE, cudaMemcpyDeviceToHost), "copy sin to host");
    checkCuda(cudaMemcpy(cosHost, cosd, SIZE, cudaMemcpyDeviceToHost), "copy cos to host");

    for (int i = 0; i < NUM; i++)
    {
        // Print the i-th elements; the original passed the array pointers.
        printf("a=%f sin=%f cos=%f\n", ah[i], sinHost[i], cosHost[i]);
    }

    checkCuda(cudaFree(ad), "cudaFree(ad)");
    checkCuda(cudaFree(sind), "cudaFree(sind)");
    checkCuda(cudaFree(cosd), "cudaFree(cosd)");
    free(ah);
    free(sinHost);
    free(cosHost);
    return 0;
}
这是一个很简单的调用三角函数(sincos)的实现,可以自己编译测试一下。顺便可以查查文献,看看到底是怎么回事,我也有兴趣了解一下。
随便说说,未做考证:一个 CUDA 程序里的函数,如果只是主机端(CPU)代码,它就不可能在 GPU 上执行;如果是设备端(GPU)代码,那就要看 kernel 在编译、链接时能不能找到这段代码。
[ 本帖最后由 kanshengzhe 于 2011-3-18 11:14 编辑 ]
|