首页>Program>source

我想在CUDA中实现此原子功能:

__device__ float lowest;   // global var
__device__ int  lowIdx;    // global var
float realNum;   // thread reg var
int index;       // thread reg var
if(realNum < lowest) {
 lowest= realNum;  // the new lowest
 lowIdx= index;    // update the 'low' index
}

我不相信我可以用任何原子功能来做到这一点.我需要为几个指令锁定几个全局内存位置。 我可以用PTXAS(汇编)代码实现吗?

最新回答
  • 2021-1-11
    1 #

    正如我在上面第二条评论中所述,可以将两个32位量合并为一个64位原子管理量 ,然后以这种方式处理问题.然后,我们使用任意原子示例作为粗略指南,以原子方式管理64位数量.显然,您不能将这个想法扩展到两个32位数量之外.这是一个示例:

    #include <stdio.h>
    #define DSIZE 5000
    #define nTPB 256
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    typedef union  {
      float floats[2];                 // floats[0] = lowest
      int ints[2];                     // ints[1] = lowIdx
      unsigned long long int ulong;    // for atomic update
    } my_atomics;
    __device__ my_atomics test;
    __device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
    {
        my_atomics loc, loctest;
        loc.floats[0] = val1;
        loc.ints[1] = val2;
        loctest.ulong = *address;
        while (loctest.floats[0] >  val1) 
          loctest.ulong = atomicCAS(address, loctest.ulong,  loc.ulong);
        return loctest.ulong;
    }
    
    __global__ void min_test(const float* data)
    {
        int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
        if (idx < DSIZE)
          my_atomicMin(&(test.ulong), data[idx],idx);
    }
    int main() {
      float *d_data, *h_data;
      my_atomics my_init;
      my_init.floats[0] = 10.0f;
      my_init.ints[1] = DSIZE;
      h_data = (float *)malloc(DSIZE * sizeof(float));
      if (h_data == 0) {printf("malloc fail\n"); return 1;}
      cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
      cudaCheckErrors("cm1 fail");
      // create random floats between 0 and 1
      for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
      cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
      cudaCheckErrors("cmcp1 fail");
      cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
      cudaCheckErrors("cmcp2 fail");
      min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel fail");
      cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
      cudaCheckErrors("cmcp3 fail");
      printf("device min result = %f\n", my_init.floats[0]);
      printf("device idx result = %d\n", my_init.ints[1]);
      float host_val = 10.0f;
      int host_idx = DSIZE;
      for (int i=0; i<DSIZE; i++)
        if (h_data[i] < host_val){
          host_val = h_data[i];
          host_idx = i;
          }
      printf("host min result = %f\n", host_val);
      printf("host idx result = %d\n", host_idx);
      return 0;
    }
    

  • 2021-1-11
    2 #

    @Robert Crovella:很好的主意,但是我认为应该对功能进行如下修改:

    __device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
    {
        my_atomics loc, loctest, old;
        loc.floats[0] = val1;
        loc.ints[1] = val2;
        loctest.ulong = *address;
        old.ulong = loctest.ulong;
        while (loctest.floats[0] > val1){
            old.ulong = loctest.ulong;
            loctest.ulong = atomicCAS(address, loctest.ulong,  loc.ulong);
        }
        return old.ulong;
    }
    

相关问题

  • graphics:如何在Java中为游戏加倍缓冲?
  • multithreading:如何从另一个线程暂停和恢复Java中的线程