Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #ifndef OPENCV_GPU_EMULATION_HPP_
00044 #define OPENCV_GPU_EMULATION_HPP_
00045
00046 #include "warp_reduce.hpp"
00047 #include <stdio.h>
00048
namespace cv { namespace gpu { namespace device
{
    // Compile-time emulation layer for CUDA features that are missing on
    // older compute capabilities.  Each member selects, via __CUDA_ARCH__,
    // either the native intrinsic or a software fallback.
    struct Emulation
    {
        // Block-wide OR-reduction of a per-thread predicate.
        // SM 2.0+: maps directly onto the hardware __syncthreads_or().
        // Older devices: a stub that always returns 0 -- callers must not
        // rely on a meaningful result on pre-Fermi hardware.
        static __device__ __forceinline__ int syncthreadsOr(int pred)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
                // __syncthreads_or() does not exist before SM 2.0;
                // compile-only stub.
            return 0;
#else
            return __syncthreads_or(pred);
#endif
        }

        // Returns an int with bit (tid & 31) set for every thread whose
        // predicate is non-zero.
        // SM 2.0+: uses the hardware warp-vote intrinsic __ballot()
        // (legacy mask-less form, valid for the pre-Volta targets this
        // header serves).
        // Older devices: every thread publishes its bit into a shared
        // staging buffer, which is then folded by warp_reduce() (declared
        // in warp_reduce.hpp; exact semantics not visible here).
        // NOTE(review): only threadIdx.x is used, so this path assumes a
        // 1-D thread block of exactly CTA_SIZE threads -- confirm at call
        // sites.
        template<int CTA_SIZE>
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }

        // Software emulation of shared-memory atomics for devices below
        // compute capability 1.2; on 1.2+ the native ::atomic* intrinsics
        // are used instead.
        //
        // The emulated paths use the classic "thread tag" trick: the top
        // 5 bits of the 32-bit word hold the id of the last writer
        // (threadIdx.x << 27) and the low 27 bits (TAG_MASK) hold the
        // actual counter value.  Each thread keeps re-writing its tagged
        // value until its own write is the one that survives, which
        // serialises conflicting writers.
        // NOTE(review): 5 tag bits can only distinguish 32 threads, so
        // this is presumably safe only for intra-warp conflicts -- confirm
        // against the callers.
        struct smem
        {
            // Mask of the low-order value bits: (sizeof(unsigned int)<<3)
            // is 32 bits total, minus 5 tag bits => low 27 bits.
            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };

            // Emulated atomic increment; returns the previous value, like
            // the hardware intrinsic.
            // NOTE(review): the emulated path ignores `val` -- unlike the
            // hardware ::atomicInc(old >= val ? 0 : old + 1) it never
            // wraps the counter back to zero.  Confirm callers pass a
            // bound larger than any count they can reach.
            template<typename T>
            static __device__ __forceinline__ T atomicInc(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                // This thread's writer tag, stored in the top 5 bits.
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    // Read the value bits, build our tagged incremented
                    // word, store it, and retry until our write survives.
                    count = *address & TAG_MASK;
                    count = tag | (count + 1);
                    *address = count;
                } while (*address != count);

                // Strip the tag and undo the increment => old value.
                return (count & TAG_MASK) - 1;
#else
                return ::atomicInc(address, val);
#endif
            }

            // Emulated atomic add; returns the previous value, like the
            // hardware intrinsic.  Same tag-retry scheme as atomicInc.
            template<typename T>
            static __device__ __forceinline__ T atomicAdd(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                // This thread's writer tag, stored in the top 5 bits.
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    count = *address & TAG_MASK;
                    count = tag | (count + val);
                    *address = count;
                } while (*address != count);

                // Strip the tag and undo the addition => old value.
                return (count & TAG_MASK) - val;
#else
                return ::atomicMin(address, val) == 0 ? 0 : ::atomicAdd(address, val);
#endif
            }

            // Emulated atomic minimum.
            // NOTE(review): unlike the tagged operations above this path
            // has no writer tag; it keeps storing min(old, val) until no
            // smaller value reappears, and it returns the NEW minimum,
            // whereas the hardware ::atomicMin returns the OLD value --
            // confirm callers do not depend on old-value semantics.
            template<typename T>
            static __device__ __forceinline__ T atomicMin(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count = ::min(*address, val);
                do
                {
                    *address = count;
                } while (*address > count);

                return count;
#else
                return ::atomicMin(address, val);
#endif
            }
        };
    };
}}}
00138
00139 #endif