Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #ifndef __OPENCV_GPU_SCAN_HPP__
00044 #define __OPENCV_GPU_SCAN_HPP__
00045
00046 #include "common.hpp"
00047
00048 namespace cv { namespace gpu { namespace device
00049 {
/**
 * Selects which prefix a scan functor returns for the calling position.
 */
enum ScanKind
{
    EXCLUSIVE = 0,  // result combines only the elements before the current one
    INCLUSIVE = 1   // result also includes the current element
};
00051
/**
 * Warp-level prefix scan performed in place on a volatile buffer.
 *
 * Each lane combines its own element with the elements 1, 2, 4, 8 and 16
 * positions before it (only when that many lanes precede it in the warp),
 * using the binary functor F. After the call ptr[idx] holds the inclusive
 * scan value of the calling lane, regardless of Kind.
 *
 * Template parameters:
 *   Kind - INCLUSIVE returns the lane's own scanned value; EXCLUSIVE returns
 *          the scanned value of the previous lane, or 0 for lane 0.
 *   T    - element type; must be copyable from a volatile T.
 *   F    - default-constructible binary functor, called as op(earlier, later).
 *
 * Preconditions:
 *   - The lane is derived from idx (idx & 31), so each warp's 32 elements
 *     are expected to start at a multiple of 32 in ptr.
 *   - All lanes of the warp must call operator() together: the lanes exchange
 *     data through ptr and, on CUDA 9 and newer, meet at __syncwarp().
 */
template <ScanKind Kind, typename T, typename F> struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    /**
     * Scans the warp's elements in place.
     *
     * ptr - buffer holding the values to scan (presumably shared memory -
     *       TODO confirm against callers).
     * idx - position of the calling lane's element inside ptr.
     *
     * Returns the inclusive or exclusive scan value for the calling lane,
     * depending on Kind.
     */
    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        // Running inclusive value of this lane; mirrors what is stored in ptr[idx].
        T partial = ptr[idx];

        #pragma unroll
        for (unsigned int offset = 1; offset < 32; offset <<= 1)
        {
            // Phase 1: every lane reads its neighbour's value from the previous step.
            if (lane >= offset)
                partial = op(ptr[idx - offset], partial);
            warpBarrier(); // all reads must finish before any lane overwrites its slot

            // Phase 2: publish the new partial result for the next step.
            if (lane >= offset)
                ptr[idx] = partial;
            warpBarrier(); // writes must be visible before the next step's reads
        }

        if( Kind == INCLUSIVE )
            return partial;
        else
            return (lane > 0) ? ptr [idx - 1] : 0;
    }

    /** Maps a thread id to its element position in the buffer (identity mapping). */
    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    /** Buffer preparation hook; this layout has nothing to prepare. */
    __device__ __forceinline__ void init(volatile T *ptr) { (void)ptr; }

    // Position in the buffer at which per-warp totals start.
    static const int warp_offset = 0;

    // Scan type used to combine per-warp totals (always inclusive).
    typedef WarpScan<INCLUSIVE, T, F> merge;

private:
    /**
     * Barrier with memory ordering among the lanes of the calling warp.
     * Lanes of a warp are not guaranteed to run in lockstep on Volta and
     * newer GPUs, so the read and write phases of each scan step need an
     * explicit barrier. Toolkits older than CUDA 9 lack __syncwarp(); there
     * the code keeps relying on volatile accesses and lockstep execution.
     */
    static __device__ __forceinline__ void warpBarrier()
    {
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)) || (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000))
        __syncwarp();
#endif
    }
};
00085
/**
 * Warp-level prefix scan without per-step lane comparisons.
 *
 * Unlike a scan that checks the lane before every step, this variant reads
 * the elements 1, 2, 4, 8 and 16 positions before idx unconditionally. The
 * buffer layout produced by index() therefore reserves warp_offset (16)
 * extra elements in front of every warp's 32 values, with a stride of
 * warp_smem_stride elements per warp. Callers are expected to fill that
 * padding with a value that is neutral for F (e.g. zero for addition)
 * before scanning; init() only clears the first blockDim-independent
 * 32 elements written by the calling threads.
 *
 * Template parameters:
 *   Kind - INCLUSIVE returns the lane's own scanned value; EXCLUSIVE returns
 *          the scanned value stored just before it, or 0 for lane 0.
 *   T    - element type; must be copyable from a volatile T.
 *   F    - default-constructible binary functor, called as op(earlier, later).
 *
 * All lanes of the warp must call operator() together: the lanes exchange
 * data through ptr and, on CUDA 9 and newer, meet at __syncwarp().
 */
template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    /**
     * Scans the warp's elements in place.
     *
     * ptr - padded buffer holding the values to scan (presumably shared
     *       memory - TODO confirm against callers).
     * idx - position of the calling lane's element, normally index(tid).
     *
     * Returns the inclusive or exclusive scan value for the calling lane,
     * depending on Kind. ptr[idx] holds the inclusive value afterwards.
     */
    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
    {
        // idx is a padded position, so the lane comes from the thread id instead.
        const unsigned int lane = threadIdx.x & 31;
        F op;

        // Running inclusive value of this lane; mirrors what is stored in ptr[idx].
        T partial = ptr[idx];

        #pragma unroll
        for (unsigned int offset = 1; offset <= 16; offset <<= 1)
        {
            // Phase 1: unconditional read; low lanes read from the padding.
            partial = op(ptr[idx - offset], partial);
            warpBarrier(); // all reads must finish before any lane overwrites its slot

            // Phase 2: publish the new partial result for the next step.
            ptr[idx] = partial;
            warpBarrier(); // writes must be visible before the next step's reads
        }

        if( Kind == INCLUSIVE )
            return partial;
        else
            return (lane > 0) ? ptr [idx - 1] : 0;
    }

    /**
     * Maps a thread id to its element position in the padded buffer:
     * warp number * warp_smem_stride, plus the warp_offset padding, plus the lane.
     */
    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + warp_offset + (tid & warp_mask);
    }

    /** Zeroes the buffer element whose position equals the calling thread's threadIdx.x. */
    __device__ __forceinline__ void init(volatile T *ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1; // elements reserved per warp
    static const int warp_offset = 16;               // padding in front of each warp's data
    static const int warp_log = 5;                   // log2 of the warp size
    static const int warp_mask = 31;                 // warp size - 1, extracts the lane

    // Scan type used to combine per-warp totals (always inclusive).
    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;

private:
    /**
     * Barrier with memory ordering among the lanes of the calling warp.
     * Lanes of a warp are not guaranteed to run in lockstep on Volta and
     * newer GPUs, so the read and write phases of each scan step need an
     * explicit barrier. Toolkits older than CUDA 9 lack __syncwarp(); there
     * the code keeps relying on volatile accesses and lockstep execution.
     */
    static __device__ __forceinline__ void warpBarrier()
    {
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)) || (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000))
        __syncwarp();
#endif
    }
};
00125
/**
 * Block-level prefix scan built from a warp-level scan.
 *
 * Steps performed by operator():
 *   1. every warp scans its own elements with Sc;
 *   2. lane 31 of each warp stores the warp's total at
 *      ptr[Sc::warp_offset + warp];
 *   3. warp 0 runs an inclusive scan (Sc::merge) over those totals;
 *   4. every warp except the first combines the scanned total of the
 *      preceding warps into its own values;
 *   5. the final value is written back to ptr[idx] and returned.
 *
 * Template parameters:
 *   Kind - INCLUSIVE or EXCLUSIVE result for each thread.
 *   T    - element type.
 *   Sc   - warp scan type; must provide operator()(ptr, idx), index(tid),
 *          init(ptr), a static warp_offset and a nested merge type.
 *   F    - default-constructible binary functor, called as
 *          op(total_of_previous_warps, own_value).
 *
 * Preconditions:
 *   - Every thread of the block must call operator(), because it contains
 *     __syncthreads() barriers.
 *   - Threads are identified by threadIdx.x only.
 *   - ptr must already hold the input values at the positions given by
 *     Sc::index(threadIdx.x).
 */
template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    /**
     * Scans the block's elements in place.
     *
     * ptr - buffer shared by all threads of the block that holds the values
     *       to scan (presumably shared memory - TODO confirm against callers).
     *
     * Returns the scan value of the calling thread; the same value is left
     * in ptr[Sc::index(threadIdx.x)].
     */
    __device__ __forceinline__ T operator()(volatile T *ptr)
    {
        const unsigned int tid = threadIdx.x;
        const unsigned int lane = tid & warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        F op;
        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);

        // The warp total is the inclusive value of the last lane. For an
        // exclusive scan it only exists in ptr[idx], so capture it now:
        // once other warps start storing their totals, a slot written by
        // another warp may coincide with this one (e.g. with 32 warps).
        T warp_total = val;
        if (Kind != INCLUSIVE && lane == 31)
            warp_total = ptr[idx];
        __syncthreads ();

        if( warp == 0)
            scan.init(ptr);
        __syncthreads ();

        if( lane == 31 )
            ptr [Sc::warp_offset + warp ] = warp_total;
        __syncthreads ();

        // Inclusive scan over the per-warp totals, done by the first warp only.
        if( warp == 0 )
            merge_scan(ptr, idx);
        __syncthreads();

        // Fold in everything scanned by the preceding warps, using the same
        // functor as the warp-level scan instead of a hard-coded '+'.
        if ( warp > 0)
            val = op(ptr [Sc::warp_offset + warp - 1], val);
        __syncthreads ();

        ptr[idx] = val;
        __syncthreads ();

        return val ;
    }

    static const int warp_log = 5;   // log2 of the warp size
    static const int warp_mask = 31; // warp size - 1, extracts the lane
};
00169 }}}
00170
00171 #endif // __OPENCV_GPU_SCAN_HPP__