43 #ifndef __OPENCV_GPU_SCAN_HPP__
44 #define __OPENCV_GPU_SCAN_HPP__
51 namespace cv {
namespace gpu {
namespace device
55 template <ScanKind Kind,
typename T,
typename F>
struct WarpScan
62 const unsigned int lane = idx & 31;
65 if ( lane >= 1) ptr [
idx ] = op(ptr [idx - 1], ptr [idx]);
66 if ( lane >= 2) ptr [
idx ] = op(ptr [idx - 2], ptr [idx]);
67 if ( lane >= 4) ptr [
idx ] = op(ptr [idx - 4], ptr [idx]);
68 if ( lane >= 8) ptr [
idx ] = op(ptr [idx - 8], ptr [idx]);
69 if ( lane >= 16) ptr [
idx ] = op(ptr [idx - 16], ptr [idx]);
74 return (lane > 0) ? ptr [idx - 1] : 0;
77 __device__ __forceinline__
unsigned int index(
const unsigned int tid)
82 __device__ __forceinline__
void init(
volatile T *
ptr){}
96 const unsigned int lane = threadIdx.x & 31;
99 ptr [
idx ] = op(ptr [idx - 1], ptr [idx]);
100 ptr [
idx ] = op(ptr [idx - 2], ptr [idx]);
101 ptr [
idx ] = op(ptr [idx - 4], ptr [idx]);
102 ptr [
idx ] = op(ptr [idx - 8], ptr [idx]);
103 ptr [
idx ] = op(ptr [idx - 16], ptr [idx]);
108 return (lane > 0) ? ptr [idx - 1] : 0;
111 __device__ __forceinline__
unsigned int index(
const unsigned int tid)
116 __device__ __forceinline__
void init(
volatile T *
ptr)
118 ptr[threadIdx.x] = 0;
129 template <ScanKind Kind ,
typename T,
typename Sc,
typename F>
struct BlockScan
136 const unsigned int tid = threadIdx.x;
137 const unsigned int lane = tid &
warp_mask;
138 const unsigned int warp = tid >>
warp_log;
142 const unsigned int idx = scan.index(tid);
144 T val = scan(ptr, idx);
152 ptr [scan.warp_offset + warp ] = (Kind ==
INCLUSIVE) ? val : ptr [idx];
156 merge_scan(ptr, idx);
160 val = ptr [scan.warp_offset + warp - 1] + val;
173 template <
typename T>
176 #if __CUDA_ARCH__ >= 300
181 for (
int i = 1; i <= (OPENCV_GPU_WARP_SIZE / 2); i *= 2)
190 unsigned int pos = 2 * tid - (tid & (OPENCV_GPU_WARP_SIZE - 1));
192 pos += OPENCV_GPU_WARP_SIZE;
195 s_Data[
pos] += s_Data[pos - 1];
196 s_Data[
pos] += s_Data[pos - 2];
197 s_Data[
pos] += s_Data[pos - 4];
198 s_Data[
pos] += s_Data[pos - 8];
199 s_Data[
pos] += s_Data[pos - 16];
205 template <
typename T>
211 template <
int tiNumScanThreads,
typename T>
214 if (tiNumScanThreads > OPENCV_GPU_WARP_SIZE)
222 if ((tid & (OPENCV_GPU_WARP_SIZE - 1)) == (OPENCV_GPU_WARP_SIZE - 1))
224 s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE] = warpResult;
230 if (tid < (tiNumScanThreads / OPENCV_GPU_WARP_SIZE) )
241 return warpResult + s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE];
250 #endif // __OPENCV_GPU_SCAN_HPP__
__device__ T blockScanInclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:212
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
makes multi-channel array out of several single-channel arrays
const int * idx
Definition: core_c.h:323
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:58
static const int warp_offset
Definition: scan.hpp:122
__device__ __forceinline__ BlockScan()
Definition: scan.hpp:131
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
typedef void(CV_CDECL *CvMouseCallback)(int event
__device__ T warpScanInclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:174
CvMemStoragePos * pos
Definition: core_c.h:933
static const int warp_mask
Definition: scan.hpp:124
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width=warpSize)
Definition: warp_shuffle.hpp:113
static const int warp_offset
Definition: scan.hpp:84
static const int warp_log
Definition: scan.hpp:123
__device__ __forceinline__ WarpScan(const WarpScan &other)
Definition: scan.hpp:58
__device__ __forceinline__ unsigned int index(const unsigned int tid)
Definition: scan.hpp:77
__device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp &other)
Definition: scan.hpp:92
static const int warp_smem_stride
Definition: scan.hpp:121
__device__ __forceinline__ void init(volatile T *ptr)
Definition: scan.hpp:116
__device__ __forceinline__ T warpScanExclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:206
__device__ __forceinline__ WarpScan()
Definition: scan.hpp:57
const char * ptr
Definition: core_c.h:942
__device__ __forceinline__ BlockScan(const BlockScan &other)
Definition: scan.hpp:132
__device__ __forceinline__ T operator()(volatile T *ptr)
Definition: scan.hpp:134
const CvMat const CvMat * F
Definition: calib3d.hpp:297
int n
Definition: legacy.hpp:3070
WarpScan< INCLUSIVE, T, F > merge
Definition: scan.hpp:86
static const int warp_log
Definition: scan.hpp:169
__device__ __forceinline__ T operator()(volatile T *ptr, const unsigned int idx)
Definition: scan.hpp:60
ScanKind
Definition: scan.hpp:53
static const int warp_mask
Definition: scan.hpp:170
__device__ __forceinline__ void init(volatile T *ptr)
Definition: scan.hpp:82
WarpScanNoComp< INCLUSIVE, T, F > merge
Definition: scan.hpp:126
__device__ __forceinline__ unsigned int index(const unsigned int tid)
Definition: scan.hpp:111
__device__ __forceinline__ T operator()(volatile T *ptr, const unsigned int idx)
Definition: scan.hpp:94
__device__ __forceinline__ WarpScanNoComp()
Definition: scan.hpp:91