scan.hpp
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #ifndef __OPENCV_GPU_SCAN_HPP__
44 #define __OPENCV_GPU_SCAN_HPP__
45 
50 
51 namespace cv { namespace gpu { namespace device
52 {
53  enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
54 
55  template <ScanKind Kind, typename T, typename F> struct WarpScan
56  {
57  __device__ __forceinline__ WarpScan() {}
58  __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
59 
60  __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
61  {
62  const unsigned int lane = idx & 31;
63  F op;
64 
65  if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
66  if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
67  if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
68  if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
69  if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
70 
71  if( Kind == INCLUSIVE )
72  return ptr [idx];
73  else
74  return (lane > 0) ? ptr [idx - 1] : 0;
75  }
76 
77  __device__ __forceinline__ unsigned int index(const unsigned int tid)
78  {
79  return tid;
80  }
81 
82  __device__ __forceinline__ void init(volatile T *ptr){}
83 
84  static const int warp_offset = 0;
85 
87  };
88 
89  template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
90  {
91  __device__ __forceinline__ WarpScanNoComp() {}
92  __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
93 
94  __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
95  {
96  const unsigned int lane = threadIdx.x & 31;
97  F op;
98 
99  ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
100  ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
101  ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
102  ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
103  ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
104 
105  if( Kind == INCLUSIVE )
106  return ptr [idx];
107  else
108  return (lane > 0) ? ptr [idx - 1] : 0;
109  }
110 
111  __device__ __forceinline__ unsigned int index(const unsigned int tid)
112  {
113  return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
114  }
115 
116  __device__ __forceinline__ void init(volatile T *ptr)
117  {
118  ptr[threadIdx.x] = 0;
119  }
120 
121  static const int warp_smem_stride = 32 + 16 + 1;
122  static const int warp_offset = 16;
123  static const int warp_log = 5;
124  static const int warp_mask = 31;
125 
127  };
128 
129  template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
130  {
131  __device__ __forceinline__ BlockScan() {}
132  __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
133 
134  __device__ __forceinline__ T operator()(volatile T *ptr)
135  {
136  const unsigned int tid = threadIdx.x;
137  const unsigned int lane = tid & warp_mask;
138  const unsigned int warp = tid >> warp_log;
139 
140  Sc scan;
141  typename Sc::merge merge_scan;
142  const unsigned int idx = scan.index(tid);
143 
144  T val = scan(ptr, idx);
145  __syncthreads ();
146 
147  if( warp == 0)
148  scan.init(ptr);
149  __syncthreads ();
150 
151  if( lane == 31 )
152  ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
153  __syncthreads ();
154 
155  if( warp == 0 )
156  merge_scan(ptr, idx);
157  __syncthreads();
158 
159  if ( warp > 0)
160  val = ptr [scan.warp_offset + warp - 1] + val;
161  __syncthreads ();
162 
163  ptr[idx] = val;
164  __syncthreads ();
165 
166  return val ;
167  }
168 
169  static const int warp_log = 5;
170  static const int warp_mask = 31;
171  };
172 
173  template <typename T>
174  __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
175  {
176  #if __CUDA_ARCH__ >= 300
177  const unsigned int laneId = cv::gpu::device::Warp::laneId();
178 
179  // scan on shuffl functions
180  #pragma unroll
181  for (int i = 1; i <= (OPENCV_GPU_WARP_SIZE / 2); i *= 2)
182  {
183  const T n = cv::gpu::device::shfl_up(idata, i);
184  if (laneId >= i)
185  idata += n;
186  }
187 
188  return idata;
189  #else
190  unsigned int pos = 2 * tid - (tid & (OPENCV_GPU_WARP_SIZE - 1));
191  s_Data[pos] = 0;
192  pos += OPENCV_GPU_WARP_SIZE;
193  s_Data[pos] = idata;
194 
195  s_Data[pos] += s_Data[pos - 1];
196  s_Data[pos] += s_Data[pos - 2];
197  s_Data[pos] += s_Data[pos - 4];
198  s_Data[pos] += s_Data[pos - 8];
199  s_Data[pos] += s_Data[pos - 16];
200 
201  return s_Data[pos];
202  #endif
203  }
204 
205  template <typename T>
206  __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
207  {
208  return warpScanInclusive(idata, s_Data, tid) - idata;
209  }
210 
211  template <int tiNumScanThreads, typename T>
212  __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
213  {
214  if (tiNumScanThreads > OPENCV_GPU_WARP_SIZE)
215  {
216  //Bottom-level inclusive warp scan
217  T warpResult = warpScanInclusive(idata, s_Data, tid);
218 
219  //Save top elements of each warp for exclusive warp scan
220  //sync to wait for warp scans to complete (because s_Data is being overwritten)
221  __syncthreads();
222  if ((tid & (OPENCV_GPU_WARP_SIZE - 1)) == (OPENCV_GPU_WARP_SIZE - 1))
223  {
224  s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE] = warpResult;
225  }
226 
227  //wait for warp scans to complete
228  __syncthreads();
229 
230  if (tid < (tiNumScanThreads / OPENCV_GPU_WARP_SIZE) )
231  {
232  //grab top warp elements
233  T val = s_Data[tid];
234  //calculate exclusive scan and write back to shared memory
235  s_Data[tid] = warpScanExclusive(val, s_Data, tid);
236  }
237 
238  //return updated warp scans with exclusive scan results
239  __syncthreads();
240 
241  return warpResult + s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE];
242  }
243  else
244  {
245  return warpScanInclusive(idata, s_Data, tid);
246  }
247  }
248 }}}
249 
250 #endif // __OPENCV_GPU_SCAN_HPP__
__device__ T blockScanInclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:212
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
makes multi-channel array out of several single-channel arrays
const int * idx
Definition: core_c.h:323
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:58
static const int warp_offset
Definition: scan.hpp:122
Definition: scan.hpp:55
__device__ __forceinline__ BlockScan()
Definition: scan.hpp:131
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
typedef void(CV_CDECL *CvMouseCallback)(int event
__device__ T warpScanInclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:174
CvMemStoragePos * pos
Definition: core_c.h:933
GLuint GLfloat * val
static const int warp_mask
Definition: scan.hpp:124
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width=warpSize)
Definition: warp_shuffle.hpp:113
static const int warp_offset
Definition: scan.hpp:84
static const int warp_log
Definition: scan.hpp:123
__device__ __forceinline__ WarpScan(const WarpScan &other)
Definition: scan.hpp:58
__device__ __forceinline__ unsigned int index(const unsigned int tid)
Definition: scan.hpp:77
__device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp &other)
Definition: scan.hpp:92
Definition: scan.hpp:53
GLenum GLsizei n
Definition: scan.hpp:89
static const int warp_smem_stride
Definition: scan.hpp:121
__device__ __forceinline__ void init(volatile T *ptr)
Definition: scan.hpp:116
__device__ __forceinline__ T warpScanExclusive(T idata, volatile T *s_Data, unsigned int tid)
Definition: scan.hpp:206
__device__ __forceinline__ WarpScan()
Definition: scan.hpp:57
const char * ptr
Definition: core_c.h:942
__device__ __forceinline__ BlockScan(const BlockScan &other)
Definition: scan.hpp:132
__device__ __forceinline__ T operator()(volatile T *ptr)
Definition: scan.hpp:134
const CvMat const CvMat * F
Definition: calib3d.hpp:297
int n
Definition: legacy.hpp:3070
WarpScan< INCLUSIVE, T, F > merge
Definition: scan.hpp:86
static const int warp_log
Definition: scan.hpp:169
__device__ __forceinline__ T operator()(volatile T *ptr, const unsigned int idx)
Definition: scan.hpp:60
Definition: scan.hpp:53
ScanKind
Definition: scan.hpp:53
static const int warp_mask
Definition: scan.hpp:170
__device__ __forceinline__ void init(volatile T *ptr)
Definition: scan.hpp:82
WarpScanNoComp< INCLUSIVE, T, F > merge
Definition: scan.hpp:126
__device__ __forceinline__ unsigned int index(const unsigned int tid)
Definition: scan.hpp:111
Definition: scan.hpp:129
__device__ __forceinline__ T operator()(volatile T *ptr, const unsigned int idx)
Definition: scan.hpp:94
__device__ __forceinline__ WarpScanNoComp()
Definition: scan.hpp:91