reduce.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace reduce_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

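        // GetType (above) strips the pointer/reference wrapper from a tuple
        // element type. For<I, N> (below) is a compile-time loop over the
        // elements of a thrust::tuple: each step handles element I and recurses
        // on I + 1 until the For<N, N> specialization terminates the recursion.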
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

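        // merge() folds a partner thread's partial value from shared memory into
        // val; mergeShfl() does the same entirely in registers via shfl_down().
        // The tuple overloads apply one operation per tuple element through For<0, N>.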
        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid,
                                              unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }

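        // Generic<N> is the fully general strategy: a shared-memory tree
        // reduction that halves the stride each round (1024 down to 32, with a
        // barrier after each round) and lets the first half-warp finish the last
        // five strides warp-synchronously, without barriers.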
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

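        // Unroll<I, ...> emits an unrolled sequence of merge steps with strides
        // I, I / 2, ..., 1; the Unroll<0, ...> specialization stops the recursion.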
        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };

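        // WarpOptimized<N> covers block sizes that fit in a single warp: on
        // compute capability 3.0+ it reduces entirely in registers with warp
        // shuffles, otherwise it falls back to shared memory.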
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                (void) smem;
                (void) tid;

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };

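        // GenericOptimized32<N>, for N a multiple of 32 up to 1024, first
        // reduces each warp independently, stores the M = N / 32 per-warp
        // results in shared memory, and then has the first warp reduce those
        // M values.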
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };

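        // Compile-time helpers: StaticIf<cond, T1, T2> selects a type from a
        // boolean constant, and IsPowerOf2<N> tests whether N is a power of two.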
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

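        // Dispatcher<N> picks the reduction strategy for a block of N threads:
        // WarpOptimized for power-of-two N <= 32, GenericOptimized32 for
        // power-of-two N <= 1024, and Generic for everything else.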
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
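
A minimal usage sketch follows; it is not part of the header. The kernel name sumBlocks, the Plus functor, and the 256-thread launch shape are illustrative assumptions, while Dispatcher and reduce_detail come from the listing above. In OpenCV itself the public cv::gpu::device::reduce<N>() wrapper in the parent directory's reduce.hpp is the usual entry point; it forwards to the Dispatcher with Reference bound explicitly to T&, which the sketch mirrors so the merged result is written back into val.

#include <opencv2/gpu/device/detail/reduce.hpp>

using namespace cv::gpu::device;

// Illustrative binary operation passed to reduce().
struct Plus
{
    __device__ __forceinline__ int operator ()(int a, int b) const { return a + b; }
};

// Hypothetical kernel: each 256-thread block writes the sum of its elements.
__global__ void sumBlocks(const int* src, int* dst, int n)
{
    __shared__ int smem[256];

    const unsigned int tid = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + tid;

    int val = (idx < n) ? src[idx] : 0;
    Plus op;

    // 256 is a power of two <= 1024, so Dispatcher selects GenericOptimized32<256>.
    // Reference is given explicitly as int& so the merges update val in place;
    // after the call, thread 0 holds the block total.
    reduce_detail::Dispatcher<256>::reductor::reduce<volatile int*, int&, const Plus&>(smem, val, tid, op);

    if (tid == 0)
        dst[blockIdx.x] = val;
}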