reduce_key_val.hpp
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
44 #define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
45 
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
49 
50 namespace cv { namespace gpu { namespace device
51 {
52  namespace reduce_key_val_detail
53  {
// GetType<D>: recovers the element type hidden behind a (possibly volatile)
// pointer or a reference, so a plain register of that type can be declared.
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
    typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
    typedef T type;
};
template <typename T> struct GetType<T&>
{
    typedef T type;
};
67 
// Compile-time iteration over tuple elements [I, N): each static member
// applies its per-element action to element I, then recurses with I + 1.
// The For<N, N> specialization below terminates the recursion.
template <unsigned int I, unsigned int N>
struct For
{
    // Write element I of `data` into its shared-memory array at slot `tid`.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(smem)[tid] = thrust::get<I>(data);

        For<I + 1, N>::loadToSmem(smem, data, tid);
    }
    // Read element I back from shared memory into `data`.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(data) = thrust::get<I>(smem)[tid];

        For<I + 1, N>::loadFromSmem(smem, data, tid);
    }

    // Replace element I with the value held by the lane `delta` lanes below.
    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
    {
        thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

        For<I + 1, N>::copyShfl(val, delta, width);
    }
    // Copy element I from slot tid+delta into both the register and slot tid.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
    {
        thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

        For<I + 1, N>::copy(svals, val, tid, delta);
    }

    // Shuffle-based merge: adopt the partner lane's key/value for element I
    // when the comparator prefers the partner's key.
    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
    {
        typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type candidate = shfl_down(thrust::get<I>(key), delta, width);

        if (thrust::get<I>(cmp)(candidate, thrust::get<I>(key)))
        {
            thrust::get<I>(key) = candidate;
            thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
        }

        For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
    }
    // Shared-memory merge: same policy, partner data comes from slot tid+delta.
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                 const ValPointerTuple& svals, const ValReferenceTuple& val,
                                 const CmpTuple& cmp,
                                 unsigned int tid, unsigned int delta)
    {
        typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type candidate = thrust::get<I>(skeys)[tid + delta];

        if (thrust::get<I>(cmp)(candidate, thrust::get<I>(key)))
        {
            thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = candidate;
            thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
        }

        For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
    }
};
// Recursion terminator (I == N): every operation is a deliberate no-op.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }

    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
    {
    }

    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
    {
    }
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                 const ValPointerTuple&, const ValReferenceTuple&,
                                 const CmpTuple&,
                                 unsigned int, unsigned int)
    {
    }
};
164 
166  // loadToSmem
167 
// Publish one thread's register value into its shared-memory slot.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
{
    smem[tid] = data;
}
// Read one thread's value back from its shared-memory slot into a register.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
{
    data = smem[tid];
}
// Tuple overload: store every element of `data` into the matching
// shared-memory array of `smem` at index `tid`.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                           const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
}
// Tuple overload: read every element back from shared memory into `data`.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
}
194 
196  // copyVals
197 
// Pull the value held by the lane `delta` lanes below via warp shuffle.
template <typename V>
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
{
    val = shfl_down(val, delta, width);
}
// Copy the value from slot tid+delta into both the register and slot tid.
template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
{
    svals[tid] = val = svals[tid + delta];
}
// Tuple overload: shuffle every value element down by `delta` lanes.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int delta,
                                             int width)
{
    For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
}
// Tuple overload: copy every value element from slot tid+delta into the
// register tuple and slot tid.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                         const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                         unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
}
223 
225  // merge
226 
// Warp-shuffle merge of one (key, value) pair: fetch the key held by the
// lane `delta` below and, if the comparator prefers it, adopt that lane's
// key and value.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
{
    K other = shfl_down(key, delta, width);

    if (cmp(other, key))
    {
        key = other;
        copyValsShfl(val, delta, width);
    }
}
// Shared-memory merge of one (key, value) pair: compare against the key in
// slot tid+delta and, if it wins, adopt it in both register and slot tid.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K other = skeys[tid + delta];

    if (cmp(other, key))
    {
        skeys[tid] = key = other;
        copyVals(svals, val, tid, delta);
    }
}
// Shuffle merge for a scalar key paired with a tuple of values.
template <typename K,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const Cmp& cmp,
                                          unsigned int delta, int width)
{
    K other = shfl_down(key, delta, width);

    if (cmp(other, key))
    {
        key = other;
        copyValsShfl(val, delta, width);
    }
}
// Shared-memory merge for a scalar key paired with a tuple of values.
template <typename K,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K other = skeys[tid + delta];

    if (cmp(other, key))
    {
        skeys[tid] = key = other;
        copyVals(svals, val, tid, delta);
    }
}
// Shuffle merge for tuple keys, tuple values, and per-key comparators:
// each key element is merged independently with its own comparator.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
}
// Shared-memory merge for tuple keys, tuple values, and per-key comparators.
// FIX: the forwarding call was missing (empty body made this a silent no-op);
// restored the element-wise dispatch through For<>.
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
          typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                      const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                      unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
306 
308  // Generic
309 
// Shared-memory tree reduction of (key, value) pairs for a block of N
// threads. On return, thread 0's key/val (and smem slot 0) hold the winning
// pair according to cmp. The smem arrays behind skeys/svals must hold at
// least N elements.
template <unsigned int N> struct Generic
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        loadToSmem(skeys, key, tid);
        // FIX: was `loadValsToSmem`, a name declared nowhere in this file
        // (would fail name lookup on instantiation); values use the same
        // loadToSmem overloads as keys.
        loadToSmem(svals, val, tid);
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(skeys, key, svals, val, cmp, tid, 1024);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(skeys, key, svals, val, cmp, tid, 512);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(skeys, key, svals, val, cmp, tid, 256);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(skeys, key, svals, val, cmp, tid, 128);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(skeys, key, svals, val, cmp, tid, 64);

            __syncthreads();
        }
        if (N >= 64)
        {
            if (tid < 32)
                merge(skeys, key, svals, val, cmp, tid, 32);
        }

        // Final intra-warp stage without barriers — presumably relies on the
        // volatile-qualified smem accesses (see the scalar merge overload)
        // and pre-Volta warp-synchronous execution; TODO confirm on Volta+.
        if (tid < 16)
        {
            merge(skeys, key, svals, val, cmp, tid, 16);
            merge(skeys, key, svals, val, cmp, tid, 8);
            merge(skeys, key, svals, val, cmp, tid, 4);
            merge(skeys, key, svals, val, cmp, tid, 2);
            merge(skeys, key, svals, val, cmp, tid, 1);
        }
    }
};
371 
// Compile-time unrolled halving loop: performs merges with offsets
// I, I/2, ..., 1. Terminated by the Unroll<0, ...> specialization.
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
struct Unroll
{
    // Shuffle-based variant (no shared memory, width N).
    static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
    {
        mergeShfl(key, val, cmp, I, N);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
    }
    // Shared-memory variant.
    static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        merge(skeys, key, svals, val, cmp, tid, I);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
    }
};
// Unroll terminator (offset reached 0): both loops are no-ops.
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
    static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
    {
    }
    static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
    {
    }
};
396 
// Reduction specialized for a single warp (N <= 32, power of two).
template <unsigned int N> struct WarpOptimized
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
#if 0 // __CUDA_ARCH__ >= 300
        // NOTE(review): this warp-shuffle path is disabled via `#if 0` in
        // this copy of the file — confirm whether that is intentional.
        (void) skeys;
        (void) svals;
        (void) tid;

        Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
#else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
    }
};
417 
// Reduction for 32 < N <= 1024 with N a multiple of 32: each warp reduces
// its own 32 pairs, warp leaders compact the M = N/32 partial winners to the
// front of shared memory, then the first warp reduces those M survivors.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };

    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        const unsigned int laneId = Warp::laneId();

#if 0 // __CUDA_ARCH__ >= 300
        // NOTE(review): shuffle path is disabled via `#if 0` in this copy.
        // FIX: restored the per-warp shuffle reduction line that had been
        // dropped; without it the leaders would publish unreduced values.
        Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
#else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        // Lower half-warp folds the upper half via shared memory.
        if (laneId < 16)
            Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

        __syncthreads();

        // Each warp's lane 0 publishes its winner to slot tid/32.
        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
#endif

        __syncthreads();

        loadFromSmem(skeys, key, tid);

        // First warp reduces the M per-warp winners.
        if (tid < 32)
        {
#if 0 // __CUDA_ARCH__ >= 300
            loadFromSmem(svals, val, tid);

            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
#else
            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
        }
    }
};
467 
// StaticIf<cond, T1, T2>::type — compile-time type selection
// (pre-C++11 equivalent of std::conditional).
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
    typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
    typedef T2 type;
};
477 
// IsPowerOf2<N>::value is 1 iff N is a non-zero power of two
// (a power of two has exactly one set bit, so N & (N - 1) clears it).
template <unsigned int N> struct IsPowerOf2
{
    enum { value = ((N != 0) && ((N & (N - 1)) == 0)) };
};
482 
// Selects the reduction implementation for block size N:
//  - WarpOptimized<N>      : N <= 32 and a power of two (single warp)
//  - GenericOptimized32<N> : N <= 1024 and a power of two (per-warp stages)
//  - Generic<N>            : everything else (full shared-memory tree)
// FIX: the nested StaticIf selection was truncated in this copy of the
// file; reconstructed from the typedef signature preserved by the
// documentation index (StaticIf<(N<=32)&&IsPowerOf2<N>::value, ...>).
template <unsigned int N> struct Dispatcher
{
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        typename StaticIf<
            (N <= 1024) && IsPowerOf2<N>::value,
            GenericOptimized32<N>,
            Generic<N>
        >::type
    >::type reductor;
};
495  }
496 }}}
497 
498 #endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:423
Definition: reduce_key_val.hpp:468
static __device__ void loadToSmem(const PointerTuple &smem, const ReferenceTuple &data, unsigned int tid)
Definition: reduce_key_val.hpp:72
int CvScalar value
Definition: core_c.h:340
CV_EXPORTS void copy(const Mat &src, Mat &dst, const Mat &mask=Mat(), bool invertMask=false)
CvFileNode const CvStringHashNode * key
Definition: core_c.h:1584
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:58
__device__ __forceinline__ void copyVals(volatile V *svals, V &val, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:204
GLenum GLsizei width
Definition: reduce_key_val.hpp:310
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width=warpSize)
Definition: warp_shuffle.hpp:81
GLsizei GLsizei GLenum GLenum const GLvoid * data
Definition: core_c.h:403
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
typedef void(CV_CDECL *CvMouseCallback)(int event
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:313
__device__ __forceinline__ void loadToSmem(volatile T *smem, T &data, unsigned int tid)
Definition: reduce_key_val.hpp:169
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:400
__device__ __forceinline__ void merge(volatile K *skeys, K &key, volatile V *svals, V &val, const Cmp &cmp, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:239
StaticIf< (N<=32)&&IsPowerOf2< N >::value, WarpOptimized< N >, typename StaticIf< (N<=1024)&&IsPowerOf2< N >::value, GenericOptimized32< N >, Generic< N > >::type >::type reductor
Definition: reduce_key_val.hpp:493
GLuint GLfloat * val
int width
Definition: highgui_c.h:130
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
Definition: reduce_key_val.hpp:389
Definition: reduce_key_val.hpp:483
CvSize int int int CvPoint int delta
Definition: core_c.h:1427
static __device__ void copy(const PointerTuple &svals, const ReferenceTuple &val, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:94
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
Definition: reduce_key_val.hpp:375
static __device__ void copyShfl(const ReferenceTuple &val, unsigned int delta, int width)
Definition: reduce_key_val.hpp:87
void * data
Definition: core_c.h:206
Definition: reduce_key_val.hpp:54
Definition: reduce_key_val.hpp:478
__device__ __forceinline__ void copyValsShfl(V &val, unsigned int delta, int width)
Definition: reduce_key_val.hpp:199
static __device__ void mergeShfl(const KeyReferenceTuple &, const ValReferenceTuple &, const CmpTuple &, unsigned int, int)
Definition: reduce_key_val.hpp:153
const CvArr const CvArr * V
Definition: core_c.h:733
static __device__ void copy(const PointerTuple &, const ReferenceTuple &, unsigned int, unsigned int)
Definition: reduce_key_val.hpp:148
GLsizei const GLfloat * value
Definition: core_c.h:341
GLuint GLuint GLsizei GLenum type
Definition: core_c.h:114
double double double double double double CvSize * warpSize
Definition: legacy.hpp:680
static __device__ void copyShfl(const ReferenceTuple &, unsigned int, int)
Definition: reduce_key_val.hpp:144
__device__ __forceinline__ void mergeShfl(K &key, V &val, const Cmp &cmp, unsigned int delta, int width)
Definition: reduce_key_val.hpp:228
Definition: reduce_key_val.hpp:373
static __device__ void mergeShfl(const KeyReferenceTuple &key, const ValReferenceTuple &val, const CmpTuple &cmp, unsigned int delta, int width)
Definition: reduce_key_val.hpp:102
__device__ __forceinline__ void loadFromSmem(volatile T *smem, T &data, unsigned int tid)
Definition: reduce_key_val.hpp:174
static __device__ void loadFromSmem(const PointerTuple &, const ReferenceTuple &, unsigned int)
Definition: reduce_key_val.hpp:139
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:380
false
Definition: color.hpp:230
static __device__ void loadToSmem(const PointerTuple &, const ReferenceTuple &, unsigned int)
Definition: reduce_key_val.hpp:135
true
Definition: color.hpp:221
static __device__ void loadFromSmem(const PointerTuple &smem, const ReferenceTuple &data, unsigned int tid)
Definition: reduce_key_val.hpp:79
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
Definition: reduce_key_val.hpp:392
Definition: reduce_key_val.hpp:69
static __device__ void merge(const KeyPointerTuple &skeys, const KeyReferenceTuple &key, const ValPointerTuple &svals, const ValReferenceTuple &val, const CmpTuple &cmp, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:115
static __device__ void merge(const KeyPointerTuple &, const KeyReferenceTuple &, const ValPointerTuple &, const ValReferenceTuple &, const CmpTuple &, unsigned int, unsigned int)
Definition: reduce_key_val.hpp:157