43 #ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
44 #define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
50 namespace cv {
namespace gpu {
namespace device
52 namespace reduce_key_val_detail
59 template <
typename T>
struct GetType<volatile
T*>
68 template <
unsigned int I,
unsigned int N>
71 template <
class Po
interTuple,
class ReferenceTuple>
72 static __device__
void loadToSmem(
const PointerTuple& smem,
const ReferenceTuple&
data,
unsigned int tid)
74 thrust::get<I>(smem)[tid] = thrust::get<I>(data);
78 template <
class Po
interTuple,
class ReferenceTuple>
79 static __device__
void loadFromSmem(
const PointerTuple& smem,
const ReferenceTuple&
data,
unsigned int tid)
81 thrust::get<I>(
data) = thrust::get<I>(smem)[tid];
86 template <
class ReferenceTuple>
93 template <
class Po
interTuple,
class ReferenceTuple>
94 static __device__
void copy(
const PointerTuple& svals,
const ReferenceTuple&
val,
unsigned int tid,
unsigned int delta)
96 thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
101 template <
class KeyReferenceTuple,
class ValReferenceTuple,
class CmpTuple>
102 static __device__
void mergeShfl(
const KeyReferenceTuple&
key,
const ValReferenceTuple&
val,
const CmpTuple& cmp,
unsigned int delta,
int width)
106 if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
108 thrust::get<I>(
key) = reg;
114 template <
class KeyPo
interTuple,
class KeyReferenceTuple,
class ValPo
interTuple,
class ValReferenceTuple,
class CmpTuple>
115 static __device__
void merge(
const KeyPointerTuple& skeys,
const KeyReferenceTuple&
key,
116 const ValPointerTuple& svals,
const ValReferenceTuple&
val,
118 unsigned int tid,
unsigned int delta)
122 if (thrust::get<I>(cmp)(reg, thrust::get<I>(
key)))
124 thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
125 thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
131 template <
unsigned int N>
134 template <
class Po
interTuple,
class ReferenceTuple>
135 static __device__
void loadToSmem(
const PointerTuple&,
const ReferenceTuple&,
unsigned int)
138 template <
class Po
interTuple,
class ReferenceTuple>
139 static __device__
void loadFromSmem(
const PointerTuple&,
const ReferenceTuple&,
unsigned int)
143 template <
class ReferenceTuple>
144 static __device__
void copyShfl(
const ReferenceTuple&,
unsigned int,
int)
147 template <
class Po
interTuple,
class ReferenceTuple>
148 static __device__
void copy(
const PointerTuple&,
const ReferenceTuple&,
unsigned int,
unsigned int)
152 template <
class KeyReferenceTuple,
class ValReferenceTuple,
class CmpTuple>
153 static __device__
void mergeShfl(
const KeyReferenceTuple&,
const ValReferenceTuple&,
const CmpTuple&,
unsigned int,
int)
156 template <
class KeyPo
interTuple,
class KeyReferenceTuple,
class ValPo
interTuple,
class ValReferenceTuple,
class CmpTuple>
157 static __device__
void merge(
const KeyPointerTuple&,
const KeyReferenceTuple&,
158 const ValPointerTuple&,
const ValReferenceTuple&,
160 unsigned int,
unsigned int)
168 template <
typename T>
173 template <
typename T>
178 template <
typename VP0,
typename VP1,
typename VP2,
typename VP3,
typename VP4,
typename VP5,
typename VP6,
typename VP7,
typename VP8,
typename VP9,
179 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9>
180 __device__ __forceinline__
void loadToSmem(
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
181 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
data,
184 For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >
::value>
::loadToSmem(smem, data, tid);
186 template <
typename VP0,
typename VP1,
typename VP2,
typename VP3,
typename VP4,
typename VP5,
typename VP6,
typename VP7,
typename VP8,
typename VP9,
187 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9>
188 __device__ __forceinline__
void loadFromSmem(
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
189 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
data,
192 For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >
::value>
::loadFromSmem(smem, data, tid);
198 template <
typename V>
203 template <
typename V>
204 __device__ __forceinline__
void copyVals(
volatile V* svals,
V&
val,
unsigned int tid,
unsigned int delta)
206 svals[tid] = val = svals[tid +
delta];
208 template <
typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9>
209 __device__ __forceinline__
void copyValsShfl(
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
213 For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >
::value>::copyShfl(val, delta, width);
215 template <
typename VP0,
typename VP1,
typename VP2,
typename VP3,
typename VP4,
typename VP5,
typename VP6,
typename VP7,
typename VP8,
typename VP9,
216 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9>
217 __device__ __forceinline__
void copyVals(
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
218 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
219 unsigned int tid,
unsigned int delta)
221 For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >
::value>
::copy(svals, val, tid, delta);
227 template <
typename K,
typename V,
class Cmp>
238 template <
typename K,
typename V,
class Cmp>
239 __device__ __forceinline__
void merge(
volatile K* skeys, K&
key,
volatile V* svals,
V&
val,
const Cmp& cmp,
unsigned int tid,
unsigned int delta)
241 K reg = skeys[tid +
delta];
245 skeys[tid] = key = reg;
249 template <
typename K,
250 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9,
253 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
265 template <
typename K,
266 typename VP0,
typename VP1,
typename VP2,
typename VP3,
typename VP4,
typename VP5,
typename VP6,
typename VP7,
typename VP8,
typename VP9,
267 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9,
269 __device__ __forceinline__
void merge(
volatile K* skeys, K&
key,
270 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
271 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
272 const Cmp& cmp,
unsigned int tid,
unsigned int delta)
274 K reg = skeys[tid +
delta];
278 skeys[tid] = key = reg;
282 template <
typename KR0,
typename KR1,
typename KR2,
typename KR3,
typename KR4,
typename KR5,
typename KR6,
typename KR7,
typename KR8,
typename KR9,
283 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9,
284 class Cmp0,
class Cmp1,
class Cmp2,
class Cmp3,
class Cmp4,
class Cmp5,
class Cmp6,
class Cmp7,
class Cmp8,
class Cmp9>
285 __device__ __forceinline__
void mergeShfl(
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&
key,
286 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
287 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
290 For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >
::value>
::mergeShfl(key, val, cmp, delta, width);
292 template <
typename KP0,
typename KP1,
typename KP2,
typename KP3,
typename KP4,
typename KP5,
typename KP6,
typename KP7,
typename KP8,
typename KP9,
293 typename KR0,
typename KR1,
typename KR2,
typename KR3,
typename KR4,
typename KR5,
typename KR6,
typename KR7,
typename KR8,
typename KR9,
294 typename VP0,
typename VP1,
typename VP2,
typename VP3,
typename VP4,
typename VP5,
typename VP6,
typename VP7,
typename VP8,
typename VP9,
295 typename VR0,
typename VR1,
typename VR2,
typename VR3,
typename VR4,
typename VR5,
typename VR6,
typename VR7,
typename VR8,
typename VR9,
296 class Cmp0,
class Cmp1,
class Cmp2,
class Cmp3,
class Cmp4,
class Cmp5,
class Cmp6,
class Cmp7,
class Cmp8,
class Cmp9>
297 __device__ __forceinline__
void merge(
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
298 const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&
key,
299 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
300 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&
val,
301 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
302 unsigned int tid,
unsigned int delta)
304 For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >
::value>
::merge(skeys, key, svals, val, cmp, tid, delta);
312 template <
class KP,
class KR,
class VP,
class VR,
class Cmp>
313 static __device__
void reduce(KP skeys, KR
key, VP svals, VR
val,
unsigned int tid, Cmp cmp)
316 loadValsToSmem(svals, val, tid);
323 merge(skeys, key, svals, val, cmp, tid, 1024);
330 merge(skeys, key, svals, val, cmp, tid, 512);
337 merge(skeys, key, svals, val, cmp, tid, 256);
344 merge(skeys, key, svals, val, cmp, tid, 128);
351 merge(skeys, key, svals, val, cmp, tid, 64);
358 merge(skeys, key, svals, val, cmp, tid, 32);
363 merge(skeys, key, svals, val, cmp, tid, 16);
364 merge(skeys, key, svals, val, cmp, tid, 8);
365 merge(skeys, key, svals, val, cmp, tid, 4);
366 merge(skeys, key, svals, val, cmp, tid, 2);
367 merge(skeys, key, svals, val, cmp, tid, 1);
372 template <
unsigned int I,
class KP,
class KR,
class VP,
class VR,
class Cmp>
380 static __device__
void loop(KP skeys, KR
key, VP svals, VR
val,
unsigned int tid, Cmp cmp)
382 merge(skeys, key, svals, val, cmp, tid, I);
383 Unroll<I / 2, KP, KR, VP, VR, Cmp>
::loop(skeys, key, svals, val, tid, cmp);
386 template <
class KP,
class KR,
class VP,
class VR,
class Cmp>
389 static __device__
void loopShfl(KR, VR, Cmp,
unsigned int)
392 static __device__
void loop(KP, KR, VP, VR,
unsigned int, Cmp)
399 template <
class KP,
class KR,
class VP,
class VR,
class Cmp>
400 static __device__
void reduce(KP skeys, KR
key, VP svals, VR
val,
unsigned int tid, Cmp cmp)
402 #if 0 // __CUDA_ARCH__ >= 300
407 Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
413 Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
422 template <
class KP,
class KR,
class VP,
class VR,
class Cmp>
423 static __device__
void reduce(KP skeys, KR
key, VP svals, VR
val,
unsigned int tid, Cmp cmp)
427 #if 0 // __CUDA_ARCH__ >= 300
440 Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
457 #if 0 // __CUDA_ARCH__ >= 300
460 Unroll<
M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp,
M);
462 Unroll<
M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
468 template <
bool val,
class T1,
class T2>
struct StaticIf;
480 enum {
value = ((N != 0) && !(N & (N - 1))) };
498 #endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:423
Definition: reduce_key_val.hpp:468
T2 type
Definition: reduce_key_val.hpp:475
static __device__ void loadToSmem(const PointerTuple &smem, const ReferenceTuple &data, unsigned int tid)
Definition: reduce_key_val.hpp:72
int CvScalar value
Definition: core_c.h:340
CV_EXPORTS void copy(const Mat &src, Mat &dst, const Mat &mask=Mat(), bool invertMask=false)
T1 type
Definition: reduce_key_val.hpp:471
CvFileNode const CvStringHashNode * key
Definition: core_c.h:1584
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:58
__device__ __forceinline__ void copyVals(volatile V *svals, V &val, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:204
Definition: reduce_key_val.hpp:310
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width=warpSize)
Definition: warp_shuffle.hpp:81
GLsizei GLsizei GLenum GLenum const GLvoid * data
Definition: core_c.h:403
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
Definition: reduce_key_val.hpp:397
typedef void(CV_CDECL *CvMouseCallback)(int event
T type
Definition: reduce_key_val.hpp:61
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:313
__device__ __forceinline__ void loadToSmem(volatile T *smem, T &data, unsigned int tid)
Definition: reduce_key_val.hpp:169
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:400
__device__ __forceinline__ void merge(volatile K *skeys, K &key, volatile V *svals, V &val, const Cmp &cmp, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:239
StaticIf< (N<=32)&&IsPowerOf2< N >::value, WarpOptimized< N >, typename StaticIf< (N<=1024)&&IsPowerOf2< N >::value, GenericOptimized32< N >, Generic< N > >::type >::type reductor
Definition: reduce_key_val.hpp:493
int width
Definition: highgui_c.h:130
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
Definition: reduce_key_val.hpp:389
Definition: reduce_key_val.hpp:483
T type
Definition: reduce_key_val.hpp:57
CvSize int int int CvPoint int delta
Definition: core_c.h:1427
static __device__ void copy(const PointerTuple &svals, const ReferenceTuple &val, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:94
Definition: reduce_key_val.hpp:418
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
Definition: reduce_key_val.hpp:375
static __device__ void copyShfl(const ReferenceTuple &val, unsigned int delta, int width)
Definition: reduce_key_val.hpp:87
void * data
Definition: core_c.h:206
Definition: reduce_key_val.hpp:54
Definition: reduce_key_val.hpp:478
__device__ __forceinline__ void copyValsShfl(V &val, unsigned int delta, int width)
Definition: reduce_key_val.hpp:199
static __device__ void mergeShfl(const KeyReferenceTuple &, const ValReferenceTuple &, const CmpTuple &, unsigned int, int)
Definition: reduce_key_val.hpp:153
const CvArr const CvArr * V
Definition: core_c.h:733
static __device__ void copy(const PointerTuple &, const ReferenceTuple &, unsigned int, unsigned int)
Definition: reduce_key_val.hpp:148
GLsizei const GLfloat * value
Definition: core_c.h:341
GLuint GLuint GLsizei GLenum type
Definition: core_c.h:114
double double double double double double CvSize * warpSize
Definition: legacy.hpp:680
static __device__ void copyShfl(const ReferenceTuple &, unsigned int, int)
Definition: reduce_key_val.hpp:144
__device__ __forceinline__ void mergeShfl(K &key, V &val, const Cmp &cmp, unsigned int delta, int width)
Definition: reduce_key_val.hpp:228
Definition: reduce_key_val.hpp:373
static __device__ void mergeShfl(const KeyReferenceTuple &key, const ValReferenceTuple &val, const CmpTuple &cmp, unsigned int delta, int width)
Definition: reduce_key_val.hpp:102
__device__ __forceinline__ void loadFromSmem(volatile T *smem, T &data, unsigned int tid)
Definition: reduce_key_val.hpp:174
static __device__ void loadFromSmem(const PointerTuple &, const ReferenceTuple &, unsigned int)
Definition: reduce_key_val.hpp:139
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
Definition: reduce_key_val.hpp:380
T type
Definition: reduce_key_val.hpp:65
false
Definition: color.hpp:230
static __device__ void loadToSmem(const PointerTuple &, const ReferenceTuple &, unsigned int)
Definition: reduce_key_val.hpp:135
true
Definition: color.hpp:221
static __device__ void loadFromSmem(const PointerTuple &smem, const ReferenceTuple &data, unsigned int tid)
Definition: reduce_key_val.hpp:79
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
Definition: reduce_key_val.hpp:392
Definition: reduce_key_val.hpp:420
Definition: reduce_key_val.hpp:69
static __device__ void merge(const KeyPointerTuple &skeys, const KeyReferenceTuple &key, const ValPointerTuple &svals, const ValReferenceTuple &val, const CmpTuple &cmp, unsigned int tid, unsigned int delta)
Definition: reduce_key_val.hpp:115
static __device__ void merge(const KeyPointerTuple &, const KeyReferenceTuple &, const ValPointerTuple &, const ValReferenceTuple &, const CmpTuple &, unsigned int, unsigned int)
Definition: reduce_key_val.hpp:157