43 #ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
44 #define __OPENCV_GPU_REDUCE_DETAIL_HPP__
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
50 namespace cv {
namespace gpu {
namespace device
52 namespace reduce_detail
59 template <
typename T>
struct GetType<volatile
T*>
68 template <
unsigned int I,
unsigned int N>
71 template <
class Po
interTuple,
class ValTuple>
72 static __device__
void loadToSmem(
const PointerTuple& smem,
const ValTuple&
val,
unsigned int tid)
74 thrust::get<I>(smem)[tid] = thrust::get<I>(val);
78 template <
class Po
interTuple,
class ValTuple>
79 static __device__
void loadFromSmem(
const PointerTuple& smem,
const ValTuple&
val,
unsigned int tid)
81 thrust::get<I>(val) = thrust::get<I>(smem)[tid];
86 template <
class Po
interTuple,
class ValTuple,
class OpTuple>
87 static __device__
void merge(
const PointerTuple& smem,
const ValTuple&
val,
unsigned int tid,
unsigned int delta,
const OpTuple& op)
90 thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
94 template <
class ValTuple,
class OpTuple>
98 thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
103 template <
unsigned int N>
106 template <
class Po
interTuple,
class ValTuple>
107 static __device__
void loadToSmem(
const PointerTuple&,
const ValTuple&,
unsigned int)
110 template <
class Po
interTuple,
class ValTuple>
111 static __device__
void loadFromSmem(
const PointerTuple&,
const ValTuple&,
unsigned int)
115 template <
class Po
interTuple,
class ValTuple,
class OpTuple>
116 static __device__
void merge(
const PointerTuple&,
const ValTuple&,
unsigned int,
unsigned int,
const OpTuple&)
119 template <
class ValTuple,
class OpTuple>
120 static __device__
void mergeShfl(
const ValTuple&,
unsigned int,
unsigned int,
const OpTuple&)
125 template <
typename T>
126 __device__ __forceinline__
void loadToSmem(
volatile T* smem,
T&
val,
unsigned int tid)
130 template <
typename T>
135 template <
typename P0,
typename P1,
typename P2,
typename P3,
typename P4,
typename P5,
typename P6,
typename P7,
typename P8,
typename P9,
136 typename R0,
typename R1,
typename R2,
typename R3,
typename R4,
typename R5,
typename R6,
typename R7,
typename R8,
typename R9>
137 __device__ __forceinline__
void loadToSmem(
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
138 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&
val,
141 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >
::value>
::loadToSmem(smem, val, tid);
143 template <
typename P0,
typename P1,
typename P2,
typename P3,
typename P4,
typename P5,
typename P6,
typename P7,
typename P8,
typename P9,
144 typename R0,
typename R1,
typename R2,
typename R3,
typename R4,
typename R5,
typename R6,
typename R7,
typename R8,
typename R9>
145 __device__ __forceinline__
void loadFromSmem(
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
146 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&
val,
149 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >
::value>
::loadFromSmem(smem, val, tid);
152 template <
typename T,
class Op>
153 __device__ __forceinline__
void merge(
volatile T* smem,
T&
val,
unsigned int tid,
unsigned int delta,
const Op& op)
155 T reg = smem[tid +
delta];
156 smem[tid] = val = op(val, reg);
158 template <
typename T,
class Op>
164 template <
typename P0,
typename P1,
typename P2,
typename P3,
typename P4,
typename P5,
typename P6,
typename P7,
typename P8,
typename P9,
165 typename R0,
typename R1,
typename R2,
typename R3,
typename R4,
typename R5,
typename R6,
typename R7,
typename R8,
typename R9,
166 class Op0,
class Op1,
class Op2,
class Op3,
class Op4,
class Op5,
class Op6,
class Op7,
class Op8,
class Op9>
167 __device__ __forceinline__
void merge(
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
168 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&
val,
171 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
173 For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >
::value>
::merge(smem, val, tid, delta, op);
175 template <
typename R0,
typename R1,
typename R2,
typename R3,
typename R4,
typename R5,
typename R6,
typename R7,
typename R8,
typename R9,
176 class Op0,
class Op1,
class Op2,
class Op3,
class Op4,
class Op5,
class Op6,
class Op7,
class Op8,
class Op9>
177 __device__ __forceinline__
void mergeShfl(
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&
val,
180 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
182 For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >
::value>
::mergeShfl(val, delta, width, op);
187 template <
typename Po
inter,
typename Reference,
class Op>
188 static __device__
void reduce(Pointer smem, Reference
val,
unsigned int tid, Op op)
197 merge(smem, val, tid, 1024, op);
204 merge(smem, val, tid, 512, op);
211 merge(smem, val, tid, 256, op);
218 merge(smem, val, tid, 128, op);
225 merge(smem, val, tid, 64, op);
232 merge(smem, val, tid, 32, op);
237 merge(smem, val, tid, 16, op);
238 merge(smem, val, tid, 8, op);
239 merge(smem, val, tid, 4, op);
240 merge(smem, val, tid, 2, op);
241 merge(smem, val, tid, 1, op);
246 template <
unsigned int I,
typename Po
inter,
typename Reference,
class Op>
249 static __device__
void loopShfl(Reference
val, Op op,
unsigned int N)
254 static __device__
void loop(Pointer smem, Reference
val,
unsigned int tid, Op op)
256 merge(smem, val, tid, I, op);
257 Unroll<I / 2, Pointer, Reference, Op>
::loop(smem, val, tid, op);
260 template <
typename Po
inter,
typename Reference,
class Op>
263 static __device__
void loopShfl(Reference, Op,
unsigned int)
266 static __device__
void loop(Pointer, Reference,
unsigned int, Op)
273 template <
typename Po
inter,
typename Reference,
class Op>
274 static __device__
void reduce(Pointer smem, Reference
val,
unsigned int tid, Op op)
276 #if __CUDA_ARCH__ >= 300
280 Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
285 Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
294 template <
typename Po
inter,
typename Reference,
class Op>
295 static __device__
void reduce(Pointer smem, Reference
val,
unsigned int tid, Op op)
299 #if __CUDA_ARCH__ >= 300
322 #if __CUDA_ARCH__ >= 300
323 Unroll<
M / 2, Pointer, Reference, Op>::loopShfl(val, op,
M);
325 Unroll<
M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
331 template <
bool val,
class T1,
class T2>
struct StaticIf;
343 enum {
value = ((N != 0) && !(N & (N - 1))) };
361 #endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
Definition: reduce.hpp:331
Definition: reduce.hpp:247
Definition: reduce.hpp:341
static __device__ void loadToSmem(const PointerTuple &smem, const ValTuple &val, unsigned int tid)
Definition: reduce.hpp:72
const CvMat const CvMat const CvMat CvSize const CvMat const CvMat CvMat CvMat * R2
Definition: calib3d.hpp:284
int CvScalar value
Definition: core_c.h:340
__device__ __forceinline__ void loadToSmem(volatile T *smem, T &val, unsigned int tid)
Definition: reduce.hpp:126
Definition: reduce.hpp:290
__device__ __forceinline__ void loadFromSmem(volatile T *smem, T &val, unsigned int tid)
Definition: reduce.hpp:131
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:58
static __device__ void mergeShfl(const ValTuple &, unsigned int, unsigned int, const OpTuple &)
Definition: reduce.hpp:120
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
Definition: reduce.hpp:249
Definition: reduce.hpp:346
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width=warpSize)
Definition: warp_shuffle.hpp:81
__device__ __forceinline__ void mergeShfl(T &val, unsigned int delta, unsigned int width, const Op &op)
Definition: reduce.hpp:159
Definition: reduce.hpp:292
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
Definition: reduce.hpp:254
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
static __device__ void loadFromSmem(const PointerTuple &smem, const ValTuple &val, unsigned int tid)
Definition: reduce.hpp:79
Definition: reduce.hpp:185
static __device__ void loopShfl(Reference, Op, unsigned int)
Definition: reduce.hpp:263
StaticIf< (N<=32)&&IsPowerOf2< N >::value, WarpOptimized< N >, typename StaticIf< (N<=1024)&&IsPowerOf2< N >::value, GenericOptimized32< N >, Generic< N > >::type >::type reductor
Definition: reduce.hpp:356
Definition: reduce.hpp:271
const CvMat const CvMat const CvMat CvSize const CvMat const CvMat CvMat * R1
Definition: calib3d.hpp:284
typedef void(CV_CDECL *CvMouseCallback)(int event
Definition: reduce.hpp:69
static __device__ void loop(Pointer, Reference, unsigned int, Op)
Definition: reduce.hpp:266
static __device__ void loadToSmem(const PointerTuple &, const ValTuple &, unsigned int)
Definition: reduce.hpp:107
static __device__ void merge(const PointerTuple &, const ValTuple &, unsigned int, unsigned int, const OpTuple &)
Definition: reduce.hpp:116
CvSize int int int CvPoint int delta
Definition: core_c.h:1427
T2 type
Definition: reduce.hpp:338
const CvMat const CvMat const CvMat CvSize const CvMat const CvMat CvMat CvMat CvMat * P1
Definition: calib3d.hpp:284
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
Definition: reduce.hpp:295
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
Definition: reduce.hpp:274
T type
Definition: reduce.hpp:57
static __device__ void merge(const PointerTuple &smem, const ValTuple &val, unsigned int tid, unsigned int delta, const OpTuple &op)
Definition: reduce.hpp:87
static __device__ void mergeShfl(const ValTuple &val, unsigned int delta, unsigned int width, const OpTuple &op)
Definition: reduce.hpp:95
Definition: reduce.hpp:54
T1 type
Definition: reduce.hpp:334
T type
Definition: reduce.hpp:61
GLsizei const GLfloat * value
Definition: core_c.h:341
GLuint GLuint GLsizei GLenum type
Definition: core_c.h:114
double double double double double double CvSize * warpSize
Definition: legacy.hpp:680
T type
Definition: reduce.hpp:65
false
Definition: color.hpp:230
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
Definition: reduce.hpp:188
true
Definition: color.hpp:221
static __device__ void loadFromSmem(const PointerTuple &, const ValTuple &, unsigned int)
Definition: reduce.hpp:111
__device__ __forceinline__ void merge(volatile T *smem, T &val, unsigned int tid, unsigned int delta, const Op &op)
Definition: reduce.hpp:153
const CvMat const CvMat const CvMat CvSize const CvMat const CvMat CvMat CvMat CvMat CvMat * P2
Definition: calib3d.hpp:284