#ifndef __OPENCV_GPU_DEVICE_BLOCK_HPP__
#define __OPENCV_GPU_DEVICE_BLOCK_HPP__

namespace cv { namespace gpu { namespace device
{
    struct Block
    {
        // Index of this thread block within the grid (x dimension).
        static __device__ __forceinline__ unsigned int id()
        {
            return blockIdx.x;
        }
        // Total number of threads in the block; used as the loop stride below.
        static __device__ __forceinline__ unsigned int stride()
        {
            return blockDim.x * blockDim.y * blockDim.z;
        }
        // Barrier across all threads of the block.
        static __device__ __forceinline__ void sync()
        {
            __syncthreads();
        }

        // Linearizes the (up to 3D) thread index into a single 0-based id.
        static __device__ __forceinline__ int flattenedThreadId()
        {
            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }
        // Cooperatively fills [beg, end) with value: each thread writes every
        // stride()-th element, starting at its flattened id.
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            int STRIDE = stride();
            It t = beg + flattenedThreadId();

            for(; t < end; t += STRIDE)
                *t = value;
        }
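        // A minimal usage sketch (hypothetical kernel, not part of the
        // header): zero a shared histogram before the block accumulates
        // into it, then write it out cooperatively.
        //
        //   __global__ void histogram256(const unsigned char* src, int n, int* hist)
        //   {
        //       __shared__ int smem[256];
        //       cv::gpu::device::Block::fill(smem, smem + 256, 0);
        //       cv::gpu::device::Block::sync();
        //       for (int i = cv::gpu::device::Block::flattenedThreadId(); i < n;
        //            i += cv::gpu::device::Block::stride())
        //           atomicAdd(&smem[src[i]], 1);
        //       cv::gpu::device::Block::sync();
        //       cv::gpu::device::Block::copy(smem, smem + 256, hist);
        //   }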
        // Block-wide iota ("yota" preserves the original identifier):
        // writes value, value + 1, value + 2, ... across [beg, end).
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            int STRIDE = stride();
            int tid = flattenedThreadId();
            value += tid;

            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                *t = value;
        }
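        // Illustrative one-liner (hypothetical buffer): number the elements
        // of a shared index table.
        //
        //   __shared__ int idx[128];
        //   cv::gpu::device::Block::yota(idx, idx + 128, 0);
        //   // after sync(), idx[i] == i for 0 <= i < 128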
        // Cooperative copy of [beg, end) to out.
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = *t;
        }
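        // Typical use (hypothetical kernel): stage a tile of global memory
        // into shared memory with all threads of the block participating.
        //
        //   __global__ void stage_tile(const float* src, float* dst)
        //   {
        //       __shared__ float tile[256];
        //       cv::gpu::device::Block::copy(src + blockIdx.x * 256, src + (blockIdx.x + 1) * 256, tile);
        //       cv::gpu::device::Block::sync();
        //       dst[blockIdx.x * 256 + threadIdx.x] = tile[threadIdx.x];
        //   }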
        // Cooperative unary transform ("transfrom" preserves the original
        // identifier): writes op(*it) for each element of [beg, end).
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ void transfrom(InIt beg, InIt end, OutIt out, UnOp op)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = op(*t);
        }
        // Cooperative binary transform: walks both inputs in lockstep and
        // writes op(*it1, *it2) for each pair.
        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
        static __device__ __forceinline__ void transfrom(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
        {
            int STRIDE = stride();
            InIt1 t1 = beg1 + flattenedThreadId();
            InIt2 t2 = beg2 + flattenedThreadId();
            OutIt o = out + (t1 - beg1);

            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                *o = op(*t1, *t2);
        }
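        // Illustrative use of the binary overload (hypothetical functor):
        // element-wise sum of two buffers.
        //
        //   struct Add { __device__ float operator()(float a, float b) const { return a + b; } };
        //
        //   __device__ void add_buffers(const float* a, const float* b, float* out, int n)
        //   {
        //       cv::gpu::device::Block::transfrom(a, a + n, b, out, Add());
        //       cv::gpu::device::Block::sync();
        //   }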
        // In-place tree reduction over a shared-memory buffer holding
        // CTA_SIZE elements; the result ends up in buffer[0]. The last six
        // steps run inside a single warp and rely on implicit warp-synchronous
        // execution (hence the volatile pointer), an idiom that predates
        // Volta's independent thread scheduling and is only safe on pre-Volta GPUs.
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid];

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }
        }
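        // A minimal sketch (hypothetical kernel and functor, not part of the
        // header): a 256-thread block sums one value per thread and thread 0
        // writes the result.
        //
        //   struct Sum { __device__ float operator()(float a, float b) const { return a + b; } };
        //
        //   __global__ void block_sum(const float* in, float* out)
        //   {
        //       __shared__ float smem[256];
        //       int tid = cv::gpu::device::Block::flattenedThreadId();
        //       smem[tid] = in[blockIdx.x * 256 + tid];
        //       __syncthreads();
        //       cv::gpu::device::Block::reduce<256>(smem, Sum());
        //       if (tid == 0) out[blockIdx.x] = smem[0];
        //   }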
        // Same reduction, but each thread contributes init, and the reduced
        // value is returned to every thread in the block.
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid] = init;
            __syncthreads();

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }

            __syncthreads();
            return buffer[0];
        }
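        // Illustrative contrast with the overload above (hypothetical kernel,
        // reusing the Sum functor sketched earlier): every thread receives the
        // reduced value, so no thread-0 broadcast is needed.
        //
        //   __global__ void block_sum_all(const float* in, float* out)
        //   {
        //       __shared__ float smem[256];
        //       int tid = cv::gpu::device::Block::flattenedThreadId();
        //       float total = cv::gpu::device::Block::reduce<256>(smem, in[blockIdx.x * 256 + tid], Sum());
        //       out[blockIdx.x * 256 + tid] = total;
        //   }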
        // Reduction over n elements, where n need not match the block size:
        // fold the tail beyond stride() elements first, then halve the range
        // until data[0] holds the result.
        template <typename T, class BinOp>
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            unsigned int ftid = flattenedThreadId();
            unsigned int sft = stride();

            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);
                __syncthreads();
                n = sft;
            }

            while (n > 1)
            {
                unsigned int half = n / 2;
                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);
                __syncthreads();
                n = n - half;
            }
        }
    };
}}} // namespace cv { namespace gpu { namespace device

#endif // __OPENCV_GPU_DEVICE_BLOCK_HPP__
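// A minimal usage sketch for reduce_n (hypothetical kernel and functor, not
// part of the header): n is a runtime value and need not equal the block
// size. Assumes a single-block launch, since blocks would otherwise race on
// the same data.
//
//   struct Sum { __device__ float operator()(float a, float b) const { return a + b; } };
//
//   __global__ void sum_n(float* data, unsigned int n)
//   {
//       cv::gpu::device::Block::reduce_n(data, n, Sum());
//       // data[0] now holds the combined result of data[0..n)
//   }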