43 #ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
44 #define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
46 #include "../common.hpp"
47 #include "../vec_traits.hpp"
48 #include "../functional.hpp"
50 namespace cv {
namespace gpu {
namespace device
52 namespace transform_detail
74 template <
typename T,
typename D,
typename UnOp,
typename Mask>
75 static __device__ __forceinline__
void unroll(
const T&
src, D&
dst,
const Mask&
mask, UnOp& op,
int x_shifted,
int y)
77 if (
mask(y, x_shifted))
81 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
82 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask, BinOp& op,
int x_shifted,
int y)
84 if (
mask(y, x_shifted))
85 dst.x = op(src1.x, src2.x);
90 template <
typename T,
typename D,
typename UnOp,
typename Mask>
91 static __device__ __forceinline__
void unroll(
const T&
src, D&
dst,
const Mask&
mask, UnOp& op,
int x_shifted,
int y)
93 if (
mask(y, x_shifted))
95 if (
mask(y, x_shifted + 1))
99 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
100 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask, BinOp& op,
int x_shifted,
int y)
102 if (
mask(y, x_shifted))
103 dst.x = op(src1.x, src2.x);
104 if (
mask(y, x_shifted + 1))
105 dst.y = op(src1.y, src2.y);
110 template <
typename T,
typename D,
typename UnOp,
typename Mask>
111 static __device__ __forceinline__
void unroll(
const T&
src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
113 if (
mask(y, x_shifted))
115 if (
mask(y, x_shifted + 1))
117 if (
mask(y, x_shifted + 2))
121 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
122 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
124 if (
mask(y, x_shifted))
125 dst.x = op(src1.x, src2.x);
126 if (
mask(y, x_shifted + 1))
127 dst.y = op(src1.y, src2.y);
128 if (
mask(y, x_shifted + 2))
129 dst.z = op(src1.z, src2.z);
134 template <
typename T,
typename D,
typename UnOp,
typename Mask>
135 static __device__ __forceinline__
void unroll(
const T&
src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
137 if (
mask(y, x_shifted))
139 if (
mask(y, x_shifted + 1))
141 if (
mask(y, x_shifted + 2))
143 if (
mask(y, x_shifted + 3))
147 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
148 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
150 if (
mask(y, x_shifted))
151 dst.x = op(src1.x, src2.x);
152 if (
mask(y, x_shifted + 1))
153 dst.y = op(src1.y, src2.y);
154 if (
mask(y, x_shifted + 2))
155 dst.z = op(src1.z, src2.z);
156 if (
mask(y, x_shifted + 3))
157 dst.w = op(src1.w, src2.w);
162 template <
typename T,
typename D,
typename UnOp,
typename Mask>
163 static __device__ __forceinline__
void unroll(
const T&
src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
165 if (
mask(y, x_shifted))
167 if (
mask(y, x_shifted + 1))
169 if (
mask(y, x_shifted + 2))
171 if (
mask(y, x_shifted + 3))
173 if (
mask(y, x_shifted + 4))
175 if (
mask(y, x_shifted + 5))
177 if (
mask(y, x_shifted + 6))
179 if (
mask(y, x_shifted + 7))
183 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
184 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
186 if (
mask(y, x_shifted))
187 dst.a0 = op(src1.a0, src2.a0);
188 if (
mask(y, x_shifted + 1))
189 dst.a1 = op(src1.a1, src2.a1);
190 if (
mask(y, x_shifted + 2))
191 dst.a2 = op(src1.a2, src2.a2);
192 if (
mask(y, x_shifted + 3))
193 dst.a3 = op(src1.a3, src2.a3);
194 if (
mask(y, x_shifted + 4))
195 dst.a4 = op(src1.a4, src2.a4);
196 if (
mask(y, x_shifted + 5))
197 dst.a5 = op(src1.a5, src2.a5);
198 if (
mask(y, x_shifted + 6))
199 dst.a6 = op(src1.a6, src2.a6);
200 if (
mask(y, x_shifted + 7))
201 dst.a7 = op(src1.a7, src2.a7);
205 template <
typename T,
typename D,
typename UnOp,
typename Mask>
209 typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
210 typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
212 const int x = threadIdx.x + blockIdx.x * blockDim.x;
213 const int y = threadIdx.y + blockIdx.y * blockDim.y;
214 const int x_shifted = x * ft::smart_shift;
221 if (x_shifted + ft::smart_shift - 1 < src_.
cols)
223 const read_type src_n_el = ((
const read_type*)src)[
x];
224 write_type dst_n_el = ((
const write_type*)dst)[
x];
226 OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
228 ((write_type*)dst)[
x] = dst_n_el;
232 for (
int real_x = x_shifted; real_x < src_.
cols; ++real_x)
235 dst[real_x] = op(src[real_x]);
241 template <
typename T,
typename D,
typename UnOp,
typename Mask>
242 __global__
static void transformSimple(
const PtrStepSz<T> src, PtrStep<D> dst,
const Mask mask,
const UnOp op)
244 const int x = blockDim.x * blockIdx.x + threadIdx.x;
245 const int y = blockDim.y * blockIdx.y + threadIdx.y;
247 if (x < src.cols && y < src.rows && mask(y, x))
249 dst.ptr(y)[
x] = op(src.ptr(y)[
x]);
253 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
254 static __global__
void transformSmart(
const PtrStepSz<T1> src1_,
const PtrStep<T2> src2_, PtrStep<D> dst_,
255 const Mask mask,
const BinOp op)
257 typedef TransformFunctorTraits<BinOp> ft;
258 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
259 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
260 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
262 const int x = threadIdx.x + blockIdx.x * blockDim.x;
263 const int y = threadIdx.y + blockIdx.y * blockDim.y;
264 const int x_shifted = x * ft::smart_shift;
268 const T1*
src1 = src1_.ptr(y);
269 const T2*
src2 = src2_.ptr(y);
270 D* dst = dst_.ptr(y);
272 if (x_shifted + ft::smart_shift - 1 < src1_.cols)
274 const read_type1 src1_n_el = ((
const read_type1*)src1)[
x];
275 const read_type2 src2_n_el = ((
const read_type2*)src2)[
x];
276 write_type dst_n_el = ((
const write_type*)dst)[
x];
278 OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
280 ((write_type*)dst)[
x] = dst_n_el;
284 for (
int real_x = x_shifted; real_x < src1_.cols; ++real_x)
287 dst[real_x] = op(src1[real_x], src2[real_x]);
293 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
294 static __global__
void transformSimple(
const PtrStepSz<T1> src1,
const PtrStep<T2> src2, PtrStep<D> dst,
295 const Mask mask,
const BinOp op)
297 const int x = blockDim.x * blockIdx.x + threadIdx.x;
298 const int y = blockDim.y * blockIdx.y + threadIdx.y;
300 if (x < src1.cols && y < src1.rows && mask(y, x))
302 const T1 src1_data = src1.ptr(y)[
x];
303 const T2 src2_data = src2.ptr(y)[
x];
304 dst.ptr(y)[
x] = op(src1_data, src2_data);
311 template <
typename T,
typename D,
typename UnOp,
typename Mask>
316 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
319 transformSimple<T, D><<<grid, threads, 0, stream>>>(
src,
dst,
mask, op);
320 cudaSafeCall( cudaGetLastError() );
323 cudaSafeCall( cudaDeviceSynchronize() );
326 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
331 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
334 transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(
src1,
src2,
dst,
mask, op);
335 cudaSafeCall( cudaGetLastError() );
338 cudaSafeCall( cudaDeviceSynchronize() );
343 template <
typename T,
typename D,
typename UnOp,
typename Mask>
350 if (!isAligned(src.
data, ft::smart_shift *
sizeof(
T)) || !isAligned(src.
step, ft::smart_shift *
sizeof(
T)) ||
351 !isAligned(dst.
data, ft::smart_shift *
sizeof(D)) || !isAligned(dst.
step, ft::smart_shift *
sizeof(D)))
357 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
358 const dim3 grid(
divUp(src.
cols, threads.x * ft::smart_shift),
divUp(src.
rows, threads.y), 1);
360 transformSmart<T, D><<<grid, threads, 0, stream>>>(
src,
dst,
mask, op);
361 cudaSafeCall( cudaGetLastError() );
364 cudaSafeCall( cudaDeviceSynchronize() );
367 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
374 if (!isAligned(src1.
data, ft::smart_shift *
sizeof(T1)) || !isAligned(src1.
step, ft::smart_shift *
sizeof(T1)) ||
375 !isAligned(src2.
data, ft::smart_shift *
sizeof(T2)) || !isAligned(src2.
step, ft::smart_shift *
sizeof(T2)) ||
376 !isAligned(dst.
data, ft::smart_shift *
sizeof(D)) || !isAligned(dst.
step, ft::smart_shift *
sizeof(D)))
382 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
383 const dim3 grid(
divUp(src1.
cols, threads.x * ft::smart_shift),
divUp(src1.
rows, threads.y), 1);
385 transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(
src1,
src2,
dst,
mask, op);
386 cudaSafeCall( cudaGetLastError() );
389 cudaSafeCall( cudaDeviceSynchronize() );
395 #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
const void * src
Definition: core_c.h:1568
GLenum GLint GLint y
Definition: core_c.h:613
const CvArr * src1
Definition: core_c.h:436
int cols
Definition: cuda_devptrs.hpp:113
GLuint src
Definition: core_c.h:1650
T * data
Definition: cuda_devptrs.hpp:71
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
__host__ __device__ __forceinline__ int divUp(int total, int grain)
Definition: common.hpp:90
Definition: cuda_devptrs.hpp:104
OutputArray dst
Definition: imgproc.hpp:823
GLenum GLint x
Definition: core_c.h:632
Definition: cuda_devptrs.hpp:89
size_t step
stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! ...
Definition: cuda_devptrs.hpp:95
__CV_GPU_HOST_DEVICE__ T * ptr(int y=0)
Definition: cuda_devptrs.hpp:97
int rows
Definition: cuda_devptrs.hpp:114
CV_EXPORTS int check(const Mat &data, double min_val, double max_val, vector< int > *idx)
Definition: vec_traits.hpp:50
GLuint dst
Definition: calib3d.hpp:134
false
Definition: color.hpp:230
true
Definition: color.hpp:221
int x
Definition: highgui_c.h:186
const CvArr const CvArr * src2
Definition: core_c.h:436
GLenum GLint GLuint mask
Definition: tracking.hpp:132
CvArr const CvArr * mask
Definition: core_c.h:288