transform_detail.hpp
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
44 #define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
45 
46 #include "../common.hpp"
47 #include "../vec_traits.hpp"
48 #include "../functional.hpp"
49 
50 namespace cv { namespace gpu { namespace device
51 {
52  namespace transform_detail
53  {
55 
//! Read/Write traits for the unary transform kernels: map the scalar source
//! type T and destination type D to vector types packing `shift` elements,
//! so one vectorized load/store moves `shift` elements at a time.
//! (Restored: the two typedefs were dropped by extraction; they are required
//! by transformSmart below.)
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
    typedef typename TypeVec<T, shift>::vec_type read_type;   // packed source element
    typedef typename TypeVec<D, shift>::vec_type write_type;  // packed destination element
};
61 
//! Read/Write traits for the binary transform kernels: packed vector types
//! for both source operands and the destination.
//! (Restored: the three typedefs were dropped by extraction; they are
//! required by the binary transformSmart below.)
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
    typedef typename TypeVec<T1, shift>::vec_type read_type1;  // packed first source element
    typedef typename TypeVec<T2, shift>::vec_type read_type2;  // packed second source element
    typedef typename TypeVec<D, shift>::vec_type write_type;   // packed destination element
};
68 
70 
71  template <int shift> struct OpUnroller;
//! shift == 1: a single element per thread.
//! Consistency fix: `op` is now taken by const reference, matching the
//! <3>/<4>/<8> specializations (the kernels pass a const functor).
template <> struct OpUnroller<1>
{
    //! Unary form: writes dst.x = op(src.x) when the mask admits (y, x_shifted).
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
    }

    //! Binary form: combines the corresponding lanes of both sources.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
    }
};
//! shift == 2: two packed elements (.x, .y) per thread.
//! Consistency fix: `op` is now taken by const reference, matching the
//! <3>/<4>/<8> specializations (the kernels pass a const functor).
template <> struct OpUnroller<2>
{
    //! Unary form: per-lane masked application of op.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
    }

    //! Binary form: combines corresponding lanes of both sources.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
    }
};
//! shift == 3: three packed elements (.x, .y, .z) per thread, each guarded
//! individually by the mask so partially-masked vectors are handled.
template <> struct OpUnroller<3>
{
    //! Unary form: lane i is written only when mask(row, firstCol + i) holds.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.x = op(src.x);
        if (mask(row, firstCol + 1)) dst.y = op(src.y);
        if (mask(row, firstCol + 2)) dst.z = op(src.z);
    }

    //! Binary form: corresponding lanes of both sources are combined.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.x = op(src1.x, src2.x);
        if (mask(row, firstCol + 1)) dst.y = op(src1.y, src2.y);
        if (mask(row, firstCol + 2)) dst.z = op(src1.z, src2.z);
    }
};
//! shift == 4: four packed elements (.x, .y, .z, .w) per thread, each lane
//! masked independently.
template <> struct OpUnroller<4>
{
    //! Unary form: lane i is written only when mask(row, firstCol + i) holds.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.x = op(src.x);
        if (mask(row, firstCol + 1)) dst.y = op(src.y);
        if (mask(row, firstCol + 2)) dst.z = op(src.z);
        if (mask(row, firstCol + 3)) dst.w = op(src.w);
    }

    //! Binary form: corresponding lanes of both sources are combined.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.x = op(src1.x, src2.x);
        if (mask(row, firstCol + 1)) dst.y = op(src1.y, src2.y);
        if (mask(row, firstCol + 2)) dst.z = op(src1.z, src2.z);
        if (mask(row, firstCol + 3)) dst.w = op(src1.w, src2.w);
    }
};
//! shift == 8: eight packed elements (.a0 .. .a7) per thread — the field
//! names follow the 8-wide structs produced by TypeVec. Every lane is
//! masked independently.
template <> struct OpUnroller<8>
{
    //! Unary form: lane i is written only when mask(row, firstCol + i) holds.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.a0 = op(src.a0);
        if (mask(row, firstCol + 1)) dst.a1 = op(src.a1);
        if (mask(row, firstCol + 2)) dst.a2 = op(src.a2);
        if (mask(row, firstCol + 3)) dst.a3 = op(src.a3);
        if (mask(row, firstCol + 4)) dst.a4 = op(src.a4);
        if (mask(row, firstCol + 5)) dst.a5 = op(src.a5);
        if (mask(row, firstCol + 6)) dst.a6 = op(src.a6);
        if (mask(row, firstCol + 7)) dst.a7 = op(src.a7);
    }

    //! Binary form: corresponding lanes of both sources are combined.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int firstCol, int row)
    {
        if (mask(row, firstCol))     dst.a0 = op(src1.a0, src2.a0);
        if (mask(row, firstCol + 1)) dst.a1 = op(src1.a1, src2.a1);
        if (mask(row, firstCol + 2)) dst.a2 = op(src1.a2, src2.a2);
        if (mask(row, firstCol + 3)) dst.a3 = op(src1.a3, src2.a3);
        if (mask(row, firstCol + 4)) dst.a4 = op(src1.a4, src2.a4);
        if (mask(row, firstCol + 5)) dst.a5 = op(src1.a5, src2.a5);
        if (mask(row, firstCol + 6)) dst.a6 = op(src1.a6, src2.a6);
        if (mask(row, firstCol + 7)) dst.a7 = op(src1.a7, src2.a7);
    }
};
204 
//! Vectorized ("smart") unary transform kernel.
//! Each thread handles ft::smart_shift consecutive elements of one row with a
//! single vectorized load and store; the ragged tail of a row falls back to a
//! scalar loop. The host-side dispatcher guarantees data/step alignment to
//! the vector width before launching this kernel.
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
    typedef TransformFunctorTraits<UnOp> ft;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // first scalar column covered by this thread

    if (y < src_.rows)
    {
        const T* src = src_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src_.cols)
        {
            // Whole vector lies inside the row: one packed load of the source.
            const read_type src_n_el = ((const read_type*)src)[x];
            // Pre-load the destination so lanes rejected by the mask keep
            // their previous values when the packed store is written back.
            write_type dst_n_el = ((const write_type*)dst)[x];

            OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

            ((write_type*)dst)[x] = dst_n_el;
        }
        else
        {
            // Row tail (fewer than smart_shift elements left): scalar fallback.
            for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src[real_x]);
            }
        }
    }
}
240 
//! Scalar ("simple") unary transform kernel: one element per thread.
//! Used when the vectorized path is unavailable (misaligned data or step).
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against the grid over-covering the image, then apply the mask.
    const bool inBounds = col < src.cols && row < src.rows;
    if (inBounds && mask(row, col))
        dst.ptr(row)[col] = op(src.ptr(row)[col]);
}
252 
//! Vectorized ("smart") binary transform kernel.
//! Mirrors the unary version: each thread handles ft::smart_shift packed
//! elements from both sources; the ragged row tail is processed element-wise.
//! Bounds are taken from src1_ only — src2_ is assumed to be at least as
//! large (presumably enforced by the host-side caller; TODO confirm).
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
                                      const Mask mask, const BinOp op)
{
    typedef TransformFunctorTraits<BinOp> ft;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // first scalar column covered by this thread

    if (y < src1_.rows)
    {
        const T1* src1 = src1_.ptr(y);
        const T2* src2 = src2_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src1_.cols)
        {
            // Whole vector lies inside the row: one packed load per source.
            const read_type1 src1_n_el = ((const read_type1*)src1)[x];
            const read_type2 src2_n_el = ((const read_type2*)src2)[x];
            // Pre-load the destination so lanes rejected by the mask keep
            // their previous values when the packed store is written back.
            write_type dst_n_el = ((const write_type*)dst)[x];

            OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

            ((write_type*)dst)[x] = dst_n_el;
        }
        else
        {
            // Row tail (fewer than smart_shift elements left): scalar fallback.
            for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src1[real_x], src2[real_x]);
            }
        }
    }
}
292 
//! Scalar ("simple") binary transform kernel: one element pair per thread.
//! Bounds come from src1; src2 is indexed with the same coordinates.
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
                                       const Mask mask, const BinOp op)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads past the image or rejected by the mask do nothing.
    if (col >= src1.cols || row >= src1.rows)
        return;
    if (!mask(row, col))
        return;

    dst.ptr(row)[col] = op(src1.ptr(row)[col], src2.ptr(row)[col]);
}
307 
308  template <bool UseSmart> struct TransformDispatcher;
//! Dispatcher for the scalar path: launches the one-element-per-thread
//! transformSimple kernels. With the null stream it synchronizes after
//! the launch, preserving synchronous semantics for callers.
template<> struct TransformDispatcher<false>
{
    //! Unary transform: dst(y,x) = op(src(y,x)) wherever mask(y,x) holds.
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

        transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    //! Binary transform: dst(y,x) = op(src1(y,x), src2(y,x)) under the mask.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        // Restored: this typedef was missing, leaving `ft` undefined below.
        typedef TransformFunctorTraits<BinOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

        transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
//! Dispatcher for the vectorized path: launches transformSmart when all data
//! pointers and row steps are aligned to the vector width, otherwise falls
//! back to the scalar dispatcher.
template<> struct TransformDispatcher<true>
{
    //! Unary transform via the vectorized kernel (with scalar fallback).
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        // Restored (dropped by extraction): the smart path is only meaningful
        // for a vector width greater than one.
        StaticAssert<ft::smart_shift != 1>::check();

        if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            TransformDispatcher<false>::call(src, dst, op, mask, stream);
            return;
        }

        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

        transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    //! Binary transform via the vectorized kernel (with scalar fallback).
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        // Restored: the `ft` typedef and the static assertion were missing
        // from this overload (`ft` was used undefined below).
        typedef TransformFunctorTraits<BinOp> ft;

        StaticAssert<ft::smart_shift != 1>::check();

        if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
            !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
            return;
        }

        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

        transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
392  } // namespace transform_detail
393 }}} // namespace cv { namespace gpu { namespace device
394 
395 #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
const void * src
Definition: core_c.h:1568
static void call(PtrStepSz< T1 > src1, PtrStepSz< T2 > src2, PtrStepSz< D > dst, BinOp op, Mask mask, cudaStream_t stream)
Definition: transform_detail.hpp:368
GLenum GLint GLint y
Definition: core_c.h:613
Definition: transform_detail.hpp:308
const CvArr * src1
Definition: core_c.h:436
int cols
Definition: cuda_devptrs.hpp:113
static __device__ __forceinline__ void unroll(const T &src, D &dst, const Mask &mask, UnOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:91
GLuint src
Definition: core_c.h:1650
T * data
Definition: cuda_devptrs.hpp:71
Transform kernels.
Definition: transform_detail.hpp:71
static void call(PtrStepSz< T > src, PtrStepSz< D > dst, UnOp op, Mask mask, cudaStream_t stream)
Definition: transform_detail.hpp:344
const CvMat const CvMat const CvMat CvMat CvMat CvMat CvMat CvSize CvMat CvMat * T
Definition: calib3d.hpp:270
__host__ __device__ __forceinline__ int divUp(int total, int grain)
Definition: common.hpp:90
Read Write Traits.
Definition: transform_detail.hpp:56
static __device__ __forceinline__ void unroll(const T &src, D &dst, const Mask &mask, const UnOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:163
Definition: cuda_devptrs.hpp:104
static __device__ __forceinline__ void unroll(const T1 &src1, const T2 &src2, D &dst, const Mask &mask, const BinOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:148
TypeVec< T, shift >::vec_type read_type
Definition: transform_detail.hpp:58
static __device__ __forceinline__ void unroll(const T1 &src1, const T2 &src2, D &dst, const Mask &mask, const BinOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:184
TypeVec< T1, shift >::vec_type read_type1
Definition: transform_detail.hpp:64
static void call(PtrStepSz< T > src, PtrStepSz< D > dst, UnOp op, Mask mask, cudaStream_t stream)
Definition: transform_detail.hpp:312
static __device__ __forceinline__ void unroll(const T &src, D &dst, const Mask &mask, UnOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:75
OutputArray dst
Definition: imgproc.hpp:823
GLenum GLint x
Definition: core_c.h:632
static __device__ __forceinline__ void unroll(const T1 &src1, const T2 &src2, D &dst, const Mask &mask, BinOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:100
Definition: cuda_devptrs.hpp:89
TypeVec< T2, shift >::vec_type read_type2
Definition: transform_detail.hpp:65
size_t step
stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! ...
Definition: cuda_devptrs.hpp:95
TypeVec< D, shift >::vec_type write_type
Definition: transform_detail.hpp:66
TypeVec< D, shift >::vec_type write_type
Definition: transform_detail.hpp:59
__CV_GPU_HOST_DEVICE__ T * ptr(int y=0)
Definition: cuda_devptrs.hpp:97
static __device__ __forceinline__ void unroll(const T &src, D &dst, const Mask &mask, const UnOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:111
static void call(PtrStepSz< T1 > src1, PtrStepSz< T2 > src2, PtrStepSz< D > dst, BinOp op, Mask mask, cudaStream_t stream)
Definition: transform_detail.hpp:327
int rows
Definition: cuda_devptrs.hpp:114
CV_EXPORTS int check(const Mat &data, double min_val, double max_val, vector< int > *idx)
Definition: vec_traits.hpp:50
GLuint dst
Definition: calib3d.hpp:134
false
Definition: color.hpp:230
true
Definition: color.hpp:221
int x
Definition: highgui_c.h:186
static __device__ __forceinline__ void unroll(const T &src, D &dst, const Mask &mask, const UnOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:135
static __device__ __forceinline__ void unroll(const T1 &src1, const T2 &src2, D &dst, const Mask &mask, const BinOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:122
Definition: functional.hpp:783
const CvArr const CvArr * src2
Definition: core_c.h:436
GLenum GLint GLuint mask
Definition: tracking.hpp:132
CvArr const CvArr * mask
Definition: core_c.h:288
static __device__ __forceinline__ void unroll(const T1 &src1, const T2 &src2, D &dst, const Mask &mask, BinOp &op, int x_shifted, int y)
Definition: transform_detail.hpp:82