00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
00044 #define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
00045
00046 #include "../common.hpp"
00047 #include "../vec_traits.hpp"
00048 #include "../functional.hpp"
00049
00050 namespace cv { namespace gpu { namespace device
00051 {
00052 namespace transform_detail
00053 {
00055
//! Maps a scalar source type T and destination type D to the vector types
//! used for vectorized loads/stores; shift = number of elements packed per
//! thread (selected via TransformFunctorTraits::smart_shift).
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
    typedef typename TypeVec<T, shift>::vec_type read_type;   // vector type for loading `shift` source elements at once
    typedef typename TypeVec<D, shift>::vec_type write_type;  // vector type for storing `shift` destination elements at once
};
00061
//! Binary-operation counterpart of UnaryReadWriteTraits: vector types for the
//! two source operands (T1, T2) and the destination (D), `shift` elements each.
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
    typedef typename TypeVec<T1, shift>::vec_type read_type1;  // vector load type for the first source
    typedef typename TypeVec<T2, shift>::vec_type read_type2;  // vector load type for the second source
    typedef typename TypeVec<D, shift>::vec_type write_type;   // vector store type for the destination
};
00068
00070
//! OpUnroller<shift> applies a unary/binary functor lane-by-lane to a
//! vectorized load of `shift` elements, guarding every lane with the mask.
//! Lanes rejected by the mask leave the corresponding dst lane untouched.
template <int shift> struct OpUnroller;
template <> struct OpUnroller<1>
{
    //! Unary: dst.x = op(src.x) if pixel (y, x_shifted) passes the mask.
    //! Fix: op is taken by const reference, consistent with the <3>/<4>/<8>
    //! specializations (the kernels pass const functors; previously this only
    //! compiled because UnOp was deduced as a const-qualified type).
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
    }

    //! Binary: dst.x = op(src1.x, src2.x), mask-guarded.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
    }
};
//! 2-lane specialization: processes the .x and .y lanes of a 2-element vector.
template <> struct OpUnroller<2>
{
    //! Unary: per-lane mask-guarded application of op.
    //! Fix: op is taken by const reference, consistent with the <3>/<4>/<8>
    //! specializations (the kernels pass const functors; previously this only
    //! compiled because UnOp was deduced as a const-qualified type).
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
    }

    //! Binary: per-lane mask-guarded application of op to both sources.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
    }
};
//! 3-lane specialization: mask-guarded, lane-wise application over the
//! .x/.y/.z fields of a 3-element vector; rejected lanes keep the existing
//! destination value.
template <> struct OpUnroller<3>
{
    //! Unary form: res.lane = f(v.lane) for every lane whose pixel passes m.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& v, D& res, const Mask& m, const UnOp& f, int x0, int y)
    {
        if (m(y, x0    )) res.x = f(v.x);
        if (m(y, x0 + 1)) res.y = f(v.y);
        if (m(y, x0 + 2)) res.z = f(v.z);
    }

    //! Binary form: res.lane = f(a.lane, b.lane), guarded per lane by m.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& a, const T2& b, D& res, const Mask& m, const BinOp& f, int x0, int y)
    {
        if (m(y, x0    )) res.x = f(a.x, b.x);
        if (m(y, x0 + 1)) res.y = f(a.y, b.y);
        if (m(y, x0 + 2)) res.z = f(a.z, b.z);
    }
};
//! 4-lane specialization: mask-guarded, lane-wise application over the
//! .x/.y/.z/.w fields of a 4-element vector.
template <> struct OpUnroller<4>
{
    // Unary: dst.lane = op(src.lane) for each lane whose pixel passes the mask.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src.z);
        if (mask(y, x_shifted + 3))
            dst.w = op(src.w);
    }

    // Binary: dst.lane = op(src1.lane, src2.lane), mask-guarded per lane.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src1.z, src2.z);
        if (mask(y, x_shifted + 3))
            dst.w = op(src1.w, src2.w);
    }
};
//! 8-lane specialization: operates on the a0..a7 fields of the 8-element
//! aggregate produced by TypeVec<..., 8> (built-in vector types stop at 4
//! components, so 8-wide vectors use named a0..a7 members instead).
template <> struct OpUnroller<8>
{
    // Unary: dst.aN = op(src.aN) for each of the 8 lanes passing the mask.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.a0 = op(src.a0);
        if (mask(y, x_shifted + 1))
            dst.a1 = op(src.a1);
        if (mask(y, x_shifted + 2))
            dst.a2 = op(src.a2);
        if (mask(y, x_shifted + 3))
            dst.a3 = op(src.a3);
        if (mask(y, x_shifted + 4))
            dst.a4 = op(src.a4);
        if (mask(y, x_shifted + 5))
            dst.a5 = op(src.a5);
        if (mask(y, x_shifted + 6))
            dst.a6 = op(src.a6);
        if (mask(y, x_shifted + 7))
            dst.a7 = op(src.a7);
    }

    // Binary: dst.aN = op(src1.aN, src2.aN), mask-guarded per lane.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.a0 = op(src1.a0, src2.a0);
        if (mask(y, x_shifted + 1))
            dst.a1 = op(src1.a1, src2.a1);
        if (mask(y, x_shifted + 2))
            dst.a2 = op(src1.a2, src2.a2);
        if (mask(y, x_shifted + 3))
            dst.a3 = op(src1.a3, src2.a3);
        if (mask(y, x_shifted + 4))
            dst.a4 = op(src1.a4, src2.a4);
        if (mask(y, x_shifted + 5))
            dst.a5 = op(src1.a5, src2.a5);
        if (mask(y, x_shifted + 6))
            dst.a6 = op(src1.a6, src2.a6);
        if (mask(y, x_shifted + 7))
            dst.a7 = op(src1.a7, src2.a7);
    }
};
00204
//! Vectorized unary transform kernel: each thread processes ft::smart_shift
//! consecutive elements of one image row through a single vector load/store.
//! Preconditions (checked by TransformDispatcher<true> before launch): the
//! data pointers and row steps of src_ and dst_ are aligned to
//! smart_shift * sizeof(element); otherwise behavior is undefined.
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
    typedef TransformFunctorTraits<UnOp> ft;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // first scalar column handled by this thread

    if (y < src_.rows)
    {
        const T* src = src_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src_.cols)
        {
            // Fast path: the whole vector lies inside the row. The current
            // destination vector is loaded first so that lanes rejected by
            // the mask keep their existing values when written back.
            const read_type src_n_el = ((const read_type*)src)[x];
            write_type dst_n_el = ((const write_type*)dst)[x];

            OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

            ((write_type*)dst)[x] = dst_n_el;
        }
        else
        {
            // Tail path: the row length is not a multiple of smart_shift;
            // process the trailing elements one scalar at a time.
            for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src[real_x]);
            }
        }
    }
}
00240
//! Naive one-element-per-thread unary kernel: thread (col, row) applies op to
//! a single pixel if it lies inside the image and is selected by the mask.
//! No alignment requirements — this is the fallback for the "smart" kernel.
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: threads outside the image do nothing.
    if (col >= src.cols || row >= src.rows)
        return;

    if (mask(row, col))
        dst.ptr(row)[col] = op(src.ptr(row)[col]);
}
00252
//! Vectorized binary transform kernel: each thread processes ft::smart_shift
//! consecutive elements of one row from both sources via vector loads/stores.
//! Preconditions (checked by TransformDispatcher<true> before launch): the
//! data pointers and row steps of src1_, src2_ and dst_ are aligned to
//! smart_shift * sizeof(element). src2_ is assumed to have at least the same
//! extent as src1_ (only src1_'s size is consulted).
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
    const Mask mask, const BinOp op)
{
    typedef TransformFunctorTraits<BinOp> ft;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // first scalar column handled by this thread

    if (y < src1_.rows)
    {
        const T1* src1 = src1_.ptr(y);
        const T2* src2 = src2_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src1_.cols)
        {
            // Fast path: full vector inside the row. Load the current dst
            // vector first so mask-rejected lanes keep their old values.
            const read_type1 src1_n_el = ((const read_type1*)src1)[x];
            const read_type2 src2_n_el = ((const read_type2*)src2)[x];
            write_type dst_n_el = ((const write_type*)dst)[x];

            OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

            ((write_type*)dst)[x] = dst_n_el;
        }
        else
        {
            // Tail path: scalar processing for the last partial vector.
            for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src1[real_x], src2[real_x]);
            }
        }
    }
}
00292
//! Naive one-element-per-thread binary kernel: thread (col, row) combines a
//! single pixel from each source if it lies inside src1 and passes the mask.
//! src2 is assumed to cover at least src1's extent.
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
    const Mask mask, const BinOp op)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard clause: skip out-of-range threads and mask-rejected pixels.
    if (col >= src1.cols || row >= src1.rows || !mask(row, col))
        return;

    const T1 a = src1.ptr(row)[col];
    const T2 b = src2.ptr(row)[col];
    dst.ptr(row)[col] = op(a, b);
}
00307
//! Host-side launcher for the transform kernels; the bool template parameter
//! selects the vectorized ("smart") path. This <false> specialization always
//! launches the simple one-element-per-thread kernels and has no alignment
//! requirements.
template <bool UseSmart> struct TransformDispatcher;
template<> struct TransformDispatcher<false>
{
    //! Launch the simple unary kernel on `stream`. When stream is the default
    //! (0) stream, blocks until the kernel finishes so execution errors
    //! surface here rather than at some later synchronizing call.
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

        transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );  // catches launch-configuration errors

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    //! Launch the simple binary kernel; same stream/synchronization semantics
    //! as the unary overload. The grid is sized from src1 only.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

        transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );  // catches launch-configuration errors

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
//! "Smart" dispatcher: launches the vectorized kernels when every buffer's
//! data pointer and row step are aligned for smart_shift-wide vector access;
//! otherwise falls back to TransformDispatcher<false> at runtime.
template<> struct TransformDispatcher<true>
{
    //! Launch the vectorized unary kernel (or the simple fallback if
    //! alignment does not permit vector loads/stores).
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        // smart_shift == 1 would make the vectorized path pointless; such
        // functors must be dispatched through TransformDispatcher<false>.
        StaticAssert<ft::smart_shift != 1>::check();

        if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            TransformDispatcher<false>::call(src, dst, op, mask, stream);
            return;
        }

        // Each thread handles smart_shift columns, so the grid x-dimension
        // shrinks by that factor.
        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

        transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );  // catches launch-configuration errors

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    //! Launch the vectorized binary kernel; all three buffers must satisfy
    //! the alignment requirement or the simple fallback is used instead.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        // See the unary overload: shift-1 functors take the simple path.
        StaticAssert<ft::smart_shift != 1>::check();

        if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
            !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
            return;
        }

        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

        transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );  // catches launch-configuration errors

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
00392 }
00393 }}}
00394
00395 #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__