include/opencv2/gpu/device/detail/transform_detail.hpp
Go to the documentation of this file.
00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Third party copyrights are property of their respective owners.
00016 //
00017 // Redistribution and use in source and binary forms, with or without modification,
00018 // are permitted provided that the following conditions are met:
00019 //
00020 //   * Redistribution's of source code must retain the above copyright notice,
00021 //     this list of conditions and the following disclaimer.
00022 //
00023 //   * Redistribution's in binary form must reproduce the above copyright notice,
00024 //     this list of conditions and the following disclaimer in the documentation
00025 //     and/or other materials provided with the distribution.
00026 //
00027 //   * The name of the copyright holders may not be used to endorse or promote products
00028 //     derived from this software without specific prior written permission.
00029 //
00030 // This software is provided by the copyright holders and contributors "as is" and
00031 // any express or implied warranties, including, but not limited to, the implied
00032 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00033 // In no event shall the Intel Corporation or contributors be liable for any direct,
00034 // indirect, incidental, special, exemplary, or consequential damages
00035 // (including, but not limited to, procurement of substitute goods or services;
00036 // loss of use, data, or profits; or business interruption) however caused
00037 // and on any theory of liability, whether in contract, strict liability,
00038 // or tort (including negligence or otherwise) arising in any way out of
00039 // the use of this software, even if advised of the possibility of such damage.
00040 //
00041 //M*/
00042 
00043 #ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
00044 #define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
00045 
00046 #include "../common.hpp"
00047 #include "../vec_traits.hpp"
00048 #include "../functional.hpp"
00049 
00050 namespace cv { namespace gpu { namespace device
00051 {
00052     namespace transform_detail
00053     {
00055 
        // Maps the scalar element types T (input) / D (output) to the CUDA vector
        // types used for wide, `shift`-element loads and stores in the unary
        // transformSmart kernel (e.g. T=uchar, shift=4 -> uchar4).
        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
        {
            typedef typename TypeVec<T, shift>::vec_type read_type;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };
00061 
        // Same idea as UnaryReadWriteTraits, but for two input element types:
        // vector types for the wide loads of both sources and the wide store of
        // the destination in the binary transformSmart kernel.
        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
        {
            typedef typename TypeVec<T1, shift>::vec_type read_type1;
            typedef typename TypeVec<T2, shift>::vec_type read_type2;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };
00068 
00070 
00071         template <int shift> struct OpUnroller;
00072         template <> struct OpUnroller<1>
00073         {
00074             template <typename T, typename D, typename UnOp, typename Mask>
00075             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
00076             {
00077                 if (mask(y, x_shifted))
00078                     dst.x = op(src.x);
00079             }
00080 
00081             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00082             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
00083             {
00084                 if (mask(y, x_shifted))
00085                     dst.x = op(src1.x, src2.x);
00086             }
00087         };
00088         template <> struct OpUnroller<2>
00089         {
00090             template <typename T, typename D, typename UnOp, typename Mask>
00091             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
00092             {
00093                 if (mask(y, x_shifted))
00094                     dst.x = op(src.x);
00095                 if (mask(y, x_shifted + 1))
00096                     dst.y = op(src.y);
00097             }
00098 
00099             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00100             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
00101             {
00102                 if (mask(y, x_shifted))
00103                     dst.x = op(src1.x, src2.x);
00104                 if (mask(y, x_shifted + 1))
00105                     dst.y = op(src1.y, src2.y);
00106             }
00107         };
00108         template <> struct OpUnroller<3>
00109         {
00110             template <typename T, typename D, typename UnOp, typename Mask>
00111             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
00112             {
00113                 if (mask(y, x_shifted))
00114                     dst.x = op(src.x);
00115                 if (mask(y, x_shifted + 1))
00116                     dst.y = op(src.y);
00117                 if (mask(y, x_shifted + 2))
00118                     dst.z = op(src.z);
00119             }
00120 
00121             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00122             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
00123             {
00124                 if (mask(y, x_shifted))
00125                     dst.x = op(src1.x, src2.x);
00126                 if (mask(y, x_shifted + 1))
00127                     dst.y = op(src1.y, src2.y);
00128                 if (mask(y, x_shifted + 2))
00129                     dst.z = op(src1.z, src2.z);
00130             }
00131         };
00132         template <> struct OpUnroller<4>
00133         {
00134             template <typename T, typename D, typename UnOp, typename Mask>
00135             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
00136             {
00137                 if (mask(y, x_shifted))
00138                     dst.x = op(src.x);
00139                 if (mask(y, x_shifted + 1))
00140                     dst.y = op(src.y);
00141                 if (mask(y, x_shifted + 2))
00142                     dst.z = op(src.z);
00143                 if (mask(y, x_shifted + 3))
00144                     dst.w = op(src.w);
00145             }
00146 
00147             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00148             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
00149             {
00150                 if (mask(y, x_shifted))
00151                     dst.x = op(src1.x, src2.x);
00152                 if (mask(y, x_shifted + 1))
00153                     dst.y = op(src1.y, src2.y);
00154                 if (mask(y, x_shifted + 2))
00155                     dst.z = op(src1.z, src2.z);
00156                 if (mask(y, x_shifted + 3))
00157                     dst.w = op(src1.w, src2.w);
00158             }
00159         };
00160         template <> struct OpUnroller<8>
00161         {
00162             template <typename T, typename D, typename UnOp, typename Mask>
00163             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
00164             {
00165                 if (mask(y, x_shifted))
00166                     dst.a0 = op(src.a0);
00167                 if (mask(y, x_shifted + 1))
00168                     dst.a1 = op(src.a1);
00169                 if (mask(y, x_shifted + 2))
00170                     dst.a2 = op(src.a2);
00171                 if (mask(y, x_shifted + 3))
00172                     dst.a3 = op(src.a3);
00173                 if (mask(y, x_shifted + 4))
00174                     dst.a4 = op(src.a4);
00175                 if (mask(y, x_shifted + 5))
00176                     dst.a5 = op(src.a5);
00177                 if (mask(y, x_shifted + 6))
00178                     dst.a6 = op(src.a6);
00179                 if (mask(y, x_shifted + 7))
00180                     dst.a7 = op(src.a7);
00181             }
00182 
00183             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00184             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
00185             {
00186                 if (mask(y, x_shifted))
00187                     dst.a0 = op(src1.a0, src2.a0);
00188                 if (mask(y, x_shifted + 1))
00189                     dst.a1 = op(src1.a1, src2.a1);
00190                 if (mask(y, x_shifted + 2))
00191                     dst.a2 = op(src1.a2, src2.a2);
00192                 if (mask(y, x_shifted + 3))
00193                     dst.a3 = op(src1.a3, src2.a3);
00194                 if (mask(y, x_shifted + 4))
00195                     dst.a4 = op(src1.a4, src2.a4);
00196                 if (mask(y, x_shifted + 5))
00197                     dst.a5 = op(src1.a5, src2.a5);
00198                 if (mask(y, x_shifted + 6))
00199                     dst.a6 = op(src1.a6, src2.a6);
00200                 if (mask(y, x_shifted + 7))
00201                     dst.a7 = op(src1.a7, src2.a7);
00202             }
00203         };
00204 
        // Vectorized unary transform kernel: each thread handles ft::smart_shift
        // consecutive elements of one row, read and written as a single
        // vector-typed load/store when the whole group fits inside the row.
        // Precondition (checked by the dispatcher): src/dst data and step are
        // aligned to smart_shift * sizeof(element).
        template <typename T, typename D, typename UnOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            // First scalar column processed by this thread.
            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    // Whole group is in-bounds: one wide load of the source.
                    const read_type src_n_el = ((const read_type*)src)[x];
                    // Pre-load the destination so lanes rejected by the mask keep
                    // their previous values after the wide store below.
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Row tail: fall back to scalar element-by-element processing.
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }
00240 
00241         template <typename T, typename D, typename UnOp, typename Mask>
00242         __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
00243         {
00244             const int x = blockDim.x * blockIdx.x + threadIdx.x;
00245             const int y = blockDim.y * blockIdx.y + threadIdx.y;
00246 
00247             if (x < src.cols && y < src.rows && mask(y, x))
00248             {
00249                 dst.ptr(y)[x] = op(src.ptr(y)[x]);
00250             }
00251         }
00252 
        // Vectorized binary transform kernel: each thread handles ft::smart_shift
        // consecutive elements of one row from both sources, using one wide load
        // per operand and one wide store when the whole group fits inside the row.
        // Precondition (checked by the dispatcher): all three surfaces' data and
        // step are aligned to smart_shift * sizeof(element).
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
            const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            // First scalar column processed by this thread.
            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    // Whole group is in-bounds: one wide load per source.
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                    // Pre-load the destination so lanes rejected by the mask keep
                    // their previous values after the wide store below.
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Row tail: fall back to scalar element-by-element processing.
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }
00292 
00293         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00294         static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
00295             const Mask mask, const BinOp op)
00296         {
00297             const int x = blockDim.x * blockIdx.x + threadIdx.x;
00298             const int y = blockDim.y * blockIdx.y + threadIdx.y;
00299 
00300             if (x < src1.cols && y < src1.rows && mask(y, x))
00301             {
00302                 const T1 src1_data = src1.ptr(y)[x];
00303                 const T2 src2_data = src2.ptr(y)[x];
00304                 dst.ptr(y)[x] = op(src1_data, src2_data);
00305             }
00306         }
00307 
00308         template <bool UseSmart> struct TransformDispatcher;
00309         template<> struct TransformDispatcher<false>
00310         {
00311             template <typename T, typename D, typename UnOp, typename Mask>
00312             static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
00313             {
00314                 typedef TransformFunctorTraits<UnOp> ft;
00315 
00316                 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
00317                 const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
00318 
00319                 transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
00320                 cudaSafeCall( cudaGetLastError() );
00321 
00322                 if (stream == 0)
00323                     cudaSafeCall( cudaDeviceSynchronize() );
00324             }
00325 
00326             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00327             static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
00328             {
00329                 typedef TransformFunctorTraits<BinOp> ft;
00330 
00331                 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
00332                 const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
00333 
00334                 transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
00335                 cudaSafeCall( cudaGetLastError() );
00336 
00337                 if (stream == 0)
00338                     cudaSafeCall( cudaDeviceSynchronize() );
00339             }
00340         };
00341         template<> struct TransformDispatcher<true>
00342         {
00343             template <typename T, typename D, typename UnOp, typename Mask>
00344             static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
00345             {
00346                 typedef TransformFunctorTraits<UnOp> ft;
00347 
00348                 StaticAssert<ft::smart_shift != 1>::check();
00349 
00350                 if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
00351                     !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
00352                 {
00353                     TransformDispatcher<false>::call(src, dst, op, mask, stream);
00354                     return;
00355                 }
00356 
00357                 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
00358                 const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
00359 
00360                 transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
00361                 cudaSafeCall( cudaGetLastError() );
00362 
00363                 if (stream == 0)
00364                     cudaSafeCall( cudaDeviceSynchronize() );
00365             }
00366 
00367             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
00368             static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
00369             {
00370                 typedef TransformFunctorTraits<BinOp> ft;
00371 
00372                 StaticAssert<ft::smart_shift != 1>::check();
00373 
00374                 if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
00375                     !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
00376                     !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
00377                 {
00378                     TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
00379                     return;
00380                 }
00381 
00382                 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
00383                 const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
00384 
00385                 transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
00386                 cudaSafeCall( cudaGetLastError() );
00387 
00388                 if (stream == 0)
00389                     cudaSafeCall( cudaDeviceSynchronize() );
00390             }
00391         };
00392     } // namespace transform_detail
00393 }}} // namespace cv { namespace gpu { namespace device
00394 
00395 #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__