include/opencv2/gpu/device/detail/reduction_detail.hpp
Go to the documentation of this file.
00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Third party copyrights are property of their respective owners.
00016 //
00017 // Redistribution and use in source and binary forms, with or without modification,
00018 // are permitted provided that the following conditions are met:
00019 //
00020 //   * Redistribution's of source code must retain the above copyright notice,
00021 //     this list of conditions and the following disclaimer.
00022 //
00023 //   * Redistribution's in binary form must reproduce the above copyright notice,
00024 //     this list of conditions and the following disclaimer in the documentation
00025 //     and/or other materials provided with the distribution.
00026 //
00027 //   * The name of the copyright holders may not be used to endorse or promote products
00028 //     derived from this software without specific prior written permission.
00029 //
00030 // This software is provided by the copyright holders and contributors "as is" and
00031 // any express or implied warranties, including, but not limited to, the implied
00032 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00033 // In no event shall the Intel Corporation or contributors be liable for any direct,
00034 // indirect, incidental, special, exemplary, or consequential damages
00035 // (including, but not limited to, procurement of substitute goods or services;
00036 // loss of use, data, or profits; or business interruption) however caused
00037 // and on any theory of liability, whether in contract, strict liability,
00038 // or tort (including negligence or otherwise) arising in any way out of
00039 // the use of this software, even if advised of the possibility of such damage.
00040 //
00041 //M*/
00042 
00043 #ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
00044 #define __OPENCV_GPU_REDUCTION_DETAIL_HPP__
00045 
00046 namespace cv { namespace gpu { namespace device
00047 {
00048     namespace utility_detail
00049     {
00051         // Reductor
00052 
00053         template <int n> struct WarpReductor
00054         {
                  // Generic reductor for a compile-time element count n.
                  // Threads with tid < n publish partial_reduction in data[tid]; the entries are
                  // then folded together with op at halving offsets. Every folding thread keeps
                  // its running result in both partial_reduction and data[tid].
                  // Only the n > 32 case issues __syncthreads(); the folding steps themselves are
                  // not separated by barriers.
                  // NOTE(review): this presumably relies on the participating threads executing
                  // in lockstep -- confirm for the target GPU architecture.
                  // For n <= 2 none of the folding branches below is entered, so only the store
                  // into data[tid] happens.
00055             template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00056             {
00057                 if (tid < n)
00058                     data[tid] = partial_reduction;
00059                 if (n > 32) __syncthreads();
00060 
00061                 if (n > 32)
00062                 {
                          // First fold the n - 32 entries above index 31 into the low entries.
00063                     if (tid < n - 32)
00064                         data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
00065                     if (tid < 16)
00066                     {
00067                         data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
00068                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
00069                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
00070                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
00071                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00072                     }
00073                 }
00074                 else if (n > 16)
00075                 {
00076                     if (tid < n - 16)
00077                         data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
00078                     if (tid < 8)
00079                     {
00080                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
00081                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
00082                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
00083                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00084                     }
00085                 }
00086                 else if (n > 8)
00087                 {
00088                     if (tid < n - 8)
00089                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
00090                     if (tid < 4)
00091                     {
00092                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
00093                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
00094                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00095                     }
00096                 }
00097                 else if (n > 4)
00098                 {
00099                     if (tid < n - 4)
00100                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
00101                     if (tid < 2)
00102                     {
00103                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
00104                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00105                     }
00106                 }
00107                 else if (n > 2)
00108                 {
00109                     if (tid < n - 2)
00110                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
                          // NOTE(review): the guard here is tid < 2 (not tid < 1), so thread 1
                          // also folds data[2] into data[1] -- confirm this is intended.
00111                     if (tid < 2)
00112                     {
00113                         data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00114                     }
00115                 }
00116             }
00117         };
00118         template <> struct WarpReductor<64>
00119         {
                  // Specialization for n == 64. Every calling thread writes data[tid] (there is
                  // no tid < n guard here), then a __syncthreads() barrier follows. After it,
                  // threads with tid < 32 fold in the entries at offsets 32, 16, 8, 4, 2 and 1,
                  // keeping the running result in partial_reduction and data[tid].
                  // No barrier separates the individual folding steps.
00120             template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00121             {
00122                 data[tid] = partial_reduction;
00123                 __syncthreads();
00124 
00125                 if (tid < 32)
00126                 {
00127                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
00128                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
00129                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
00130                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
00131                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
00132                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
00133                 }
00134             }
00135         };
00136         template <> struct WarpReductor<32>
00137         {
                  // Specialization for n == 32. Every calling thread writes data[tid]; threads
                  // with tid < 16 then fold in the entries at offsets 16, 8, 4, 2 and 1.
                  // No __syncthreads() is issued anywhere in this variant.
                  // NOTE(review): presumably all 32 participants belong to one warp -- confirm.
00138             template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00139             {
00140                 data[tid] = partial_reduction;
00141 
00142                 if (tid < 16)
00143                 {
00144                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
00145                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
00146                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
00147                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
00148                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
00149                 }
00150             }
00151         };
00152         template <> struct WarpReductor<16>
00153         {
                  // Specialization for n == 16. Every calling thread writes data[tid]; threads
                  // with tid < 8 then fold in the entries at offsets 8, 4, 2 and 1.
                  // No __syncthreads() is issued anywhere in this variant.
00154             template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00155             {
00156                 data[tid] = partial_reduction;
00157 
00158                 if (tid < 8)
00159                 {
00160                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
00161                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
00162                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
00163                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
00164                 }
00165             }
00166         };
00167         template <> struct WarpReductor<8>
00168         {
                  // Specialization for n == 8. Every calling thread writes data[tid]; threads
                  // with tid < 4 then fold in the entries at offsets 4, 2 and 1.
                  // No __syncthreads() is issued anywhere in this variant.
00169             template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00170             {
00171                 data[tid] = partial_reduction;
00172 
00173                 if (tid < 4)
00174                 {
00175                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
00176                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
00177                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
00178                 }
00179             }
00180         };
00181 
00182         template <bool warp> struct ReductionDispatcher;
              // The primary template is only declared; behaviour comes from the two bool
              // specializations. This one (warp == true) forwards unchanged to WarpReductor<n>.
              // NOTE(review): the caller presumably picks warp == true when n is small enough
              // for the WarpReductor path -- confirm against the call site.
00183         template <> struct ReductionDispatcher<true>
00184         {
00185             template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00186             {
00187                 WarpReductor<n>::reduce(data, partial_reduction, tid, op);
00188             }
00189         };
00190         template <> struct ReductionDispatcher<false>
00191         {
                  // Reduction with explicit barriers for the larger sizes. Threads with tid < n
                  // store partial_reduction into data[tid]; after a __syncthreads() the upper half
                  // is folded into the lower half at offsets 256, 128 and 64, each stage followed
                  // by a barrier that all threads reach (it sits outside the tid test).
                  // The final stage (tid < 32) folds offsets 32..1 without barriers.
                  // Note that the 256-offset stage runs only for n == 512 exactly, whereas the
                  // later stages use n >= 256 / n >= 128.
                  // NOTE(review): the final stage always reads data[tid + 32], so n is presumably
                  // at least 64 here -- confirm against the call site.
00192             template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
00193             {
00194                 if (tid < n)
00195                     data[tid] = partial_reduction;
00196                 __syncthreads();
00197 
00198 
00199                 if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
00200                 if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
00201                 if (n >= 128) { if (tid <  64) { data[tid] = partial_reduction = op(partial_reduction, data[tid +  64]); } __syncthreads(); }
00202 
00203                 if (tid < 32)
00204                 {
00205                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
00206                     data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
00207                     data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
00208                     data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
00209                     data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
00210                     data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
00211                 }
00212             }
00213         };
00214 
00216         // PredValWarpReductor
00217 
00218         template <int n> struct PredValWarpReductor;
              // The primary template is only declared; each supported size is an explicit
              // specialization. This one handles n == 64.
00219         template <> struct PredValWarpReductor<64>
00220         {
                  // Key/value reduction driven by a predicate. Threads with tid < 32 load their
                  // own key (sdata[tid]) and value (sval[tid]), then visit the entries at offsets
                  // 32, 16, 8, 4, 2 and 1. Whenever pred(candidate_key, myData) is true, the
                  // candidate key and its value replace the current ones, both in the thread's
                  // locals (myData, myVal) and in sdata[tid] / sval[tid].
                  // sdata and sval are read first, so they must already hold every thread's
                  // entry when this is called. No barrier is issued between the steps.
                  // sdata is accessed through a volatile pointer; sval is not.
00221             template <typename T, typename V, typename Pred>
00222             static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00223             {
00224                 if (tid < 32)
00225                 {
00226                     myData = sdata[tid];
00227                     myVal = sval[tid];
00228 
                          // reg holds the candidate key so the volatile entry is read only once.
00229                     T reg = sdata[tid + 32];
00230                     if (pred(reg, myData))
00231                     {
00232                         sdata[tid] = myData = reg;
00233                         sval[tid] = myVal = sval[tid + 32];
00234                     }
00235 
00236                     reg = sdata[tid + 16];
00237                     if (pred(reg, myData))
00238                     {
00239                         sdata[tid] = myData = reg;
00240                         sval[tid] = myVal = sval[tid + 16];
00241                     }
00242 
00243                     reg = sdata[tid + 8];
00244                     if (pred(reg, myData))
00245                     {
00246                         sdata[tid] = myData = reg;
00247                         sval[tid] = myVal = sval[tid + 8];
00248                     }
00249 
00250                     reg = sdata[tid + 4];
00251                     if (pred(reg, myData))
00252                     {
00253                         sdata[tid] = myData = reg;
00254                         sval[tid] = myVal = sval[tid + 4];
00255                     }
00256 
00257                     reg = sdata[tid + 2];
00258                     if (pred(reg, myData))
00259                     {
00260                         sdata[tid] = myData = reg;
00261                         sval[tid] = myVal = sval[tid + 2];
00262                     }
00263 
00264                     reg = sdata[tid + 1];
00265                     if (pred(reg, myData))
00266                     {
00267                         sdata[tid] = myData = reg;
00268                         sval[tid] = myVal = sval[tid + 1];
00269                     }
00270                 }
00271             }
00272         };
00273         template <> struct PredValWarpReductor<32>
00274         {
                  // Key/value reduction for n == 32. Threads with tid < 16 load sdata[tid] and
                  // sval[tid], then visit the entries at offsets 16, 8, 4, 2 and 1. Whenever
                  // pred(candidate_key, myData) is true, the candidate key and its value replace
                  // the current ones in myData / myVal and in sdata[tid] / sval[tid].
                  // sdata and sval must already be populated; no barrier is issued here.
00275             template <typename T, typename V, typename Pred>
00276             static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00277             {
00278                 if (tid < 16)
00279                 {
00280                     myData = sdata[tid];
00281                     myVal = sval[tid];
00282 
                          // reg holds the candidate key so the volatile entry is read only once.
00283                     T reg = sdata[tid + 16];
00284                     if (pred(reg, myData))
00285                     {
00286                         sdata[tid] = myData = reg;
00287                         sval[tid] = myVal = sval[tid + 16];
00288                     }
00289 
00290                     reg = sdata[tid + 8];
00291                     if (pred(reg, myData))
00292                     {
00293                         sdata[tid] = myData = reg;
00294                         sval[tid] = myVal = sval[tid + 8];
00295                     }
00296 
00297                     reg = sdata[tid + 4];
00298                     if (pred(reg, myData))
00299                     {
00300                         sdata[tid] = myData = reg;
00301                         sval[tid] = myVal = sval[tid + 4];
00302                     }
00303 
00304                     reg = sdata[tid + 2];
00305                     if (pred(reg, myData))
00306                     {
00307                         sdata[tid] = myData = reg;
00308                         sval[tid] = myVal = sval[tid + 2];
00309                     }
00310 
00311                     reg = sdata[tid + 1];
00312                     if (pred(reg, myData))
00313                     {
00314                         sdata[tid] = myData = reg;
00315                         sval[tid] = myVal = sval[tid + 1];
00316                     }
00317                 }
00318             }
00319         };
00320 
00321         template <> struct PredValWarpReductor<16>
00322         {
                  // Key/value reduction for n == 16. Threads with tid < 8 load sdata[tid] and
                  // sval[tid], then visit the entries at offsets 8, 4, 2 and 1. Whenever
                  // pred(candidate_key, myData) is true, the candidate key and its value replace
                  // the current ones in myData / myVal and in sdata[tid] / sval[tid].
                  // sdata and sval must already be populated; no barrier is issued here.
00323             template <typename T, typename V, typename Pred>
00324             static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00325             {
00326                 if (tid < 8)
00327                 {
00328                     myData = sdata[tid];
00329                     myVal = sval[tid];
00330 
                          // reg holds the candidate key so the volatile entry is read only once.
                          // It is initialized directly; the object must not be named inside its
                          // own initializer.
00331                     T reg = sdata[tid + 8];
00332                     if (pred(reg, myData))
00333                     {
00334                         sdata[tid] = myData = reg;
00335                         sval[tid] = myVal = sval[tid + 8];
00336                     }
00337 
00338                     reg = sdata[tid + 4];
00339                     if (pred(reg, myData))
00340                     {
00341                         sdata[tid] = myData = reg;
00342                         sval[tid] = myVal = sval[tid + 4];
00343                     }
00344 
00345                     reg = sdata[tid + 2];
00346                     if (pred(reg, myData))
00347                     {
00348                         sdata[tid] = myData = reg;
00349                         sval[tid] = myVal = sval[tid + 2];
00350                     }
00351 
00352                     reg = sdata[tid + 1];
00353                     if (pred(reg, myData))
00354                     {
00355                         sdata[tid] = myData = reg;
00356                         sval[tid] = myVal = sval[tid + 1];
00357                     }
00358                 }
00359             }
00360         };
00361         template <> struct PredValWarpReductor<8>
00362         {
                  // Key/value reduction for n == 8. Threads with tid < 4 load sdata[tid] and
                  // sval[tid], then visit the entries at offsets 4, 2 and 1. Whenever
                  // pred(candidate_key, myData) is true, the candidate key and its value replace
                  // the current ones in myData / myVal and in sdata[tid] / sval[tid].
                  // sdata and sval must already be populated; no barrier is issued here.
00363             template <typename T, typename V, typename Pred>
00364             static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00365             {
00366                 if (tid < 4)
00367                 {
00368                     myData = sdata[tid];
00369                     myVal = sval[tid];
00370 
                          // reg holds the candidate key so the volatile entry is read only once.
                          // It is initialized directly; the object must not be named inside its
                          // own initializer.
00371                     T reg = sdata[tid + 4];
00372                     if (pred(reg, myData))
00373                     {
00374                         sdata[tid] = myData = reg;
00375                         sval[tid] = myVal = sval[tid + 4];
00376                     }
00377 
00378                     reg = sdata[tid + 2];
00379                     if (pred(reg, myData))
00380                     {
00381                         sdata[tid] = myData = reg;
00382                         sval[tid] = myVal = sval[tid + 2];
00383                     }
00384 
00385                     reg = sdata[tid + 1];
00386                     if (pred(reg, myData))
00387                     {
00388                         sdata[tid] = myData = reg;
00389                         sval[tid] = myVal = sval[tid + 1];
00390                     }
00391                 }
00392             }
00393         };
00394 
00395         template <bool warp> struct PredValReductionDispatcher;
              // The primary template is only declared; behaviour comes from the two bool
              // specializations. This one (warp == true) forwards every argument unchanged to
              // PredValWarpReductor<n>, which is defined only for specific values of n.
00396         template <> struct PredValReductionDispatcher<true>
00397         {
00398             template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00399             {
00400                 PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
00401             }
00402         };
00403         template <> struct PredValReductionDispatcher<false>
00404         {
                  // Key/value reduction with explicit barriers for the larger sizes.
                  // Every thread loads its own key (sdata[tid]) and value (sval[tid]); sdata and
                  // sval must therefore already be populated when this is called.
                  // Stages at offsets 256, 128 and 64 are enabled by the compile-time n; in each
                  // one the lower half of the threads compares the candidate key at
                  // tid + offset with pred and, if pred is true, adopts the candidate key and
                  // value in both its locals and sdata[tid] / sval[tid].
                  // The final stage (tid < 32) visits offsets 32..1 without barriers.
00405             template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
00406             {
00407                 myData = sdata[tid];
00408                 myVal = sval[tid];
00409 
                      // Each __syncthreads() below is kept outside the tid test on purpose: the
                      // barrier must be reached by every thread of the block, and n is a
                      // compile-time constant, so the outer condition is uniform across threads.
00410                 if (n >= 512)
00411                 {
                          if (tid < 256)
                          {
00412                         T reg = sdata[tid + 256];
00413 
00414                         if (pred(reg, myData))
00415                         {
00416                             sdata[tid] = myData = reg;
00417                             sval[tid] = myVal = sval[tid + 256];
00418                         }
                          }
00419                     __syncthreads();
00420                 }
00421                 if (n >= 256)
00422                 {
                          if (tid < 128)
                          {
00423                         T reg = sdata[tid + 128];
00424 
00425                         if (pred(reg, myData))
00426                         {
00427                             sdata[tid] = myData = reg;
00428                             sval[tid] = myVal = sval[tid + 128];
00429                         }
                          }
00430                     __syncthreads();
00431                 }
00432                 if (n >= 128)
00433                 {
                          if (tid < 64)
                          {
00434                         T reg = sdata[tid + 64];
00435 
00436                         if (pred(reg, myData))
00437                         {
00438                             sdata[tid] = myData = reg;
00439                             sval[tid] = myVal = sval[tid + 64];
00440                         }
                          }
00441                     __syncthreads();
00442                 }
00443 
                      // Final stage: no barriers between the steps from here on.
00444                 if (tid < 32)
00445                 {
00446                     if (n >= 64)
00447                     {
00448                         T reg = sdata[tid + 32];
00449 
00450                         if (pred(reg, myData))
00451                         {
00452                             sdata[tid] = myData = reg;
00453                             sval[tid] = myVal = sval[tid + 32];
00454                         }
00455                     }
00456                     if (n >= 32)
00457                     {
00458                         T reg = sdata[tid + 16];
00459 
00460                         if (pred(reg, myData))
00461                         {
00462                             sdata[tid] = myData = reg;
00463                             sval[tid] = myVal = sval[tid + 16];
00464                         }
00465                     }
00466                     if (n >= 16)
00467                     {
00468                         T reg = sdata[tid + 8];
00469 
00470                         if (pred(reg, myData))
00471                         {
00472                             sdata[tid] = myData = reg;
00473                             sval[tid] = myVal = sval[tid + 8];
00474                         }
00475                     }
00476                     if (n >= 8)
00477                     {
00478                         T reg = sdata[tid + 4];
00479 
00480                         if (pred(reg, myData))
00481                         {
00482                             sdata[tid] = myData = reg;
00483                             sval[tid] = myVal = sval[tid + 4];
00484                         }
00485                     }
00486                     if (n >= 4)
00487                     {
00488                         T reg = sdata[tid + 2];
00489 
00490                         if (pred(reg, myData))
00491                         {
00492                             sdata[tid] = myData = reg;
00493                             sval[tid] = myVal = sval[tid + 2];
00494                         }
00495                     }
00496                     if (n >= 2)
00497                     {
00498                         T reg = sdata[tid + 1];
00499 
00500                         if (pred(reg, myData))
00501                         {
00502                             sdata[tid] = myData = reg;
00503                             sval[tid] = myVal = sval[tid + 1];
00504                         }
00505                     }
00506                 }
00507             }
00508         };
00509 
00511         // PredVal2WarpReductor
00512 
00513         template <int n> struct PredVal2WarpReductor;
              // The primary template is only declared; each supported size is an explicit
              // specialization. This one handles n == 64.
00514         template <> struct PredVal2WarpReductor<64>
00515         {
                  // Predicate-driven reduction of a key with two associated values. Threads with
                  // tid < 32 load sdata[tid], sval1[tid] and sval2[tid], then visit the entries at
                  // offsets 32, 16, 8, 4, 2 and 1. Whenever pred(candidate_key, myData) is true,
                  // the candidate key and both of its values replace the current ones, in the
                  // thread's locals and in sdata[tid] / sval1[tid] / sval2[tid].
                  // The three arrays must already be populated; no barrier is issued here.
                  // sdata is accessed through a volatile pointer; sval1 and sval2 are not.
00516             template <typename T, typename V1, typename V2, typename Pred>
00517             static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
00518             {
00519                 if (tid < 32)
00520                 {
00521                     myData = sdata[tid];
00522                     myVal1 = sval1[tid];
00523                     myVal2 = sval2[tid];
00524 
                          // reg holds the candidate key so the volatile entry is read only once.
00525                     T reg = sdata[tid + 32];
00526                     if (pred(reg, myData))
00527                     {
00528                         sdata[tid] = myData = reg;
00529                         sval1[tid] = myVal1 = sval1[tid + 32];
00530                         sval2[tid] = myVal2 = sval2[tid + 32];
00531                     }
00532 
00533                     reg = sdata[tid + 16];
00534                     if (pred(reg, myData))
00535                     {
00536                         sdata[tid] = myData = reg;
00537                         sval1[tid] = myVal1 = sval1[tid + 16];
00538                         sval2[tid] = myVal2 = sval2[tid + 16];
00539                     }
00540 
00541                     reg = sdata[tid + 8];
00542                     if (pred(reg, myData))
00543                     {
00544                         sdata[tid] = myData = reg;
00545                         sval1[tid] = myVal1 = sval1[tid + 8];
00546                         sval2[tid] = myVal2 = sval2[tid + 8];
00547                     }
00548 
00549                     reg = sdata[tid + 4];
00550                     if (pred(reg, myData))
00551                     {
00552                         sdata[tid] = myData = reg;
00553                         sval1[tid] = myVal1 = sval1[tid + 4];
00554                         sval2[tid] = myVal2 = sval2[tid + 4];
00555                     }
00556 
00557                     reg = sdata[tid + 2];
00558                     if (pred(reg, myData))
00559                     {
00560                         sdata[tid] = myData = reg;
00561                         sval1[tid] = myVal1 = sval1[tid + 2];
00562                         sval2[tid] = myVal2 = sval2[tid + 2];
00563                     }
00564 
00565                     reg = sdata[tid + 1];
00566                     if (pred(reg, myData))
00567                     {
00568                         sdata[tid] = myData = reg;
00569                         sval1[tid] = myVal1 = sval1[tid + 1];
00570                         sval2[tid] = myVal2 = sval2[tid + 1];
00571                     }
00572                 }
00573             }
00574         };
00575         template <> struct PredVal2WarpReductor<32>
00576         {
                  // Two-value predicate reduction for n == 32. Threads with tid < 16 load
                  // sdata[tid], sval1[tid] and sval2[tid], then visit the entries at offsets
                  // 16, 8, 4, 2 and 1. Whenever pred(candidate_key, myData) is true, the
                  // candidate key and both of its values replace the current ones, in the
                  // thread's locals and in the arrays at index tid.
                  // The three arrays must already be populated; no barrier is issued here.
00577             template <typename T, typename V1, typename V2, typename Pred>
00578             static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
00579             {
00580                 if (tid < 16)
00581                 {
00582                     myData = sdata[tid];
00583                     myVal1 = sval1[tid];
00584                     myVal2 = sval2[tid];
00585 
                          // reg holds the candidate key so the volatile entry is read only once.
00586                     T reg = sdata[tid + 16];
00587                     if (pred(reg, myData))
00588                     {
00589                         sdata[tid] = myData = reg;
00590                         sval1[tid] = myVal1 = sval1[tid + 16];
00591                         sval2[tid] = myVal2 = sval2[tid + 16];
00592                     }
00593 
00594                     reg = sdata[tid + 8];
00595                     if (pred(reg, myData))
00596                     {
00597                         sdata[tid] = myData = reg;
00598                         sval1[tid] = myVal1 = sval1[tid + 8];
00599                         sval2[tid] = myVal2 = sval2[tid + 8];
00600                     }
00601 
00602                     reg = sdata[tid + 4];
00603                     if (pred(reg, myData))
00604                     {
00605                         sdata[tid] = myData = reg;
00606                         sval1[tid] = myVal1 = sval1[tid + 4];
00607                         sval2[tid] = myVal2 = sval2[tid + 4];
00608                     }
00609 
00610                     reg = sdata[tid + 2];
00611                     if (pred(reg, myData))
00612                     {
00613                         sdata[tid] = myData = reg;
00614                         sval1[tid] = myVal1 = sval1[tid + 2];
00615                         sval2[tid] = myVal2 = sval2[tid + 2];
00616                     }
00617 
00618                     reg = sdata[tid + 1];
00619                     if (pred(reg, myData))
00620                     {
00621                         sdata[tid] = myData = reg;
00622                         sval1[tid] = myVal1 = sval1[tid + 1];
00623                         sval2[tid] = myVal2 = sval2[tid + 1];
00624                     }
00625                 }
00626             }
00627         };
00628 
00629         template <> struct PredVal2WarpReductor<16>
00630         {
                  // Two-value predicate reduction for n == 16. Threads with tid < 8 load
                  // sdata[tid], sval1[tid] and sval2[tid], then visit the entries at offsets
                  // 8, 4, 2 and 1. Whenever pred(candidate_key, myData) is true, the candidate
                  // key and both of its values replace the current ones, in the thread's locals
                  // and in the arrays at index tid.
                  // The three arrays must already be populated; no barrier is issued here.
00631             template <typename T, typename V1, typename V2, typename Pred>
00632             static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
00633             {
00634                 if (tid < 8)
00635                 {
00636                     myData = sdata[tid];
00637                     myVal1 = sval1[tid];
00638                     myVal2 = sval2[tid];
00639 
                          // reg holds the candidate key so the volatile entry is read only once.
                          // It is initialized directly; the object must not be named inside its
                          // own initializer.
00640                     T reg = sdata[tid + 8];
00641                     if (pred(reg, myData))
00642                     {
00643                         sdata[tid] = myData = reg;
00644                         sval1[tid] = myVal1 = sval1[tid + 8];
00645                         sval2[tid] = myVal2 = sval2[tid + 8];
00646                     }
00647 
00648                     reg = sdata[tid + 4];
00649                     if (pred(reg, myData))
00650                     {
00651                         sdata[tid] = myData = reg;
00652                         sval1[tid] = myVal1 = sval1[tid + 4];
00653                         sval2[tid] = myVal2 = sval2[tid + 4];
00654                     }
00655 
00656                     reg = sdata[tid + 2];
00657                     if (pred(reg, myData))
00658                     {
00659                         sdata[tid] = myData = reg;
00660                         sval1[tid] = myVal1 = sval1[tid + 2];
00661                         sval2[tid] = myVal2 = sval2[tid + 2];
00662                     }
00663 
00664                     reg = sdata[tid + 1];
00665                     if (pred(reg, myData))
00666                     {
00667                         sdata[tid] = myData = reg;
00668                         sval1[tid] = myVal1 = sval1[tid + 1];
00669                         sval2[tid] = myVal2 = sval2[tid + 1];
00670                     }
00671                 }
00672             }
00673         };
00674         template <> struct PredVal2WarpReductor<8>
00675         {
                  // Two-value predicate reduction for n == 8. Threads with tid < 4 load
                  // sdata[tid], sval1[tid] and sval2[tid], then visit the entries at offsets
                  // 4, 2 and 1. Whenever pred(candidate_key, myData) is true, the candidate key
                  // and both of its values replace the current ones, in the thread's locals and
                  // in the arrays at index tid.
                  // The three arrays must already be populated; no barrier is issued here.
00676             template <typename T, typename V1, typename V2, typename Pred>
00677             static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
00678             {
00679                 if (tid < 4)
00680                 {
00681                     myData = sdata[tid];
00682                     myVal1 = sval1[tid];
00683                     myVal2 = sval2[tid];
00684 
                          // reg holds the candidate key so the volatile entry is read only once.
                          // It is initialized directly; the object must not be named inside its
                          // own initializer.
00685                     T reg = sdata[tid + 4];
00686                     if (pred(reg, myData))
00687                     {
00688                         sdata[tid] = myData = reg;
00689                         sval1[tid] = myVal1 = sval1[tid + 4];
00690                         sval2[tid] = myVal2 = sval2[tid + 4];
00691                     }
00692 
00693                     reg = sdata[tid + 2];
00694                     if (pred(reg, myData))
00695                     {
00696                         sdata[tid] = myData = reg;
00697                         sval1[tid] = myVal1 = sval1[tid + 2];
00698                         sval2[tid] = myVal2 = sval2[tid + 2];
00699                     }
00700 
00701                     reg = sdata[tid + 1];
00702                     if (pred(reg, myData))
00703                     {
00704                         sdata[tid] = myData = reg;
00705                         sval1[tid] = myVal1 = sval1[tid + 1];
00706                         sval2[tid] = myVal2 = sval2[tid + 1];
00707                     }
00708                 }
00709             }
00710         };
00711 
// Selects a reduction strategy from a compile-time boolean; the template is
// only declared here and gets its behavior from its specializations.
template <bool warp> struct PredVal2ReductionDispatcher;

// warp == true: hands every argument, unchanged, to PredVal2WarpReductor<n>.
template <> struct PredVal2ReductionDispatcher<true>
{
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        typedef PredVal2WarpReductor<n> Reductor;

        Reductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }
};
// warp == false: reduction over n entries (n is a compile-time constant).
//
// Every thread first loads its own entry (sdata[tid], sval1[tid], sval2[tid])
// into myData / myVal1 / myVal2. The entries are then merged with offsets
// 256, 128 and 64, each stage followed by __syncthreads(), and finally with
// offsets 32, 16, 8, 4, 2 and 1 by the threads with tid < 32. A stage is
// compiled in only when n is at least twice its offset.
//
//   myData, myVal1, myVal2 : in/out; the calling thread's current key and
//                            its two associated values.
//   sdata, sval1, sval2    : key array and the two value arrays, indexed by tid.
//   tid                    : index of the calling thread into those arrays.
//   pred                   : binary predicate, called as pred(candidate, current);
//                            true makes the thread adopt the candidate.
//
// Because of the __syncthreads() calls, reduce() has to be reached by every
// thread of the block whenever n >= 128.
template <> struct PredVal2ReductionDispatcher<false>
{
    // One merge step: read the key at tid + delta and, if pred prefers it over
    // the current key, adopt it and its two values and store them at slot tid.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void mergeStep(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, int delta, const Pred& pred)
    {
        T reg = sdata[tid + delta];

        if (pred(reg, myData))
        {
            sdata[tid] = myData = reg;
            sval1[tid] = myVal1 = sval1[tid + delta];
            sval2[tid] = myVal2 = sval2[tid + delta];
        }
    }

    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        // The barriers below sit outside the tid tests on purpose: a
        // __syncthreads() reached by only part of the block is undefined.
        // The n tests are compile-time constants, identical for all threads.
        if (n >= 512)
        {
            if (tid < 256)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 256, pred);
            __syncthreads();
        }
        if (n >= 256)
        {
            if (tid < 128)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 128, pred);
            __syncthreads();
        }
        if (n >= 128)
        {
            if (tid < 64)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 64, pred);
            __syncthreads();
        }

        // NOTE(review): the remaining steps run without any barrier between
        // them; this presumably relies on threads 0..31 executing in
        // lockstep - confirm for the target architecture.
        if (tid < 32)
        {
            if (n >= 64)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 32, pred);
            if (n >= 32)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 16, pred);
            if (n >= 16)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 8, pred);
            if (n >= 8)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 4, pred);
            if (n >= 4)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 2, pred);
            if (n >= 2)
                mergeStep(myData, myVal1, myVal2, sdata, sval1, sval2, tid, 1, pred);
        }
    }
};
00838     } // namespace utility_detail
00839 }}} // namespace cv { namespace gpu { namespace device
00840 
00841 #endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__