00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
00044 #define __OPENCV_GPU_REDUCTION_DETAIL_HPP__
00045
00046 namespace cv { namespace gpu { namespace device
00047 {
00048 namespace utility_detail
00049 {
00051
00052
// Generic in-place reduction used for sizes n that have no dedicated
// WarpReductor specialization.
template <int n> struct WarpReductor
{
    // Publishes each thread's value to data[tid] (only for tid < n) and then folds
    // the buffer with `op`, halving the stride on every step.
    //
    //   data              - scratch buffer; indices up to n - 1 are read
    //                       (presumably shared memory - confirm against callers)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // For n <= 2 the value is only stored; no folding step is performed.
    // Apart from the single __syncthreads() issued when n > 32, no barrier separates
    // the steps, so the result depends on the participating threads running in
    // lockstep. NOTE(review): independent thread scheduling (Volta and newer) does
    // not guarantee that - confirm the targeted architectures.
    template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        // Stride of the first (partial) fold: the largest of 32/16/8/4/2 that is
        // strictly below n, or 0 when n <= 2.
        const int head = n > 32 ? 32 : n > 16 ? 16 : n > 8 ? 8 : n > 4 ? 4 : n > 2 ? 2 : 0;
        // Number of threads taking part in the remaining folds. This is head / 2,
        // except that it never drops below 2 (so tid 1 still runs the last step
        // when n is 3 or 4).
        const int tail_lanes = head > 4 ? head / 2 : 2;

        if (tid < n)
            data[tid] = partial_reduction;
        if (n > 32)
            __syncthreads();

        if (head == 0)
            return;

        // First fold only covers the entries that actually exist beyond `head`.
        if (tid < n - head)
        {
            partial_reduction = op(partial_reduction, data[tid + head]);
            data[tid] = partial_reduction;
        }

        if (tid < tail_lanes)
        {
            #pragma unroll
            for (int offset = head / 2; offset > 0; offset >>= 1)
            {
                partial_reduction = op(partial_reduction, data[tid + offset]);
                data[tid] = partial_reduction;
            }
        }
    }
};
// Reduction over exactly 64 entries of `data`.
template <> struct WarpReductor<64>
{
    // Every calling thread stores its value to data[tid] and then reaches a
    // block-wide barrier. Threads with tid < 32 afterwards fold data[tid + 32],
    // data[tid + 16], ..., data[tid + 1] into their value with `op`, writing the
    // running result back to data[tid] after each step.
    //
    //   data              - scratch buffer, indices 0..63 are accessed
    //                       (presumably shared memory - confirm against callers)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // No barrier separates the folding steps, so the result depends on those 32
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer) - confirm the targeted architectures.
    template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;
        __syncthreads();

        if (tid >= 32)
            return;

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1)
        {
            partial_reduction = op(partial_reduction, data[tid + offset]);
            data[tid] = partial_reduction;
        }
    }
};
// Reduction over exactly 32 entries of `data`.
template <> struct WarpReductor<32>
{
    // Every calling thread stores its value to data[tid]. Threads with tid < 16
    // then fold data[tid + 16], data[tid + 8], ..., data[tid + 1] into their value
    // with `op`, writing the running result back to data[tid] after each step.
    //
    //   data              - scratch buffer, indices 0..31 are accessed
    //                       (presumably shared memory - confirm against callers)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // The function contains no barrier at all, so the result depends on the
    // participating threads running in lockstep. NOTE(review): not guaranteed
    // under independent thread scheduling (Volta and newer) - confirm.
    template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

        if (tid >= 16)
            return;

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            partial_reduction = op(partial_reduction, data[tid + offset]);
            data[tid] = partial_reduction;
        }
    }
};
// Reduction over exactly 16 entries of `data`.
template <> struct WarpReductor<16>
{
    // Every calling thread stores its value to data[tid]. Threads with tid < 8
    // then fold data[tid + 8], data[tid + 4], data[tid + 2] and data[tid + 1] into
    // their value with `op`, writing the running result back to data[tid] after
    // each step.
    //
    //   data              - scratch buffer, indices 0..15 are accessed
    //                       (presumably shared memory - confirm against callers)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // The function contains no barrier, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer) - confirm.
    template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

        if (tid >= 8)
            return;

        #pragma unroll
        for (int offset = 8; offset > 0; offset >>= 1)
        {
            partial_reduction = op(partial_reduction, data[tid + offset]);
            data[tid] = partial_reduction;
        }
    }
};
// Reduction over exactly 8 entries of `data`.
template <> struct WarpReductor<8>
{
    // Every calling thread stores its value to data[tid]. Threads with tid < 4
    // then fold data[tid + 4], data[tid + 2] and data[tid + 1] into their value
    // with `op`, writing the running result back to data[tid] after each step.
    //
    //   data              - scratch buffer, indices 0..7 are accessed
    //                       (presumably shared memory - confirm against callers)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // The function contains no barrier, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer) - confirm.
    template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

        if (tid >= 4)
            return;

        #pragma unroll
        for (int offset = 4; offset > 0; offset >>= 1)
        {
            partial_reduction = op(partial_reduction, data[tid + offset]);
            data[tid] = partial_reduction;
        }
    }
};
00181
// Compile-time switch between the two reduction implementations. Only the
// <true> and <false> specializations are defined.
template <bool warp> struct ReductionDispatcher;

// warp == true: the whole reduction is delegated to WarpReductor<n>.
// (Presumably selected by callers for small n - confirm against the call site.)
template <> struct ReductionDispatcher<true>
{
    // Forwards all arguments unchanged to WarpReductor<n>::reduce.
    template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        typedef WarpReductor<n> Impl;
        Impl::reduce(data, partial_reduction, tid, op);
    }
};
// warp == false: reduction with explicit block-wide barriers between the wide
// steps, followed by a barrier-free tail executed by threads 0..31.
template <> struct ReductionDispatcher<false>
{
    // Stores each thread's value to data[tid] (for tid < n), then folds the buffer
    // with `op`:
    //   - stride 256 when n == 512, stride 128 when n >= 256, stride 64 when
    //     n >= 128, each followed by __syncthreads();
    //   - strides 32, 16, 8, 4, 2, 1 by threads with tid < 32, without barriers.
    //
    //   data              - scratch buffer (presumably shared memory - confirm)
    //   partial_reduction - in: this thread's value, out: its running result
    //   tid               - index of the calling thread
    //   op                - binary functor invoked as op(accumulated, other)
    //
    // The tail always reads data[tid + 32], so the buffer must hold at least 64
    // entries. Every __syncthreads() here is outside tid-dependent branches, so all
    // threads of the block have to call this function.
    // NOTE(review): the barrier-free tail depends on threads 0..31 running in
    // lockstep, which independent thread scheduling (Volta and newer) does not
    // guarantee - confirm the targeted architectures.
    template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        if (tid < n)
            data[tid] = partial_reduction;
        __syncthreads();

        if (n == 512)
        {
            if (tid < 256)
            {
                partial_reduction = op(partial_reduction, data[tid + 256]);
                data[tid] = partial_reduction;
            }
            __syncthreads();
        }

        if (n >= 256)
        {
            if (tid < 128)
            {
                partial_reduction = op(partial_reduction, data[tid + 128]);
                data[tid] = partial_reduction;
            }
            __syncthreads();
        }

        if (n >= 128)
        {
            if (tid < 64)
            {
                partial_reduction = op(partial_reduction, data[tid + 64]);
                data[tid] = partial_reduction;
            }
            __syncthreads();
        }

        if (tid >= 32)
            return;

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1)
        {
            partial_reduction = op(partial_reduction, data[tid + offset]);
            data[tid] = partial_reduction;
        }
    }
};
00214
00216
00217
// Predicate-driven reduction of (key, value) pairs. Only the specializations
// for particular n are defined.
template <int n> struct PredValWarpReductor;

// Reduction over 64 (key, value) pairs.
template <> struct PredValWarpReductor<64>
{
    // Threads with tid < 32 load their pair from sdata[tid] / sval[tid] and then
    // visit the entries at tid + 32, tid + 16, ..., tid + 1. Whenever
    // pred(candidate_key, myData) is true, the candidate key and its value replace
    // the thread's pair, both in myData / myVal and in sdata[tid] / sval[tid].
    // Threads with tid >= 32 leave all arguments untouched.
    //
    //   myData, myVal - out: the pair held by this thread after the reduction
    //   sdata, sval   - key and value buffers, indices 0..63 are accessed; they are
    //                   read without being filled here, so the caller must have
    //                   populated them (presumably shared memory - confirm)
    //   tid           - index of the calling thread
    //   pred          - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note `sval` is not volatile.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid >= 32)
            return;

        myData = sdata[tid];
        myVal = sval[tid];

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1)
        {
            T candidate = sdata[tid + offset];
            if (!pred(candidate, myData))
                continue;

            myData = candidate;
            sdata[tid] = myData;
            myVal = sval[tid + offset];
            sval[tid] = myVal;
        }
    }
};
// Reduction over 32 (key, value) pairs.
template <> struct PredValWarpReductor<32>
{
    // Threads with tid < 16 load their pair from sdata[tid] / sval[tid] and then
    // visit the entries at tid + 16, tid + 8, ..., tid + 1. Whenever
    // pred(candidate_key, myData) is true, the candidate key and its value replace
    // the thread's pair, both in myData / myVal and in sdata[tid] / sval[tid].
    // Threads with tid >= 16 leave all arguments untouched.
    //
    //   myData, myVal - out: the pair held by this thread after the reduction
    //   sdata, sval   - key and value buffers, indices 0..31 are accessed; the
    //                   caller must have populated them beforehand
    //                   (presumably shared memory - confirm)
    //   tid           - index of the calling thread
    //   pred          - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note `sval` is not volatile.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid >= 16)
            return;

        myData = sdata[tid];
        myVal = sval[tid];

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            T candidate = sdata[tid + offset];
            if (!pred(candidate, myData))
                continue;

            myData = candidate;
            sdata[tid] = myData;
            myVal = sval[tid + offset];
            sval[tid] = myVal;
        }
    }
};
00320
// Reduction over 16 (key, value) pairs.
template <> struct PredValWarpReductor<16>
{
    // Threads with tid < 8 load their pair from sdata[tid] / sval[tid] and then
    // visit the entries at tid + 8, tid + 4, tid + 2 and tid + 1. Whenever
    // pred(candidate_key, myData) is true, the candidate key and its value replace
    // the thread's pair, both in myData / myVal and in sdata[tid] / sval[tid].
    // Threads with tid >= 8 leave all arguments untouched.
    //
    //   myData, myVal - out: the pair held by this thread after the reduction
    //   sdata, sval   - key and value buffers, indices 0..15 are accessed; the
    //                   caller must have populated them beforehand
    //                   (presumably shared memory - confirm)
    //   tid           - index of the calling thread
    //   pred          - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note `sval` is not volatile.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            // The candidate is initialized directly from the buffer; the variable
            // must not appear in its own initializer.
            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
// Reduction over 8 (key, value) pairs.
template <> struct PredValWarpReductor<8>
{
    // Threads with tid < 4 load their pair from sdata[tid] / sval[tid] and then
    // visit the entries at tid + 4, tid + 2 and tid + 1. Whenever
    // pred(candidate_key, myData) is true, the candidate key and its value replace
    // the thread's pair, both in myData / myVal and in sdata[tid] / sval[tid].
    // Threads with tid >= 4 leave all arguments untouched.
    //
    //   myData, myVal - out: the pair held by this thread after the reduction
    //   sdata, sval   - key and value buffers, indices 0..7 are accessed; the
    //                   caller must have populated them beforehand
    //                   (presumably shared memory - confirm)
    //   tid           - index of the calling thread
    //   pred          - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note `sval` is not volatile.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            // The candidate is initialized directly from the buffer; the variable
            // must not appear in its own initializer.
            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
00394
// Compile-time switch between the two (key, value) reduction implementations.
// Only the <true> and <false> specializations are defined.
template <bool warp> struct PredValReductionDispatcher;

// warp == true: the whole reduction is delegated to PredValWarpReductor<n>, which
// is only defined for the sizes that have a specialization.
template <> struct PredValReductionDispatcher<true>
{
    // Forwards all arguments unchanged to PredValWarpReductor<n>::reduce.
    template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        typedef PredValWarpReductor<n> Impl;
        Impl::reduce(myData, myVal, sdata, sval, tid, pred);
    }
};
// warp == false: (key, value) reduction with block-wide barriers between the
// wide steps, followed by a barrier-free tail executed by threads 0..31.
template <> struct PredValReductionDispatcher<false>
{
private:
    // One reduction step: if the key stored `offset` entries ahead satisfies
    // pred(candidate, myData), adopt that key and its value, both in the thread's
    // own copies and in its buffer slots.
    template <typename T, typename V, typename Pred>
    static __device__ __forceinline__ void merge(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, int offset, const Pred& pred)
    {
        T reg = sdata[tid + offset];

        if (pred(reg, myData))
        {
            sdata[tid] = myData = reg;
            sval[tid] = myVal = sval[tid + offset];
        }
    }

public:
    // Every calling thread loads its pair from sdata[tid] / sval[tid]. The pairs
    // are then merged with strides 256, 128 and 64 (each only when n is at least
    // twice the stride), with a __syncthreads() after each of those steps, and
    // finally with strides 32..1 by threads with tid < 32 (again only for strides
    // with n >= 2 * stride).
    //
    //   myData, myVal - out: the pair held by this thread after the reduction
    //   sdata, sval   - key and value buffers; read without being filled here, so
    //                   the caller must have populated (and synchronized) them
    //                   (presumably shared memory - confirm)
    //   tid           - index of the calling thread
    //   pred          - functor invoked as pred(candidate, current)
    //
    // The barriers are issued outside the tid-dependent branches: __syncthreads()
    // must be reached by every thread of the block, so it may only depend on the
    // compile-time constant n. All threads of the block therefore have to call
    // this function.
    // NOTE(review): the barrier-free tail depends on threads 0..31 running in
    // lockstep, which independent thread scheduling (Volta and newer) does not
    // guarantee - confirm the targeted architectures.
    template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal = sval[tid];

        // Block-wide phase: strides 256, 128, 64.
        #pragma unroll
        for (int offset = 256; offset >= 64; offset >>= 1)
        {
            if (n >= 2 * offset)
            {
                if (tid < offset)
                    merge(myData, myVal, sdata, sval, tid, offset, pred);

                // Uniform across the block: depends on n only.
                __syncthreads();
            }
        }

        // Tail: strides 32..1, executed by the first 32 threads only.
        if (tid < 32)
        {
            #pragma unroll
            for (int offset = 32; offset > 0; offset >>= 1)
            {
                if (n >= 2 * offset)
                    merge(myData, myVal, sdata, sval, tid, offset, pred);
            }
        }
    }
};
00509
00511
00512
// Predicate-driven reduction of a key with two attached values. Only the
// specializations for particular n are defined.
template <int n> struct PredVal2WarpReductor;

// Reduction over 64 (key, value1, value2) triples.
template <> struct PredVal2WarpReductor<64>
{
    // Threads with tid < 32 load their triple from sdata[tid] / sval1[tid] /
    // sval2[tid] and then visit the entries at tid + 32, tid + 16, ..., tid + 1.
    // Whenever pred(candidate_key, myData) is true, the candidate key and both of
    // its values replace the thread's triple, in the output references as well as
    // in the thread's buffer slots. Threads with tid >= 32 leave all arguments
    // untouched.
    //
    //   myData, myVal1, myVal2 - out: the triple held by this thread afterwards
    //   sdata, sval1, sval2    - key and value buffers, indices 0..63 are accessed;
    //                            the caller must have populated them beforehand
    //                            (presumably shared memory - confirm)
    //   tid                    - index of the calling thread
    //   pred                   - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note the value buffers are not
    // volatile.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid >= 32)
            return;

        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1)
        {
            T candidate = sdata[tid + offset];
            if (!pred(candidate, myData))
                continue;

            myData = candidate;
            sdata[tid] = myData;
            myVal1 = sval1[tid + offset];
            sval1[tid] = myVal1;
            myVal2 = sval2[tid + offset];
            sval2[tid] = myVal2;
        }
    }
};
// Reduction over 32 (key, value1, value2) triples.
template <> struct PredVal2WarpReductor<32>
{
    // Threads with tid < 16 load their triple from sdata[tid] / sval1[tid] /
    // sval2[tid] and then visit the entries at tid + 16, tid + 8, ..., tid + 1.
    // Whenever pred(candidate_key, myData) is true, the candidate key and both of
    // its values replace the thread's triple, in the output references as well as
    // in the thread's buffer slots. Threads with tid >= 16 leave all arguments
    // untouched.
    //
    //   myData, myVal1, myVal2 - out: the triple held by this thread afterwards
    //   sdata, sval1, sval2    - key and value buffers, indices 0..31 are accessed;
    //                            the caller must have populated them beforehand
    //                            (presumably shared memory - confirm)
    //   tid                    - index of the calling thread
    //   pred                   - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note the value buffers are not
    // volatile.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid >= 16)
            return;

        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            T candidate = sdata[tid + offset];
            if (!pred(candidate, myData))
                continue;

            myData = candidate;
            sdata[tid] = myData;
            myVal1 = sval1[tid + offset];
            sval1[tid] = myVal1;
            myVal2 = sval2[tid + offset];
            sval2[tid] = myVal2;
        }
    }
};
00628
// Reduction over 16 (key, value1, value2) triples.
template <> struct PredVal2WarpReductor<16>
{
    // Threads with tid < 8 load their triple from sdata[tid] / sval1[tid] /
    // sval2[tid] and then visit the entries at tid + 8, tid + 4, tid + 2 and
    // tid + 1. Whenever pred(candidate_key, myData) is true, the candidate key and
    // both of its values replace the thread's triple, in the output references as
    // well as in the thread's buffer slots. Threads with tid >= 8 leave all
    // arguments untouched.
    //
    //   myData, myVal1, myVal2 - out: the triple held by this thread afterwards
    //   sdata, sval1, sval2    - key and value buffers, indices 0..15 are accessed;
    //                            the caller must have populated them beforehand
    //                            (presumably shared memory - confirm)
    //   tid                    - index of the calling thread
    //   pred                   - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note the value buffers are not
    // volatile.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            // The candidate is initialized directly from the buffer; the variable
            // must not appear in its own initializer.
            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 8];
                sval2[tid] = myVal2 = sval2[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
// Reduction over 8 (key, value1, value2) triples.
template <> struct PredVal2WarpReductor<8>
{
    // Threads with tid < 4 load their triple from sdata[tid] / sval1[tid] /
    // sval2[tid] and then visit the entries at tid + 4, tid + 2 and tid + 1.
    // Whenever pred(candidate_key, myData) is true, the candidate key and both of
    // its values replace the thread's triple, in the output references as well as
    // in the thread's buffer slots. Threads with tid >= 4 leave all arguments
    // untouched.
    //
    //   myData, myVal1, myVal2 - out: the triple held by this thread afterwards
    //   sdata, sval1, sval2    - key and value buffers, indices 0..7 are accessed;
    //                            the caller must have populated them beforehand
    //                            (presumably shared memory - confirm)
    //   tid                    - index of the calling thread
    //   pred                   - functor invoked as pred(candidate, current)
    //
    // No barrier separates the steps, so the result depends on the participating
    // threads running in lockstep. NOTE(review): not guaranteed under independent
    // thread scheduling (Volta and newer); also note the value buffers are not
    // volatile.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            // The candidate is initialized directly from the buffer; the variable
            // must not appear in its own initializer.
            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
00711
// Compile-time switch between the two (key, value1, value2) reduction
// implementations. Only the <true> and <false> specializations are defined.
template <bool warp> struct PredVal2ReductionDispatcher;

// warp == true: the whole reduction is delegated to PredVal2WarpReductor<n>,
// which is only defined for the sizes that have a specialization.
template <> struct PredVal2ReductionDispatcher<true>
{
    // Forwards all arguments unchanged to PredVal2WarpReductor<n>::reduce.
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        typedef PredVal2WarpReductor<n> Impl;
        Impl::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }
};
// warp == false: (key, value1, value2) reduction with block-wide barriers
// between the wide steps, followed by a barrier-free tail executed by threads
// 0..31.
template <> struct PredVal2ReductionDispatcher<false>
{
private:
    // One reduction step: if the key stored `offset` entries ahead satisfies
    // pred(candidate, myData), adopt that key and both of its values, in the
    // thread's own copies as well as in its buffer slots.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ __forceinline__ void merge(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, int offset, const Pred& pred)
    {
        T reg = sdata[tid + offset];

        if (pred(reg, myData))
        {
            sdata[tid] = myData = reg;
            sval1[tid] = myVal1 = sval1[tid + offset];
            sval2[tid] = myVal2 = sval2[tid + offset];
        }
    }

public:
    // Every calling thread loads its triple from sdata[tid] / sval1[tid] /
    // sval2[tid]. The triples are then merged with strides 256, 128 and 64 (each
    // only when n is at least twice the stride), with a __syncthreads() after each
    // of those steps, and finally with strides 32..1 by threads with tid < 32
    // (again only for strides with n >= 2 * stride).
    //
    //   myData, myVal1, myVal2 - out: the triple held by this thread afterwards
    //   sdata, sval1, sval2    - key and value buffers; read without being filled
    //                            here, so the caller must have populated (and
    //                            synchronized) them
    //                            (presumably shared memory - confirm)
    //   tid                    - index of the calling thread
    //   pred                   - functor invoked as pred(candidate, current)
    //
    // The barriers are issued outside the tid-dependent branches: __syncthreads()
    // must be reached by every thread of the block, so it may only depend on the
    // compile-time constant n. All threads of the block therefore have to call
    // this function.
    // NOTE(review): the barrier-free tail depends on threads 0..31 running in
    // lockstep, which independent thread scheduling (Volta and newer) does not
    // guarantee - confirm the targeted architectures.
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        // Block-wide phase: strides 256, 128, 64.
        #pragma unroll
        for (int offset = 256; offset >= 64; offset >>= 1)
        {
            if (n >= 2 * offset)
            {
                if (tid < offset)
                    merge(myData, myVal1, myVal2, sdata, sval1, sval2, tid, offset, pred);

                // Uniform across the block: depends on n only.
                __syncthreads();
            }
        }

        // Tail: strides 32..1, executed by the first 32 threads only.
        if (tid < 32)
        {
            #pragma unroll
            for (int offset = 32; offset > 0; offset >>= 1)
            {
                if (n >= 2 * offset)
                    merge(myData, myVal1, myVal2, sdata, sval1, sval2, tid, offset, pred);
            }
        }
    }
};
00838 }
00839 }}}
00840
00841 #endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__