00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
00044 #define __OPENCV_GPU_VEC_DISTANCE_HPP__
00045
00046 #include "utility.hpp"
00047 #include "functional.hpp"
00048 #include "detail/vec_distance_detail.hpp"
00049
00050 namespace cv { namespace gpu { namespace device
00051 {
// Per-thread accumulator for an L1 distance (sum of absolute differences).
// This is the generic version: whatever T is, operands are taken as int and
// the running total is kept as an int.
template <typename T> struct L1Dist
{
    typedef int value_type;
    typedef int result_type;

    // The running total starts at zero.
    __device__ __forceinline__ L1Dist() : mySum(0)
    {
    }

    // Folds one pair of elements into the total.
    // __sad(a, b, c) evaluates |a - b| + c, so the previous total is passed
    // in as the third operand and the result becomes the new total.
    __device__ __forceinline__ void reduceIter(int val1, int val2)
    {
        mySum = __sad(val1, val2, mySum);
    }

    // Hands this thread's total to reduce<THREAD_DIM>() together with an
    // addition functor, using smem as scratch space and tid as the thread's
    // index. NOTE(review): reduce() is declared in another header; it
    // presumably leaves the combined sum in mySum -- confirm there.
    template <int THREAD_DIM>
    __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        typedef plus<volatile int> AddOp;
        reduce<THREAD_DIM>(smem, mySum, tid, AddOp());
    }

    // Reading the accumulator as an int yields the current total.
    __device__ __forceinline__ operator int() const
    {
        return mySum;
    }

    // Current total of absolute differences.
    int mySum;
};
// Specialization of the L1 (sum of absolute differences) accumulator for
// float data: operands and the running total are both float.
template <> struct L1Dist<float>
{
    typedef float value_type;
    typedef float result_type;

    // The running total starts at zero.
    __device__ __forceinline__ L1Dist() : mySum(0.0f)
    {
    }

    // Adds the absolute difference of one pair of elements to the total.
    __device__ __forceinline__ void reduceIter(float val1, float val2)
    {
        const float delta = val1 - val2;
        mySum += ::fabs(delta);
    }

    // Hands this thread's total to reduce<THREAD_DIM>() together with an
    // addition functor, using smem as scratch space and tid as the thread's
    // index. NOTE(review): reduce() is declared in another header; it
    // presumably leaves the combined sum in mySum -- confirm there.
    template <int THREAD_DIM>
    __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        typedef plus<volatile float> AddOp;
        reduce<THREAD_DIM>(smem, mySum, tid, AddOp());
    }

    // Reading the accumulator as a float yields the current total.
    __device__ __forceinline__ operator float() const
    {
        return mySum;
    }

    // Current total of absolute differences.
    float mySum;
};
00100
// Per-thread accumulator for an L2 (Euclidean) distance. Squared differences
// are summed while iterating; the square root is taken only when the
// accumulator is read back as a float.
struct L2Dist
{
    typedef float value_type;
    typedef float result_type;

    // The sum of squares starts at zero.
    __device__ __forceinline__ L2Dist() : mySum(0.0f)
    {
    }

    // Adds the squared difference of one pair of elements to the sum.
    __device__ __forceinline__ void reduceIter(float val1, float val2)
    {
        const float diff = val1 - val2;
        mySum += diff * diff;
    }

    // Hands this thread's sum of squares to reduce<THREAD_DIM>() together
    // with an addition functor, using smem as scratch space and tid as the
    // thread's index. NOTE(review): reduce() is declared in another header;
    // it presumably leaves the combined sum in mySum -- confirm there.
    template <int THREAD_DIM>
    __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        typedef plus<volatile float> AddOp;
        reduce<THREAD_DIM>(smem, mySum, tid, AddOp());
    }

    // Converting to float returns the square root of the accumulated sum;
    // mySum itself keeps holding the squared value.
    __device__ __forceinline__ operator float() const
    {
        return sqrtf(mySum);
    }

    // Accumulated sum of squared differences (not yet square-rooted).
    float mySum;
};
00126
// Per-thread accumulator for a Hamming distance: counts the bit positions in
// which pairs of int operands differ.
struct HammingDist
{
    typedef int value_type;
    typedef int result_type;

    // The bit count starts at zero.
    __device__ __forceinline__ HammingDist() : mySum(0)
    {
    }

    // XOR leaves a 1 in every position where the operands disagree;
    // __popc counts those set bits and the count is added to the total.
    __device__ __forceinline__ void reduceIter(int val1, int val2)
    {
        const int mismatch = val1 ^ val2;
        mySum += __popc(mismatch);
    }

    // Hands this thread's bit count to reduce<THREAD_DIM>() together with an
    // addition functor, using smem as scratch space and tid as the thread's
    // index. NOTE(review): reduce() is declared in another header; it
    // presumably leaves the combined count in mySum -- confirm there.
    template <int THREAD_DIM>
    __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        typedef plus<volatile int> AddOp;
        reduce<THREAD_DIM>(smem, mySum, tid, AddOp());
    }

    // Reading the accumulator as an int yields the current bit count.
    __device__ __forceinline__ operator int() const
    {
        return mySum;
    }

    // Number of differing bits accumulated so far.
    int mySum;
};
00151
00152
// Accumulates the distance between two vectors, then runs the accumulator's
// reduction step.
//
// Template parameters:
//   THREAD_DIM - stride between the elements handled by one thread; also
//                forwarded to Dist::reduceAll.
//   Dist       - accumulator type providing reduceIter(), reduceAll<>() and
//                a result_type typedef (e.g. L1Dist, L2Dist, HammingDist).
//   T1, T2     - element types of the two vectors.
//
// Parameters:
//   vec1, vec2 - the vectors to compare; elements are fetched through
//                ForceGlob<T>::Load (declared in another header).
//   len        - number of elements to compare.
//   dist       - accumulator, updated in place.
//   smem       - scratch buffer forwarded to dist.reduceAll.
//   tid        - index of the calling thread; it handles elements
//                tid, tid + THREAD_DIM, tid + 2 * THREAD_DIM, ... below len.
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    for (int i = tid; i < len; i += THREAD_DIM)
    {
        T1 val1;
        ForceGlob<T1>::Load(vec1, i, val1);

        T2 val2;
        ForceGlob<T2>::Load(vec2, i, val2);

        dist.reduceIter(val1, val2);
    }

    // Dist is a template parameter, so reduceAll is a dependent member
    // template: the "template" keyword is required for the '<' to be parsed
    // as the start of a template argument list rather than as less-than.
    dist.template reduceAll<THREAD_DIM>(smem, tid);
}
00169
00170
// Accumulates the distance between a cached vector and a second vector, then
// runs the accumulator's reduction step.
//
// The per-element work is delegated to
// vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN,
// LEN_EQ_MAX_LEN>::calc (declared in the detail header); this function only
// adds the final reduction.
//
// Parameters:
//   vecCached - the cached values of the first vector.
//   vecGlob   - the second vector.
//   len       - vector length, forwarded to the calculator.
//   dist      - accumulator, updated in place.
//   smem      - scratch buffer forwarded to dist.reduceAll.
//   tid       - index of the calling thread.
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);

    // Dist is a template parameter, so reduceAll is a dependent member
    // template: the "template" keyword is required for the '<' to be parsed
    // as the start of a template argument list rather than as less-than.
    dist.template reduceAll<THREAD_DIM>(smem, tid);
}
00178
00179
// Distance helper that keeps only a pointer to the first vector and defers
// all element access to calcVecDiffGlobal when calc() is invoked.
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
{
    // Stores the pointer to the first vector. The four trailing unnamed
    // parameters are accepted and ignored, so the constructor can be called
    // with the same five-argument list as VecDiffCachedRegister's.
    explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
        : vec1(vec1_)
    {
    }

    // Accumulates the distance between the stored vector and vec2 into dist
    // by forwarding every argument to calcVecDiffGlobal<THREAD_DIM>.
    template <typename T2, typename Dist>
    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
    {
        calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
    }

    // First vector of the comparison, as passed to the constructor.
    const T1* vec1;
};
00195
00196
// Distance helper that copies the first vector into a small per-thread array
// at construction time, so calc() can compare it against other vectors
// through calcVecDiffCached.
//
// NOTE(review): vec1Vals has MAX_LEN / THREAD_DIM entries, while the copy
// loop stores one value for every i = tid, tid + THREAD_DIM, ... < MAX_LEN.
// That appears to require MAX_LEN to be a multiple of THREAD_DIM -- confirm
// against the instantiations.
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
{
    // Stages vec1 in smem, then copies this thread's share into vec1Vals.
    //   vec1     - source vector.
    //   len      - number of valid elements in vec1.
    //   smem     - staging buffer indexed by glob_tid and read back up to
    //              MAX_LEN; presumably block-shared given the barriers --
    //              TODO confirm at the call site.
    //   glob_tid - index of the element this thread stages.
    //   tid      - index of the first staged element this thread copies.
    template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
    {
        // Only indices below len are staged; the barrier sits outside the
        // branch so every thread reaches it.
        if (glob_tid < len)
        {
            smem[glob_tid] = vec1[glob_tid];
        }
        __syncthreads();

        // Copy elements tid, tid + THREAD_DIM, ... below MAX_LEN into
        // consecutive slots of vec1Vals.
        #pragma unroll
        for (int src = tid, dst = 0; src < MAX_LEN; src += THREAD_DIM, ++dst)
        {
            vec1Vals[dst] = smem[src];
        }

        // Second barrier: no thread leaves the constructor before all
        // threads have finished reading smem.
        __syncthreads();
    }

    // Accumulates the distance between the cached vector and vec2 into dist
    // by forwarding to calcVecDiffCached with this struct's template
    // parameters.
    template <typename T2, typename Dist>
    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
    {
        calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
    }

    // This thread's cached share of the first vector.
    U vec1Vals[MAX_LEN / THREAD_DIM];
};
00222 }}}
00223
00224 #endif // __OPENCV_GPU_VEC_DISTANCE_HPP__