#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__

namespace cv { namespace gpu { namespace device
{
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
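        // Fallback sketch: |a - b| = max(a, b) - min(a, b) per halfword, computed
        // with ::max/::min on masked copies so the halfwords never interact (assumes
        // the CUDA device overloads of max/min for unsigned int).
        s = a & 0x0000ffff; // low halfword of a
        r = b & 0x0000ffff; // low halfword of b
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // high halfword of a
        r = b & 0xffff0000; // high halfword of b
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // per-halfword maxima
        s = v | s;          // per-halfword minima
        r = r - s;          // no borrow can cross halfwords: max >= min
    #endif

        return r;
    }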
    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
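        // Round-down halfword average, a plain-register sketch built on HAKMEM
        // item 23: (a & b) + ((a ^ b) >> 1) == (a + b) / 2. The mask keeps the
        // shift from moving bits across the halfword boundary.
        unsigned int r, s;
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // clear bits that would cross halfword boundaries
        s = s >> 1;
        r = r + s;

        return r;
    }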
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
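        // Sketch: reuse vseteq2 and widen each per-halfword bool into a 0xffff
        // mask ((r << 16) - r is r * 0xffff).
        r = vseteq2(a, b);
        c = r << 16;
        r = c - r;
    #else
        // Same Mycroft-style test as vseteq2, expanded to full halfword masks.
        r = a ^ b;          // 0x0000 iff equal
        c = r | 0x80008000; // set msbs to catch the borrow
        r = r ^ c;          // msb = 1 iff halfword of r was < 0x8000
        c = c - 0x00010001; // msb = 0 iff halfword of r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1 iff equal
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }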
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
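        // Completion sketch: with b inverted, vavrg2 computes (a + ~b + 1) / 2 per
        // halfword; its msb is the carry-out of a - b, i.e. 1 iff a >= b.
        r = vavrg2(a, b);
        r = r & 0x80008000; // msb = 1 iff a >= b in that halfword
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }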
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
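        // Sketch: build the result from vsetge2 and widen each per-halfword bool
        // into a 0xffff mask.
        r = vsetge2(a, b);
        c = r << 16;
        r = c - r;
    #else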
        asm("not.b32 %0, %0;" : "+r"(b));
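        // Completion sketch: same carry-out trick as vsetge2, widened to masks.
        c = vavrg2(a, b);   // (a + ~b + 1) / 2
        c = c & 0x80008000; // msb = 1 iff a >= b
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }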
    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
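        // Completion sketch: with b inverted, vavg2 computes (a + ~b) / 2; its
        // per-halfword msb is the carry-out of a - b - 1, i.e. 1 iff a > b.
        r = vavg2(a, b);
        r = r & 0x80008000; // msb = 1 iff a > b in that halfword
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }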
    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
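        // Sketch: build the result from vsetgt2 and widen each per-halfword bool
        // into a 0xffff mask.
        r = vsetgt2(a, b);
        c = r << 16;
        r = c - r;
    #else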
        asm("not.b32 %0, %0;" : "+r"(b));
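        // Completion sketch: same carry-out trick as vsetgt2, widened to masks.
        c = vavg2(a, b);    // (a + ~b) / 2
        c = c & 0x80008000; // msb = 1 iff a > b
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }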
    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
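        // Completion sketch: with a inverted, vavrg2 computes (b + ~a + 1) / 2 per
        // halfword; its msb is the carry-out of b - a, i.e. 1 iff a <= b.
        r = vavrg2(a, b);
        r = r & 0x80008000; // msb = 1 iff a <= b in that halfword
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }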
    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
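        // Sketch: build the result from vsetle2 and widen each per-halfword bool
        // into a 0xffff mask.
        r = vsetle2(a, b);
        c = r << 16;
        r = c - r;
    #else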
        asm("not.b32 %0, %0;" : "+r"(a));
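        // Completion sketch: same carry-out trick as vsetle2, widened to masks.
        c = vavrg2(a, b);   // (b + ~a + 1) / 2
        c = c & 0x80008000; // msb = 1 iff a <= b
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }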
    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
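        // Completion sketch: with a inverted, vavg2 computes (b + ~a) / 2; its
        // per-halfword msb is the carry-out of b - a - 1, i.e. 1 iff a < b.
        r = vavg2(a, b);
        r = r & 0x80008000; // msb = 1 iff a < b in that halfword
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }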
    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
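        // Sketch: build the result from vsetlt2 and widen each per-halfword bool
        // into a 0xffff mask.
        r = vsetlt2(a, b);
        c = r << 16;
        r = c - r;
    #else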
        asm("not.b32 %0, %0;" : "+r"(a));
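        // Completion sketch: same carry-out trick as vsetlt2, widened to masks.
        c = vavg2(a, b);    // (b + ~a) / 2
        c = c & 0x80008000; // msb = 1 iff a < b
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }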
    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
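        // Sketch: widen the vsetne2 bools into 0xffff masks.
        r = vsetne2(a, b);
        c = r << 16;
        r = c - r;
    #else
        // Same Mycroft-style test as vsetne2, expanded to full halfword masks.
        r = a ^ b;          // 0x0000 iff equal
        c = r | 0x80008000; // set msbs to catch the borrow
        c = c - 0x00010001; // msb = 0 iff halfword of r was 0x0000 or 0x8000
        c = c | r;          // msb = 1 iff halfword of r was nonzero
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xffff masks
    #endif

        return r;
    }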
    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
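        // Fallback sketch: ::max on masked copies, one halfword at a time, so the
        // halfwords never interact.
        r = a & 0x0000ffff; // low halfword of a
        s = b & 0x0000ffff; // low halfword of b
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // high halfword of a
        s = b & 0xffff0000; // high halfword of b
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine the per-halfword maxima
    #endif

        return r;
    }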
    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
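        // Fallback sketch: ::min on masked copies, one halfword at a time.
        r = a & 0x0000ffff; // low halfword of a
        s = b & 0x0000ffff; // low halfword of b
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // high halfword of a
        s = b & 0xffff0000; // high halfword of b
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine the per-halfword minima
    #endif

        return r;
    }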
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
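        // Round-down byte average, a plain-register sketch built on HAKMEM item 23:
        //   (a & b) + ((a ^ b) >> 1) == (a + b) / 2.
        unsigned int r, s;
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // clear bits that would cross byte boundaries
        s = s >> 1;
        r = r + s;

        return r;
    }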
    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
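        // Sketch: widen the vseteq4 bools into 0xff masks ((r << 8) - r is r * 0xff).
        r = vseteq4(a, b);
        c = r << 8;
        r = c - r;
    #else
        // Same Mycroft-style test as vseteq4, expanded to byte masks.
        r = a ^ b;          // 0x00 iff equal
        c = r | 0x80808080; // set msbs to catch the borrow
        r = r ^ c;          // msb = 1 iff byte of r was < 0x80
        c = c - 0x01010101; // msb = 0 iff byte of r was 0x00 or 0x80
        c = r & ~c;         // msb = 1 iff equal
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }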
    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
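        // Completion sketch: with a inverted, vavrg4 computes (b + ~a + 1) / 2 per
        // byte; its msb is the carry-out of b - a, i.e. 1 iff a <= b.
        r = vavrg4(a, b);
        r = r & 0x80808080; // msb = 1 iff a <= b in that byte
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }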
    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
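        // Sketch: build the result from vsetle4 and widen each per-byte bool into
        // a 0xff mask.
        r = vsetle4(a, b);
        c = r << 8;
        r = c - r;
    #else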
        asm("not.b32 %0, %0;" : "+r"(a));
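        // Completion sketch: same per-byte carry-out trick, widened to masks.
        c = vavrg4(a, b);   // (b + ~a + 1) / 2
        c = c & 0x80808080; // msb = 1 iff a <= b
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }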
    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
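        // Completion sketch: with a inverted, vavg4 computes (b + ~a) / 2; its
        // per-byte msb is the carry-out of b - a - 1, i.e. 1 iff a < b.
        r = vavg4(a, b);
        r = r & 0x80808080; // msb = 1 iff a < b in that byte
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }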
    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
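        // Sketch: build the result from vsetlt4 and widen each per-byte bool into
        // a 0xff mask.
        r = vsetlt4(a, b);
        c = r << 8;
        r = c - r;
    #else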
        asm("not.b32 %0, %0;" : "+r"(a));
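        // Completion sketch: same per-byte carry-out trick, widened to masks.
        c = vavg4(a, b);    // (b + ~a) / 2
        c = c & 0x80808080; // msb = 1 iff a < b
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }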
    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
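        // Completion sketch: with b inverted, vavrg4 computes (a + ~b + 1) / 2 per
        // byte; its msb is the carry-out of a - b, i.e. 1 iff a >= b.
        r = vavrg4(a, b);
        r = r & 0x80808080; // msb = 1 iff a >= b in that byte
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }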
    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
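        // Sketch: build the result from vsetge4 and widen each per-byte bool into
        // a 0xff mask.
        r = vsetge4(a, b);
        c = r << 8;
        r = c - r;
    #else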
        asm("not.b32 %0, %0;" : "+r"(b));
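        // Completion sketch: same per-byte carry-out trick, widened to masks.
        c = vavrg4(a, b);   // (a + ~b + 1) / 2
        c = c & 0x80808080; // msb = 1 iff a >= b
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }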
    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
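        // Completion sketch: with b inverted, vavg4 computes (a + ~b) / 2; its
        // per-byte msb is the carry-out of a - b - 1, i.e. 1 iff a > b.
        r = vavg4(a, b);
        r = r & 0x80808080; // msb = 1 iff a > b in that byte
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }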
    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
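        // Sketch: build the result from vsetgt4 and widen each per-byte bool into
        // a 0xff mask.
        r = vsetgt4(a, b);
        c = r << 8;
        r = c - r;
    #else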
        asm("not.b32 %0, %0;" : "+r"(b));
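        // Completion sketch: same per-byte carry-out trick, widened to masks.
        c = vavg4(a, b);    // (a + ~b) / 2
        c = c & 0x80808080; // msb = 1 iff a > b
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }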
    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
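        // Sketch: widen the vsetne4 bools into 0xff masks.
        r = vsetne4(a, b);
        c = r << 8;
        r = c - r;
    #else
        // Same Mycroft-style test as vsetne4, expanded to byte masks.
        r = a ^ b;          // 0x00 iff equal
        c = r | 0x80808080; // set msbs to catch the borrow
        c = c - 0x01010101; // msb = 0 iff byte of r was 0x00 or 0x80
        c = c | r;          // msb = 1 iff byte of r was nonzero
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          //  msbs
        r = c | r;          //   to 0xff masks
    #endif

        return r;
    }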
    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
}}}

#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__