simd_functions.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 /*
44  * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions are met:
48  *
49  * Redistributions of source code must retain the above copyright notice,
50  * this list of conditions and the following disclaimer.
51  *
52  * Redistributions in binary form must reproduce the above copyright notice,
53  * this list of conditions and the following disclaimer in the documentation
54  * and/or other materials provided with the distribution.
55  *
56  * Neither the name of NVIDIA Corporation nor the names of its contributors
57  * may be used to endorse or promote products derived from this software
58  * without specific prior written permission.
59  *
60  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
61  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
64  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
65  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
66  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
67  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
68  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
70  * POSSIBILITY OF SUCH DAMAGE.
71  */
72 
73 #ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
74 #define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
75 
76 #include "common.hpp"
77 
78 /*
79  This header file contains inline functions that implement intra-word SIMD
80  operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
81  operations that are hardware-accelerated on sm_3x (Kepler) GPUs. Efficient
82  to make the code portable across all GPUs supported by CUDA. The following
83  functions are currently implemented:
84 
85  vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
86  vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
87  vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
88  vavg2(a,b) per-halfword unsigned average: (a + b) / 2
89  vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
90  vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
91  vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
92  vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
93  vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
94  vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
95  vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
96  vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
97  vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
98  vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
99  vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
100  vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
101  vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
102  vmax2(a,b) per-halfword unsigned maximum: max(a, b)
103  vmin2(a,b) per-halfword unsigned minimum: min(a, b)
104 
105  vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
106  vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
107  vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
108  vavg4(a,b) per-byte unsigned average: (a + b) / 2
109  vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
110  vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
111  vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
112  vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
113  vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
114  vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
115  vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
116  vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
117  vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
118  vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
119  vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
120  vsetne4(a,b) per-byte unsigned comparison: a != b ? 1 : 0
121  vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff : 0
122  vmax4(a,b) per-byte unsigned maximum: max(a, b)
123  vmin4(a,b) per-byte unsigned minimum: min(a, b)
124 */
125 
126 namespace cv { namespace gpu { namespace device
127 {
128  // 2
129 
130  static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
131  {
132  unsigned int r = 0;
133 
134  #if __CUDA_ARCH__ >= 300
135  asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
136  #elif __CUDA_ARCH__ >= 200
137  asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
138  asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
139  #else
140  unsigned int s;
141  s = a ^ b; // sum bits
142  r = a + b; // actual sum
143  s = s ^ r; // determine carry-ins for each bit position
144  s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
145  r = r - s; // subtract out carry-out from low word
146  #endif
147 
148  return r;
149  }
150 
151  static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
152  {
153  unsigned int r = 0;
154 
155  #if __CUDA_ARCH__ >= 300
156  asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
157  #elif __CUDA_ARCH__ >= 200
158  asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
159  asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
160  #else
161  unsigned int s;
162  s = a ^ b; // per-bit difference, without borrows
163  r = a - b; // actual difference
164  s = s ^ r; // determine borrow-ins for each bit position
165  s = s & 0x00010000; // borrow to high word
166  r = r + s; // compensate for borrow from low word
167  #endif
168 
169  return r;
170  }
171 
172  static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
173  {
174  unsigned int r = 0;
175 
176  #if __CUDA_ARCH__ >= 300
177  asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
178  #elif __CUDA_ARCH__ >= 200
179  asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
180  asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
181  #else
182  unsigned int s, t, u, v;
183  s = a & 0x0000ffff; // extract low halfword
184  r = b & 0x0000ffff; // extract low halfword
185  u = ::max(r, s); // maximum of low halfwords
186  v = ::min(r, s); // minimum of low halfwords
187  s = a & 0xffff0000; // extract high halfword
188  r = b & 0xffff0000; // extract high halfword
189  t = ::max(r, s); // maximum of high halfwords
190  s = ::min(r, s); // minimum of high halfwords
191  r = u | t; // maximum of both halfwords
192  s = v | s; // minimum of both halfwords
193  r = r - s; // |a - b| = max(a,b) - min(a,b);
194  #endif
195 
196  return r;
197  }
198 
199  static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
200  {
201  unsigned int r, s;
202 
203  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
204  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
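 // e.g. a = 0x00030005, b = 0x00010003: a & b = 0x00010001,
 // ((a ^ b) & 0xfffefffe) >> 1 = 0x00010003, sum = 0x00020004,
 // i.e. (3+1)/2 = 2 in the high halfword and (5+3)/2 = 4 in the low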
205  s = a ^ b;
206  r = a & b;
207  s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
208  s = s >> 1;
209  s = r + s;
210 
211  return s;
212  }
213 
214  static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
215  {
216  unsigned int r = 0;
217 
218  #if __CUDA_ARCH__ >= 300
219  asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
220  #else
221  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
222  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
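 // e.g. a = 0x00030004, b = 0x00010003: a | b = 0x00030007,
 // ((a ^ b) & 0xfffefffe) >> 1 = 0x00010003, result = 0x00020004,
 // i.e. (3+1+1)/2 = 2 in the high halfword and (4+3+1)/2 = 4 in the low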
223  unsigned int s;
224  s = a ^ b;
225  r = a | b;
226  s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
227  s = s >> 1;
228  r = r - s;
229  #endif
230 
231  return r;
232  }
233 
234  static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
235  {
236  unsigned int r = 0;
237 
238  #if __CUDA_ARCH__ >= 300
239  asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
240  #else
241  // inspired by Alan Mycroft's null-byte detection algorithm:
242  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
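 // e.g. a = 0x1234abcd, b = 0x1234abce: r = 0x00000003, c = 0x80008003,
 // r ^ c = 0x80008000, c - 0x00010001 = 0x7fff8002, r & ~c = 0x80000000,
 // >> 15 = 0x00010000: 1 for the equal high halfword, 0 for the unequal low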
243  unsigned int c;
244  r = a ^ b; // 0x0000 if a == b
245  c = r | 0x80008000; // set msbs, to catch carry out
246  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
247  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
248  c = r & ~c; // msb = 1, if r was 0x0000
249  r = c >> 15; // convert to bool
250  #endif
251 
252  return r;
253  }
254 
255  static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
256  {
257  unsigned int r, c;
258 
259  #if __CUDA_ARCH__ >= 300
260  r = vseteq2(a, b);
261  c = r << 16; // convert bool
262  r = c - r; // into mask
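 // e.g. r = 0x00010000: c = r << 16 = 0x00000000, c - r = 0xffff0000,
 // i.e. each per-halfword 1 becomes 0xffff and each 0 stays 0x0000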
263  #else
264  // inspired by Alan Mycroft's null-byte detection algorithm:
265  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
266  r = a ^ b; // 0x0000 if a == b
267  c = r | 0x80008000; // set msbs, to catch carry out
268  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
269  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
270  c = r & ~c; // msb = 1, if r was 0x0000
271  r = c >> 15; // convert
272  r = c - r; // msbs to
273  r = c | r; // mask
274  #endif
275 
276  return r;
277  }
278 
279  static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
280  {
281  unsigned int r = 0;
282 
283  #if __CUDA_ARCH__ >= 300
284  asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
285  #else
286  unsigned int c;
287  asm("not.b32 %0, %0;" : "+r"(b));
288  c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
289  c = c & 0x80008000; // msb = carry-outs
290  r = c >> 15; // convert to bool
291  #endif
292 
293  return r;
294  }
295 
296  static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
297  {
298  unsigned int r, c;
299 
300  #if __CUDA_ARCH__ >= 300
301  r = vsetge2(a, b);
302  c = r << 16; // convert bool
303  r = c - r; // into mask
304  #else
305  asm("not.b32 %0, %0;" : "+r"(b));
306  c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
307  c = c & 0x80008000; // msb = carry-outs
308  r = c >> 15; // convert
309  r = c - r; // msbs to
310  r = c | r; // mask
311  #endif
312 
313  return r;
314  }
315 
316  static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
317  {
318  unsigned int r = 0;
319 
320  #if __CUDA_ARCH__ >= 300
321  asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
322  #else
323  unsigned int c;
324  asm("not.b32 %0, %0;" : "+r"(b));
325  c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
326  c = c & 0x80008000; // msbs = carry-outs
327  r = c >> 15; // convert to bool
328  #endif
329 
330  return r;
331  }
332 
333  static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
334  {
335  unsigned int r, c;
336 
337  #if __CUDA_ARCH__ >= 300
338  r = vsetgt2(a, b);
339  c = r << 16; // convert bool
340  r = c - r; // into mask
341  #else
342  asm("not.b32 %0, %0;" : "+r"(b));
343  c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
344  c = c & 0x80008000; // msbs = carry-outs
345  r = c >> 15; // convert
346  r = c - r; // msbs to
347  r = c | r; // mask
348  #endif
349 
350  return r;
351  }
352 
353  static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
354  {
355  unsigned int r = 0;
356 
357  #if __CUDA_ARCH__ >= 300
358  asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
359  #else
360  unsigned int c;
361  asm("not.b32 %0, %0;" : "+r"(a));
362  c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
363  c = c & 0x80008000; // msb = carry-outs
364  r = c >> 15; // convert to bool
365  #endif
366 
367  return r;
368  }
369 
370  static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
371  {
372  unsigned int r, c;
373 
374  #if __CUDA_ARCH__ >= 300
375  r = vsetle2(a, b);
376  c = r << 16; // convert bool
377  r = c - r; // into mask
378  #else
379  asm("not.b32 %0, %0;" : "+r"(a));
380  c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
381  c = c & 0x80008000; // msb = carry-outs
382  r = c >> 15; // convert
383  r = c - r; // msbs to
384  r = c | r; // mask
385  #endif
386 
387  return r;
388  }
389 
390  static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
391  {
392  unsigned int r = 0;
393 
394  #if __CUDA_ARCH__ >= 300
395  asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
396  #else
397  unsigned int c;
398  asm("not.b32 %0, %0;" : "+r"(a));
399  c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
400  c = c & 0x80008000; // msb = carry-outs
401  r = c >> 15; // convert to bool
402  #endif
403 
404  return r;
405  }
406 
407  static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
408  {
409  unsigned int r, c;
410 
411  #if __CUDA_ARCH__ >= 300
412  r = vsetlt2(a, b);
413  c = r << 16; // convert bool
414  r = c - r; // into mask
415  #else
416  asm("not.b32 %0, %0;" : "+r"(a));
417  c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
418  c = c & 0x80008000; // msb = carry-outs
419  r = c >> 15; // convert
420  r = c - r; // msbs to
421  r = c | r; // mask
422  #endif
423 
424  return r;
425  }
426 
427  static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
428  {
429  unsigned int r = 0;
430 
431  #if __CUDA_ARCH__ >= 300
432  asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
433  #else
434  // inspired by Alan Mycroft's null-byte detection algorithm:
435  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
436  unsigned int c;
437  r = a ^ b; // 0x0000 if a == b
438  c = r | 0x80008000; // set msbs, to catch carry out
439  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
440  c = r | c; // msb = 1, if r was not 0x0000
441  c = c & 0x80008000; // extract msbs
442  r = c >> 15; // convert to bool
443  #endif
444 
445  return r;
446  }
447 
448  static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
449  {
450  unsigned int r, c;
451 
452  #if __CUDA_ARCH__ >= 300
453  r = vsetne2(a, b);
454  c = r << 16; // convert bool
455  r = c - r; // into mask
456  #else
457  // inspired by Alan Mycroft's null-byte detection algorithm:
458  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
459  r = a ^ b; // 0x0000 if a == b
460  c = r | 0x80008000; // set msbs, to catch carry out
461  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
462  c = r | c; // msb = 1, if r was not 0x0000
463  c = c & 0x80008000; // extract msbs
464  r = c >> 15; // convert
465  r = c - r; // msbs to
466  r = c | r; // mask
467  #endif
468 
469  return r;
470  }
471 
472  static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
473  {
474  unsigned int r = 0;
475 
476  #if __CUDA_ARCH__ >= 300
477  asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
478  #elif __CUDA_ARCH__ >= 200
479  asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
480  asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
481  #else
482  unsigned int s, t, u;
483  r = a & 0x0000ffff; // extract low halfword
484  s = b & 0x0000ffff; // extract low halfword
485  t = ::max(r, s); // maximum of low halfwords
486  r = a & 0xffff0000; // extract high halfword
487  s = b & 0xffff0000; // extract high halfword
488  u = ::max(r, s); // maximum of high halfwords
489  r = t | u; // combine halfword maximums
490  #endif
491 
492  return r;
493  }
494 
495  static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
496  {
497  unsigned int r = 0;
498 
499  #if __CUDA_ARCH__ >= 300
500  asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
501  #elif __CUDA_ARCH__ >= 200
502  asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
503  asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
504  #else
505  unsigned int s, t, u;
506  r = a & 0x0000ffff; // extract low halfword
507  s = b & 0x0000ffff; // extract low halfword
508  t = ::min(r, s); // minimum of low halfwords
509  r = a & 0xffff0000; // extract high halfword
510  s = b & 0xffff0000; // extract high halfword
511  u = ::min(r, s); // minimum of high halfwords
512  r = t | u; // combine halfword minimums
513  #endif
514 
515  return r;
516  }
517 
518  // 4
519 
520  static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
521  {
522  unsigned int r = 0;
523 
524  #if __CUDA_ARCH__ >= 300
525  asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
526  #elif __CUDA_ARCH__ >= 200
527  asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
528  asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
529  asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
530  asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
531  #else
532  unsigned int s, t;
533  s = a ^ b; // sum bits
534  r = a & 0x7f7f7f7f; // clear msbs
535  t = b & 0x7f7f7f7f; // clear msbs
536  s = s & 0x80808080; // msb sum bits
537  r = r + t; // add without msbs, record carry-out in msbs
538  r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
539  #endif /* __CUDA_ARCH__ >= 300 */
540 
541  return r;
542  }
543 
544  static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
545  {
546  unsigned int r = 0;
547 
548  #if __CUDA_ARCH__ >= 300
549  asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
550  #elif __CUDA_ARCH__ >= 200
551  asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
552  asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
553  asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
554  asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
555  #else
556  unsigned int s, t;
557  s = a ^ ~b; // inverted sum bits
558  r = a | 0x80808080; // set msbs
559  t = b & 0x7f7f7f7f; // clear msbs
560  s = s & 0x80808080; // inverted msb sum bits
561  r = r - t; // subtract w/o msbs, record inverted borrows in msb
562  r = r ^ s; // combine inverted msb sum bits and borrows
563  #endif
564 
565  return r;
566  }
567 
568  static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
569  {
570  unsigned int r, s;
571 
572  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
573  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
574  s = a ^ b;
575  r = a & b;
576  s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
577  s = s >> 1;
578  s = r + s;
579 
580  return s;
581  }
582 
583  static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
584  {
585  unsigned int r = 0;
586 
587  #if __CUDA_ARCH__ >= 300
588  asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
589  #else
590  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
591  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
592  unsigned int c;
593  c = a ^ b;
594  r = a | b;
595  c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
596  c = c >> 1;
597  r = r - c;
598  #endif
599 
600  return r;
601  }
602 
603  static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
604  {
605  unsigned int r = 0;
606 
607  #if __CUDA_ARCH__ >= 300
608  asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
609  #else
610  // inspired by Alan Mycroft's null-byte detection algorithm:
611  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
612  unsigned int c;
613  r = a ^ b; // 0x00 if a == b
614  c = r | 0x80808080; // set msbs, to catch carry out
615  r = r ^ c; // extract msbs, msb = 1 if r < 0x80
616  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
617  c = r & ~c; // msb = 1, if r was 0x00
618  r = c >> 7; // convert to bool
619  #endif
620 
621  return r;
622  }
623 
624  static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
625  {
626  unsigned int r, t;
627 
628  #if __CUDA_ARCH__ >= 300
629  r = vseteq4(a, b);
630  t = r << 8; // convert bool
631  r = t - r; // to mask
632  #else
633  // inspired by Alan Mycroft's null-byte detection algorithm:
634  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
635  t = a ^ b; // 0x00 if a == b
636  r = t | 0x80808080; // set msbs, to catch carry out
637  t = t ^ r; // extract msbs, msb = 1 if t < 0x80
638  r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
639  r = t & ~r; // msb = 1, if t was 0x00
640  t = r >> 7; // build mask
641  t = r - t; // from
642  r = t | r; // msbs
643  #endif
644 
645  return r;
646  }
647 
648  static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
649  {
650  unsigned int r = 0;
651 
652  #if __CUDA_ARCH__ >= 300
653  asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
654  #else
655  unsigned int c;
656  asm("not.b32 %0, %0;" : "+r"(a));
657  c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
658  c = c & 0x80808080; // msb = carry-outs
659  r = c >> 7; // convert to bool
660  #endif
661 
662  return r;
663  }
664 
665  static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
666  {
667  unsigned int r, c;
668 
669  #if __CUDA_ARCH__ >= 300
670  r = vsetle4(a, b);
671  c = r << 8; // convert bool
672  r = c - r; // to mask
673  #else
674  asm("not.b32 %0, %0;" : "+r"(a));
675  c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
676  c = c & 0x80808080; // msbs = carry-outs
677  r = c >> 7; // convert
678  r = c - r; // msbs to
679  r = c | r; // mask
680  #endif
681 
682  return r;
683  }
684 
685  static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
686  {
687  unsigned int r = 0;
688 
689  #if __CUDA_ARCH__ >= 300
690  asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
691  #else
692  unsigned int c;
693  asm("not.b32 %0, %0;" : "+r"(a));
694  c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
695  c = c & 0x80808080; // msb = carry-outs
696  r = c >> 7; // convert to bool
697  #endif
698 
699  return r;
700  }
701 
702  static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
703  {
704  unsigned int r, c;
705 
706  #if __CUDA_ARCH__ >= 300
707  r = vsetlt4(a, b);
708  c = r << 8; // convert bool
709  r = c - r; // to mask
710  #else
711  asm("not.b32 %0, %0;" : "+r"(a));
712  c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
713  c = c & 0x80808080; // msbs = carry-outs
714  r = c >> 7; // convert
715  r = c - r; // msbs to
716  r = c | r; // mask
717  #endif
718 
719  return r;
720  }
721 
722  static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
723  {
724  unsigned int r = 0;
725 
726  #if __CUDA_ARCH__ >= 300
727  asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
728  #else
729  unsigned int c;
730  asm("not.b32 %0, %0;" : "+r"(b));
731  c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
732  c = c & 0x80808080; // msb = carry-outs
733  r = c >> 7; // convert to bool
734  #endif
735 
736  return r;
737  }
738 
739  static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
740  {
741  unsigned int r, s;
742 
743  #if __CUDA_ARCH__ >= 300
744  r = vsetge4(a, b);
745  s = r << 8; // convert bool
746  r = s - r; // to mask
747  #else
748  asm ("not.b32 %0,%0;" : "+r"(b));
749  r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
750  r = r & 0x80808080; // msb = carry-outs
751  s = r >> 7; // build mask
752  s = r - s; // from
753  r = s | r; // msbs
754  #endif
755 
756  return r;
757  }
758 
759  static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
760  {
761  unsigned int r = 0;
762 
763  #if __CUDA_ARCH__ >= 300
764  asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
765  #else
766  unsigned int c;
767  asm("not.b32 %0, %0;" : "+r"(b));
768  c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
769  c = c & 0x80808080; // msb = carry-outs
770  r = c >> 7; // convert to bool
771  #endif
772 
773  return r;
774  }
775 
776  static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
777  {
778  unsigned int r, c;
779 
780  #if __CUDA_ARCH__ >= 300
781  r = vsetgt4(a, b);
782  c = r << 8; // convert bool
783  r = c - r; // to mask
784  #else
785  asm("not.b32 %0, %0;" : "+r"(b));
786  c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
787  c = c & 0x80808080; // msb = carry-outs
788  r = c >> 7; // convert
789  r = c - r; // msbs to
790  r = c | r; // mask
791  #endif
792 
793  return r;
794  }
795 
796  static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
797  {
798  unsigned int r = 0;
799 
800  #if __CUDA_ARCH__ >= 300
801  asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
802  #else
803  // inspired by Alan Mycroft's null-byte detection algorithm:
804  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
805  unsigned int c;
806  r = a ^ b; // 0x00 if a == b
807  c = r | 0x80808080; // set msbs, to catch carry out
808  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
809  c = r | c; // msb = 1, if r was not 0x00
810  c = c & 0x80808080; // extract msbs
811  r = c >> 7; // convert to bool
812  #endif
813 
814  return r;
815  }
816 
817  static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
818  {
819  unsigned int r, c;
820 
821  #if __CUDA_ARCH__ >= 300
822  r = vsetne4(a, b);
823  c = r << 8; // convert bool
824  r = c - r; // to mask
825  #else
826  // inspired by Alan Mycroft's null-byte detection algorithm:
827  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
828  r = a ^ b; // 0x00 if a == b
829  c = r | 0x80808080; // set msbs, to catch carry out
830  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
831  c = r | c; // msb = 1, if r was not 0x00
832  c = c & 0x80808080; // extract msbs
833  r = c >> 7; // convert
834  r = c - r; // msbs to
835  r = c | r; // mask
836  #endif
837 
838  return r;
839  }
840 
841  static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
842  {
843  unsigned int r = 0;
844 
845  #if __CUDA_ARCH__ >= 300
846  asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
847  #elif __CUDA_ARCH__ >= 200
848  asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
849  asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
850  asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
851  asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
852  #else
853  unsigned int s;
854  s = vcmpge4(a, b); // mask = 0xff if a >= b
855  r = a ^ b; //
856  s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
857  r = s ^ r; // select a when b >= a, else select b => min(a,b)
858  r = s - r; // |a - b| = max(a,b) - min(a,b);
859  #endif
860 
861  return r;
862  }
863 
864  static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
865  {
866  unsigned int r = 0;
867 
868  #if __CUDA_ARCH__ >= 300
869  asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
870  #elif __CUDA_ARCH__ >= 200
871  asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
872  asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
873  asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
874  asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
875  #else
876  unsigned int s;
877  s = vcmpge4(a, b); // mask = 0xff if a >= b
878  r = a & s; // select a when a >= b
879  s = b & ~s; // select b when a < b
880  r = r | s; // combine byte selections
881  #endif
882 
883  return r; // byte-wise unsigned maximum
884  }
885 
886  static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
887  {
888  unsigned int r = 0;
889 
890  #if __CUDA_ARCH__ >= 300
891  asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
892  #elif __CUDA_ARCH__ >= 200
893  asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
894  asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
895  asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
896  asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
897  #else
898  unsigned int s;
899  s = vcmpge4(b, a); // mask = 0xff if b >= a
900  r = a & s; // select a when b >= a
901  s = b & ~s; // select b when b < a
902  r = r | s; // combine byte selections
903  #endif
904 
905  return r;
906  }
907 }}}
908 
909 #endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
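The functions are ordinary __device__ inlines, so they can be called from any CUDA kernel that includes this header. The sketch below is an illustrative example rather than part of the header: the kernel name, pointer layout and launch configuration are hypothetical, it assumes the header is available on the include path as opencv2/gpu/device/simd_functions.hpp, and it assumes the input buffers are 4-byte aligned so that four 8-bit pixels can be loaded as one 32-bit word. Widths that are not a multiple of four would need a scalar tail loop.

 #include <opencv2/gpu/device/simd_functions.hpp>

 // Per-byte absolute difference of two packed 8-bit images,
 // four pixels per 32-bit word (n = number of words).
 __global__ void absdiffKernel(const unsigned int* a, const unsigned int* b,
                               unsigned int* dst, int n)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n)
         dst[i] = cv::gpu::device::vabsdiff4(a[i], b[i]); // |a - b| per byte
 }

On sm_30 and newer the call maps to the vabsdiff4 PTX instruction (hardware accelerated on Kepler, as noted in the header comment); on older devices the emulation path is used, so the same kernel source runs on every architecture the module supports.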