Cinder  0.8.6
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
MatrixAlgo.h
Go to the documentation of this file.
1 /*
2  Copyright (c) 2010, The Cinder Project: http://libcinder.org
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without modification, are permitted provided that
6  the following conditions are met:
7 
8  * Redistributions of source code must retain the above copyright notice, this list of conditions and
9  the following disclaimer.
10  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
11  the following disclaimer in the documentation and/or other materials provided with the distribution.
12 
13  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
14  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
15  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
16  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
17  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
18  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
19  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
20  POSSIBILITY OF SUCH DAMAGE.
21 */
22 
23 
24 #pragma once
25 
26 #include "cinder/Matrix22.h"
27 #include "cinder/Matrix33.h"
28 #include "cinder/Matrix44.h"
29 
30 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
31 # include <emmintrin.h>
32 # include <xmmintrin.h>
33 #endif
34 
35 namespace cinder {
36 
37 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
38 
// CINDER_ALIGN16_MATRIX44F( VAR ) declares a Matrix44f variable named VAR that
// is aligned on a 16-byte boundary. The alignment specifier is spelled
// differently per platform: a leading __declspec on CINDER_MSW and a trailing
// __attribute__ on CINDER_MAC.
#if defined( CINDER_MSW )
	#define CINDER_ALIGN16_MATRIX44F( VAR ) __declspec( align( 16 ) ) Matrix44f VAR
#elif defined( CINDER_MAC )
	#define CINDER_ALIGN16_MATRIX44F( VAR ) Matrix44f VAR __attribute__(( aligned( 16 ) ))
#endif
46 
47 inline Matrix44f SseMultiply( const Matrix44f& mat0, const Matrix44f& mat1 )
48 {
49  CINDER_ALIGN16_MATRIX44F( ret );
50 
51  const float* a = mat0.m;
52  const float* b = mat1.m;
53  float* res = ret.m;
54 
55  __m128 b_line, r_line, ab;
56  __m128 a_line_0, a_line_4, a_line_8, a_line_12;
57 
58  // NOTE: Keep the usage of i in. Removed it and
59  // used constants and it ran slower. I'm not
60  // sure why - so I'm just going with it at
61  // the moment.
62  int i;
63 
64  // Load in the rows for b
65  a_line_0 = _mm_load_ps( a + 0 );
66  a_line_4 = _mm_load_ps( a + 4 );
67  a_line_8 = _mm_load_ps( a + 8 );
68  a_line_12 = _mm_load_ps( a + 12 );
69 
70  i = 0;
71  //
72  b_line = _mm_set1_ps( *(b + i + 0) );
73  r_line = _mm_mul_ps( b_line, a_line_0 );
74  //
75  b_line = _mm_set1_ps( *(b + i + 1) );
76  ab = _mm_mul_ps( b_line, a_line_4 );
77  r_line = _mm_add_ps( ab, r_line );
78  //
79  b_line = _mm_set1_ps( *(b + i + 2) );
80  ab = _mm_mul_ps( b_line, a_line_8 );
81  r_line = _mm_add_ps( ab, r_line );
82  //
83  b_line = _mm_set1_ps( *(b + i + 3) );
84  ab = _mm_mul_ps( b_line, a_line_12 );
85  r_line = _mm_add_ps( ab, r_line );
86  //
87  _mm_store_ps( res + i, r_line );
88 
89  i = 4;
90  //
91  b_line = _mm_set1_ps( *(b + i + 0) );
92  r_line = _mm_mul_ps( b_line, a_line_0 );
93  //
94  b_line = _mm_set1_ps( *(b + i + 1) );
95  ab = _mm_mul_ps( b_line, a_line_4 );
96  r_line = _mm_add_ps( ab, r_line );
97  //
98  b_line = _mm_set1_ps( *(b + i + 2) );
99  ab = _mm_mul_ps( b_line, a_line_8 );
100  r_line = _mm_add_ps( ab, r_line );
101  //
102  b_line = _mm_set1_ps( *(b + i + 3) );
103  ab = _mm_mul_ps( b_line, a_line_12 );
104  r_line = _mm_add_ps( ab, r_line );
105  //
106  _mm_store_ps( res + i, r_line );
107 
108  i = 8;
109  //
110  b_line = _mm_set1_ps( *(b + i + 0) );
111  r_line = _mm_mul_ps( b_line, a_line_0 );
112  //
113  b_line = _mm_set1_ps( *(b + i + 1) );
114  ab = _mm_mul_ps( b_line, a_line_4 );
115  r_line = _mm_add_ps( ab, r_line );
116  //
117  b_line = _mm_set1_ps( *(b + i + 2) );
118  ab = _mm_mul_ps( b_line, a_line_8 );
119  r_line = _mm_add_ps( ab, r_line );
120  //
121  b_line = _mm_set1_ps( *(b + i + 3) );
122  ab = _mm_mul_ps( b_line, a_line_12 );
123  r_line = _mm_add_ps( ab, r_line );
124  //
125  _mm_store_ps( res + i, r_line );
126 
127  i = 12;
128  //
129  b_line = _mm_set1_ps( *(b + i + 0) );
130  r_line = _mm_mul_ps( b_line, a_line_0 );
131  //
132  b_line = _mm_set1_ps( *(b + i + 1) );
133  ab = _mm_mul_ps( b_line, a_line_4 );
134  r_line = _mm_add_ps( ab, r_line );
135  //
136  b_line = _mm_set1_ps( *(b + i + 2) );
137  ab = _mm_mul_ps( b_line, a_line_8 );
138  r_line = _mm_add_ps( ab, r_line );
139  //
140  b_line = _mm_set1_ps( *(b + i + 3) );
141  ab = _mm_mul_ps( b_line, a_line_12 );
142  r_line = _mm_add_ps( ab, r_line );
143  //
144  _mm_store_ps( res + i, r_line );
145 
146  return ret;
147 }
148 
149 #if defined( CINDER_MSW )
150 # pragma runtime_checks( "", off )
151 # pragma warning( push )
152 # pragma warning( disable:4700 )
153 #endif
154 
155 inline Matrix44f SseInvert( const Matrix44f& mat )
156 {
157  CINDER_ALIGN16_MATRIX44F( ret );
158 
159  const float* src = mat.m;
160  float* dst = ret.m;
161 
162  __m128 minor0, minor1, minor2, minor3;
163  __m128 row0, row1, row2, row3;
164  __m128 det, tmp1;
165  tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src ) ), ( __m64* )( src + 4 ) );
166  row1 = _mm_loadh_pi( _mm_loadl_pi( row1, ( __m64* )( src + 8 ) ), ( __m64* )( src + 12 ) );
167  row0 = _mm_shuffle_ps( tmp1, row1, 0x88 );
168  row1 = _mm_shuffle_ps( row1, tmp1, 0xDD );
169  tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src + 2 ) ), ( __m64* )( src + 6 ) );
170  row3 = _mm_loadh_pi( _mm_loadl_pi( row3, ( __m64* )( src + 10 ) ), ( __m64* )( src + 14 ) );
171  row2 = _mm_shuffle_ps( tmp1, row3, 0x88 );
172  row3 = _mm_shuffle_ps( row3, tmp1, 0xDD );
173  // -----------------------------------------------
174  tmp1 = _mm_mul_ps( row2, row3 );
175  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
176  minor0 = _mm_mul_ps( row1, tmp1 );
177  minor1 = _mm_mul_ps( row0, tmp1 );
178  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
179  minor0 = _mm_sub_ps( _mm_mul_ps( row1, tmp1 ), minor0 );
180  minor1 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor1 );
181  minor1 = _mm_shuffle_ps( minor1, minor1, 0x4E );
182  // -----------------------------------------------
183  tmp1 = _mm_mul_ps( row1, row2 );
184  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
185  minor0 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor0 );
186  minor3 = _mm_mul_ps( row0, tmp1 );
187  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
188  minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row3, tmp1 ) );
189  minor3 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor3 );
190  minor3 = _mm_shuffle_ps( minor3, minor3, 0x4E );
191  // -----------------------------------------------
192  tmp1 = _mm_mul_ps( _mm_shuffle_ps( row1, row1, 0x4E ), row3 );
193  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
194  row2 = _mm_shuffle_ps( row2, row2, 0x4E );
195  minor0 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor0 );
196  minor2 = _mm_mul_ps( row0, tmp1 );
197  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
198  minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row2, tmp1 ) );
199  minor2 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor2 );
200  minor2 = _mm_shuffle_ps( minor2, minor2, 0x4E );
201  // -----------------------------------------------
202  tmp1 = _mm_mul_ps( row0, row1 );
203  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
204  minor2 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
205  minor3 = _mm_sub_ps( _mm_mul_ps( row2, tmp1 ), minor3 );
206  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
207  minor2 = _mm_sub_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
208  minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row2, tmp1 ) );
209  // -----------------------------------------------
210  tmp1 = _mm_mul_ps( row0, row3 );
211  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
212  minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row2, tmp1 ) );
213  minor2 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor2 );
214  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
215  minor1 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor1 );
216  minor2 = _mm_sub_ps( minor2, _mm_mul_ps( row1, tmp1 ) );
217  // -----------------------------------------------
218  tmp1 = _mm_mul_ps( row0, row2 );
219  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
220  minor1 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor1 );
221  minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row1, tmp1 ) );
222  tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
223  minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row3, tmp1 ) );
224  minor3 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor3 );
225  // -----------------------------------------------
226  det = _mm_mul_ps( row0, minor0 );
227  det = _mm_add_ps( _mm_shuffle_ps( det, det, 0x4E ), det );
228  det = _mm_add_ss( _mm_shuffle_ps( det, det, 0xB1 ), det );
229  tmp1 = _mm_rcp_ss( det );
230  det = _mm_sub_ss( _mm_add_ss( tmp1, tmp1 ), _mm_mul_ss( det, _mm_mul_ss( tmp1, tmp1 ) ) );
231  det = _mm_shuffle_ps( det, det, 0x00 );
232 
233  minor0 = _mm_mul_ps( det, minor0 );
234  _mm_storel_pi( ( __m64* )( dst ), minor0 );
235  _mm_storeh_pi( ( __m64* )( dst + 2 ), minor0 );
236 
237  minor1 = _mm_mul_ps( det, minor1 );
238  _mm_storel_pi( ( __m64* )( dst + 4 ), minor1 );
239  _mm_storeh_pi( ( __m64* )( dst + 6 ), minor1 );
240 
241  minor2 = _mm_mul_ps( det, minor2 );
242  _mm_storel_pi( ( __m64* )( dst + 8 ), minor2 );
243  _mm_storeh_pi( ( __m64* )( dst + 10 ), minor2 );
244 
245  minor3 = _mm_mul_ps( det, minor3 );
246  _mm_storel_pi( ( __m64* )( dst + 12 ), minor3 );
247  _mm_storeh_pi( ( __m64* )( dst + 14 ), minor3 );
248 
249  return ret;
250 }
251 
252 #if defined( CINDER_MSW )
253 # pragma warning( pop )
254 # pragma runtime_checks( "", restore )
255 #endif
256 
257 #endif // #if defined( CINDER_MSW ) || defined( CINDER_MAC )
258 
259 } // namespace cinder
GLuint src
Definition: GLee.h:10873
GLuint res
Definition: GLee.h:10843
GLboolean GLboolean GLboolean b
Definition: GLee.h:2964
Matrix44< float > Matrix44f
Definition: Matrix44.h:1314
GLboolean GLboolean GLboolean GLboolean a
Definition: GLee.h:2964
GLuint dst
Definition: GLee.h:10536