30 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
31 # include <emmintrin.h>
32 # include <xmmintrin.h>
37 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
// CINDER_ALIGN16_MATRIX44F( VAR ) declares a Matrix44f variable named VAR with
// a 16-byte alignment request. Only the spelling of the alignment specifier
// differs between the two platform branches below.
// NOTE(review): the 16-byte alignment is presumably needed because such
// variables are later written with aligned SSE stores - confirm at the use sites.
39 #if defined( CINDER_MSW )
// CINDER_MSW: alignment specifier goes before the type, via __declspec( align( 16 ) ).
40 #define CINDER_ALIGN16_MATRIX44F( VAR ) \
41 __declspec( align( 16 ) ) Matrix44f VAR
42 #elif defined( CINDER_MAC )
// CINDER_MAC: alignment specifier goes after the declarator, via __attribute__ aligned.
43 #define CINDER_ALIGN16_MATRIX44F( VAR ) \
44 Matrix44f VAR __attribute__ ( ( aligned ( 16 ) ) )
// Body of the SSE 4x4 float matrix multiply. The enclosing function header and
// the declarations/assignments of `res` and `i` are not part of this listing.
// NOTE(review): `res` presumably points at ret.m and `i` presumably takes the
// values 0, 4, 8, 12 before the four groups below - confirm against the full file.

// 16-byte aligned result matrix.
49 CINDER_ALIGN16_MATRIX44F( ret );
// Raw float views of the two operands.
51 const float*
a = mat0.m;
52 const float*
b = mat1.m;
// b_line: one element of b broadcast to all four lanes.
// r_line: running sum for the 4-float line being produced.
// ab:     scratch for the current product.
55 __m128 b_line, r_line, ab;
56 __m128 a_line_0, a_line_4, a_line_8, a_line_12;
// Load the four consecutive 4-float lines of a once; they are reused by every
// group below. _mm_load_ps is the aligned load, so `a` must be 16-byte aligned.
65 a_line_0 = _mm_load_ps( a + 0 );
66 a_line_4 = _mm_load_ps( a + 4 );
67 a_line_8 = _mm_load_ps( a + 8 );
68 a_line_12 = _mm_load_ps( a + 12 );
// Group 1. Lane-wise:
//   r_line = b[i+0]*a_line_0 + b[i+1]*a_line_4 + b[i+2]*a_line_8 + b[i+3]*a_line_12
// The first term initialises r_line; the remaining three are accumulated into it.
72 b_line = _mm_set1_ps( *(b + i + 0) );
73 r_line = _mm_mul_ps( b_line, a_line_0 );
75 b_line = _mm_set1_ps( *(b + i + 1) );
76 ab = _mm_mul_ps( b_line, a_line_4 );
77 r_line = _mm_add_ps( ab, r_line );
79 b_line = _mm_set1_ps( *(b + i + 2) );
80 ab = _mm_mul_ps( b_line, a_line_8 );
81 r_line = _mm_add_ps( ab, r_line );
83 b_line = _mm_set1_ps( *(b + i + 3) );
84 ab = _mm_mul_ps( b_line, a_line_12 );
85 r_line = _mm_add_ps( ab, r_line );
// Aligned store of the finished line at res + i (requires 16-byte alignment).
87 _mm_store_ps( res + i, r_line );
// Group 2: same computation as group 1, using the current value of i.
91 b_line = _mm_set1_ps( *(b + i + 0) );
92 r_line = _mm_mul_ps( b_line, a_line_0 );
94 b_line = _mm_set1_ps( *(b + i + 1) );
95 ab = _mm_mul_ps( b_line, a_line_4 );
96 r_line = _mm_add_ps( ab, r_line );
98 b_line = _mm_set1_ps( *(b + i + 2) );
99 ab = _mm_mul_ps( b_line, a_line_8 );
100 r_line = _mm_add_ps( ab, r_line );
102 b_line = _mm_set1_ps( *(b + i + 3) );
103 ab = _mm_mul_ps( b_line, a_line_12 );
104 r_line = _mm_add_ps( ab, r_line );
106 _mm_store_ps( res + i, r_line );
// Group 3: same computation as group 1, using the current value of i.
110 b_line = _mm_set1_ps( *(b + i + 0) );
111 r_line = _mm_mul_ps( b_line, a_line_0 );
113 b_line = _mm_set1_ps( *(b + i + 1) );
114 ab = _mm_mul_ps( b_line, a_line_4 );
115 r_line = _mm_add_ps( ab, r_line );
117 b_line = _mm_set1_ps( *(b + i + 2) );
118 ab = _mm_mul_ps( b_line, a_line_8 );
119 r_line = _mm_add_ps( ab, r_line );
121 b_line = _mm_set1_ps( *(b + i + 3) );
122 ab = _mm_mul_ps( b_line, a_line_12 );
123 r_line = _mm_add_ps( ab, r_line );
125 _mm_store_ps( res + i, r_line );
// Group 4: same computation as group 1, using the current value of i.
129 b_line = _mm_set1_ps( *(b + i + 0) );
130 r_line = _mm_mul_ps( b_line, a_line_0 );
132 b_line = _mm_set1_ps( *(b + i + 1) );
133 ab = _mm_mul_ps( b_line, a_line_4 );
134 r_line = _mm_add_ps( ab, r_line );
136 b_line = _mm_set1_ps( *(b + i + 2) );
137 ab = _mm_mul_ps( b_line, a_line_8 );
138 r_line = _mm_add_ps( ab, r_line );
140 b_line = _mm_set1_ps( *(b + i + 3) );
141 ab = _mm_mul_ps( b_line, a_line_12 );
142 r_line = _mm_add_ps( ab, r_line );
144 _mm_store_ps( res + i, r_line );
// Body of the SSE 4x4 float matrix inverse, wrapped in MSVC-only pragmas.
// The enclosing function header and the declarations of `dst`, `tmp1` and `det`
// are not part of this listing.
// NOTE(review): the instruction sequence looks like Intel's published SSE 4x4
// inverse (application note AP-928) - confirm before relying on that.
149 #if defined( CINDER_MSW )
// The code below deliberately passes not-yet-assigned __m128 values to
// _mm_loadl_pi, so MSVC run-time checks are switched off and warning C4700
// (uninitialized local variable used) is silenced for this region.
150 # pragma runtime_checks( "", off )
151 # pragma warning( push )
152 # pragma warning( disable:4700 )
// 16-byte aligned result matrix.
157 CINDER_ALIGN16_MATRIX44F( ret );
159 const float*
src = mat.m;
162 __m128 minor0, minor1, minor2, minor3;
163 __m128 row0, row1, row2, row3;
// Gather src with a stride of 4 floats into row0..row3. Each loadl/loadh pair
// overwrites both 64-bit halves, so the prior content of its first argument
// never reaches the result.
// 0x88 picks elements 0 and 2 of each operand; 0xDD picks elements 1 and 3.
// Result: row0 = src[0,4,8,12],  row1 = src[9,13,1,5],
//         row2 = src[2,6,10,14], row3 = src[11,15,3,7]
// (row1 and row3 come out with their 64-bit halves swapped).
165 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src ) ), ( __m64* )( src + 4 ) );
166 row1 = _mm_loadh_pi( _mm_loadl_pi( row1, ( __m64* )( src + 8 ) ), ( __m64* )( src + 12 ) );
167 row0 = _mm_shuffle_ps( tmp1, row1, 0x88 );
168 row1 = _mm_shuffle_ps( row1, tmp1, 0xDD );
169 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src + 2 ) ), ( __m64* )( src + 6 ) );
170 row3 = _mm_loadh_pi( _mm_loadl_pi( row3, ( __m64* )( src + 10 ) ), ( __m64* )( src + 14 ) );
171 row2 = _mm_shuffle_ps( tmp1, row3, 0x88 );
172 row3 = _mm_shuffle_ps( row3, tmp1, 0xDD );
// In the sections below, shuffle 0xB1 swaps the elements within each adjacent
// pair and shuffle 0x4E swaps the two 64-bit halves of a register.
// minor0..minor3 accumulate signed products of row pairs.
// NOTE(review): presumably these are the cofactor rows of the matrix - confirm.
// row2 * row3 terms: initialise minor0 and minor1.
174 tmp1 = _mm_mul_ps( row2, row3 );
175 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
176 minor0 = _mm_mul_ps( row1, tmp1 );
177 minor1 = _mm_mul_ps( row0, tmp1 );
178 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
179 minor0 = _mm_sub_ps( _mm_mul_ps( row1, tmp1 ), minor0 );
180 minor1 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor1 );
181 minor1 = _mm_shuffle_ps( minor1, minor1, 0x4E );
// row1 * row2 terms: update minor0, initialise minor3.
183 tmp1 = _mm_mul_ps( row1, row2 );
184 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
185 minor0 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor0 );
186 minor3 = _mm_mul_ps( row0, tmp1 );
187 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
188 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row3, tmp1 ) );
189 minor3 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor3 );
190 minor3 = _mm_shuffle_ps( minor3, minor3, 0x4E );
// (row1 with halves swapped) * row3 terms: update minor0, initialise minor2.
// row2 has its halves swapped in place here and stays that way from now on.
192 tmp1 = _mm_mul_ps( _mm_shuffle_ps( row1, row1, 0x4E ), row3 );
193 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
194 row2 = _mm_shuffle_ps( row2, row2, 0x4E );
195 minor0 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor0 );
196 minor2 = _mm_mul_ps( row0, tmp1 );
197 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
198 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row2, tmp1 ) );
199 minor2 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor2 );
200 minor2 = _mm_shuffle_ps( minor2, minor2, 0x4E );
// row0 * row1 terms: update minor2 and minor3.
202 tmp1 = _mm_mul_ps( row0, row1 );
203 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
204 minor2 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
205 minor3 = _mm_sub_ps( _mm_mul_ps( row2, tmp1 ), minor3 );
206 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
207 minor2 = _mm_sub_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
208 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row2, tmp1 ) );
// row0 * row3 terms: update minor1 and minor2.
210 tmp1 = _mm_mul_ps( row0, row3 );
211 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
212 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row2, tmp1 ) );
213 minor2 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor2 );
214 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
215 minor1 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor1 );
216 minor2 = _mm_sub_ps( minor2, _mm_mul_ps( row1, tmp1 ) );
// row0 * row2 terms: update minor1 and minor3.
218 tmp1 = _mm_mul_ps( row0, row2 );
219 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
220 minor1 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor1 );
221 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row1, tmp1 ) );
222 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
223 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row3, tmp1 ) );
224 minor3 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor3 );
// Horizontal sum of row0 * minor0: after the two shuffle/add steps, element 0
// of det holds the sum of all four products.
226 det = _mm_mul_ps( row0, minor0 );
227 det = _mm_add_ps( _mm_shuffle_ps( det, det, 0x4E ), det );
228 det = _mm_add_ss( _mm_shuffle_ps( det, det, 0xB1 ), det );
// Approximate reciprocal of element 0, then one Newton-Raphson refinement:
// 2*t - det*t*t, with t = rcp( det ).
// NOTE(review): no zero/singular check is visible in these lines.
229 tmp1 = _mm_rcp_ss( det );
230 det = _mm_sub_ss( _mm_add_ss( tmp1, tmp1 ), _mm_mul_ss( det, _mm_mul_ss( tmp1, tmp1 ) ) );
// Broadcast the refined reciprocal from element 0 to all four lanes.
231 det = _mm_shuffle_ps( det, det, 0x00 );
// Scale each minor by the reciprocal and write it to dst as two 64-bit halves
// (4 floats per minor, at dst + 0, 4, 8, 12).
233 minor0 = _mm_mul_ps( det, minor0 );
234 _mm_storel_pi( ( __m64* )( dst ), minor0 );
235 _mm_storeh_pi( ( __m64* )( dst + 2 ), minor0 );
237 minor1 = _mm_mul_ps( det, minor1 );
238 _mm_storel_pi( ( __m64* )( dst + 4 ), minor1 );
239 _mm_storeh_pi( ( __m64* )( dst + 6 ), minor1 );
241 minor2 = _mm_mul_ps( det, minor2 );
242 _mm_storel_pi( ( __m64* )( dst + 8 ), minor2 );
243 _mm_storeh_pi( ( __m64* )( dst + 10 ), minor2 );
245 minor3 = _mm_mul_ps( det, minor3 );
246 _mm_storel_pi( ( __m64* )( dst + 12 ), minor3 );
247 _mm_storeh_pi( ( __m64* )( dst + 14 ), minor3 );
// Restore the MSVC warning state and run-time check settings changed above.
252 #if defined( CINDER_MSW )
253 # pragma warning( pop )
254 # pragma runtime_checks( "", restore )
257 #endif // #if defined( CINDER_MSW ) || defined( CINDER_MAC )
GLuint src
Definition: GLee.h:10873
GLuint res
Definition: GLee.h:10843
GLboolean GLboolean GLboolean b
Definition: GLee.h:2964
Matrix44< float > Matrix44f
Definition: Matrix44.h:1314
GLboolean GLboolean GLboolean GLboolean a
Definition: GLee.h:2964
GLuint dst
Definition: GLee.h:10536