00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #pragma once
00025
00026 #include "cinder/Matrix22.h"
00027 #include "cinder/Matrix33.h"
00028 #include "cinder/Matrix44.h"
00029
00030 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00031 # include <emmintrin.h>
00032 # include <xmmintrin.h>
00033 #endif
00034
00035 namespace cinder {
00036
00037 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00038
// CINDER_ALIGN16_MATRIX44F( VAR ) declares a Matrix44f variable named VAR that
// is aligned on a 16-byte boundary, spelled with whichever alignment syntax
// the current compiler understands.
#if defined( CINDER_MSW )
	// Visual C++: the alignment specifier precedes the declaration.
	#define CINDER_ALIGN16_MATRIX44F( VAR ) __declspec( align( 16 ) ) Matrix44f VAR
#elif defined( CINDER_MAC )
	// GCC-style attribute: the alignment specifier follows the declarator.
	#define CINDER_ALIGN16_MATRIX44F( VAR ) Matrix44f VAR __attribute__(( aligned( 16 ) ))
#endif
00046
00047 inline Matrix44f SseMultiply( const Matrix44f& mat0, const Matrix44f& mat1 )
00048 {
00049 CINDER_ALIGN16_MATRIX44F( ret );
00050
00051 const float* a = mat0.m;
00052 const float* b = mat1.m;
00053 float* res = ret.m;
00054
00055 __m128 b_line, r_line, ab;
00056 __m128 a_line_0, a_line_4, a_line_8, a_line_12;
00057
00058
00059
00060
00061
00062 int i;
00063
00064
00065 a_line_0 = _mm_load_ps( a + 0 );
00066 a_line_4 = _mm_load_ps( a + 4 );
00067 a_line_8 = _mm_load_ps( a + 8 );
00068 a_line_12 = _mm_load_ps( a + 12 );
00069
00070 i = 0;
00071
00072 b_line = _mm_set1_ps( *(b + i + 0) );
00073 r_line = _mm_mul_ps( b_line, a_line_0 );
00074
00075 b_line = _mm_set1_ps( *(b + i + 1) );
00076 ab = _mm_mul_ps( b_line, a_line_4 );
00077 r_line = _mm_add_ps( ab, r_line );
00078
00079 b_line = _mm_set1_ps( *(b + i + 2) );
00080 ab = _mm_mul_ps( b_line, a_line_8 );
00081 r_line = _mm_add_ps( ab, r_line );
00082
00083 b_line = _mm_set1_ps( *(b + i + 3) );
00084 ab = _mm_mul_ps( b_line, a_line_12 );
00085 r_line = _mm_add_ps( ab, r_line );
00086
00087 _mm_store_ps( res + i, r_line );
00088
00089 i = 4;
00090
00091 b_line = _mm_set1_ps( *(b + i + 0) );
00092 r_line = _mm_mul_ps( b_line, a_line_0 );
00093
00094 b_line = _mm_set1_ps( *(b + i + 1) );
00095 ab = _mm_mul_ps( b_line, a_line_4 );
00096 r_line = _mm_add_ps( ab, r_line );
00097
00098 b_line = _mm_set1_ps( *(b + i + 2) );
00099 ab = _mm_mul_ps( b_line, a_line_8 );
00100 r_line = _mm_add_ps( ab, r_line );
00101
00102 b_line = _mm_set1_ps( *(b + i + 3) );
00103 ab = _mm_mul_ps( b_line, a_line_12 );
00104 r_line = _mm_add_ps( ab, r_line );
00105
00106 _mm_store_ps( res + i, r_line );
00107
00108 i = 8;
00109
00110 b_line = _mm_set1_ps( *(b + i + 0) );
00111 r_line = _mm_mul_ps( b_line, a_line_0 );
00112
00113 b_line = _mm_set1_ps( *(b + i + 1) );
00114 ab = _mm_mul_ps( b_line, a_line_4 );
00115 r_line = _mm_add_ps( ab, r_line );
00116
00117 b_line = _mm_set1_ps( *(b + i + 2) );
00118 ab = _mm_mul_ps( b_line, a_line_8 );
00119 r_line = _mm_add_ps( ab, r_line );
00120
00121 b_line = _mm_set1_ps( *(b + i + 3) );
00122 ab = _mm_mul_ps( b_line, a_line_12 );
00123 r_line = _mm_add_ps( ab, r_line );
00124
00125 _mm_store_ps( res + i, r_line );
00126
00127 i = 12;
00128
00129 b_line = _mm_set1_ps( *(b + i + 0) );
00130 r_line = _mm_mul_ps( b_line, a_line_0 );
00131
00132 b_line = _mm_set1_ps( *(b + i + 1) );
00133 ab = _mm_mul_ps( b_line, a_line_4 );
00134 r_line = _mm_add_ps( ab, r_line );
00135
00136 b_line = _mm_set1_ps( *(b + i + 2) );
00137 ab = _mm_mul_ps( b_line, a_line_8 );
00138 r_line = _mm_add_ps( ab, r_line );
00139
00140 b_line = _mm_set1_ps( *(b + i + 3) );
00141 ab = _mm_mul_ps( b_line, a_line_12 );
00142 r_line = _mm_add_ps( ab, r_line );
00143
00144 _mm_store_ps( res + i, r_line );
00145
00146 return ret;
00147 }
00148
00149 #if defined( CINDER_MSW )
00150 # pragma runtime_checks( "", off )
00151 # pragma warning( push )
00152 # pragma warning( disable:4700 )
00153 #endif
00154
00155 inline Matrix44f SseInvert( const Matrix44f& mat )
00156 {
00157 CINDER_ALIGN16_MATRIX44F( ret );
00158
00159 const float* src = mat.m;
00160 float* dst = ret.m;
00161
00162 __m128 minor0, minor1, minor2, minor3;
00163 __m128 row0, row1, row2, row3;
00164 __m128 det, tmp1;
00165 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src ) ), ( __m64* )( src + 4 ) );
00166 row1 = _mm_loadh_pi( _mm_loadl_pi( row1, ( __m64* )( src + 8 ) ), ( __m64* )( src + 12 ) );
00167 row0 = _mm_shuffle_ps( tmp1, row1, 0x88 );
00168 row1 = _mm_shuffle_ps( row1, tmp1, 0xDD );
00169 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src + 2 ) ), ( __m64* )( src + 6 ) );
00170 row3 = _mm_loadh_pi( _mm_loadl_pi( row3, ( __m64* )( src + 10 ) ), ( __m64* )( src + 14 ) );
00171 row2 = _mm_shuffle_ps( tmp1, row3, 0x88 );
00172 row3 = _mm_shuffle_ps( row3, tmp1, 0xDD );
00173
00174 tmp1 = _mm_mul_ps( row2, row3 );
00175 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00176 minor0 = _mm_mul_ps( row1, tmp1 );
00177 minor1 = _mm_mul_ps( row0, tmp1 );
00178 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00179 minor0 = _mm_sub_ps( _mm_mul_ps( row1, tmp1 ), minor0 );
00180 minor1 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor1 );
00181 minor1 = _mm_shuffle_ps( minor1, minor1, 0x4E );
00182
00183 tmp1 = _mm_mul_ps( row1, row2 );
00184 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00185 minor0 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor0 );
00186 minor3 = _mm_mul_ps( row0, tmp1 );
00187 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00188 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row3, tmp1 ) );
00189 minor3 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor3 );
00190 minor3 = _mm_shuffle_ps( minor3, minor3, 0x4E );
00191
00192 tmp1 = _mm_mul_ps( _mm_shuffle_ps( row1, row1, 0x4E ), row3 );
00193 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00194 row2 = _mm_shuffle_ps( row2, row2, 0x4E );
00195 minor0 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor0 );
00196 minor2 = _mm_mul_ps( row0, tmp1 );
00197 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00198 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row2, tmp1 ) );
00199 minor2 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor2 );
00200 minor2 = _mm_shuffle_ps( minor2, minor2, 0x4E );
00201
00202 tmp1 = _mm_mul_ps( row0, row1 );
00203 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00204 minor2 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
00205 minor3 = _mm_sub_ps( _mm_mul_ps( row2, tmp1 ), minor3 );
00206 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00207 minor2 = _mm_sub_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
00208 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row2, tmp1 ) );
00209
00210 tmp1 = _mm_mul_ps( row0, row3 );
00211 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00212 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row2, tmp1 ) );
00213 minor2 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor2 );
00214 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00215 minor1 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor1 );
00216 minor2 = _mm_sub_ps( minor2, _mm_mul_ps( row1, tmp1 ) );
00217
00218 tmp1 = _mm_mul_ps( row0, row2 );
00219 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00220 minor1 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor1 );
00221 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row1, tmp1 ) );
00222 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00223 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row3, tmp1 ) );
00224 minor3 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor3 );
00225
00226 det = _mm_mul_ps( row0, minor0 );
00227 det = _mm_add_ps( _mm_shuffle_ps( det, det, 0x4E ), det );
00228 det = _mm_add_ss( _mm_shuffle_ps( det, det, 0xB1 ), det );
00229 tmp1 = _mm_rcp_ss( det );
00230 det = _mm_sub_ss( _mm_add_ss( tmp1, tmp1 ), _mm_mul_ss( det, _mm_mul_ss( tmp1, tmp1 ) ) );
00231 det = _mm_shuffle_ps( det, det, 0x00 );
00232
00233 minor0 = _mm_mul_ps( det, minor0 );
00234 _mm_storel_pi( ( __m64* )( dst ), minor0 );
00235 _mm_storeh_pi( ( __m64* )( dst + 2 ), minor0 );
00236
00237 minor1 = _mm_mul_ps( det, minor1 );
00238 _mm_storel_pi( ( __m64* )( dst + 4 ), minor1 );
00239 _mm_storeh_pi( ( __m64* )( dst + 6 ), minor1 );
00240
00241 minor2 = _mm_mul_ps( det, minor2 );
00242 _mm_storel_pi( ( __m64* )( dst + 8 ), minor2 );
00243 _mm_storeh_pi( ( __m64* )( dst + 10 ), minor2 );
00244
00245 minor3 = _mm_mul_ps( det, minor3 );
00246 _mm_storel_pi( ( __m64* )( dst + 12 ), minor3 );
00247 _mm_storeh_pi( ( __m64* )( dst + 14 ), minor3 );
00248
00249 return ret;
00250 }
00251
00252 #if defined( CINDER_MSW )
00253 # pragma warning( pop )
00254 # pragma runtime_checks( "", restore )
00255 #endif
00256
00257 #endif // #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00258
00259 }