00001 /* 00002 Copyright (c) 2010, The Cinder Project: http://libcinder.org 00003 All rights reserved. 00004 00005 Redistribution and use in source and binary forms, with or without modification, are permitted provided that 00006 the following conditions are met: 00007 00008 * Redistributions of source code must retain the above copyright notice, this list of conditions and 00009 the following disclaimer. 00010 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and 00011 the following disclaimer in the documentation and/or other materials provided with the distribution. 00012 00013 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 00014 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 00015 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 00016 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00017 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00018 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00019 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00020 POSSIBILITY OF SUCH DAMAGE. 00021 */ 00022 00023 00024 #pragma once 00025 00026 #include "cinder/Matrix22.h" 00027 #include "cinder/Matrix33.h" 00028 #include "cinder/Matrix44.h" 00029 00030 #if defined( CINDER_MSW ) || defined( CINDER_MAC ) 00031 # include <emmintrin.h> 00032 # include <xmmintrin.h> 00033 #endif 00034 00035 namespace cinder { 00036 00037 #if defined( CINDER_MSW ) || defined( CINDER_MAC ) 00038 00039 #if defined( CINDER_MSW ) 00040 #define CINDER_ALIGN16_MATRIX44F( VAR ) \ 00041 __declspec( align( 16 ) ) Matrix44f VAR 00042 #elif defined( CINDER_MAC ) 00043 #define CINDER_ALIGN16_MATRIX44F( VAR ) \ 00044 Matrix44f VAR __attribute__ ( ( aligned ( 16 ) ) ) 00045 #endif 00046 00047 inline Matrix44f SseMultiply( const Matrix44f& mat0, const Matrix44f& mat1 ) 00048 { 00049 CINDER_ALIGN16_MATRIX44F( ret ); 00050 00051 const float* a = mat0.m; 00052 const float* b = mat1.m; 00053 float* res = ret.m; 00054 00055 __m128 b_line, r_line, ab; 00056 __m128 a_line_0, a_line_4, a_line_8, a_line_12; 00057 00058 // NOTE: Keep the usage of i in. Removed it and 00059 // used constants and it ran slower. I'm not 00060 // sure why - so I'm just going with it at 00061 // the moment. 00062 int i; 00063 00064 // Load in the rows for b 00065 a_line_0 = _mm_load_ps( a + 0 ); 00066 a_line_4 = _mm_load_ps( a + 4 ); 00067 a_line_8 = _mm_load_ps( a + 8 ); 00068 a_line_12 = _mm_load_ps( a + 12 ); 00069 00070 i = 0; 00071 // 00072 b_line = _mm_set1_ps( *(b + i + 0) ); 00073 r_line = _mm_mul_ps( b_line, a_line_0 ); 00074 // 00075 b_line = _mm_set1_ps( *(b + i + 1) ); 00076 ab = _mm_mul_ps( b_line, a_line_4 ); 00077 r_line = _mm_add_ps( ab, r_line ); 00078 // 00079 b_line = _mm_set1_ps( *(b + i + 2) ); 00080 ab = _mm_mul_ps( b_line, a_line_8 ); 00081 r_line = _mm_add_ps( ab, r_line ); 00082 // 00083 b_line = _mm_set1_ps( *(b + i + 3) ); 00084 ab = _mm_mul_ps( b_line, a_line_12 ); 00085 r_line = _mm_add_ps( ab, r_line ); 00086 // 00087 _mm_store_ps( res + i, r_line ); 00088 00089 i = 4; 00090 // 00091 b_line = _mm_set1_ps( *(b + i + 0) ); 00092 r_line = _mm_mul_ps( b_line, a_line_0 ); 00093 // 00094 b_line = _mm_set1_ps( *(b + i + 1) ); 00095 ab = _mm_mul_ps( b_line, a_line_4 ); 00096 r_line = _mm_add_ps( ab, r_line ); 00097 // 00098 b_line = _mm_set1_ps( *(b + i + 2) ); 00099 ab = _mm_mul_ps( b_line, a_line_8 ); 00100 r_line = _mm_add_ps( ab, r_line ); 00101 // 00102 b_line = _mm_set1_ps( *(b + i + 3) ); 00103 ab = _mm_mul_ps( b_line, a_line_12 ); 00104 r_line = _mm_add_ps( ab, r_line ); 00105 // 00106 _mm_store_ps( res + i, r_line ); 00107 00108 i = 8; 00109 // 00110 b_line = _mm_set1_ps( *(b + i + 0) ); 00111 r_line = _mm_mul_ps( b_line, a_line_0 ); 00112 // 00113 b_line = _mm_set1_ps( *(b + i + 1) ); 00114 ab = _mm_mul_ps( b_line, a_line_4 ); 00115 r_line = _mm_add_ps( ab, r_line ); 00116 // 00117 b_line = _mm_set1_ps( *(b + i + 2) ); 00118 ab = _mm_mul_ps( b_line, a_line_8 ); 00119 r_line = _mm_add_ps( ab, r_line ); 00120 // 00121 b_line = _mm_set1_ps( *(b + i + 3) ); 00122 ab = _mm_mul_ps( b_line, a_line_12 ); 00123 r_line = _mm_add_ps( ab, r_line ); 00124 // 00125 _mm_store_ps( res + i, r_line ); 00126 00127 i = 12; 00128 // 00129 b_line = _mm_set1_ps( *(b + i + 0) ); 00130 r_line = _mm_mul_ps( b_line, a_line_0 ); 00131 // 00132 b_line = _mm_set1_ps( *(b + i + 1) ); 00133 ab = _mm_mul_ps( b_line, a_line_4 ); 00134 r_line = _mm_add_ps( ab, r_line ); 00135 // 00136 b_line = _mm_set1_ps( *(b + i + 2) ); 00137 ab = _mm_mul_ps( b_line, a_line_8 ); 00138 r_line = _mm_add_ps( ab, r_line ); 00139 // 00140 b_line = _mm_set1_ps( *(b + i + 3) ); 00141 ab = _mm_mul_ps( b_line, a_line_12 ); 00142 r_line = _mm_add_ps( ab, r_line ); 00143 // 00144 _mm_store_ps( res + i, r_line ); 00145 00146 return ret; 00147 } 00148 00149 #if defined( CINDER_MSW ) 00150 # pragma runtime_checks( "", off ) 00151 # pragma warning( push ) 00152 # pragma warning( disable:4700 ) 00153 #endif 00154 00155 inline Matrix44f SseInvert( const Matrix44f& mat ) 00156 { 00157 CINDER_ALIGN16_MATRIX44F( ret ); 00158 00159 const float* src = mat.m; 00160 float* dst = ret.m; 00161 00162 __m128 minor0, minor1, minor2, minor3; 00163 __m128 row0, row1, row2, row3; 00164 __m128 det, tmp1; 00165 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src ) ), ( __m64* )( src + 4 ) ); 00166 row1 = _mm_loadh_pi( _mm_loadl_pi( row1, ( __m64* )( src + 8 ) ), ( __m64* )( src + 12 ) ); 00167 row0 = _mm_shuffle_ps( tmp1, row1, 0x88 ); 00168 row1 = _mm_shuffle_ps( row1, tmp1, 0xDD ); 00169 tmp1 = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src + 2 ) ), ( __m64* )( src + 6 ) ); 00170 row3 = _mm_loadh_pi( _mm_loadl_pi( row3, ( __m64* )( src + 10 ) ), ( __m64* )( src + 14 ) ); 00171 row2 = _mm_shuffle_ps( tmp1, row3, 0x88 ); 00172 row3 = _mm_shuffle_ps( row3, tmp1, 0xDD ); 00173 // ----------------------------------------------- 00174 tmp1 = _mm_mul_ps( row2, row3 ); 00175 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00176 minor0 = _mm_mul_ps( row1, tmp1 ); 00177 minor1 = _mm_mul_ps( row0, tmp1 ); 00178 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00179 minor0 = _mm_sub_ps( _mm_mul_ps( row1, tmp1 ), minor0 ); 00180 minor1 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor1 ); 00181 minor1 = _mm_shuffle_ps( minor1, minor1, 0x4E ); 00182 // ----------------------------------------------- 00183 tmp1 = _mm_mul_ps( row1, row2 ); 00184 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00185 minor0 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor0 ); 00186 minor3 = _mm_mul_ps( row0, tmp1 ); 00187 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00188 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row3, tmp1 ) ); 00189 minor3 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor3 ); 00190 minor3 = _mm_shuffle_ps( minor3, minor3, 0x4E ); 00191 // ----------------------------------------------- 00192 tmp1 = _mm_mul_ps( _mm_shuffle_ps( row1, row1, 0x4E ), row3 ); 00193 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00194 row2 = _mm_shuffle_ps( row2, row2, 0x4E ); 00195 minor0 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor0 ); 00196 minor2 = _mm_mul_ps( row0, tmp1 ); 00197 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00198 minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row2, tmp1 ) ); 00199 minor2 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor2 ); 00200 minor2 = _mm_shuffle_ps( minor2, minor2, 0x4E ); 00201 // ----------------------------------------------- 00202 tmp1 = _mm_mul_ps( row0, row1 ); 00203 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00204 minor2 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor2 ); 00205 minor3 = _mm_sub_ps( _mm_mul_ps( row2, tmp1 ), minor3 ); 00206 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00207 minor2 = _mm_sub_ps( _mm_mul_ps( row3, tmp1 ), minor2 ); 00208 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row2, tmp1 ) ); 00209 // ----------------------------------------------- 00210 tmp1 = _mm_mul_ps( row0, row3 ); 00211 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00212 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row2, tmp1 ) ); 00213 minor2 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor2 ); 00214 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00215 minor1 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor1 ); 00216 minor2 = _mm_sub_ps( minor2, _mm_mul_ps( row1, tmp1 ) ); 00217 // ----------------------------------------------- 00218 tmp1 = _mm_mul_ps( row0, row2 ); 00219 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0xB1 ); 00220 minor1 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor1 ); 00221 minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row1, tmp1 ) ); 00222 tmp1 = _mm_shuffle_ps( tmp1, tmp1, 0x4E ); 00223 minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row3, tmp1 ) ); 00224 minor3 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor3 ); 00225 // ----------------------------------------------- 00226 det = _mm_mul_ps( row0, minor0 ); 00227 det = _mm_add_ps( _mm_shuffle_ps( det, det, 0x4E ), det ); 00228 det = _mm_add_ss( _mm_shuffle_ps( det, det, 0xB1 ), det ); 00229 tmp1 = _mm_rcp_ss( det ); 00230 det = _mm_sub_ss( _mm_add_ss( tmp1, tmp1 ), _mm_mul_ss( det, _mm_mul_ss( tmp1, tmp1 ) ) ); 00231 det = _mm_shuffle_ps( det, det, 0x00 ); 00232 00233 minor0 = _mm_mul_ps( det, minor0 ); 00234 _mm_storel_pi( ( __m64* )( dst ), minor0 ); 00235 _mm_storeh_pi( ( __m64* )( dst + 2 ), minor0 ); 00236 00237 minor1 = _mm_mul_ps( det, minor1 ); 00238 _mm_storel_pi( ( __m64* )( dst + 4 ), minor1 ); 00239 _mm_storeh_pi( ( __m64* )( dst + 6 ), minor1 ); 00240 00241 minor2 = _mm_mul_ps( det, minor2 ); 00242 _mm_storel_pi( ( __m64* )( dst + 8 ), minor2 ); 00243 _mm_storeh_pi( ( __m64* )( dst + 10 ), minor2 ); 00244 00245 minor3 = _mm_mul_ps( det, minor3 ); 00246 _mm_storel_pi( ( __m64* )( dst + 12 ), minor3 ); 00247 _mm_storeh_pi( ( __m64* )( dst + 14 ), minor3 ); 00248 00249 return ret; 00250 } 00251 00252 #if defined( CINDER_MSW ) 00253 # pragma warning( pop ) 00254 # pragma runtime_checks( "", restore ) 00255 #endif 00256 00257 #endif // #if defined( CINDER_MSW ) || defined( CINDER_MAC ) 00258 00259 } // namespace cinder