Cinder

  • Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • Examples
  • File List
  • File Members

include/cinder/MatrixAlgo.h

Go to the documentation of this file.
00001 /*
00002  Copyright (c) 2010, The Cinder Project: http://libcinder.org
00003  All rights reserved.
00004 
00005  Redistribution and use in source and binary forms, with or without modification, are permitted provided that
00006  the following conditions are met:
00007 
00008     * Redistributions of source code must retain the above copyright notice, this list of conditions and
00009     the following disclaimer.
00010     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
00011     the following disclaimer in the documentation and/or other materials provided with the distribution.
00012 
00013  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
00014  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
00015  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
00016  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
00017  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00018  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00019  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00020  POSSIBILITY OF SUCH DAMAGE.
00021 */
00022 
00023 
00024 #pragma once
00025 
00026 #include "cinder/Matrix22.h"
00027 #include "cinder/Matrix33.h"
00028 #include "cinder/Matrix44.h"
00029 
00030 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00031 #   include <emmintrin.h>
00032 #   include <xmmintrin.h>
00033 #endif
00034 
00035 namespace cinder {
00036 
00037 #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00038 
// CINDER_ALIGN16_MATRIX44F( VAR ) expands to the declaration of a Matrix44f
// variable named VAR that is requested to be 16-byte aligned. Only the
// spelling of the alignment request differs between the two platforms.
#if defined( CINDER_MSW )
    // __declspec form: the alignment specifier is written before the type.
    #define CINDER_ALIGN16_MATRIX44F( VAR ) __declspec( align( 16 ) ) Matrix44f VAR
#elif defined( CINDER_MAC )
    // __attribute__ form: the alignment attribute is written after the declarator.
    #define CINDER_ALIGN16_MATRIX44F( VAR ) Matrix44f VAR __attribute__ ( ( aligned ( 16 ) ) )
#endif
00046 
00047 inline Matrix44f SseMultiply( const Matrix44f& mat0, const Matrix44f& mat1 ) 
00048 {
00049     CINDER_ALIGN16_MATRIX44F( ret );
00050 
00051     const float* a = mat0.m;
00052     const float* b = mat1.m;
00053     float* res = ret.m;
00054 
00055     __m128 b_line, r_line, ab;
00056     __m128 a_line_0, a_line_4, a_line_8, a_line_12;
00057 
00058     // NOTE: Keep the usage of i in. Removed it and
00059     //       used constants and it ran slower. I'm not
00060     //       sure why - so I'm just going with it at 
00061     //       the moment.
00062     int i;
00063 
00064     // Load in the rows for b
00065     a_line_0  = _mm_load_ps( a +  0 );
00066     a_line_4  = _mm_load_ps( a +  4 );
00067     a_line_8  = _mm_load_ps( a +  8 );
00068     a_line_12 = _mm_load_ps( a + 12 );
00069 
00070     i = 0;
00071     // 
00072     b_line = _mm_set1_ps( *(b + i + 0) );
00073     r_line = _mm_mul_ps( b_line, a_line_0 );
00074     //
00075     b_line = _mm_set1_ps( *(b + i + 1) );
00076     ab     = _mm_mul_ps( b_line, a_line_4 );
00077     r_line = _mm_add_ps( ab, r_line );
00078     //
00079     b_line = _mm_set1_ps( *(b + i + 2) );
00080     ab     = _mm_mul_ps( b_line, a_line_8 );
00081     r_line = _mm_add_ps( ab, r_line );
00082     //
00083     b_line = _mm_set1_ps( *(b + i + 3) );
00084     ab     = _mm_mul_ps( b_line, a_line_12 );
00085     r_line = _mm_add_ps( ab, r_line );
00086     //
00087     _mm_store_ps( res + i, r_line );
00088 
00089     i = 4;
00090     // 
00091     b_line = _mm_set1_ps( *(b + i + 0) );
00092     r_line = _mm_mul_ps( b_line, a_line_0 );
00093     //
00094     b_line = _mm_set1_ps( *(b + i + 1) );
00095     ab     = _mm_mul_ps( b_line, a_line_4 );
00096     r_line = _mm_add_ps( ab, r_line );
00097     //
00098     b_line = _mm_set1_ps( *(b + i + 2) );
00099     ab     = _mm_mul_ps( b_line, a_line_8 );
00100     r_line = _mm_add_ps( ab, r_line );
00101     //
00102     b_line = _mm_set1_ps( *(b + i + 3) );
00103     ab     = _mm_mul_ps( b_line, a_line_12 );
00104     r_line = _mm_add_ps( ab, r_line );
00105     //
00106     _mm_store_ps( res + i, r_line );
00107 
00108     i = 8;
00109     // 
00110     b_line = _mm_set1_ps( *(b + i + 0) );
00111     r_line = _mm_mul_ps( b_line, a_line_0 );
00112     //
00113     b_line = _mm_set1_ps( *(b + i + 1) );
00114     ab     = _mm_mul_ps( b_line, a_line_4 );
00115     r_line = _mm_add_ps( ab, r_line );
00116     //
00117     b_line = _mm_set1_ps( *(b + i + 2) );
00118     ab     = _mm_mul_ps( b_line, a_line_8 );
00119     r_line = _mm_add_ps( ab, r_line );
00120     //
00121     b_line = _mm_set1_ps( *(b + i + 3) );
00122     ab     = _mm_mul_ps( b_line, a_line_12 );
00123     r_line = _mm_add_ps( ab, r_line );
00124     //
00125     _mm_store_ps( res + i, r_line );
00126 
00127     i = 12;
00128     // 
00129     b_line = _mm_set1_ps( *(b + i + 0) );
00130     r_line = _mm_mul_ps( b_line, a_line_0 );
00131     //
00132     b_line = _mm_set1_ps( *(b + i + 1) );
00133     ab     = _mm_mul_ps( b_line, a_line_4 );
00134     r_line = _mm_add_ps( ab, r_line );
00135     //
00136     b_line = _mm_set1_ps( *(b + i + 2) );
00137     ab     = _mm_mul_ps( b_line, a_line_8 );
00138     r_line = _mm_add_ps( ab, r_line );
00139     //
00140     b_line = _mm_set1_ps( *(b + i + 3) );
00141     ab     = _mm_mul_ps( b_line, a_line_12 );
00142     r_line = _mm_add_ps( ab, r_line );
00143     //
00144     _mm_store_ps( res + i, r_line );
00145 
00146     return ret;
00147 }
00148 
00149 #if defined( CINDER_MSW )
00150 #   pragma runtime_checks( "", off )
00151 #   pragma warning( push )
00152 #   pragma warning( disable:4700 )
00153 #endif 
00154 
00155 inline Matrix44f SseInvert( const Matrix44f& mat )
00156 {
00157     CINDER_ALIGN16_MATRIX44F( ret );
00158 
00159     const float* src = mat.m;
00160     float* dst = ret.m;
00161 
00162     __m128 minor0, minor1, minor2, minor3;
00163     __m128 row0, row1, row2, row3;
00164     __m128 det, tmp1;
00165     tmp1   = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src ) ), ( __m64* )( src + 4 ) );
00166     row1   = _mm_loadh_pi( _mm_loadl_pi( row1, ( __m64* )( src + 8 ) ), ( __m64* )( src + 12 ) );
00167     row0   = _mm_shuffle_ps( tmp1, row1, 0x88 );
00168     row1   = _mm_shuffle_ps( row1, tmp1, 0xDD );
00169     tmp1   = _mm_loadh_pi( _mm_loadl_pi( tmp1, ( __m64* )( src + 2 ) ), ( __m64* )( src + 6 ) );
00170     row3   = _mm_loadh_pi( _mm_loadl_pi( row3, ( __m64* )( src + 10 ) ), ( __m64* )( src + 14 ) );
00171     row2   = _mm_shuffle_ps( tmp1, row3, 0x88 );
00172     row3   = _mm_shuffle_ps( row3, tmp1, 0xDD );
00173     // ----------------------------------------------- 
00174     tmp1   = _mm_mul_ps( row2, row3 );
00175     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00176     minor0 = _mm_mul_ps( row1, tmp1 );
00177     minor1 = _mm_mul_ps( row0, tmp1 );
00178     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00179     minor0 = _mm_sub_ps( _mm_mul_ps( row1, tmp1 ), minor0 );
00180     minor1 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor1 );
00181     minor1 = _mm_shuffle_ps( minor1, minor1, 0x4E );
00182     // -----------------------------------------------
00183     tmp1   = _mm_mul_ps( row1, row2 );
00184     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00185     minor0 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor0 );
00186     minor3 = _mm_mul_ps( row0, tmp1 );
00187     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00188     minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row3, tmp1 ) );
00189     minor3 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor3 );
00190     minor3 = _mm_shuffle_ps( minor3, minor3, 0x4E );
00191     // -----------------------------------------------
00192     tmp1   = _mm_mul_ps( _mm_shuffle_ps( row1, row1, 0x4E ), row3 );
00193     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00194     row2   = _mm_shuffle_ps( row2, row2, 0x4E );
00195     minor0 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor0 );
00196     minor2 = _mm_mul_ps( row0, tmp1 );
00197     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00198     minor0 = _mm_sub_ps( minor0, _mm_mul_ps( row2, tmp1 ) );
00199     minor2 = _mm_sub_ps( _mm_mul_ps( row0, tmp1 ), minor2 );
00200     minor2 = _mm_shuffle_ps( minor2, minor2, 0x4E );
00201     // -----------------------------------------------
00202     tmp1   = _mm_mul_ps( row0, row1 );
00203     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00204     minor2 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
00205     minor3 = _mm_sub_ps( _mm_mul_ps( row2, tmp1 ), minor3 );
00206     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00207     minor2 = _mm_sub_ps( _mm_mul_ps( row3, tmp1 ), minor2 );
00208     minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row2, tmp1 ) );
00209     // -----------------------------------------------
00210     tmp1   = _mm_mul_ps( row0, row3 );
00211     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00212     minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row2, tmp1 ) );
00213     minor2 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor2 );
00214     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00215     minor1 = _mm_add_ps( _mm_mul_ps( row2, tmp1 ), minor1 );
00216     minor2 = _mm_sub_ps( minor2, _mm_mul_ps( row1, tmp1 ) );
00217     // -----------------------------------------------
00218     tmp1   = _mm_mul_ps( row0, row2 );
00219     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0xB1 );
00220     minor1 = _mm_add_ps( _mm_mul_ps( row3, tmp1 ), minor1 );
00221     minor3 = _mm_sub_ps( minor3, _mm_mul_ps( row1, tmp1 ) );
00222     tmp1   = _mm_shuffle_ps( tmp1, tmp1, 0x4E );
00223     minor1 = _mm_sub_ps( minor1, _mm_mul_ps( row3, tmp1 ) );
00224     minor3 = _mm_add_ps( _mm_mul_ps( row1, tmp1 ), minor3 );
00225     // -----------------------------------------------
00226     det  = _mm_mul_ps( row0, minor0 );
00227     det  = _mm_add_ps( _mm_shuffle_ps( det, det, 0x4E ), det );
00228     det  = _mm_add_ss( _mm_shuffle_ps( det, det, 0xB1 ), det );
00229     tmp1 = _mm_rcp_ss( det );
00230     det  = _mm_sub_ss( _mm_add_ss( tmp1, tmp1 ), _mm_mul_ss( det, _mm_mul_ss( tmp1, tmp1 ) ) );
00231     det  = _mm_shuffle_ps( det, det, 0x00 );
00232 
00233     minor0 = _mm_mul_ps( det, minor0 );
00234     _mm_storel_pi( ( __m64* )( dst ), minor0 );
00235     _mm_storeh_pi( ( __m64* )( dst + 2 ), minor0 );
00236 
00237     minor1 = _mm_mul_ps( det, minor1 );
00238     _mm_storel_pi( ( __m64* )( dst + 4 ), minor1 );
00239     _mm_storeh_pi( ( __m64* )( dst + 6 ), minor1 );
00240 
00241     minor2 = _mm_mul_ps( det, minor2 );
00242     _mm_storel_pi( ( __m64* )( dst +  8 ), minor2 );
00243     _mm_storeh_pi( ( __m64* )( dst + 10 ), minor2 );
00244 
00245     minor3 = _mm_mul_ps( det, minor3 );
00246     _mm_storel_pi( ( __m64* )( dst + 12 ), minor3 );
00247     _mm_storeh_pi( ( __m64* )( dst + 14 ), minor3 );    
00248 
00249     return ret;
00250 }
00251 
00252 #if defined( CINDER_MSW )
00253 #   pragma warning( pop )
00254 #   pragma runtime_checks( "", restore )
00255 #endif
00256 
00257 #endif // #if defined( CINDER_MSW ) || defined( CINDER_MAC )
00258 
00259 } // namespace cinder