From 456ef6e2f5ef5fdb8e91c2e24b36db806b76ec18 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Tue, 15 May 2018 22:34:32 -0700 Subject: [PATCH] SSE version of PresetOutputs::PerPixelMath() --- .../MilkdropPresetFactory/PresetFrameIO.cpp | 288 ++++++++++++++++-- .../MilkdropPresetFactory/PresetFrameIO.hpp | 6 + src/libprojectM/wipemalloc.cpp | 3 +- 3 files changed, 271 insertions(+), 26 deletions(-) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index af3ceeab2..725f5e912 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -5,6 +5,9 @@ #include #include #include "Renderer/BeatDetect.hpp" +//#include // X86 SSE1 +//#include // X86 SSE2 +#include PresetInputs::PresetInputs() : PipelineContext() { @@ -186,7 +189,7 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte } // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved. -void PresetOutputs::PerPixelMath(const PipelineContext &context) +void PresetOutputs::PerPixelMath_c(const PipelineContext &context) { int x, y; @@ -200,25 +203,9 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) rad_mesh[x][y] * 2.0f - 1.0f)); fZoom2Inv = 1.0f / fZoom2; this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; + this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; - } - } - - for (x = 0; x < gx; x++) - { - for (y = 0; y < gy; y++) - { - this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) - / this->sx_mesh[x][y] + this->cx_mesh[x][y]; - } - } - - for (x = 0; x < gx; x++) - { - for (y = 0; y < gy; y++) - { - this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) - / this->sy_mesh[x][y] + this->cy_mesh[x][y]; + this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; } } @@ -234,6 +221,7 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) { for (y = 0; y < gy; y++) { +#if 0 this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.333f + fWarpScaleInv * (this->orig_x[x][y] * f[0] - this->orig_y[x][y] * f[3])); this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.375f @@ -242,8 +230,23 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) - fWarpScaleInv * (this->orig_x[x][y] * f[1] - this->orig_y[x][y] * f[2])); this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.825f + fWarpScaleInv * (this->orig_x[x][y] * f[0] + this->orig_y[x][y] * f[3])); +#else + float orig_x = this->orig_x[x][y]; + float orig_y = this->orig_y[x][y]; + float warp_mesh = this->warp_mesh[x][y] * 0.0035f; + + this->x_mesh[x][y] += + (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + + (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); + + this->y_mesh[x][y] += + (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + + (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); +#endif } } + +#if 0 for (x = 0; x < gx; x++) { for (y = 0; y < gy; y++) @@ -267,16 +270,254 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) for (x = 0; x < gx; x++) for (y = 0; y < gy; y++) this->y_mesh[x][y] -= this->dy_mesh[x][y]; +#else + for (x = 0; x < gx; x++) + { + for (y = 0; y < gy; y++) + { + const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; + const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; + const float rot = this->rot_mesh[x][y]; + const float cos_rot = cosf(rot); + const float sin_rot = sinf(rot); + + this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y]; + this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; + } + } +#endif +} + + + +#ifdef __SSE2__ + +// is there an SSE way to do this? +inline __m128 _mm_pow(__m128 x, __m128 y) +{ + float X[4]; + float Y[4]; + _mm_store_ps(X,x); + _mm_store_ps(Y,x); + X[0] = std::pow(X[0],Y[0]); + X[1] = std::pow(X[1],Y[1]); + X[2] = std::pow(X[2],Y[2]); + X[3] = std::pow(X[3],Y[3]); + return _mm_load_ps(X); +} +inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx) +{ + float X[4], S[4], C[4]; + _mm_store_ps(X,x); + S[0] = sinf(X[0]); + C[0] = cosf(X[0]); + S[1] = sinf(X[1]); + C[1] = cosf(X[1]); + S[2] = sinf(X[2]); + C[2] = cosf(X[2]); + S[3] = sinf(X[3]); + C[3] = cosf(X[3]); + sinx = _mm_load_ps(S); + cosx = _mm_load_ps(C); +} +inline __m128 _mm_sinf(__m128 x) +{ + float X[4]; + _mm_store_ps(X,x); + X[0] = sinf(X[0]); + X[1] = sinf(X[1]); + X[2] = sinf(X[2]); + X[3] = sinf(X[3]); + return _mm_load_ps(X); +} +inline __m128 _mm_cosf(__m128 x) +{ + float X[4]; + _mm_store_ps(X,x); + X[0] = cosf(X[0]); + X[1] = cosf(X[1]); + X[2] = cosf(X[2]); + X[3] = cosf(X[3]); + return _mm_load_ps(X); +} + + +void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) +{ + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y += 4) + { + // fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], + // rad_mesh[x][y] * 2.0f - 1.0f)); + // fZoom2Inv = 1.0f / fZoom2; + __m128 rad_mesh_scaled = + _mm_sub_ps( + _mm_mul_ps( + _mm_load_ps(&this->rad_mesh[x][y]), + _mm_set_ps1(2.0f)), + _mm_set_ps1(1.0f)); + __m128 zoom_mesh = _mm_load_ps(&this->zoom_mesh[x][y]); + __m128 zoomexp_mesh = _mm_load_ps(&this->zoomexp_mesh[x][y]); + __m128 fZoom2 = _mm_pow(zoom_mesh, _mm_pow(zoomexp_mesh, rad_mesh_scaled)); + __m128 fZoomInv = _mm_mul_ps(_mm_rcp_ps(fZoom2), _mm_set_ps1(0.5f)); + // this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; + __m128 x_mesh = _mm_load_ps(&this->orig_x[x][y]); + x_mesh = + _mm_add_ps( + _mm_mul_ps( + _mm_load_ps(&this->orig_x[x][y]), + fZoomInv), + _mm_set_ps1(0.5f)); + // this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; + __m128 cx_mesh = _mm_load_ps(&this->cx_mesh[x][y]); + __m128 sx_mesh = _mm_load_ps(&this->sx_mesh[x][y]); + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps( + _mm_div_ps( + _mm_sub_ps(x_mesh,cx_mesh), + sx_mesh), + cx_mesh + )); + + // this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; + __m128 y_mesh = + _mm_add_ps( + _mm_mul_ps( + _mm_load_ps(&this->orig_y[x][y]), + fZoomInv), + _mm_set_ps1(0.5f)); + // this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; + __m128 cy_mesh = _mm_load_ps(&this->cy_mesh[x][y]); + __m128 sy_mesh = _mm_load_ps(&this->sy_mesh[x][y]); + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps( + _mm_div_ps( + _mm_sub_ps(y_mesh,cy_mesh), + sy_mesh), + cy_mesh + )); + } + } + + const float fWarpTime = context.time * this->fWarpAnimSpeed; + const float fWarpScaleInv = 1.0f / this->fWarpScale; + const float f[4] = + { + 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10), + 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7), + 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3), + 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5) + }; + + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y+=4) + { + //float orig_x = this->orig_x[x][y]; + //float orig_y = this->orig_y[x][y]; + //float warp_mesh = this->warp_mesh[x][y] * 0.0035f; + const __m128 orig_x = _mm_load_ps(&this->orig_x[x][y]); + const __m128 orig_y = _mm_load_ps(&this->orig_y[x][y]); + const __m128 warp_mesh = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f)); + + // this->x_mesh[x][y] += + // (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + + // (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]), + _mm_add_ps( + _mm_mul_ps(warp_mesh, _mm_sinf( + _mm_add_ps( + _mm_set_ps1(fWarpTime*0.333f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_sub_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[0])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[3])) + ))))), + _mm_mul_ps(warp_mesh, _mm_cosf( + _mm_sub_ps( + _mm_set_ps1(fWarpTime*0.753f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_sub_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[1])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[2])) + )))))))); + + // this->y_mesh[x][y] += + // (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + + // (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]), + _mm_add_ps( + _mm_mul_ps(warp_mesh, _mm_cosf( + _mm_sub_ps( + _mm_set_ps1(fWarpTime*0.375f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_add_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[2])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[1])) + ))))), + _mm_mul_ps(warp_mesh, _mm_sinf( + _mm_add_ps( + _mm_set_ps1(fWarpTime*0.825f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_add_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[0])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[3])) + )))))))); + } + } + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y+=4) + { + // const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; + // const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; + const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y])); + const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y])); + + // const float rot = this->rot_mesh[x][y]; + // const float cos_rot = cosf(rot); + // const float sin_rot = sinf(rot); + __m128 sin_rot, cos_rot; + _mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot); + + // this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y]; + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps( + _mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)), + _mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y])) + )); + // this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)), + _mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y])) + )); + } + } +} +#endif + +void PresetOutputs::PerPixelMath(const PipelineContext &context) +{ +#ifdef __SSE2__ + PerPixelMath_sse(context); +#else + PerPixelMath_c(context); +#endif } void PresetOutputs::Initialize ( int gx, int gy ) { - assert(gx > 0); - this->gx = gx; - this->gy= gy; + + // round gx/gy up to multiple 4 (for possible SSE optimization) + this->gx = (gx+3) & ~(size_t)3; + this->gy = (gy+3) & ~(size_t)3; staticPerPixel = true; setStaticPerPixel(gx,gy); @@ -379,8 +620,6 @@ PresetInputs::~PresetInputs() { for ( int x = 0; x < this->gx; x++ ) { - - free ( this->origtheta[x] ); free ( this->origrad[x] ); free ( this->origx[x] ); @@ -390,7 +629,6 @@ PresetInputs::~PresetInputs() free ( this->y_mesh[x] ); free ( this->rad_mesh[x] ); free ( this->theta_mesh[x] ); - } diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp index 229879c2f..35f42d8e7 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp @@ -138,6 +138,12 @@ public: float **orig_x; //original mesh float **orig_y; float **rad_mesh; + +private: + void PerPixelMath_c( const PipelineContext &context); +#ifdef __SSE2__ + void PerPixelMath_sse( const PipelineContext &context); +#endif }; diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp index 14bcf7816..b1e196eb0 100755 --- a/src/libprojectM/wipemalloc.cpp +++ b/src/libprojectM/wipemalloc.cpp @@ -27,7 +27,8 @@ #include "wipemalloc.h" void *wipemalloc( size_t count ) { - void *mem = malloc( count ); + count = (count + 15) & ~(size_t)15; + void *mem = aligned_alloc( 16, count ); if ( mem != NULL ) { memset( mem, 0, count ); } else {