From d1578229ce3b8a03209dd61563f63487e27c6ad6 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Tue, 15 May 2018 22:34:32 -0700 Subject: [PATCH 1/9] SSE version of PresetOutputs::PerPixelMath() --- .../MilkdropPresetFactory/PresetFrameIO.cpp | 288 ++++++++++++++++-- .../MilkdropPresetFactory/PresetFrameIO.hpp | 6 + src/libprojectM/wipemalloc.cpp | 3 +- 3 files changed, 271 insertions(+), 26 deletions(-) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index af3ceeab2..725f5e912 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -5,6 +5,9 @@ #include #include #include "Renderer/BeatDetect.hpp" +//#include // X86 SSE1 +//#include // X86 SSE2 +#include PresetInputs::PresetInputs() : PipelineContext() { @@ -186,7 +189,7 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte } // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved. 
-void PresetOutputs::PerPixelMath(const PipelineContext &context) +void PresetOutputs::PerPixelMath_c(const PipelineContext &context) { int x, y; @@ -200,25 +203,9 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) rad_mesh[x][y] * 2.0f - 1.0f)); fZoom2Inv = 1.0f / fZoom2; this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; + this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; - } - } - - for (x = 0; x < gx; x++) - { - for (y = 0; y < gy; y++) - { - this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) - / this->sx_mesh[x][y] + this->cx_mesh[x][y]; - } - } - - for (x = 0; x < gx; x++) - { - for (y = 0; y < gy; y++) - { - this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) - / this->sy_mesh[x][y] + this->cy_mesh[x][y]; + this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; } } @@ -234,6 +221,7 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) { for (y = 0; y < gy; y++) { +#if 0 this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.333f + fWarpScaleInv * (this->orig_x[x][y] * f[0] - this->orig_y[x][y] * f[3])); this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.375f @@ -242,8 +230,23 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) - fWarpScaleInv * (this->orig_x[x][y] * f[1] - this->orig_y[x][y] * f[2])); this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.825f + fWarpScaleInv * (this->orig_x[x][y] * f[0] + this->orig_y[x][y] * f[3])); +#else + float orig_x = this->orig_x[x][y]; + float orig_y = this->orig_y[x][y]; + float warp_mesh = this->warp_mesh[x][y] * 0.0035f; + + this->x_mesh[x][y] += + (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + + (warp_mesh * cosf(fWarpTime * 0.753f - 
fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); + + this->y_mesh[x][y] += + (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + + (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); +#endif } } + +#if 0 for (x = 0; x < gx; x++) { for (y = 0; y < gy; y++) @@ -267,16 +270,254 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context) for (x = 0; x < gx; x++) for (y = 0; y < gy; y++) this->y_mesh[x][y] -= this->dy_mesh[x][y]; +#else + for (x = 0; x < gx; x++) + { + for (y = 0; y < gy; y++) + { + const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; + const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; + const float rot = this->rot_mesh[x][y]; + const float cos_rot = cosf(rot); + const float sin_rot = sinf(rot); + + this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y]; + this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; + } + } +#endif +} + + + +#ifdef __SSE2__ + +// is there an SSE way to do this? 
+inline __m128 _mm_pow(__m128 x, __m128 y) +{ + float X[4]; + float Y[4]; + _mm_store_ps(X,x); + _mm_store_ps(Y,x); + X[0] = std::pow(X[0],Y[0]); + X[1] = std::pow(X[1],Y[1]); + X[2] = std::pow(X[2],Y[2]); + X[3] = std::pow(X[3],Y[3]); + return _mm_load_ps(X); +} +inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx) +{ + float X[4], S[4], C[4]; + _mm_store_ps(X,x); + S[0] = sinf(X[0]); + C[0] = cosf(X[0]); + S[1] = sinf(X[1]); + C[1] = cosf(X[1]); + S[2] = sinf(X[2]); + C[2] = cosf(X[2]); + S[3] = sinf(X[3]); + C[3] = cosf(X[3]); + sinx = _mm_load_ps(S); + cosx = _mm_load_ps(C); +} +inline __m128 _mm_sinf(__m128 x) +{ + float X[4]; + _mm_store_ps(X,x); + X[0] = sinf(X[0]); + X[1] = sinf(X[1]); + X[2] = sinf(X[2]); + X[3] = sinf(X[3]); + return _mm_load_ps(X); +} +inline __m128 _mm_cosf(__m128 x) +{ + float X[4]; + _mm_store_ps(X,x); + X[0] = cosf(X[0]); + X[1] = cosf(X[1]); + X[2] = cosf(X[2]); + X[3] = cosf(X[3]); + return _mm_load_ps(X); +} + + +void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) +{ + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y += 4) + { + // fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], + // rad_mesh[x][y] * 2.0f - 1.0f)); + // fZoom2Inv = 1.0f / fZoom2; + __m128 rad_mesh_scaled = + _mm_sub_ps( + _mm_mul_ps( + _mm_load_ps(&this->rad_mesh[x][y]), + _mm_set_ps1(2.0f)), + _mm_set_ps1(1.0f)); + __m128 zoom_mesh = _mm_load_ps(&this->zoom_mesh[x][y]); + __m128 zoomexp_mesh = _mm_load_ps(&this->zoomexp_mesh[x][y]); + __m128 fZoom2 = _mm_pow(zoom_mesh, _mm_pow(zoomexp_mesh, rad_mesh_scaled)); + __m128 fZoomInv = _mm_mul_ps(_mm_rcp_ps(fZoom2), _mm_set_ps1(0.5f)); + // this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; + __m128 x_mesh = _mm_load_ps(&this->orig_x[x][y]); + x_mesh = + _mm_add_ps( + _mm_mul_ps( + _mm_load_ps(&this->orig_x[x][y]), + fZoomInv), + _mm_set_ps1(0.5f)); + // this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / 
this->sx_mesh[x][y] + this->cx_mesh[x][y]; + __m128 cx_mesh = _mm_load_ps(&this->cx_mesh[x][y]); + __m128 sx_mesh = _mm_load_ps(&this->sx_mesh[x][y]); + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps( + _mm_div_ps( + _mm_sub_ps(x_mesh,cx_mesh), + sx_mesh), + cx_mesh + )); + + // this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; + __m128 y_mesh = + _mm_add_ps( + _mm_mul_ps( + _mm_load_ps(&this->orig_y[x][y]), + fZoomInv), + _mm_set_ps1(0.5f)); + // this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; + __m128 cy_mesh = _mm_load_ps(&this->cy_mesh[x][y]); + __m128 sy_mesh = _mm_load_ps(&this->sy_mesh[x][y]); + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps( + _mm_div_ps( + _mm_sub_ps(y_mesh,cy_mesh), + sy_mesh), + cy_mesh + )); + } + } + + const float fWarpTime = context.time * this->fWarpAnimSpeed; + const float fWarpScaleInv = 1.0f / this->fWarpScale; + const float f[4] = + { + 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10), + 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7), + 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3), + 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5) + }; + + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y+=4) + { + //float orig_x = this->orig_x[x][y]; + //float orig_y = this->orig_y[x][y]; + //float warp_mesh = this->warp_mesh[x][y] * 0.0035f; + const __m128 orig_x = _mm_load_ps(&this->orig_x[x][y]); + const __m128 orig_y = _mm_load_ps(&this->orig_y[x][y]); + const __m128 warp_mesh = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f)); + + // this->x_mesh[x][y] += + // (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + + // (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]), + _mm_add_ps( + _mm_mul_ps(warp_mesh, _mm_sinf( + _mm_add_ps( + _mm_set_ps1(fWarpTime*0.333f), + 
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_sub_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[0])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[3])) + ))))), + _mm_mul_ps(warp_mesh, _mm_cosf( + _mm_sub_ps( + _mm_set_ps1(fWarpTime*0.753f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_sub_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[1])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[2])) + )))))))); + + // this->y_mesh[x][y] += + // (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + + // (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]), + _mm_add_ps( + _mm_mul_ps(warp_mesh, _mm_cosf( + _mm_sub_ps( + _mm_set_ps1(fWarpTime*0.375f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_add_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[2])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[1])) + ))))), + _mm_mul_ps(warp_mesh, _mm_sinf( + _mm_add_ps( + _mm_set_ps1(fWarpTime*0.825f), + _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), + _mm_add_ps( + _mm_mul_ps(orig_x, _mm_set_ps1(f[0])), + _mm_mul_ps(orig_y, _mm_set_ps1(f[3])) + )))))))); + } + } + for (int x = 0; x < gx; x++) + { + for (int y = 0; y < gy; y+=4) + { + // const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; + // const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; + const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y])); + const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y])); + + // const float rot = this->rot_mesh[x][y]; + // const float cos_rot = cosf(rot); + // const float sin_rot = sinf(rot); + __m128 sin_rot, cos_rot; + _mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot); + + // this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y]; + _mm_store_ps(&this->x_mesh[x][y], + _mm_add_ps( + _mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)), + 
_mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y])) + )); + // this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; + _mm_store_ps(&this->y_mesh[x][y], + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)), + _mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y])) + )); + } + } +} +#endif + +void PresetOutputs::PerPixelMath(const PipelineContext &context) +{ +#ifdef __SSE2__ + PerPixelMath_sse(context); +#else + PerPixelMath_c(context); +#endif } void PresetOutputs::Initialize ( int gx, int gy ) { - assert(gx > 0); - this->gx = gx; - this->gy= gy; + + // round gx/gy up to multiple 4 (for possible SSE optimization) + this->gx = (gx+3) & ~(size_t)3; + this->gy = (gy+3) & ~(size_t)3; staticPerPixel = true; setStaticPerPixel(gx,gy); @@ -379,8 +620,6 @@ PresetInputs::~PresetInputs() { for ( int x = 0; x < this->gx; x++ ) { - - free ( this->origtheta[x] ); free ( this->origrad[x] ); free ( this->origx[x] ); @@ -390,7 +629,6 @@ PresetInputs::~PresetInputs() free ( this->y_mesh[x] ); free ( this->rad_mesh[x] ); free ( this->theta_mesh[x] ); - } diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp index 229879c2f..35f42d8e7 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp @@ -138,6 +138,12 @@ public: float **orig_x; //original mesh float **orig_y; float **rad_mesh; + +private: + void PerPixelMath_c( const PipelineContext &context); +#ifdef __SSE2__ + void PerPixelMath_sse( const PipelineContext &context); +#endif }; diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp index 14bcf7816..b1e196eb0 100755 --- a/src/libprojectM/wipemalloc.cpp +++ b/src/libprojectM/wipemalloc.cpp @@ -27,7 +27,8 @@ #include "wipemalloc.h" void *wipemalloc( size_t count ) { - void *mem = malloc( count ); + 
count = (count + 15) & ~(size_t)15; + void *mem = aligned_alloc( 16, count ); if ( mem != NULL ) { memset( mem, 0, count ); } else { From 681b23d9f919fe3598e33b1f6e53e09e52d2836a Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Wed, 16 May 2018 09:31:45 -0700 Subject: [PATCH 2/9] unused member pcmBuffer --- src/projectM-sdl/pmSDL.cpp | 8 ++++++-- src/projectM-sdl/pmSDL.hpp | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/projectM-sdl/pmSDL.cpp b/src/projectM-sdl/pmSDL.cpp index 0c171f1d8..8748d7dbc 100644 --- a/src/projectM-sdl/pmSDL.cpp +++ b/src/projectM-sdl/pmSDL.cpp @@ -96,13 +96,11 @@ int projectMSDL::openAudioInput() { void projectMSDL::beginAudioCapture() { // allocate a buffer to store PCM data for feeding in unsigned int maxSamples = audioChannelsCount * audioSampleCount; - pcmBuffer = (unsigned char *) malloc(maxSamples); SDL_PauseAudioDevice(audioDeviceID, false); pcm()->initPCM(2048); } void projectMSDL::endAudioCapture() { - free(pcmBuffer); SDL_PauseAudioDevice(audioDeviceID, true); } @@ -236,3 +234,9 @@ void projectMSDL::init(SDL_Window *window, SDL_Renderer *renderer) { selectRandom(true); projectM_resetGL(width, height); } + + +std::string projectMSDL::getActivePresetName() +{ + return std::string("hey"); +} diff --git a/src/projectM-sdl/pmSDL.hpp b/src/projectM-sdl/pmSDL.hpp index ff798586c..ea4eefeb2 100644 --- a/src/projectM-sdl/pmSDL.hpp +++ b/src/projectM-sdl/pmSDL.hpp @@ -44,6 +44,7 @@ public: void renderFrame(); void pollEvent(); void maximize(); + std::string getActivePresetName(); private: SDL_Window *win; @@ -59,12 +60,10 @@ private: unsigned short audioSampleCount; SDL_AudioFormat audioFormat; SDL_AudioDeviceID audioDeviceID; - unsigned char *pcmBuffer; // pre-allocated buffer for audioInputCallback static void audioInputCallbackF32(void *userdata, unsigned char *stream, int len); static void audioInputCallbackS16(void *userdata, unsigned char *stream, int len); - void addFakePCM(); void keyHandler(SDL_Event 
*); SDL_AudioDeviceID selectAudioInput(int count); From 59ee73842864383c6e3fff358e93142897392176 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Wed, 16 May 2018 09:32:55 -0700 Subject: [PATCH 3/9] cleanup --- .../MilkdropPresetFactory/PresetFrameIO.cpp | 132 +++++++----------- 1 file changed, 49 insertions(+), 83 deletions(-) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index 725f5e912..01f44d509 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -9,10 +9,12 @@ //#include // X86 SSE2 #include + PresetInputs::PresetInputs() : PipelineContext() { } + void PresetInputs::update(const BeatDetect & music, const PipelineContext & context) { // Reflect new values form the beat detection unit @@ -31,6 +33,7 @@ void PresetInputs::update(const BeatDetect & music, const PipelineContext & cont this->progress = context.progress; } + void PresetInputs::Initialize ( int gx, int gy ) { int x, y; @@ -102,14 +105,13 @@ void PresetInputs::Initialize ( int gx, int gy ) this->origtheta[x][y]=atan2 ( ( ( this->origy[x][y]-.5 ) *2 ), ( ( this->origx[x][y]-.5 ) *2 ) ); } } - - - } + PresetOutputs::PresetOutputs() : Pipeline() {} + PresetOutputs::~PresetOutputs() { assert(this->gx > 0); @@ -132,22 +134,22 @@ PresetOutputs::~PresetOutputs() free(this->rad_mesh[x]); } - free(this->rad_mesh); - free(this->sx_mesh); - free(this->sy_mesh); - free(this->dy_mesh); - free(this->dx_mesh); - free(this->cy_mesh); - free(this->cx_mesh); - free(this->warp_mesh); - free(this->zoom_mesh); - free(this->zoomexp_mesh); - free(this->rot_mesh); - free(this->orig_x); - free(this->orig_y); - + free(this->rad_mesh); + free(this->sx_mesh); + free(this->sy_mesh); + free(this->dy_mesh); + free(this->dx_mesh); + free(this->cy_mesh); + free(this->cx_mesh); + free(this->warp_mesh); + free(this->zoom_mesh); + free(this->zoomexp_mesh); + 
free(this->rot_mesh); + free(this->orig_x); + free(this->orig_y); } + void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &context) { PerPixelMath(context); @@ -158,18 +160,22 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte for (PresetOutputs::cshape_container::iterator pos = customShapes.begin(); pos != customShapes.end(); ++pos) - { - if( (*pos)->enabled==1) drawables.push_back((*pos)); - } + { + if ((*pos)->enabled==1) + drawables.push_back((*pos)); + } for (PresetOutputs::cwave_container::iterator pos = customWaves.begin(); pos != customWaves.end(); ++pos) - { - if( (*pos)->enabled==1) drawables.push_back((*pos)); - } + { + if ((*pos)->enabled==1) + drawables.push_back((*pos)); + } - drawables.push_back(&wave); - if(bDarkenCenter==1) drawables.push_back(&darkenCenter); + drawables.push_back(&wave); + + if (bDarkenCenter==1) + drawables.push_back(&darkenCenter); drawables.push_back(&border); compositeDrawables.clear(); @@ -188,20 +194,17 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte compositeDrawables.push_back(&invert); } + // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved. 
void PresetOutputs::PerPixelMath_c(const PipelineContext &context) { - - int x, y; - float fZoom2, fZoom2Inv; - - for (x = 0; x < gx; x++) + for (int x = 0; x < gx; x++) { - for (y = 0; y < gy; y++) + for (int y = 0; y < gy; y++) { - fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], + const float fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], rad_mesh[x][y] * 2.0f - 1.0f)); - fZoom2Inv = 1.0f / fZoom2; + const float fZoom2Inv = 1.0f / fZoom2; this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; @@ -209,71 +212,35 @@ void PresetOutputs::PerPixelMath_c(const PipelineContext &context) } } - float fWarpTime = context.time * this->fWarpAnimSpeed; - float fWarpScaleInv = 1.0f / this->fWarpScale; + const float fWarpTime = context.time * this->fWarpAnimSpeed; + const float fWarpScaleInv = 1.0f / this->fWarpScale; float f[4]; f[0] = 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10); f[1] = 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7); f[2] = 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3); f[3] = 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5); - for (x = 0; x < gx; x++) + for (int x = 0; x < gx; x++) { - for (y = 0; y < gy; y++) + for (int y = 0; y < gy; y++) { -#if 0 - this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.333f - + fWarpScaleInv * (this->orig_x[x][y] * f[0] - this->orig_y[x][y] * f[3])); - this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.375f - - fWarpScaleInv * (this->orig_x[x][y] * f[2] + this->orig_y[x][y] * f[1])); - this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.753f - - fWarpScaleInv * (this->orig_x[x][y] * f[1] - this->orig_y[x][y] * f[2])); - this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.825f - + fWarpScaleInv * 
(this->orig_x[x][y] * f[0] + this->orig_y[x][y] * f[3])); -#else - float orig_x = this->orig_x[x][y]; - float orig_y = this->orig_y[x][y]; - float warp_mesh = this->warp_mesh[x][y] * 0.0035f; + const float orig_x = this->orig_x[x][y]; + const float orig_y = this->orig_y[x][y]; + const float warp_mesh = this->warp_mesh[x][y] * 0.0035f; - this->x_mesh[x][y] += + this->x_mesh[x][y] += (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); - this->y_mesh[x][y] += + this->y_mesh[x][y] += (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); -#endif } } -#if 0 - for (x = 0; x < gx; x++) + for (int x = 0; x < gx; x++) { - for (y = 0; y < gy; y++) - { - float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; - float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; - - float cos_rot = cosf(this->rot_mesh[x][y]); - float sin_rot = sinf(this->rot_mesh[x][y]); - - this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y]; - this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y]; - - } - } - - for (x = 0; x < gx; x++) - for (y = 0; y < gy; y++) - this->x_mesh[x][y] -= this->dx_mesh[x][y]; - - for (x = 0; x < gx; x++) - for (y = 0; y < gy; y++) - this->y_mesh[x][y] -= this->dy_mesh[x][y]; -#else - for (x = 0; x < gx; x++) - { - for (y = 0; y < gy; y++) + for (int y = 0; y < gy; y++) { const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; @@ -286,11 +253,9 @@ void PresetOutputs::PerPixelMath_c(const PipelineContext &context) this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; } } -#endif } - #ifdef __SSE2__ // is there an SSE way to do this? 
@@ -501,6 +466,7 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) } #endif + void PresetOutputs::PerPixelMath(const PipelineContext &context) { #ifdef __SSE2__ @@ -616,6 +582,7 @@ void PresetOutputs::Initialize ( int gx, int gy ) } } + PresetInputs::~PresetInputs() { for ( int x = 0; x < this->gx; x++ ) @@ -673,7 +640,6 @@ void PresetInputs::resetMesh() theta_mesh[x][y]=this->origtheta[x][y]; } } - } From 17942b79ace63e15c740a78edcf904df65057a99 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Sun, 20 May 2018 16:25:20 -0700 Subject: [PATCH 4/9] alloc mesh as one memory block --- .../MilkdropPresetFactory/Param.cpp | 12 +- .../MilkdropPresetFactory/Param.hpp | 5 + .../MilkdropPresetFactory/PresetFrameIO.cpp | 314 ++++++------------ src/libprojectM/Renderer/Pipeline.cpp | 33 +- 4 files changed, 114 insertions(+), 250 deletions(-) diff --git a/src/libprojectM/MilkdropPresetFactory/Param.cpp b/src/libprojectM/MilkdropPresetFactory/Param.cpp index ca21e8ffc..d49875ff3 100755 --- a/src/libprojectM/MilkdropPresetFactory/Param.cpp +++ b/src/libprojectM/MilkdropPresetFactory/Param.cpp @@ -65,7 +65,7 @@ Param::Param(std::string _name) : matrix(0) { - engine_val = new float(); + engine_val = (float *)&local_value; default_init_val.float_val = DEFAULT_DOUBLE_IV; upper_bound.float_val = DEFAULT_DOUBLE_UB; @@ -73,18 +73,10 @@ Param::Param(std::string _name) : /// @note may have fixed a recent bug. 
testing *((float*)engine_val) = default_init_val.float_val; - - -} + } /* Free's a parameter type */ Param::~Param() { - - // I hate this, but will let it be for now - if (flags & P_FLAG_USERDEF) { - delete((double*)engine_val); - } - if (PARAM_DEBUG) printf("~Param: freeing \"%s\".\n", name.c_str()); } diff --git a/src/libprojectM/MilkdropPresetFactory/Param.hpp b/src/libprojectM/MilkdropPresetFactory/Param.hpp index 1f6bcf272..c2631cd6c 100755 --- a/src/libprojectM/MilkdropPresetFactory/Param.hpp +++ b/src/libprojectM/MilkdropPresetFactory/Param.hpp @@ -58,6 +58,8 @@ class InitCond; class Param; class Preset; //#include +#include + /* Parameter Type */ class Param { @@ -72,6 +74,9 @@ public: CValue upper_bound; /* this parameter's upper bound */ CValue lower_bound; /* this parameter's lower bound */ + // for a local variable, engine_val can point here + float local_value; + /// Create a new parameter Param(std::string name, short int type, short int flags, void * eqn_val, void *matrix, diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index 01f44d509..e4b7b9db5 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -5,8 +5,6 @@ #include #include #include "Renderer/BeatDetect.hpp" -//#include // X86 SSE1 -//#include // X86 SSE2 #include @@ -34,12 +32,37 @@ void PresetInputs::update(const BeatDetect & music, const PipelineContext & cont } +float **alloc_mesh(size_t gx, size_t gy) +{ + // round gy up to multiple 4 (for possible SSE optimization) + gy = (gy+3) & ~(size_t)3; + + float **mesh = (float **)wipemalloc(gx * sizeof(float *)); + float *m = (float *)wipemalloc(gx * gy * sizeof(float)); + for ( int x = 0; x < gx; x++ ) + mesh[x] = m + (gy * x); + return mesh; +} + +float **free_mesh(float **mesh) +{ + free(mesh[0]); + free(mesh); + return NULL; +} + +void copy_mesh(float **dst, float **src, int gx, int gy) 
+{ + memcpy(dst[0], src[0], gx*gy*sizeof(float)); +} + + void PresetInputs::Initialize ( int gx, int gy ) { int x, y; - this->gx =gx; - this->gy= gy; + this->gx = gx; + this->gy = gy; /// @bug no clue if this block belongs here @@ -53,47 +76,14 @@ void PresetInputs::Initialize ( int gx, int gy ) ang_per_pixel = 0; // *** - this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->rad_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->rad_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->theta_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x theta_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - - this->origtheta= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->origtheta[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->origrad= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->origrad[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->origx= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->origx[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->origy= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->origy[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } + this->x_mesh = alloc_mesh(gx, gy); + this->y_mesh = alloc_mesh(gx, gy); + this->rad_mesh = alloc_mesh(gx, gy); + this->theta_mesh= alloc_mesh(gx, gy); + this->origtheta = alloc_mesh(gx, gy); + this->origrad = alloc_mesh(gx, gy); + this->origx = alloc_mesh(gx, gy); + this->origy = alloc_mesh(gx, 
gy); for ( x=0;xgx > 0); - for ( int x = 0; x < this->gx; x++ ) - { - free(this->sx_mesh[x]); - free(this->sy_mesh[x]); - free(this->dy_mesh[x]); - free(this->dx_mesh[x]); - free(this->cy_mesh[x]); - free(this->cx_mesh[x]); - - free(this->warp_mesh[x]); - free(this->zoom_mesh[x]); - free(this->zoomexp_mesh[x]); - free(this->rot_mesh[x]); - free(this->orig_x[x]); - free(this->orig_y[x]); - free(this->rad_mesh[x]); - } - - free(this->rad_mesh); - free(this->sx_mesh); - free(this->sy_mesh); - free(this->dy_mesh); - free(this->dx_mesh); - free(this->cy_mesh); - free(this->cx_mesh); - free(this->warp_mesh); - free(this->zoom_mesh); - free(this->zoomexp_mesh); - free(this->rot_mesh); - free(this->orig_x); - free(this->orig_y); + this->rad_mesh = free_mesh(this->rad_mesh); + this->sx_mesh = free_mesh(this->sx_mesh); + this->sy_mesh = free_mesh(this->sy_mesh); + this->dy_mesh = free_mesh(this->dy_mesh); + this->dx_mesh = free_mesh(this->dx_mesh); + this->cy_mesh = free_mesh(this->cy_mesh); + this->cx_mesh = free_mesh(this->cx_mesh); + this->warp_mesh = free_mesh(this->warp_mesh); + this->zoom_mesh = free_mesh(this->zoom_mesh); + this->zoomexp_mesh = free_mesh(this->zoomexp_mesh); + this->rot_mesh = free_mesh(this->rot_mesh); + this->orig_x = free_mesh(this->orig_x); + this->orig_y = free_mesh(this->orig_y); } @@ -265,10 +237,10 @@ inline __m128 _mm_pow(__m128 x, __m128 y) float Y[4]; _mm_store_ps(X,x); _mm_store_ps(Y,x); - X[0] = std::pow(X[0],Y[0]); - X[1] = std::pow(X[1],Y[1]); - X[2] = std::pow(X[2],Y[2]); - X[3] = std::pow(X[3],Y[3]); + X[0] = __builtin_powf(X[0],Y[0]); + X[1] = __builtin_powf(X[1],Y[1]); + X[2] = __builtin_powf(X[2],Y[2]); + X[3] = __builtin_powf(X[3],Y[3]); return _mm_load_ps(X); } inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx) @@ -316,7 +288,6 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) { // fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], // rad_mesh[x][y] * 2.0f - 1.0f)); - // 
fZoom2Inv = 1.0f / fZoom2; __m128 rad_mesh_scaled = _mm_sub_ps( _mm_mul_ps( @@ -326,14 +297,15 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) __m128 zoom_mesh = _mm_load_ps(&this->zoom_mesh[x][y]); __m128 zoomexp_mesh = _mm_load_ps(&this->zoomexp_mesh[x][y]); __m128 fZoom2 = _mm_pow(zoom_mesh, _mm_pow(zoomexp_mesh, rad_mesh_scaled)); - __m128 fZoomInv = _mm_mul_ps(_mm_rcp_ps(fZoom2), _mm_set_ps1(0.5f)); + // fZoom2Inv = 1.0f / fZoom2; + __m128 fZoomInv = _mm_rcp_ps(fZoom2); + // this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; - __m128 x_mesh = _mm_load_ps(&this->orig_x[x][y]); - x_mesh = + __m128 x_mesh = _mm_add_ps( _mm_mul_ps( _mm_load_ps(&this->orig_x[x][y]), - fZoomInv), + _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression _mm_set_ps1(0.5f)); // this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; __m128 cx_mesh = _mm_load_ps(&this->cx_mesh[x][y]); @@ -351,7 +323,7 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) _mm_add_ps( _mm_mul_ps( _mm_load_ps(&this->orig_y[x][y]), - fZoomInv), + _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), _mm_set_ps1(0.5f)); // this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; __m128 cy_mesh = _mm_load_ps(&this->cy_mesh[x][y]); @@ -481,165 +453,71 @@ void PresetOutputs::Initialize ( int gx, int gy ) { assert(gx > 0); - // round gx/gy up to multiple 4 (for possible SSE optimization) - this->gx = (gx+3) & ~(size_t)3; - this->gy = (gy+3) & ~(size_t)3; + this->gx = gx; + this->gy = gy; staticPerPixel = true; setStaticPerPixel(gx,gy); assert(this->gx > 0); int x; - this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - 
this->y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->sx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->sx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->sy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->sy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->dx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->dx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->dy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->dy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->cx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->cx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->cy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->cy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->zoom_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->zoom_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->zoomexp_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->zoomexp_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->rot_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->rot_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } + this->x_mesh = alloc_mesh( gx, gy ); + this->y_mesh = alloc_mesh( gx, gy ); + this->sx_mesh = alloc_mesh( gx, gy ); + this->sy_mesh = alloc_mesh( gx, gy ); + this->dx_mesh = alloc_mesh( gx, gy ); + this->dy_mesh = alloc_mesh( gx, gy ); + this->cx_mesh = alloc_mesh( gx, gy ); + this->cy_mesh = alloc_mesh( 
gx, gy ); + this->zoom_mesh = alloc_mesh( gx, gy ); + this->zoomexp_mesh = alloc_mesh( gx, gy ); + this->rot_mesh = alloc_mesh( gx, gy ); - this->warp_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->warp_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->rad_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( x = 0; x < gx; x++ ) - { - this->rad_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } - this->orig_x = (float **) wipemalloc(gx * sizeof(float *)); + this->warp_mesh = alloc_mesh( gx, gy ); + this->rad_mesh = alloc_mesh( gx, gy ); + this->orig_x = alloc_mesh( gx, gy ); + this->orig_y = alloc_mesh( gx, gy ); + + //initialize reference grid values for (x = 0; x < gx; x++) { - this->orig_x[x] = (float *) wipemalloc(gy * sizeof(float)); - } - this->orig_y = (float **) wipemalloc(gx * sizeof(float *)); - for (x = 0; x < gx; x++) - { - this->orig_y[x] = (float *) wipemalloc(gy * sizeof(float)); - } - - //initialize reference grid values - for (x = 0; x < gx; x++) + for (int y = 0; y < gy; y++) { - for (int y = 0; y < gy; y++) - { - float origx = x / (float) (gx - 1); - float origy = -((y / (float) (gy - 1)) - 1); + float origx = x / (float) (gx - 1); + float origy = -((y / (float) (gy - 1)) - 1); - rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 ) * .7071067; - orig_x[x][y] = (origx - .5) * 2; - orig_y[x][y] = (origy - .5) * 2; - } + rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 ) * .7071067; + orig_x[x][y] = (origx - .5) * 2; + orig_y[x][y] = (origy - .5) * 2; } + } } PresetInputs::~PresetInputs() { - for ( int x = 0; x < this->gx; x++ ) - { - free ( this->origtheta[x] ); - free ( this->origrad[x] ); - free ( this->origx[x] ); - free ( this->origy[x] ); - - free ( this->x_mesh[x] ); - free ( this->y_mesh[x] ); - free ( this->rad_mesh[x] ); - free ( this->theta_mesh[x] ); - } - - - free ( this->origx ); - free ( this->origy ); - free ( 
this->origrad ); - free ( this->origtheta ); - - free ( this->x_mesh ); - free ( this->y_mesh ); - free ( this->rad_mesh ); - free ( this->theta_mesh ); - - this->origx = NULL; - this->origy = NULL; - this->origtheta = NULL; - this->origrad = NULL; - - this->x_mesh = NULL; - this->y_mesh = NULL; - this->rad_mesh = NULL; - this->theta_mesh = NULL; + this->origx = free_mesh ( this->origx ); + this->origy = free_mesh ( this->origy ); + this->origrad = free_mesh ( this->origrad ); + this->origtheta = free_mesh ( this->origtheta ); + this->x_mesh = free_mesh ( this->x_mesh ); + this->y_mesh = free_mesh ( this->y_mesh ); + this->rad_mesh = free_mesh ( this->rad_mesh ); + this->theta_mesh = free_mesh ( this->theta_mesh ); } void PresetInputs::resetMesh() { - int x,y; - assert ( x_mesh ); assert ( y_mesh ); assert ( rad_mesh ); assert ( theta_mesh ); - for ( x=0;xgx;x++ ) - { - for ( y=0;ygy;y++ ) - { - x_mesh[x][y]=this->origx[x][y]; - y_mesh[x][y]=this->origy[x][y]; - rad_mesh[x][y]=this->origrad[x][y]; - theta_mesh[x][y]=this->origtheta[x][y]; - } - } + copy_mesh(this->x_mesh, this->origx, gx, gy); + copy_mesh(this->y_mesh, this->origy, gx, gy); + copy_mesh(this->rad_mesh, this->origrad, gx, gy); + copy_mesh(this->theta_mesh, this->origtheta, gx, gy); } diff --git a/src/libprojectM/Renderer/Pipeline.cpp b/src/libprojectM/Renderer/Pipeline.cpp index 3daf7ae86..e13742c91 100644 --- a/src/libprojectM/Renderer/Pipeline.cpp +++ b/src/libprojectM/Renderer/Pipeline.cpp @@ -11,37 +11,26 @@ Pipeline::Pipeline() : staticPerPixel(false),gx(0),gy(0),blur1n(1), blur2n(1), b blur1x(1), blur2x(1), blur3x(1), blur1ed(1){} +float **alloc_mesh(size_t gx, size_t gy); +float **free_mesh(float **mesh); + void Pipeline::setStaticPerPixel(int gx, int gy) { - staticPerPixel = true; - this->gx = gx; - this->gy = gy; - - this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( int x = 0; x < gx; x++ ) - { - this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } 
- this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) ); - for ( int x = 0; x < gx; x++ ) - { - this->y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) ); - } + staticPerPixel = true; + this->gx = gx; + this->gy = gy; + this->x_mesh = alloc_mesh(gx, gy); + this->y_mesh = alloc_mesh(gx, gy); } Pipeline::~Pipeline() { -if (staticPerPixel) -{ - for ( int x = 0; x < this->gx; x++ ) + if (staticPerPixel) { - free(this->x_mesh[x]); - free(this->y_mesh[x]); + free_mesh(x_mesh); + free_mesh(y_mesh); } - free(x_mesh); - free(y_mesh); -} } //void Pipeline::Render(const BeatDetect &music, const PipelineContext &context){} From a374bd93bc66a296918f6067c6ba70ba3b168078 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Sun, 20 May 2018 16:34:52 -0700 Subject: [PATCH 5/9] some comments --- .../MilkdropPresetFactory/PresetFrameIO.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index e4b7b9db5..bee2d3456 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -168,8 +168,11 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved. +// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync + void PresetOutputs::PerPixelMath_c(const PipelineContext &context) { + for (int x = 0; x < gx; x++) { for (int y = 0; y < gy; y++) @@ -280,6 +283,28 @@ inline __m128 _mm_cosf(__m128 x) } +/** + * SSE instructions let us do the math on 4 floats in parallel. You an see the main loop uses y += 4. Each time through the loop, + * we read operands in group of 4. This looks like a mess, but just think of it as rewriting the infix expressions as a prefix expression + * + * e.g. 
+ * this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f + * becomes + * __m128 x_mesh = + * _mm_add_ps( + * _mm_mul_ps( + * _mm_load_ps(&this->orig_x[x][y]), + * _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression + * _mm_set_ps1(0.5f)); + * + * _mm_load_ps loads an SSE register from memory (4 floats at a time) + * _mm_set_ps1 takes a constant 0.5 and loads it (replicated 4 times) + * * The other expressions are what they sound like: + * a + b --> _mm_add_ps(a, b) + * a * b --> _mm_mul_ps(a, b) + */ +// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync +// NOTE : Even better would be to rewrite this as a compute shader void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) { for (int x = 0; x < gx; x++) From 43070063c74b9eca5f356d840eb95608514e4d98 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Mon, 21 May 2018 21:00:57 -0700 Subject: [PATCH 6/9] TARGET_OS_MAC --- .../MilkdropPresetFactory/PresetFrameIO.cpp | 33 ++----------- src/libprojectM/wipemalloc.cpp | 48 +++++++++++++++++-- src/libprojectM/wipemalloc.h | 4 ++ 3 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index bee2d3456..f3ce4f8fc 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -37,8 +37,8 @@ float **alloc_mesh(size_t gx, size_t gy) // round gy up to multiple 4 (for possible SSE optimization) gy = (gy+3) & ~(size_t)3; - float **mesh = (float **)wipemalloc(gx * sizeof(float *)); - float *m = (float *)wipemalloc(gx * gy * sizeof(float)); + float **mesh = (float **)wipe_aligned_alloc(gx * sizeof(float *)); + float *m = (float *)wipe_aligned_alloc(gx * gy * sizeof(float)); for ( int x = 0; x < gx; x++ ) mesh[x] = m + (gy * x); return mesh; @@ -46,8 +46,8 @@ float **alloc_mesh(size_t gx, size_t gy) float **free_mesh(float **mesh) { - free(mesh[0]); - 
free(mesh); + wipe_aligned_free(mesh[0]); + wipe_aligned_free(mesh); return NULL; } @@ -168,11 +168,8 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved. -// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync - void PresetOutputs::PerPixelMath_c(const PipelineContext &context) { - for (int x = 0; x < gx; x++) { for (int y = 0; y < gy; y++) @@ -283,28 +280,6 @@ inline __m128 _mm_cosf(__m128 x) } -/** - * SSE instructions let us do the math on 4 floats in parallel. You an see the main loop uses y += 4. Each time through the loop, - * we read operands in group of 4. This looks like a mess, but just think of it as rewriting the infix expressions as a prefix expression - * - * e.g. - * this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f - * becomes - * __m128 x_mesh = - * _mm_add_ps( - * _mm_mul_ps( - * _mm_load_ps(&this->orig_x[x][y]), - * _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression - * _mm_set_ps1(0.5f)); - * - * _mm_load_ps loads an SSE register from memory (4 floats at a time) - * _mm_set_ps1 takes a constant 0.5 and loads it (replicated 4 times) - * * The other expressions are what they sound like: - * a + b --> _mm_add_ps(a, b) - * a * b --> _mm_mul_ps(a, b) - */ -// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync -// NOTE : Even better would be to rewrite this as a compute shader void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) { for (int x = 0; x < gx; x++) diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp index b1e196eb0..94b4affe6 100755 --- a/src/libprojectM/wipemalloc.cpp +++ b/src/libprojectM/wipemalloc.cpp @@ -25,17 +25,18 @@ */ #include "wipemalloc.h" +#include - void *wipemalloc( size_t count ) { - count = (count + 15) & ~(size_t)15; - void *mem = aligned_alloc( 16, count ); + void *wipemalloc( size_t count ) + { + 
void *mem = malloc( count ); if ( mem != NULL ) { memset( mem, 0, count ); } else { printf( "wipemalloc() failed to allocate %d bytes\n", (int)count ); } return mem; - } + } /** Safe memory deallocator */ void wipefree( void *ptr ) { @@ -43,3 +44,42 @@ free( ptr ); } } + +void *wipe_aligned_alloc( size_t align, size_t size ) +{ +#if TARGET_OS_MAC + // only support powers of 2 for align + assert( (align & (align-1)) == 0 ); + void *allocated = malloc(size + align - 1 + sizeof(void*)); + if (allocated == NULL) + { + printf( "wipe_aligned_malloc() failed to allocate %d bytes\n", (int)size ); + return NULL; + } + void *ret = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1)); + *((void**)((size_t)ret - sizeof(void*))) = allocated; + return ret; +#else + void *mem = aligned_alloc( align, size ); + if ( mem != NULL ) { + memset( mem, 0, size ); + } else { + printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size ); + } + return mem; +#endif +} + +void wipe_aligned_free( void *p ) +{ +#if TARGET_OS_MAC + if (p != NULL) + { + void *allocated = *((void**)((size_t)p - sizeof(void*))); + free(allocated); + } +#else + if (p != NULL) + free(p); +#endif +} diff --git a/src/libprojectM/wipemalloc.h b/src/libprojectM/wipemalloc.h index 6ff625d36..26b9fa0f2 100755 --- a/src/libprojectM/wipemalloc.h +++ b/src/libprojectM/wipemalloc.h @@ -57,4 +57,8 @@ void *wipemalloc( size_t count ); void wipefree( void *ptr ); +/** wipe_aligned_alloc() must be matched with wipe_aligned_free() */ + void *wipe_aligned_alloc( size_t align, size_t count); + inline void *wipe_aligned_alloc( size_t count ) { return wipe_aligned_alloc(16,count); } + void wipe_aligned_free( void *ptr ); #endif /** !_WIPEMALLOC_H */ From 6834d407dc92b99eeeee87cad6cd95cfe82b5a25 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Mon, 21 May 2018 22:22:39 -0700 Subject: [PATCH 7/9] __APPLE__ --- src/libprojectM/wipemalloc.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git
a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp index 94b4affe6..1501b31ab 100755 --- a/src/libprojectM/wipemalloc.cpp +++ b/src/libprojectM/wipemalloc.cpp @@ -47,7 +47,7 @@ void *wipe_aligned_alloc( size_t align, size_t size ) { -#if TARGET_OS_MAC +#if __APPLE__ // only support powers of 2 for align assert( (align & (align-1)) == 0 ); void *allocated = malloc(size + align - 1 + sizeof(void*)); @@ -72,7 +72,7 @@ void *wipe_aligned_alloc( size_t align, size_t size ) void wipe_aligned_free( void *p ) { -#if TARGET_OS_MAC +#if __APPLE__ if (p != NULL) { void *allocated = *((void**)((size_t)p - sizeof(void*))); From 541a22c71240b681edd8638865d9eb0283a121ac Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Tue, 22 May 2018 12:03:26 -0700 Subject: [PATCH 8/9] AC_CHECK_FUNCS_ONCE --- configure.ac | 2 + src/libprojectM/wipemalloc.cpp | 88 +++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/configure.ac b/configure.ac index eb582aa1a..25332b1c6 100644 --- a/configure.ac +++ b/configure.ac @@ -11,6 +11,8 @@ AX_CHECK_GL AC_CHECK_LIB(c, dlopen, LIBDL="", AC_CHECK_LIB(dl, dlopen, LIBDL="-ldl")) +AC_CHECK_FUNCS_ONCE([aligned_alloc posix_memalign]) + AC_CONFIG_HEADERS([config.h]) AC_CONFIG_FILES([ Makefile diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp index 1501b31ab..840df3609 100755 --- a/src/libprojectM/wipemalloc.cpp +++ b/src/libprojectM/wipemalloc.cpp @@ -27,59 +27,83 @@ #include "wipemalloc.h" #include - void *wipemalloc( size_t count ) - { + +void *wipemalloc( size_t count ) +{ void *mem = malloc( count ); - if ( mem != NULL ) { + if ( mem != NULL ) + { memset( mem, 0, count ); - } else { + } + else + { printf( "wipemalloc() failed to allocate %d bytes\n", (int)count ); - } + } return mem; - } +} + /** Safe memory deallocator */ - void wipefree( void *ptr ) { - if ( ptr != NULL ) { +void wipefree( void *ptr ) +{ + if ( ptr != NULL ) free( ptr ); - } - } +} + void 
*wipe_aligned_alloc( size_t align, size_t size ) { -#if __APPLE__ + void *mem = NULL; + +#if HAVE_ALIGNED_ALLOC==1 + + mem = aligned_alloc( align, size ); + +#elif HAVE_POSIX_MEMALIGN==1 + + if (posix_memalign(&mem, align, size)) + mem = NULL; + +#else + // only support powers of 2 for align assert( (align & (align-1)) == 0 ); + assert( (size % align) == 0 ); void *allocated = malloc(size + align - 1 + sizeof(void*)); - if (allocated == NULL) + if (allocated) { - printf( "wipe_aligned_malloc() failed to allocate %d bytes\n", (int)size ); - return NULL; + mem = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1)); + ((void**)mem)[-1] = allocated; } - void *ret = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1)); - *((void**)((size_t)ret - sizeof(void*))) = allocated; - return ret; -#else - void *mem = aligned_alloc( align, size ); - if ( mem != NULL ) { - memset( mem, 0, size ); - } else { - printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size ); - } - return mem; + #endif + + if (mem) + { + memset( mem, 0, size ); + } + else + { + printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size ); + } + return mem; } + void wipe_aligned_free( void *p ) { -#if __APPLE__ - if (p != NULL) - { - void *allocated = *((void**)((size_t)p - sizeof(void*))); - free(allocated); - } -#else +#if HAVE_ALIGNED_ALLOC==1 || HAVE_POSIX_MEMALIGN==1 + if (p != NULL) free(p); + +#else + + if (p != NULL) + { + void *allocated = ((void**)p)[-1]; + free(allocated); + } + #endif } From 5f8a525f9bba085c7b5d97f034ed3e8747fd1967 Mon Sep 17 00:00:00 2001 From: Matthew Bellew Date: Tue, 22 May 2018 12:45:07 -0700 Subject: [PATCH 9/9] fix signature for _mm_sincosf(), thanks Mischa --- src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp index 
f3ce4f8fc..b5374c290 100644 --- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp +++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp @@ -243,7 +243,7 @@ inline __m128 _mm_pow(__m128 x, __m128 y) X[3] = __builtin_powf(X[3],Y[3]); return _mm_load_ps(X); } -inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx) +inline void _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx) { float X[4], S[4], C[4]; _mm_store_ps(X,x);