From d1578229ce3b8a03209dd61563f63487e27c6ad6 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Tue, 15 May 2018 22:34:32 -0700
Subject: [PATCH 1/9] SSE version of PresetOutputs::PerPixelMath()

---
 .../MilkdropPresetFactory/PresetFrameIO.cpp   | 288 ++++++++++++++++--
 .../MilkdropPresetFactory/PresetFrameIO.hpp   |   6 +
 src/libprojectM/wipemalloc.cpp                |   3 +-
 3 files changed, 271 insertions(+), 26 deletions(-)
diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index af3ceeab2..725f5e912 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -5,6 +5,9 @@
 #include <iostream>
 #include <cmath>
 #include "Renderer/BeatDetect.hpp"
+//#include <xmmintrin.h> // X86 SSE1
+//#include <emmintrin.h> // X86 SSE2
+#include <immintrin.h>
 
 PresetInputs::PresetInputs() : PipelineContext()
 {
@@ -186,7 +189,7 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
 }
 
 // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved.
-void PresetOutputs::PerPixelMath(const PipelineContext &context)
+void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 {
 
 	int x, y;
@@ -200,25 +203,9 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context)
 					rad_mesh[x][y] * 2.0f - 1.0f));
 			fZoom2Inv = 1.0f / fZoom2;
 			this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
+			this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
 			this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
-		}
-	}
-
-	for (x = 0; x < gx; x++)
-	{
-		for (y = 0; y < gy; y++)
-		{
-			this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y])
-					/ this->sx_mesh[x][y] + this->cx_mesh[x][y];
-		}
-	}
-
-	for (x = 0; x < gx; x++)
-	{
-		for (y = 0; y < gy; y++)
-		{
-			this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y])
-					/ this->sy_mesh[x][y] + this->cy_mesh[x][y];
+			this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
 		}
 	}
 
@@ -234,6 +221,7 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context)
 	{
 		for (y = 0; y < gy; y++)
 		{
+#if 0
 			this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.333f
 					+ fWarpScaleInv * (this->orig_x[x][y] * f[0] - this->orig_y[x][y] * f[3]));
 			this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.375f
@@ -242,8 +230,23 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context)
 					- fWarpScaleInv * (this->orig_x[x][y] * f[1] - this->orig_y[x][y] * f[2]));
 			this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.825f
 					+ fWarpScaleInv * (this->orig_x[x][y] * f[0] + this->orig_y[x][y] * f[3]));
+#else
+			float orig_x = this->orig_x[x][y];
+			float orig_y = this->orig_y[x][y];
+			float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
+
+			this->x_mesh[x][y] += 
+				(warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) +
+				(warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2])));
+
+			this->y_mesh[x][y] += 
+				(warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) +
+				(warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3])));
+#endif
 		}
 	}
+
+#if 0
 	for (x = 0; x < gx; x++)
 	{
 		for (y = 0; y < gy; y++)
@@ -267,16 +270,254 @@ void PresetOutputs::PerPixelMath(const PipelineContext &context)
 	for (x = 0; x < gx; x++)
 		for (y = 0; y < gy; y++)
 			this->y_mesh[x][y] -= this->dy_mesh[x][y];
+#else
+	for (x = 0; x < gx; x++)
+	{
+		for (y = 0; y < gy; y++)
+		{
+			const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
+			const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
 
+			const float rot = this->rot_mesh[x][y];
+			const float cos_rot = cosf(rot);
+			const float sin_rot = sinf(rot);
+
+			this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y];
+			this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
+		}
+	}
+#endif
+}
+
+
+
+#ifdef __SSE2__
+
+// is there an SSE way to do this?
+inline __m128 _mm_pow(__m128 x, __m128 y)
+{
+	float X[4];
+	float Y[4];
+	_mm_store_ps(X,x);
+	_mm_store_ps(Y,x);
+	X[0] = std::pow(X[0],Y[0]);
+	X[1] = std::pow(X[1],Y[1]);
+	X[2] = std::pow(X[2],Y[2]);
+	X[3] = std::pow(X[3],Y[3]);
+	return _mm_load_ps(X);
+}
+inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx)
+{
+	float X[4], S[4], C[4];
+	_mm_store_ps(X,x);
+	S[0] = sinf(X[0]);
+	C[0] = cosf(X[0]);
+	S[1] = sinf(X[1]);
+	C[1] = cosf(X[1]);
+	S[2] = sinf(X[2]);
+	C[2] = cosf(X[2]);
+	S[3] = sinf(X[3]);
+	C[3] = cosf(X[3]);
+	sinx = _mm_load_ps(S);
+	cosx = _mm_load_ps(C);
+}
+inline __m128 _mm_sinf(__m128 x)
+{
+	float X[4];
+	_mm_store_ps(X,x);
+	X[0] = sinf(X[0]);
+	X[1] = sinf(X[1]);
+	X[2] = sinf(X[2]);
+	X[3] = sinf(X[3]);
+	return _mm_load_ps(X);
+}
+inline __m128 _mm_cosf(__m128 x)
+{
+	float X[4];
+	_mm_store_ps(X,x);
+	X[0] = cosf(X[0]);
+	X[1] = cosf(X[1]);
+	X[2] = cosf(X[2]);
+	X[3] = cosf(X[3]);
+	return _mm_load_ps(X);
+}
+
+
+void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
+{
+	for (int x = 0; x < gx; x++)
+	{
+		for (int y = 0; y < gy; y += 4)
+		{
+			// fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
+			// 		rad_mesh[x][y] * 2.0f - 1.0f));
+			// fZoom2Inv = 1.0f / fZoom2;
+			__m128 rad_mesh_scaled = 
+				_mm_sub_ps(
+					_mm_mul_ps(
+						_mm_load_ps(&this->rad_mesh[x][y]), 
+						_mm_set_ps1(2.0f)), 
+					_mm_set_ps1(1.0f));
+			__m128 zoom_mesh = _mm_load_ps(&this->zoom_mesh[x][y]);
+			__m128 zoomexp_mesh = _mm_load_ps(&this->zoomexp_mesh[x][y]);
+			__m128 fZoom2 = _mm_pow(zoom_mesh, _mm_pow(zoomexp_mesh, rad_mesh_scaled));
+			__m128 fZoomInv = _mm_mul_ps(_mm_rcp_ps(fZoom2), _mm_set_ps1(0.5f));
+			// this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
+			__m128 x_mesh = _mm_load_ps(&this->orig_x[x][y]); 
+			x_mesh = 
+				_mm_add_ps(
+					_mm_mul_ps(
+						_mm_load_ps(&this->orig_x[x][y]), 
+						fZoomInv),
+					_mm_set_ps1(0.5f));
+			// this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
+			__m128 cx_mesh = _mm_load_ps(&this->cx_mesh[x][y]);
+			__m128 sx_mesh = _mm_load_ps(&this->sx_mesh[x][y]);
+			_mm_store_ps(&this->x_mesh[x][y],
+				_mm_add_ps(
+					_mm_div_ps(
+						_mm_sub_ps(x_mesh,cx_mesh),
+						sx_mesh),
+					cx_mesh
+				));
+
+			// this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
+			__m128 y_mesh =  
+				_mm_add_ps(
+					_mm_mul_ps(
+						_mm_load_ps(&this->orig_y[x][y]), 
+						fZoomInv),
+					_mm_set_ps1(0.5f));
+			// this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
+			__m128 cy_mesh = _mm_load_ps(&this->cy_mesh[x][y]);
+			__m128 sy_mesh = _mm_load_ps(&this->sy_mesh[x][y]);
+			_mm_store_ps(&this->y_mesh[x][y],
+				_mm_add_ps(
+					_mm_div_ps(
+						_mm_sub_ps(y_mesh,cy_mesh),
+						sy_mesh),
+					cy_mesh
+				));
+		}
+	}
+
+	const float fWarpTime = context.time * this->fWarpAnimSpeed;
+	const float fWarpScaleInv = 1.0f / this->fWarpScale;
+	const float f[4] = 
+	{
+		11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10),
+		 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7),
+		10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3),
+		11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5)
+	};
+
+	for (int x = 0; x < gx; x++)
+	{
+		for (int y = 0; y < gy; y+=4)
+		{
+			//float orig_x = this->orig_x[x][y];
+			//float orig_y = this->orig_y[x][y];
+			//float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
+			const __m128 orig_x = _mm_load_ps(&this->orig_x[x][y]);
+			const __m128 orig_y = _mm_load_ps(&this->orig_y[x][y]);
+			const __m128 warp_mesh = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f));
+
+			// this->x_mesh[x][y] += 
+			// 	(warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) +
+			// 	(warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2])));
+			_mm_store_ps(&this->x_mesh[x][y],
+				_mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]),
+					_mm_add_ps(
+						_mm_mul_ps(warp_mesh, _mm_sinf(
+							_mm_add_ps(
+								_mm_set_ps1(fWarpTime*0.333f),
+								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
+									_mm_sub_ps(
+										_mm_mul_ps(orig_x, _mm_set_ps1(f[0])),
+										_mm_mul_ps(orig_y, _mm_set_ps1(f[3]))
+									))))),
+						_mm_mul_ps(warp_mesh, _mm_cosf(
+							_mm_sub_ps(
+								_mm_set_ps1(fWarpTime*0.753f),
+								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
+									_mm_sub_ps(
+										_mm_mul_ps(orig_x, _mm_set_ps1(f[1])),
+										_mm_mul_ps(orig_y, _mm_set_ps1(f[2]))
+									))))))));
+
+			// this->y_mesh[x][y] += 
+			// 	(warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) +
+			// 	(warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3])));
+			_mm_store_ps(&this->y_mesh[x][y],
+				_mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]),
+					_mm_add_ps(
+						_mm_mul_ps(warp_mesh, _mm_cosf(
+							_mm_sub_ps(
+								_mm_set_ps1(fWarpTime*0.375f),
+								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
+									_mm_add_ps(
+										_mm_mul_ps(orig_x, _mm_set_ps1(f[2])),
+										_mm_mul_ps(orig_y, _mm_set_ps1(f[1]))
+									))))),
+						_mm_mul_ps(warp_mesh, _mm_sinf(
+							_mm_add_ps(
+								_mm_set_ps1(fWarpTime*0.825f),
+								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
+									_mm_add_ps(
+										_mm_mul_ps(orig_x, _mm_set_ps1(f[0])),
+										_mm_mul_ps(orig_y, _mm_set_ps1(f[3]))
+									))))))));
+		}
+	}
+	for (int x = 0; x < gx; x++)
+	{
+		for (int y = 0; y < gy; y+=4)
+		{
+			// const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
+			// const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
+			const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y]));
+			const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y]));
+
+			// const float rot = this->rot_mesh[x][y];
+			// const float cos_rot = cosf(rot);
+			// const float sin_rot = sinf(rot);
+			__m128 sin_rot, cos_rot;
+			_mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot);
+
+			// this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y];
+			_mm_store_ps(&this->x_mesh[x][y],
+				_mm_add_ps(
+					_mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)),
+					_mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y]))
+					));
+			// this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
+			_mm_store_ps(&this->y_mesh[x][y],
+				_mm_add_ps(
+					_mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)),
+					_mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y]))
+					));
+		}
+	}
+}
+#endif
+
+void PresetOutputs::PerPixelMath(const PipelineContext &context)
+{
+#ifdef __SSE2__
+	PerPixelMath_sse(context);
+#else
+	PerPixelMath_c(context);
+#endif
 }
 
 
 void PresetOutputs::Initialize ( int gx, int gy )
 {
-
 	assert(gx > 0);
-	this->gx = gx;
-	this->gy= gy;
+
+	// round gx/gy up to multiple 4 (for possible SSE optimization)
+	this->gx = (gx+3) & ~(size_t)3;
+	this->gy = (gy+3) & ~(size_t)3;
 
 	staticPerPixel = true;
 	setStaticPerPixel(gx,gy);
@@ -379,8 +620,6 @@ PresetInputs::~PresetInputs()
 {
 	for ( int x = 0; x < this->gx; x++ )
 	{
-
-
 		free ( this->origtheta[x] );
 		free ( this->origrad[x] );
 		free ( this->origx[x] );
@@ -390,7 +629,6 @@ PresetInputs::~PresetInputs()
 		free ( this->y_mesh[x] );
 		free ( this->rad_mesh[x] );
 		free ( this->theta_mesh[x] );
-
 	}
 
 
diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp
index 229879c2f..35f42d8e7 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.hpp
@@ -138,6 +138,12 @@ public:
     float **orig_x;  //original mesh
     float **orig_y;
     float **rad_mesh;
+
+private:
+    void PerPixelMath_c( const PipelineContext &context);
+#ifdef __SSE2__
+    void PerPixelMath_sse( const PipelineContext &context);
+#endif
 };
 
 
diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp
index 14bcf7816..b1e196eb0 100755
--- a/src/libprojectM/wipemalloc.cpp
+++ b/src/libprojectM/wipemalloc.cpp
@@ -27,7 +27,8 @@
 #include "wipemalloc.h"
 
  void *wipemalloc( size_t count ) {
-    void *mem = malloc( count );
+    count = (count + 15) & ~(size_t)15;
+    void *mem = aligned_alloc( 16, count );
     if ( mem != NULL ) {
         memset( mem, 0, count );
       } else {

From 681b23d9f919fe3598e33b1f6e53e09e52d2836a Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Wed, 16 May 2018 09:31:45 -0700
Subject: [PATCH 2/9] unused member pcmBuffer

---
 src/projectM-sdl/pmSDL.cpp | 8 ++++++--
 src/projectM-sdl/pmSDL.hpp | 3 +--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/projectM-sdl/pmSDL.cpp b/src/projectM-sdl/pmSDL.cpp
index 0c171f1d8..8748d7dbc 100644
--- a/src/projectM-sdl/pmSDL.cpp
+++ b/src/projectM-sdl/pmSDL.cpp
@@ -96,13 +96,11 @@ int projectMSDL::openAudioInput() {
 void projectMSDL::beginAudioCapture() {
     // allocate a buffer to store PCM data for feeding in
     unsigned int maxSamples = audioChannelsCount * audioSampleCount;
-    pcmBuffer = (unsigned char *) malloc(maxSamples);
     SDL_PauseAudioDevice(audioDeviceID, false);
     pcm()->initPCM(2048);
 }
 
 void projectMSDL::endAudioCapture() {
-    free(pcmBuffer);
     SDL_PauseAudioDevice(audioDeviceID, true);
 }
 
@@ -236,3 +234,9 @@ void projectMSDL::init(SDL_Window *window, SDL_Renderer *renderer) {
     selectRandom(true);
     projectM_resetGL(width, height);
 }
+
+
+std::string projectMSDL::getActivePresetName()
+{
+    return std::string("hey");
+}
diff --git a/src/projectM-sdl/pmSDL.hpp b/src/projectM-sdl/pmSDL.hpp
index ff798586c..ea4eefeb2 100644
--- a/src/projectM-sdl/pmSDL.hpp
+++ b/src/projectM-sdl/pmSDL.hpp
@@ -44,6 +44,7 @@ public:
     void renderFrame();
     void pollEvent();
     void maximize();
+    std::string getActivePresetName();
 
 private:
     SDL_Window *win;
@@ -59,12 +60,10 @@ private:
     unsigned short audioSampleCount;
     SDL_AudioFormat audioFormat;
     SDL_AudioDeviceID audioDeviceID;
-    unsigned char *pcmBuffer;  // pre-allocated buffer for audioInputCallback
 
     static void audioInputCallbackF32(void *userdata, unsigned char *stream, int len);
     static void audioInputCallbackS16(void *userdata, unsigned char *stream, int len);
 
-
     void addFakePCM();
     void keyHandler(SDL_Event *);
     SDL_AudioDeviceID selectAudioInput(int count);

From 59ee73842864383c6e3fff358e93142897392176 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Wed, 16 May 2018 09:32:55 -0700
Subject: [PATCH 3/9] cleanup

---
 .../MilkdropPresetFactory/PresetFrameIO.cpp   | 132 +++++++-----------
 1 file changed, 49 insertions(+), 83 deletions(-)

diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index 725f5e912..01f44d509 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -9,10 +9,12 @@
 //#include <emmintrin.h> // X86 SSE2
 #include <immintrin.h>
 
+
 PresetInputs::PresetInputs() : PipelineContext()
 {
 }
 
+
 void PresetInputs::update(const BeatDetect & music, const PipelineContext & context) {
 
     // Reflect new values form the beat detection unit
@@ -31,6 +33,7 @@ void PresetInputs::update(const BeatDetect & music, const PipelineContext & cont
     this->progress = context.progress;
 }
 
+
 void PresetInputs::Initialize ( int gx, int gy )
 {
 	int x, y;
@@ -102,14 +105,13 @@ void PresetInputs::Initialize ( int gx, int gy )
 			this->origtheta[x][y]=atan2 ( ( ( this->origy[x][y]-.5 ) *2 ), ( ( this->origx[x][y]-.5 ) *2 ) );
 		}
 	}
-
-
-
 }
 
+
 PresetOutputs::PresetOutputs() : Pipeline()
 {}
 
+
 PresetOutputs::~PresetOutputs()
 {
 	assert(this->gx > 0);
@@ -132,22 +134,22 @@ PresetOutputs::~PresetOutputs()
 		free(this->rad_mesh[x]);
 	}
 
-		free(this->rad_mesh);
-		free(this->sx_mesh);
-		free(this->sy_mesh);
-		free(this->dy_mesh);
-		free(this->dx_mesh);
-		free(this->cy_mesh);
-		free(this->cx_mesh);
-		free(this->warp_mesh);
-		free(this->zoom_mesh);
-		free(this->zoomexp_mesh);
-		free(this->rot_mesh);
-		free(this->orig_x);
-		free(this->orig_y);
-
+	free(this->rad_mesh);
+	free(this->sx_mesh);
+	free(this->sy_mesh);
+	free(this->dy_mesh);
+	free(this->dx_mesh);
+	free(this->cy_mesh);
+	free(this->cx_mesh);
+	free(this->warp_mesh);
+	free(this->zoom_mesh);
+	free(this->zoomexp_mesh);
+	free(this->rot_mesh);
+	free(this->orig_x);
+	free(this->orig_y);
 }
 
+
 void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &context)
 {
 	PerPixelMath(context);
@@ -158,18 +160,22 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
 
 	for (PresetOutputs::cshape_container::iterator pos = customShapes.begin();
 			pos != customShapes.end(); ++pos)
-			{
-				if( (*pos)->enabled==1)	drawables.push_back((*pos));
-			}
+	{
+		if ((*pos)->enabled==1)
+			drawables.push_back((*pos));
+	}
 
 	for (PresetOutputs::cwave_container::iterator pos = customWaves.begin();
 			pos != customWaves.end(); ++pos)
-			{
-				if( (*pos)->enabled==1)	drawables.push_back((*pos));
-			}
+	{
+		if ((*pos)->enabled==1)
+			drawables.push_back((*pos));
+	}
 
-    	drawables.push_back(&wave);
-	if(bDarkenCenter==1) drawables.push_back(&darkenCenter);
+	drawables.push_back(&wave);
+
+	if (bDarkenCenter==1)
+		drawables.push_back(&darkenCenter);
 	drawables.push_back(&border);
 
 	compositeDrawables.clear();
@@ -188,20 +194,17 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
 		compositeDrawables.push_back(&invert);
 }
 
+
 // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved.
 void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 {
-
-	int x, y;
-	float fZoom2, fZoom2Inv;
-
-	for (x = 0; x < gx; x++)
+	for (int x = 0; x < gx; x++)
 	{
-		for (y = 0; y < gy; y++)
+		for (int y = 0; y < gy; y++)
 		{
-			fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
+			const float fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
 					rad_mesh[x][y] * 2.0f - 1.0f));
-			fZoom2Inv = 1.0f / fZoom2;
+			const float fZoom2Inv = 1.0f / fZoom2;
 			this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
 			this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
 			this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
@@ -209,71 +212,35 @@ void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 		}
 	}
 
-	float fWarpTime = context.time * this->fWarpAnimSpeed;
-	float fWarpScaleInv = 1.0f / this->fWarpScale;
+	const float fWarpTime = context.time * this->fWarpAnimSpeed;
+	const float fWarpScaleInv = 1.0f / this->fWarpScale;
 	float f[4];
 	f[0] = 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10);
 	f[1] = 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7);
 	f[2] = 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3);
 	f[3] = 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5);
 
-	for (x = 0; x < gx; x++)
+	for (int x = 0; x < gx; x++)
 	{
-		for (y = 0; y < gy; y++)
+		for (int y = 0; y < gy; y++)
 		{
-#if 0
-			this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.333f
-					+ fWarpScaleInv * (this->orig_x[x][y] * f[0] - this->orig_y[x][y] * f[3]));
-			this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.375f
-					- fWarpScaleInv * (this->orig_x[x][y] * f[2] + this->orig_y[x][y] * f[1]));
-			this->x_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * cosf(fWarpTime * 0.753f
-					- fWarpScaleInv * (this->orig_x[x][y] * f[1] - this->orig_y[x][y] * f[2]));
-			this->y_mesh[x][y] += this->warp_mesh[x][y] * 0.0035f * sinf(fWarpTime * 0.825f
-					+ fWarpScaleInv * (this->orig_x[x][y] * f[0] + this->orig_y[x][y] * f[3]));
-#else
-			float orig_x = this->orig_x[x][y];
-			float orig_y = this->orig_y[x][y];
-			float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
+			const float orig_x = this->orig_x[x][y];
+			const float orig_y = this->orig_y[x][y];
+			const float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
 
-			this->x_mesh[x][y] += 
+			this->x_mesh[x][y] +=
 				(warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) +
 				(warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2])));
 
-			this->y_mesh[x][y] += 
+			this->y_mesh[x][y] +=
 				(warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) +
 				(warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3])));
-#endif
 		}
 	}
 
-#if 0
-	for (x = 0; x < gx; x++)
+	for (int x = 0; x < gx; x++)
 	{
-		for (y = 0; y < gy; y++)
-		{
-			float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
-			float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
-
-			float cos_rot = cosf(this->rot_mesh[x][y]);
-			float sin_rot = sinf(this->rot_mesh[x][y]);
-
-			this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y];
-			this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y];
-
-		}
-	}
-
-	for (x = 0; x < gx; x++)
-		for (y = 0; y < gy; y++)
-			this->x_mesh[x][y] -= this->dx_mesh[x][y];
-
-	for (x = 0; x < gx; x++)
-		for (y = 0; y < gy; y++)
-			this->y_mesh[x][y] -= this->dy_mesh[x][y];
-#else
-	for (x = 0; x < gx; x++)
-	{
-		for (y = 0; y < gy; y++)
+		for (int y = 0; y < gy; y++)
 		{
 			const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
 			const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
@@ -286,11 +253,9 @@ void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 			this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
 		}
 	}
-#endif
 }
 
 
-
 #ifdef __SSE2__
 
 // is there an SSE way to do this?
@@ -501,6 +466,7 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 }
 #endif
 
+
 void PresetOutputs::PerPixelMath(const PipelineContext &context)
 {
 #ifdef __SSE2__
@@ -616,6 +582,7 @@ void PresetOutputs::Initialize ( int gx, int gy )
 		}
 }
 
+
 PresetInputs::~PresetInputs()
 {
 	for ( int x = 0; x < this->gx; x++ )
@@ -673,7 +640,6 @@ void PresetInputs::resetMesh()
 			theta_mesh[x][y]=this->origtheta[x][y];
 		}
 	}
-
 }
 
 

From 17942b79ace63e15c740a78edcf904df65057a99 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Sun, 20 May 2018 16:25:20 -0700
Subject: [PATCH 4/9] alloc mesh as one memory block

---
 .../MilkdropPresetFactory/Param.cpp           |  12 +-
 .../MilkdropPresetFactory/Param.hpp           |   5 +
 .../MilkdropPresetFactory/PresetFrameIO.cpp   | 314 ++++++------------
 src/libprojectM/Renderer/Pipeline.cpp         |  33 +-
 4 files changed, 114 insertions(+), 250 deletions(-)

diff --git a/src/libprojectM/MilkdropPresetFactory/Param.cpp b/src/libprojectM/MilkdropPresetFactory/Param.cpp
index ca21e8ffc..d49875ff3 100755
--- a/src/libprojectM/MilkdropPresetFactory/Param.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/Param.cpp
@@ -65,7 +65,7 @@ Param::Param(std::string _name) :
         matrix(0)
         {
 
-	engine_val = new float();
+	engine_val = (float *)&local_value;
 
 	default_init_val.float_val = DEFAULT_DOUBLE_IV;
         upper_bound.float_val = DEFAULT_DOUBLE_UB;
@@ -73,18 +73,10 @@ Param::Param(std::string _name) :
 
     /// @note may have fixed a recent bug. testing
     *((float*)engine_val) = default_init_val.float_val;
-
-   
-}
+ }
 
 /* Free's a parameter type */
 Param::~Param() {
-
-    // I hate this, but will let it be for now
-    if (flags & P_FLAG_USERDEF) {
-        delete((double*)engine_val);
-    }
-
     if (PARAM_DEBUG) printf("~Param: freeing \"%s\".\n", name.c_str());
 }
 
diff --git a/src/libprojectM/MilkdropPresetFactory/Param.hpp b/src/libprojectM/MilkdropPresetFactory/Param.hpp
index 1f6bcf272..c2631cd6c 100755
--- a/src/libprojectM/MilkdropPresetFactory/Param.hpp
+++ b/src/libprojectM/MilkdropPresetFactory/Param.hpp
@@ -58,6 +58,8 @@ class InitCond;
 class Param;
 class Preset;
 //#include <map>
+#include <immintrin.h>
+
 
 /* Parameter Type */
 class Param {
@@ -72,6 +74,9 @@ public:
     CValue upper_bound; /* this parameter's upper bound */
     CValue lower_bound; /* this parameter's lower bound */
 
+    // for a local variable, engine_val can point here
+    float local_value;
+
     /// Create a new parameter
     Param(std::string name, short int type, short int flags,
            void * eqn_val, void *matrix,
diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index 01f44d509..e4b7b9db5 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -5,8 +5,6 @@
 #include <iostream>
 #include <cmath>
 #include "Renderer/BeatDetect.hpp"
-//#include <xmmintrin.h> // X86 SSE1
-//#include <emmintrin.h> // X86 SSE2
 #include <immintrin.h>
 
 
@@ -34,12 +32,37 @@ void PresetInputs::update(const BeatDetect & music, const PipelineContext & cont
 }
 
 
+float **alloc_mesh(size_t gx, size_t gy)
+{
+	// Pad every row to a multiple of 4 floats (16 bytes) so the SSE loops can step 4 cells at a time; row stride is the rounded gy.
+	gy = (gy+3) & ~(size_t)3;
+
+	float **mesh = (float **)wipemalloc(gx * sizeof(float *));	// row-pointer table
+	float *m = (float *)wipemalloc(gx * gy * sizeof(float));	// one zeroed block holding all rows
+	for ( size_t x = 0; x < gx; x++ )	// size_t index: gx is unsigned, avoid signed/unsigned comparison
+		mesh[x] = m + (gy * x);
+	return mesh;
+}
+
+float **free_mesh(float **mesh)
+{
+	free(mesh[0]);
+	free(mesh);
+	return NULL;
+}
+
+void copy_mesh(float **dst, float **src, int gx, int gy)
+{	// Copies the gx x gy logical cells row by row: alloc_mesh() pads each row to a multiple of 4 floats, so rows are not gy floats apart.
+	for (int x = 0; x < gx; x++)	memcpy(dst[x], src[x], gy*sizeof(float));
+}
+
+
 void PresetInputs::Initialize ( int gx, int gy )
 {
 	int x, y;
 
-	this->gx =gx;
-	this->gy= gy;
+	this->gx = gx;
+	this->gy = gy;
 
 
 	/// @bug no clue if this block belongs here
@@ -53,47 +76,14 @@ void PresetInputs::Initialize ( int gx, int gy )
 	ang_per_pixel = 0;
 	// ***
 
-	this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x <gx; x++ )
-	{
-		this->y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->rad_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->rad_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->theta_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x <gx; x++ )
-	{
-		this->theta_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-
-	this->origtheta= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->origtheta[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->origrad= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->origrad[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->origx= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->origx[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->origy= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->origy[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
+	this->x_mesh    = alloc_mesh(gx, gy);
+	this->y_mesh    = alloc_mesh(gx, gy);
+	this->rad_mesh  = alloc_mesh(gx, gy);
+	this->theta_mesh= alloc_mesh(gx, gy);
+	this->origtheta = alloc_mesh(gx, gy);
+	this->origrad   = alloc_mesh(gx, gy);
+	this->origx     = alloc_mesh(gx, gy);
+	this->origy     = alloc_mesh(gx, gy);
 
 	for ( x=0;x<gx;x++ )
 	{
@@ -116,37 +106,19 @@ PresetOutputs::~PresetOutputs()
 {
 	assert(this->gx > 0);
 
-	for ( int x = 0; x < this->gx; x++ )
-	{
-		free(this->sx_mesh[x]);
-		free(this->sy_mesh[x]);
-		free(this->dy_mesh[x]);
-		free(this->dx_mesh[x]);
-		free(this->cy_mesh[x]);
-		free(this->cx_mesh[x]);
-
-		free(this->warp_mesh[x]);
-		free(this->zoom_mesh[x]);
-		free(this->zoomexp_mesh[x]);
-		free(this->rot_mesh[x]);
-		free(this->orig_x[x]);
-		free(this->orig_y[x]);
-		free(this->rad_mesh[x]);
-	}
-
-	free(this->rad_mesh);
-	free(this->sx_mesh);
-	free(this->sy_mesh);
-	free(this->dy_mesh);
-	free(this->dx_mesh);
-	free(this->cy_mesh);
-	free(this->cx_mesh);
-	free(this->warp_mesh);
-	free(this->zoom_mesh);
-	free(this->zoomexp_mesh);
-	free(this->rot_mesh);
-	free(this->orig_x);
-	free(this->orig_y);
+	this->rad_mesh = free_mesh(this->rad_mesh);
+	this->sx_mesh  = free_mesh(this->sx_mesh);
+	this->sy_mesh  = free_mesh(this->sy_mesh);
+	this->dy_mesh  = free_mesh(this->dy_mesh);
+	this->dx_mesh  = free_mesh(this->dx_mesh);
+	this->cy_mesh  = free_mesh(this->cy_mesh);
+	this->cx_mesh  = free_mesh(this->cx_mesh);
+	this->warp_mesh = free_mesh(this->warp_mesh);
+	this->zoom_mesh = free_mesh(this->zoom_mesh);
+	this->zoomexp_mesh = free_mesh(this->zoomexp_mesh);
+	this->rot_mesh = free_mesh(this->rot_mesh);
+	this->orig_x   = free_mesh(this->orig_x);
+	this->orig_y   = free_mesh(this->orig_y);
 }
 
 
@@ -265,10 +237,10 @@ inline __m128 _mm_pow(__m128 x, __m128 y)
 	float Y[4];
 	_mm_store_ps(X,x);
-	_mm_store_ps(Y,x);
-	X[0] = std::pow(X[0],Y[0]);
-	X[1] = std::pow(X[1],Y[1]);
-	X[2] = std::pow(X[2],Y[2]);
-	X[3] = std::pow(X[3],Y[3]);
+	_mm_store_ps(Y,y);	// exponents come from y; storing x here made every lane compute x^x
+	X[0] = __builtin_powf(X[0],Y[0]);
+	X[1] = __builtin_powf(X[1],Y[1]);
+	X[2] = __builtin_powf(X[2],Y[2]);
+	X[3] = __builtin_powf(X[3],Y[3]);
 	return _mm_load_ps(X);
 }
 inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx)
@@ -316,7 +288,6 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 		{
 			// fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
 			// 		rad_mesh[x][y] * 2.0f - 1.0f));
-			// fZoom2Inv = 1.0f / fZoom2;
 			__m128 rad_mesh_scaled = 
 				_mm_sub_ps(
 					_mm_mul_ps(
@@ -326,14 +297,15 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 			__m128 zoom_mesh = _mm_load_ps(&this->zoom_mesh[x][y]);
 			__m128 zoomexp_mesh = _mm_load_ps(&this->zoomexp_mesh[x][y]);
 			__m128 fZoom2 = _mm_pow(zoom_mesh, _mm_pow(zoomexp_mesh, rad_mesh_scaled));
-			__m128 fZoomInv = _mm_mul_ps(_mm_rcp_ps(fZoom2), _mm_set_ps1(0.5f));
+			// fZoom2Inv = 1.0f / fZoom2;
+			__m128 fZoomInv = _mm_rcp_ps(fZoom2);
+
 			// this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
-			__m128 x_mesh = _mm_load_ps(&this->orig_x[x][y]); 
-			x_mesh = 
+			__m128 x_mesh = 
 				_mm_add_ps(
 					_mm_mul_ps(
 						_mm_load_ps(&this->orig_x[x][y]), 
-						fZoomInv),
+						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),		// CONSIDER: common sub-expression
 					_mm_set_ps1(0.5f));
 			// this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
 			__m128 cx_mesh = _mm_load_ps(&this->cx_mesh[x][y]);
@@ -351,7 +323,7 @@ void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 				_mm_add_ps(
 					_mm_mul_ps(
 						_mm_load_ps(&this->orig_y[x][y]), 
-						fZoomInv),
+						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),
 					_mm_set_ps1(0.5f));
 			// this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
 			__m128 cy_mesh = _mm_load_ps(&this->cy_mesh[x][y]);
@@ -481,165 +453,71 @@ void PresetOutputs::Initialize ( int gx, int gy )
 {
 	assert(gx > 0);
 
-	// round gx/gy up to multiple 4 (for possible SSE optimization)
-	this->gx = (gx+3) & ~(size_t)3;
-	this->gy = (gy+3) & ~(size_t)3;
+	this->gx = gx;
+	this->gy = gy;
 
 	staticPerPixel = true;
 	setStaticPerPixel(gx,gy);
 
 	assert(this->gx > 0);
 	int x;
-	this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->sx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->sx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->sy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->sy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->dx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->dx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->dy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->dy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->cx_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->cx_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->cy_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->cy_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->zoom_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->zoom_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->zoomexp_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->zoomexp_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->rot_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->rot_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
+	this->x_mesh  = alloc_mesh( gx, gy );
+	this->y_mesh  = alloc_mesh( gx, gy );
+	this->sx_mesh = alloc_mesh( gx, gy );
+	this->sy_mesh = alloc_mesh( gx, gy );
+	this->dx_mesh = alloc_mesh( gx, gy );
+	this->dy_mesh = alloc_mesh( gx, gy );
+	this->cx_mesh = alloc_mesh( gx, gy );
+	this->cy_mesh = alloc_mesh( gx, gy );
+	this->zoom_mesh = alloc_mesh( gx, gy );
+	this->zoomexp_mesh = alloc_mesh( gx, gy );
+	this->rot_mesh = alloc_mesh( gx, gy );
 
-	this->warp_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->warp_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->rad_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-	for ( x = 0; x < gx; x++ )
-	{
-		this->rad_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-	}
-	this->orig_x = (float **) wipemalloc(gx * sizeof(float *));
+	this->warp_mesh = alloc_mesh( gx, gy );
+	this->rad_mesh = alloc_mesh( gx, gy );
+	this->orig_x  = alloc_mesh( gx, gy );
+	this->orig_y  = alloc_mesh( gx, gy );
+
+	//initialize reference grid values
 	for (x = 0; x < gx; x++)
 	{
-		this->orig_x[x] = (float *) wipemalloc(gy * sizeof(float));
-	}
-	this->orig_y = (float **) wipemalloc(gx * sizeof(float *));
-	for (x = 0; x < gx; x++)
-	{
-		this->orig_y[x] = (float *) wipemalloc(gy * sizeof(float));
-	}
-
-		//initialize reference grid values
-		for (x = 0; x < gx; x++)
+		for (int y = 0; y < gy; y++)
 		{
-			for (int y = 0; y < gy; y++)
-			{
-				float origx = x / (float) (gx - 1);
-				float origy = -((y / (float) (gy - 1)) - 1);
+			float origx = x / (float) (gx - 1);
+			float origy = -((y / (float) (gy - 1)) - 1);
 
-				rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 ) * .7071067;
-				orig_x[x][y] = (origx - .5) * 2;
-				orig_y[x][y] = (origy - .5) * 2;
-			}
+			rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 ) * .7071067;
+			orig_x[x][y] = (origx - .5) * 2;
+			orig_y[x][y] = (origy - .5) * 2;
 		}
+	}
 }
 
 
 PresetInputs::~PresetInputs()
 {
-	for ( int x = 0; x < this->gx; x++ )
-	{
-		free ( this->origtheta[x] );
-		free ( this->origrad[x] );
-		free ( this->origx[x] );
-		free ( this->origy[x] );
-
-		free ( this->x_mesh[x] );
-		free ( this->y_mesh[x] );
-		free ( this->rad_mesh[x] );
-		free ( this->theta_mesh[x] );
-	}
-
-
-	free ( this->origx );
-	free ( this->origy );
-	free ( this->origrad );
-	free ( this->origtheta );
-
-	free ( this->x_mesh );
-	free ( this->y_mesh );
-	free ( this->rad_mesh );
-	free ( this->theta_mesh );
-
-	this->origx = NULL;
-	this->origy = NULL;
-	this->origtheta = NULL;
-	this->origrad = NULL;
-
-	this->x_mesh = NULL;
-	this->y_mesh = NULL;
-	this->rad_mesh = NULL;
-	this->theta_mesh = NULL;
+	this->origx = free_mesh ( this->origx );
+	this->origy = free_mesh ( this->origy );
+	this->origrad = free_mesh ( this->origrad );
+	this->origtheta = free_mesh ( this->origtheta );
+	this->x_mesh = free_mesh ( this->x_mesh );
+	this->y_mesh = free_mesh ( this->y_mesh );
+	this->rad_mesh = free_mesh ( this->rad_mesh );
+	this->theta_mesh = free_mesh ( this->theta_mesh );
 }
 
 
 void PresetInputs::resetMesh()
 {
-	int x,y;
-
 	assert ( x_mesh );
 	assert ( y_mesh );
 	assert ( rad_mesh );
 	assert ( theta_mesh );
 
-	for ( x=0;x<this->gx;x++ )
-	{
-		for ( y=0;y<this->gy;y++ )
-		{
-			x_mesh[x][y]=this->origx[x][y];
-			y_mesh[x][y]=this->origy[x][y];
-			rad_mesh[x][y]=this->origrad[x][y];
-			theta_mesh[x][y]=this->origtheta[x][y];
-		}
-	}
+	copy_mesh(this->x_mesh, this->origx, gx, gy);
+	copy_mesh(this->y_mesh, this->origy, gx, gy);
+	copy_mesh(this->rad_mesh, this->origrad, gx, gy);
+	copy_mesh(this->theta_mesh, this->origtheta, gx, gy);
 }
 
 
diff --git a/src/libprojectM/Renderer/Pipeline.cpp b/src/libprojectM/Renderer/Pipeline.cpp
index 3daf7ae86..e13742c91 100644
--- a/src/libprojectM/Renderer/Pipeline.cpp
+++ b/src/libprojectM/Renderer/Pipeline.cpp
@@ -11,37 +11,26 @@ Pipeline::Pipeline() : staticPerPixel(false),gx(0),gy(0),blur1n(1), blur2n(1), b
 blur1x(1), blur2x(1), blur3x(1),
 blur1ed(1){}
 
+float **alloc_mesh(size_t gx, size_t gy);
+float **free_mesh(float **mesh);
+
 void Pipeline::setStaticPerPixel(int gx, int gy)
 {
-	 staticPerPixel = true;
-	 this->gx = gx;
-	 this->gy = gy;
-
-		this->x_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-		for ( int x = 0; x < gx; x++ )
-		{
-			this->x_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-		}
-		this->y_mesh= ( float ** ) wipemalloc ( gx * sizeof ( float * ) );
-		for ( int x = 0; x < gx; x++ )
-		{
-			this->y_mesh[x] = ( float * ) wipemalloc ( gy * sizeof ( float ) );
-		}
+	staticPerPixel = true;
+	this->gx = gx;
+	this->gy = gy;
 
+	this->x_mesh = alloc_mesh(gx, gy);
+	this->y_mesh = alloc_mesh(gx, gy);
 }
 
 Pipeline::~Pipeline()
 {
-if (staticPerPixel)
-{
-	for ( int x = 0; x < this->gx; x++ )
+	if (staticPerPixel)
 	{
-		free(this->x_mesh[x]);
-		free(this->y_mesh[x]);
+		free_mesh(x_mesh);
+		free_mesh(y_mesh);
 	}
-	free(x_mesh);
-	free(y_mesh);
-}
 }
 
 //void Pipeline::Render(const BeatDetect &music, const PipelineContext &context){}

From a374bd93bc66a296918f6067c6ba70ba3b168078 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Sun, 20 May 2018 16:34:52 -0700
Subject: [PATCH 5/9] some comments

---
 .../MilkdropPresetFactory/PresetFrameIO.cpp   | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index e4b7b9db5..bee2d3456 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -168,8 +168,11 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
 
 
 // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved.
+// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync
+
 void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 {
+
 	for (int x = 0; x < gx; x++)
 	{
 		for (int y = 0; y < gy; y++)
@@ -280,6 +283,28 @@ inline __m128 _mm_cosf(__m128 x)
 }
 
 
+/**
+ * SSE instructions let us do the math on 4 floats in parallel.  You an see the main loop uses y += 4.  Each time through the loop,
+ * we read operands in group of 4.  This looks like a mess, but just think of it as rewriting the infix expressions as a prefix expression
+ * 
+ * e.g.
+ *   this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f
+ * becomes
+ *			__m128 x_mesh = 
+ *				_mm_add_ps(
+ *					_mm_mul_ps(
+ *						_mm_load_ps(&this->orig_x[x][y]), 
+ *						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),		// CONSIDER: common sub-expression
+ *					_mm_set_ps1(0.5f));
+ *
+ * _mm_load_ps loads an SSE register from memory (4 floats at a time)
+ * _mm_set_ps1 takes a constant 0.5 and loads it (replicated 4 times)
+ *  * The other expressions are what they sound like:
+ *    a + b --> _mm_add_ps(a, b)
+ *    a * b --> _mm_mul_ps(a, b)
+ */
+// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync
+// NOTE : Even better would be to rewrite this as a compute shader
 void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 {
 	for (int x = 0; x < gx; x++)

From 43070063c74b9eca5f356d840eb95608514e4d98 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Mon, 21 May 2018 21:00:57 -0700
Subject: [PATCH 6/9] TARGET_OS_MAC

---
 .../MilkdropPresetFactory/PresetFrameIO.cpp   | 33 ++-----------
 src/libprojectM/wipemalloc.cpp                | 48 +++++++++++++++++--
 src/libprojectM/wipemalloc.h                  |  4 ++
 3 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index bee2d3456..f3ce4f8fc 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -37,8 +37,8 @@ float **alloc_mesh(size_t gx, size_t gy)
 	// round gy up to multiple 4 (for possible SSE optimization) 
 	gy = (gy+3) & ~(size_t)3;
 
-	float **mesh = (float **)wipemalloc(gx * sizeof(float *));
-	float *m = (float *)wipemalloc(gx * gy * sizeof(float));
+	float **mesh = (float **)wipe_aligned_alloc(gx * sizeof(float *));
+	float *m = (float *)wipe_aligned_alloc(gx * gy * sizeof(float));
 	for ( int x = 0; x < gx; x++ )
 		mesh[x] = m + (gy * x);
 	return mesh;
@@ -46,8 +46,8 @@ float **alloc_mesh(size_t gx, size_t gy)
 
 float **free_mesh(float **mesh)
 {
-	free(mesh[0]);
-	free(mesh);
+	wipe_aligned_free(mesh[0]);
+	wipe_aligned_free(mesh);
 	return NULL;
 }
 
@@ -168,11 +168,8 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
 
 
 // N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved.
-// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync
-
 void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
 {
-
 	for (int x = 0; x < gx; x++)
 	{
 		for (int y = 0; y < gy; y++)
@@ -283,28 +280,6 @@ inline __m128 _mm_cosf(__m128 x)
 }
 
 
-/**
- * SSE instructions let us do the math on 4 floats in parallel.  You an see the main loop uses y += 4.  Each time through the loop,
- * we read operands in group of 4.  This looks like a mess, but just think of it as rewriting the infix expressions as a prefix expression
- * 
- * e.g.
- *   this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f
- * becomes
- *			__m128 x_mesh = 
- *				_mm_add_ps(
- *					_mm_mul_ps(
- *						_mm_load_ps(&this->orig_x[x][y]), 
- *						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),		// CONSIDER: common sub-expression
- *					_mm_set_ps1(0.5f));
- *
- * _mm_load_ps loads an SSE register from memory (4 floats at a time)
- * _mm_set_ps1 takes a constant 0.5 and loads it (replicated 4 times)
- *  * The other expressions are what they sound like:
- *    a + b --> _mm_add_ps(a, b)
- *    a * b --> _mm_mul_ps(a, b)
- */
-// NOTE : Keep PerPixelMath_sse and PerPixelMath_c in sync
-// NOTE : Even better would be to rewrite this as a compute shader
 void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
 {
 	for (int x = 0; x < gx; x++)
diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp
index b1e196eb0..94b4affe6 100755
--- a/src/libprojectM/wipemalloc.cpp
+++ b/src/libprojectM/wipemalloc.cpp
@@ -25,17 +25,18 @@
  */
 
 #include "wipemalloc.h"
+#include <assert.h>
 
- void *wipemalloc( size_t count ) {
-    count = (count + 15) & ~(size_t)15;
-    void *mem = aligned_alloc( 16, count );
+ void *wipemalloc( size_t count )
+ {
+    void *mem = malloc( count );
     if ( mem != NULL ) {
         memset( mem, 0, count );
       } else {
         printf( "wipemalloc() failed to allocate %d bytes\n", (int)count );
       }
     return mem;
-  }
+ }
 
 /** Safe memory deallocator */
  void wipefree( void *ptr ) {
@@ -43,3 +44,42 @@
         free( ptr );
       }
   }
+
+void *wipe_aligned_alloc( size_t align, size_t size )
+{
+#if TARGET_OS_MAC
+    // only support powers of 2 for align
+    assert( (align & (align-1)) == 0 );
+    void *allocated = malloc(size + align - 1 + sizeof(void*));
+    if (allocated == NULL)
+    {
+        printf( "wipe_aligned_malloc() failed to allocate %d bytes\n", (int)size );
+        return NULL;
+    }
+    void *ret = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1));
+    *((void**)((size_t)ret - sizeof(void*))) = allocated;
+    return ret;
+#else
+    void *mem = aligned_alloc( align, size );
+    if ( mem != NULL ) {
+        memset( mem, 0, size );
+      } else {
+        printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size );
+      }
+    return mem;
+#endif
+}
+
+void wipe_aligned_free( void *p )
+{
+#if TARGET_OS_MAC
+    if (p != NULL)
+    {
+        void *allocated = *((void**)((size_t)p - sizeof(void*)));
+        free(allocated);
+    }
+#else
+    if (p != NULL)
+        free(p);
+#endif
+}
diff --git a/src/libprojectM/wipemalloc.h b/src/libprojectM/wipemalloc.h
index 6ff625d36..26b9fa0f2 100755
--- a/src/libprojectM/wipemalloc.h
+++ b/src/libprojectM/wipemalloc.h
@@ -57,4 +57,8 @@
  void *wipemalloc( size_t count );
  void wipefree( void *ptr );
 
+/** wipe_aligned_alloc() must be matched with wipe_aligned_free() */
+ void *wipe_aligned_alloc( size_t align, size_t count);
+ inline void *wipe_aligned_alloc( size_t count ) { return wipe_aligned_alloc(16,count); }
+ void wipe_aligned_free( void *ptr );
 #endif /** !_WIPEMALLOC_H */

From 6834d407dc92b99eeeee87cad6cd95cfe82b5a25 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthew@macmini.bellew.net>
Date: Mon, 21 May 2018 22:22:39 -0700
Subject: [PATCH 7/9] __APPLE__

---
 src/libprojectM/wipemalloc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp
index 94b4affe6..1501b31ab 100755
--- a/src/libprojectM/wipemalloc.cpp
+++ b/src/libprojectM/wipemalloc.cpp
@@ -47,7 +47,7 @@
 
 void *wipe_aligned_alloc( size_t align, size_t size )
 {
-#if TARGET_OS_MAC
+#if __APPLE__
     // only support powers of 2 for align
     assert( (align & (align-1)) == 0 );
     void *allocated = malloc(size + align - 1 + sizeof(void*));
@@ -72,7 +72,7 @@ void *wipe_aligned_alloc( size_t align, size_t size )
 
 void wipe_aligned_free( void *p )
 {
-#if TARGET_OS_MAC
+#if __APPLE__
     if (p != NULL)
     {
         void *allocated = *((void**)((size_t)p - sizeof(void*)));

From 541a22c71240b681edd8638865d9eb0283a121ac Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Tue, 22 May 2018 12:03:26 -0700
Subject: [PATCH 8/9] AC_CHECK_FUNCS_ONCE

---
 configure.ac                   |  2 +
 src/libprojectM/wipemalloc.cpp | 88 +++++++++++++++++++++-------------
 2 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/configure.ac b/configure.ac
index eb582aa1a..25332b1c6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -11,6 +11,8 @@ AX_CHECK_GL
 
 AC_CHECK_LIB(c, dlopen, LIBDL="", AC_CHECK_LIB(dl, dlopen, LIBDL="-ldl"))
 
+AC_CHECK_FUNCS_ONCE([aligned_alloc posix_memalign])
+
 AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([
   Makefile
diff --git a/src/libprojectM/wipemalloc.cpp b/src/libprojectM/wipemalloc.cpp
index 1501b31ab..840df3609 100755
--- a/src/libprojectM/wipemalloc.cpp
+++ b/src/libprojectM/wipemalloc.cpp
@@ -27,59 +27,83 @@
 #include "wipemalloc.h"
 #include <assert.h>
 
- void *wipemalloc( size_t count )
- {
+
+void *wipemalloc( size_t count )
+{
     void *mem = malloc( count );
-    if ( mem != NULL ) {
+    if ( mem != NULL )
+    {
         memset( mem, 0, count );
-      } else {
+    }
+    else
+    {
         printf( "wipemalloc() failed to allocate %d bytes\n", (int)count );
-      }
+    }
     return mem;
- }
+}
+
 
 /** Safe memory deallocator */
- void wipefree( void *ptr ) {
-    if ( ptr != NULL ) {
+void wipefree( void *ptr )
+{
+    if ( ptr != NULL )
         free( ptr );
-      }
-  }
+}
+
 
 void *wipe_aligned_alloc( size_t align, size_t size )
 {
-#if __APPLE__
+    void *mem = NULL;
+
+#if HAVE_ALIGNED_ALLOC==1
+
+    mem = aligned_alloc( align, size );
+
+#elif HAVE_POSIX_MEMALIGN==1
+
+    if (posix_memalign(&mem, align, size))
+      mem = NULL;
+
+#else
+
     // only support powers of 2 for align
     assert( (align & (align-1)) == 0 );
+    assert( (size % align) == 0 );
     void *allocated = malloc(size + align - 1 + sizeof(void*));
-    if (allocated == NULL)
+    if (allocated)
     {
-        printf( "wipe_aligned_malloc() failed to allocate %d bytes\n", (int)size );
-        return NULL;
+        mem = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1));
+        ((void**)mem)[-1] = allocated;
     }
-    void *ret = (void*) (((size_t)allocated + sizeof(void*) + align -1) & ~(align-1));
-    *((void**)((size_t)ret - sizeof(void*))) = allocated;
-    return ret;
-#else
-    void *mem = aligned_alloc( align, size );
-    if ( mem != NULL ) {
-        memset( mem, 0, size );
-      } else {
-        printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size );
-      }
-    return mem;
+
 #endif
+
+    if (mem)
+    {
+        memset( mem, 0, size );
+    }
+    else
+    {
+        printf( "wipe_aligned_alloc() failed to allocate %d bytes\n", (int)size );
+    }
+    return mem;
 }
 
+
 void wipe_aligned_free( void *p )
 {
-#if __APPLE__
-    if (p != NULL)
-    {
-        void *allocated = *((void**)((size_t)p - sizeof(void*)));
-        free(allocated);
-    }
-#else
+#if HAVE_ALIGNED_ALLOC==1 || HAVE_POSIX_MEMALIGN==1
+
     if (p != NULL)
         free(p);
+
+#else
+
+    if (p != NULL)
+    {
+        void *allocated = ((void**)p)[-1];
+        free(allocated);
+    }
+
 #endif
 }

From 5f8a525f9bba085c7b5d97f034ed3e8747fd1967 Mon Sep 17 00:00:00 2001
From: Matthew Bellew <matthewb@labkey.com>
Date: Tue, 22 May 2018 12:45:07 -0700
Subject: [PATCH 9/9] fix signature for _mm_sincosf(), thanks Mischa

---
 src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
index f3ce4f8fc..b5374c290 100644
--- a/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
+++ b/src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
@@ -243,7 +243,7 @@ inline __m128 _mm_pow(__m128 x, __m128 y)
 	X[3] = __builtin_powf(X[3],Y[3]);
 	return _mm_load_ps(X);
 }
-inline __m128 _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx)
+inline void _mm_sincosf(__m128 x, __m128 &sinx, __m128 &cosx)
 {
 	float X[4], S[4], C[4];
 	_mm_store_ps(X,x);