From fc8040d695644aaca4596adebeca4ea1369ef630 Mon Sep 17 00:00:00 2001
From: Fox Caminiti <fox@foxcam.net>
Date: Fri, 22 Jul 2022 20:45:08 -0400
Subject: first

---
 effects.cpp | 777 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 777 insertions(+)
 create mode 100644 effects.cpp

(limited to 'effects.cpp')

diff --git a/effects.cpp b/effects.cpp
new file mode 100644
index 0000000..fe593a4
--- /dev/null
+++ b/effects.cpp
@@ -0,0 +1,777 @@
+// Blends the effect color (Property[0]) over every pixel of Buffer->EffectBuffer using the blend
+// mode in Property[1]. The color's alpha weights the blended value against the destination and the
+// destination's own alpha byte is written back unchanged; Memory is not used. Pixels are addressed
+// as 4x4 tiles of 16, and each iteration processes two adjacent tile rows (8 pixels) at once.
+// NOTE(review): presumably Width and Height are padded to multiples of 4 — confirm with the allocator.
+internal void
+DrawColor(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    v4 FloatColor = Property[0].CurrentValue.col;
+    blend_mode BlendMode = Property[1].CurrentValue.blendmode;
+
+    __m256 ZeroReal = _mm256_set1_ps(0);
+    __m256 ZeroPointFive = _mm256_set1_ps(0.5);
+    __m256 One = _mm256_set1_ps(1);
+    __m256 Two = _mm256_set1_ps(2);
+    __m256 Fraction255 = _mm256_set1_ps(1/255.0f);
+    __m256 Real255  = _mm256_set1_ps(255);
+    __m256i FF = _mm256_set1_epi32(0xFF);
+
+    __m256 Alpha  = _mm256_set1_ps(FloatColor.a);
+    __m256 AlphaInv  = _mm256_set1_ps(1.0f - FloatColor.a);
+
+    __m256 R_Col    = _mm256_set1_ps(FloatColor.E[0]);
+    __m256 R_Colx2  = _mm256_mul_ps(R_Col, Two);
+    __m256 R_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[0]);
+
+    __m256 G_Col    = _mm256_set1_ps(FloatColor.E[1]);
+    __m256 G_Colx2  = _mm256_mul_ps(G_Col, Two);
+    __m256 G_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[1]);
+
+    __m256 B_Col    = _mm256_set1_ps(FloatColor.E[2]);
+    __m256 B_Colx2  = _mm256_mul_ps(B_Col, Two);
+    __m256 B_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[2]);
+
+    for (int16 Y = 0; Y < Buffer->Height; Y += 2)
+    {
+        for (int16 X = 0; X < Buffer->Width; X += 4)
+        {
+            uint32 XLookup = (X >> 2)*16 + (X % 4);
+            uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4;
+            uint32 PixelToSeek = XLookup + YLookup;
+            uint8 *Pixel = (uint8 *)Buffer->EffectBuffer + PixelToSeek*Buffer->BytesPerPixel;
+            __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
+
+            // normalized values
+            __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  DestPixel,      FF)), Fraction255);
+            __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8),  FF)), Fraction255);
+            __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Fraction255);
+            __m256i A_Out = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF);
+
+            __m256 R_Blend = _mm256_setzero_ps();
+            __m256 G_Blend = _mm256_setzero_ps();
+            __m256 B_Blend = _mm256_setzero_ps();
+            switch (BlendMode)
+            {
+                case blend_normal: // plain replacement: the blended value is the effect color itself
+                {
+                    R_Blend  = R_Col;
+                    G_Blend  = G_Col;
+                    B_Blend  = B_Col;
+                } break;
+                case blend_multiply:
+                {
+                    R_Blend  = _mm256_mul_ps(R_Dest, R_Col);
+                    G_Blend  = _mm256_mul_ps(G_Dest, G_Col);
+                    B_Blend  = _mm256_mul_ps(B_Dest, B_Col);
+                } break;
+                case blend_colorburn:
+                {
+                    R_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), R_Col));
+                    G_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), G_Col));
+                    B_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), B_Col));
+                } break;
+                case blend_linearburn:
+                {
+                    R_Blend  = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
+                    G_Blend  = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
+                    B_Blend  = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
+                } break;
+                case blend_add:
+                {
+                    R_Blend  = _mm256_add_ps(R_Dest, R_Col);
+                    G_Blend  = _mm256_add_ps(G_Dest, G_Col);
+                    B_Blend  = _mm256_add_ps(B_Dest, B_Col);
+                } break;
+                case blend_screen:
+                {
+                    R_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
+                    G_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
+                    B_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
+                } break;
+                case blend_overlay:
+                {
+                    __m256 R_Mask  = _mm256_cmp_ps(R_Dest,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 G_Mask  = _mm256_cmp_ps(G_Dest,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 B_Mask  = _mm256_cmp_ps(B_Dest,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+                    __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+                    __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+                    __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
+                    __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
+                    __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
+                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+                } break;
+                case blend_softlight:
+                {
+                    // using Pegtop's equation
+                    R_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
+                    G_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
+                    B_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
+                } break;
+                case blend_hardlight:
+                {
+                    // Overlay with the layers swapped: the effect color, not the destination, picks the branch.
+                    __m256 R_Mask   = _mm256_cmp_ps(R_Col,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 G_Mask   = _mm256_cmp_ps(G_Col,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 B_Mask   = _mm256_cmp_ps(B_Col,  ZeroPointFive, _CMP_LT_OS);
+                    __m256 R_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+                    __m256 G_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+                    __m256 B_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+                    __m256 R_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
+                    __m256 G_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
+                    __m256 B_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
+                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+                } break;
+                case blend_subtract:
+                {
+                    R_Blend  = _mm256_sub_ps(R_Dest, R_Col);
+                    G_Blend  = _mm256_sub_ps(G_Dest, G_Col);
+                    B_Blend  = _mm256_sub_ps(B_Dest, B_Col);
+                } break;
+                case blend_divide:
+                {
+                    R_Blend  = _mm256_div_ps(R_Dest, R_Col);
+                    G_Blend  = _mm256_div_ps(G_Dest, G_Col);
+                    B_Blend  = _mm256_div_ps(B_Dest, B_Col);
+                } break;
+                case blend_difference:
+                {
+                    __m256 R_Lower  = _mm256_sub_ps(R_Col, R_Dest);
+                    __m256 G_Lower  = _mm256_sub_ps(G_Col, G_Dest);
+                    __m256 B_Lower  = _mm256_sub_ps(B_Col, B_Dest);
+                    __m256 R_Upper  = _mm256_sub_ps(R_Dest, R_Col);
+                    __m256 G_Upper  = _mm256_sub_ps(G_Dest, G_Col);
+                    __m256 B_Upper  = _mm256_sub_ps(B_Dest, B_Col);
+                    __m256 R_Mask  = _mm256_cmp_ps(R_Lower,  ZeroReal, _CMP_GT_OS);
+                    __m256 G_Mask  = _mm256_cmp_ps(G_Lower,  ZeroReal, _CMP_GT_OS);
+                    __m256 B_Mask  = _mm256_cmp_ps(B_Lower,  ZeroReal, _CMP_GT_OS);
+                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+                } break;
+            }
+
+            R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, AlphaInv), _mm256_mul_ps(R_Blend, Alpha));
+            G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, AlphaInv), _mm256_mul_ps(G_Blend, Alpha));
+            B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, AlphaInv), _mm256_mul_ps(B_Blend, Alpha));
+
+            R_Blend = _mm256_max_ps(_mm256_min_ps(One, R_Blend), ZeroReal);
+            G_Blend = _mm256_max_ps(_mm256_min_ps(One, G_Blend), ZeroReal);
+            B_Blend = _mm256_max_ps(_mm256_min_ps(One, B_Blend), ZeroReal);
+
+            // Round to nearest: truncating can turn an untouched channel (v/255*255) into v-1.
+            __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(R_Blend, Real255));
+            __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(G_Blend, Real255));
+            __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(B_Blend, Real255));
+
+            __m256i OutputPixel = _mm256_or_si256(
+                                  _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
+                                  _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
+            _mm256_storeu_si256((__m256i *)Pixel, OutputPixel);
+        }
+    }
+}
+// Placeholder for the "Linear Gradient" effect: reads the start/end colors and returns without drawing anything.
+internal void
+DrawGradient(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    v4 StartColor = Property[0].CurrentValue.col; // currently unused
+    v4 EndColor = Property[1].CurrentValue.col; // currently unused
+}
+// Effects available to AddEffect. Each entry: display name, draw function, what appears to be the property count, display type, then one {name, default value, type, min, max} per property.
+global_variable effect_header EffectList[] {
+    {
+        "Solid Color",
+        &DrawColor, 2, standard,  {
+            {"Color", {.col = V4(0.0f, 0.0f, 0.0f, 0.0f)}, type_color, NORMALIZED_COL_MIN, NORMALIZED_COL_MAX},
+            {"Blend mode", {.blendmode = blend_normal}, type_blendmode},
+        }
+    },
+    {
+        "Linear Gradient",
+        &DrawGradient, 3, standard, { // NOTE(review): three properties listed, but the active DrawGradient reads only the first two
+            {"Start Color", {.col = V4(0.0f, 1.0f, 0.0f, 0.0f)}, type_color, NORMALIZED_COL_MIN, NORMALIZED_COL_MAX},
+            {"End Color",   {.col = V4(1.0f, 0.0f, 0.0f, 1.0f)}, type_color, NORMALIZED_COL_MIN, NORMALIZED_COL_MAX},
+            {"Opacity", {1.0f}, type_real, NORMALIZED_REAL_MIN, NORMALIZED_REAL_MAX}
+        }
+    }
+};
+#if 0
+    {
+        "Solid Color",
+        &DrawColor, standard,  {
+            {"Color", {.col = V4(0.5f, 1.0f, 0.4f, 0.5f)}, color},
+        }
+    },
+    {
+        "Test Grid",
+        &DrawGrid, standard,  {
+            {"Color 1", {.col = V4(0.5f, 1.0f, 0.4f, 1.0f)}, color},
+            {"Color 2", {.col = V4(0.0f, 0.0f, 0.0f, 1.0f)}, color}
+        }
+    },
+    {
+        "Gaussian Blur",
+        &GaussianBlur, standard,  {
+            {"Radius", {2.0f}, real},
+        }
+    },
+    {
+        "Canny edges",
+        &Canny, standard,  {
+            {"Blur Radius", {1.0f}, real},
+            {"Threshold", {5.0f}, real},
+        }
+    },
+    {
+        "Levels",
+        &Levels, levels,  {
+            {"Start point", {0.0f}, real},
+            {"Mid point", {1.0f}, real},
+            {"End point", {1.0f}, real},
+            {"Start Col", {.col = V4(0.0f)}, color},
+            {"Mid Col", {.col = V4(1.0f)}, color},
+            {"End Col", {.col = V4(1.0f)}, color},
+        }
+    },
+    {
+        "Kernel",
+        &SpacialFilter, standard,  {
+            {"V1", {-1.0f}, real},
+            {"V2", {0.0f}, real},
+            {"V3", {1.0f}, real},
+            {"V4", {-2.0f}, real},
+            {"V5", {0.0f}, real},
+            {"V6", {2.0f}, real},
+            {"V7", {-1.0f}, real},
+            {"V8", {0.0f}, real},
+            {"V9", {1.0f}, real},
+        }
+    },
+    {
+        "Invert",
+        &Invert, 0, standard,  {
+        }
+    }
+#endif
+// Appends an instance of EffectList[EffectListIndex] to Layer, seeded with the table's default property values; an out-of-range index is ignored.
+internal void
+AddEffect(project_layer *Layer, memory *Memory, uint16 EffectListIndex)
+{
+    if (EffectListIndex >= sizeof(EffectList) / sizeof(EffectList[0])) { return; } // would read past the table
+    effect_header EffectHeader = EffectList[EffectListIndex];
+    effect *Effect = (effect *)AllocateMemory(Memory, sizeof(effect), F_Effects);
+    Layer->Effect[Layer->NumberOfEffects++] = Effect; // NOTE(review): capacity of Layer->Effect is not checked here
+    Effect->Name = EffectHeader.Name;
+    Effect->func = EffectHeader.func;
+    Effect->NumberOfProperties = EffectHeader.NumberOfProperties;
+    Effect->DisplayType = EffectHeader.DisplayType;
+    Effect->IsActive = true;
+    for (int16 i = 0; i < Effect->NumberOfProperties; i++) {
+        Effect->Property[i].Name = EffectHeader.PropertyHeader[i].Name;
+        Effect->Property[i].CurrentValue = EffectHeader.PropertyHeader[i].Value;
+        Effect->Property[i].MinVal = EffectHeader.PropertyHeader[i].MinVal;
+        Effect->Property[i].MaxVal = EffectHeader.PropertyHeader[i].MaxVal;
+        Effect->Property[i].VarType = EffectHeader.PropertyHeader[i].VarType;
+    }
+}
+// Forward declaration only (no definition in this part of the file). UpdateEffects calls it with just the buffer, so the second argument takes its default of 0.
+internal void
+SSE_CopyToBuffer(pixel_buffer *, uint16 asda = 0);
+
+// Makes sure the layer's raster has an EffectBuffer, calls SSE_CopyToBuffer on it, then runs each active effect in list order.
+internal void
+UpdateEffects(project_layer *Layer, memory *Memory)
+{
+    image_source *Image = (image_source *)Layer->RenderInfo;
+    if (!Image->Raster.EffectBuffer) { // first use: request Width*Height*BytesPerPixel of B_Scratch memory
+        Image->Raster.EffectBuffer = AllocateMemory(Memory, Image->Raster.Width * Image->Raster.Height * Image->Raster.BytesPerPixel, B_Scratch);
+    }
+    SSE_CopyToBuffer(&Image->Raster);
+    for (int EffectIndex = 0; EffectIndex < Layer->NumberOfEffects; EffectIndex++) {
+        effect *Effect = Layer->Effect[EffectIndex];
+        if (!Effect->IsActive) { continue; }
+        Effect->func(&Image->Raster, Memory, Effect->Property);
+    }
+}
+
+#if 0
+
+// Disabled scalar variant of DrawColor: packs the color in Property[0] with ColToUint32
+// and passes it to RenderAlpha for every pixel of Buffer->EffectBuffer, advancing one
+// row at a time by Buffer->Pitch. Memory is unused.
+internal void
+DrawColor(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    v4 FloatColor = Property[0].CurrentValue.col;
+    uint32 Color = ColToUint32(FloatColor);
+
+    uint8 *Row = ((uint8 *)Buffer->EffectBuffer);
+
+    for(int Y = 0;
+        Y < Buffer->Height;
+        ++Y)
+    {
+        uint32 *Pixel = (uint32 *)Row;
+        for(int X = 0;
+            X < Buffer->Width;
+            ++X)
+        {
+            RenderAlpha(Pixel, Color);
+            Pixel++;
+        }
+        Row += Buffer->Pitch;
+    }
+}
+
+// Disabled effect: replaces the r, g and b components of every pixel in
+// Buffer->EffectBuffer with 255 minus their value; the remaining component is
+// repacked as it was unpacked. Memory and Property are unused.
+internal void
+Invert(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    uint8 *RowStart = (uint8 *)Buffer->EffectBuffer;
+
+    for (int Y = 0; Y < Buffer->Height; ++Y)
+    {
+        uint32 *Pixels = (uint32 *)RowStart;
+        for (int X = 0; X < Buffer->Width; ++X)
+        {
+            // unpack, flip the three color components, repack in place
+            v4 Inverted = Uint32ToCol8(Pixels[X]);
+            Inverted.r = 255 - Inverted.r;
+            Inverted.g = 255 - Inverted.g;
+            Inverted.b = 255 - Inverted.b;
+            Pixels[X] = Col8ToUint32(Inverted);
+        }
+        RowStart += Buffer->Pitch;
+    }
+}
+
+// Disabled scalar gradient. For each pixel, Ramp = V4(V3(X / Width), 1.0f) and the output is
+// ClipV4(ClipV4(Start - Ramp) + ClipV4(End - (1 - Ramp))), packed with ColToUint32.
+// Property[2] is read into Alpha but never used. Memory is unused.
+internal void
+DrawGradient(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    v4 StartColor = Property[0].CurrentValue.col;
+    v4 EndColor = Property[1].CurrentValue.col;
+    real32 Alpha = Property[2].CurrentValue.f;
+
+    // Writing starts one pixel plus one row into EffectBuffer.
+    uint8 *RowStart = (uint8 *)Buffer->EffectBuffer + Buffer->BytesPerPixel + Buffer->Pitch;
+    for (int Y = 0; Y < Buffer->Height; ++Y)
+    {
+        uint32 *Pixels = (uint32 *)RowStart;
+        for (int X = 0; X < Buffer->Width; ++X)
+        {
+            real32 Fraction = ((real32)X / Buffer->Width);
+            v4 Ramp = V4(V3(Fraction), 1.0f);
+
+            v4 FromStart = ClipV4((StartColor - Ramp));
+            v4 FromEnd = ClipV4( (EndColor - (1 - Ramp) ) );
+            v4 Mixed = ClipV4( FromStart + FromEnd );
+
+            Pixels[X] = ColToUint32(Mixed);
+        }
+
+        RowStart += Buffer->Pitch;
+    }
+}
+
+// Disabled test pattern: fills Buffer->EffectBuffer with the color from Property[0]
+// wherever bit 2 of X or of Y is set, and with the color from Property[1] elsewhere.
+// Memory is unused.
+internal void
+DrawGrid(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    v4 StartColor = Property[0].CurrentValue.col;
+    v4 EndColor = Property[1].CurrentValue.col;
+    uint32 Color1 = ColToUint32(StartColor);
+    uint32 Color2 = ColToUint32(EndColor);
+
+    uint8 *RowStart = (uint8 *)Buffer->EffectBuffer;
+    for (int Y = 0; Y < Buffer->Height; ++Y)
+    {
+        uint32 *Pixels = (uint32 *)RowStart;
+        int RowBit = Y & 4;
+
+        for (int X = 0; X < Buffer->Width; ++X)
+        {
+            // either coordinate having bit 2 set selects the first color
+            uint32 Fill = (RowBit || (X & 4)) ? Color1 : Color2;
+            Pixels[X] = Fill;
+        }
+
+        RowStart += Buffer->Pitch;
+    }
+}
+
+// Disabled helper: returns the weighted sum of a 3x3 window of EffectBuffer, each pixel
+// converted with Uint32ToNormalizedBW and multiplied by the matching entry of Value.
+// The window covers rows Yp-1..Yp+1 and columns Xp..Xp+2 of the raw buffer.
+// NOTE(review): Value is declared [8] but entries 0..8 are read; callers pass 9 weights.
+internal real32
+KernLoop(pixel_buffer *Buffer, int16 Xp, int16 Yp, real32 Value[8])
+{
+    real32 Weighted[9];
+    // start on the row above Yp
+    uint8 *Row = (uint8 *)Buffer->EffectBuffer + (Buffer->Pitch*Yp) - Buffer->Pitch;
+    int16 Tap = 0;
+
+    for (int Y = 0; Y < 3; ++Y, Row += Buffer->Pitch)
+    {
+        uint32 *Pixels = (uint32 *)Row + Xp;
+        for (int X = 0; X < 3; ++X, ++Tap)
+        {
+            Weighted[Tap] = Uint32ToNormalizedBW(Pixels[X]) * Value[Tap];
+        }
+    }
+
+    // accumulate in index order, matching a left-to-right sum of all nine products
+    real32 Sum = Weighted[0];
+    for (int i = 1; i < 9; ++i)
+    {
+        Sum += Weighted[i];
+    }
+    return Sum;
+}
+// Disabled 3x3 kernel effect: Property[0..8] supply the weights; KernLoop sums are staged as real32 in Buffer->Scratch and then written back to EffectBuffer.
+internal void
+SpacialFilter(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    real32 P[9];
+    P[0] = Property[0].CurrentValue.f;
+    P[1] = Property[1].CurrentValue.f;
+    P[2] = Property[2].CurrentValue.f;
+    P[3] = Property[3].CurrentValue.f;
+    P[4] = Property[4].CurrentValue.f;
+    P[5] = Property[5].CurrentValue.f;
+    P[6] = Property[6].CurrentValue.f;
+    P[7] = Property[7].CurrentValue.f;
+    P[8] = Property[8].CurrentValue.f;
+    // Take scratch space from Memory on first use. NOTE(review): the uint64 * cast appears to scale CurrentPosition by 8 in the pointer add — confirm intended.
+    if (!Buffer->Scratch) {
+        Buffer->Scratch = (uint64 *)Memory->Address + Memory->CurrentPosition;
+        Memory->CurrentPosition += Buffer->Width * Buffer->Height * Buffer->BytesPerPixel;
+    }
+    // Pass 1: for each interior pixel, store the KernLoop sum into Scratch.
+    for(int Y = 1;
+        Y < Buffer->Height - 1;
+        ++Y)
+    {
+        for(int X = 1;
+            X < Buffer->Width - 1;
+            ++X)
+            {
+                real32 Sum = KernLoop(Buffer, X, Y, P);
+                uint8 *FloatRow = ((uint8 *)Buffer->Scratch +
+                                 Buffer->BytesPerPixel +
+                                 Buffer->Pitch + (Buffer->Pitch*Y));
+                real32 *FloatValue = (real32 *)FloatRow + X;
+                *(real32 *)FloatValue++ = Sum; // the ++ has no lasting effect: FloatValue is recomputed each iteration
+            }
+    }
+    for(int Y = 1; // Pass 2: write abs(sum / 4.0f), packed by ColToUint32, back to EffectBuffer
+        Y < Buffer->Height - 1;
+        ++Y)
+    {
+        for(int X = 1;
+            X < Buffer->Width - 1;
+            ++X)
+            {
+                uint8 *Row = ((uint8 *)Buffer->EffectBuffer +
+                          Buffer->BytesPerPixel +
+                          Buffer->Pitch + (Buffer->Pitch*Y));
+                uint32 *Pixel = (uint32 *)Row + X;
+                uint8 *RowR = ((uint8 *)Buffer->Scratch +
+                          Buffer->BytesPerPixel +
+                          Buffer->Pitch + (Buffer->Pitch*Y));
+                real32 *PixelR = (real32 *)RowR + X;
+                *(uint32 *)Pixel= ColToUint32(abs(*PixelR / 4.0f));
+                PixelR++; // no effect: PixelR is recomputed on the next iteration
+            }
+    }
+}
+
+// Disabled blur core: each source pixel adds its color, weighted by exp(-(dx*dx + dy*dy) / (2*Omega*Omega)) / Total, to the v4 accumulators around it in FloatStorage; a second pass clamps them back into EffectBuffer.
+internal void
+Gaussian(pixel_buffer *Buffer, void *FloatStorage, real32 Radius)
+{
+    if (Radius < 1.0f)
+        Radius = 1.0f;
+    real32 Omega = Radius / 2;
+    real32 Total = pow((Radius + Radius + 1), 2) / 2;
+    int32 ColorPitch = Buffer->Pitch * 2; // applied to uint16 * pointers below, so one row step is presumably Pitch*4 bytes
+    // P2 is the denominator of the exponent used for every weight.
+    real32 P2 = 2*(Omega*Omega);
+    for(int16 Y = Radius;
+        Y < Buffer->Height - Radius;
+        ++Y)
+    {
+        uint8 *Row = ((uint8 *)Buffer->EffectBuffer +
+                               Buffer->BytesPerPixel +
+                               Buffer->Pitch + Buffer->Pitch*(Y));
+        for(int16 X = Radius;
+            X < Buffer->Width - Radius;
+            ++X)
+            {
+                uint32 *Pixel = (uint32 *)Row + X;
+                v4 FloatCol = Uint32ToNormalizedCol(*Pixel);
+                for(int16 Y2 = -Radius; // scatter this pixel into every accumulator within Radius
+                    Y2 <= Radius;
+                    ++Y2)
+                {
+                    uint16 *TempRow = ((uint16 *)FloatStorage +
+                                     Buffer->BytesPerPixel +
+                                     ColorPitch + (ColorPitch*(Y + Y2)));
+                    for(int16 X2 = -Radius;
+                        X2 <= Radius;
+                        ++X2)
+                        {
+                            v4 *TempValue = (v4 *)TempRow + (X + X2);
+                            real32 P1 = ((X2 * X2) + (Y2 * Y2));
+                            real32 G = exp(-(P1/P2));
+                            *TempValue = *TempValue + (FloatCol*V4(G) / V4(Total));
+                        }
+                }
+            }
+    }
+    for(int Y = Radius; // second pass: write the accumulated colors back and clear the accumulators
+        Y < Buffer->Height - Radius;
+        ++Y)
+    {
+        for(int X = Radius;
+            X < Buffer->Width - Radius;
+            ++X)
+            {
+                uint8 *Row = ((uint8 *)Buffer->EffectBuffer +
+                          Buffer->BytesPerPixel +
+                          Buffer->Pitch + (Buffer->Pitch*Y));
+                uint32 *Pixel = (uint32 *)Row + X;
+                uint16 *TempRow = ((uint16 *)FloatStorage +
+                                 Buffer->BytesPerPixel +
+                                 ColorPitch + (ColorPitch*Y));
+                v4 *TempValue = (v4 *)TempRow + X;
+                TempValue->a = 1.0f; // alpha is forced to 1 before packing
+                uint32 Color = ColToUint32(Clamp(0.0, *TempValue, 1.0));
+                *Pixel = Color;
+                *TempValue = {0}; // clear the accumulator
+            }
+    }
+}
+// Disabled edge detector: blurs via Gaussian(), stores per-pixel gradient magnitude (r) and angle in degrees (g) in Scratch, flags local maxima in b, then colors pixels by threshold.
+internal void
+Canny(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    real32 SH[9] = { -1, 0, 1,
+                     -2, 0, 2,
+                     -1, 0, 1 };
+    real32 SV[9] = { -1, -2, -1,
+                     0, 0, 0,
+                     1, 2, 1 };
+
+    real32 Radius = Property[0].CurrentValue.f;
+    real32 Threshold = Property[1].CurrentValue.f / 100;
+    real32 UpperThreshold = Threshold * 1.5;
+    int32 ColorPitch = Buffer->Pitch * 2;
+
+    if (!Buffer->Scratch) {
+        Buffer->Scratch = (uint64 *)Memory->Address + Memory->CurrentPosition;
+        // NOTE(fox): this buffer is four times as large to store four real32s
+        Memory->CurrentPosition += Buffer->Width * Buffer->Height * Buffer->BytesPerPixel * 4;
+    }
+
+    Gaussian(Buffer, Buffer->Scratch, Radius);
+    // Pass 1: horizontal/vertical kernel sums per interior pixel, combined into magnitude and angle.
+    for(int Y = 1;
+        Y < Buffer->Height - 1;
+        ++Y)
+    {
+        uint16 *TempRow = ((uint16 *)Buffer->Scratch +
+                         Buffer->BytesPerPixel +
+                         ColorPitch + (ColorPitch*(Y)));
+        for(int X = 1;
+            X < Buffer->Width - 1;
+            ++X)
+            {
+                real32 HSum = KernLoop(Buffer, X, Y, SH);
+                real32 VSum = KernLoop(Buffer, X, Y, SV);
+                real32 Mag = sqrt((HSum*HSum) + (VSum*VSum));
+                real32 Angle = atan(VSum/HSum) * (180 / PI); // NOTE(review): HSum == 0 is not special-cased
+                v4 *TempValue = (v4 *)TempRow + (X);
+                TempValue->r = Mag;
+                TempValue->g = Angle;
+            }
+    }
+    for(int Y = 1; // Pass 2: set b = 1 where the magnitude exceeds both neighbours picked by the angle range
+        Y < Buffer->Height - 1;
+        ++Y)
+    {
+        uint16 *TempRow = ((uint16 *)Buffer->Scratch +
+                         Buffer->BytesPerPixel +
+                         ColorPitch + (ColorPitch*(Y)));
+        uint16 *Row = ((uint16 *)Buffer->EffectBuffer + // NOTE(review): uint16 * here, uint8 * in the final pass — the offsets differ
+                  Buffer->BytesPerPixel +
+                  Buffer->Pitch + (Buffer->Pitch*Y));
+        for(int X = 1;
+            X < Buffer->Width - 1;
+            ++X)
+            {
+                uint32 *Pixel = (uint32 *)Row + X;
+                v4 *TempValue = (v4 *)TempRow + X;
+                if (TempValue->g < 45 && TempValue->g > -45) {
+                    v4 *Mag1 = (v4 *)TempRow + X + 1;
+                    v4 *Mag2 = (v4 *)TempRow + X - 1;
+                    if (TempValue->r > Mag1->r && TempValue->r > Mag2->r)
+                        TempValue->b = 1;
+                }
+                if (TempValue->g < 90 && TempValue->g > 45) {
+                    v4 *Mag1 = (v4 *)(TempRow + ColorPitch) + X + 1;
+                    v4 *Mag2 = (v4 *)(TempRow - ColorPitch) + X - 1;
+                    if (TempValue->r > Mag1->r && TempValue->r > Mag2->r)
+                        TempValue->b = 1;
+                }
+                if (TempValue->g < -45 && TempValue->g > -90) {
+                    v4 *Mag1 = (v4 *)(TempRow - ColorPitch) + X + 1;
+                    v4 *Mag2 = (v4 *)(TempRow + ColorPitch) + X - 1;
+                    if (TempValue->r > Mag1->r && TempValue->r > Mag2->r)
+                        TempValue->b = 1;
+                } else { // NOTE(review): this else pairs only with the -45..-90 test, so it also runs for the two ranges handled above
+                    v4 *Mag1 = (v4 *)(TempRow + ColorPitch) + X;
+                    v4 *Mag2 = (v4 *)(TempRow - ColorPitch) + X;
+                    if (TempValue->r > Mag1->r && TempValue->r > Mag2->r)
+                        TempValue->b = 1;
+                }
+    } // NOTE(review): by brace count this closes the X loop; the Y loop above stays open until near the end of the function
+    for(int Y = 1;
+        Y < Buffer->Height - 1;
+        ++Y)
+    {
+        uint16 *TempRow = ((uint16 *)Buffer->Scratch +
+                         Buffer->BytesPerPixel +
+                         ColorPitch + (ColorPitch*(Y)));
+        uint8 *Row = ((uint8 *)Buffer->EffectBuffer +
+                  Buffer->BytesPerPixel +
+                  Buffer->Pitch + (Buffer->Pitch*Y));
+        for(int X = 1;
+            X < Buffer->Width - 1;
+            ++X)
+            {
+                uint32 *Pixel = (uint32 *)Row + X;
+                v4 *TempValue = (v4 *)TempRow + (X);
+                if (TempValue->b == 1) {
+                    if (TempValue->r > UpperThreshold)
+                        *Pixel = 0xFF0000FF;
+                    }
+                    else if (TempValue->r > Threshold) // NOTE(review): by brace count this else pairs with the b == 1 test, not the UpperThreshold test
+                    {
+                        bool32 pp = false;
+                        uint16 *TempRow2 = TempRow - ColorPitch;
+                        for(int Y2 = 0;
+                            Y2 < 3;
+                            ++Y2)
+                        {
+                            v4 *TempValue2 = (v4 *)TempRow + (X - 1); // NOTE(review): reads TempRow; TempRow2 is advanced below but never read
+                            for(int X2 = 0;
+                                X2 < 3;
+                                ++X2)
+                                {
+                                    if (TempValue2->r > UpperThreshold)
+                                        pp = true;
+                                    TempValue2++;
+                                }
+                            TempRow2 += ColorPitch;
+                        }
+                        if (pp)
+                            *Pixel = 0xFFFFFF00;
+                    }
+                }
+            }
+    }
+}
+// Disabled levels effect: remaps each pixel of EffectBuffer per channel (Property[3..5]) and then globally (Property[0..2]); on first use it also counts channel values from OriginalBuffer.
+internal void
+Levels(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    real32 Min = Property[0].CurrentValue.f;
+    real32 Mid = Property[1].CurrentValue.f;
+    real32 Max = Property[2].CurrentValue.f;
+
+    v4 ColMin = Property[3].CurrentValue.col;
+    v4 ColMid = Property[4].CurrentValue.col;
+    v4 ColMax = Property[5].CurrentValue.col;
+    // First call only: take scratch space and count values into five 256-entry uint16 tables (rgb average, r, g, b, a).
+    if (!Property[0].Scratch) {
+        Property[0].Scratch = (uint64 *)Memory->Address + Memory->CurrentPosition;
+        Memory->CurrentPosition += Buffer->Width * Buffer->Height * Buffer->BytesPerPixel;
+
+        uint16 *Levels = (uint16 *)Property[0].Scratch; // NOTE(review): counts are only incremented; nothing in this function zeroes or reads them
+        uint8 *Row = ((uint8 *)Buffer->OriginalBuffer);
+
+        for(int Y = 0;
+            Y < Buffer->Height;
+            ++Y)
+        {
+            uint32 *Pixel = (uint32 *)Row;
+            for(int X = 0;
+                X < Buffer->Width;
+                ++X)
+            {
+                v4 Col = Uint32ToCol8(*Pixel);
+                uint16 Global = (uint16)(RoundReal32ToUint32((Col.r + Col.g + Col.b)/3));
+                *(Levels + Global) += 1;
+                *(Levels + 256   + (uint16)Col.r) += 1;
+                *(Levels + 256*2 + (uint16)Col.g) += 1;
+                *(Levels + 256*3 + (uint16)Col.b) += 1;
+                *(Levels + 256*4 + (uint16)Col.a) += 1;
+                Pixel++;
+            }
+            Row += Buffer->Pitch;
+        }
+    }
+
+    // Remap every pixel: per-channel powv4 and stretch between ColMin/ColMax, then the same with Mid and Min/Max.
+    uint8 *Row = ((uint8 *)Buffer->EffectBuffer);
+    for(int Y = 0;
+        Y < Buffer->Height;
+        ++Y)
+    {
+        uint32 *Pixel = (uint32 *)Row;
+        for(int X = 0;
+            X < Buffer->Width;
+            ++X)
+        {
+            // individual channels
+            v4 ColorI = powv4(Uint32ToNormalizedCol(*Pixel), ColMid);
+            v4 ValI = 1.0f/(ColMax-ColMin) * (ColorI - ColMin);
+
+            // global channel
+            v4 ColorG = powv4(ValI, Mid);
+            v4 ValG = 1.0f/(Max-Min) * (ColorG - Min);
+
+            *Pixel++ = ColToUint32(Clamp(0.0f, ValG, 1.0f));
+        }
+        Row += Buffer->Pitch;
+    }
+
+}
+
+// Disabled effect wrapper: makes sure Buffer->Scratch is set, then runs Gaussian() with the radius from Property[0].
+internal void
+GaussianBlur(pixel_buffer *Buffer, memory *Memory, property_channel Property[])
+{
+    if (!Buffer->Scratch) {
+        // point Scratch at Memory's current position and advance that position
+        Buffer->Scratch = (uint64 *)Memory->Address + Memory->CurrentPosition;
+        Memory->CurrentPosition += Buffer->Width * Buffer->Height * Buffer->BytesPerPixel;
+    }
+
+    Gaussian(Buffer, Buffer->Scratch, Property[0].CurrentValue.f);
+}
+#endif
-- 
cgit v1.2.3