// Forward declarations for routines that are used before (or outside of) their
// definitions in this file.

// Called once per tile by QueueCurrentFrame when THREADED is set; defined
// outside the visible part of this file.
static void
PushRect(rectangle RenderRegion);

// Per-instruction-set layer renderers. RenderLayers picks one of the x86 /
// fallback variants at run time from InstructionMode.
// NOTE(review): the RenderLayerNeon definition under #if ARM takes a
// pixel_buffer * as its second parameter, not the comp_buffer * declared here
// -- confirm which one is intended.
static void
RenderLayerNeon(project_layer *Layer, comp_buffer *Buffer, rectangle RenderRegion);
static void
AVX2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);
static void
SSE2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);
static void
Fallback_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);

// Not called anywhere in the visible part of this file.
static bool32
CheckQueue(render_queue RenderInfo, uint16 Index);

// Used by the anchor-point moving UI.
//
// Moves the layer position by Value along one screen axis and adjusts the
// normalized anchor (ax, ay) by the same step after it has been rotated by the
// layer's rotation, divided by the layer's scale, and divided by the source
// dimensions.
//
//   Layer - layer whose x/y and ax/ay current values are modified in place.
//   Value - distance to move the position by.
//   Dir   - 0 moves along X; any other value moves along Y.
static void
CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir)
{
    source *LayerSource = Layer->Source;

    real32 Angle = (Layer->rotation.CurrentValue.f * (PI / 180));
    real32 LayerScale = Layer->scale.CurrentValue.f;

    if (Dir != 0) {
        // Vertical move: step along the layer's (flipped) Y axis.
        v2 Step = V2(sin(Angle), -cos(Angle)) * (Value / -LayerScale);
        Layer->y.CurrentValue.f += Value;
        Layer->ax.CurrentValue.f -= Step.x/LayerSource->Info.Width;
        Layer->ay.CurrentValue.f += Step.y/LayerSource->Info.Height;
        return;
    }

    // Horizontal move: step along the layer's X axis.
    v2 Step = V2(cos(Angle), sin(Angle)) * (Value / LayerScale);
    Layer->x.CurrentValue.f += Value;
    Layer->ax.CurrentValue.f += Step.x/LayerSource->Info.Width;
    Layer->ay.CurrentValue.f -= Step.y/LayerSource->Info.Height;
}

// Builds the per-layer data the renderers consume for the current frame.
//
// From the layer's position, anchor, rotation and scale this derives:
//   - the layer's origin and its two edge vectors in comp space,
//   - each edge vector divided by its own squared length (XAxisP*/YAxisP*),
//     which the renderers combine with (pixel - origin) to get the U/V lookup,
//   - a clip rectangle around the four transformed corners, with the left edge
//     rounded down to a multiple of 4,
//   - source dimensions (plain and packed), pitches, opacity, blend mode and
//     the source bitmap pointer.
//
// Returns the filled transform_info by value.
static transform_info
CalculateTransforms(project_layer *Layer, comp_buffer *CompBuffer)
{
    transform_info Result;
    source *LayerSource = Layer->Source;

    real32 Angle = (Layer->rotation.CurrentValue.f * (PI / 180));
    real32 LayerScale = Layer->scale.CurrentValue.f;

    // Edge vectors of the layer quad in comp space.
    v2 AxisU = (LayerSource->Info.Width * LayerScale)*V2(cos(Angle), sin(Angle));
    v2 AxisV = (LayerSource->Info.Height * -LayerScale)*V2(sin(Angle), -cos(Angle));

    real32 AnchorU = Layer->ax.CurrentValue.f;
    real32 AnchorV = Layer->ay.CurrentValue.f;

    // The anchor is given as a fraction of each edge, so step back from the
    // layer position by that fraction to find the quad's origin corner.
    v2 Position = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f};
    v2 Origin = Position - (AxisU * AnchorU) - (AxisV * AnchorV);

    real32 InvLenSqU = 1.0f / LengthSq(AxisU);
    real32 InvLenSqV = 1.0f / LengthSq(AxisV);

    // Bounding box of the four corners. The lower bounds start at the comp
    // size and the upper bounds at zero, so each bound only moves outward
    // from those starting values.
    int32 LoX = CompBuffer->Width;
    int32 LoY = CompBuffer->Height;
    int32 HiX = 0;
    int32 HiY = 0;

    v2 Corners[4] = {Origin, Origin + AxisU, Origin + AxisV, Origin + AxisU + AxisV};
    for (int Corner = 0; Corner < 4; Corner++) {
        v2 P = Corners[Corner];
        if (P.x < LoX) { LoX = (int32)P.x; }
        if (P.x > HiX) { HiX = (int32)P.x; }
        if (P.y < LoY) { LoY = (int32)P.y; }
        if (P.y > HiY) { HiY = (int32)P.y; }
    }

    uint16 Width = LayerSource->Info.Width;
    uint16 Height = LayerSource->Info.Height;
    uint16 PackedWidth, PackedHeight;
    Bitmap_CalcPackedDimensions(Width, Height, &PackedWidth, &PackedHeight);

    // Inverse mapping terms.
    Result.XAxisPX = InvLenSqU*AxisU.x;
    Result.XAxisPY = InvLenSqU*AxisU.y;
    Result.YAxisPX = InvLenSqV*AxisV.x;
    Result.YAxisPY = InvLenSqV*AxisV.y;
    Result.OriginX = Origin.x;
    Result.OriginY = Origin.y;

    // Source description.
    Result.LayerWidth = Width;
    Result.LayerHeight = Height;
    Result.FullLayerWidth = PackedWidth;
    Result.FullLayerHeight = PackedHeight;
    Result.LayerPitch = LayerSource->Info.Width*LayerSource->Info.BytesPerPixel;
    Result.SourceBuffer = Layer->BitmapInfo.BitmapBuffer;

    // Compositing parameters.
    Result.LayerOpacity = Layer->opacity.CurrentValue.f;
    Result.BlendMode = Layer->BlendMode;
    Result.BufferPitch = CompBuffer->Width*CompBuffer->BytesPerPixel;

    // Left edge is snapped down to a multiple of 4; the upper bounds are
    // made exclusive by adding one.
    Result.ClipRect = {LoX - (LoX & 3), LoY, HiX + 1, HiY + 1};

    return Result;
}

// Resets the per-frame render bookkeeping: clears the rendering flag, zeroes
// the used entries of State->LayersToRender, resets the layer count and, when
// THREADED is set, resets the three atomic queue counters to 0.
static void
EndRenderState(project_state *State)
{
    IsRendering = false;

    // Only the slots filled for this frame need to be cleared.
    int16 Slot = 0;
    while (Slot < State->NumberOfLayersToRender) {
        State->LayersToRender[Slot] = 0;
        Slot++;
    }
    State->NumberOfLayersToRender = 0;

#if THREADED
    SDL_AtomicSet(&CurrentEntry, 0);
    SDL_AtomicSet(&QueuedEntries, 0);
    SDL_AtomicSet(&CompletedEntries, 0);
#endif
}

// Renders every queued layer, in queue order, into RenderInfo->CompBuffer,
// restricted to RenderRegion. The renderer variant is chosen from
// InstructionMode for each layer: AVX2, then SSE2, otherwise the fallback.
static void
RenderLayers(render_queue *RenderInfo, rectangle RenderRegion)
{
    for (int16 Slot = 0; Slot < RenderInfo->State->NumberOfLayersToRender; Slot++) {
        int16 LayerIndex = RenderInfo->State->LayersToRender[Slot];

        void (*Render)(transform_info, comp_buffer *, rectangle) = Fallback_RenderLayer;
        if (InstructionMode == instruction_mode_avx) {
            Render = AVX2_RenderLayer;
        } else if (InstructionMode == instruction_mode_sse) {
            Render = SSE2_RenderLayer;
        }

        Render(RenderInfo->File->Layer[LayerIndex]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
    }
}

// Prepares and kicks off rendering of the current frame.
//
// Every layer whose [StartFrame, EndFrame] range contains File->CurrentFrame
// gets its TransformInfo recalculated and its index appended to
// State->LayersToRender. Then:
//   - with THREADED set, the comp is split into a 4x4 grid of tiles and each
//     tile is handed to PushRect;
//   - otherwise the whole comp is rendered immediately via RenderLayers.
//
// Sets the global IsRendering flag; it is not cleared here.
//
// NOTE(review): nothing here checks NumberOfLayersToRender against the
// capacity of State->LayersToRender -- confirm the array can hold
// File->NumberOfLayers entries.
static void
QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State)
{
    IsRendering = true;

    for (int16 i = 0; i < File->NumberOfLayers; i++)
    {
        if (File->Layer[i]->StartFrame <= File->CurrentFrame &&
            File->Layer[i]->EndFrame >= File->CurrentFrame)
        {
            File->Layer[i]->TransformInfo = CalculateTransforms(File->Layer[i], CompBuffer);
            State->LayersToRender[State->NumberOfLayersToRender] = i;
            State->NumberOfLayersToRender++;
        }
    }

#if THREADED

    uint16 TileWidth = CompBuffer->Width / 4;
    uint16 TileHeight = CompBuffer->Height / 4;

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y};
            // The render regions always have to be aligned to the top left of
            // a 4x4 chunk (at least for AVX2) and cannot exceed the bounds of
            // the comp, so every edge is rounded down to a multiple of 4.
            // NOTE(review): because the tile size is a truncated quarter of
            // the comp and the edges are rounded down, the last row/column of
            // tiles can stop short of the comp edge when the comp dimensions
            // are not multiples of 16, and the two clamps below can never
            // trigger -- confirm whether the remaining pixels are meant to be
            // covered.
            RenderRegion.Min.x -= RenderRegion.Min.x % 4;
            RenderRegion.Min.y -= RenderRegion.Min.y % 4;
            RenderRegion.Max.x -= RenderRegion.Max.x % 4;
            RenderRegion.Max.y -= RenderRegion.Max.y % 4;
            if (RenderRegion.Max.x > CompBuffer->Width)
                RenderRegion.Max.x = CompBuffer->Width;
            if (RenderRegion.Max.y > CompBuffer->Height)
                RenderRegion.Max.y = CompBuffer->Height;
            PushRect(RenderRegion);
        }
    }

#else

    // The queue description is only needed on the single-threaded path.
    // Fields are assigned by name so this does not depend on the member order
    // of render_queue.
    render_queue RenderInfo = {};
    RenderInfo.File = File;
    RenderInfo.State = State;
    RenderInfo.CompBuffer = CompBuffer;

    rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
    RenderLayers(&RenderInfo, RenderRegion);

#endif
}


#if ARM
// NEON layer renderer, compiled only when ARM is set.
//
// NOTE(review): this looks like unfinished work and is unlikely to compile as
// written:
//   - XAxisP, YAxisP, Origin, LayerBounds, Row, Source, LayerPitch and
//     BufferPitch are read below but are not parameters or locals of this
//     function (unless they are globals/macros declared elsewhere);
//   - three vdupq_n_f32() calls (LayerWidth, LayerHeight, LayerOpacity) are
//     missing their argument;
//   - the forward declaration near the top of the file takes a comp_buffer *
//     as the second parameter, this definition takes a pixel_buffer *;
//   - the Layer, Buffer and RenderRegion parameters are never referenced.
// The bilinear blend is disabled (#if 0) and only lane 0 of the result is kept.
static void
RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion)
{
    // Broadcast the per-layer constants to all four lanes.
    float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x);
    float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y);
    float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x);
    float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y);
    // NOTE(review): the next three calls have no argument.
    float32x4_t LayerWidth = vdupq_n_f32();
    float32x4_t LayerHeight = vdupq_n_f32();
    float32x4_t LayerOpacity = vdupq_n_f32();
    float32x4_t OriginX = vdupq_n_f32(Origin.x);
    float32x4_t OriginY = vdupq_n_f32(Origin.y);


    // Constants. Several of these (Four, FourInt, EightInt, SixteenInt,
    // TwentyFourInt, Float255, Int255, Norm255) are not used below.
    float32x4_t One = vdupq_n_f32(1);
    float32x4_t Zero = vdupq_n_f32(0);
    float32x4_t Four = vdupq_n_f32(4);
    int32x4_t FourInt = vdupq_n_s32(4);
    int32x4_t EightInt = vdupq_n_s32(8);
    int32x4_t SixteenInt = vdupq_n_s32(16);
    int32x4_t TwentyFourInt = vdupq_n_s32(24);
    float32x4_t Float255 = vdupq_n_f32(255.0f);
    int32x4_t Int255 = vdupq_n_s32(255);
    float32x4_t Norm255 = vdupq_n_f32(1/255.0f);

    for(int16 Y = LayerBounds.Min.y;
        Y < LayerBounds.Max.y;
        Y++)
    {
        uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x;

        // Lanes hold four consecutive X coordinates starting at Min.x.
        real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x,
                                  (real32)LayerBounds.Min.x+1,
                                  (real32)LayerBounds.Min.x+2,
                                  (real32)LayerBounds.Min.x+3};
        float32x4_t PixelX = vld1q_f32(ScalarPixelX);
        float32x4_t PixelY = vdupq_n_f32((real32)Y);
        float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);

        // NOTE(review): the loop advances one pixel per iteration (XI += 1,
        // Pixel++, PixelX += 1) even though each vector covers four pixels.
        for(int16 XI = LayerBounds.Min.x;
            XI < LayerBounds.Max.x;
            XI += 1)
        {
            // U/V: position of each pixel relative to the layer origin,
            // combined with the two axis terms.
            float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);
            float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
            float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));

            // Per-lane mask: set where 0 <= U <= 1 and 0 <= V <= 1.
            uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)),
                                     vandq_u32(vcleq_f32(V, One), vcgezq_f32(V)));

            // TODO(fox): Make more efficient with some sort of truncation
            uint32 comp[4];
            vst1q_u32(comp, R);
            // Skip the texture work when no lane falls inside the layer.
            if (comp[0] || comp[1] || comp[2] || comp[3]) {
                // Clamp U/V to [0, 1] before scaling to texel coordinates.
                U = vmaxq_f32(vminq_f32(One, U), Zero);
                V = vmaxq_f32(vminq_f32(One, V), Zero);

                float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
                float32x4_t TexYFull = vmulq_f32(V, LayerHeight);

                int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
                int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);

                // fractions
                // TexX/TexY are the fractional parts; the four products below
                // are the bilinear weights built from them.
                float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt));
                float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt));
                float32x4_t TexXInv     = vsubq_f32(One, TexX);
                float32x4_t TexYInv     = vsubq_f32(One, TexY);
                float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
                float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv);
                float32x4_t TexBoth     = vmulq_f32(TexY, TexX);
                float32x4_t TexBothInv  = vmulq_f32(TexXInv, TexYInv);

                // Spill the integer texel coordinates to compute one source
                // address per lane.
                int32 TexXP[4];
                vst1q_s32(TexXP, TexXInt);
                int32 TexYP[4];
                vst1q_s32(TexYP, TexYInt);

                uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32));
                uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32));
                uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32));
                uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32));

                // TexRGBA = vld4_u8(TexPTR0);
                // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0);
                // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4);
                // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8);
                // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15);

                // One register set per output pixel (A..D = lanes 0..3). For
                // each, the texel at the pointer, the one to its right, the
                // one a row below and the one diagonally below-right are
                // loaded into byte lanes 0, 4, 8 and 12, so that every 32-bit
                // lane of val[c] carries a single channel byte of one texel.
                uint8x16x4_t TexRGBA_A = {};
                uint8x16x4_t TexRGBA_B = {};
                uint8x16x4_t TexRGBA_C = {};
                uint8x16x4_t TexRGBA_D = {};
                TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0);
                TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0);
                TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0);
                TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0);
                TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4);
                TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8);
                TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12);

                // test, asd, pp and test2 are not used afterwards; they look
                // like debugging leftovers.
                uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0];

                float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
                float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv,  vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
                                           vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0])));

                uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0];

                // Disabled bilinear blend of the four register sets.
#if 0
                float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv,  vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
                                                    vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))),
                                          vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])),
                                                    vmulq_f32(TexBoth,     vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0]))));

                float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv,  vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])),
                                                    vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))),
                                          vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])),
                                                    vmulq_f32(TexBoth,     vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1]))));

                float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv,  vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])),
                                                    vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))),
                                          vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])),
                                                    vmulq_f32(TexBoth,     vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2]))));

                float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv,  vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])),
                                                    vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))),
                                          vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])),
                                                    vmulq_f32(TexBoth,     vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3]))));
#endif
                // Active path: no blending, the channels come from the first
                // register set only.
                float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
                float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]);
                float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]);
                float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]);

                // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4),
                //                                            _mm_mul_ps(TexBothYInv, TexBRx4)),
                //                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4),
                //                                            _mm_mul_ps(TexBoth, TexDRx4)));

                // Alpha becomes A - A*LayerOpacity.
                PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity));
                // Repack as (A << 24) | (R << 16) | (G << 8) | B.
                uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16),
                                                       vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)),
                                            (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8),
                                                                 vcvtq_u32_f32(PixelBlendB))));

                // Keep lane 0 only. NOTE(review): the store below still writes
                // all four lanes, so the three pixels after Pixel receive
                // zeros -- confirm this is intended.
                uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0};
                uint32x4_t mask = vld1q_u32(ma);
                Output = vandq_u32(Output, mask);
                vst1q_u32(Pixel, Output);

            }
            Pixel++;
            PixelX = vaddq_f32(PixelX, One);
        }
        Row += BufferPitch;
    }

}
#else

static void
AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect,
                                           RenderRegion );
    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
    LayerBounds.Min.y -= LayerBounds.Min.y % 4;

    uint16 WidthP, HeightP;
    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);

    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX);
    __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY);
    __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX);
    __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);

    __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
    __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
    __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
    __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
    __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
    __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
    __m256 OriginX = _mm256_set1_ps(T.OriginX);
    __m256 OriginY = _mm256_set1_ps(T.OriginY);

    __m256 ClipPrevent = _mm256_set1_ps(0.001f);
    __m256 One = _mm256_set1_ps(1);
    __m256 Two = _mm256_set1_ps(2);
    __m256 Zero = _mm256_set1_ps(0);

    __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
    __m256 ZeroPointFive = _mm256_set1_ps(0.5);
    __m256i Onei = _mm256_set1_epi32(1);
    __m256 Four = _mm256_set1_ps(4);
    __m256i FF = _mm256_set1_epi32(0xFF);
    __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
    __m256i Fouri = _mm256_set1_epi32(4);
    __m256i Sixteeni = _mm256_set1_epi32(16);
    __m256 Real255 = _mm256_set1_ps(255.0f);
    __m256 Norm255 = _mm256_set1_ps(1/255.0f);
    // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
    // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);

    // NOTE(fox):  Each loop operates on 8 pixels, 4 horizontal by 2 vertical,
    // as per the bitmap packing scheme in memory.

    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
    {
        __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
                                       (real32)LayerBounds.Min.x+1,
                                       (real32)LayerBounds.Min.x+2,
                                       (real32)LayerBounds.Min.x+3,
                                       (real32)LayerBounds.Min.x,
                                       (real32)LayerBounds.Min.x+1,
                                       (real32)LayerBounds.Min.x+2,
                                       (real32)LayerBounds.Min.x+3);

        __m256 PixelY = _mm256_setr_ps((real32)Y,
                                       (real32)Y,
                                       (real32)Y,
                                       (real32)Y,
                                       (real32)Y+1,
                                       (real32)Y+1,
                                       (real32)Y+1,
                                       (real32)Y+1);

        __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);

        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
        {
            IACA_START;

            // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
            __m256 X0 = _mm256_set1_ps(0.30);
            __m256 Y0 = _mm256_set1_ps(0.10);
            __m256 X1 = _mm256_set1_ps(0.80);
            __m256 Y1 = _mm256_set1_ps(0.35);
            __m256 X2 = _mm256_set1_ps(0.05);
            __m256 Y2 = _mm256_set1_ps(0.60);
            __m256 X3 = _mm256_set1_ps(0.55);
            __m256 Y3 = _mm256_set1_ps(0.85);

            __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
            __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
            __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
            __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
            __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
            __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
            __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
            __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
            __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);

            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;

            __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
            __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));

            __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
            __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
            __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
            __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
            __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
            __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
            __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
            __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));

            __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
                                              _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
            __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
                                              _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
            __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
                                              _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
            __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
                                              _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));

            // Each point that passes adds .25
            __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
                                       _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));

            // Zero - no points pass
            // One - all points pass; not an edge
            __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
            __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);

            // If all of the pixels are zeroed in the mask (aka fall outside
            // the UV lookup), we can skip the iteration.
            if (_mm256_movemask_epi8(Mask))
            {
                __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);

                U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
                V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);

                __m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
                __m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
                __m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
                __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
                __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
                __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));

                // NOTE(fox): The comparison is for when we're on the last pixel of the texel.

                __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
                __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
                __m256 TexXInv     = _mm256_sub_ps(One, TexX);
                __m256 TexYInv     = _mm256_sub_ps(One, TexY);
                __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY);
                __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv);
                __m256 TexBoth     = _mm256_mul_ps(TexY, TexX);
                __m256 TexBothInv  = _mm256_mul_ps(TexXInv, TexYInv);

                __m256i XLookup =        _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
                                                          _mm256_and_si256(TexXInt, BottomTwoBits));
                __m256i YLookup =        _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
                                                          _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
                __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
                                                          _mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
                __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
                                                          _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));

                __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
                __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
                __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne);
                __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne);

                // The big feature of AVX2: gathering.
                __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4);
                __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4);
                __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
                __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);

                __m256 R_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  PixelsTL,      FF)), Norm255);
                __m256 G_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8),  FF)), Norm255);
                __m256 B_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF)), Norm255);
                __m256 A_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF)), Norm255);

                __m256 R_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  PixelsTR,      FF)), Norm255);
                __m256 G_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8),  FF)), Norm255);
                __m256 B_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF)), Norm255);
                __m256 A_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF)), Norm255);

                __m256 R_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  PixelsBL,      FF)), Norm255);
                __m256 G_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8),  FF)), Norm255);
                __m256 B_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF)), Norm255);
                __m256 A_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF)), Norm255);

                __m256 R_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  PixelsBR,      FF)), Norm255);
                __m256 G_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8),  FF)), Norm255);
                __m256 B_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF)), Norm255);
                __m256 A_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF)), Norm255);

                __m256 R_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv,  R_TexTL),
                                                           _mm256_mul_ps(TexBothYInv, R_TexTR)),
                                             _mm256_add_ps(_mm256_mul_ps(TexBothXInv, R_TexBL),
                                                           _mm256_mul_ps(TexBoth,     R_TexBR)));
                __m256 G_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv,  G_TexTL),
                                                           _mm256_mul_ps(TexBothYInv, G_TexTR)),
                                             _mm256_add_ps(_mm256_mul_ps(TexBothXInv, G_TexBL),
                                                           _mm256_mul_ps(TexBoth,     G_TexBR)));
                __m256 B_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv,  B_TexTL),
                                                           _mm256_mul_ps(TexBothYInv, B_TexTR)),
                                             _mm256_add_ps(_mm256_mul_ps(TexBothXInv, B_TexBL),
                                                           _mm256_mul_ps(TexBoth,     B_TexBR)));
                __m256 A_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv,  A_TexTL),
                                                           _mm256_mul_ps(TexBothYInv, A_TexTR)),
                                             _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
                                                           _mm256_mul_ps(TexBoth,     A_TexBR)));

                // Apply anti-aliasing to edges if there are any
                if (_mm256_movemask_epi8(EdgeMask))
                {
                    A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask);
                }

                __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
                __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);

                // Hoisted out of some blend modes; maybe it'd be better to just keep them in there.
                __m256 R_Colx2  = _mm256_mul_ps(R_Col, Two);
                __m256 R_ColInv = _mm256_sub_ps(One, R_Col);

                __m256 G_Colx2  = _mm256_mul_ps(G_Col, Two);
                __m256 G_ColInv = _mm256_sub_ps(One, G_Col);

                __m256 B_Colx2  = _mm256_mul_ps(B_Col, Two);
                __m256 B_ColInv = _mm256_sub_ps(One, B_Col);

                __m256 R_Blend = R_Col;
                __m256 G_Blend = G_Col;
                __m256 B_Blend = B_Col;
                __m256 A_Blend = LayerAlpha;

                // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
                if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal)
                {
                    __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
                    __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  DestPixel,      FF)), Norm255);
                    __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8),  FF)), Norm255);
                    __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Norm255);
                    __m256 A_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF)), Norm255);

                    switch (T.BlendMode)
                    {
                        case blend_normal:
                        {
                        } break;
                        case blend_multiply:
                        {
                            R_Blend  = _mm256_mul_ps(R_Dest, R_Col);
                            G_Blend  = _mm256_mul_ps(G_Dest, G_Col);
                            B_Blend  = _mm256_mul_ps(B_Dest, B_Col);
                        } break;
                        case blend_colorburn:
                        {
                            // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
                            // color channels, causing black clipping.
                            R_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), _mm256_add_ps(R_Col, ClipPrevent)));
                            G_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), _mm256_add_ps(G_Col, ClipPrevent)));
                            B_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), _mm256_add_ps(B_Col, ClipPrevent)));
                        } break;
                        case blend_linearburn:
                        {
                            R_Blend  = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
                            G_Blend  = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
                            B_Blend  = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
                        } break;
                        case blend_add:
                        {
                            R_Blend  = _mm256_add_ps(R_Dest, R_Col);
                            G_Blend  = _mm256_add_ps(G_Dest, G_Col);
                            B_Blend  = _mm256_add_ps(B_Dest, B_Col);
                        } break;
                        case blend_screen:
                        {
                            R_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
                            G_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
                            B_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
                        } break;
                        case blend_overlay:
                        {
                            __m256 R_Mask  = _mm256_cmp_ps(R_Dest,  ZeroPointFive, 1);
                            __m256 G_Mask  = _mm256_cmp_ps(G_Dest,  ZeroPointFive, 1);
                            __m256 B_Mask  = _mm256_cmp_ps(B_Dest,  ZeroPointFive, 1);
                            __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
                            __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
                            __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
                            __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
                            __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
                            __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
                            R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                        case blend_softlight:
                        {
                            // using Pegtop's equation
                            R_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
                            G_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
                            B_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
                        } break;
                        case blend_hardlight:
                        {
                            __m256 R_Mask   = _mm256_cmp_ps(R_Dest,  ZeroPointFive, 13);
                            __m256 G_Mask   = _mm256_cmp_ps(G_Dest,  ZeroPointFive, 13);
                            __m256 B_Mask   = _mm256_cmp_ps(B_Dest,  ZeroPointFive, 13);
                            __m256 R_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
                            __m256 G_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
                            __m256 B_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
                            __m256 R_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
                            __m256 G_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
                            __m256 B_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
                            R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                        case blend_subtract:
                        {
                            R_Blend  = _mm256_sub_ps(R_Dest, R_Col);
                            G_Blend  = _mm256_sub_ps(G_Dest, G_Col);
                            B_Blend  = _mm256_sub_ps(B_Dest, B_Col);
                        } break;
                        case blend_divide:
                        {
                            R_Blend  = _mm256_div_ps(R_Dest, _mm256_add_ps(R_Col, ClipPrevent));
                            G_Blend  = _mm256_div_ps(G_Dest, _mm256_add_ps(G_Col, ClipPrevent));
                            B_Blend  = _mm256_div_ps(B_Dest, _mm256_add_ps(B_Col, ClipPrevent));
                        } break;
                        case blend_difference:
                        {
                            __m256 R_Lower  = _mm256_sub_ps(R_Col, R_Dest);
                            __m256 G_Lower  = _mm256_sub_ps(G_Col, G_Dest);
                            __m256 B_Lower  = _mm256_sub_ps(B_Col, B_Dest);
                            __m256 R_Upper  = _mm256_sub_ps(R_Dest, R_Col);
                            __m256 G_Upper  = _mm256_sub_ps(G_Dest, G_Col);
                            __m256 B_Upper  = _mm256_sub_ps(B_Dest, B_Col);
                            __m256 R_Mask  = _mm256_cmp_ps(R_Lower,  Zero, 14);
                            __m256 G_Mask  = _mm256_cmp_ps(G_Lower,  Zero, 14);
                            __m256 B_Mask  = _mm256_cmp_ps(B_Lower,  Zero, 14);
                            R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                    }

                    R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, LayerAlphaInv), _mm256_mul_ps(R_Blend, LayerAlpha));
                    G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, LayerAlphaInv), _mm256_mul_ps(G_Blend, LayerAlpha));
                    B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, LayerAlphaInv), _mm256_mul_ps(B_Blend, LayerAlpha));

                    // Standard behavior in photo apps is for blend modes to
                    // inherit underlying opacity instead of adding to it.
                    if (T.BlendMode == blend_normal)
                        A_Blend = _mm256_add_ps(A_Dest, LayerAlpha);
                    else
                        A_Blend = A_Dest;
                }

                __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, R_Blend), Zero), Real255));
                __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, G_Blend), Zero), Real255));
                __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, B_Blend), Zero), Real255));
                __m256i A_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, A_Blend), Zero), Real255));

                __m256i OutputPixel = _mm256_or_si256(
                                      _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                      _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));


                _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
            }
            PixelX = _mm256_add_ps(PixelX, Four);
        }
    }
}

// Renders one layer into Buffer's packed bitmap using 128-bit SSE intrinsics,
// processing four horizontally adjacent destination pixels per iteration.
//
//   T            - per-layer transform/sampling info: axis terms, origin, layer
//                  dimensions, opacity, blend mode, clip rect and source buffer.
//   Buffer       - destination composition buffer; pixels are read from and
//                  written to Buffer->PackedBuffer.
//   RenderRegion - region to render; clipped against T.ClipRect by ClipRectangle.
//
// For each group of four pixels the layer-space UV is computed, the source is
// bilinearly sampled (four scalar lookups per pixel), the blend mode selected by
// T.BlendMode is applied against the destination, and the result is stored only
// to the lanes whose UV falls inside [0, 1).
//
// NOTE: despite the name, this path still relies on _mm_mullo_epi32 and
// _mm_blendv_ps, which are SSE4.1 intrinsics.
static void
SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect,
                                           RenderRegion );
    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
    LayerBounds.Min.y -= LayerBounds.Min.y % 4;

    // Only WidthP is used below (for the destination row stride); HeightP is
    // required by the helper's signature.
    uint16 WidthP, HeightP;
    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);

    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    // Broadcast the per-layer scalars so each lane works on its own pixel.
    __m128 XAxisPX = _mm_set1_ps(T.XAxisPX);
    __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
    __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
    __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);

    __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
    __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1);
    __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4);
    __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
    __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
    __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity);
    __m128 OriginX = _mm_set1_ps(T.OriginX);
    __m128 OriginY = _mm_set1_ps(T.OriginY);

    __m128 ClipPrevent = _mm_set1_ps(0.001f);
    __m128 One = _mm_set1_ps(1);
    __m128 Two = _mm_set1_ps(2);
    __m128 Zero = _mm_set1_ps(0);
    __m128 ZeroPointFive = _mm_set1_ps(0.5);
    __m128i Onei = _mm_set1_epi32(1);
    __m128 Four = _mm_set1_ps(4);
    __m128i FF = _mm_set1_epi32(0xFF);
    __m128i BottomTwoBits = _mm_set1_epi32(0x03);
    __m128i Fouri = _mm_set1_epi32(4);
    __m128i Sixteeni = _mm_set1_epi32(16);
    __m128 Reg255 = _mm_set1_ps(255.0f);
    __m128 Norm255 = _mm_set1_ps(1/255.0f);

    // NOTE(fox):  Each loop operates on 4 pixels, 4 horizontal by 1 vertical.

    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
    {
        __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x,
                                    (real32)LayerBounds.Min.x+1,
                                    (real32)LayerBounds.Min.x+2,
                                    (real32)LayerBounds.Min.x+3);

        __m128 PixelY = _mm_set1_ps((real32)Y);
        __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);

        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
        {
            IACA_START;

            __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);

            // Destination address inside the 4x4-packed bitmap: 16 pixels per
            // block horizontally, WidthP*4 pixels per block row, 4 pixels per
            // row within a block.
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;

            __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
            __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));

            // Lanes whose UV lies in [0, 1) on both axes are inside the layer.
            __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
                                                            _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));

            if (_mm_movemask_epi8(LayerMask))
            {
                // Clamp so that lanes outside the layer still produce in-range UVs
                // for the unconditional scalar lookups below.
                U = _mm_max_ps(_mm_min_ps(One, U), Zero);
                V = _mm_max_ps(_mm_min_ps(One, V), Zero);

                __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
                __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
                // The "+1" neighbour only advances while it stays below the last
                // column/row; otherwise it repeats the current texel.
                __m128i TexXInt = _mm_cvttps_epi32(TexXFull);
                __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei));
                __m128i TexYInt = _mm_cvttps_epi32(TexYFull);
                __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei));

                // Fractional parts and the four bilinear weights.
                __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt));
                __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
                __m128 TexXInv     = _mm_sub_ps(One, TexX);
                __m128 TexYInv     = _mm_sub_ps(One, TexY);
                __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY);
                __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv);
                __m128 TexBoth     = _mm_mul_ps(TexY, TexX);
                __m128 TexBothInv  = _mm_mul_ps(TexXInv, TexYInv);

                // Same 4x4-packed addressing as the destination, applied per lane
                // to the source texel coordinates.
                __m128i TexXLookup =        _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni),
                                                          _mm_and_si128(TexXInt, BottomTwoBits));
                __m128i TexYLookup =        _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
                                                          _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri));
                __m128i TexXLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
                                                          _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
                __m128i TexYLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
                                                          _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri));

                __m128i PixelLookupTL = _mm_add_epi32(TexXLookup, TexYLookup);
                __m128i PixelLookupTR = _mm_add_epi32(TexXLookupPlusOne, TexYLookup);
                __m128i PixelLookupBL = _mm_add_epi32(TexXLookup, TexYLookupPlusOne);
                __m128i PixelLookupBR = _mm_add_epi32(TexXLookupPlusOne, TexYLookupPlusOne);

                // SSE lacks gathering, so we have no choice but to manually
                // look up each pixel's four bilinear samples in scalar.

                // Lane 0
                uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL);
                uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
                uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
                uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
                uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4);
                uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
                uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
                uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);

                // Lane 1
                uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4));
                uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
                uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
                uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
                uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
                uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
                uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
                uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);

                // Lane 2
                uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8));
                uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
                uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
                uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
                uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
                uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
                uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
                uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);

                // Lane 3
                uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12));
                uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
                uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
                uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
                uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
                uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
                uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
                uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);

                __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3);
                __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
                __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
                __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);

                // Unpack each sample's four 8-bit channels (R in the low byte,
                // A in the high byte) into normalized floats.
                __m128 R_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsTL,      FF)), Norm255);
                __m128 G_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 8),  FF)), Norm255);
                __m128 B_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF)), Norm255);
                __m128 A_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF)), Norm255);

                __m128 R_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsTR,      FF)), Norm255);
                __m128 G_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 8),  FF)), Norm255);
                __m128 B_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF)), Norm255);
                __m128 A_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF)), Norm255);

                __m128 R_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsBL,      FF)), Norm255);
                __m128 G_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 8),  FF)), Norm255);
                __m128 B_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF)), Norm255);
                __m128 A_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF)), Norm255);

                __m128 R_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsBR,      FF)), Norm255);
                __m128 G_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 8),  FF)), Norm255);
                __m128 B_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF)), Norm255);
                __m128 A_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF)), Norm255);

                // Bilinear interpolation of the four samples, per channel.
                __m128 R_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv,  R_TexTL),
                                                     _mm_mul_ps(TexBothYInv, R_TexTR)),
                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, R_TexBL),
                                                     _mm_mul_ps(TexBoth,     R_TexBR)));
                __m128 G_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv,  G_TexTL),
                                                     _mm_mul_ps(TexBothYInv, G_TexTR)),
                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, G_TexBL),
                                                     _mm_mul_ps(TexBoth,     G_TexBR)));
                __m128 B_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv,  B_TexTL),
                                                     _mm_mul_ps(TexBothYInv, B_TexTR)),
                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, B_TexBL),
                                                     _mm_mul_ps(TexBoth,     B_TexBR)));
                __m128 A_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv,  A_TexTL),
                                                     _mm_mul_ps(TexBothYInv, A_TexTR)),
                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, A_TexBL),
                                                     _mm_mul_ps(TexBoth,     A_TexBR)));

                __m128 LayerAlpha = _mm_mul_ps(A_Col, LayerOpacity);
                __m128 LayerAlphaInv = _mm_sub_ps(One, LayerAlpha);

                // Shared by several blend modes below.
                __m128 R_Colx2  = _mm_mul_ps(R_Col, Two);
                __m128 R_ColInv = _mm_sub_ps(One, R_Col);

                __m128 G_Colx2  = _mm_mul_ps(G_Col, Two);
                __m128 G_ColInv = _mm_sub_ps(One, G_Col);

                __m128 B_Colx2  = _mm_mul_ps(B_Col, Two);
                __m128 B_ColInv = _mm_sub_ps(One, B_Col);

                // Defaults used when the destination does not need to be read.
                __m128 R_Blend = R_Col;
                __m128 G_Blend = G_Col;
                __m128 B_Blend = B_Col;
                __m128 A_Blend = LayerAlpha;

                // Only load the dest pixel if we actually need to: some lane that
                // will be stored is not fully opaque, or the blend mode requires it.
                // The test must fire when ANY visible lane is translucent; skipping
                // the blend because a single lane is opaque would write the other
                // lanes without compositing them over the destination.
                int32 TranslucentLanes = _mm_movemask_ps(_mm_and_ps(_mm_cmplt_ps(LayerAlpha, One),
                                                                    _mm_castsi128_ps(LayerMask)));
                if (TranslucentLanes || T.BlendMode != blend_normal)
                {
                    __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
                    __m128 R_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               DestPixel,      FF)), Norm255);
                    __m128 G_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 8),  FF)), Norm255);
                    __m128 B_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF)), Norm255);
                    __m128 A_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF)), Norm255);

                    switch (T.BlendMode)
                    {
                        case blend_normal:
                        {
                        } break;
                        case blend_multiply:
                        {
                            R_Blend  = _mm_mul_ps(R_Dest, R_Col);
                            G_Blend  = _mm_mul_ps(G_Dest, G_Col);
                            B_Blend  = _mm_mul_ps(B_Dest, B_Col);
                        } break;
                        case blend_colorburn:
                        {
                            // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
                            // color channels, causing black clipping.
                            R_Blend  = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, R_Dest), _mm_add_ps(R_Col, ClipPrevent)));
                            G_Blend  = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, G_Dest), _mm_add_ps(G_Col, ClipPrevent)));
                            B_Blend  = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, B_Dest), _mm_add_ps(B_Col, ClipPrevent)));
                        } break;
                        case blend_linearburn:
                        {
                            R_Blend  = _mm_sub_ps(_mm_add_ps(R_Dest, R_Col), One);
                            G_Blend  = _mm_sub_ps(_mm_add_ps(G_Dest, G_Col), One);
                            B_Blend  = _mm_sub_ps(_mm_add_ps(B_Dest, B_Col), One);
                        } break;
                        case blend_add:
                        {
                            R_Blend  = _mm_add_ps(R_Dest, R_Col);
                            G_Blend  = _mm_add_ps(G_Dest, G_Col);
                            B_Blend  = _mm_add_ps(B_Dest, B_Col);
                        } break;
                        case blend_screen:
                        {
                            R_Blend  = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv));
                            G_Blend  = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv));
                            B_Blend  = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv));
                        } break;
                        case blend_overlay:
                        {
                            // Dest < 0.5 selects the multiply-style branch, otherwise the screen-style one.
                            __m128 R_Mask  = _mm_cmplt_ps(R_Dest,  ZeroPointFive);
                            __m128 G_Mask  = _mm_cmplt_ps(G_Dest,  ZeroPointFive);
                            __m128 B_Mask  = _mm_cmplt_ps(B_Dest,  ZeroPointFive);
                            __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
                            __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
                            __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
                            __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest),  R_ColInv)));
                            __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest),  G_ColInv)));
                            __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest),  B_ColInv)));
                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                        case blend_softlight:
                        {
                            // using Pegtop's equation
                            R_Blend  = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, R_Colx2), _mm_mul_ps(R_Dest, R_Dest)), _mm_mul_ps(R_Colx2, R_Dest));
                            G_Blend  = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, G_Colx2), _mm_mul_ps(G_Dest, G_Dest)), _mm_mul_ps(G_Colx2, G_Dest));
                            B_Blend  = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, B_Colx2), _mm_mul_ps(B_Dest, B_Dest)), _mm_mul_ps(B_Colx2, B_Dest));
                        } break;
                        case blend_hardlight:
                        {
                            // Dest >= 0.5 selects the multiply-style branch here.
                            // NOTE(review): hard light is conventionally keyed on the layer
                            // color rather than the destination; kept as-is so results match
                            // the AVX2 path -- confirm the intended formula.
                            __m128 R_Mask   = _mm_cmpge_ps(R_Dest,  ZeroPointFive);
                            __m128 G_Mask   = _mm_cmpge_ps(G_Dest,  ZeroPointFive);
                            __m128 B_Mask   = _mm_cmpge_ps(B_Dest,  ZeroPointFive);
                            __m128 R_Lower  = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
                            __m128 G_Lower  = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
                            __m128 B_Lower  = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
                            __m128 R_Upper  = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest),  R_ColInv)));
                            __m128 G_Upper  = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest),  G_ColInv)));
                            __m128 B_Upper  = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest),  B_ColInv)));
                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                        case blend_subtract:
                        {
                            R_Blend  = _mm_sub_ps(R_Dest, R_Col);
                            G_Blend  = _mm_sub_ps(G_Dest, G_Col);
                            B_Blend  = _mm_sub_ps(B_Dest, B_Col);
                        } break;
                        case blend_divide:
                        {
                            R_Blend  = _mm_div_ps(R_Dest, _mm_add_ps(R_Col, ClipPrevent));
                            G_Blend  = _mm_div_ps(G_Dest, _mm_add_ps(G_Col, ClipPrevent));
                            B_Blend  = _mm_div_ps(B_Dest, _mm_add_ps(B_Col, ClipPrevent));
                        } break;
                        case blend_difference:
                        {
                            // Absolute difference: pick whichever subtraction is positive.
                            __m128 R_Lower  = _mm_sub_ps(R_Col, R_Dest);
                            __m128 G_Lower  = _mm_sub_ps(G_Col, G_Dest);
                            __m128 B_Lower  = _mm_sub_ps(B_Col, B_Dest);
                            __m128 R_Upper  = _mm_sub_ps(R_Dest, R_Col);
                            __m128 G_Upper  = _mm_sub_ps(G_Dest, G_Col);
                            __m128 B_Upper  = _mm_sub_ps(B_Dest, B_Col);
                            __m128 R_Mask  = _mm_cmpgt_ps(R_Lower,  Zero);
                            __m128 G_Mask  = _mm_cmpgt_ps(G_Lower,  Zero);
                            __m128 B_Mask  = _mm_cmpgt_ps(B_Lower,  Zero);
                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
                        } break;
                        default:
                        {
                            // Unhandled modes fall back to the unmodified layer color.
                        } break;
                    }

                    // Composite the blended color over the destination by layer alpha.
                    R_Blend = _mm_add_ps(_mm_mul_ps(R_Dest, LayerAlphaInv), _mm_mul_ps(R_Blend, LayerAlpha));
                    G_Blend = _mm_add_ps(_mm_mul_ps(G_Dest, LayerAlphaInv), _mm_mul_ps(G_Blend, LayerAlpha));
                    B_Blend = _mm_add_ps(_mm_mul_ps(B_Dest, LayerAlphaInv), _mm_mul_ps(B_Blend, LayerAlpha));

                    // Standard behavior in photo apps is for blend modes to
                    // inherit underlying opacity instead of adding to it.
                    if (T.BlendMode == blend_normal)
                        A_Blend = _mm_add_ps(A_Dest, LayerAlpha);
                    else
                        A_Blend = A_Dest;
                }

                // Clamp to [0, 1], scale back to 8 bits and repack the channels.
                __m128i R_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, R_Blend), Zero), Reg255));
                __m128i G_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, G_Blend), Zero), Reg255));
                __m128i B_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, B_Blend), Zero), Reg255));
                __m128i A_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, A_Blend), Zero), Reg255));

                __m128i OutputPixel = _mm_or_si128(
                                      _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
                                      _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
                // Write only the lanes that fall inside the layer.
                _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel);
            }
            PixelX = _mm_add_ps(PixelX, Four);
        }
    }
}


#endif

// Scalar (non-SIMD) layer renderer.
//
// For every destination pixel inside the intersection of T.ClipRect and
// RenderRegion, maps the pixel into the layer's UV space using the
// precomputed axes in T, bilinearly samples the layer's source bitmap, applies
// T.BlendMode against the pixel already in Buffer->PackedBuffer, and writes
// the result back in place.
//
// Both the source bitmap and the destination buffer are addressed with the
// same tiled scheme: pixel (x, y) lives at index
//     (x >> 2)*16 + (x % 4)  +  (y >> 2)*(RowWidth*4) + (y % 4)*4
// i.e. 4x4 blocks of 16 pixels each, with blocks stored row by row.
// Pixels are 32-bit with R in bits 0-7, G in 8-15, B in 16-23, A in 24-31.
//
// T            - transform, source pointer, opacity and blend mode of the layer.
// Buffer       - destination composition buffer (read and written).
// RenderRegion - region of the destination this call is allowed to touch.
static void
Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);

    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    // WidthP is the row width used by the destination's tiled addressing
    // below; HeightP is only needed to satisfy the helper's signature.
    uint16 WidthP, HeightP;
    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);

    real32 Normalized255 = 1 / 255.0f;

    for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
    {
        real32 StartVectorY = (real32)Y - T.OriginY;

        for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
        {
            IACA_START;

            // Project the destination pixel onto the layer's axes to get
            // its UV coordinate.
            real32 StartVectorX = X - T.OriginX;
            real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
            real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);

            // Only pixels whose UV falls in [0, 1) are covered by the layer.
            if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {

                // Split the texel coordinate into its integer part (which
                // texel) and fractional part (the bilinear weight).
                real32 TexXFull = U * T.LayerWidth;
                uint32 TexXInt = (uint32)TexXFull;
                real32 TexX = TexXFull - TexXInt;

                real32 TexYFull = V * T.LayerHeight;
                uint32 TexYInt = (uint32)TexYFull;
                real32 TexY = TexYFull - TexYInt;

                // Bilinear weights for the four neighboring texels:
                //   TexBothInv  -> A (LX,     LY)
                //   TexBothYInv -> B (LXPlus, LY)
                //   TexBothXInv -> C (LX,     LYPlus)
                //   TexBoth     -> D (LXPlus, LYPlus)
                real32 TexXInv = 1 - TexX;
                real32 TexYInv = 1 - TexY;
                real32 TexBothXInv = TexXInv * TexY;
                real32 TexBothYInv = TexX * TexYInv;
                real32 TexBoth = TexY * TexX;
                real32 TexBothInv = TexXInv * TexYInv;

#if 0
                uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel);
                uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel);

                uint32 PixelA = *(uint32 *)TexPTR0;
                uint32 PixelB = *((uint32 *)TexPTR0 + 1);
                uint32 PixelC = *(uint32 *)TexPTR1;
                uint32 PixelD = *((uint32 *)TexPTR1 + 1);
#else
                uint32 XLookup, YLookup, PixelToSeek;

                // TODO(fox): Anti-aliasing on edges
                uint16 LX = TexXInt;
                uint16 LY = TexYInt;
                // NOTE(review): Ceil() presumably limits the +1 neighbor to
                // the last valid texel so edge samples stay inside the
                // layer; confirm against its definition.
                uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
                uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);

                // TODO(fox): Be careful with the BytesPerPixel here! It's the
                // buffer's, not the layer's (currently everything is 4 bytes
                // per pixel).

                // Texel A: (LX, LY)
                XLookup = (LX >> 2)*16 + (LX % 4);
                YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
                PixelToSeek = XLookup + YLookup;
                uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);

                // Texel B: (LXPlus, LY)
                XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
                YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
                PixelToSeek = XLookup + YLookup;
                uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);

                // Texel C: (LX, LYPlus)
                XLookup = (LX >> 2)*16 + (LX % 4);
                YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
                PixelToSeek = XLookup + YLookup;
                uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);

                // Texel D: (LXPlus, LYPlus)
                XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
                YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
                PixelToSeek = XLookup + YLookup;
                uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
#endif
                // Destination pixel, using the same tiled addressing with
                // the composition buffer's packed width.
                XLookup = (X >> 2)*16 + (X % 4);
                YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
                PixelToSeek = XLookup + YLookup;
                uint32 *Pixel = (uint32 *)((uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel);

                // Unpack the four texels into normalized [0, 1] channels.
                real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255;
                real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255;
                real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255;
                real32 TexRD = (real32)(PixelD & 0xFF) * Normalized255;

                real32 TexGA = (real32)((PixelA >> 8) & 0xFF) * Normalized255;
                real32 TexGB = (real32)((PixelB >> 8) & 0xFF) * Normalized255;
                real32 TexGC = (real32)((PixelC >> 8) & 0xFF) * Normalized255;
                real32 TexGD = (real32)((PixelD >> 8) & 0xFF) * Normalized255;

                real32 TexBA = (real32)((PixelA >> 16) & 0xFF) * Normalized255;
                real32 TexBB = (real32)((PixelB >> 16) & 0xFF) * Normalized255;
                real32 TexBC = (real32)((PixelC >> 16) & 0xFF) * Normalized255;
                real32 TexBD = (real32)((PixelD >> 16) & 0xFF) * Normalized255;

                real32 TexAA = (real32)((PixelA >> 24) & 0xFF) * Normalized255;
                real32 TexAB = (real32)((PixelB >> 24) & 0xFF) * Normalized255;
                real32 TexAC = (real32)((PixelC >> 24) & 0xFF) * Normalized255;
                real32 TexAD = (real32)((PixelD >> 24) & 0xFF) * Normalized255;

                // Bilinear interpolation of each channel.
                real32 R_Col = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
                                + (TexBothXInv * TexRC) + (TexBoth * TexRD);
                real32 G_Col = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
                                + (TexBothXInv * TexGC) + (TexBoth * TexGD);
                real32 B_Col = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
                                + (TexBothXInv * TexBC) + (TexBoth * TexBD);
                real32 A_Col = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
                                + (TexBothXInv * TexAC) + (TexBoth * TexAD);

                real32 LayerAlpha = A_Col * T.LayerOpacity;

                real32 R_Blend = R_Col;
                real32 G_Blend = G_Col;
                real32 B_Blend = B_Col;
                real32 A_Blend = A_Col;

                // A fully opaque sample in normal mode simply overwrites the
                // destination, so the destination is only read and blended
                // when the sample is translucent or a blend mode is active.
                if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {

                    real32 R_Dest = (real32)((*Pixel >>  0) & 0xFF) * Normalized255;
                    real32 G_Dest = (real32)((*Pixel >>  8) & 0xFF) * Normalized255;
                    real32 B_Dest = (real32)((*Pixel >> 16) & 0xFF) * Normalized255;
                    real32 A_Dest = (real32)((*Pixel >> 24) & 0xFF) * Normalized255;

                    // Results may leave [0, 1] here; they are clamped when
                    // converted back to 8 bits below.
                    switch (T.BlendMode)
                    {
                        case blend_normal:
                        {
                        } break;
                        case blend_multiply:
                        {
                            R_Blend  = R_Dest * R_Col;
                            G_Blend  = G_Dest * G_Col;
                            B_Blend  = B_Dest * B_Col;
                        } break;
                        case blend_colorburn:
                        {
                            // NOTE(fox): Padding to prevent actual crashing from zero division
                            R_Blend = 1.0f - ((1.0f - R_Dest) / (R_Col + 0.001f));
                            G_Blend = 1.0f - ((1.0f - G_Dest) / (G_Col + 0.001f));
                            B_Blend = 1.0f - ((1.0f - B_Dest) / (B_Col + 0.001f));
                        } break;
                        case blend_linearburn:
                        {
                            R_Blend = (R_Dest + R_Col) - 1.0f;
                            G_Blend = (G_Dest + G_Col) - 1.0f;
                            B_Blend = (B_Dest + B_Col) - 1.0f;
                        } break;
                        case blend_add:
                        {
                            R_Blend = R_Dest + R_Col;
                            G_Blend = G_Dest + G_Col;
                            B_Blend = B_Dest + B_Col;
                        } break;
                        case blend_screen:
                        {
                            R_Blend  = 1.0f - ((1.0f - R_Dest) * (1.0f - R_Col));
                            G_Blend  = 1.0f - ((1.0f - G_Dest) * (1.0f - G_Col));
                            B_Blend  = 1.0f - ((1.0f - B_Dest) * (1.0f - B_Col));
                        } break;
                        case blend_overlay:
                        {
                            // Multiply in the dark half of the destination,
                            // screen in the bright half.
                            if (R_Dest < 0.5) {
                                R_Blend = 2.0f * R_Dest * R_Col;
                            } else {
                                R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));
                            }
                            if (G_Dest < 0.5) {
                                G_Blend = 2.0f * G_Dest * G_Col;
                            } else {
                                G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));
                            }
                            if (B_Dest < 0.5) {
                                B_Blend = 2.0f * B_Dest * B_Col;
                            } else {
                                B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));
                            }
                        } break;
                        case blend_softlight:
                        {
                            // using Pegtop's equation
                            R_Blend = ((1.0f - R_Col * 2) * R_Dest * R_Dest) + (R_Col * 2 * R_Dest);
                            G_Blend = ((1.0f - G_Col * 2) * G_Dest * G_Dest) + (G_Col * 2 * G_Dest);
                            B_Blend = ((1.0f - B_Col * 2) * B_Dest * B_Dest) + (B_Col * 2 * B_Dest);
                        } break;
                        case blend_hardlight:
                        {
                            // NOTE(review): the usual hard-light definition
                            // selects on the layer color (multiply when the
                            // layer is dark, screen when bright). This
                            // selects on the destination with the branches
                            // flipped relative to overlay. Left unchanged so
                            // the fallback keeps matching the SIMD paths;
                            // confirm whether this is intentional.
                            if (R_Dest > 0.5) {
                                R_Blend = 2.0f * R_Dest * R_Col;
                            } else {
                                R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));
                            }
                            if (G_Dest > 0.5) {
                                G_Blend = 2.0f * G_Dest * G_Col;
                            } else {
                                G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));
                            }
                            if (B_Dest > 0.5) {
                                B_Blend = 2.0f * B_Dest * B_Col;
                            } else {
                                B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));
                            }
                        } break;
                        case blend_subtract:
                        {
                            R_Blend = R_Dest - R_Col;
                            G_Blend = G_Dest - G_Col;
                            B_Blend = B_Dest - B_Col;
                        } break;
                        case blend_divide:
                        {
                            // Same padding as color burn to avoid dividing by zero.
                            R_Blend = R_Dest / (R_Col + 0.001f);
                            G_Blend = G_Dest / (G_Col + 0.001f);
                            B_Blend = B_Dest / (B_Col + 0.001f);
                        } break;
                        case blend_difference:
                        {
                            // Absolute difference of layer and destination.
                            if (R_Col - R_Dest > 0) {
                                R_Blend = R_Col - R_Dest;
                            } else {
                                R_Blend = R_Dest - R_Col;
                            }
                            if (G_Col - G_Dest > 0) {
                                G_Blend = G_Col - G_Dest;
                            } else {
                                G_Blend = G_Dest - G_Col;
                            }
                            if (B_Col - B_Dest > 0) {
                                B_Blend = B_Col - B_Dest;
                            } else {
                                B_Blend = B_Dest - B_Col;
                            }
                        } break;
                    }

                    // Mix the blended color over the destination by the
                    // layer's effective alpha.
                    R_Blend = (R_Dest * (1.0f - LayerAlpha)) + (R_Blend * LayerAlpha);
                    G_Blend = (G_Dest * (1.0f - LayerAlpha)) + (G_Blend * LayerAlpha);
                    B_Blend = (B_Dest * (1.0f - LayerAlpha)) + (B_Blend * LayerAlpha);

                    // Normal mode adds the layer's alpha to the destination's;
                    // every other blend mode keeps the destination alpha.
                    if (T.BlendMode == blend_normal)
                        A_Blend = A_Dest + LayerAlpha;
                    else
                        A_Blend = A_Dest;
                }

                // Convert back to 8 bits per channel. The +0.5f rounds to
                // nearest instead of truncating, which matches the SIMD
                // path's _mm_cvtps_epi32 conversion and stops a value like
                // 254/255 from coming back as 253 after a float round trip.
                // NOTE(review): assumes Normalize() clamps to [0, 1], like
                // the min/max pair in the SIMD path; confirm.
                // Channels are widened to uint32 so the shifts below happen
                // in unsigned arithmetic (shifting a promoted int by 24 can
                // hit the sign bit).
                uint32 R_Out = (uint8)(Normalize(R_Blend) * 255.0f + 0.5f);
                uint32 G_Out = (uint8)(Normalize(G_Blend) * 255.0f + 0.5f);
                uint32 B_Out = (uint8)(Normalize(B_Blend) * 255.0f + 0.5f);
                uint32 A_Out = (uint8)(Normalize(A_Blend) * 255.0f + 0.5f);

                *Pixel = ((A_Out << 24) |
                          (B_Out << 16) |
                          (G_Out <<  8) |
                          (R_Out <<  0));
            }
        }
    }
}