summaryrefslogtreecommitdiff
path: root/prenderer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'prenderer.cpp')
-rw-r--r--prenderer.cpp734
1 files changed, 551 insertions, 183 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index 72c2893..5df28f4 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -81,7 +81,8 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer)
TransformInfo.LayerHeight = (real32)Source->Raster.Height;
TransformInfo.FullLayerWidth = Source->Raster.FullWidth;
TransformInfo.FullLayerHeight = Source->Raster.FullHeight;
- TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f;
+ TransformInfo.LayerOpacity = Layer->opacity.CurrentValue.f;
+ TransformInfo.BlendMode =Layer->BlendMode;
TransformInfo.OriginX = Origin.x;
TransformInfo.OriginY = Origin.y;
TransformInfo.BufferPitch = Buffer->Pitch;
@@ -152,6 +153,15 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S
for (int x = 0; x < 4; x++) {
// if (x == y) {
rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y};
+ // The render regions always have to be aligned to the top left of
+ // a 4x4 chunk (at least for AVX2) and cannot exceed the bounds of
+ // the comp.
+ // It seems we don't need any special math to guarantee this aside
+ // from subtracting each coordinate's remainder modulo 4 (rounding it down to a multiple of 4).
+ RenderRegion.Min.x -= RenderRegion.Min.x % 4;
+ RenderRegion.Min.y -= RenderRegion.Min.y % 4;
+ RenderRegion.Max.x -= RenderRegion.Max.x % 4;
+ RenderRegion.Max.y -= RenderRegion.Max.y % 4;
if (RenderRegion.Max.x > CompBuffer->Width)
RenderRegion.Max.x = CompBuffer->Width;
if (RenderRegion.Max.y > CompBuffer->Height)
@@ -379,8 +389,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 OriginX = _mm256_set1_ps(T.OriginX);
__m256 OriginY = _mm256_set1_ps(T.OriginY);
+ __m256 ClipPrevent = _mm256_set1_ps(0.001f);
__m256 One = _mm256_set1_ps(1);
+ __m256 Two = _mm256_set1_ps(2);
__m256 Zero = _mm256_set1_ps(0);
+ __m256 ZeroPointFive = _mm256_set1_ps(0.5);
__m256i Zeroi = _mm256_set1_epi32(0);
__m256i Onei = _mm256_set1_epi32(1);
__m256 Four = _mm256_set1_ps(4);
@@ -389,7 +402,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i BottomTwoBits = _mm256_set1_epi32(0x03);
__m256i Fouri = _mm256_set1_epi32(4);
__m256i Sixteeni = _mm256_set1_epi32(16);
- __m256 Reg255 = _mm256_set1_ps(255.0f);
+ __m256 Real255 = _mm256_set1_ps(255.0f);
__m256i Int255 = _mm256_set1_epi32(255);
__m256 Norm255 = _mm256_set1_ps(1/255.0f);
// __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
@@ -450,7 +463,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
__m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
- // NOTE(fox): The comparison is for when we're on the last pixel.
+ // NOTE(fox): The comparisons above keep the +1 neighbor index from stepping past the last column/row of the layer.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
__m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
@@ -481,71 +494,190 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
__m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);
- __m256i R_TexTL = _mm256_and_si256( PixelsTL, FF);
- __m256i G_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF);
- __m256i B_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF);
- __m256i A_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF);
-
- __m256i R_TexTR = _mm256_and_si256( PixelsTR, FF);
- __m256i G_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF);
- __m256i B_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF);
- __m256i A_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF);
-
- __m256i R_TexBL = _mm256_and_si256( PixelsBL, FF);
- __m256i G_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF);
- __m256i B_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF);
- __m256i A_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF);
-
- __m256i R_TexBR = _mm256_and_si256( PixelsBR, FF);
- __m256i G_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF);
- __m256i B_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF);
- __m256i A_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF);
-
- __m256 R_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(R_TexTL)),
- _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(R_TexTR))),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(R_TexBL)),
- _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(R_TexBR))));
- __m256 G_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(G_TexTL)),
- _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(G_TexTR))),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(G_TexBL)),
- _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(G_TexBR))));
- __m256 B_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(B_TexTL)),
- _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(B_TexTR))),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(B_TexBL)),
- _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(B_TexBR))));
- __m256 A_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(A_TexTL)),
- _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(A_TexTR))),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(A_TexBL)),
- _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(A_TexBR))));
-
- A_PixelBlend = _mm256_sub_ps(A_PixelBlend, _mm256_mul_ps(A_PixelBlend, LayerOpacity));
-
- __m256i R_Out, G_Out, B_Out, A_Out;
- // Only do alpha blending if a pixel's value doesn't equal 255
- if (_mm256_movemask_epi8(_mm256_sub_epi32(_mm256_cvtps_epi32(A_PixelBlend), Int255)))
+ __m256 R_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTL, FF)), Norm255);
+ __m256 G_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF)), Norm255);
+ __m256 B_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF)), Norm255);
+ __m256 A_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF)), Norm255);
+
+ __m256 R_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTR, FF)), Norm255);
+ __m256 G_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF)), Norm255);
+ __m256 B_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF)), Norm255);
+ __m256 A_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF)), Norm255);
+
+ __m256 R_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBL, FF)), Norm255);
+ __m256 G_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF)), Norm255);
+ __m256 B_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF)), Norm255);
+ __m256 A_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF)), Norm255);
+
+ __m256 R_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBR, FF)), Norm255);
+ __m256 G_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF)), Norm255);
+ __m256 B_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF)), Norm255);
+ __m256 A_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF)), Norm255);
+
+ __m256 R_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, R_TexTL),
+ _mm256_mul_ps(TexBothYInv, R_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, R_TexBL),
+ _mm256_mul_ps(TexBoth, R_TexBR)));
+ __m256 G_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, G_TexTL),
+ _mm256_mul_ps(TexBothYInv, G_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, G_TexBL),
+ _mm256_mul_ps(TexBoth, G_TexBR)));
+ __m256 B_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, B_TexTL),
+ _mm256_mul_ps(TexBothYInv, B_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, B_TexBL),
+ _mm256_mul_ps(TexBoth, B_TexBR)));
+ __m256 A_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, A_TexTL),
+ _mm256_mul_ps(TexBothYInv, A_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
+ _mm256_mul_ps(TexBoth, A_TexBR)));
+
+ __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
+ __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
+
+ // Hoisted out of some blend modes; maybe it'd be better to just keep them in there.
+ __m256 R_Colx2 = _mm256_mul_ps(R_Col, Two);
+ __m256 R_ColInv = _mm256_sub_ps(One, R_Col);
+
+ __m256 G_Colx2 = _mm256_mul_ps(G_Col, Two);
+ __m256 G_ColInv = _mm256_sub_ps(One, G_Col);
+
+ __m256 B_Colx2 = _mm256_mul_ps(B_Col, Two);
+ __m256 B_ColInv = _mm256_sub_ps(One, B_Col);
+
+ __m256 R_Blend = R_Col;
+ __m256 G_Blend = G_Col;
+ __m256 B_Blend = B_Col;
+ __m256 A_Blend = LayerAlpha;
+
+ // Only load the dest pixel if we actually need to (a pixel's normalized alpha isn't 1.0, i.e. fully opaque, or the blend mode requires it).
+ if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal)
{
- __m256 LayerAlpha = _mm256_mul_ps(A_PixelBlend, Norm255);
- __m256 LayerAlphaInv = _mm256_mul_ps(_mm256_sub_ps(Reg255, A_PixelBlend), Norm255);
-
__m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
- __m256i R_Dest = _mm256_and_si256( DestPixel, FF);
- __m256i G_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF);
- __m256i B_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF);
- __m256i A_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF);
-
- R_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm256_mul_ps(R_PixelBlend, LayerAlpha)));
- G_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm256_mul_ps(G_PixelBlend, LayerAlpha)));
- B_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm256_mul_ps(B_PixelBlend, LayerAlpha)));
- A_Out = _mm256_cvtps_epi32(_mm256_min_ps(_mm256_add_ps(_mm256_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
- }
- else
- {
- R_Out = _mm256_cvtps_epi32(R_PixelBlend);
- G_Out = _mm256_cvtps_epi32(G_PixelBlend);
- B_Out = _mm256_cvtps_epi32(B_PixelBlend);
- A_Out = _mm256_cvtps_epi32(A_PixelBlend);
+ __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
+ __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF)), Norm255);
+ __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Norm255);
+ __m256 A_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF)), Norm255);
+
+ switch (T.BlendMode)
+ {
+ case blend_normal:
+ {
+ } break;
+ case blend_multiply:
+ {
+ R_Blend = _mm256_mul_ps(R_Dest, R_Col);
+ G_Blend = _mm256_mul_ps(G_Dest, G_Col);
+ B_Blend = _mm256_mul_ps(B_Dest, B_Col);
+ } break;
+ case blend_colorburn:
+ {
+ // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
+ // color channels, causing black clipping.
+ R_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), _mm256_add_ps(R_Col, ClipPrevent)));
+ G_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), _mm256_add_ps(G_Col, ClipPrevent)));
+ B_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), _mm256_add_ps(B_Col, ClipPrevent)));
+ } break;
+ case blend_linearburn:
+ {
+ R_Blend = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
+ G_Blend = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
+ B_Blend = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
+ } break;
+ case blend_add:
+ {
+ R_Blend = _mm256_add_ps(R_Dest, R_Col);
+ G_Blend = _mm256_add_ps(G_Dest, G_Col);
+ B_Blend = _mm256_add_ps(B_Dest, B_Col);
+ } break;
+ case blend_screen:
+ {
+ R_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
+ G_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
+ B_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
+ } break;
+ case blend_overlay:
+ {
+ __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 1);
+ __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 1);
+ __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 1);
+ __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+ __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+ __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+ __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
+ __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
+ __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_softlight:
+ {
+ // using Pegtop's equation
+ R_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
+ G_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
+ B_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
+ } break;
+ case blend_hardlight:
+ {
+ __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 13);
+ __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 13);
+ __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 13);
+ __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+ __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+ __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+ __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
+ __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
+ __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_subtract:
+ {
+ R_Blend = _mm256_sub_ps(R_Dest, R_Col);
+ G_Blend = _mm256_sub_ps(G_Dest, G_Col);
+ B_Blend = _mm256_sub_ps(B_Dest, B_Col);
+ } break;
+ case blend_divide:
+ {
+ R_Blend = _mm256_div_ps(R_Dest, _mm256_add_ps(R_Col, ClipPrevent));
+ G_Blend = _mm256_div_ps(G_Dest, _mm256_add_ps(G_Col, ClipPrevent));
+ B_Blend = _mm256_div_ps(B_Dest, _mm256_add_ps(B_Col, ClipPrevent));
+ } break;
+ case blend_difference:
+ {
+ __m256 R_Lower = _mm256_sub_ps(R_Col, R_Dest);
+ __m256 G_Lower = _mm256_sub_ps(G_Col, G_Dest);
+ __m256 B_Lower = _mm256_sub_ps(B_Col, B_Dest);
+ __m256 R_Upper = _mm256_sub_ps(R_Dest, R_Col);
+ __m256 G_Upper = _mm256_sub_ps(G_Dest, G_Col);
+ __m256 B_Upper = _mm256_sub_ps(B_Dest, B_Col);
+ __m256 R_Mask = _mm256_cmp_ps(R_Lower, Zero, 14);
+ __m256 G_Mask = _mm256_cmp_ps(G_Lower, Zero, 14);
+ __m256 B_Mask = _mm256_cmp_ps(B_Lower, Zero, 14);
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ }
+
+ R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, LayerAlphaInv), _mm256_mul_ps(R_Blend, LayerAlpha));
+ G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, LayerAlphaInv), _mm256_mul_ps(G_Blend, LayerAlpha));
+ B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, LayerAlphaInv), _mm256_mul_ps(B_Blend, LayerAlpha));
+
+ // Standard behavior in photo apps is for blend modes to
+ // inherit underlying opacity instead of adding to it.
+ if (T.BlendMode == blend_normal)
+ A_Blend = _mm256_add_ps(A_Dest, LayerAlpha);
+ else
+ A_Blend = A_Dest;
}
+ __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, R_Blend), Zero), Real255));
+ __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, G_Blend), Zero), Real255));
+ __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, B_Blend), Zero), Real255));
+ __m256i A_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, A_Blend), Zero), Real255));
+
__m256i OutputPixel = _mm256_or_si256(
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
@@ -585,8 +717,11 @@ SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m128 OriginX = _mm_set1_ps(T.OriginX);
__m128 OriginY = _mm_set1_ps(T.OriginY);
+ __m128 ClipPrevent = _mm_set1_ps(0.001f);
__m128 One = _mm_set1_ps(1);
+ __m128 Two = _mm_set1_ps(2);
__m128 Zero = _mm_set1_ps(0);
+ __m128 ZeroPointFive = _mm_set1_ps(0.5);
__m128i Zeroi = _mm_set1_epi32(0);
__m128i Onei = _mm_set1_epi32(1);
__m128 Four = _mm_set1_ps(4);
@@ -707,71 +842,191 @@ SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
__m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);
- __m128i R_TexTL = _mm_and_si128( PixelsTL, FF);
- __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF);
- __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF);
- __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF);
-
- __m128i R_TexTR = _mm_and_si128( PixelsTR, FF);
- __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF);
- __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF);
- __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF);
-
- __m128i R_TexBL = _mm_and_si128( PixelsBL, FF);
- __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF);
- __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF);
- __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF);
-
- __m128i R_TexBR = _mm_and_si128( PixelsBR, FF);
- __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF);
- __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF);
- __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF);
-
- __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)),
- _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))),
- _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)),
- _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR))));
- __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)),
- _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))),
- _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)),
- _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR))));
- __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)),
- _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))),
- _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)),
- _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR))));
- __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)),
- _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))),
- _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)),
- _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR))));
-
- A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity));
+ __m128 R_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTL, FF)), Norm255);
+ __m128 G_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF)), Norm255);
+ __m128 B_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF)), Norm255);
+ __m128 A_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF)), Norm255);
+
+ __m128 R_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTR, FF)), Norm255);
+ __m128 G_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF)), Norm255);
+ __m128 B_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF)), Norm255);
+ __m128 A_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF)), Norm255);
+
+ __m128 R_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBL, FF)), Norm255);
+ __m128 G_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF)), Norm255);
+ __m128 B_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF)), Norm255);
+ __m128 A_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF)), Norm255);
+
+ __m128 R_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBR, FF)), Norm255);
+ __m128 G_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF)), Norm255);
+ __m128 B_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF)), Norm255);
+ __m128 A_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF)), Norm255);
+
+ __m128 R_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, R_TexTL),
+ _mm_mul_ps(TexBothYInv, R_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, R_TexBL),
+ _mm_mul_ps(TexBoth, R_TexBR)));
+ __m128 G_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, G_TexTL),
+ _mm_mul_ps(TexBothYInv, G_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, G_TexBL),
+ _mm_mul_ps(TexBoth, G_TexBR)));
+ __m128 B_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, B_TexTL),
+ _mm_mul_ps(TexBothYInv, B_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, B_TexBL),
+ _mm_mul_ps(TexBoth, B_TexBR)));
+ __m128 A_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, A_TexTL),
+ _mm_mul_ps(TexBothYInv, A_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, A_TexBL),
+ _mm_mul_ps(TexBoth, A_TexBR)));
+
__m128i R_Out, G_Out, B_Out, A_Out;
- // Only do alpha blending if a pixel's value doesn't equal 255
- if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255)))
- {
- __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255);
- __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255);
- __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
- __m128i R_Dest = _mm_and_si128( DestPixel, FF);
- __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF);
- __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF);
- __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF);
-
- R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha)));
- G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha)));
- B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha)));
- A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
- }
- else
+ __m128 LayerAlpha = _mm_mul_ps(A_Col, LayerOpacity);
+ __m128 LayerAlphaInv = _mm_sub_ps(One, LayerAlpha);
+
+ __m128 R_Colx2 = _mm_mul_ps(R_Col, Two);
+ __m128 R_ColInv = _mm_sub_ps(One, R_Col);
+
+ __m128 G_Colx2 = _mm_mul_ps(G_Col, Two);
+ __m128 G_ColInv = _mm_sub_ps(One, G_Col);
+
+ __m128 B_Colx2 = _mm_mul_ps(B_Col, Two);
+ __m128 B_ColInv = _mm_sub_ps(One, B_Col);
+
+ __m128 R_Blend = R_Col;
+ __m128 G_Blend = G_Col;
+ __m128 B_Blend = B_Col;
+ __m128 A_Blend = LayerAlpha;
+
+ if (!_mm_movemask_epi8(_mm_cmpeq_ps(LayerAlpha, One)) || T.BlendMode != blend_normal)
{
- R_Out = _mm_cvtps_epi32(R_PixelBlend);
- G_Out = _mm_cvtps_epi32(G_PixelBlend);
- B_Out = _mm_cvtps_epi32(B_PixelBlend);
- A_Out = _mm_cvtps_epi32(A_PixelBlend);
+ __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
+ __m128 R_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( DestPixel, FF)), Norm255);
+ __m128 G_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF)), Norm255);
+ __m128 B_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF)), Norm255);
+ __m128 A_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF)), Norm255);
+
+ switch (T.BlendMode)
+ {
+ case blend_normal:
+ {
+ } break;
+ case blend_multiply:
+ {
+ R_Blend = _mm_mul_ps(R_Dest, R_Col);
+ G_Blend = _mm_mul_ps(G_Dest, G_Col);
+ B_Blend = _mm_mul_ps(B_Dest, B_Col);
+ } break;
+ case blend_colorburn:
+ {
+ // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
+ // color channels, causing black clipping.
+ R_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, R_Dest), _mm_add_ps(R_Col, ClipPrevent)));
+ G_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, G_Dest), _mm_add_ps(G_Col, ClipPrevent)));
+ B_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, B_Dest), _mm_add_ps(B_Col, ClipPrevent)));
+ } break;
+ case blend_linearburn:
+ {
+ R_Blend = _mm_sub_ps(_mm_add_ps(R_Dest, R_Col), One);
+ G_Blend = _mm_sub_ps(_mm_add_ps(G_Dest, G_Col), One);
+ B_Blend = _mm_sub_ps(_mm_add_ps(B_Dest, B_Col), One);
+ } break;
+ case blend_add:
+ {
+ R_Blend = _mm_add_ps(R_Dest, R_Col);
+ G_Blend = _mm_add_ps(G_Dest, G_Col);
+ B_Blend = _mm_add_ps(B_Dest, B_Col);
+ } break;
+ case blend_screen:
+ {
+ R_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv));
+ G_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv));
+ B_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv));
+ } break;
+ case blend_overlay:
+ {
+ __m128 R_Mask = _mm_cmp_ps(R_Dest, ZeroPointFive, 1);
+ __m128 G_Mask = _mm_cmp_ps(G_Dest, ZeroPointFive, 1);
+ __m128 B_Mask = _mm_cmp_ps(B_Dest, ZeroPointFive, 1);
+ __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
+ __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
+ __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
+ __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
+ __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
+ __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_softlight:
+ {
+ // using Pegtop's equation
+ R_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, R_Colx2), _mm_mul_ps(R_Dest, R_Dest)), _mm_mul_ps(R_Colx2, R_Dest));
+ G_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, G_Colx2), _mm_mul_ps(G_Dest, G_Dest)), _mm_mul_ps(G_Colx2, G_Dest));
+ B_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, B_Colx2), _mm_mul_ps(B_Dest, B_Dest)), _mm_mul_ps(B_Colx2, B_Dest));
+ } break;
+ case blend_hardlight:
+ {
+ __m128 R_Mask = _mm_cmp_ps(R_Dest, ZeroPointFive, 13);
+ __m128 G_Mask = _mm_cmp_ps(G_Dest, ZeroPointFive, 13);
+ __m128 B_Mask = _mm_cmp_ps(B_Dest, ZeroPointFive, 13);
+ __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
+ __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
+ __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
+ __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
+ __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
+ __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_subtract:
+ {
+ R_Blend = _mm_sub_ps(R_Dest, R_Col);
+ G_Blend = _mm_sub_ps(G_Dest, G_Col);
+ B_Blend = _mm_sub_ps(B_Dest, B_Col);
+ } break;
+ case blend_divide:
+ {
+ R_Blend = _mm_div_ps(R_Dest, _mm_add_ps(R_Col, ClipPrevent));
+ G_Blend = _mm_div_ps(G_Dest, _mm_add_ps(G_Col, ClipPrevent));
+ B_Blend = _mm_div_ps(B_Dest, _mm_add_ps(B_Col, ClipPrevent));
+ } break;
+ case blend_difference:
+ {
+ __m128 R_Lower = _mm_sub_ps(R_Col, R_Dest);
+ __m128 G_Lower = _mm_sub_ps(G_Col, G_Dest);
+ __m128 B_Lower = _mm_sub_ps(B_Col, B_Dest);
+ __m128 R_Upper = _mm_sub_ps(R_Dest, R_Col);
+ __m128 G_Upper = _mm_sub_ps(G_Dest, G_Col);
+ __m128 B_Upper = _mm_sub_ps(B_Dest, B_Col);
+ __m128 R_Mask = _mm_cmp_ps(R_Lower, Zero, 14);
+ __m128 G_Mask = _mm_cmp_ps(G_Lower, Zero, 14);
+ __m128 B_Mask = _mm_cmp_ps(B_Lower, Zero, 14);
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ }
+
+ R_Blend = _mm_add_ps(_mm_mul_ps(R_Dest, LayerAlphaInv), _mm_mul_ps(R_Blend, LayerAlpha));
+ G_Blend = _mm_add_ps(_mm_mul_ps(G_Dest, LayerAlphaInv), _mm_mul_ps(G_Blend, LayerAlpha));
+ B_Blend = _mm_add_ps(_mm_mul_ps(B_Dest, LayerAlphaInv), _mm_mul_ps(B_Blend, LayerAlpha));
+
+ // Standard behavior in photo apps is for blend modes to
+ // inherit underlying opacity instead of adding to it.
+ if (T.BlendMode == blend_normal)
+ A_Blend = _mm_add_ps(A_Dest, LayerAlpha);
+ else
+ A_Blend = A_Dest;
}
+ R_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, R_Blend), Zero), Reg255));
+ G_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, G_Blend), Zero), Reg255));
+ B_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, B_Blend), Zero), Reg255));
+ A_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, A_Blend), Zero), Reg255));
+
__m128i OutputPixel = _mm_or_si128(
_mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
_mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
@@ -796,9 +1051,7 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 *Row = ((uint8 *)Buffer->OriginalBuffer + Buffer->Pitch*(int16)(LayerBounds.Min.y) );
uint32 Channel = (T.LayerWidth * T.LayerHeight);
- // uint32 pp1 = 2;
- // uint32 pp2 = 3;
- // bool32 real = true;
+ real32 Normalized255 = 1 / 255.0f;
for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
{
@@ -813,6 +1066,7 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+
real32 TexXFull = U * T.LayerWidth;
uint32 TexXInt = (uint32)TexXFull;
real32 TexX = TexXFull - TexXInt;
@@ -868,70 +1122,184 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
PixelToSeek = XLookup + YLookup;
uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
#endif
-
- uint8 TexRA = (PixelA & 0xFF);
- uint8 TexRB = (PixelB & 0xFF);
- uint8 TexRC = (PixelC & 0xFF);
- uint8 TexRD = (PixelD & 0xFF);
-
- uint8 TexGA = ((PixelA >> 8) & 0xFF);
- uint8 TexGB = ((PixelB >> 8) & 0xFF);
- uint8 TexGC = ((PixelC >> 8) & 0xFF);
- uint8 TexGD = ((PixelD >> 8) & 0xFF);
-
- uint8 TexBA = ((PixelA >> 16) & 0xFF);
- uint8 TexBB = ((PixelB >> 16) & 0xFF);
- uint8 TexBC = ((PixelC >> 16) & 0xFF);
- uint8 TexBD = ((PixelD >> 16) & 0xFF);
-
- uint8 TexAA = ((PixelA >> 24) & 0xFF);
- uint8 TexAB = ((PixelB >> 24) & 0xFF);
- uint8 TexAC = ((PixelC >> 24) & 0xFF);
- uint8 TexAD = ((PixelD >> 24) & 0xFF);
-
- real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
- + (TexBothXInv * TexRC) + (TexBoth * TexRD);
- real32 PixelBlendG = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
- + (TexBothXInv * TexGC) + (TexBoth * TexGD);
- real32 PixelBlendB = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
- + (TexBothXInv * TexBC) + (TexBoth * TexBD);
- real32 PixelBlendA = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
- + (TexBothXInv * TexAC) + (TexBoth * TexAD);
- PixelBlendA = PixelBlendA - (PixelBlendA * T.LayerOpacity);
-
- uint8 R = (uint8)PixelBlendR;
- uint8 G = (uint8)PixelBlendG;
- uint8 B = (uint8)PixelBlendB;
- uint8 A = (uint8)PixelBlendA;
-
XLookup = (X >> 2)*16 + (X % 4);
YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
-
- // if (real) {
- // real = false;
- // printf("XLook: %i, YLook: %i\n", XLookup, YLookup);
- // printf("X: %i, Y: %i\n", X, Y);
- // }
PixelToSeek = XLookup + YLookup;
uint32 *Pixel = (uint32 *)((uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel);
- uint8 R1 = (*Pixel >> 0);
- uint8 G1 = (*Pixel >> 8);
- uint8 B1 = (*Pixel >> 16);
- uint8 A1 = (*Pixel >> 24);
-
- if (A != 255) {
- real32 LayerAlpha = (255 - A) / 255.0f;
- R = (R1 * LayerAlpha) - (R * LayerAlpha) + R;
- G = (G1 * LayerAlpha) - (G * LayerAlpha) + G;
- B = (B1 * LayerAlpha) - (B * LayerAlpha) + B;
- A = ClipAdd(A1, A);
+ real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255;
+ real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255;
+ real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255;
+ real32 TexRD = (real32)(PixelD & 0xFF) * Normalized255;
+
+ real32 TexGA = (real32)((PixelA >> 8) & 0xFF) * Normalized255;
+ real32 TexGB = (real32)((PixelB >> 8) & 0xFF) * Normalized255;
+ real32 TexGC = (real32)((PixelC >> 8) & 0xFF) * Normalized255;
+ real32 TexGD = (real32)((PixelD >> 8) & 0xFF) * Normalized255;
+
+ real32 TexBA = (real32)((PixelA >> 16) & 0xFF) * Normalized255;
+ real32 TexBB = (real32)((PixelB >> 16) & 0xFF) * Normalized255;
+ real32 TexBC = (real32)((PixelC >> 16) & 0xFF) * Normalized255;
+ real32 TexBD = (real32)((PixelD >> 16) & 0xFF) * Normalized255;
+
+ real32 TexAA = (real32)((PixelA >> 24) & 0xFF) * Normalized255;
+ real32 TexAB = (real32)((PixelB >> 24) & 0xFF) * Normalized255;
+ real32 TexAC = (real32)((PixelC >> 24) & 0xFF) * Normalized255;
+ real32 TexAD = (real32)((PixelD >> 24) & 0xFF) * Normalized255;
+
+ real32 R_Col = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
+ + (TexBothXInv * TexRC) + (TexBoth * TexRD);
+ real32 G_Col = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
+ + (TexBothXInv * TexGC) + (TexBoth * TexGD);
+ real32 B_Col = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
+ + (TexBothXInv * TexBC) + (TexBoth * TexBD);
+ real32 A_Col = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
+ + (TexBothXInv * TexAC) + (TexBoth * TexAD);
+
+ real32 LayerAlpha = A_Col * T.LayerOpacity;
+
+ real32 R_Blend = R_Col;
+ real32 G_Blend = G_Col;
+ real32 B_Blend = B_Col;
+ real32 A_Blend = A_Col;
+
+ if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {
+
+ real32 R_Dest = (real32)((*Pixel >> 0) & 0xFF) * Normalized255;
+ real32 G_Dest = (real32)((*Pixel >> 8) & 0xFF) * Normalized255;
+ real32 B_Dest = (real32)((*Pixel >> 16) & 0xFF) * Normalized255;
+ real32 A_Dest = (real32)((*Pixel >> 24) & 0xFF) * Normalized255;
+
+ switch (T.BlendMode)
+ {
+ case blend_normal:
+ {
+ } break;
+ case blend_multiply:
+ {
+ R_Blend = R_Dest * R_Col;
+ G_Blend = G_Dest * G_Col;
+ B_Blend = B_Dest * B_Col;
+ } break;
+ case blend_colorburn:
+ {
+ // NOTE(fox): Padding to prevent actual crashing from zero division
+ R_Blend = 1.0f - ((1.0f - R_Dest) / (R_Col + 0.001f));
+ G_Blend = 1.0f - ((1.0f - G_Dest) / (G_Col + 0.001f));
+ B_Blend = 1.0f - ((1.0f - B_Dest) / (B_Col + 0.001f));
+ } break;
+ case blend_linearburn:
+ {
+ R_Blend = (R_Dest + R_Col) - 1.0f;
+ G_Blend = (G_Dest + G_Col) - 1.0f;
+ B_Blend = (B_Dest + B_Col) - 1.0f;
+ } break;
+ case blend_add:
+ {
+ R_Blend = R_Dest + R_Col;
+ G_Blend = G_Dest + G_Col;
+ B_Blend = B_Dest + B_Col;
+ } break;
+ case blend_screen:
+ {
+ R_Blend = 1.0f - ((1.0f - R_Dest) * (1.0f - R_Col));
+ G_Blend = 1.0f - ((1.0f - G_Dest) * (1.0f - G_Col));
+ B_Blend = 1.0f - ((1.0f - B_Dest) * (1.0f - B_Col));
+ } break;
+ case blend_overlay:
+ {
+ if (R_Dest < 0.5) {
+ R_Blend = 2.0f * R_Dest * R_Col;
+ } else {
+ R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));
+ }
+ if (G_Dest < 0.5) {
+ G_Blend = 2.0f * G_Dest * G_Col;
+ } else {
+ G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));
+ }
+ if (B_Dest < 0.5) {
+ B_Blend = 2.0f * B_Dest * B_Col;
+ } else {
+ B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));
+ }
+ } break;
+ case blend_softlight:
+ {
+ // using Pegtop's equation
+ R_Blend = ((1.0f - R_Col * 2) * R_Dest * R_Dest) + (R_Col * 2 * R_Dest);
+ G_Blend = ((1.0f - G_Col * 2) * G_Dest * G_Dest) + (G_Col * 2 * G_Dest);
+ B_Blend = ((1.0f - B_Col * 2) * B_Dest * B_Dest) + (B_Col * 2 * B_Dest);
+ } break;
+ case blend_hardlight:
+ {
+ if (R_Dest > 0.5) {
+ R_Blend = 2.0f * R_Dest * R_Col;
+ } else {
+ R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));
+ }
+ if (G_Dest > 0.5) {
+ G_Blend = 2.0f * G_Dest * G_Col;
+ } else {
+ G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));
+ }
+ if (B_Dest > 0.5) {
+ B_Blend = 2.0f * B_Dest * B_Col;
+ } else {
+ B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));
+ }
+ } break;
+ case blend_subtract:
+ {
+ R_Blend = R_Dest - R_Col;
+ G_Blend = G_Dest - G_Col;
+ B_Blend = B_Dest - B_Col;
+ } break;
+ case blend_divide:
+ {
+ R_Blend = R_Dest / (R_Col + 0.001f);
+ G_Blend = G_Dest / (G_Col + 0.001f);
+ B_Blend = B_Dest / (B_Col + 0.001f);
+ } break;
+ case blend_difference:
+ {
+ if (R_Col - R_Dest > 0) {
+ R_Blend = R_Col - R_Dest;
+ } else {
+ R_Blend = R_Dest - R_Col;
+ }
+ if (G_Col - G_Dest > 0) {
+ G_Blend = G_Col - G_Dest;
+ } else {
+ G_Blend = G_Dest - G_Col;
+ }
+ if (B_Col - B_Dest > 0) {
+ B_Blend = B_Col - B_Dest;
+ } else {
+ B_Blend = B_Dest - B_Col;
+ }
+ } break;
+ }
+
+ R_Blend = (R_Dest * (1.0f - LayerAlpha)) + (R_Blend * LayerAlpha);
+ G_Blend = (G_Dest * (1.0f - LayerAlpha)) + (G_Blend * LayerAlpha);
+ B_Blend = (B_Dest * (1.0f - LayerAlpha)) + (B_Blend * LayerAlpha);
+
+ if (T.BlendMode == blend_normal)
+ A_Blend = A_Dest + LayerAlpha;
+ else
+ A_Blend = A_Dest;
}
- *Pixel = ((A << 24) |
- (B << 16) |
- (G << 8) |
- (R << 0));
+ uint8 R_Out = (uint8)(Normalize(R_Blend) * 255.0f);
+ uint8 G_Out = (uint8)(Normalize(G_Blend) * 255.0f);
+ uint8 B_Out = (uint8)(Normalize(B_Blend) * 255.0f);
+ uint8 A_Out = (uint8)(Normalize(A_Blend) * 255.0f);
+
+ *Pixel = ((A_Out << 24) |
+ (B_Out << 16) |
+ (G_Out << 8) |
+ (R_Out << 0));
}
}
}