From 7d3dcee5b370c05065eb409ad5c21d0bc64790b1 Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Thu, 28 Jul 2022 17:28:13 -0400 Subject: blend modes implemented in renderers --- prenderer.cpp | 734 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 551 insertions(+), 183 deletions(-) (limited to 'prenderer.cpp') diff --git a/prenderer.cpp b/prenderer.cpp index 72c2893..5df28f4 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -81,7 +81,8 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer) TransformInfo.LayerHeight = (real32)Source->Raster.Height; TransformInfo.FullLayerWidth = Source->Raster.FullWidth; TransformInfo.FullLayerHeight = Source->Raster.FullHeight; - TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f; + TransformInfo.LayerOpacity = Layer->opacity.CurrentValue.f; + TransformInfo.BlendMode =Layer->BlendMode; TransformInfo.OriginX = Origin.x; TransformInfo.OriginY = Origin.y; TransformInfo.BufferPitch = Buffer->Pitch; @@ -152,6 +153,15 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S for (int x = 0; x < 4; x++) { // if (x == y) { rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y}; + // The render regions always have to be aligned to the top left of + // a 4x4 chunk (at least for AVX2) and cannot exceed the bounds of + // the comp. + // It seems we don't need any special math to guarantee this aside + // from dividing by 4 and modulating. + RenderRegion.Min.x -= RenderRegion.Min.x % 4; + RenderRegion.Min.y -= RenderRegion.Min.y % 4; + RenderRegion.Max.x -= RenderRegion.Max.x % 4; + RenderRegion.Max.y -= RenderRegion.Max.y % 4; if (RenderRegion.Max.x > CompBuffer->Width) RenderRegion.Max.x = CompBuffer->Width; if (RenderRegion.Max.y > CompBuffer->Height) @@ -379,8 +389,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 OriginX = _mm256_set1_ps(T.OriginX); __m256 OriginY = _mm256_set1_ps(T.OriginY); + __m256 ClipPrevent = _mm256_set1_ps(0.001f); __m256 One = _mm256_set1_ps(1); + __m256 Two = _mm256_set1_ps(2); __m256 Zero = _mm256_set1_ps(0); + __m256 ZeroPointFive = _mm256_set1_ps(0.5); __m256i Zeroi = _mm256_set1_epi32(0); __m256i Onei = _mm256_set1_epi32(1); __m256 Four = _mm256_set1_ps(4); @@ -389,7 +402,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i BottomTwoBits = _mm256_set1_epi32(0x03); __m256i Fouri = _mm256_set1_epi32(4); __m256i Sixteeni = _mm256_set1_epi32(16); - __m256 Reg255 = _mm256_set1_ps(255.0f); + __m256 Real255 = _mm256_set1_ps(255.0f); __m256i Int255 = _mm256_set1_epi32(255); __m256 Norm255 = _mm256_set1_ps(1/255.0f); // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0); @@ -450,7 +463,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei)); - // NOTE(fox): The comparison is for when we're on the last pixel. + // NOTE(fox): The comparison is for when we're on the last pixel of the texel. __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt)); @@ -481,71 +494,190 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4); __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4); - __m256i R_TexTL = _mm256_and_si256( PixelsTL, FF); - __m256i G_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF); - __m256i B_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF); - __m256i A_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF); - - __m256i R_TexTR = _mm256_and_si256( PixelsTR, FF); - __m256i G_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF); - __m256i B_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF); - __m256i A_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF); - - __m256i R_TexBL = _mm256_and_si256( PixelsBL, FF); - __m256i G_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF); - __m256i B_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF); - __m256i A_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF); - - __m256i R_TexBR = _mm256_and_si256( PixelsBR, FF); - __m256i G_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF); - __m256i B_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF); - __m256i A_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF); - - __m256 R_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(R_TexTL)), - _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(R_TexTR))), - _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(R_TexBL)), - _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(R_TexBR)))); - __m256 G_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(G_TexTL)), - _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(G_TexTR))), - _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(G_TexBL)), - _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(G_TexBR)))); - __m256 B_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(B_TexTL)), - _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(B_TexTR))), - _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(B_TexBL)), - _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(B_TexBR)))); - __m256 A_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(A_TexTL)), - _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(A_TexTR))), - _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(A_TexBL)), - _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(A_TexBR)))); - - A_PixelBlend = _mm256_sub_ps(A_PixelBlend, _mm256_mul_ps(A_PixelBlend, LayerOpacity)); - - __m256i R_Out, G_Out, B_Out, A_Out; - // Only do alpha blending if a pixel's value doesn't equal 255 - if (_mm256_movemask_epi8(_mm256_sub_epi32(_mm256_cvtps_epi32(A_PixelBlend), Int255))) + __m256 R_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTL, FF)), Norm255); + __m256 G_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF)), Norm255); + __m256 B_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF)), Norm255); + __m256 A_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF)), Norm255); + + __m256 R_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTR, FF)), Norm255); + __m256 G_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF)), Norm255); + __m256 B_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF)), Norm255); + __m256 A_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF)), Norm255); + + __m256 R_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBL, FF)), Norm255); + __m256 G_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF)), Norm255); + __m256 B_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF)), Norm255); + __m256 A_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF)), Norm255); + + __m256 R_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBR, FF)), Norm255); + __m256 G_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF)), Norm255); + __m256 B_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF)), Norm255); + __m256 A_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF)), Norm255); + + __m256 R_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, R_TexTL), + _mm256_mul_ps(TexBothYInv, R_TexTR)), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, R_TexBL), + _mm256_mul_ps(TexBoth, R_TexBR))); + __m256 G_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, G_TexTL), + _mm256_mul_ps(TexBothYInv, G_TexTR)), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, G_TexBL), + _mm256_mul_ps(TexBoth, G_TexBR))); + __m256 B_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, B_TexTL), + _mm256_mul_ps(TexBothYInv, B_TexTR)), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, B_TexBL), + _mm256_mul_ps(TexBoth, B_TexBR))); + __m256 A_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, A_TexTL), + _mm256_mul_ps(TexBothYInv, A_TexTR)), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL), + _mm256_mul_ps(TexBoth, A_TexBR))); + + __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity); + __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha); + + // Hoisted out of some blend modes; maybe it'd be better to just keep them in there. + __m256 R_Colx2 = _mm256_mul_ps(R_Col, Two); + __m256 R_ColInv = _mm256_sub_ps(One, R_Col); + + __m256 G_Colx2 = _mm256_mul_ps(G_Col, Two); + __m256 G_ColInv = _mm256_sub_ps(One, G_Col); + + __m256 B_Colx2 = _mm256_mul_ps(B_Col, Two); + __m256 B_ColInv = _mm256_sub_ps(One, B_Col); + + __m256 R_Blend = R_Col; + __m256 G_Blend = G_Col; + __m256 B_Blend = B_Col; + __m256 A_Blend = LayerAlpha; + + // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it). + if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal) { - __m256 LayerAlpha = _mm256_mul_ps(A_PixelBlend, Norm255); - __m256 LayerAlphaInv = _mm256_mul_ps(_mm256_sub_ps(Reg255, A_PixelBlend), Norm255); - __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel); - __m256i R_Dest = _mm256_and_si256( DestPixel, FF); - __m256i G_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF); - __m256i B_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF); - __m256i A_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF); - - R_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm256_mul_ps(R_PixelBlend, LayerAlpha))); - G_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm256_mul_ps(G_PixelBlend, LayerAlpha))); - B_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm256_mul_ps(B_PixelBlend, LayerAlpha))); - A_Out = _mm256_cvtps_epi32(_mm256_min_ps(_mm256_add_ps(_mm256_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); - } - else - { - R_Out = _mm256_cvtps_epi32(R_PixelBlend); - G_Out = _mm256_cvtps_epi32(G_PixelBlend); - B_Out = _mm256_cvtps_epi32(B_PixelBlend); - A_Out = _mm256_cvtps_epi32(A_PixelBlend); + __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255); + __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF)), Norm255); + __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Norm255); + __m256 A_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF)), Norm255); + + switch (T.BlendMode) + { + case blend_normal: + { + } break; + case blend_multiply: + { + R_Blend = _mm256_mul_ps(R_Dest, R_Col); + G_Blend = _mm256_mul_ps(G_Dest, G_Col); + B_Blend = _mm256_mul_ps(B_Dest, B_Col); + } break; + case blend_colorburn: + { + // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the + // color channels, causing black clipping. + R_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), _mm256_add_ps(R_Col, ClipPrevent))); + G_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), _mm256_add_ps(G_Col, ClipPrevent))); + B_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), _mm256_add_ps(B_Col, ClipPrevent))); + } break; + case blend_linearburn: + { + R_Blend = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One); + G_Blend = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One); + B_Blend = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One); + } break; + case blend_add: + { + R_Blend = _mm256_add_ps(R_Dest, R_Col); + G_Blend = _mm256_add_ps(G_Dest, G_Col); + B_Blend = _mm256_add_ps(B_Dest, B_Col); + } break; + case blend_screen: + { + R_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)); + G_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)); + B_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)); + } break; + case blend_overlay: + { + __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 1); + __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 1); + __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 1); + __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col)); + __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col)); + __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col)); + __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv))); + __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv))); + __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_softlight: + { + // using Pegtop's equation + R_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest)); + G_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest)); + B_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest)); + } break; + case blend_hardlight: + { + __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 13); + __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 13); + __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 13); + __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col)); + __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col)); + __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col)); + __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv))); + __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv))); + __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_subtract: + { + R_Blend = _mm256_sub_ps(R_Dest, R_Col); + G_Blend = _mm256_sub_ps(G_Dest, G_Col); + B_Blend = _mm256_sub_ps(B_Dest, B_Col); + } break; + case blend_divide: + { + R_Blend = _mm256_div_ps(R_Dest, _mm256_add_ps(R_Col, ClipPrevent)); + G_Blend = _mm256_div_ps(G_Dest, _mm256_add_ps(G_Col, ClipPrevent)); + B_Blend = _mm256_div_ps(B_Dest, _mm256_add_ps(B_Col, ClipPrevent)); + } break; + case blend_difference: + { + __m256 R_Lower = _mm256_sub_ps(R_Col, R_Dest); + __m256 G_Lower = _mm256_sub_ps(G_Col, G_Dest); + __m256 B_Lower = _mm256_sub_ps(B_Col, B_Dest); + __m256 R_Upper = _mm256_sub_ps(R_Dest, R_Col); + __m256 G_Upper = _mm256_sub_ps(G_Dest, G_Col); + __m256 B_Upper = _mm256_sub_ps(B_Dest, B_Col); + __m256 R_Mask = _mm256_cmp_ps(R_Lower, Zero, 14); + __m256 G_Mask = _mm256_cmp_ps(G_Lower, Zero, 14); + __m256 B_Mask = _mm256_cmp_ps(B_Lower, Zero, 14); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + } + + R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, LayerAlphaInv), _mm256_mul_ps(R_Blend, LayerAlpha)); + G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, LayerAlphaInv), _mm256_mul_ps(G_Blend, LayerAlpha)); + B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, LayerAlphaInv), _mm256_mul_ps(B_Blend, LayerAlpha)); + + // Standard behavior in photo apps is for blend modes to + // inherit underlying opacity instead of adding to it. + if (T.BlendMode == blend_normal) + A_Blend = _mm256_add_ps(A_Dest, LayerAlpha); + else + A_Blend = A_Dest; } + __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, R_Blend), Zero), Real255)); + __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, G_Blend), Zero), Real255)); + __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, B_Blend), Zero), Real255)); + __m256i A_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, A_Blend), Zero), Real255)); + __m256i OutputPixel = _mm256_or_si256( _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); @@ -585,8 +717,11 @@ SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m128 OriginX = _mm_set1_ps(T.OriginX); __m128 OriginY = _mm_set1_ps(T.OriginY); + __m128 ClipPrevent = _mm_set1_ps(0.001f); __m128 One = _mm_set1_ps(1); + __m128 Two = _mm_set1_ps(2); __m128 Zero = _mm_set1_ps(0); + __m128 ZeroPointFive = _mm_set1_ps(0.5); __m128i Zeroi = _mm_set1_epi32(0); __m128i Onei = _mm_set1_epi32(1); __m128 Four = _mm_set1_ps(4); @@ -707,71 +842,191 @@ SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3); __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3); - __m128i R_TexTL = _mm_and_si128( PixelsTL, FF); - __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF); - __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF); - __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF); - - __m128i R_TexTR = _mm_and_si128( PixelsTR, FF); - __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF); - __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF); - __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF); - - __m128i R_TexBL = _mm_and_si128( PixelsBL, FF); - __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF); - __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF); - __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF); - - __m128i R_TexBR = _mm_and_si128( PixelsBR, FF); - __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF); - __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF); - __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF); - - __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)), - _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))), - _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)), - _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR)))); - __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)), - _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))), - _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)), - _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR)))); - __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)), - _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))), - _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)), - _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR)))); - __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)), - _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))), - _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)), - _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR)))); - - A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity)); + __m128 R_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTL, FF)), Norm255); + __m128 G_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF)), Norm255); + __m128 B_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF)), Norm255); + __m128 A_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF)), Norm255); + + __m128 R_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTR, FF)), Norm255); + __m128 G_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF)), Norm255); + __m128 B_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF)), Norm255); + __m128 A_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF)), Norm255); + + __m128 R_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBL, FF)), Norm255); + __m128 G_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF)), Norm255); + __m128 B_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF)), Norm255); + __m128 A_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF)), Norm255); + + __m128 R_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBR, FF)), Norm255); + __m128 G_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF)), Norm255); + __m128 B_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF)), Norm255); + __m128 A_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF)), Norm255); + + __m128 R_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, R_TexTL), + _mm_mul_ps(TexBothYInv, R_TexTR)), + _mm_add_ps(_mm_mul_ps(TexBothXInv, R_TexBL), + _mm_mul_ps(TexBoth, R_TexBR))); + __m128 G_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, G_TexTL), + _mm_mul_ps(TexBothYInv, G_TexTR)), + _mm_add_ps(_mm_mul_ps(TexBothXInv, G_TexBL), + _mm_mul_ps(TexBoth, G_TexBR))); + __m128 B_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, B_TexTL), + _mm_mul_ps(TexBothYInv, B_TexTR)), + _mm_add_ps(_mm_mul_ps(TexBothXInv, B_TexBL), + _mm_mul_ps(TexBoth, B_TexBR))); + __m128 A_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, A_TexTL), + _mm_mul_ps(TexBothYInv, A_TexTR)), + _mm_add_ps(_mm_mul_ps(TexBothXInv, A_TexBL), + _mm_mul_ps(TexBoth, A_TexBR))); + __m128i R_Out, G_Out, B_Out, A_Out; - // Only do alpha blending if a pixel's value doesn't equal 255 - if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255))) - { - __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255); - __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255); - __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel); - __m128i R_Dest = _mm_and_si128( DestPixel, FF); - __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF); - __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF); - __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF); - - R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha))); - G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha))); - B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha))); - A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); - } - else + __m128 LayerAlpha = _mm_mul_ps(A_Col, LayerOpacity); + __m128 LayerAlphaInv = _mm_sub_ps(One, LayerAlpha); + + __m128 R_Colx2 = _mm_mul_ps(R_Col, Two); + __m128 R_ColInv = _mm_sub_ps(One, R_Col); + + __m128 G_Colx2 = _mm_mul_ps(G_Col, Two); + __m128 G_ColInv = _mm_sub_ps(One, G_Col); + + __m128 B_Colx2 = _mm_mul_ps(B_Col, Two); + __m128 B_ColInv = _mm_sub_ps(One, B_Col); + + __m128 R_Blend = R_Col; + __m128 G_Blend = G_Col; + __m128 B_Blend = B_Col; + __m128 A_Blend = LayerAlpha; + + if (!_mm_movemask_epi8(_mm_cmpeq_ps(LayerAlpha, One)) || T.BlendMode != blend_normal) { - R_Out = _mm_cvtps_epi32(R_PixelBlend); - G_Out = _mm_cvtps_epi32(G_PixelBlend); - B_Out = _mm_cvtps_epi32(B_PixelBlend); - A_Out = _mm_cvtps_epi32(A_PixelBlend); + __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel); + __m128 R_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( DestPixel, FF)), Norm255); + __m128 G_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF)), Norm255); + __m128 B_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF)), Norm255); + __m128 A_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF)), Norm255); + + switch (T.BlendMode) + { + case blend_normal: + { + } break; + case blend_multiply: + { + R_Blend = _mm_mul_ps(R_Dest, R_Col); + G_Blend = _mm_mul_ps(G_Dest, G_Col); + B_Blend = _mm_mul_ps(B_Dest, B_Col); + } break; + case blend_colorburn: + { + // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the + // color channels, causing black clipping. + R_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, R_Dest), _mm_add_ps(R_Col, ClipPrevent))); + G_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, G_Dest), _mm_add_ps(G_Col, ClipPrevent))); + B_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, B_Dest), _mm_add_ps(B_Col, ClipPrevent))); + } break; + case blend_linearburn: + { + R_Blend = _mm_sub_ps(_mm_add_ps(R_Dest, R_Col), One); + G_Blend = _mm_sub_ps(_mm_add_ps(G_Dest, G_Col), One); + B_Blend = _mm_sub_ps(_mm_add_ps(B_Dest, B_Col), One); + } break; + case blend_add: + { + R_Blend = _mm_add_ps(R_Dest, R_Col); + G_Blend = _mm_add_ps(G_Dest, G_Col); + B_Blend = _mm_add_ps(B_Dest, B_Col); + } break; + case blend_screen: + { + R_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)); + G_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)); + B_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)); + } break; + case blend_overlay: + { + __m128 R_Mask = _mm_cmp_ps(R_Dest, ZeroPointFive, 1); + __m128 G_Mask = _mm_cmp_ps(G_Dest, ZeroPointFive, 1); + __m128 B_Mask = _mm_cmp_ps(B_Dest, ZeroPointFive, 1); + __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col)); + __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col)); + __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col)); + __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv))); + __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv))); + __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_softlight: + { + // using Pegtop's equation + R_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, R_Colx2), _mm_mul_ps(R_Dest, R_Dest)), _mm_mul_ps(R_Colx2, R_Dest)); + G_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, G_Colx2), _mm_mul_ps(G_Dest, G_Dest)), _mm_mul_ps(G_Colx2, G_Dest)); + B_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, B_Colx2), _mm_mul_ps(B_Dest, B_Dest)), _mm_mul_ps(B_Colx2, B_Dest)); + } break; + case blend_hardlight: + { + __m128 R_Mask = _mm_cmp_ps(R_Dest, ZeroPointFive, 13); + __m128 G_Mask = _mm_cmp_ps(G_Dest, ZeroPointFive, 13); + __m128 B_Mask = _mm_cmp_ps(B_Dest, ZeroPointFive, 13); + __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col)); + __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col)); + __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col)); + __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv))); + __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv))); + __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_subtract: + { + R_Blend = _mm_sub_ps(R_Dest, R_Col); + G_Blend = _mm_sub_ps(G_Dest, G_Col); + B_Blend = _mm_sub_ps(B_Dest, B_Col); + } break; + case blend_divide: + { + R_Blend = _mm_div_ps(R_Dest, _mm_add_ps(R_Col, ClipPrevent)); + G_Blend = _mm_div_ps(G_Dest, _mm_add_ps(G_Col, ClipPrevent)); + B_Blend = _mm_div_ps(B_Dest, _mm_add_ps(B_Col, ClipPrevent)); + } break; + case blend_difference: + { + __m128 R_Lower = _mm_sub_ps(R_Col, R_Dest); + __m128 G_Lower = _mm_sub_ps(G_Col, G_Dest); + __m128 B_Lower = _mm_sub_ps(B_Col, B_Dest); + __m128 R_Upper = _mm_sub_ps(R_Dest, R_Col); + __m128 G_Upper = _mm_sub_ps(G_Dest, G_Col); + __m128 B_Upper = _mm_sub_ps(B_Dest, B_Col); + __m128 R_Mask = _mm_cmp_ps(R_Lower, Zero, 14); + __m128 G_Mask = _mm_cmp_ps(G_Lower, Zero, 14); + __m128 B_Mask = _mm_cmp_ps(B_Lower, Zero, 14); + R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + } + + R_Blend = _mm_add_ps(_mm_mul_ps(R_Dest, LayerAlphaInv), _mm_mul_ps(R_Blend, LayerAlpha)); + G_Blend = _mm_add_ps(_mm_mul_ps(G_Dest, LayerAlphaInv), _mm_mul_ps(G_Blend, LayerAlpha)); + B_Blend = _mm_add_ps(_mm_mul_ps(B_Dest, LayerAlphaInv), _mm_mul_ps(B_Blend, LayerAlpha)); + + // Standard behavior in photo apps is for blend modes to + // inherit underlying opacity instead of adding to it. + if (T.BlendMode == blend_normal) + A_Blend = _mm_add_ps(A_Dest, LayerAlpha); + else + A_Blend = A_Dest; } + R_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, R_Blend), Zero), Reg255)); + G_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, G_Blend), Zero), Reg255)); + B_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, B_Blend), Zero), Reg255)); + A_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, A_Blend), Zero), Reg255)); + __m128i OutputPixel = _mm_or_si128( _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)), _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24))); @@ -796,9 +1051,7 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 *Row = ((uint8 *)Buffer->OriginalBuffer + Buffer->Pitch*(int16)(LayerBounds.Min.y) ); uint32 Channel = (T.LayerWidth * T.LayerHeight); - // uint32 pp1 = 2; - // uint32 pp2 = 3; - // bool32 real = true; + real32 Normalized255 = 1 / 255.0f; for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) { @@ -813,6 +1066,7 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY); if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { + real32 TexXFull = U * T.LayerWidth; uint32 TexXInt = (uint32)TexXFull; real32 TexX = TexXFull - TexXInt; @@ -868,70 +1122,184 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg PixelToSeek = XLookup + YLookup; uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); #endif - - uint8 TexRA = (PixelA & 0xFF); - uint8 TexRB = (PixelB & 0xFF); - uint8 TexRC = (PixelC & 0xFF); - uint8 TexRD = (PixelD & 0xFF); - - uint8 TexGA = ((PixelA >> 8) & 0xFF); - uint8 TexGB = ((PixelB >> 8) & 0xFF); - uint8 TexGC = ((PixelC >> 8) & 0xFF); - uint8 TexGD = ((PixelD >> 8) & 0xFF); - - uint8 TexBA = ((PixelA >> 16) & 0xFF); - uint8 TexBB = ((PixelB >> 16) & 0xFF); - uint8 TexBC = ((PixelC >> 16) & 0xFF); - uint8 TexBD = ((PixelD >> 16) & 0xFF); - - uint8 TexAA = ((PixelA >> 24) & 0xFF); - uint8 TexAB = ((PixelB >> 24) & 0xFF); - uint8 TexAC = ((PixelC >> 24) & 0xFF); - uint8 TexAD = ((PixelD >> 24) & 0xFF); - - real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB) - + (TexBothXInv * TexRC) + (TexBoth * TexRD); - real32 PixelBlendG = (TexBothInv * TexGA) + (TexBothYInv * TexGB) - + (TexBothXInv * TexGC) + (TexBoth * TexGD); - real32 PixelBlendB = (TexBothInv * TexBA) + (TexBothYInv * TexBB) - + (TexBothXInv * TexBC) + (TexBoth * TexBD); - real32 PixelBlendA = (TexBothInv * TexAA) + (TexBothYInv * TexAB) - + (TexBothXInv * TexAC) + (TexBoth * TexAD); - PixelBlendA = PixelBlendA - (PixelBlendA * T.LayerOpacity); - - uint8 R = (uint8)PixelBlendR; - uint8 G = (uint8)PixelBlendG; - uint8 B = (uint8)PixelBlendB; - uint8 A = (uint8)PixelBlendA; - XLookup = (X >> 2)*16 + (X % 4); YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; - - // if (real) { - // real = false; - // printf("XLook: %i, YLook: %i\n", XLookup, YLookup); - // printf("X: %i, Y: %i\n", X, Y); - // } PixelToSeek = XLookup + YLookup; uint32 *Pixel = (uint32 *)((uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel); - uint8 R1 = (*Pixel >> 0); - uint8 G1 = (*Pixel >> 8); - uint8 B1 = (*Pixel >> 16); - uint8 A1 = (*Pixel >> 24); - - if (A != 255) { - real32 LayerAlpha = (255 - A) / 255.0f; - R = (R1 * LayerAlpha) - (R * LayerAlpha) + R; - G = (G1 * LayerAlpha) - (G * LayerAlpha) + G; - B = (B1 * LayerAlpha) - (B * LayerAlpha) + B; - A = ClipAdd(A1, A); + real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255; + real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255; + real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255; + real32 TexRD = (real32)(PixelD & 0xFF) * Normalized255; + + real32 TexGA = (real32)((PixelA >> 8) & 0xFF) * Normalized255; + real32 TexGB = (real32)((PixelB >> 8) & 0xFF) * Normalized255; + real32 TexGC = (real32)((PixelC >> 8) & 0xFF) * Normalized255; + real32 TexGD = (real32)((PixelD >> 8) & 0xFF) * Normalized255; + + real32 TexBA = (real32)((PixelA >> 16) & 0xFF) * Normalized255; + real32 TexBB = (real32)((PixelB >> 16) & 0xFF) * Normalized255; + real32 TexBC = (real32)((PixelC >> 16) & 0xFF) * Normalized255; + real32 TexBD = (real32)((PixelD >> 16) & 0xFF) * Normalized255; + + real32 TexAA = (real32)((PixelA >> 24) & 0xFF) * Normalized255; + real32 TexAB = (real32)((PixelB >> 24) & 0xFF) * Normalized255; + real32 TexAC = (real32)((PixelC >> 24) & 0xFF) * Normalized255; + real32 TexAD = (real32)((PixelD >> 24) & 0xFF) * Normalized255; + + real32 R_Col = (TexBothInv * TexRA) + (TexBothYInv * TexRB) + + (TexBothXInv * TexRC) + (TexBoth * TexRD); + real32 G_Col = (TexBothInv * TexGA) + (TexBothYInv * TexGB) + + (TexBothXInv * TexGC) + (TexBoth * TexGD); + real32 B_Col = (TexBothInv * TexBA) + (TexBothYInv * TexBB) + + (TexBothXInv * TexBC) + (TexBoth * TexBD); + real32 A_Col = (TexBothInv * TexAA) + (TexBothYInv * TexAB) + + (TexBothXInv * TexAC) + (TexBoth * TexAD); + + real32 LayerAlpha = A_Col * T.LayerOpacity; + + real32 R_Blend = R_Col; + real32 G_Blend = G_Col; + real32 B_Blend = B_Col; + real32 A_Blend = A_Col; + + if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) { + + real32 R_Dest = (real32)((*Pixel >> 0) & 0xFF) * Normalized255; + real32 G_Dest = (real32)((*Pixel >> 8) & 0xFF) * Normalized255; + real32 B_Dest = (real32)((*Pixel >> 16) & 0xFF) * Normalized255; + real32 A_Dest = (real32)((*Pixel >> 24) & 0xFF) * Normalized255; + + switch (T.BlendMode) + { + case blend_normal: + { + } break; + case blend_multiply: + { + R_Blend = R_Dest * R_Col; + G_Blend = G_Dest * G_Col; + B_Blend = B_Dest * B_Col; + } break; + case blend_colorburn: + { + // NOTE(fox): Padding to prevent actual crashing from zero division + R_Blend = 1.0f - ((1.0f - R_Dest) / (R_Col + 0.001f)); + G_Blend = 1.0f - ((1.0f - G_Dest) / (G_Col + 0.001f)); + B_Blend = 1.0f - ((1.0f - B_Dest) / (B_Col + 0.001f)); + } break; + case blend_linearburn: + { + R_Blend = (R_Dest + R_Col) - 1.0f; + G_Blend = (G_Dest + G_Col) - 1.0f; + B_Blend = (B_Dest + B_Col) - 1.0f; + } break; + case blend_add: + { + R_Blend = R_Dest + R_Col; + G_Blend = G_Dest + G_Col; + B_Blend = B_Dest + B_Col; + } break; + case blend_screen: + { + R_Blend = 1.0f - ((1.0f - R_Dest) * (1.0f - R_Col)); + G_Blend = 1.0f - ((1.0f - G_Dest) * (1.0f - G_Col)); + B_Blend = 1.0f - ((1.0f - B_Dest) * (1.0f - B_Col)); + } break; + case blend_overlay: + { + if (R_Dest < 0.5) { + R_Blend = 2.0f * R_Dest * R_Col; + } else { + R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col)); + } + if (G_Dest < 0.5) { + G_Blend = 2.0f * G_Dest * G_Col; + } else { + G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col)); + } + if (B_Dest < 0.5) { + B_Blend = 2.0f * B_Dest * B_Col; + } else { + B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col)); + } + } break; + case blend_softlight: + { + // using Pegtop's equation + R_Blend = ((1.0f - R_Col * 2) * R_Dest * R_Dest) + (R_Col * 2 * R_Dest); + G_Blend = ((1.0f - G_Col * 2) * G_Dest * G_Dest) + (G_Col * 2 * G_Dest); + B_Blend = ((1.0f - B_Col * 2) * B_Dest * B_Dest) + (B_Col * 2 * B_Dest); + } break; + case blend_hardlight: + { + if (R_Dest > 0.5) { + R_Blend = 2.0f * R_Dest * R_Col; + } else { + R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col)); + } + if (G_Dest > 0.5) { + G_Blend = 2.0f * G_Dest * G_Col; + } else { + G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col)); + } + if (B_Dest > 0.5) { + B_Blend = 2.0f * B_Dest * B_Col; + } else { + B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col)); + } + } break; + case blend_subtract: + { + R_Blend = R_Dest - R_Col; + G_Blend = G_Dest - G_Col; + B_Blend = B_Dest - B_Col; + } break; + case blend_divide: + { + R_Blend = R_Dest / (R_Col + 0.001f); + G_Blend = G_Dest / (G_Col + 0.001f); + B_Blend = B_Dest / (B_Col + 0.001f); + } break; + case blend_difference: + { + if (R_Col - R_Dest > 0) { + R_Blend = R_Col - R_Dest; + } else { + R_Blend = R_Dest - R_Col; + } + if (G_Col - G_Dest > 0) { + G_Blend = G_Col - G_Dest; + } else { + G_Blend = G_Dest - G_Col; + } + if (B_Col - B_Dest > 0) { + B_Blend = B_Col - B_Dest; + } else { + B_Blend = B_Dest - B_Col; + } + } break; + } + + R_Blend = (R_Dest * (1.0f - LayerAlpha)) + (R_Blend * LayerAlpha); + G_Blend = (G_Dest * (1.0f - LayerAlpha)) + (G_Blend * LayerAlpha); + B_Blend = (B_Dest * (1.0f - LayerAlpha)) + (B_Blend * LayerAlpha); + + if (T.BlendMode == blend_normal) + A_Blend = A_Dest + LayerAlpha; + else + A_Blend = A_Dest; } - *Pixel = ((A << 24) | - (B << 16) | - (G << 8) | - (R << 0)); + uint8 R_Out = (uint8)(Normalize(R_Blend) * 255.0f); + uint8 G_Out = (uint8)(Normalize(G_Blend) * 255.0f); + uint8 B_Out = (uint8)(Normalize(B_Blend) * 255.0f); + uint8 A_Out = (uint8)(Normalize(A_Blend) * 255.0f); + + *Pixel = ((A_Out << 24) | + (B_Out << 16) | + (G_Out << 8) | + (R_Out << 0)); } } } -- cgit v1.2.3