From d03d7187c1881237b1a98404a125507d33d85a0e Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Sun, 21 Aug 2022 22:05:10 -0400 Subject: a bit of housekeeping --- effects_software.cpp | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 effects_software.cpp (limited to 'effects_software.cpp') diff --git a/effects_software.cpp b/effects_software.cpp new file mode 100644 index 0000000..5b9e4d9 --- /dev/null +++ b/effects_software.cpp @@ -0,0 +1,174 @@ +static void +Effect_DrawColor_Software(source *Source, layer_bitmap_info *BitmapInfo, memory *Memory, property_channel Property[]) +{ + v4 FloatColor = Property[0].CurrentValue.col; + blend_mode BlendMode = Property[1].CurrentValue.blendmode; + + __m256 ZeroReal = _mm256_set1_ps(0); + __m256 ZeroPointFive = _mm256_set1_ps(0.5); + __m256 One = _mm256_set1_ps(1); + __m256 Two = _mm256_set1_ps(2); + + __m256 Fraction255 = _mm256_set1_ps(1/255.0f); + __m256 Real255 = _mm256_set1_ps(255); + + __m256i FF = _mm256_set1_epi32(0xFF); + + __m256 Alpha = _mm256_set1_ps(FloatColor.a); + __m256 AlphaInv = _mm256_set1_ps(1.0f - FloatColor.a); + + __m256 R_Col = _mm256_set1_ps(FloatColor.E[0]); + __m256 R_Colx2 = _mm256_mul_ps(R_Col, Two); + __m256 R_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[0]); + + __m256 G_Col = _mm256_set1_ps(FloatColor.E[1]); + __m256 G_Colx2 = _mm256_mul_ps(G_Col, Two); + __m256 G_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[1]); + + __m256 B_Col = _mm256_set1_ps(FloatColor.E[2]); + __m256 B_Colx2 = _mm256_mul_ps(B_Col, Two); + __m256 B_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[2]); + + for (int16 Y = 0; Y < Source->Info.Height; Y += 2) + { + for (int16 X = 0; X < Source->Info.Width; X += 4) + { + uint32 XLookup = (X >> 2)*16 + (X % 4); + uint32 YLookup = (Y >> 2)*(Source->Info.Width*4) + (Y % 4)*4; + uint32 PixelToSeek = XLookup + YLookup; + uint8 *Pixel = (uint8 *)BitmapInfo->BitmapBuffer + PixelToSeek*Source->Info.BytesPerPixel; + __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel); + + // normalized values + __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Fraction255); + __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF)), Fraction255); + __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Fraction255); + __m256i A_Out = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF); + + __m256 R_Blend = R_Col; + __m256 G_Blend = G_Col; + __m256 B_Blend = B_Col; + switch (BlendMode) + { + case blend_normal: + { + } break; + case blend_multiply: + { + R_Blend = _mm256_mul_ps(R_Dest, R_Col); + G_Blend = _mm256_mul_ps(G_Dest, G_Col); + B_Blend = _mm256_mul_ps(B_Dest, B_Col); + } break; + case blend_colorburn: + { + R_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), R_Col)); + G_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), G_Col)); + B_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), B_Col)); + } break; + case blend_linearburn: + { + R_Blend = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One); + G_Blend = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One); + B_Blend = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One); + } break; + case blend_add: + { + R_Blend = _mm256_add_ps(R_Dest, R_Col); + G_Blend = _mm256_add_ps(G_Dest, G_Col); + B_Blend = _mm256_add_ps(B_Dest, B_Col); + } break; + case blend_screen: + { + R_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)); + G_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)); + B_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)); + } break; + case blend_overlay: + { + __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 1); + __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 1); + __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 1); + __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col)); + __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col)); + __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col)); + __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv))); + __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv))); + __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_softlight: + { + // using Pegtop's equation + R_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest)); + G_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest)); + B_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest)); + } break; + case blend_hardlight: + { + __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 13); + __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 13); + __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 13); + __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col)); + __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col)); + __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col)); + __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv))); + __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv))); + __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv))); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + case blend_subtract: + { + R_Blend = _mm256_sub_ps(R_Dest, R_Col); + G_Blend = _mm256_sub_ps(G_Dest, G_Col); + B_Blend = _mm256_sub_ps(B_Dest, B_Col); + } break; + case blend_divide: + { + R_Blend = _mm256_div_ps(R_Dest, R_Col); + G_Blend = _mm256_div_ps(G_Dest, G_Col); + B_Blend = _mm256_div_ps(B_Dest, B_Col); + } break; + case blend_difference: + { + __m256 R_Lower = _mm256_sub_ps(R_Col, R_Dest); + __m256 G_Lower = _mm256_sub_ps(G_Col, G_Dest); + __m256 B_Lower = _mm256_sub_ps(B_Col, B_Dest); + __m256 R_Upper = _mm256_sub_ps(R_Dest, R_Col); + __m256 G_Upper = _mm256_sub_ps(G_Dest, G_Col); + __m256 B_Upper = _mm256_sub_ps(B_Dest, B_Col); + __m256 R_Mask = _mm256_cmp_ps(R_Lower, ZeroReal, 14); + __m256 G_Mask = _mm256_cmp_ps(G_Lower, ZeroReal, 14); + __m256 B_Mask = _mm256_cmp_ps(B_Lower, ZeroReal, 14); + R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask); + G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask); + B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask); + } break; + } + + R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, AlphaInv), + _mm256_mul_ps(R_Blend, Alpha)); + G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, AlphaInv), + _mm256_mul_ps(G_Blend, Alpha)); + B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, AlphaInv), + _mm256_mul_ps(B_Blend, Alpha)); + + R_Blend = _mm256_max_ps(_mm256_min_ps(One, R_Blend), ZeroReal); + G_Blend = _mm256_max_ps(_mm256_min_ps(One, G_Blend), ZeroReal); + B_Blend = _mm256_max_ps(_mm256_min_ps(One, B_Blend), ZeroReal); + + __m256i R_Out = _mm256_cvttps_epi32(_mm256_mul_ps(R_Blend, Real255)); + __m256i G_Out = _mm256_cvttps_epi32(_mm256_mul_ps(G_Blend, Real255)); + __m256i B_Out = _mm256_cvttps_epi32(_mm256_mul_ps(B_Blend, Real255)); + + __m256i OutputPixel = _mm256_or_si256( + _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), + _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); + + _mm256_storeu_si256((__m256i *)Pixel, OutputPixel); + } + } +} -- cgit v1.2.3