From b39c7374009d03d448e47e08ddabc30abeee5247 Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Thu, 25 Aug 2022 18:55:59 -0400 Subject: minor fixes --- bitmap_calls.cpp | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'bitmap_calls.cpp') diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp index 46f0c31..0e76039 100644 --- a/bitmap_calls.cpp +++ b/bitmap_calls.cpp @@ -248,16 +248,25 @@ Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tota uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel); uint64 RemainderBytes = TotalBytes % ByteOffset; +#if ARM +#else __m256i AlphaBytes = _mm256_set1_epi32(0x00FFFFFF); __m256i Zeroi = _mm256_set1_epi32(0); +#endif while (bytes <= TotalBytes - RemainderBytes) { uint8 *Pixel = (uint8 *)Row + bytes; uint8 *Pixel2 = (uint8 *)Row2 + bytes; #if ARM if (InstructionMode == instruction_mode_neon) { - uint32x2x4_t OutputPixel = vld4_u32((uint32 *)Pixel); - vst4_u32((uint32 *)Pixel2, OutputPixel); + // TODO(fox): Optimize and write NEON! + uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; + uint8 Alpha = *DestAlpha; + uint32 *DestPixel = (uint32 *)Pixel2; + uint32 *SrcPixel = (uint32 *)Pixel; + *DestPixel = *SrcPixel; + *DestAlpha = Alpha; + bytes += BytesPerPixel; #else if (InstructionMode == instruction_mode_avx) { __m256i InputPixel = _mm256_loadu_si256((__m256i *)Pixel); @@ -266,19 +275,33 @@ Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tota OutputPixel = _mm256_blendv_epi8(OutputPixel, InputPixel, AlphaBytes); _mm256_storeu_si256((__m256i *)Pixel2, OutputPixel); } + bytes += ByteOffset; } else if (InstructionMode == instruction_mode_sse) { __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); + bytes += ByteOffset; #endif } else { - *(uint32 *)Pixel2 = *(uint32 *)Pixel; + // TODO(fox): Optimize and write NEON! + uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; + uint8 Alpha = *DestAlpha; + uint32 *DestPixel = (uint32 *)Pixel2; + uint32 *SrcPixel = (uint32 *)Pixel; + *DestPixel = *SrcPixel; + *DestAlpha = Alpha; + bytes += BytesPerPixel; } - bytes += ByteOffset; } while (bytes <= TotalBytes) { uint8 *Pixel = (uint8 *)Row + bytes; uint8 *Pixel2 = (uint8 *)Row2 + bytes; - *(uint32 *)Pixel2 = *(uint32 *)Pixel; + // TODO(fox): Optimize and write NEON! + uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; + uint8 Alpha = *DestAlpha; + uint32 *DestPixel = (uint32 *)Pixel2; + uint32 *SrcPixel = (uint32 *)Pixel; + *DestPixel = *SrcPixel; + *DestAlpha = Alpha; bytes += BytesPerPixel; } } -- cgit v1.2.3