summary | refs | log | tree | commit | diff
path: root/bitmap_calls.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'bitmap_calls.cpp')
-rw-r--r-- bitmap_calls.cpp 33
1 file changed, 28 insertions, 5 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp
index 46f0c31..0e76039 100644
--- a/bitmap_calls.cpp
+++ b/bitmap_calls.cpp
@@ -248,16 +248,25 @@ Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tota
uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel);
uint64 RemainderBytes = TotalBytes % ByteOffset;
+#if ARM
+#else
__m256i AlphaBytes = _mm256_set1_epi32(0x00FFFFFF);
__m256i Zeroi = _mm256_set1_epi32(0);
+#endif
while (bytes <= TotalBytes - RemainderBytes) {
uint8 *Pixel = (uint8 *)Row + bytes;
uint8 *Pixel2 = (uint8 *)Row2 + bytes;
#if ARM
if (InstructionMode == instruction_mode_neon) {
- uint32x2x4_t OutputPixel = vld4_u32((uint32 *)Pixel);
- vst4_u32((uint32 *)Pixel2, OutputPixel);
+ // TODO(fox): Optimize and write NEON!
+ uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
+ uint8 Alpha = *DestAlpha;
+ uint32 *DestPixel = (uint32 *)Pixel2;
+ uint32 *SrcPixel = (uint32 *)Pixel;
+ *DestPixel = *SrcPixel;
+ *DestAlpha = Alpha;
+ bytes += BytesPerPixel;
#else
if (InstructionMode == instruction_mode_avx) {
__m256i InputPixel = _mm256_loadu_si256((__m256i *)Pixel);
@@ -266,19 +275,33 @@ Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tota
OutputPixel = _mm256_blendv_epi8(OutputPixel, InputPixel, AlphaBytes);
_mm256_storeu_si256((__m256i *)Pixel2, OutputPixel);
}
+ bytes += ByteOffset;
} else if (InstructionMode == instruction_mode_sse) {
__m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
_mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
+ bytes += ByteOffset;
#endif
} else {
- *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+ // TODO(fox): Optimize and write NEON!
+ uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
+ uint8 Alpha = *DestAlpha;
+ uint32 *DestPixel = (uint32 *)Pixel2;
+ uint32 *SrcPixel = (uint32 *)Pixel;
+ *DestPixel = *SrcPixel;
+ *DestAlpha = Alpha;
+ bytes += BytesPerPixel;
}
- bytes += ByteOffset;
}
while (bytes <= TotalBytes) {
uint8 *Pixel = (uint8 *)Row + bytes;
uint8 *Pixel2 = (uint8 *)Row2 + bytes;
- *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+ // TODO(fox): Optimize and write NEON!
+ uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
+ uint8 Alpha = *DestAlpha;
+ uint32 *DestPixel = (uint32 *)Pixel2;
+ uint32 *SrcPixel = (uint32 *)Pixel;
+ *DestPixel = *SrcPixel;
+ *DestAlpha = Alpha;
bytes += BytesPerPixel;
}
}