diff options
Diffstat (limited to 'bitmap_calls.cpp')
-rw-r--r-- | bitmap_calls.cpp | 402 |
1 files changed, 0 insertions, 402 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp deleted file mode 100644 index 0e76039..0000000 --- a/bitmap_calls.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// Bitmaps are currently stored two ways in this program, which I'm calling -// "packed" and "unpacked." Both are 0xAAGGBBRR little endian. Unpacked bitmaps -// use the typical method of storage, rows of X that you increment by -// Width*BytesPerPixel to look up the Y coordinate. "Packed" bitmaps encode -// pixels as 4x4 chunks. To illustrate this clearly with an 8x4 bitmap: - -// A1 A2 A3 A4 E1 E2 E3 E4 -// B1 B2 B3 B4 F1 F2 F3 F4 -// C1 C2 C3 C4 G1 G2 G3 G4 -// D1 D2 D3 D4 H1 H2 H3 H4 - -// Unpacked would be stored in memory order as A1 A2 A3 A4 E1 E2 E3 E4... -// while packed would be stored as A1 A2 A3 A4 B1 B2 B3 B4... - -// In cases where the bitmap is a non-divisible-by-four size, we simply treat -// the bitmap as if it's the right size and add the extra pixels in the allocation. - -// This wasn't an optimization I necessarily _needed_ to make this early on--I -// never even did any measuring to see if there was any speedup--but I -// couldn't resist it. I like doing the software rendering stuff. - - -// TODO(fox): I could write an AVX version of this function, but it may not be -// that much faster since we have to do a bit of uninterleaving. 
- -// 0 - store in 4x4 chunks -// 1 - unpack to 1xwidth -void Bitmap_ConvertPacking(void *Buffer, void *DestBuffer, uint16 Width, uint16 Height, uint16 BytesPerPixel, uint16 Which) -{ - uint8 *Src = (uint8 *)Buffer; - uint8 *Temp = (uint8 *)DestBuffer; - uint32 RemainderPixels = Width % 4; - uint16 WidthP, HeightP; - Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP); - for (uint32 Y = 0; Y < Height; Y++) { - uint32 X = 0; - while (X < Width - RemainderPixels) { - uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; - uint32 PixelToSeek = XLookup + YLookup; - if (Y == 48 && X == 0) - uint8 war = 0; - // if (YLookup == 2500 && XLookup == 1) - uint8 *DPixel, *Pixel; - if (Which == 0) { - DPixel = Temp + PixelToSeek*BytesPerPixel; - Pixel = Src + Y*Width*4 + X*BytesPerPixel; - } else { - Pixel = Src + PixelToSeek*BytesPerPixel; - DPixel = Temp + Y*Width*4 + X*BytesPerPixel; - } - -#if ARM - if (InstructionMode == instruction_mode_neon) { - uint32x2x2_t Row = vld2_u32((uint32 *)Pixel); - vst2_u32((uint32 *)DPixel, Row); - X += 4; -#else - if (InstructionMode == instruction_mode_sse || InstructionMode == instruction_mode_avx) { - __m128i Row = _mm_loadu_si128((__m128i *)Pixel); - _mm_storeu_si128((__m128i *)DPixel, Row); - X+=4; -#endif - } else { - *(uint32 *)DPixel = *(uint32 *)Pixel; - X++; - } - } - while (X < Width) { - uint16 WidthP, HeightP; - Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP); - uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; - uint32 PixelToSeek = XLookup + YLookup; - uint8 *DPixel, *Pixel; - if (Which == 0) { - DPixel = Temp + PixelToSeek*BytesPerPixel; - Pixel = Src + Y*Width*4 + X*BytesPerPixel; - } else { - Pixel = Src + PixelToSeek*BytesPerPixel; - DPixel = Temp + Y*Width*4 + X*BytesPerPixel; - } - - *(uint32 *)DPixel = *(uint32 *)Pixel; - X++; - } - } -} - -// TODO(fox): Replace this in the future. 
-#if 0 -static void * -MoveImportToBitmap(memory *Memory, pixel_buffer *Raster, void *Input) -{ - uint8 *Row = ((uint8 *)Input); - // void *Output = AllocateMemory(Memory, Bitmap_CalcTotalBytes(Raster->Width, Raster->Height, Raster->BytesPerPixel), B_Layers); - uint8 *Row2 = ((uint8 *)Output); - - uint64 bytes = 0; - uint16 ByteOffset = Bitmap_CalculateByteOffset(BytesPerPixel); - uint64 TotalBytes = Bitmap_CalculateTotalBytes(Width, Height, BytesPerPixel); - uint64 RemainderBytes = TotalBytes % ByteOffset; - - while (bytes <= TotalBytes - RemainderBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - if (InstructionMode == instruction_mode_sse || InstructionMode == instruction_mode_avx) { - __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); - _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); - bytes += 4*Raster->BytesPerPixel; - } else { - *(uint32 *)Pixel2 = *(uint32 *)Pixel; - bytes += Raster->BytesPerPixel; - } - } - while (bytes <= TotalBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - *(uint32 *)Pixel2 = *(uint32 *)Pixel; - bytes += Raster->BytesPerPixel; - } - return Output; -} -#endif - -static void -Bitmap_Clear(void *Buffer, uint16 Width, uint16 Height, uint16 BytesPerPixel) -{ - uint8 *Row = (uint8 *)Buffer; -#if ARM - uint32 Zero[4] = {0}; - uint32x2x4_t Zero8 = vld4_dup_u32(Zero); -#else - __m256i Zero8 = _mm256_setzero_si256(); - __m128i Zero = _mm_setzero_si128(); -#endif - uint64 bytes = 0; - - uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel); - uint64 TotalBytes = Bitmap_CalcTotalBytes(Width, Height, BytesPerPixel); - - while (bytes < TotalBytes) { - uint8 *Pixel = Row + bytes; -#if ARM - if (InstructionMode == instruction_mode_neon) { - vst4_u32((uint32 *)Pixel, Zero8); -#else - if (InstructionMode == instruction_mode_avx) { - _mm256_storeu_si256((__m256i *)Pixel, Zero8); - } else if (InstructionMode == instruction_mode_sse) { - _mm_storeu_si128((__m128i 
*)Pixel, Zero); -#endif - } else { - *(uint32 *)Pixel = 0x00000000; - } - bytes += ByteOffset; - } -} - -static void -Bitmap_CalcPackedDimensions(uint16 Width, uint16 Height, uint16 *WidthP, uint16 *HeightP) { - uint16 ExtraWidth = 4 - (Width % 4); - if (ExtraWidth == 4) - ExtraWidth = 0; - uint16 ExtraHeight = 4 - (Height % 4); - if (ExtraHeight == 4) - ExtraHeight = 0; - *WidthP = Width + ExtraWidth; - *HeightP = Height + ExtraHeight; -} - -static uint16 -Bitmap_CalcByteOffset(uint16 BytesPerPixel) { - uint16 ByteOffset = BytesPerPixel; -#if ARM - if (InstructionMode == instruction_mode_neon) - ByteOffset = 8*BytesPerPixel; -#else - if (InstructionMode == instruction_mode_avx) - ByteOffset = 8*BytesPerPixel; - if (InstructionMode == instruction_mode_sse) - ByteOffset = 4*BytesPerPixel; -#endif - return ByteOffset; -} - -static uint64 -Bitmap_CalcUnpackedBytes(uint16 Width, uint16 Height, uint16 BytesPerPixel) { - uint64 TotalBytes = (uint64)Width*Height*BytesPerPixel; - return TotalBytes; -} - -static uint64 -Bitmap_CalcTotalBytes(uint16 Width, uint16 Height, uint16 BytesPerPixel) { - uint16 WidthP, HeightP; - Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP); - uint64 TotalBytes = (uint64)WidthP*HeightP*BytesPerPixel; - return TotalBytes; -} - -// TODO(fox): Maybe turn this into a generic memory copy; we don't need to care -// about pixels for any particular reason here. 
-static void -Bitmap_CopyToPointer(void *Input, void *Output, uint16 BytesPerPixel, uint64 TotalBytes) -{ - uint8 *Row = (uint8 *)Input; - uint8 *Row2 = (uint8 *)Output; - - uint64 bytes = 0; - uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel); - uint64 RemainderBytes = TotalBytes % ByteOffset; - - while (bytes <= TotalBytes - RemainderBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; -#if ARM - if (InstructionMode == instruction_mode_neon) { - uint32x2x4_t OutputPixel = vld4_u32((uint32 *)Pixel); - vst4_u32((uint32 *)Pixel2, OutputPixel); -#else - if (InstructionMode == instruction_mode_avx) { - __m256i OutputPixel = _mm256_loadu_si256((__m256i *)Pixel); - _mm256_storeu_si256((__m256i *)Pixel2, OutputPixel); - } else if (InstructionMode == instruction_mode_sse) { - __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); - _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); -#endif - } else { - *(uint32 *)Pixel2 = *(uint32 *)Pixel; - } - bytes += ByteOffset; - } - while (bytes <= TotalBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - *(uint32 *)Pixel2 = *(uint32 *)Pixel; - bytes += BytesPerPixel; - } -} - -static void -Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 TotalBytes) -{ - uint8 *Row = (uint8 *)Input; - uint8 *Row2 = (uint8 *)Output; - - uint64 bytes = 0; - uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel); - uint64 RemainderBytes = TotalBytes % ByteOffset; - -#if ARM -#else - __m256i AlphaBytes = _mm256_set1_epi32(0x00FFFFFF); - __m256i Zeroi = _mm256_set1_epi32(0); -#endif - - while (bytes <= TotalBytes - RemainderBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; -#if ARM - if (InstructionMode == instruction_mode_neon) { - // TODO(fox): Optimize and write NEON! 
- uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; - uint8 Alpha = *DestAlpha; - uint32 *DestPixel = (uint32 *)Pixel2; - uint32 *SrcPixel = (uint32 *)Pixel; - *DestPixel = *SrcPixel; - *DestAlpha = Alpha; - bytes += BytesPerPixel; -#else - if (InstructionMode == instruction_mode_avx) { - __m256i InputPixel = _mm256_loadu_si256((__m256i *)Pixel); - __m256i OutputPixel = _mm256_loadu_si256((__m256i *)Pixel2); - if (_mm256_movemask_epi8(OutputPixel)) { - OutputPixel = _mm256_blendv_epi8(OutputPixel, InputPixel, AlphaBytes); - _mm256_storeu_si256((__m256i *)Pixel2, OutputPixel); - } - bytes += ByteOffset; - } else if (InstructionMode == instruction_mode_sse) { - __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); - _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); - bytes += ByteOffset; -#endif - } else { - // TODO(fox): Optimize and write NEON! - uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; - uint8 Alpha = *DestAlpha; - uint32 *DestPixel = (uint32 *)Pixel2; - uint32 *SrcPixel = (uint32 *)Pixel; - *DestPixel = *SrcPixel; - *DestAlpha = Alpha; - bytes += BytesPerPixel; - } - } - while (bytes <= TotalBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - // TODO(fox): Optimize and write NEON! - uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3; - uint8 Alpha = *DestAlpha; - uint32 *DestPixel = (uint32 *)Pixel2; - uint32 *SrcPixel = (uint32 *)Pixel; - *DestPixel = *SrcPixel; - *DestAlpha = Alpha; - bytes += BytesPerPixel; - } -} - -// This would be an easy SIMD if only AVX had a scatter call... -// NOTE(fox): Only works with unpacked bitmaps for now. 
-static void -Bitmap_CalcHistogram(void *Data, void *Input, uint16 BytesPerPixel, uint64 TotalBytes) -{ - uint32 *Slot = (uint32 *)Data; - uint8 *Row = (uint8 *)Input; - uint64 bytes = 0; - uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel); - uint64 RemainderBytes = TotalBytes % ByteOffset; - - for (int i = 0; i < 256*5; i++) { - *(real32 *)((uint8 *)Slot + i*sizeof(real32)) = 0; - } - - while (bytes <= TotalBytes) { - uint8 *Pixel = (uint8 *)Row + bytes; - - uint8 A = (*(uint32 *)Pixel >> 24); - uint8 R = (*(uint32 *)Pixel >> 16); - uint8 G = (*(uint32 *)Pixel >> 8); - uint8 B = (*(uint32 *)Pixel >> 0); - - uint8 Avg = (uint8)((real32)(R + G + B) / 3.0f); - - *(real32 *)((uint8 *)Slot + Avg*sizeof(real32)) += 1; - *(real32 *)((uint8 *)Slot + (256 + R)*sizeof(real32)) += 1; - *(real32 *)((uint8 *)Slot + (256*2 + G)*sizeof(real32)) += 1; - *(real32 *)((uint8 *)Slot + (256*3 + B)*sizeof(real32)) += 1; - *(real32 *)((uint8 *)Slot + (256*4 + A)*sizeof(real32)) += 1; - - bytes += BytesPerPixel; - } -} - -#if 0 -static void -BitmapPackRGB(pixel_buffer *Buffer) { - Assert(Buffer->Pitch); - Convert4x4Chunk(Buffer, 0); - CopyToBuffer(Buffer, 1); - ClearBuffer(Buffer, Buffer->EffectBuffer); -} - -static void -DebugFillSolid(pixel_buffer *Raster, v4 Color) -{ - uint32 ColS = ColToUint32(Color); - __m256i Col8 = _mm256_set1_epi32(ColS); - __m128i Col = _mm_set1_epi32(ColS); - uint8 *Row = (uint8 *)Raster->OriginalBuffer; - - uint64 bytes = 0; - uint16 ByteOffset = Raster->BytesPerPixel; - if (InstructionMode == instruction_mode_avx) - ByteOffset = 8*Raster->BytesPerPixel; - else if (InstructionMode == instruction_mode_sse) - ByteOffset = 4*Raster->BytesPerPixel; - - uint64 TotalBytes = Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel; - - while (bytes < TotalBytes) { - uint8 *Pixel = Row + bytes; - if (InstructionMode == instruction_mode_avx) { - _mm256_storeu_si256((__m256i *)Pixel, Col8); - } else if (InstructionMode == instruction_mode_sse) { - 
_mm_storeu_si128((__m128i *)Pixel, Col); - } else { - *(uint32 *)Pixel = ColS; - } - bytes += ByteOffset; - } -} - -static void -DebugBitmap(pixel_buffer *Raster) -{ - uint8 asda = 0x0; - uint8 *Row = ((uint8 *)Raster->OriginalBuffer); - real32 XInc = 255.0f / Raster->Width; - real32 YInc = 255.0f / Raster->Height; - for (uint8 Y = 0; Y < Raster->Height; Y++) { - for (uint8 X = 0; X < Raster->Width; X++) { - uint8 *Pixel = (uint8 *)Row + Raster->FullWidth*Y*4 + X*4; - // *(uint32 *)Pixel = 0xffffffff; - if (Y > 3) { asda = 0xff; } - *(uint32 *)Pixel = ((0xff << 24) | - (asda << 16) | - (RoundReal32ToInt32((YInc * Y)) << 8) | - (RoundReal32ToInt32((XInc * X))) ); - } - } -} -#endif |