// Bitmaps are currently stored two ways in this program, which I'm calling
// "packed" and "unpacked." Both are 0xAAGGBBRR little endian. Unpacked bitmaps
// use the typical method of storage: rows of X pixels, where you step by
// Width*BytesPerPixel to move down one row in Y. "Packed" bitmaps encode
// pixels as 4x4 chunks. To illustrate with an 8x4 bitmap:
//
// A1 A2 A3 A4 E1 E2 E3 E4
// B1 B2 B3 B4 F1 F2 F3 F4
// C1 C2 C3 C4 G1 G2 G3 G4
// D1 D2 D3 D4 H1 H2 H3 H4
//
// Unpacked would be stored in memory order as A1 A2 A3 A4 E1 E2 E3 E4...
// while packed would be stored as A1 A2 A3 A4 B1 B2 B3 B4...
//
// In cases where the bitmap's dimensions aren't divisible by four, we simply
// treat the bitmap as if it were the right size and add the extra pixels to
// the allocation.
//
// This wasn't an optimization I necessarily _needed_ to make this early on--I
// never even did any measuring to see if there was any speedup--but I
// couldn't resist it. I like doing the software rendering stuff.

// TODO(fox): I could write an AVX version of this function, but it may not be
// that much faster since we have to do a bit of uninterleaving.

// Which == 0 - store in 4x4 chunks
// Which == 1 - unpack to 1xWidth rows
void Bitmap_ConvertPacking(void *Buffer, void *DestBuffer, uint16 Width, uint16 Height, uint16 BytesPerPixel, uint16 Which)
{
    uint8 *Src = (uint8 *)Buffer;
    uint8 *Temp = (uint8 *)DestBuffer;

    uint32 RemainderPixels = Width % 4;

    uint16 WidthP, HeightP;
    Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP);

    for (uint32 Y = 0; Y < Height; Y++)
    {
        uint32 X = 0;
        // Main loop: four pixels at a time up to the last full multiple of four.
        while (X < Width - RemainderPixels)
        {
            // Index of the pixel in the packed layout: which 4x4 chunk we're
            // in, plus the offset within that chunk.
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *DPixel, *Pixel;
            if (Which == 0) {
                DPixel = Temp + PixelToSeek*BytesPerPixel;
                Pixel = Src + Y*Width*4 + X*BytesPerPixel;
            } else {
                Pixel = Src + PixelToSeek*BytesPerPixel;
                DPixel = Temp + Y*Width*4 + X*BytesPerPixel;
            }
#if ARM
            if (InstructionMode == instruction_mode_neon) {
                uint32x2x2_t Row = vld2_u32((uint32 *)Pixel);
                vst2_u32((uint32 *)DPixel, Row);
                X += 4;
#else
            if (InstructionMode == instruction_mode_sse || InstructionMode == instruction_mode_avx) {
                __m128i Row = _mm_loadu_si128((__m128i *)Pixel);
                _mm_storeu_si128((__m128i *)DPixel, Row);
                X += 4;
#endif
            } else {
                *(uint32 *)DPixel = *(uint32 *)Pixel;
                X++;
            }
        }
        // Remainder loop: copy the last (Width % 4) pixels one at a time.
        while (X < Width)
        {
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *DPixel, *Pixel;
            if (Which == 0) {
                DPixel = Temp + PixelToSeek*BytesPerPixel;
                Pixel = Src + Y*Width*4 + X*BytesPerPixel;
            } else {
                Pixel = Src + PixelToSeek*BytesPerPixel;
                DPixel = Temp + Y*Width*4 + X*BytesPerPixel;
            }
            *(uint32 *)DPixel = *(uint32 *)Pixel;
            X++;
        }
    }
}
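// To make the chunk math above concrete, here is a tiny standalone sketch.
// PackedPixelIndex is a hypothetical helper written only for illustration (it
// is not part of the codebase); it mirrors the XLookup/YLookup computation in
// Bitmap_ConvertPacking. For the 8x4 example drawn above (WidthP = 8), pixel
// C3 at (X = 2, Y = 2) lands at packed index 10, and F2 at (X = 5, Y = 1)
// lands at packed index 21.
#if 0
static uint32
PackedPixelIndex(uint32 X, uint32 Y, uint16 WidthP)
{
    uint32 XLookup = (X >> 2)*16 + (X % 4);           // which chunk along the row, plus the column inside it
    uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; // which row of chunks, plus the row inside the chunk
    return XLookup + YLookup;
}
#endif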
// TODO(fox): Replace this in the future.
#if 0
static void *
MoveImportToBitmap(memory *Memory, pixel_buffer *Raster, void *Input)
{
    uint8 *Row = ((uint8 *)Input);
    // void *Output = AllocateMemory(Memory, Bitmap_CalcTotalBytes(Raster->Width, Raster->Height, Raster->BytesPerPixel), B_Layers);
    uint8 *Row2 = ((uint8 *)Output);
    uint64 bytes = 0;
    uint16 ByteOffset = Bitmap_CalculateByteOffset(BytesPerPixel);
    uint64 TotalBytes = Bitmap_CalculateTotalBytes(Width, Height, BytesPerPixel);
    uint64 RemainderBytes = TotalBytes % ByteOffset;
    while (bytes <= TotalBytes - RemainderBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
        if (InstructionMode == instruction_mode_sse || InstructionMode == instruction_mode_avx) {
            __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
            _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
            bytes += 4*Raster->BytesPerPixel;
        } else {
            *(uint32 *)Pixel2 = *(uint32 *)Pixel;
            bytes += Raster->BytesPerPixel;
        }
    }
    while (bytes <= TotalBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
        *(uint32 *)Pixel2 = *(uint32 *)Pixel;
        bytes += Raster->BytesPerPixel;
    }
    return Output;
}
#endif

// Zeroes out an entire bitmap, including the 4x4 chunk padding.
static void
Bitmap_Clear(void *Buffer, uint16 Width, uint16 Height, uint16 BytesPerPixel)
{
    uint8 *Row = (uint8 *)Buffer;
#if ARM
    uint32 Zero[4] = {0};
    uint32x2x4_t Zero8 = vld4_dup_u32(Zero);
#else
    __m256i Zero8 = _mm256_setzero_si256();
    __m128i Zero = _mm_setzero_si128();
#endif
    uint64 bytes = 0;
    uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel);
    uint64 TotalBytes = Bitmap_CalcTotalBytes(Width, Height, BytesPerPixel);
    while (bytes < TotalBytes) {
        uint8 *Pixel = Row + bytes;
#if ARM
        if (InstructionMode == instruction_mode_neon) {
            vst4_u32((uint32 *)Pixel, Zero8);
#else
        if (InstructionMode == instruction_mode_avx) {
            _mm256_storeu_si256((__m256i *)Pixel, Zero8);
        } else if (InstructionMode == instruction_mode_sse) {
            _mm_storeu_si128((__m128i *)Pixel, Zero);
#endif
        } else {
            *(uint32 *)Pixel = 0x00000000;
        }
        bytes += ByteOffset;
    }
}

// Rounds Width and Height up to the next multiple of four so the bitmap
// divides evenly into 4x4 chunks.
static void
Bitmap_CalcPackedDimensions(uint16 Width, uint16 Height, uint16 *WidthP, uint16 *HeightP)
{
    uint16 ExtraWidth = 4 - (Width % 4);
    if (ExtraWidth == 4)
        ExtraWidth = 0;
    uint16 ExtraHeight = 4 - (Height % 4);
    if (ExtraHeight == 4)
        ExtraHeight = 0;
    *WidthP = Width + ExtraWidth;
    *HeightP = Height + ExtraHeight;
}

// How many bytes each loop iteration advances for the active instruction set:
// eight pixels for AVX/NEON, four for SSE, one otherwise.
static uint16
Bitmap_CalcByteOffset(uint16 BytesPerPixel)
{
    uint16 ByteOffset = BytesPerPixel;
#if ARM
    if (InstructionMode == instruction_mode_neon)
        ByteOffset = 8*BytesPerPixel;
#else
    if (InstructionMode == instruction_mode_avx)
        ByteOffset = 8*BytesPerPixel;
    if (InstructionMode == instruction_mode_sse)
        ByteOffset = 4*BytesPerPixel;
#endif
    return ByteOffset;
}

// Size of the pixel data alone, without the 4x4 chunk padding.
static uint64
Bitmap_CalcUnpackedBytes(uint16 Width, uint16 Height, uint16 BytesPerPixel)
{
    uint64 TotalBytes = (uint64)Width*Height*BytesPerPixel;
    return TotalBytes;
}

// Size to allocate for a bitmap, padded out to whole 4x4 chunks.
static uint64
Bitmap_CalcTotalBytes(uint16 Width, uint16 Height, uint16 BytesPerPixel)
{
    uint16 WidthP, HeightP;
    Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP);
    uint64 TotalBytes = (uint64)WidthP*HeightP*BytesPerPixel;
    return TotalBytes;
}
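// A minimal usage sketch of the sizing helpers (illustration only; this
// function is not called anywhere in the codebase). For a 10x6 bitmap at 4
// bytes per pixel, Bitmap_CalcPackedDimensions rounds up to 12x8, so
// Bitmap_CalcTotalBytes returns 12*8*4 = 384 bytes even though only
// 10*6*4 = 240 bytes hold real pixels--these are the "extra pixels in the
// allocation" mentioned at the top of the file.
#if 0
static void
Example_PackRoundTrip(void)
{
    uint16 Width = 10, Height = 6, BytesPerPixel = 4;
    // Both buffers are sized with the padded byte count so whole 4x4 chunks fit.
    uint8 Unpacked[12*8*4] = {0};
    uint8 Packed[12*8*4] = {0};
    Assert(Bitmap_CalcTotalBytes(Width, Height, BytesPerPixel) == sizeof(Packed));
    Bitmap_ConvertPacking(Unpacked, Packed, Width, Height, BytesPerPixel, 0); // pack into 4x4 chunks
    Bitmap_ConvertPacking(Packed, Unpacked, Width, Height, BytesPerPixel, 1); // unpack back into rows
}
#endif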
// TODO(fox): Maybe turn this into a generic memory copy; we don't need to care
// about pixels for any particular reason here.
static void
Bitmap_CopyToPointer(void *Input, void *Output, uint16 BytesPerPixel, uint64 TotalBytes)
{
    uint8 *Row = (uint8 *)Input;
    uint8 *Row2 = (uint8 *)Output;
    uint64 bytes = 0;
    uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel);
    uint64 RemainderBytes = TotalBytes % ByteOffset;
    // Wide copy up to the last full SIMD-sized block...
    while (bytes < TotalBytes - RemainderBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
#if ARM
        if (InstructionMode == instruction_mode_neon) {
            uint32x2x4_t OutputPixel = vld4_u32((uint32 *)Pixel);
            vst4_u32((uint32 *)Pixel2, OutputPixel);
#else
        if (InstructionMode == instruction_mode_avx) {
            __m256i OutputPixel = _mm256_loadu_si256((__m256i *)Pixel);
            _mm256_storeu_si256((__m256i *)Pixel2, OutputPixel);
        } else if (InstructionMode == instruction_mode_sse) {
            __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
            _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
#endif
        } else {
            *(uint32 *)Pixel2 = *(uint32 *)Pixel;
        }
        bytes += ByteOffset;
    }
    // ...then finish the remainder one pixel at a time.
    while (bytes < TotalBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
        *(uint32 *)Pixel2 = *(uint32 *)Pixel;
        bytes += BytesPerPixel;
    }
}

// Copies the source's color channels over the destination while leaving the
// destination's alpha channel untouched, so the existing alpha acts as a stencil.
static void
Bitmap_StencilAlpha(void *Input, void *Output, uint16 BytesPerPixel, uint64 TotalBytes)
{
    uint8 *Row = (uint8 *)Input;
    uint8 *Row2 = (uint8 *)Output;
    uint64 bytes = 0;
    uint16 ByteOffset = Bitmap_CalcByteOffset(BytesPerPixel);
    uint64 RemainderBytes = TotalBytes % ByteOffset;
#if ARM
#else
    // Per-byte blend mask: 0xFF in the three color bytes of each pixel and
    // zero in the alpha byte, so blendv pulls color from the source and keeps
    // the destination's alpha.
    __m256i AlphaBytes = _mm256_set1_epi32(0x00FFFFFF);
#endif
    while (bytes < TotalBytes - RemainderBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
#if ARM
        if (InstructionMode == instruction_mode_neon) {
            // TODO(fox): Optimize and write NEON!
            uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
            uint8 Alpha = *DestAlpha;
            uint32 *DestPixel = (uint32 *)Pixel2;
            uint32 *SrcPixel = (uint32 *)Pixel;
            *DestPixel = *SrcPixel;
            *DestAlpha = Alpha;
            bytes += BytesPerPixel;
#else
        if (InstructionMode == instruction_mode_avx) {
            __m256i InputPixel = _mm256_loadu_si256((__m256i *)Pixel);
            __m256i OutputPixel = _mm256_loadu_si256((__m256i *)Pixel2);
            // Skip the store entirely if no byte in the destination block has
            // its high bit set.
            if (_mm256_movemask_epi8(OutputPixel)) {
                OutputPixel = _mm256_blendv_epi8(OutputPixel, InputPixel, AlphaBytes);
                _mm256_storeu_si256((__m256i *)Pixel2, OutputPixel);
            }
            bytes += ByteOffset;
        } else if (InstructionMode == instruction_mode_sse) {
            // NOTE: the SSE path currently copies the whole pixel and does not
            // preserve the destination's alpha.
            __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
            _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
            bytes += ByteOffset;
#endif
        } else {
            uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
            uint8 Alpha = *DestAlpha;
            uint32 *DestPixel = (uint32 *)Pixel2;
            uint32 *SrcPixel = (uint32 *)Pixel;
            *DestPixel = *SrcPixel;
            *DestAlpha = Alpha;
            bytes += BytesPerPixel;
        }
    }
    while (bytes < TotalBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 *Pixel2 = (uint8 *)Row2 + bytes;
        uint8 *DestAlpha = Pixel2 + (BytesPerPixel/4)*3;
        uint8 Alpha = *DestAlpha;
        uint32 *DestPixel = (uint32 *)Pixel2;
        uint32 *SrcPixel = (uint32 *)Pixel;
        *DestPixel = *SrcPixel;
        *DestAlpha = Alpha;
        bytes += BytesPerPixel;
    }
}
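// A small usage sketch contrasting the two copies above (illustration only;
// this function is not called anywhere in the codebase, and sizing the copy
// with Bitmap_CalcTotalBytes is an assumption). Bitmap_CopyToPointer clones a
// buffer outright, while Bitmap_StencilAlpha overwrites the destination's
// color channels but keeps the alpha it already had.
#if 0
static void
Example_StencilCopy(void *Source, void *Dest, uint16 Width, uint16 Height, uint16 BytesPerPixel)
{
    uint64 TotalBytes = Bitmap_CalcTotalBytes(Width, Height, BytesPerPixel);
    // Straight copy: Dest becomes a byte-for-byte clone of Source.
    Bitmap_CopyToPointer(Source, Dest, BytesPerPixel, TotalBytes);
    // Stencil copy: Dest takes Source's colors but its alpha channel stays put.
    Bitmap_StencilAlpha(Source, Dest, BytesPerPixel, TotalBytes);
}
#endif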
// This would be an easy SIMD if only AVX had a scatter call...
// NOTE(fox): Only works with unpacked bitmaps for now.
static void
Bitmap_CalcHistogram(void *Data, void *Input, uint16 BytesPerPixel, uint64 TotalBytes)
{
    real32 *Slot = (real32 *)Data;
    uint8 *Row = (uint8 *)Input;
    uint64 bytes = 0;
    // Five sets of 256 bins: channel average, R, G, B, and A.
    for (int i = 0; i < 256*5; i++) {
        Slot[i] = 0;
    }
    while (bytes < TotalBytes) {
        uint8 *Pixel = (uint8 *)Row + bytes;
        uint8 A = (*(uint32 *)Pixel >> 24);
        uint8 R = (*(uint32 *)Pixel >> 16);
        uint8 G = (*(uint32 *)Pixel >> 8);
        uint8 B = (*(uint32 *)Pixel >> 0);
        uint8 Avg = (uint8)((real32)(R + G + B) / 3.0f);
        Slot[Avg] += 1;
        Slot[256 + R] += 1;
        Slot[256*2 + G] += 1;
        Slot[256*3 + B] += 1;
        Slot[256*4 + A] += 1;
        bytes += BytesPerPixel;
    }
}

#if 0
static void
BitmapPackRGB(pixel_buffer *Buffer)
{
    Assert(Buffer->Pitch);
    Convert4x4Chunk(Buffer, 0);
    CopyToBuffer(Buffer, 1);
    ClearBuffer(Buffer, Buffer->EffectBuffer);
}

static void
DebugFillSolid(pixel_buffer *Raster, v4 Color)
{
    uint32 ColS = ColToUint32(Color);
    __m256i Col8 = _mm256_set1_epi32(ColS);
    __m128i Col = _mm_set1_epi32(ColS);
    uint8 *Row = (uint8 *)Raster->OriginalBuffer;
    uint64 bytes = 0;
    uint16 ByteOffset = Raster->BytesPerPixel;
    if (InstructionMode == instruction_mode_avx)
        ByteOffset = 8*Raster->BytesPerPixel;
    else if (InstructionMode == instruction_mode_sse)
        ByteOffset = 4*Raster->BytesPerPixel;
    uint64 TotalBytes = Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
    while (bytes < TotalBytes) {
        uint8 *Pixel = Row + bytes;
        if (InstructionMode == instruction_mode_avx) {
            _mm256_storeu_si256((__m256i *)Pixel, Col8);
        } else if (InstructionMode == instruction_mode_sse) {
            _mm_storeu_si128((__m128i *)Pixel, Col);
        } else {
            *(uint32 *)Pixel = ColS;
        }
        bytes += ByteOffset;
    }
}

static void
DebugBitmap(pixel_buffer *Raster)
{
    uint8 asda = 0x0;
    uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
    real32 XInc = 255.0f / Raster->Width;
    real32 YInc = 255.0f / Raster->Height;
    for (uint8 Y = 0; Y < Raster->Height; Y++) {
        for (uint8 X = 0; X < Raster->Width; X++) {
            uint8 *Pixel = (uint8 *)Row + Raster->FullWidth*Y*4 + X*4;
            // *(uint32 *)Pixel = 0xffffffff;
            if (Y > 3) {
                asda = 0xff;
            }
            *(uint32 *)Pixel = ((0xff << 24) |
                                (asda << 16) |
                                (RoundReal32ToInt32((YInc * Y)) << 8) |
                                (RoundReal32ToInt32((XInc * X))));
        }
    }
}
#endif
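// A minimal usage sketch for Bitmap_CalcHistogram (illustration only; this
// function is not called anywhere in the codebase). The Data argument is
// assumed to point at 256*5 real32 bins laid out as [average | R | G | B | A],
// and the bitmap must be unpacked, per the NOTE above the function. Passing
// Bitmap_CalcUnpackedBytes as the byte count is an assumption so the padding
// pixels don't get counted.
#if 0
static void
Example_Histogram(void *UnpackedBitmap, uint16 Width, uint16 Height, uint16 BytesPerPixel)
{
    real32 Histogram[256*5];
    uint64 TotalBytes = Bitmap_CalcUnpackedBytes(Width, Height, BytesPerPixel);
    Bitmap_CalcHistogram(Histogram, UnpackedBitmap, BytesPerPixel, TotalBytes);
    real32 DarkestPixels = Histogram[0];          // pixels whose channel average is 0
    real32 OpaquePixels = Histogram[256*4 + 255]; // pixels with alpha == 255
}
#endif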