// NOTE(fox): Pay attention to how the Y pitch differs between the unpacked // bitmaps and the 4x4 packed bitmaps, since odd-sized bitmaps are padded. // TODO(fox): I could write an AVX version of this function, but it may not be // that much faster since we have to do a bit of uninterleaving. // 0 - store in 4x4 chunks // 1 - unpack to 1xwidth internal void Convert4x4Chunk(pixel_buffer *Buffer, uint8 Which) { uint8 *Src = (uint8 *)Buffer->OriginalBuffer; uint8 *Temp = (uint8 *)Buffer->EffectBuffer; uint32 RemainderPixels = Buffer->Width % 4; for (uint32 Y = 0; Y < Buffer->Height; Y++) { uint32 X = 0; while (X < Buffer->Width - RemainderPixels) { uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; uint8 *DPixel, *Pixel; if (Which == 0) { DPixel = Temp + PixelToSeek*Buffer->BytesPerPixel; Pixel = Src + Y*Buffer->Width*4 + X*Buffer->BytesPerPixel; } else { Pixel = Src + PixelToSeek*Buffer->BytesPerPixel; DPixel = Temp + Y*Buffer->Width*4 + X*Buffer->BytesPerPixel; } if (InstructionMode == sse_enabled || InstructionMode == avx_enabled) { __m128i Row = _mm_loadu_si128((__m128i *)Pixel); _mm_storeu_si128((__m128i *)DPixel, Row); X+=4; } else { *(uint32 *)DPixel = *(uint32 *)Pixel; X++; } } while (X < Buffer->Width) { uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; uint8 *DPixel, *Pixel; if (Which == 0) { DPixel = Temp + PixelToSeek*Buffer->BytesPerPixel; Pixel = Src + Y*Buffer->Width*4 + X*Buffer->BytesPerPixel; } else { Pixel = Src + PixelToSeek*Buffer->BytesPerPixel; DPixel = Temp + Y*Buffer->Width*4 + X*Buffer->BytesPerPixel; } *(uint32 *)DPixel = *(uint32 *)Pixel; X++; } } } // TODO(fox): Replace this in the future. 
// Copies Width * Height pixels, read contiguously from Input, into a buffer
// newly obtained from AllocateMemory (B_Scratch), and returns that buffer.
// The allocation is sized from FullWidth * FullHeight, the copy from
// Width * Height.
//
// The copy never touches bytes at or beyond the end of the source data: the
// 128-bit loop only runs while a whole 16-byte vector still fits, and the
// remainder is finished with 32-bit and then single-byte moves.
internal void *
MoveImportToBitmap(memory *Memory, pixel_buffer *Raster, void *Input)
{
    uint8 *Src = (uint8 *)Input;
    void *Output = AllocateMemory(Memory, Raster->FullWidth * Raster->FullHeight * Raster->BytesPerPixel, B_Scratch);
    uint8 *Dst = (uint8 *)Output;

    // Widen before multiplying so large bitmaps cannot overflow a narrower
    // intermediate type.
    uint64 TotalBytes = (uint64)Raster->Height * Raster->Width * Raster->BytesPerPixel;
    uint64 bytes = 0;

    // AVX-capable machines also run the 128-bit path; this function has no
    // 256-bit variant.
    if (InstructionMode == sse_enabled || InstructionMode == avx_enabled)
    {
        while (bytes + sizeof(__m128i) <= TotalBytes)
        {
            __m128i Chunk = _mm_loadu_si128((__m128i *)(Src + bytes));
            _mm_storeu_si128((__m128i *)(Dst + bytes), Chunk);
            bytes += sizeof(__m128i);
        }
    }

    while (bytes + sizeof(uint32) <= TotalBytes)
    {
        *(uint32 *)(Dst + bytes) = *(uint32 *)(Src + bytes);
        bytes += sizeof(uint32);
    }

    while (bytes < TotalBytes)
    {
        Dst[bytes] = Src[bytes];
        bytes++;
    }

    return Output;
}

// Zeroes FullWidth * FullHeight * BytesPerPixel bytes starting at Buffer.
//
// The widest store allowed by InstructionMode is used while a whole vector
// still fits; whatever is left is cleared with narrower stores so nothing is
// written beyond the computed size.
internal void
ClearBuffer(pixel_buffer *Raster, void *Buffer)
{
    uint8 *Row = (uint8 *)Buffer;

    uint64 TotalBytes = (uint64)Raster->FullHeight * Raster->FullWidth * Raster->BytesPerPixel;
    uint64 bytes = 0;

    if (InstructionMode == avx_enabled)
    {
        __m256i Zero8 = _mm256_setzero_si256();
        while (bytes + sizeof(__m256i) <= TotalBytes)
        {
            _mm256_storeu_si256((__m256i *)(Row + bytes), Zero8);
            bytes += sizeof(__m256i);
        }
    }

    // In AVX mode this only mops up a trailing 16-byte piece, if any.
    if (InstructionMode == sse_enabled || InstructionMode == avx_enabled)
    {
        __m128i Zero = _mm_setzero_si128();
        while (bytes + sizeof(__m128i) <= TotalBytes)
        {
            _mm_storeu_si128((__m128i *)(Row + bytes), Zero);
            bytes += sizeof(__m128i);
        }
    }

    while (bytes + sizeof(uint32) <= TotalBytes)
    {
        *(uint32 *)(Row + bytes) = 0x00000000;
        bytes += sizeof(uint32);
    }

    while (bytes < TotalBytes)
    {
        Row[bytes] = 0;
        bytes++;
    }
}

// 0 - original -> effect
// 1 - effect -> original internal void CopyToBuffer(pixel_buffer *Raster, uint16 Which) { uint8 *Row, *Row2; if (Which == 0) { Row = ((uint8 *)Raster->OriginalBuffer); Row2 = ((uint8 *)Raster->EffectBuffer); } else { Row = ((uint8 *)Raster->EffectBuffer); Row2 = ((uint8 *)Raster->OriginalBuffer); } uint64 bytes = 0; uint16 ByteOffset = Raster->BytesPerPixel; if (InstructionMode == avx_enabled) ByteOffset = 8*Raster->BytesPerPixel; else if (InstructionMode == avx_enabled) ByteOffset = 4*Raster->BytesPerPixel; uint64 TotalBytes = Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel; uint64 RemainderBytes = TotalBytes % ByteOffset; while (bytes <= TotalBytes - RemainderBytes) { uint8 *Pixel = (uint8 *)Row + bytes; uint8 *Pixel2 = (uint8 *)Row2 + bytes; if (InstructionMode == sse_enabled || InstructionMode == avx_enabled) { __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); bytes += 4*Raster->BytesPerPixel; } else { *(uint32 *)Pixel2 = *(uint32 *)Pixel; bytes += Raster->BytesPerPixel; } } while (bytes <= TotalBytes) { uint8 *Pixel = (uint8 *)Row + bytes; uint8 *Pixel2 = (uint8 *)Row2 + bytes; *(uint32 *)Pixel2 = *(uint32 *)Pixel; bytes += Raster->BytesPerPixel; } } internal void BitmapPackRGB(pixel_buffer *Buffer) { Assert(Buffer->Pitch); Convert4x4Chunk(Buffer, 0); CopyToBuffer(Buffer, 1); ClearBuffer(Buffer, Buffer->EffectBuffer); } internal void OutputToViewport(pixel_buffer *CompBuffer, project_state *State, GLuint textureID) { Convert4x4Chunk(CompBuffer, 1); EndRenderState(State); glBindTexture(GL_TEXTURE_2D, textureID); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE, CompBuffer->EffectBuffer); } internal void DebugFillSolid(pixel_buffer *Raster, v4 Color) { uint32 ColS = ColToUint32(Color); __m256i Col8 = _mm256_set1_epi32(ColS); __m128i Col = _mm_set1_epi32(ColS); uint8 *Row = (uint8 *)Raster->OriginalBuffer; uint64 bytes = 0; uint16 
ByteOffset = Raster->BytesPerPixel; if (InstructionMode == avx_enabled) ByteOffset = 8*Raster->BytesPerPixel; else if (InstructionMode == avx_enabled) ByteOffset = 4*Raster->BytesPerPixel; uint64 TotalBytes = Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel; while (bytes < TotalBytes) { uint8 *Pixel = Row + bytes; if (InstructionMode == avx_enabled) { _mm256_storeu_si256((__m256i *)Pixel, Col8); } else if (InstructionMode == sse_enabled) { _mm_storeu_si128((__m128i *)Pixel, Col); } else { *(uint32 *)Pixel = ColS; } bytes += ByteOffset; } } internal void DebugBitmap(pixel_buffer *Raster) { uint8 asda = 0x0; uint8 *Row = ((uint8 *)Raster->OriginalBuffer); real32 XInc = 255.0f / Raster->Width; real32 YInc = 255.0f / Raster->Height; for (uint8 Y = 0; Y < Raster->Height; Y++) { for (uint8 X = 0; X < Raster->Width; X++) { uint8 *Pixel = (uint8 *)Row + Raster->FullWidth*Y*4 + X*4; // *(uint32 *)Pixel = 0xffffffff; if (Y > 3) { asda = 0xff; } *(uint32 *)Pixel = ((0xff << 24) | (asda << 16) | (RoundReal32ToInt32((YInc * Y)) << 8) | (RoundReal32ToInt32((XInc * X))) ); } } }