diff options
-rw-r--r-- | bitmap_calls.cpp | 238 | ||||
-rwxr-xr-x | build.bat | 4 | ||||
-rwxr-xr-x | build.sh | 2 | ||||
-rw-r--r-- | createcalls.cpp | 502 | ||||
-rw-r--r-- | debug.h | 2 | ||||
-rw-r--r-- | effects.cpp | 4 | ||||
-rw-r--r-- | main.cpp | 75 | ||||
-rw-r--r-- | main.h | 18 | ||||
-rw-r--r-- | my_imgui_widgets.cpp | 20 | ||||
-rw-r--r-- | prenderer.cpp | 379 | ||||
-rw-r--r-- | threading.cpp | 19 | ||||
-rw-r--r-- | video.cpp | 23 |
12 files changed, 653 insertions, 633 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp
new file mode 100644
index 0000000..2cdb463
--- /dev/null
+++ b/bitmap_calls.cpp
@@ -0,0 +1,238 @@
+// NOTE(fox): Pay attention to how the Y pitch differs between the unpacked
+// bitmaps and the 4x4 packed bitmaps, since odd-sized bitmaps are padded.
+//
+// Unpacked (linear) bitmaps are tightly packed rows of Width pixels, while the
+// 4x4 packed layout is addressed with FullWidth. Every routine in this file
+// moves one uint32 per pixel, i.e. it assumes BytesPerPixel == 4.
+
+// Returns the pixel index (not the byte offset) of pixel (X, Y) inside the 4x4
+// packed layout: each chunk holds 16 consecutive pixels (4 rows of 4), and a
+// row of chunks spans FullWidth*4 pixels.
+internal uint64
+Chunk4x4PixelIndex(pixel_buffer *Buffer, uint32 X, uint32 Y)
+{
+    uint64 XLookup = (uint64)(X >> 2)*16 + (X % 4);
+    uint64 YLookup = (uint64)(Y >> 2)*((uint64)Buffer->FullWidth*4) + (Y % 4)*4;
+    return XLookup + YLookup;
+}
+
+// Copies exactly TotalBytes bytes from Src to Dst. 16-byte blocks are used when
+// SSE or AVX is enabled; the tail is finished with 4-byte and then 1-byte
+// copies so nothing is ever read or written past TotalBytes.
+internal void
+CopyPixelBytes(uint8 *Dst, uint8 *Src, uint64 TotalBytes)
+{
+    uint64 bytes = 0;
+    if (InstructionMode == sse_enabled || InstructionMode == avx_enabled) {
+        while (bytes + 16 <= TotalBytes) {
+            __m128i Block = _mm_loadu_si128((__m128i *)(Src + bytes));
+            _mm_storeu_si128((__m128i *)(Dst + bytes), Block);
+            bytes += 16;
+        }
+    }
+    while (bytes + 4 <= TotalBytes) {
+        *(uint32 *)(Dst + bytes) = *(uint32 *)(Src + bytes);
+        bytes += 4;
+    }
+    while (bytes < TotalBytes) {
+        Dst[bytes] = Src[bytes];
+        bytes++;
+    }
+}
+
+// Writes Value to every 4-byte pixel in the first TotalBytes bytes of Dst,
+// using the widest store allowed by InstructionMode. The wide constants are
+// only built inside the branch that uses them, and the scalar loop finishes
+// whatever the wide loop could not cover without overrunning the buffer.
+internal void
+FillPixelBytes(uint8 *Dst, uint64 TotalBytes, uint32 Value)
+{
+    uint64 bytes = 0;
+    if (InstructionMode == avx_enabled) {
+        __m256i Wide8 = _mm256_set1_epi32((int)Value);
+        while (bytes + 32 <= TotalBytes) {
+            _mm256_storeu_si256((__m256i *)(Dst + bytes), Wide8);
+            bytes += 32;
+        }
+    } else if (InstructionMode == sse_enabled) {
+        __m128i Wide4 = _mm_set1_epi32((int)Value);
+        while (bytes + 16 <= TotalBytes) {
+            _mm_storeu_si128((__m128i *)(Dst + bytes), Wide4);
+            bytes += 16;
+        }
+    }
+    while (bytes + 4 <= TotalBytes) {
+        *(uint32 *)(Dst + bytes) = Value;
+        bytes += 4;
+    }
+}
+
+// TODO(fox): I could write an AVX version of this function, but it may not be
+// that much faster since we have to do a bit of uninterleaving.
+
+// Converts between the linear layout and the 4x4 packed layout, reading from
+// OriginalBuffer and writing to EffectBuffer.
+// 0 - store in 4x4 chunks
+// 1 - unpack to 1xwidth
+internal void
+Convert4x4Chunk(pixel_buffer *Buffer, uint8 Which)
+{
+    uint8 *Src = (uint8 *)Buffer->OriginalBuffer;
+    uint8 *Dst = (uint8 *)Buffer->EffectBuffer;
+    bool32 UseSIMD = (InstructionMode == sse_enabled || InstructionMode == avx_enabled);
+    // Columns past this point do not form a full group of 4 pixels and are
+    // always moved one pixel at a time.
+    uint32 AlignedWidth = Buffer->Width - (Buffer->Width % 4);
+    for (uint32 Y = 0; Y < Buffer->Height; Y++) {
+        uint32 X = 0;
+        while (X < Buffer->Width) {
+            uint64 PackedOffset = Chunk4x4PixelIndex(Buffer, X, Y)*Buffer->BytesPerPixel;
+            uint64 LinearOffset = (uint64)Y*Buffer->Width*4 + (uint64)X*Buffer->BytesPerPixel;
+            uint8 *Pixel  = Src + ((Which == 0) ? LinearOffset : PackedOffset);
+            uint8 *DPixel = Dst + ((Which == 0) ? PackedOffset : LinearOffset);
+
+            if (UseSIMD && X < AlignedWidth) {
+                // X is a multiple of 4 here, so these 4 pixels are one row of
+                // a chunk and are contiguous in both layouts.
+                __m128i Row = _mm_loadu_si128((__m128i *)Pixel);
+                _mm_storeu_si128((__m128i *)DPixel, Row);
+                X += 4;
+            } else {
+                *(uint32 *)DPixel = *(uint32 *)Pixel;
+                X++;
+            }
+        }
+    }
+}
+
+// TODO(fox): Replace this in the future.
+// Allocates a FullWidth x FullHeight scratch bitmap and copies the tightly
+// packed Width x Height pixels of Input to its start. Returns the new bitmap;
+// Input is not freed here.
+internal void *
+MoveImportToBitmap(memory *Memory, pixel_buffer *Raster, void *Input)
+{
+    void *Output = AllocateMemory(Memory, Raster->FullWidth * Raster->FullHeight * Raster->BytesPerPixel, B_Scratch);
+
+    // Widen before multiplying so large images cannot overflow int.
+    uint64 TotalBytes = (uint64)Raster->Height*Raster->Width*Raster->BytesPerPixel;
+
+    // Input is null when the importer failed; skip the copy instead of
+    // dereferencing it.
+    if (Input && Output) {
+        CopyPixelBytes((uint8 *)Output, (uint8 *)Input, TotalBytes);
+    }
+    return Output;
+}
+
+// Zeroes the whole padded (FullWidth x FullHeight) bitmap that Buffer points
+// to; Raster only supplies the dimensions.
+internal void
+ClearBuffer(pixel_buffer *Raster, void *Buffer)
+{
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+    FillPixelBytes((uint8 *)Buffer, TotalBytes, 0x00000000);
+}
+
+// Copies the whole padded bitmap between the two buffers of Raster.
+// 0 - original -> effect
+// 1 - effect -> original
+internal void
+CopyToBuffer(pixel_buffer *Raster, uint16 Which)
+{
+    uint8 *From, *To;
+    if (Which == 0) {
+        From = (uint8 *)Raster->OriginalBuffer;
+        To   = (uint8 *)Raster->EffectBuffer;
+    } else {
+        From = (uint8 *)Raster->EffectBuffer;
+        To   = (uint8 *)Raster->OriginalBuffer;
+    }
+
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+    CopyPixelBytes(To, From, TotalBytes);
+}
+
+// Converts the linear pixels in OriginalBuffer to the 4x4 packed layout in
+// place: pack into EffectBuffer, copy the result back, then clear EffectBuffer.
+internal void
+BitmapPackRGB(pixel_buffer *Buffer) {
+    Assert(Buffer->Pitch);
+    Convert4x4Chunk(Buffer, 0);
+    CopyToBuffer(Buffer, 1);
+    ClearBuffer(Buffer, Buffer->EffectBuffer);
+}
+
+// Moves the composited frame into EffectBuffer, ends the render state and
+// uploads EffectBuffer to the given GL texture.
+internal void
+OutputToViewport(pixel_buffer *CompBuffer, project_state *State, GLuint textureID) {
+    // D selects between unpacking the 4x4 chunks and a plain copy.
+    // NOTE(review): the plain copy uploads the buffer without unpacking, which
+    // looks like a debug view of the raw layout -- confirm.
+    if (D)
+        Convert4x4Chunk(CompBuffer, 1);
+    else
+        CopyToBuffer(CompBuffer, 0);
+    EndRenderState(State);
+    glBindTexture(GL_TEXTURE_2D, textureID);
+    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE,
+                    CompBuffer->EffectBuffer);
+}
+
+// Fills the whole padded OriginalBuffer with a single color.
+internal void
+DebugFillSolid(pixel_buffer *Raster, v4 Color)
+{
+    uint32 ColS = ColToUint32(Color);
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+    FillPixelBytes((uint8 *)Raster->OriginalBuffer, TotalBytes, ColS);
+}
+
+// Writes a test pattern into OriginalBuffer in the linear layout: the lowest
+// byte ramps 0..255 across X, the next byte ramps 0..255 down Y, the third byte
+// is 0x00 for rows 0-3 and 0xff afterwards, and the top byte is always 0xff.
+internal void
+DebugBitmap(pixel_buffer *Raster)
+{
+    uint8 *Base = (uint8 *)Raster->OriginalBuffer;
+    real32 XInc = 255.0f / Raster->Width;
+    real32 YInc = 255.0f / Raster->Height;
+    // 32-bit counters: 8-bit ones wrap around and never reach dimensions
+    // above 255.
+    for (uint32 Y = 0; Y < Raster->Height; Y++) {
+        uint32 Marker = (Y > 3) ? 0xffu : 0x00u;
+        uint32 YVal = (uint32)RoundReal32ToInt32(YInc * Y) & 0xffu;
+        for (uint32 X = 0; X < Raster->Width; X++) {
+            uint32 XVal = (uint32)RoundReal32ToInt32(XInc * X) & 0xffu;
+            // Rows are Width pixels apart, matching what Convert4x4Chunk
+            // reads when it packs this buffer.
+            uint8 *Pixel = Base + ((uint64)Y*Raster->Width + X)*4;
+            *(uint32 *)Pixel = ((0xffu << 24) |
+                                (Marker << 16) |
+                                (YVal << 8) |
+                                (XVal));
+        }
+    }
+}
@@ -6,5 +6,5 @@ REM call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxi
@set IMGUI_SOURCES=imgui\backends\imgui_impl_sdl.cpp imgui\backends\imgui_impl_opengl3.cpp imgui\imgui*.cpp
@set SDL_LIBS=/LIBPATH:%SDL2_DIR%\lib\x64 SDL2.lib SDL2main.lib opengl32.lib shell32.lib
@set FFMPEG_LIBS=/LIBPATH:%FFMPEG_DIR%\lib avcodec.lib avfilter.lib avformat.lib swscale.lib avutil.lib
-@set PREPROCESSORS=/DWINDOWS=1 /DARM=0 /DTHREADED=0 /DPACKEDRGB=1
-cl /nologo /Zi /MD %PREPROCESSORS% %INCLUDES% main.cpp %IMGUI_SOURCES% /Febin/real2d.exe /Fobin/ /link %SDL_LIBS% %FFMPEG_LIBS% /subsystem:console
\ No newline at end of file +@set PREPROCESSORS=/DWINDOWS=1 /DARM=0 /DTHREADED=0
+cl /nologo /Zi /MD %PREPROCESSORS% %INCLUDES% main.cpp %IMGUI_SOURCES% /Febin/real2d.exe /Fobin/ /link %SDL_LIBS% %FFMPEG_LIBS% /subsystem:console
@@ -8,7 +8,7 @@ WARNING_FLAGS=" -Wno-missing-field-initializers -Wno-sign-compare -Wno-write-strings -Wno-unused-but-set-parameter \ -Wno-missing-braces -Wno-format-security -fno-exceptions -Wno-strict-aliasing \ - -DDEBUG=1 -DARM=0 -DTHREADED=0 -DPACKEDRGB=1 \ + -DDEBUG=1 -DARM=0 -DTHREADED=0 \ " if [[ "$WINDOWS" == 1 ]]; then diff --git a/createcalls.cpp b/createcalls.cpp index e5ca18d..0dbf75c 100644 --- a/createcalls.cpp +++ b/createcalls.cpp @@ -7,22 +7,29 @@ IncrementFrame(project_data *File, int16 Amount) { } } +internal void +CalculateFull(pixel_buffer *Buffer) { + uint16 ExtraWidth = 4 - (Buffer->Width % 4); + if (ExtraWidth == 4) + ExtraWidth = 0; + uint16 ExtraHeight = 4 - (Buffer->Height % 4); + if (ExtraHeight == 4) + ExtraHeight = 0; + Buffer->FullWidth = Buffer->Width + ExtraWidth; + Buffer->FullHeight = Buffer->Height + ExtraHeight; +} internal pixel_buffer CreateBuffer(int Width, int Height, memory *Memory) { pixel_buffer Buffer = {}; Buffer.BytesPerPixel = 4; - Buffer.OriginalBuffer = AllocateMemory(Memory, Width * Height * Buffer.BytesPerPixel, B_Scratch); - Buffer.EffectBuffer = AllocateMemory(Memory, Width * Height * Buffer.BytesPerPixel, B_Scratch); Buffer.Width = Width; Buffer.Height = Height; -#if PACKEDRGB - Buffer.Pitch = Buffer.Width*Buffer.BytesPerPixel; -#else - Buffer.Pitch = Buffer.Width; // each row has only 1 byte, 8 bits, per pixel - Buffer.Channel = Buffer.Width*Buffer.Height; -#endif + CalculateFull(&Buffer); + Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; + Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch); + Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch); Buffer.ToUpdate = true; return Buffer; } @@ -40,299 +47,6 @@ AddSource(project_data *File, memory *Memory, char *Path) } internal pixel_buffer -CreateDebugBitmap(int16 Width, int16 Height, memory *Memory) -{ - pixel_buffer Raster = 
CreateBuffer(Width, Height, Memory); - uint32 Channel = (Raster.Width * Raster.Height); - uint8 inc = 0; - uint8 incY = 0; - for (uint32 Y = 0; Y < Raster.Height; Y+=1) { - for (uint32 X = 0; X < Raster.Width; X+=1) { -#if PACKEDRGB - uint8 *Pix = ((uint8 *)Raster.OriginalBuffer + (Raster.Pitch*Y) + X*Raster.BytesPerPixel); - uint32 *Pixel = (uint32 *)Pix; - *Pixel = ( - (X << 0) | - (Y << 8) | - (0xaa << 16) | - (0xff << 24)); - inc++; -#else - uint8 *Pix = ((uint8 *)Raster.OriginalBuffer + (Raster.Pitch*Y) + X); - uint8 *Pix2 = ((uint8 *)Raster.OriginalBuffer + Channel + (Raster.Pitch*Y) + X); - uint8 *Pix3 = ((uint8 *)Raster.OriginalBuffer + Channel*2 + (Raster.Pitch*Y) + X); - uint8 *PixA = ((uint8 *)Raster.OriginalBuffer + Channel*3 + (Raster.Pitch*Y) + X); - // if (X == 0 && Y == 1) { - // *Pix++ = 0xaa; - // inc++; - // } else if (X == 0 && Y == 2) { - // *Pix++ = 0xbb; - // inc++; - // } else if (X == 0 && Y == 3) { - // *Pix++ = 0xcc; - // inc++; - // } else { - *Pix++ = 16*inc++; - *Pix2++ = 16*incY; - *Pix3++ = 0xaa; - *PixA++ = 0xff; - // } -#endif - } - incY++; - } - return Raster; -} - -internal void -ClearBuffer(pixel_buffer *Buffer) -{ - uint8 *Row = ((uint8 *)Buffer->OriginalBuffer); - for(int Y = 0; - Y < Buffer->Height; - ++Y) - { - uint32 *Pixel = (uint32 *)Row; - for(int X = 0; - X < Buffer->Width; - ++X) - { - *(uint32 *)Pixel++ = 0x00000000; - } - Row += Buffer->Pitch; - } -} - -#if PACKEDRGB -internal void -Unpack4x4Chunk(pixel_buffer *Buffer) -{ - uint8 *Src = (uint8 *)Buffer->OriginalBuffer; - uint8 *Temp = (uint8 *)Buffer->EffectBuffer; - uint32 bytes = 0; - for (uint32 Y = 0; Y < Buffer->Height; Y+=4) { - uint8 *DPixel1 = Temp + Y*Buffer->Pitch; - uint8 *DPixel2 = Temp + (Y+1)*Buffer->Pitch; - uint8 *DPixel3 = Temp + (Y+2)*Buffer->Pitch; - uint8 *DPixel4 = Temp + (Y+3)*Buffer->Pitch; - for (uint32 X = 0; X < Buffer->Width; X+=4) { - uint8 *Pixel1 = Src + bytes; - uint8 *Pixel2 = Pixel1 + 4*Buffer->BytesPerPixel; - uint8 *Pixel3 = Pixel1 
+ 4*Buffer->BytesPerPixel*2; - uint8 *Pixel4 = Pixel1 + 4*Buffer->BytesPerPixel*3; - - __m128i Row1 = _mm_loadu_si128((__m128i *)Pixel1); - __m128i Row2 = _mm_loadu_si128((__m128i *)Pixel2); - __m128i Row3 = _mm_loadu_si128((__m128i *)Pixel3); - __m128i Row4 = _mm_loadu_si128((__m128i *)Pixel4); - _mm_storeu_si128((__m128i *)DPixel1, Row1); - DPixel1 += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel2, Row2); - DPixel2 += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel3, Row3); - DPixel3 += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel4, Row4); - DPixel4 += 4*Buffer->BytesPerPixel; - - bytes += 16*Buffer->BytesPerPixel; - } - } -} -internal void -Store4x4Chunk(pixel_buffer *Buffer) -{ -#if 1 - uint8 *Src = (uint8 *)Buffer->OriginalBuffer; - uint8 *Temp = (uint8 *)Buffer->EffectBuffer; - for (uint32 Y = 0; Y+4 < Buffer->Height; Y+=4) { - uint8 *DPixel = Temp + Y*Buffer->Pitch; - for (uint32 X = 0; X < Buffer->Width; X+=4) { - uint8 *Pixel1 = Src + Y*Buffer->Pitch + X*Buffer->BytesPerPixel; - uint8 *Pixel2 = Pixel1 + Buffer->Pitch; - uint8 *Pixel3 = Pixel1 + Buffer->Pitch*2; - uint8 *Pixel4 = Pixel1 + Buffer->Pitch*3; - - // NOTE(fox): Remember this is RGB packed, so 128-bit registers hold 4 pixels. - - __m128i Row1 = _mm_loadu_si128((__m128i *)Pixel1); - __m128i Row2 = _mm_loadu_si128((__m128i *)Pixel2); - __m128i Row3 = _mm_loadu_si128((__m128i *)Pixel3); - __m128i Row4 = _mm_loadu_si128((__m128i *)Pixel4); - _mm_storeu_si128((__m128i *)DPixel, Row1); - DPixel += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel, Row2); - DPixel += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel, Row3); - DPixel += 4*Buffer->BytesPerPixel; - _mm_storeu_si128((__m128i *)DPixel, Row4); - DPixel += 4*Buffer->BytesPerPixel; - } - // TODO(fox): Clear the last row if the buffer isn't divisible by 4. 
- } -#else - for (uint32 Y = 0; Y < Buffer->Height; Y+=1) { - uint8 *DPixel = Temp + Y*Buffer->Pitch; - for (uint32 X = 0; X < Buffer->Width; X+=1) { - uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4; - uint32 PixelToSeek = XLookup + YLookup; - uint32 Pixel = *(uint32 *)((uint8 *)Buffer->EffectBuffer + PixelToSeek*Buffer->BytesPerPixel); - uint8 Xp = Pixel & 0xFF; - uint8 Yp = (Pixel >> 8) & 0xFF; - printf("X %u, Y %u, val: %i, %i\n", X, Y, Xp, Yp); - } - } - __m256i PixelX0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - __m256i FF = _mm256_set1_epi32(8); - uint8 *Src = (uint8 *)Buffer->EffectBuffer; - for (int i = 0; i < 16; i++) { - _mm256_storeu_si256((__m256i *)Src, PixelX0); - Src += 32; - PixelX0 = _mm256_add_epi32(PixelX0, FF); - } - uint32 Width = 3; - for (uint32 Y = 0; Y < 4*2; Y++) { - for (uint32 X = 0; X < 4*3; X++) { - uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(Width*16) + (Y % 4)*4; - uint32 PixelToSeek = XLookup + YLookup; - uint32 Pixel = *((uint8 *)Buffer->EffectBuffer + PixelToSeek*Buffer->BytesPerPixel); - printf("X %u, Y %u, %i\n", X, Y, Pixel); - } - } - Assert(0); -#endif -} -#else -internal void -PackBitmapRGB(pixel_buffer *Buffer) -{ - uint8 *Row = (uint8 *)Buffer->OriginalBuffer; - uint8 *PackedRow = (uint8 *)Buffer->EffectBuffer; - for (uint32 Y = 0; Y < Buffer->Height; Y++) { - uint32 *Pixel = (uint32 *)PackedRow; - for (uint32 X = 0; X < Buffer->Width; X++) { - uint8 *ValR = (uint8 *)Row + X; - // if (X > 16 && Y > 16) { - // Assert(*ValR == 0); - // } - uint8 *ValG = ValR + Buffer->Channel; - uint8 *ValB = ValR + Buffer->Channel*2; - uint8 *ValA = ValR + Buffer->Channel*3; - - *Pixel = ( - (*ValR << 0) | - (*ValG << 8) | - (*ValB << 16) | - (*ValA << 24)); - - Pixel++; - } - Row += Buffer->Pitch; - PackedRow += Buffer->Pitch*Buffer->BytesPerPixel; - } -} -// TODO(fox): Libav only exports GBRA array frames for some reason; see if you -// can mod the source if 
you end up not using packed RGB. -internal void -Libav_GBRAToRGBA(pixel_buffer *Raster) -{ - uint8 *Row = ((uint8 *)Raster->OriginalBuffer); - uint32 bytes = 0; - __m128i Zero = _mm_setzero_si128(); - while (bytes <= Raster->Height*Raster->Width) { - uint8 *ChannelG = (uint8 *)Row + bytes; - uint8 *ChannelB = (uint8 *)Row + bytes + Raster->Channel; - uint8 *ChannelR = (uint8 *)Row + bytes + Raster->Channel*2; - __m128i RegG = _mm_loadu_si128((__m128i *)ChannelG); - __m128i RegB = _mm_loadu_si128((__m128i *)ChannelB); - __m128i RegR = _mm_loadu_si128((__m128i *)ChannelR); - _mm_storeu_si128((__m128i *)ChannelG, RegR); - _mm_storeu_si128((__m128i *)ChannelB, RegG); - _mm_storeu_si128((__m128i *)ChannelR, RegB); - bytes += 16; - } -} -#endif - - - -// 0 - original -// 1 - effect -// 2 - both -internal void -SSE_ClearBuffer(pixel_buffer *Raster, uint16 Which = 2) -{ - uint8 *Row = ((uint8 *)Raster->OriginalBuffer); - uint8 *Row2 = ((uint8 *)Raster->EffectBuffer); - uint32 bytes = 0; - __m128i Zero = _mm_setzero_si128(); - while (bytes <= Raster->Height*Raster->Width*4) { - if (Which == 2 || Which == 0) { - uint8 *Pixel = (uint8 *)Row + bytes; - _mm_storeu_si128((__m128i *)Pixel, Zero); - } - if (Which == 2 || Which == 1) { - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - _mm_storeu_si128((__m128i *)Pixel2, Zero); - } - bytes += 16; - } -} - -// 0 - original -> effect -// 1 - effect -> original -internal void -SSE_CopyToBuffer(pixel_buffer *Raster, uint16 Which) -{ - uint8 *Row = ((uint8 *)Raster->OriginalBuffer); - uint8 *Row2 = ((uint8 *)Raster->EffectBuffer); - uint32 bytes = 0; - while (bytes <= Raster->Height*Raster->Width*4) { - uint8 *Pixel = (uint8 *)Row + bytes; - uint8 *Pixel2 = (uint8 *)Row2 + bytes; - if (Which == 0) { - __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel); - _mm_storeu_si128((__m128i *)Pixel2, OutputPixel); - } else { - __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel2); - _mm_storeu_si128((__m128i *)Pixel, OutputPixel); - } - bytes += 
16; - } -} - -internal void -DebugFillSolid(pixel_buffer *Raster, v4 Color) -{ - __m128i Col = _mm_set1_epi32(ColToUint32(Color)); - uint8 *Row = ((uint8 *)Raster->OriginalBuffer); - uint32 bytes = 0; - while (bytes <= Raster->Height*Raster->Width*4) { - uint8 *Pixel = (uint8 *)Row + bytes; - _mm_storeu_si128((__m128i *)Pixel, Col); - bytes += 16; - } -} - -internal void -BitmapPackRGB(pixel_buffer *Buffer) { -#if PACKEDRGB - Buffer->Pitch = Buffer->Width*Buffer->BytesPerPixel; -#else - Buffer->Pitch = Buffer->Width; // each row has only 1 byte, 8 bits, per pixel - Buffer->Channel = Buffer->Width*Buffer->Height; -#endif -#if PACKEDRGB - Store4x4Chunk(Buffer); - SSE_CopyToBuffer(Buffer, 1); - SSE_ClearBuffer(Buffer, 1); -#else - Libav_GBRAToRGBA(Buffer); -#endif -} - -internal pixel_buffer LoadImage(memory *Memory, char *filename) { pixel_buffer Buffer = {}; @@ -340,11 +54,16 @@ LoadImage(memory *Memory, char *filename) int n = 0; int h, w; - Buffer.OriginalBuffer = stbi_load(filename, &w, &h, &n, 4); - Buffer.EffectBuffer = AllocateMemory(Memory, w * h * Buffer.BytesPerPixel, B_Scratch); + void *temp = stbi_load(filename, &w, &h, &n, 4); + // printf("%s", stbi_failure_reason()); Buffer.Height = h; Buffer.Width = w; - // printf("%s", stbi_failure_reason()); + CalculateFull(&Buffer); + Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; + // TODO(fox): Implement custom malloc in stbi so we don't have to do this. 
+ Buffer.OriginalBuffer = MoveImportToBitmap(Memory, &Buffer, temp); + stbi_image_free(temp); + Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch); BitmapPackRGB(&Buffer); Buffer.ToUpdate = true; return Buffer; @@ -354,16 +73,34 @@ internal pixel_buffer CreateSolidBitmap(memory *Memory, uint16 Height, uint16 Width, v4 Color) { pixel_buffer Buffer = {}; Buffer.BytesPerPixel = 4; - Buffer.OriginalBuffer = AllocateMemory(Memory, Height * Width * Buffer.BytesPerPixel, B_Scratch); - Buffer.EffectBuffer = AllocateMemory(Memory, Height * Width * Buffer.BytesPerPixel, B_Scratch); Buffer.Height = Height; Buffer.Width = Width; + CalculateFull(&Buffer); + Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; + Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); + Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); DebugFillSolid(&Buffer, Color); BitmapPackRGB(&Buffer); Buffer.ToUpdate = true; return Buffer; } +internal pixel_buffer +CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width) { + pixel_buffer Buffer = {}; + Buffer.BytesPerPixel = 4; + Buffer.Height = Height; + Buffer.Width = Width; + CalculateFull(&Buffer); + Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; + Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); + Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); + DebugBitmap(&Buffer); + BitmapPackRGB(&Buffer); + Buffer.ToUpdate = true; + return Buffer; +} + internal void DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memory *Memory, sdl_input Input, project_state *State, @@ -435,144 +172,6 @@ DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memor } } -internal void 
-DebugBitmap(pixel_buffer *Raster) -{ -#if 0 - for (uint32 Y = 0; Y < Raster->Height; Y+=2) { - for (uint32 X = 0; X < Raster->Width; X+=32) { - for (uint32 pp = 0; pp < 4; pp++) { - uint32 Increment = ((uint32)Raster->Width*Y*4) + X + pp*8; - uint32 Increment2 = ((uint32)Raster->Width*(Y+1)*4) + X + pp*8; - uint8 *TexPTR = ((uint8 *)Raster->OriginalBuffer + Increment); - uint8 *TexPTR2 = ((uint8 *)Raster->OriginalBuffer + Increment2); - uint8 *TexPTR3 = ((uint8 *)Raster->OriginalBuffer + Increment + 4); - uint8 *TexPTR4 = ((uint8 *)Raster->OriginalBuffer + Increment2 + 4); - if (pp == 0) { - // *(uint32 *)TexPTR = 0x5f5e5d5c; - // *(uint32 *)TexPTR2 = 0x4f4e4d4c; - // *(uint32 *)TexPTR3 = 0x3f3e3d3c; - // *(uint32 *)TexPTR4 = 0x2f2e2d2c; - // *(uint32 *)TexPTR3 = 0xaaaaaaaa; - // *(uint32 *)TexPTR4 = 0xaaaaaaaa; - *(uint32 *)TexPTR = 0xcccaccc1; - *(uint32 *)TexPTR2 = 0xdddaddd1; - *(uint32 *)TexPTR3 = 0xeeeaeee1; - *(uint32 *)TexPTR4 = 0xfffafff1; - } else if (pp == 1) { - // *(uint32 *)TexPTR = 0xb2a2b1a1; - // *(uint32 *)TexPTR = 0xd2c2d1c1; - // *(uint32 *)TexPTR3 = 0xbbaabbaa; - // *(uint32 *)TexPTR4 = 0xddccddcc; - *(uint32 *)TexPTR = 0xccccccc2; - *(uint32 *)TexPTR2 = 0xddddddd2; - *(uint32 *)TexPTR3 = 0xeeeeeee2; - *(uint32 *)TexPTR4 = 0xfffffff2; - } else if (pp == 2) { - *(uint32 *)TexPTR = 0xccccccc3; - *(uint32 *)TexPTR2 = 0xddddddd3; - *(uint32 *)TexPTR3 = 0xeeeeeee3; - *(uint32 *)TexPTR4 = 0xfffffff3; - } else { - *(uint32 *)TexPTR = 0xccccccc4; - *(uint32 *)TexPTR2 = 0xddddddd4; - *(uint32 *)TexPTR3 = 0xeeeeeee4; - *(uint32 *)TexPTR4 = 0xfffffff4; - } - } - } - } -#endif -#if 0 - uint32 Channel = (Raster->Width * Raster->Height)*4; - for (uint32 Y = 0; Y < Raster->Height; Y+=2) { - for (uint32 X = 0; X < Raster->Width; X+=2) { - uint8 *TopL = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*Y*4) + X); - uint8 *TopL2 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*Y*4) + X + Channel); - uint8 *TopL3 = ((uint8 *)Raster->OriginalBuffer + 
(Raster->Width*Y*4) + X + Channel*2); - uint8 *TopR = TopL + 1; - uint8 *TopR2 = TopL2 + 1; - uint8 *TopR3 = TopL3 + 1; - uint8 *BotL = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X); - uint8 *BotL2 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X + Channel); - uint8 *BotL3 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X + Channel*2); - uint8 *BotR = BotL + 1; - uint8 *BotR2 = BotL2 + 1; - uint8 *BotR3 = BotL3 + 1; - - *TopL = 0xff; - *TopL2 = 0x00; - *TopL3 = 0x00; - *TopR = 0xcc; - *TopR2 = 0xff; - *TopR3 = 0x00; - *BotL = 0x55; - *BotL2 = 0x00; - *BotL3 = 0xff; - *BotR = 0x00; - *BotR2 = 0xff; - *BotR3 = 0xff; - } - } -#endif -#if 1 - uint32 Channel = (Raster->Width * Raster->Height); - uint32 Width = 10; - uint8 inc = 0; - uint8 incY = 0; - for (uint32 Y = 0; Y < Raster->Height; Y+=1) { - for (uint32 X = 0; X < Width; X+=1) { - uint8 *Pix = ((uint8 *)Raster->OriginalBuffer + (Raster->Pitch*Y) + X); - uint8 *Pix2 = ((uint8 *)Raster->OriginalBuffer + Channel + (Raster->Pitch*Y) + X); - // if (X == 0 && Y == 1) { - // *Pix++ = 0xaa; - // inc++; - // } else if (X == 0 && Y == 2) { - // *Pix++ = 0xbb; - // inc++; - // } else if (X == 0 && Y == 3) { - // *Pix++ = 0xcc; - // inc++; - // } else { - *Pix++ = inc++; - *Pix2++ = incY; - // } - } - incY++; - } -#endif - - for (uint32 Y = 0; Y < Raster->Height; Y+=2) { - for (uint32 X = 0; X < Raster->Width; X+=32) { - uint32 Channel = (Raster->Width * Raster->Height)*4; - for (int16 i = 0; i < 4; i++) { - uint32 Increment = (Raster->Width*Y*4) + X + Channel*i; - uint32 Increment2 = (Raster->Width*(Y+1)*4) + X + Channel*i; - uint8 *TexPTR = ((uint8 *)Raster->OriginalBuffer + Increment); - uint8 *Pixel = ((uint8 *)Raster->EffectBuffer + Increment); - uint8 *TexPTR2 = ((uint8 *)Raster->OriginalBuffer + Increment2); - uint8 *Pixel2 = ((uint8 *)Raster->EffectBuffer + Increment2); - __m256i T1 = _mm256_loadu_si256((__m256i *)TexPTR); - __m256i T2 = _mm256_loadu_si256((__m256i 
*)TexPTR2); - __m256i pp = _mm256_unpackhi_epi16(T1, T2); - __m256i pp2 = _mm256_unpacklo_epi16(T1, T2); - __m256i pp3 = _mm256_unpacklo_epi64(pp2, pp); - __m256i pp4 = _mm256_unpackhi_epi64(pp2, pp); - __m256i T4 = _mm256_permute2x128_si256(pp2, pp, 32); - __m256i T5 = _mm256_permute2x128_si256(pp2, pp, 53); - _mm256_storeu_si256((__m256i *)Pixel, T1); - _mm256_storeu_si256((__m256i *)Pixel2, T2); - } - } - } - // _mm256_unpackhi_epi8 - // for (int Y = 0; Y < Raster.Height; Y+=2) { - // for (int X = 0; X < Raster.Width; X+=2) { - // uint8 *Row = ((uint8 *)UIBuffer->OriginalBuffer + - // } - // } -} - internal property_channel InitFloatProperty(char *Name, real32 Val, real32 ScrubVal, real32 MinVal = PROPERTY_REAL_MIN, real32 MaxVal = PROPERTY_REAL_MAX) { property_channel Property = {}; @@ -700,6 +299,17 @@ CreateSolidLayer(project_data *File, memory *Memory, uint16 Width, uint16 Height return Layer; } +internal project_layer * +CreateDebugLayer(project_data *File, memory *Memory, uint16 Width, uint16 Height) +{ + project_layer *Layer = CreateLayer(File, Memory); + Layer->RenderInfo = AllocateMemory(Memory, sizeof(image_source), P_SourceData); + image_source *Source = (image_source *)Layer->RenderInfo; + Source->Raster = CreateDebugBitmap(Memory, Width, Height); + Layer->SourceType = source_image; + return Layer; +} + internal void CreateDemoScene(project_data *File, memory *Memory) { @@ -708,7 +318,7 @@ CreateDemoScene(project_data *File, memory *Memory) Layer1->y.CurrentValue.f = 720/2; Layer1->StartFrame = 0; Layer1->EndFrame = File->EndFrame; - project_layer *Layer2 = CreateSolidLayer(File, Memory, 500, 500, V4(0.0, 1.0, 0.4, 1.0)); + project_layer *Layer2 = CreateSolidLayer(File, Memory, 499, 503, V4(0.0, 1.0, 0.4, 1.0)); Layer2->x.CurrentValue.f = 1280/2; Layer2->y.CurrentValue.f = 720/2; Layer2->StartFrame = 0; @@ -717,7 +327,7 @@ CreateDemoScene(project_data *File, memory *Memory) ManualKeyframeInsertF(&Layer2->rotation, Memory, 50, 360); 
Layer2->rotation.IsToggled = true; Layer2->scale.IsToggled = true; - project_layer *Layer3 = CreateSolidLayer(File, Memory, 160, 160, V4(1.0, 0.3, 0.2, 1.0)); + project_layer *Layer3 = CreateSolidLayer(File, Memory, 157, 163, V4(1.0, 0.3, 0.2, 1.0)); Layer3->x.CurrentValue.f = 1280/4; Layer3->y.CurrentValue.f = 720/4; Layer3->opacity.CurrentValue.f = 0.5f; @@ -28,6 +28,8 @@ struct project_debug char *String[6]; uint32 WatchedProperties; bool32 ToggleWindow; + + bool32 ToggleRenders; }; global_variable project_debug Debug; diff --git a/effects.cpp b/effects.cpp index 5532a97..733e4d6 100644 --- a/effects.cpp +++ b/effects.cpp @@ -286,7 +286,7 @@ AddEffect(project_layer *Layer, memory *Memory, uint16 EffectListIndex) } internal void -SSE_CopyToBuffer(pixel_buffer *, uint16 asda = 0); +CopyToBuffer(pixel_buffer *, uint16 asda = 0); internal void UpdateEffects(project_layer *Layer, memory *Memory) @@ -296,7 +296,7 @@ UpdateEffects(project_layer *Layer, memory *Memory) Source->Raster.EffectBuffer = AllocateMemory(Memory, Source->Raster.Width * Source->Raster.Height * Source->Raster.BytesPerPixel, B_Scratch); } - SSE_CopyToBuffer(&Source->Raster); + CopyToBuffer(&Source->Raster); for (int i = 0; i < Layer->NumberOfEffects; i++) { if (Layer->Effect[i]->IsActive) @@ -105,7 +105,9 @@ global_variable uint32 volatile CompletedJobs; global_variable uint32 volatile NextEntryToDo; global_variable uint32 volatile EntryCount; global_variable bool32 IsRendering = false; -global_variable bool32 AVXEnabled = true; +global_variable bool32 D = true; +global_variable instruction_mode InstructionMode = scalar_only; + render_entry Entries[256]; @@ -122,6 +124,7 @@ SDL_sem *Semaphore; #endif #include "prenderer.cpp" #include "video.cpp" +#include "bitmap_calls.cpp" #include "createcalls.cpp" #include "my_imgui_widgets.cpp" @@ -132,7 +135,8 @@ MainFunction(main_sdl *Main, memory *Memory, project_state *State, project_data *File, cache_pool *Cache, pixel_buffer *CompBuffer) { - 
SSE_ClearBuffer(CompBuffer); + ClearBuffer(CompBuffer, CompBuffer->OriginalBuffer); + ClearBuffer(CompBuffer, CompBuffer->EffectBuffer); for (int i = 0; i < File->NumberOfLayers; i++) { project_layer *Layer = File->Layer[i]; if (Layer->RenderInfo) { @@ -301,14 +305,16 @@ int main(int argc, char *argv[]) { InitMemoryTable(&GlobalMemory, &Memory, 10 * 1024 * 1024, F_Strings, "Strings"); InitMemoryTable(&GlobalMemory, &Memory, 1024 * 1024 * 1024, B_Scratch, "Scratch buffer"); - if (!SDL_HasAVX2()) { - AVXEnabled = false; - printf("CPU does not have AVX2!"); - return -1; - } + project_state State = {}; + if (SDL_HasSSE2()) { + InstructionMode = sse_enabled; + } + if (SDL_HasAVX2()) { + InstructionMode = avx_enabled; + } - project_state State = {}; + InstructionMode = scalar_only; project_data File = {}; File.Width = 1280; @@ -339,7 +345,6 @@ int main(int argc, char *argv[]) { // CreateLayerFromSource(&File, &State, &Memory, File.Source[0]); // CreateLayerFromSource(&File, &State, &Memory, File.Source[1]); -#if 1 // shm_unlink("/testl"); // int fd = shm_open("/testl", O_CREAT | O_EXCL | O_RDWR, // S_IRUSR | S_IWUSR); @@ -362,17 +367,18 @@ int main(int argc, char *argv[]) { // if (sem_init(&shmp->sem2, 1, 0) == -1) // Assert(0); - // CreateLayer(&File, &Memory); - // CreateRenderInfo(File.Layer[1], &Memory, File, video, "./asset/24.mp4"); - // File.Layer[1]->Name = "yuyu"; - // File.Layer[1]->StartFrame = 0; - // File.Layer[1]->EndFrame = 65; -#else - CreateDebugLayer(&File, &Memory, 12, 8); - File.Layer[0]->Name = "debug"; - File.Layer[0]->StartFrame = 0; - File.Layer[0]->EndFrame = 65; -#endif + // CreateLayerFromSource(&File, &State, &Memory, "../asset/24.mp4"); + // project_layer *Layer1 = CreateDebugLayer(&File, &Memory, 9, 14); + // project_layer *Layer1 = CreateSolidLayer(&File, &Memory, 9, 13, V4(1.0, 1.0, 1.0, 1.0)); + // Layer1->x.CurrentValue.f = 7; + // Layer1->y.CurrentValue.f = 4; + // Layer1->StartFrame = 0; + // Layer1->EndFrame = File.EndFrame; + + // 
CreateDebugLayer(&File, &Memory, 12, 8); + // File.Layer[0]->Name = "debug"; + // File.Layer[0]->StartFrame = 0; + // File.Layer[0]->EndFrame = 65; // CreateLayer(&File, &Memory); @@ -418,7 +424,7 @@ int main(int argc, char *argv[]) { // AddEffect(File.Layer[0], &Memory, 0); // AddEffect(File.Layer[0], &Memory, 0); - for (int i = 0; i < 3; i++) + // for (int i = 0; i < 3; i++) // CreateLayer(&File, &Memory); // DebugPrintMemoryUsage(Memory); @@ -472,8 +478,10 @@ int main(int argc, char *argv[]) { SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 24); SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 8); SDL_WindowFlags window_flags = (SDL_WindowFlags)(SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI); +#if DEBUG // uint32 ScreenSize[2] = {2560/1.2, 1600/1.2}; - // real32 ScreenSize[2] = {3840/1.2, 2160/1.2}; + real32 ScreenSize[2] = {3840/1.2, 2160/1.2}; +#else real32 ScreenSize[2]; SDL_DisplayMode current; int windowtest = SDL_GetCurrentDisplayMode(0, &current); @@ -484,6 +492,7 @@ int main(int argc, char *argv[]) { ScreenSize[0] = 1920; ScreenSize[1] = 1080; } +#endif SDL_Window* window = SDL_CreateWindow("Event Tester", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, ScreenSize[0], ScreenSize[1], window_flags); SDL_GLContext gl_context = SDL_GL_CreateContext(window); SDL_GL_MakeCurrent(window, gl_context); @@ -513,8 +522,8 @@ int main(int argc, char *argv[]) { GLuint textureID; glGenTextures(1, &textureID); glBindTexture(GL_TEXTURE_2D, textureID); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); // This is required on WebGL for non power-of-two textures glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); // Same #if defined(GL_UNPACK_ROW_LENGTH) && 
!defined(__EMSCRIPTEN__) @@ -577,6 +586,7 @@ int main(int argc, char *argv[]) { if (State.UpdateFrame && !IsRendering) { MainFunction(0, &Memory, &State, &File, &Cache, &CompBuffer); State.UpdateFrame = 0; + OutputToViewport(&CompBuffer, &State, textureID); } #if THREADED @@ -585,12 +595,7 @@ int main(int argc, char *argv[]) { CheckQueue(RenderInfo, 8); } if (CompletedJobs == 16) { -#if PACKEDRGB - Unpack4x4Chunk(&CompBuffer); - // SSE_CopyToBuffer(CompBuffer); -#else - PackBitmapRGB(&CompBuffer); -#endif + Convert4x4Chunk(&CompBuffer, 1); EndRenderState(&State); glBindTexture(GL_TEXTURE_2D, textureID); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE, @@ -603,16 +608,6 @@ int main(int argc, char *argv[]) { } } #else -#if PACKEDRGB - Unpack4x4Chunk(&CompBuffer); - // SSE_CopyToBuffer(CompBuffer); -#else - PackBitmapRGB(&CompBuffer); -#endif - EndRenderState(&State); - glBindTexture(GL_TEXTURE_2D, textureID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE, - CompBuffer.EffectBuffer); #endif ImGui::Render(); @@ -4,17 +4,25 @@ enum source_type { source_image }; +enum instruction_mode { + scalar_only, + sse_enabled, + avx_enabled +}; + struct pixel_buffer { void *OriginalBuffer; void *EffectBuffer; void *Scratch; uint16 Width; uint16 Height; + // IMPORTANT(fox): Since we're storing 4x4 chunks, I'm opting to pad out each + // dimension with an extra 1-3 pixels to make our lookup functions simpler. + // This has the cost of extra RAM, but it's a miniscule amount (0.2% extra + // data for a worst-case 1080p 16bpc frame, or 140 kb). + uint16 FullWidth; + uint16 FullHeight; uint16 Pitch; -#if PACKEDRGB -#else - uint32 Channel; -#endif uint16 BytesPerPixel; bool32 ToUpdate; // Set whenever effects or video frames need to be updated. 
}; @@ -246,6 +254,8 @@ struct transform_info { real32 YAxisPY; real32 LayerWidth; real32 LayerHeight; + uint32 FullLayerWidth; + uint32 FullLayerHeight; real32 LayerOpacity; real32 OriginX; real32 OriginY; diff --git a/my_imgui_widgets.cpp b/my_imgui_widgets.cpp index c199aa4..1190430 100644 --- a/my_imgui_widgets.cpp +++ b/my_imgui_widgets.cpp @@ -196,6 +196,17 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, pixel_buffer Com UI->CompPos.x += io.MouseDelta.x; UI->CompPos.y += io.MouseDelta.y; } + // if (IsActive && ImGui::IsMouseDown(ImGuiMouseButton_Right)) + // { + // Debug.ToggleRenders = true; + // } + ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonRight); + if (ImGui::BeginPopup("context")) { + if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != scalar_only)) { InstructionMode = scalar_only; } + if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != sse_enabled)) { InstructionMode = sse_enabled; } + if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != avx_enabled)) { InstructionMode = avx_enabled; } + ImGui::EndPopup(); + } if (IsActive && ImGui::IsMouseDragging(ImGuiMouseButton_Left, -1.0f) && ImGui::IsKeyDown(ImGuiKey_Z)) { real32 Distance = io.MouseDelta.x + io.MouseDelta.y; @@ -999,10 +1010,11 @@ ImGui_ProcessInputs(project_data *File, project_state *State, pixel_buffer *Comp } #if DEBUG - if (ImGui::IsKeyPressed(ImGuiKey_E)) { - SwitchBool(AVXEnabled); - State->UpdateFrame = true; - } + if (ImGui::IsKeyPressed(ImGuiKey_Z)) + { + // SwitchBool(D); + // State->UpdateFrame = true; + } if (ImGui::IsKeyPressed(ImGuiKey_M)) { Debug.Markers[Debug.MarkerIndex] = File->CurrentFrame; diff --git a/prenderer.cpp b/prenderer.cpp index 4d4152d..356ecd7 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -7,11 +7,14 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi internal void AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal 
void +SSE2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); +internal void Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal bool32 CheckQueue(render_queue RenderInfo, uint16 Index); +// for the anchor point moving UI internal void CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir) { @@ -76,12 +79,14 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer) TransformInfo.YAxisPY = YLengthSq*YAxis.y; TransformInfo.LayerWidth = (real32)Source->Raster.Width; TransformInfo.LayerHeight = (real32)Source->Raster.Height; + TransformInfo.FullLayerWidth = Source->Raster.FullWidth; + TransformInfo.FullLayerHeight = Source->Raster.FullHeight; TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f; TransformInfo.OriginX = Origin.x; TransformInfo.OriginY = Origin.y; TransformInfo.BufferPitch = Buffer->Pitch; TransformInfo.LayerPitch = Source->Raster.Pitch; - TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY}; + TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1}; TransformInfo.SourceBuffer = Source->Raster.EffectBuffer; @@ -115,6 +120,19 @@ EndRenderState(project_state *State) } internal void +RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { + for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) { + int16 Idx = RenderInfo->State->LayersToRender[i]; + if (InstructionMode == avx_enabled) + AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + else if (InstructionMode == sse_enabled) + SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + else + Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + } +} + +internal void QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State) { IsRendering = true; @@ -163,18 
+181,7 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S // DEBUG_CycleCountStart(3); rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height}; - for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) { - int16 Idx = RenderInfo.State->LayersToRender[i]; -#if ARM - RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion); -#else - // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); - if (AVXEnabled) - AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); - else - Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); -#endif - } + RenderLayers(&RenderInfo, RenderRegion); // DEBUG_CycleCountEnd(3); // Debug.ExecutionAmount[4] += 1280*720; @@ -378,6 +385,7 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi } #else + internal void AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) { @@ -397,7 +405,9 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY); __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth); - __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4); + __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4); + __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1); + __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1); __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight); __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity); __m256 OriginX = _mm256_set1_ps(T.OriginX); @@ -451,7 +461,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4; + uint32 YLookup = (Y >> 
2)*(Buffer->FullWidth*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel; @@ -461,6 +471,8 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)), _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2)))); + // If all of the pixels are zeroed in the mask (aka fall outside + // the UV lookup), we can skip the iteration. if (_mm256_movemask_epi8(LayerMask)) { U = _mm256_max_ps(_mm256_min_ps(One, U), Zero); @@ -469,9 +481,10 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 TexXFull = _mm256_mul_ps(U, LayerWidth); __m256 TexYFull = _mm256_mul_ps(V, LayerHeight); __m256i TexXInt = _mm256_cvttps_epi32(TexXFull); - __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei); + __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); - __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei); + __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei)); + // NOTE(fox): The comparison is for when we're on the last pixel. 
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt)); @@ -484,11 +497,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni), _mm256_and_si256(TexXInt, BottomTwoBits)); - __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i), + __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i), _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri)); __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni), _mm256_and_si256(TexXIntPlusOne, BottomTwoBits)); - __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i), + __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i), _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri)); __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup); @@ -571,13 +584,239 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); - __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); - _mm256_storeu_si256((__m256i *)Pixel, PixelsMask); + // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); + _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel); } PixelX = _mm256_add_ps(PixelX, Four); } } } + +internal void +SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) +{ + rectangle LayerBounds = ClipRectangle( T.ClipRect, + RenderRegion ); + // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned. 
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4; + LayerBounds.Min.y -= LayerBounds.Min.y % 4; + + uint8 *TexPTR = (uint8 *)T.SourceBuffer; + Assert(LayerBounds.Max.x <= Buffer->Width); + Assert(LayerBounds.Max.y <= Buffer->Height); + + __m128 XAxisPX = _mm_set1_ps(T.XAxisPX); + __m128 XAxisPY = _mm_set1_ps(T.XAxisPY); + __m128 YAxisPX = _mm_set1_ps(T.YAxisPX); + __m128 YAxisPY = _mm_set1_ps(T.YAxisPY); + + __m128 LayerWidth = _mm_set1_ps(T.LayerWidth); + __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1); + __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4); + __m128 LayerHeight = _mm_set1_ps(T.LayerHeight); + __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1); + __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity); + __m128 OriginX = _mm_set1_ps(T.OriginX); + __m128 OriginY = _mm_set1_ps(T.OriginY); + + __m128 One = _mm_set1_ps(1); + __m128 Zero = _mm_set1_ps(0); + __m128i Zeroi = _mm_set1_epi32(0); + __m128i Onei = _mm_set1_epi32(1); + __m128 Four = _mm_set1_ps(4); + __m128 Sixteen = _mm_set1_ps(16); + __m128i FF = _mm_set1_epi32(0xFF); + __m128i BottomTwoBits = _mm_set1_epi32(0x03); + __m128i Fouri = _mm_set1_epi32(4); + __m128i Sixteeni = _mm_set1_epi32(16); + __m128 Reg255 = _mm_set1_ps(255.0f); + __m128i Int255 = _mm_set1_epi32(255); + __m128 Norm255 = _mm_set1_ps(1/255.0f); + + // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical. 
+ + for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) + { + __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3); + + __m128 PixelY = _mm_set1_ps((real32)Y); + __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY); + + for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) + { + IACA_START; + + __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX); + + uint32 XLookup = (X >> 2)*16 + (X % 4); + uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; + uint32 PixelToSeek = XLookup + YLookup; + uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel; + + __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY)); + __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY)); + + __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)), + _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One)))); + + if (_mm_movemask_epi8(LayerMask)) + { + U = _mm_max_ps(_mm_min_ps(One, U), Zero); + V = _mm_max_ps(_mm_min_ps(One, V), Zero); + + __m128 TexXFull = _mm_mul_ps(U, LayerWidth); + __m128 TexYFull = _mm_mul_ps(V, LayerHeight); + __m128i TexXInt = _mm_cvttps_epi32(TexXFull); + __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei)); + __m128i TexYInt = _mm_cvttps_epi32(TexYFull); + __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei)); + + __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt)); + __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt)); + __m128 TexXInv = _mm_sub_ps(One, TexX); + __m128 TexYInv = _mm_sub_ps(One, TexY); + __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY); + __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv); + __m128 TexBoth = _mm_mul_ps(TexY, TexX); + __m128 TexBothInv = 
_mm_mul_ps(TexXInv, TexYInv); + + __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni), + _mm_and_si128(TexXInt, BottomTwoBits)); + __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i), + _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri)); + __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni), + _mm_and_si128(TexXIntPlusOne, BottomTwoBits)); + __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i), + _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri)); + + __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup); + __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup); + __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne); + __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne); + + // SSE lacks gathering, so we have no choice but to manually + // look up each pixel's four bilinear samples in scalar. 
+ + uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL); + uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR); + uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL); + uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR); + uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4); + uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4); + uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4); + uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4); + + uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4)); + uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4)); + uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4)); + uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4)); + uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4); + uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4); + uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4); + uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4); + + uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8)); + uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8)); + uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8)); + uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8)); + uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4); + uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4); + uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4); + uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4); + + uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12)); + uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12)); + uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12)); + uint32 S_PixelLookupBR3 = 
_mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12)); + uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4); + uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4); + uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4); + uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4); + + __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3); + __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3); + __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3); + __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3); + + __m128i R_TexTL = _mm_and_si128( PixelsTL, FF); + __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF); + __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF); + __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF); + + __m128i R_TexTR = _mm_and_si128( PixelsTR, FF); + __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF); + __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF); + __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF); + + __m128i R_TexBL = _mm_and_si128( PixelsBL, FF); + __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF); + __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF); + __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF); + + __m128i R_TexBR = _mm_and_si128( PixelsBR, FF); + __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF); + __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF); + __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF); + + __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR)))); + __m128 G_PixelBlend = 
_mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR)))); + __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR)))); + __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR)))); + + A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity)); + + __m128i R_Out, G_Out, B_Out, A_Out; + // Only do alpha blending if a pixel's value doesn't equal 255 + if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255))) + { + __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255); + __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255); + + __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel); + __m128i R_Dest = _mm_and_si128( DestPixel, FF); + __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF); + __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF); + __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF); + + R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha))); + G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha))); + B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha))); + A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); + } + else 
+ { + R_Out = _mm_cvtps_epi32(R_PixelBlend); + G_Out = _mm_cvtps_epi32(G_PixelBlend); + B_Out = _mm_cvtps_epi32(B_PixelBlend); + A_Out = _mm_cvtps_epi32(A_PixelBlend); + } + + __m128i OutputPixel = _mm_or_si128( + _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)), + _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24))); + _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel); + } + PixelX = _mm_add_ps(PixelX, Four); + } + } +} + + #endif internal void @@ -595,25 +834,17 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg // uint32 pp2 = 3; // bool32 real = true; - for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2) + for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) { -#if PACKEDRGB -#else - uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x; -#endif - real32 StartVectorY[2]; - StartVectorY[0] = (real32)Y - T.OriginY; - StartVectorY[1] = (real32)(Y+1) - T.OriginY; + real32 StartVectorY = (real32)Y - T.OriginY; for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++) { - for (int16 i = 0; i < 2; i++) - { IACA_START; real32 StartVectorX = X - T.OriginX; - real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY); - real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY); + real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY); + real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY); if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { real32 TexXFull = U * T.LayerWidth; @@ -631,7 +862,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg real32 TexBoth = TexY * TexX; real32 TexBothInv = TexXInv * TexYInv; -#if PACKEDRGB #if 0 uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel); uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel); @@ -641,35 +871,34 @@ Fallback_RenderLayer(transform_info T, 
pixel_buffer *Buffer, rectangle RenderReg uint32 PixelC = *(uint32 *)TexPTR1; uint32 PixelD = *((uint32 *)TexPTR1 + 1); #else - uint16 LX, LY; uint32 XLookup, YLookup, PixelToSeek; - // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's! - LX = TexXInt; - LY = TexYInt; + // TODO(fox): Anti-aliasing on edges + uint16 LX = TexXInt; + uint16 LY = TexYInt; + uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1); + uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1); + + // TODO(fox): Be careful with the BytesPerPixel here! It's the + // buffer's, not the layer's (currently everything is 4 bytes + // per pixel). XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt+1; - LY = TexYInt; - XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); + YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt; - LY = TexYInt+1; XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt+1; - LY = TexYInt+1; - XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); + YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); #endif @@ -693,30 +922,6 @@ 
Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 TexAB = ((PixelB >> 24) & 0xFF); uint8 TexAC = ((PixelC >> 24) & 0xFF); uint8 TexAD = ((PixelD >> 24) & 0xFF); -#else - uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt); - uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt); - - uint8 TexRA = *TexPTR0; - uint8 TexRB = *(TexPTR0 + 1); - uint8 TexRC = *TexPTR1; - uint8 TexRD = *(TexPTR1 + 1); - - uint8 TexGA = *(TexPTR0 + Channel); - uint8 TexGB = *(TexPTR0 + 1 + Channel); - uint8 TexGC = *(TexPTR1 + Channel); - uint8 TexGD = *(TexPTR1 + 1 + Channel); - - uint8 TexBA = *(TexPTR0 + Channel*2); - uint8 TexBB = *(TexPTR0 + 1 + Channel*2); - uint8 TexBC = *(TexPTR1 + Channel*2); - uint8 TexBD = *(TexPTR1 + 1 + Channel*2); - - uint8 TexAA = *(TexPTR0 + Channel*3); - uint8 TexAB = *(TexPTR0 + 1 + Channel*3); - uint8 TexAC = *(TexPTR1 + Channel*3); - uint8 TexAD = *(TexPTR1 + 1 + Channel*3); -#endif real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB) + (TexBothXInv * TexRC) + (TexBoth * TexRD); @@ -733,9 +938,9 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 B = (uint8)PixelBlendB; uint8 A = (uint8)PixelBlendA; -#if PACKEDRGB XLookup = (X >> 2)*16 + (X % 4); - YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4; + YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; + // if (real) { // real = false; // printf("XLook: %i, YLook: %i\n", XLookup, YLookup); @@ -748,16 +953,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 G1 = (*Pixel >> 8); uint8 B1 = (*Pixel >> 16); uint8 A1 = (*Pixel >> 24); -#else - uint8 *RD = Pixel; - uint8 *GD = Pixel + Buffer->Channel; - uint8 *BD = Pixel + Buffer->Channel*2; - uint8 *AD = Pixel + Buffer->Channel*3; - uint8 R1 = *RD; - uint8 G1 = *GD; - uint8 B1 = *BD; - uint8 A1 = *AD; -#endif if (A != 255) { real32 LayerAlpha = (255 - A) / 
255.0f; @@ -767,23 +962,11 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg A = ClipAdd(A1, A); } -#if PACKEDRGB *Pixel = ((A << 24) | (B << 16) | (G << 8) | (R << 0)); } - } - } -#else - *RD = R; - *GD = G; - *BD = B; - *AD = A; - } - Pixel++; } - Row += Buffer->Pitch*2; -#endif } } diff --git a/threading.cpp b/threading.cpp index 07584bd..39e7b75 100644 --- a/threading.cpp +++ b/threading.cpp @@ -7,10 +7,6 @@ PushRect(rectangle RenderRegion) SDL_SemPost(Semaphore); } -internal void -AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); -internal void -Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal bool32 CheckQueue(render_queue RenderInfo, uint16 Index) @@ -22,20 +18,7 @@ CheckQueue(render_queue RenderInfo, uint16 Index) if (__atomic_compare_exchange_n(&NextEntryToDo, &OriginalEntry, NextEntryToDo + 1, true, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)) { render_entry *Entry = Entries + OriginalEntry; Assert(Entry->RenderRegion.Max.x != 0); - for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) { - int16 Idx = RenderInfo.State->LayersToRender[i]; -#if ARM - Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion); - // RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, Entry->RenderRegion); -#else - // printf("(RENDERING) Thread %i, region X%i Y%i\n", Index, Entry->RenderRegion.Min.x/240, Entry->RenderRegion.Min.y/135); - if (AVXEnabled) - AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion); - else - Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion); -#endif - } - + RenderLayers(RenderInfo, Entry->RenderRegion); // printf("(FINISHED) Thread %i, region X%i Y%i\n", Index, Entry->RenderRegion.Min.x/240, Entry->RenderRegion.Min.y/135); 
__atomic_add_fetch(&CompletedJobs, 1, __ATOMIC_ACQ_REL); Result = 1; @@ -190,15 +190,10 @@ InitAV(char *filename, av_info *AV) av_seek_frame(AV->FileFormatContext, -1, 0, AVSEEK_FLAG_BACKWARD); }; -#if PACKEDRGB internal void -Store4x4Chunk(pixel_buffer *Raster); +Convert4x4Chunk(pixel_buffer *Raster, uint8); internal void -SSE_ClearBuffer(pixel_buffer *Raster, uint16); -#else -internal void -Libav_GBRAToRGBA(pixel_buffer *Raster); -#endif +ClearBuffer(pixel_buffer *Raster, void *); internal int16 LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame) @@ -261,11 +256,7 @@ LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame) // NOTE(fox): This function will be replaced in the future. AV->RGBContext = sws_getContext(AV->VideoFrame->width, AV->VideoFrame->height, (AVPixelFormat)AV->VideoFrame->format, -#if PACKEDRGB AV->VideoFrame->width, AV->VideoFrame->height, AV_PIX_FMT_RGBA, SWS_BILINEAR, -#else - AV->VideoFrame->width, AV->VideoFrame->height, AV_PIX_FMT_GBRAP, SWS_BILINEAR, -#endif NULL, NULL, NULL); if(!AV->RGBContext) { @@ -277,13 +268,9 @@ LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame) av_frame_unref(AV->VideoFrame); -#if PACKEDRGB - Store4x4Chunk(Buffer); - SSE_CopyToBuffer(Buffer, 1); - SSE_ClearBuffer(Buffer, 1); -#else - Libav_GBRAToRGBA(Buffer); -#endif + Convert4x4Chunk(Buffer, 0); + CopyToBuffer(Buffer, 1); + ClearBuffer(Buffer, Buffer->EffectBuffer); return 0; } |