summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFox Caminiti <fox@foxcam.net>2022-07-27 11:00:45 -0400
committerFox Caminiti <fox@foxcam.net>2022-07-27 11:00:45 -0400
commit83ce428d8bb5f4a762abf879adec076bc34cf36a (patch)
treec1500f027d9eec514ba1a2912e7a4763e7be26b2
parentc6bd84c356b6aaa029b9708d7b99a4aba1673b6b (diff)
full support for odd-dimension bitmaps and comps
-rw-r--r--bitmap_calls.cpp238
-rwxr-xr-xbuild.bat4
-rwxr-xr-xbuild.sh2
-rw-r--r--createcalls.cpp502
-rw-r--r--debug.h2
-rw-r--r--effects.cpp4
-rw-r--r--main.cpp75
-rw-r--r--main.h18
-rw-r--r--my_imgui_widgets.cpp20
-rw-r--r--prenderer.cpp379
-rw-r--r--threading.cpp19
-rw-r--r--video.cpp23
12 files changed, 653 insertions, 633 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp
new file mode 100644
index 0000000..2cdb463
--- /dev/null
+++ b/bitmap_calls.cpp
@@ -0,0 +1,238 @@
+// NOTE(fox): Pay attention to how the Y pitch differs between the unpacked
+// bitmaps and the 4x4 packed bitmaps, since odd-sized bitmaps are padded.
+
+// TODO(fox): I could write an AVX version of this function, but it may not be
+// that much faster since we have to do a bit of uninterleaving.
+
+// Converts a bitmap between the row-major ("unpacked") layout and the 4x4-tiled
+// layout. Pixels are always read from OriginalBuffer and written to EffectBuffer.
+//   Which == 0 - store in 4x4 chunks (row-major source -> tiled destination)
+//   otherwise  - unpack to 1xwidth   (tiled source -> row-major destination)
+// The row-major side is addressed with a pitch of Width pixels; the tiled side is
+// addressed against FullWidth.
+internal void
+Convert4x4Chunk(pixel_buffer *Buffer, uint8 Which)
+{
+    uint8 *Source = (uint8 *)Buffer->OriginalBuffer;
+    uint8 *Dest = (uint8 *)Buffer->EffectBuffer;
+    uint32 LeftoverPixels = Buffer->Width % 4;
+    for (uint32 Y = 0; Y < Buffer->Height; Y++) {
+        for (uint32 X = 0; X < Buffer->Width; ) {
+            // Index of pixel (X, Y) inside the tiled layout: 16 pixels per
+            // 4x4 tile, 4 pixels per row within a tile.
+            uint32 TileX = (X >> 2)*16 + (X % 4);
+            uint32 TileY = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
+            uint32 TiledIndex = TileX + TileY;
+
+            uint8 *Pixel = Source;
+            uint8 *DPixel = Dest;
+            if (Which == 0) {
+                Pixel += Y*Buffer->Width*4 + X*Buffer->BytesPerPixel;
+                DPixel += TiledIndex*Buffer->BytesPerPixel;
+            } else {
+                Pixel += TiledIndex*Buffer->BytesPerPixel;
+                DPixel += Y*Buffer->Width*4 + X*Buffer->BytesPerPixel;
+            }
+
+            // The 128-bit copy moves four pixels at once, so it is only taken
+            // while a whole group of four is left in the row; the trailing
+            // Width % 4 pixels always go one at a time.
+            bool32 WholeGroup = (X < Buffer->Width - LeftoverPixels);
+            if (WholeGroup && (InstructionMode == sse_enabled || InstructionMode == avx_enabled)) {
+                __m128i FourPixels = _mm_loadu_si128((__m128i *)Pixel);
+                _mm_storeu_si128((__m128i *)DPixel, FourPixels);
+                X += 4;
+            } else {
+                *(uint32 *)DPixel = *(uint32 *)Pixel;
+                X += 1;
+            }
+        }
+    }
+}
+
+// TODO(fox): Replace this in the future.
+// Copies an imported image into a newly allocated scratch bitmap.
+//   Memory - allocator the output buffer is taken from (B_Scratch)
+//   Raster - describes the image; Width/Height/BytesPerPixel give the size of
+//            Input, FullWidth/FullHeight give the size of the allocation
+//   Input  - tightly packed source pixels, Width*Height*BytesPerPixel bytes
+// Returns the new buffer. Only the first Width*Height*BytesPerPixel bytes are
+// written, as one linear run (the rows keep their Width pitch).
+internal void *
+MoveImportToBitmap(memory *Memory, pixel_buffer *Raster, void *Input)
+{
+    uint8 *Row = ((uint8 *)Input);
+    void *Output = AllocateMemory(Memory, Raster->FullWidth * Raster->FullHeight * Raster->BytesPerPixel, B_Scratch);
+    uint8 *Row2 = ((uint8 *)Output);
+
+    // Both SIMD modes share the same 128-bit copy below, so they share one
+    // stride of four pixels. (The stride used to be derived from a duplicated
+    // avx_enabled test and did not match what the loop actually advanced by.)
+    bool32 UseSIMD = (InstructionMode == sse_enabled || InstructionMode == avx_enabled);
+    uint64 ChunkBytes = Raster->BytesPerPixel;
+    if (UseSIMD)
+        ChunkBytes = 4*Raster->BytesPerPixel;
+
+    uint64 TotalBytes = (uint64)Raster->Height*Raster->Width*Raster->BytesPerPixel;
+    // Largest prefix that divides evenly into chunks; the rest is the tail.
+    uint64 WideBytes = TotalBytes - (TotalBytes % ChunkBytes);
+
+    // Strict '<' on both loops: with '<=' the last iteration started exactly
+    // at the end of the data and read past the imported image.
+    uint64 bytes = 0;
+    while (bytes < WideBytes) {
+        uint8 *Pixel = Row + bytes;
+        uint8 *Pixel2 = Row2 + bytes;
+        if (UseSIMD) {
+            __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
+            _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
+        } else {
+            *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+        }
+        bytes += ChunkBytes;
+    }
+    // Tail: whatever did not fill a whole chunk, one pixel at a time.
+    while (bytes < TotalBytes) {
+        uint8 *Pixel = Row + bytes;
+        uint8 *Pixel2 = Row2 + bytes;
+        *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+        bytes += Raster->BytesPerPixel;
+    }
+    return Output;
+}
+
+// Zeroes one pixel buffer belonging to a raster.
+//   Raster - supplies the padded size (FullWidth x FullHeight) and BytesPerPixel
+//   Buffer - start of the memory to clear; must hold at least that many bytes
+// The store width follows InstructionMode: 256-bit for AVX, 128-bit for SSE,
+// one 32-bit pixel otherwise. There is no tail handling, so the total size has
+// to be a multiple of the store width; CalculateFull() rounds both dimensions
+// up to a multiple of 4 pixels, which guarantees that for 4-byte pixels.
+internal void
+ClearBuffer(pixel_buffer *Raster, void *Buffer)
+{
+    uint8 *Row = (uint8 *)Buffer;
+    __m256i Zero8 = _mm256_setzero_si256();
+    __m128i Zero = _mm_setzero_si128();
+
+    // The stride must match the width of the store used in the loop. The SSE
+    // case used to test avx_enabled a second time, which left the stride at
+    // one pixel while 16 bytes were stored, running past the end of the buffer.
+    uint64 bytes = 0;
+    uint16 ByteOffset = Raster->BytesPerPixel;
+    if (InstructionMode == avx_enabled)
+        ByteOffset = 8*Raster->BytesPerPixel;
+    else if (InstructionMode == sse_enabled)
+        ByteOffset = 4*Raster->BytesPerPixel;
+
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+
+    while (bytes < TotalBytes) {
+        uint8 *Pixel = Row + bytes;
+        if (InstructionMode == avx_enabled) {
+            _mm256_storeu_si256((__m256i *)Pixel, Zero8);
+        } else if (InstructionMode == sse_enabled) {
+            _mm_storeu_si128((__m128i *)Pixel, Zero);
+        } else {
+            *(uint32 *)Pixel = 0x00000000;
+        }
+        bytes += ByteOffset;
+    }
+}
+
+// Copies the whole padded bitmap (FullWidth x FullHeight) between the two
+// buffers of a raster.
+//   Which == 0 - original -> effect
+//   otherwise  - effect -> original
+internal void
+CopyToBuffer(pixel_buffer *Raster, uint16 Which)
+{
+    uint8 *Row, *Row2;
+    if (Which == 0) {
+        Row = ((uint8 *)Raster->OriginalBuffer);
+        Row2 = ((uint8 *)Raster->EffectBuffer);
+    } else {
+        Row = ((uint8 *)Raster->EffectBuffer);
+        Row2 = ((uint8 *)Raster->OriginalBuffer);
+    }
+
+    // Both SIMD modes share the same 128-bit copy below, so they share one
+    // stride of four pixels. (The stride used to be derived from a duplicated
+    // avx_enabled test and did not match what the loop actually advanced by.)
+    bool32 UseSIMD = (InstructionMode == sse_enabled || InstructionMode == avx_enabled);
+    uint64 ChunkBytes = Raster->BytesPerPixel;
+    if (UseSIMD)
+        ChunkBytes = 4*Raster->BytesPerPixel;
+
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+    // Largest prefix that divides evenly into chunks; the rest is the tail.
+    uint64 WideBytes = TotalBytes - (TotalBytes % ChunkBytes);
+
+    // Strict '<' on both loops: with '<=' an extra chunk and an extra pixel
+    // were copied beyond the end of both buffers.
+    uint64 bytes = 0;
+    while (bytes < WideBytes) {
+        uint8 *Pixel = Row + bytes;
+        uint8 *Pixel2 = Row2 + bytes;
+        if (UseSIMD) {
+            __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
+            _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
+        } else {
+            *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+        }
+        bytes += ChunkBytes;
+    }
+    // Tail: whatever did not fill a whole chunk, one pixel at a time.
+    while (bytes < TotalBytes) {
+        uint8 *Pixel = Row + bytes;
+        uint8 *Pixel2 = Row2 + bytes;
+        *(uint32 *)Pixel2 = *(uint32 *)Pixel;
+        bytes += Raster->BytesPerPixel;
+    }
+}
+
+// Converts a freshly filled, row-major OriginalBuffer into the 4x4-tiled layout
+// in place, using EffectBuffer as temporary storage.
+internal void
+BitmapPackRGB(pixel_buffer *Buffer) {
+ // Pitch must already be set (non-zero) by whoever created the buffer.
+ Assert(Buffer->Pitch);
+ // Tile OriginalBuffer into EffectBuffer...
+ Convert4x4Chunk(Buffer, 0);
+ // ...copy the tiled result back over OriginalBuffer...
+ CopyToBuffer(Buffer, 1);
+ // ...and leave EffectBuffer zeroed.
+ ClearBuffer(Buffer, Buffer->EffectBuffer);
+}
+
+// Puts the composition into EffectBuffer and uploads it to the given GL texture.
+//   CompBuffer - composition raster; its EffectBuffer is overwritten
+//   State      - passed through to EndRenderState()
+//   textureID  - existing 2D texture that receives Width x Height RGBA8 pixels
+internal void
+OutputToViewport(pixel_buffer *CompBuffer, project_state *State, GLuint textureID) {
+ // D is a global toggle: when set, the 4x4-tiled OriginalBuffer is unpacked
+ // into row-major order; otherwise OriginalBuffer is copied across unchanged.
+ if (D)
+ Convert4x4Chunk(CompBuffer, 1);
+ else
+ CopyToBuffer(CompBuffer, 0);
+ // NOTE(review): EndRenderState is defined elsewhere; presumably it marks the
+ // render as finished -- confirm before reordering it relative to the upload.
+ EndRenderState(State);
+ glBindTexture(GL_TEXTURE_2D, textureID);
+ // Upload uses Width x Height (not the padded Full* size), matching the
+ // Width-pitched rows that Convert4x4Chunk(.., 1) writes.
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE,
+ CompBuffer->EffectBuffer);
+}
+
+// Fills the whole padded OriginalBuffer (FullWidth x FullHeight) with one color.
+//   Raster - raster whose OriginalBuffer is overwritten
+//   Color  - fill color, converted to a packed 32-bit pixel by ColToUint32()
+// The store width follows InstructionMode: 256-bit for AVX, 128-bit for SSE,
+// one 32-bit pixel otherwise. There is no tail handling, so the total size has
+// to be a multiple of the store width; CalculateFull() rounds both dimensions
+// up to a multiple of 4 pixels, which guarantees that for 4-byte pixels.
+internal void
+DebugFillSolid(pixel_buffer *Raster, v4 Color)
+{
+    uint32 ColS = ColToUint32(Color);
+    __m256i Col8 = _mm256_set1_epi32(ColS);
+    __m128i Col = _mm_set1_epi32(ColS);
+    uint8 *Row = (uint8 *)Raster->OriginalBuffer;
+
+    // The stride must match the width of the store used in the loop. The SSE
+    // case used to test avx_enabled a second time, which left the stride at
+    // one pixel while 16 bytes were stored, running past the end of the buffer.
+    uint64 bytes = 0;
+    uint16 ByteOffset = Raster->BytesPerPixel;
+    if (InstructionMode == avx_enabled)
+        ByteOffset = 8*Raster->BytesPerPixel;
+    else if (InstructionMode == sse_enabled)
+        ByteOffset = 4*Raster->BytesPerPixel;
+
+    uint64 TotalBytes = (uint64)Raster->FullHeight*Raster->FullWidth*Raster->BytesPerPixel;
+
+    while (bytes < TotalBytes) {
+        uint8 *Pixel = Row + bytes;
+        if (InstructionMode == avx_enabled) {
+            _mm256_storeu_si256((__m256i *)Pixel, Col8);
+        } else if (InstructionMode == sse_enabled) {
+            _mm_storeu_si128((__m128i *)Pixel, Col);
+        } else {
+            *(uint32 *)Pixel = ColS;
+        }
+        bytes += ByteOffset;
+    }
+}
+
+// Writes a test gradient into OriginalBuffer in row-major order.
+// Per 32-bit pixel: bits 0-7 ramp 0..255 across X, bits 8-15 ramp 0..255
+// across Y, bits 16-23 are 0x00 for the first four rows and 0xff afterwards,
+// bits 24-31 are always 0xff.
+// Rows are written with a pitch of Width pixels, because that is the pitch
+// Convert4x4Chunk(.., 0) reads the unpacked source with; padding only exists
+// in the tiled layout.
+internal void
+DebugBitmap(pixel_buffer *Raster)
+{
+    uint8 asda = 0x0;
+    uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
+    real32 XInc = 255.0f / Raster->Width;
+    real32 YInc = 255.0f / Raster->Height;
+    // 32-bit counters: an 8-bit counter wraps at 256 and would never reach the
+    // end of a bitmap that is 256 or more pixels wide or tall.
+    for (uint32 Y = 0; Y < Raster->Height; Y++) {
+        for (uint32 X = 0; X < Raster->Width; X++) {
+            uint8 *Pixel = Row + Raster->Width*Y*4 + X*4;
+            if (Y > 3) { asda = 0xff; }
+            // Unsigned operands so the shift into the top byte is well defined.
+            *(uint32 *)Pixel = ((0xffu << 24) |
+                                ((uint32)asda << 16) |
+                                ((uint32)RoundReal32ToInt32((YInc * Y)) << 8) |
+                                ((uint32)RoundReal32ToInt32((XInc * X))) );
+        }
+    }
+}
diff --git a/build.bat b/build.bat
index 9fd305b..f325dd8 100755
--- a/build.bat
+++ b/build.bat
@@ -6,5 +6,5 @@ REM call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxi
@set IMGUI_SOURCES=imgui\backends\imgui_impl_sdl.cpp imgui\backends\imgui_impl_opengl3.cpp imgui\imgui*.cpp
@set SDL_LIBS=/LIBPATH:%SDL2_DIR%\lib\x64 SDL2.lib SDL2main.lib opengl32.lib shell32.lib
@set FFMPEG_LIBS=/LIBPATH:%FFMPEG_DIR%\lib avcodec.lib avfilter.lib avformat.lib swscale.lib avutil.lib
-@set PREPROCESSORS=/DWINDOWS=1 /DARM=0 /DTHREADED=0 /DPACKEDRGB=1
-cl /nologo /Zi /MD %PREPROCESSORS% %INCLUDES% main.cpp %IMGUI_SOURCES% /Febin/real2d.exe /Fobin/ /link %SDL_LIBS% %FFMPEG_LIBS% /subsystem:console \ No newline at end of file
+@set PREPROCESSORS=/DWINDOWS=1 /DARM=0 /DTHREADED=0
+cl /nologo /Zi /MD %PREPROCESSORS% %INCLUDES% main.cpp %IMGUI_SOURCES% /Febin/real2d.exe /Fobin/ /link %SDL_LIBS% %FFMPEG_LIBS% /subsystem:console
diff --git a/build.sh b/build.sh
index 5f10e49..8b5a63e 100755
--- a/build.sh
+++ b/build.sh
@@ -8,7 +8,7 @@ WARNING_FLAGS="
-Wno-missing-field-initializers -Wno-sign-compare -Wno-write-strings -Wno-unused-but-set-parameter \
-Wno-missing-braces -Wno-format-security
-fno-exceptions -Wno-strict-aliasing \
- -DDEBUG=1 -DARM=0 -DTHREADED=0 -DPACKEDRGB=1 \
+ -DDEBUG=1 -DARM=0 -DTHREADED=0 \
"
if [[ "$WINDOWS" == 1 ]]; then
diff --git a/createcalls.cpp b/createcalls.cpp
index e5ca18d..0dbf75c 100644
--- a/createcalls.cpp
+++ b/createcalls.cpp
@@ -7,22 +7,29 @@ IncrementFrame(project_data *File, int16 Amount) {
}
}
+// Derives the padded dimensions of a buffer: FullWidth and FullHeight are
+// Width and Height rounded up to the next multiple of 4 (unchanged when they
+// already are one), so the bitmap divides evenly into 4x4 chunks.
+internal void
+CalculateFull(pixel_buffer *Buffer) {
+    // Distance to the next multiple of 4; comes out as 4 (not 0) when the
+    // dimension is already aligned, which is mapped to "no padding" below.
+    uint16 PadX = 4 - (Buffer->Width % 4);
+    uint16 PadY = 4 - (Buffer->Height % 4);
+
+    Buffer->FullWidth = Buffer->Width + ((PadX == 4) ? 0 : PadX);
+    Buffer->FullHeight = Buffer->Height + ((PadY == 4) ? 0 : PadY);
+}
internal pixel_buffer
CreateBuffer(int Width, int Height, memory *Memory)
{
pixel_buffer Buffer = {};
Buffer.BytesPerPixel = 4;
- Buffer.OriginalBuffer = AllocateMemory(Memory, Width * Height * Buffer.BytesPerPixel, B_Scratch);
- Buffer.EffectBuffer = AllocateMemory(Memory, Width * Height * Buffer.BytesPerPixel, B_Scratch);
Buffer.Width = Width;
Buffer.Height = Height;
-#if PACKEDRGB
- Buffer.Pitch = Buffer.Width*Buffer.BytesPerPixel;
-#else
- Buffer.Pitch = Buffer.Width; // each row has only 1 byte, 8 bits, per pixel
- Buffer.Channel = Buffer.Width*Buffer.Height;
-#endif
+ CalculateFull(&Buffer);
+ Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel;
+ Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch);
+ Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch);
Buffer.ToUpdate = true;
return Buffer;
}
@@ -40,299 +47,6 @@ AddSource(project_data *File, memory *Memory, char *Path)
}
internal pixel_buffer
-CreateDebugBitmap(int16 Width, int16 Height, memory *Memory)
-{
- pixel_buffer Raster = CreateBuffer(Width, Height, Memory);
- uint32 Channel = (Raster.Width * Raster.Height);
- uint8 inc = 0;
- uint8 incY = 0;
- for (uint32 Y = 0; Y < Raster.Height; Y+=1) {
- for (uint32 X = 0; X < Raster.Width; X+=1) {
-#if PACKEDRGB
- uint8 *Pix = ((uint8 *)Raster.OriginalBuffer + (Raster.Pitch*Y) + X*Raster.BytesPerPixel);
- uint32 *Pixel = (uint32 *)Pix;
- *Pixel = (
- (X << 0) |
- (Y << 8) |
- (0xaa << 16) |
- (0xff << 24));
- inc++;
-#else
- uint8 *Pix = ((uint8 *)Raster.OriginalBuffer + (Raster.Pitch*Y) + X);
- uint8 *Pix2 = ((uint8 *)Raster.OriginalBuffer + Channel + (Raster.Pitch*Y) + X);
- uint8 *Pix3 = ((uint8 *)Raster.OriginalBuffer + Channel*2 + (Raster.Pitch*Y) + X);
- uint8 *PixA = ((uint8 *)Raster.OriginalBuffer + Channel*3 + (Raster.Pitch*Y) + X);
- // if (X == 0 && Y == 1) {
- // *Pix++ = 0xaa;
- // inc++;
- // } else if (X == 0 && Y == 2) {
- // *Pix++ = 0xbb;
- // inc++;
- // } else if (X == 0 && Y == 3) {
- // *Pix++ = 0xcc;
- // inc++;
- // } else {
- *Pix++ = 16*inc++;
- *Pix2++ = 16*incY;
- *Pix3++ = 0xaa;
- *PixA++ = 0xff;
- // }
-#endif
- }
- incY++;
- }
- return Raster;
-}
-
-internal void
-ClearBuffer(pixel_buffer *Buffer)
-{
- uint8 *Row = ((uint8 *)Buffer->OriginalBuffer);
- for(int Y = 0;
- Y < Buffer->Height;
- ++Y)
- {
- uint32 *Pixel = (uint32 *)Row;
- for(int X = 0;
- X < Buffer->Width;
- ++X)
- {
- *(uint32 *)Pixel++ = 0x00000000;
- }
- Row += Buffer->Pitch;
- }
-}
-
-#if PACKEDRGB
-internal void
-Unpack4x4Chunk(pixel_buffer *Buffer)
-{
- uint8 *Src = (uint8 *)Buffer->OriginalBuffer;
- uint8 *Temp = (uint8 *)Buffer->EffectBuffer;
- uint32 bytes = 0;
- for (uint32 Y = 0; Y < Buffer->Height; Y+=4) {
- uint8 *DPixel1 = Temp + Y*Buffer->Pitch;
- uint8 *DPixel2 = Temp + (Y+1)*Buffer->Pitch;
- uint8 *DPixel3 = Temp + (Y+2)*Buffer->Pitch;
- uint8 *DPixel4 = Temp + (Y+3)*Buffer->Pitch;
- for (uint32 X = 0; X < Buffer->Width; X+=4) {
- uint8 *Pixel1 = Src + bytes;
- uint8 *Pixel2 = Pixel1 + 4*Buffer->BytesPerPixel;
- uint8 *Pixel3 = Pixel1 + 4*Buffer->BytesPerPixel*2;
- uint8 *Pixel4 = Pixel1 + 4*Buffer->BytesPerPixel*3;
-
- __m128i Row1 = _mm_loadu_si128((__m128i *)Pixel1);
- __m128i Row2 = _mm_loadu_si128((__m128i *)Pixel2);
- __m128i Row3 = _mm_loadu_si128((__m128i *)Pixel3);
- __m128i Row4 = _mm_loadu_si128((__m128i *)Pixel4);
- _mm_storeu_si128((__m128i *)DPixel1, Row1);
- DPixel1 += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel2, Row2);
- DPixel2 += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel3, Row3);
- DPixel3 += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel4, Row4);
- DPixel4 += 4*Buffer->BytesPerPixel;
-
- bytes += 16*Buffer->BytesPerPixel;
- }
- }
-}
-internal void
-Store4x4Chunk(pixel_buffer *Buffer)
-{
-#if 1
- uint8 *Src = (uint8 *)Buffer->OriginalBuffer;
- uint8 *Temp = (uint8 *)Buffer->EffectBuffer;
- for (uint32 Y = 0; Y+4 < Buffer->Height; Y+=4) {
- uint8 *DPixel = Temp + Y*Buffer->Pitch;
- for (uint32 X = 0; X < Buffer->Width; X+=4) {
- uint8 *Pixel1 = Src + Y*Buffer->Pitch + X*Buffer->BytesPerPixel;
- uint8 *Pixel2 = Pixel1 + Buffer->Pitch;
- uint8 *Pixel3 = Pixel1 + Buffer->Pitch*2;
- uint8 *Pixel4 = Pixel1 + Buffer->Pitch*3;
-
- // NOTE(fox): Remember this is RGB packed, so 128-bit registers hold 4 pixels.
-
- __m128i Row1 = _mm_loadu_si128((__m128i *)Pixel1);
- __m128i Row2 = _mm_loadu_si128((__m128i *)Pixel2);
- __m128i Row3 = _mm_loadu_si128((__m128i *)Pixel3);
- __m128i Row4 = _mm_loadu_si128((__m128i *)Pixel4);
- _mm_storeu_si128((__m128i *)DPixel, Row1);
- DPixel += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel, Row2);
- DPixel += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel, Row3);
- DPixel += 4*Buffer->BytesPerPixel;
- _mm_storeu_si128((__m128i *)DPixel, Row4);
- DPixel += 4*Buffer->BytesPerPixel;
- }
- // TODO(fox): Clear the last row if the buffer isn't divisible by 4.
- }
-#else
- for (uint32 Y = 0; Y < Buffer->Height; Y+=1) {
- uint8 *DPixel = Temp + Y*Buffer->Pitch;
- for (uint32 X = 0; X < Buffer->Width; X+=1) {
- uint32 XLookup = (X >> 2)*16 + (X % 4);
- uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4;
- uint32 PixelToSeek = XLookup + YLookup;
- uint32 Pixel = *(uint32 *)((uint8 *)Buffer->EffectBuffer + PixelToSeek*Buffer->BytesPerPixel);
- uint8 Xp = Pixel & 0xFF;
- uint8 Yp = (Pixel >> 8) & 0xFF;
- printf("X %u, Y %u, val: %i, %i\n", X, Y, Xp, Yp);
- }
- }
- __m256i PixelX0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
- __m256i FF = _mm256_set1_epi32(8);
- uint8 *Src = (uint8 *)Buffer->EffectBuffer;
- for (int i = 0; i < 16; i++) {
- _mm256_storeu_si256((__m256i *)Src, PixelX0);
- Src += 32;
- PixelX0 = _mm256_add_epi32(PixelX0, FF);
- }
- uint32 Width = 3;
- for (uint32 Y = 0; Y < 4*2; Y++) {
- for (uint32 X = 0; X < 4*3; X++) {
- uint32 XLookup = (X >> 2)*16 + (X % 4);
- uint32 YLookup = (Y >> 2)*(Width*16) + (Y % 4)*4;
- uint32 PixelToSeek = XLookup + YLookup;
- uint32 Pixel = *((uint8 *)Buffer->EffectBuffer + PixelToSeek*Buffer->BytesPerPixel);
- printf("X %u, Y %u, %i\n", X, Y, Pixel);
- }
- }
- Assert(0);
-#endif
-}
-#else
-internal void
-PackBitmapRGB(pixel_buffer *Buffer)
-{
- uint8 *Row = (uint8 *)Buffer->OriginalBuffer;
- uint8 *PackedRow = (uint8 *)Buffer->EffectBuffer;
- for (uint32 Y = 0; Y < Buffer->Height; Y++) {
- uint32 *Pixel = (uint32 *)PackedRow;
- for (uint32 X = 0; X < Buffer->Width; X++) {
- uint8 *ValR = (uint8 *)Row + X;
- // if (X > 16 && Y > 16) {
- // Assert(*ValR == 0);
- // }
- uint8 *ValG = ValR + Buffer->Channel;
- uint8 *ValB = ValR + Buffer->Channel*2;
- uint8 *ValA = ValR + Buffer->Channel*3;
-
- *Pixel = (
- (*ValR << 0) |
- (*ValG << 8) |
- (*ValB << 16) |
- (*ValA << 24));
-
- Pixel++;
- }
- Row += Buffer->Pitch;
- PackedRow += Buffer->Pitch*Buffer->BytesPerPixel;
- }
-}
-// TODO(fox): Libav only exports GBRA array frames for some reason; see if you
-// can mod the source if you end up not using packed RGB.
-internal void
-Libav_GBRAToRGBA(pixel_buffer *Raster)
-{
- uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
- uint32 bytes = 0;
- __m128i Zero = _mm_setzero_si128();
- while (bytes <= Raster->Height*Raster->Width) {
- uint8 *ChannelG = (uint8 *)Row + bytes;
- uint8 *ChannelB = (uint8 *)Row + bytes + Raster->Channel;
- uint8 *ChannelR = (uint8 *)Row + bytes + Raster->Channel*2;
- __m128i RegG = _mm_loadu_si128((__m128i *)ChannelG);
- __m128i RegB = _mm_loadu_si128((__m128i *)ChannelB);
- __m128i RegR = _mm_loadu_si128((__m128i *)ChannelR);
- _mm_storeu_si128((__m128i *)ChannelG, RegR);
- _mm_storeu_si128((__m128i *)ChannelB, RegG);
- _mm_storeu_si128((__m128i *)ChannelR, RegB);
- bytes += 16;
- }
-}
-#endif
-
-
-
-// 0 - original
-// 1 - effect
-// 2 - both
-internal void
-SSE_ClearBuffer(pixel_buffer *Raster, uint16 Which = 2)
-{
- uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
- uint8 *Row2 = ((uint8 *)Raster->EffectBuffer);
- uint32 bytes = 0;
- __m128i Zero = _mm_setzero_si128();
- while (bytes <= Raster->Height*Raster->Width*4) {
- if (Which == 2 || Which == 0) {
- uint8 *Pixel = (uint8 *)Row + bytes;
- _mm_storeu_si128((__m128i *)Pixel, Zero);
- }
- if (Which == 2 || Which == 1) {
- uint8 *Pixel2 = (uint8 *)Row2 + bytes;
- _mm_storeu_si128((__m128i *)Pixel2, Zero);
- }
- bytes += 16;
- }
-}
-
-// 0 - original -> effect
-// 1 - effect -> original
-internal void
-SSE_CopyToBuffer(pixel_buffer *Raster, uint16 Which)
-{
- uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
- uint8 *Row2 = ((uint8 *)Raster->EffectBuffer);
- uint32 bytes = 0;
- while (bytes <= Raster->Height*Raster->Width*4) {
- uint8 *Pixel = (uint8 *)Row + bytes;
- uint8 *Pixel2 = (uint8 *)Row2 + bytes;
- if (Which == 0) {
- __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel);
- _mm_storeu_si128((__m128i *)Pixel2, OutputPixel);
- } else {
- __m128i OutputPixel = _mm_loadu_si128((__m128i *)Pixel2);
- _mm_storeu_si128((__m128i *)Pixel, OutputPixel);
- }
- bytes += 16;
- }
-}
-
-internal void
-DebugFillSolid(pixel_buffer *Raster, v4 Color)
-{
- __m128i Col = _mm_set1_epi32(ColToUint32(Color));
- uint8 *Row = ((uint8 *)Raster->OriginalBuffer);
- uint32 bytes = 0;
- while (bytes <= Raster->Height*Raster->Width*4) {
- uint8 *Pixel = (uint8 *)Row + bytes;
- _mm_storeu_si128((__m128i *)Pixel, Col);
- bytes += 16;
- }
-}
-
-internal void
-BitmapPackRGB(pixel_buffer *Buffer) {
-#if PACKEDRGB
- Buffer->Pitch = Buffer->Width*Buffer->BytesPerPixel;
-#else
- Buffer->Pitch = Buffer->Width; // each row has only 1 byte, 8 bits, per pixel
- Buffer->Channel = Buffer->Width*Buffer->Height;
-#endif
-#if PACKEDRGB
- Store4x4Chunk(Buffer);
- SSE_CopyToBuffer(Buffer, 1);
- SSE_ClearBuffer(Buffer, 1);
-#else
- Libav_GBRAToRGBA(Buffer);
-#endif
-}
-
-internal pixel_buffer
LoadImage(memory *Memory, char *filename)
{
pixel_buffer Buffer = {};
@@ -340,11 +54,16 @@ LoadImage(memory *Memory, char *filename)
int n = 0;
int h, w;
- Buffer.OriginalBuffer = stbi_load(filename, &w, &h, &n, 4);
- Buffer.EffectBuffer = AllocateMemory(Memory, w * h * Buffer.BytesPerPixel, B_Scratch);
+ void *temp = stbi_load(filename, &w, &h, &n, 4);
+ // printf("%s", stbi_failure_reason());
Buffer.Height = h;
Buffer.Width = w;
- // printf("%s", stbi_failure_reason());
+ CalculateFull(&Buffer);
+ Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel;
+ // TODO(fox): Implement custom malloc in stbi so we don't have to do this.
+ Buffer.OriginalBuffer = MoveImportToBitmap(Memory, &Buffer, temp);
+ stbi_image_free(temp);
+ Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullWidth * Buffer.FullHeight * Buffer.BytesPerPixel, B_Scratch);
BitmapPackRGB(&Buffer);
Buffer.ToUpdate = true;
return Buffer;
@@ -354,16 +73,34 @@ internal pixel_buffer
CreateSolidBitmap(memory *Memory, uint16 Height, uint16 Width, v4 Color) {
pixel_buffer Buffer = {};
Buffer.BytesPerPixel = 4;
- Buffer.OriginalBuffer = AllocateMemory(Memory, Height * Width * Buffer.BytesPerPixel, B_Scratch);
- Buffer.EffectBuffer = AllocateMemory(Memory, Height * Width * Buffer.BytesPerPixel, B_Scratch);
Buffer.Height = Height;
Buffer.Width = Width;
+ CalculateFull(&Buffer);
+ Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel;
+ Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
+ Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
DebugFillSolid(&Buffer, Color);
BitmapPackRGB(&Buffer);
Buffer.ToUpdate = true;
return Buffer;
}
+// Builds a bitmap containing the DebugBitmap() test pattern, already converted
+// to the 4x4-tiled layout.
+//   Memory - allocator for the two pixel buffers (B_Scratch)
+//   Height - bitmap height in pixels (note: height comes before width)
+//   Width  - bitmap width in pixels
+// Returns the buffer by value with ToUpdate set.
+internal pixel_buffer
+CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width)
+{
+    pixel_buffer Bitmap = {};
+    Bitmap.Width = Width;
+    Bitmap.Height = Height;
+    Bitmap.BytesPerPixel = 4;
+
+    // Padded size and pitch have to be known before anything is allocated.
+    CalculateFull(&Bitmap);
+    Bitmap.Pitch = Bitmap.FullWidth*Bitmap.BytesPerPixel;
+
+    Bitmap.OriginalBuffer = AllocateMemory(Memory, Bitmap.FullHeight * Bitmap.FullWidth * Bitmap.BytesPerPixel, B_Scratch);
+    Bitmap.EffectBuffer = AllocateMemory(Memory, Bitmap.FullHeight * Bitmap.FullWidth * Bitmap.BytesPerPixel, B_Scratch);
+
+    // Draw the pattern row-major, then tile it in place.
+    DebugBitmap(&Bitmap);
+    BitmapPackRGB(&Bitmap);
+
+    Bitmap.ToUpdate = true;
+    return Bitmap;
+}
+
internal void
DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memory *Memory, sdl_input Input, project_state *State,
@@ -435,144 +172,6 @@ DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memor
}
}
-internal void
-DebugBitmap(pixel_buffer *Raster)
-{
-#if 0
- for (uint32 Y = 0; Y < Raster->Height; Y+=2) {
- for (uint32 X = 0; X < Raster->Width; X+=32) {
- for (uint32 pp = 0; pp < 4; pp++) {
- uint32 Increment = ((uint32)Raster->Width*Y*4) + X + pp*8;
- uint32 Increment2 = ((uint32)Raster->Width*(Y+1)*4) + X + pp*8;
- uint8 *TexPTR = ((uint8 *)Raster->OriginalBuffer + Increment);
- uint8 *TexPTR2 = ((uint8 *)Raster->OriginalBuffer + Increment2);
- uint8 *TexPTR3 = ((uint8 *)Raster->OriginalBuffer + Increment + 4);
- uint8 *TexPTR4 = ((uint8 *)Raster->OriginalBuffer + Increment2 + 4);
- if (pp == 0) {
- // *(uint32 *)TexPTR = 0x5f5e5d5c;
- // *(uint32 *)TexPTR2 = 0x4f4e4d4c;
- // *(uint32 *)TexPTR3 = 0x3f3e3d3c;
- // *(uint32 *)TexPTR4 = 0x2f2e2d2c;
- // *(uint32 *)TexPTR3 = 0xaaaaaaaa;
- // *(uint32 *)TexPTR4 = 0xaaaaaaaa;
- *(uint32 *)TexPTR = 0xcccaccc1;
- *(uint32 *)TexPTR2 = 0xdddaddd1;
- *(uint32 *)TexPTR3 = 0xeeeaeee1;
- *(uint32 *)TexPTR4 = 0xfffafff1;
- } else if (pp == 1) {
- // *(uint32 *)TexPTR = 0xb2a2b1a1;
- // *(uint32 *)TexPTR = 0xd2c2d1c1;
- // *(uint32 *)TexPTR3 = 0xbbaabbaa;
- // *(uint32 *)TexPTR4 = 0xddccddcc;
- *(uint32 *)TexPTR = 0xccccccc2;
- *(uint32 *)TexPTR2 = 0xddddddd2;
- *(uint32 *)TexPTR3 = 0xeeeeeee2;
- *(uint32 *)TexPTR4 = 0xfffffff2;
- } else if (pp == 2) {
- *(uint32 *)TexPTR = 0xccccccc3;
- *(uint32 *)TexPTR2 = 0xddddddd3;
- *(uint32 *)TexPTR3 = 0xeeeeeee3;
- *(uint32 *)TexPTR4 = 0xfffffff3;
- } else {
- *(uint32 *)TexPTR = 0xccccccc4;
- *(uint32 *)TexPTR2 = 0xddddddd4;
- *(uint32 *)TexPTR3 = 0xeeeeeee4;
- *(uint32 *)TexPTR4 = 0xfffffff4;
- }
- }
- }
- }
-#endif
-#if 0
- uint32 Channel = (Raster->Width * Raster->Height)*4;
- for (uint32 Y = 0; Y < Raster->Height; Y+=2) {
- for (uint32 X = 0; X < Raster->Width; X+=2) {
- uint8 *TopL = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*Y*4) + X);
- uint8 *TopL2 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*Y*4) + X + Channel);
- uint8 *TopL3 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*Y*4) + X + Channel*2);
- uint8 *TopR = TopL + 1;
- uint8 *TopR2 = TopL2 + 1;
- uint8 *TopR3 = TopL3 + 1;
- uint8 *BotL = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X);
- uint8 *BotL2 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X + Channel);
- uint8 *BotL3 = ((uint8 *)Raster->OriginalBuffer + (Raster->Width*(Y+1)*4) + X + Channel*2);
- uint8 *BotR = BotL + 1;
- uint8 *BotR2 = BotL2 + 1;
- uint8 *BotR3 = BotL3 + 1;
-
- *TopL = 0xff;
- *TopL2 = 0x00;
- *TopL3 = 0x00;
- *TopR = 0xcc;
- *TopR2 = 0xff;
- *TopR3 = 0x00;
- *BotL = 0x55;
- *BotL2 = 0x00;
- *BotL3 = 0xff;
- *BotR = 0x00;
- *BotR2 = 0xff;
- *BotR3 = 0xff;
- }
- }
-#endif
-#if 1
- uint32 Channel = (Raster->Width * Raster->Height);
- uint32 Width = 10;
- uint8 inc = 0;
- uint8 incY = 0;
- for (uint32 Y = 0; Y < Raster->Height; Y+=1) {
- for (uint32 X = 0; X < Width; X+=1) {
- uint8 *Pix = ((uint8 *)Raster->OriginalBuffer + (Raster->Pitch*Y) + X);
- uint8 *Pix2 = ((uint8 *)Raster->OriginalBuffer + Channel + (Raster->Pitch*Y) + X);
- // if (X == 0 && Y == 1) {
- // *Pix++ = 0xaa;
- // inc++;
- // } else if (X == 0 && Y == 2) {
- // *Pix++ = 0xbb;
- // inc++;
- // } else if (X == 0 && Y == 3) {
- // *Pix++ = 0xcc;
- // inc++;
- // } else {
- *Pix++ = inc++;
- *Pix2++ = incY;
- // }
- }
- incY++;
- }
-#endif
-
- for (uint32 Y = 0; Y < Raster->Height; Y+=2) {
- for (uint32 X = 0; X < Raster->Width; X+=32) {
- uint32 Channel = (Raster->Width * Raster->Height)*4;
- for (int16 i = 0; i < 4; i++) {
- uint32 Increment = (Raster->Width*Y*4) + X + Channel*i;
- uint32 Increment2 = (Raster->Width*(Y+1)*4) + X + Channel*i;
- uint8 *TexPTR = ((uint8 *)Raster->OriginalBuffer + Increment);
- uint8 *Pixel = ((uint8 *)Raster->EffectBuffer + Increment);
- uint8 *TexPTR2 = ((uint8 *)Raster->OriginalBuffer + Increment2);
- uint8 *Pixel2 = ((uint8 *)Raster->EffectBuffer + Increment2);
- __m256i T1 = _mm256_loadu_si256((__m256i *)TexPTR);
- __m256i T2 = _mm256_loadu_si256((__m256i *)TexPTR2);
- __m256i pp = _mm256_unpackhi_epi16(T1, T2);
- __m256i pp2 = _mm256_unpacklo_epi16(T1, T2);
- __m256i pp3 = _mm256_unpacklo_epi64(pp2, pp);
- __m256i pp4 = _mm256_unpackhi_epi64(pp2, pp);
- __m256i T4 = _mm256_permute2x128_si256(pp2, pp, 32);
- __m256i T5 = _mm256_permute2x128_si256(pp2, pp, 53);
- _mm256_storeu_si256((__m256i *)Pixel, T1);
- _mm256_storeu_si256((__m256i *)Pixel2, T2);
- }
- }
- }
- // _mm256_unpackhi_epi8
- // for (int Y = 0; Y < Raster.Height; Y+=2) {
- // for (int X = 0; X < Raster.Width; X+=2) {
- // uint8 *Row = ((uint8 *)UIBuffer->OriginalBuffer +
- // }
- // }
-}
-
internal property_channel
InitFloatProperty(char *Name, real32 Val, real32 ScrubVal, real32 MinVal = PROPERTY_REAL_MIN, real32 MaxVal = PROPERTY_REAL_MAX) {
property_channel Property = {};
@@ -700,6 +299,17 @@ CreateSolidLayer(project_data *File, memory *Memory, uint16 Width, uint16 Height
return Layer;
}
+// Creates a new layer whose image source is the debug test-pattern bitmap.
+//   File   - project the layer is added to
+//   Memory - allocator for the layer's source data and pixel buffers
+//   Width  - bitmap width in pixels
+//   Height - bitmap height in pixels
+// Returns the new layer.
+internal project_layer *
+CreateDebugLayer(project_data *File, memory *Memory, uint16 Width, uint16 Height)
+{
+    project_layer *Layer = CreateLayer(File, Memory);
+    Layer->RenderInfo = AllocateMemory(Memory, sizeof(image_source), P_SourceData);
+    image_source *Source = (image_source *)Layer->RenderInfo;
+    // CreateDebugBitmap takes (Memory, Height, Width) -- height first. Passing
+    // Width, Height in that order produced a transposed bitmap.
+    Source->Raster = CreateDebugBitmap(Memory, Height, Width);
+    Layer->SourceType = source_image;
+    return Layer;
+}
+
internal void
CreateDemoScene(project_data *File, memory *Memory)
{
@@ -708,7 +318,7 @@ CreateDemoScene(project_data *File, memory *Memory)
Layer1->y.CurrentValue.f = 720/2;
Layer1->StartFrame = 0;
Layer1->EndFrame = File->EndFrame;
- project_layer *Layer2 = CreateSolidLayer(File, Memory, 500, 500, V4(0.0, 1.0, 0.4, 1.0));
+ project_layer *Layer2 = CreateSolidLayer(File, Memory, 499, 503, V4(0.0, 1.0, 0.4, 1.0));
Layer2->x.CurrentValue.f = 1280/2;
Layer2->y.CurrentValue.f = 720/2;
Layer2->StartFrame = 0;
@@ -717,7 +327,7 @@ CreateDemoScene(project_data *File, memory *Memory)
ManualKeyframeInsertF(&Layer2->rotation, Memory, 50, 360);
Layer2->rotation.IsToggled = true;
Layer2->scale.IsToggled = true;
- project_layer *Layer3 = CreateSolidLayer(File, Memory, 160, 160, V4(1.0, 0.3, 0.2, 1.0));
+ project_layer *Layer3 = CreateSolidLayer(File, Memory, 157, 163, V4(1.0, 0.3, 0.2, 1.0));
Layer3->x.CurrentValue.f = 1280/4;
Layer3->y.CurrentValue.f = 720/4;
Layer3->opacity.CurrentValue.f = 0.5f;
diff --git a/debug.h b/debug.h
index 6128627..4a6891e 100644
--- a/debug.h
+++ b/debug.h
@@ -28,6 +28,8 @@ struct project_debug
char *String[6];
uint32 WatchedProperties;
bool32 ToggleWindow;
+
+ bool32 ToggleRenders;
};
global_variable project_debug Debug;
diff --git a/effects.cpp b/effects.cpp
index 5532a97..733e4d6 100644
--- a/effects.cpp
+++ b/effects.cpp
@@ -286,7 +286,7 @@ AddEffect(project_layer *Layer, memory *Memory, uint16 EffectListIndex)
}
internal void
-SSE_CopyToBuffer(pixel_buffer *, uint16 asda = 0);
+CopyToBuffer(pixel_buffer *, uint16 asda = 0);
internal void
UpdateEffects(project_layer *Layer, memory *Memory)
@@ -296,7 +296,7 @@ UpdateEffects(project_layer *Layer, memory *Memory)
Source->Raster.EffectBuffer = AllocateMemory(Memory, Source->Raster.Width * Source->Raster.Height * Source->Raster.BytesPerPixel,
B_Scratch);
}
- SSE_CopyToBuffer(&Source->Raster);
+ CopyToBuffer(&Source->Raster);
for (int i = 0; i < Layer->NumberOfEffects; i++)
{
if (Layer->Effect[i]->IsActive)
diff --git a/main.cpp b/main.cpp
index 59d0e3a..3a517be 100644
--- a/main.cpp
+++ b/main.cpp
@@ -105,7 +105,9 @@ global_variable uint32 volatile CompletedJobs;
global_variable uint32 volatile NextEntryToDo;
global_variable uint32 volatile EntryCount;
global_variable bool32 IsRendering = false;
-global_variable bool32 AVXEnabled = true;
+global_variable bool32 D = true;
+global_variable instruction_mode InstructionMode = scalar_only;
+
render_entry Entries[256];
@@ -122,6 +124,7 @@ SDL_sem *Semaphore;
#endif
#include "prenderer.cpp"
#include "video.cpp"
+#include "bitmap_calls.cpp"
#include "createcalls.cpp"
#include "my_imgui_widgets.cpp"
@@ -132,7 +135,8 @@ MainFunction(main_sdl *Main, memory *Memory,
project_state *State, project_data *File,
cache_pool *Cache, pixel_buffer *CompBuffer)
{
- SSE_ClearBuffer(CompBuffer);
+ ClearBuffer(CompBuffer, CompBuffer->OriginalBuffer);
+ ClearBuffer(CompBuffer, CompBuffer->EffectBuffer);
for (int i = 0; i < File->NumberOfLayers; i++) {
project_layer *Layer = File->Layer[i];
if (Layer->RenderInfo) {
@@ -301,14 +305,16 @@ int main(int argc, char *argv[]) {
InitMemoryTable(&GlobalMemory, &Memory, 10 * 1024 * 1024, F_Strings, "Strings");
InitMemoryTable(&GlobalMemory, &Memory, 1024 * 1024 * 1024, B_Scratch, "Scratch buffer");
- if (!SDL_HasAVX2()) {
- AVXEnabled = false;
- printf("CPU does not have AVX2!");
- return -1;
- }
+ project_state State = {};
+ if (SDL_HasSSE2()) {
+ InstructionMode = sse_enabled;
+ }
+ if (SDL_HasAVX2()) {
+ InstructionMode = avx_enabled;
+ }
- project_state State = {};
+ InstructionMode = scalar_only;
project_data File = {};
File.Width = 1280;
@@ -339,7 +345,6 @@ int main(int argc, char *argv[]) {
// CreateLayerFromSource(&File, &State, &Memory, File.Source[0]);
// CreateLayerFromSource(&File, &State, &Memory, File.Source[1]);
-#if 1
// shm_unlink("/testl");
// int fd = shm_open("/testl", O_CREAT | O_EXCL | O_RDWR,
// S_IRUSR | S_IWUSR);
@@ -362,17 +367,18 @@ int main(int argc, char *argv[]) {
// if (sem_init(&shmp->sem2, 1, 0) == -1)
// Assert(0);
- // CreateLayer(&File, &Memory);
- // CreateRenderInfo(File.Layer[1], &Memory, File, video, "./asset/24.mp4");
- // File.Layer[1]->Name = "yuyu";
- // File.Layer[1]->StartFrame = 0;
- // File.Layer[1]->EndFrame = 65;
-#else
- CreateDebugLayer(&File, &Memory, 12, 8);
- File.Layer[0]->Name = "debug";
- File.Layer[0]->StartFrame = 0;
- File.Layer[0]->EndFrame = 65;
-#endif
+ // CreateLayerFromSource(&File, &State, &Memory, "../asset/24.mp4");
+ // project_layer *Layer1 = CreateDebugLayer(&File, &Memory, 9, 14);
+ // project_layer *Layer1 = CreateSolidLayer(&File, &Memory, 9, 13, V4(1.0, 1.0, 1.0, 1.0));
+ // Layer1->x.CurrentValue.f = 7;
+ // Layer1->y.CurrentValue.f = 4;
+ // Layer1->StartFrame = 0;
+ // Layer1->EndFrame = File.EndFrame;
+
+ // CreateDebugLayer(&File, &Memory, 12, 8);
+ // File.Layer[0]->Name = "debug";
+ // File.Layer[0]->StartFrame = 0;
+ // File.Layer[0]->EndFrame = 65;
// CreateLayer(&File, &Memory);
@@ -418,7 +424,7 @@ int main(int argc, char *argv[]) {
// AddEffect(File.Layer[0], &Memory, 0);
// AddEffect(File.Layer[0], &Memory, 0);
- for (int i = 0; i < 3; i++)
+ // for (int i = 0; i < 3; i++)
// CreateLayer(&File, &Memory);
// DebugPrintMemoryUsage(Memory);
@@ -472,8 +478,10 @@ int main(int argc, char *argv[]) {
SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 24);
SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 8);
SDL_WindowFlags window_flags = (SDL_WindowFlags)(SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
+#if DEBUG
// uint32 ScreenSize[2] = {2560/1.2, 1600/1.2};
- // real32 ScreenSize[2] = {3840/1.2, 2160/1.2};
+ real32 ScreenSize[2] = {3840/1.2, 2160/1.2};
+#else
real32 ScreenSize[2];
SDL_DisplayMode current;
int windowtest = SDL_GetCurrentDisplayMode(0, &current);
@@ -484,6 +492,7 @@ int main(int argc, char *argv[]) {
ScreenSize[0] = 1920;
ScreenSize[1] = 1080;
}
+#endif
SDL_Window* window = SDL_CreateWindow("Event Tester", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, ScreenSize[0], ScreenSize[1], window_flags);
SDL_GLContext gl_context = SDL_GL_CreateContext(window);
SDL_GL_MakeCurrent(window, gl_context);
@@ -513,8 +522,8 @@ int main(int argc, char *argv[]) {
GLuint textureID;
glGenTextures(1, &textureID);
glBindTexture(GL_TEXTURE_2D, textureID);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); // This is required on WebGL for non power-of-two textures
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); // Same
#if defined(GL_UNPACK_ROW_LENGTH) && !defined(__EMSCRIPTEN__)
@@ -577,6 +586,7 @@ int main(int argc, char *argv[]) {
if (State.UpdateFrame && !IsRendering) {
MainFunction(0, &Memory, &State, &File, &Cache, &CompBuffer);
State.UpdateFrame = 0;
+ OutputToViewport(&CompBuffer, &State, textureID);
}
#if THREADED
@@ -585,12 +595,7 @@ int main(int argc, char *argv[]) {
CheckQueue(RenderInfo, 8);
}
if (CompletedJobs == 16) {
-#if PACKEDRGB
- Unpack4x4Chunk(&CompBuffer);
- // SSE_CopyToBuffer(CompBuffer);
-#else
- PackBitmapRGB(&CompBuffer);
-#endif
+ Convert4x4Chunk(&CompBuffer, 1);
EndRenderState(&State);
glBindTexture(GL_TEXTURE_2D, textureID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE,
@@ -603,16 +608,6 @@ int main(int argc, char *argv[]) {
}
}
#else
-#if PACKEDRGB
- Unpack4x4Chunk(&CompBuffer);
- // SSE_CopyToBuffer(CompBuffer);
-#else
- PackBitmapRGB(&CompBuffer);
-#endif
- EndRenderState(&State);
- glBindTexture(GL_TEXTURE_2D, textureID);
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE,
- CompBuffer.EffectBuffer);
#endif
ImGui::Render();
diff --git a/main.h b/main.h
index 801b13e..d5a0126 100644
--- a/main.h
+++ b/main.h
@@ -4,17 +4,25 @@ enum source_type {
source_image
};
+enum instruction_mode { // Selects which layer renderer RenderLayers() dispatches to.
+ scalar_only, // Fallback_RenderLayer: plain scalar path.
+ sse_enabled, // SSE2_RenderLayer: 4 pixels per iteration.
+ avx_enabled // AVX2_RenderLayer.
+};
+
struct pixel_buffer {
void *OriginalBuffer; // The layer renderers composite into this buffer; Convert4x4Chunk uses it as its source.
void *EffectBuffer; // Convert4x4Chunk's destination; layers are sampled from their raster's EffectBuffer.
void *Scratch;
uint16 Width; // Unpadded width in pixels.
uint16 Height; // Unpadded height in pixels.
+ // IMPORTANT(fox): Since we're storing 4x4 chunks, I'm opting to pad out each
+ // dimension with an extra 1-3 pixels to make our lookup functions simpler.
+ // This has the cost of extra RAM, but it's a minuscule amount (0.2% extra
+ // data for a worst-case 1080p 16bpc frame, or 140 kb).
+ uint16 FullWidth; // Padded width; used as the row pitch of the 4x4-packed layout.
+ uint16 FullHeight; // Padded height.
uint16 Pitch;
-#if PACKEDRGB
-#else
- uint32 Channel;
-#endif
uint16 BytesPerPixel;
bool32 ToUpdate; // Set whenever effects or video frames need to be updated.
};
@@ -246,6 +254,8 @@ struct transform_info {
real32 YAxisPY;
real32 LayerWidth;
real32 LayerHeight;
+ uint32 FullLayerWidth;
+ uint32 FullLayerHeight;
real32 LayerOpacity;
real32 OriginX;
real32 OriginY;
diff --git a/my_imgui_widgets.cpp b/my_imgui_widgets.cpp
index c199aa4..1190430 100644
--- a/my_imgui_widgets.cpp
+++ b/my_imgui_widgets.cpp
@@ -196,6 +196,17 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, pixel_buffer Com
UI->CompPos.x += io.MouseDelta.x;
UI->CompPos.y += io.MouseDelta.y;
}
+ // if (IsActive && ImGui::IsMouseDown(ImGuiMouseButton_Right))
+ // {
+ // Debug.ToggleRenders = true;
+ // }
+ ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonRight);
+ if (ImGui::BeginPopup("context")) {
+ if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != scalar_only)) { InstructionMode = scalar_only; }
+ if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != sse_enabled)) { InstructionMode = sse_enabled; }
+ if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != avx_enabled)) { InstructionMode = avx_enabled; }
+ ImGui::EndPopup();
+ }
if (IsActive && ImGui::IsMouseDragging(ImGuiMouseButton_Left, -1.0f) && ImGui::IsKeyDown(ImGuiKey_Z))
{
real32 Distance = io.MouseDelta.x + io.MouseDelta.y;
@@ -999,10 +1010,11 @@ ImGui_ProcessInputs(project_data *File, project_state *State, pixel_buffer *Comp
}
#if DEBUG
- if (ImGui::IsKeyPressed(ImGuiKey_E)) {
- SwitchBool(AVXEnabled);
- State->UpdateFrame = true;
- }
+ if (ImGui::IsKeyPressed(ImGuiKey_Z))
+ {
+ // SwitchBool(D);
+ // State->UpdateFrame = true;
+ }
if (ImGui::IsKeyPressed(ImGuiKey_M))
{
Debug.Markers[Debug.MarkerIndex] = File->CurrentFrame;
diff --git a/prenderer.cpp b/prenderer.cpp
index 4d4152d..356ecd7 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -7,11 +7,14 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi
internal void
AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
internal void
+SSE2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
+internal void
Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
internal bool32
CheckQueue(render_queue RenderInfo, uint16 Index);
+// for the anchor point moving UI
internal void
CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir)
{
@@ -76,12 +79,14 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer)
TransformInfo.YAxisPY = YLengthSq*YAxis.y;
TransformInfo.LayerWidth = (real32)Source->Raster.Width;
TransformInfo.LayerHeight = (real32)Source->Raster.Height;
+ TransformInfo.FullLayerWidth = Source->Raster.FullWidth;
+ TransformInfo.FullLayerHeight = Source->Raster.FullHeight;
TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f;
TransformInfo.OriginX = Origin.x;
TransformInfo.OriginY = Origin.y;
TransformInfo.BufferPitch = Buffer->Pitch;
TransformInfo.LayerPitch = Source->Raster.Pitch;
- TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY};
+ TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1};
TransformInfo.SourceBuffer = Source->Raster.EffectBuffer;
@@ -115,6 +120,19 @@ EndRenderState(project_state *State)
}
internal void
+RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
+    // Draws every layer listed in State->LayersToRender into the comp buffer,
+    // restricted to RenderRegion, using the renderer chosen by InstructionMode.
+    for (int16 Slot = 0; Slot < RenderInfo->State->NumberOfLayersToRender; Slot++) {
+        int16 LayerIndex = RenderInfo->State->LayersToRender[Slot];
+        project_layer *Layer = RenderInfo->File->Layer[LayerIndex];
+        switch (InstructionMode) {
+            case avx_enabled: {
+                AVX2_RenderLayer(Layer->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+            } break;
+            case sse_enabled: {
+                SSE2_RenderLayer(Layer->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+            } break;
+            default: {
+                // scalar_only (or any unexpected value) takes the scalar path.
+                Fallback_RenderLayer(Layer->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+            } break;
+        }
+    }
+}
+
+internal void
QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State)
{
IsRendering = true;
@@ -163,18 +181,7 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S
// DEBUG_CycleCountStart(3);
rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
- for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) {
- int16 Idx = RenderInfo.State->LayersToRender[i];
-#if ARM
- RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion);
-#else
- // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
- if (AVXEnabled)
- AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
- else
- Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
-#endif
- }
+ RenderLayers(&RenderInfo, RenderRegion);
// DEBUG_CycleCountEnd(3);
// Debug.ExecutionAmount[4] += 1280*720;
@@ -378,6 +385,7 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi
}
#else
+
internal void
AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
{
@@ -397,7 +405,9 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
__m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
- __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4);
+ __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
+ __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
+ __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
__m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
__m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
__m256 OriginX = _mm256_set1_ps(T.OriginX);
@@ -451,7 +461,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
uint32 XLookup = (X >> 2)*16 + (X % 4);
- uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4;
+ uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;
@@ -461,6 +471,8 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
_mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+ // If all of the pixels are zeroed in the mask (aka fall outside
+ // the UV lookup), we can skip the iteration.
if (_mm256_movemask_epi8(LayerMask))
{
U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
@@ -469,9 +481,10 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
__m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
__m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
- __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei);
+ __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
- __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei);
+ __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+ // NOTE(fox): The comparison is for when we're on the last pixel.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
__m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
@@ -484,11 +497,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
_mm256_and_si256(TexXInt, BottomTwoBits));
- __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i),
+ __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
_mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
__m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
_mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
- __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i),
+ __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
_mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
__m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
@@ -571,13 +584,239 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
- _mm256_storeu_si256((__m256i *)Pixel, PixelsMask);
+ // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
+ _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
}
PixelX = _mm256_add_ps(PixelX, Four);
}
}
}
+
+// Lane-wise 32-bit multiply (keeps the low 32 bits of each product) built only
+// from SSE2 instructions. _mm_mullo_epi32 would do this directly, but it is an
+// SSE4.1 instruction and SSE2_RenderLayer is selected whenever SDL_HasSSE2()
+// is true, so it must not be used on this path.
+internal __m128i
+SSE2_MulLo32(__m128i A, __m128i B)
+{
+    // _mm_mul_epu32 multiplies lanes 0 and 2 into two 64-bit products.
+    __m128i Even = _mm_mul_epu32(A, B);
+    // Shift lanes 1 and 3 down into positions 0 and 2 and repeat.
+    __m128i Odd = _mm_mul_epu32(_mm_srli_si128(A, 4), _mm_srli_si128(B, 4));
+    // Gather the low halves of each product and interleave them back in order.
+    return _mm_unpacklo_epi32(_mm_shuffle_epi32(Even, _MM_SHUFFLE(0, 0, 2, 0)),
+                              _mm_shuffle_epi32(Odd, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+
+// Composites one layer into Buffer->OriginalBuffer, 4 horizontal pixels per
+// iteration, using only SSE2 instructions.
+//
+// T            - per-layer transform: UV axes, origin, layer dimensions, clip
+//                rectangle and the 4x4-packed source bitmap (T.SourceBuffer).
+// Buffer       - destination comp buffer, stored in 4x4-packed chunks whose
+//                row pitch is Buffer->FullWidth.
+// RenderRegion - region of the comp this call is allowed to touch.
+//
+// Each destination pixel is mapped to layer UV space, bilinearly sampled from
+// the source, alpha-blended with the destination when needed, and written
+// back only for lanes whose UV falls inside [0, 1].
+internal void
+SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
+{
+    rectangle LayerBounds = ClipRectangle( T.ClipRect,
+                                           RenderRegion );
+    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
+    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+    LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+    Assert(LayerBounds.Max.x <= Buffer->Width);
+    Assert(LayerBounds.Max.y <= Buffer->Height);
+
+    __m128 XAxisPX = _mm_set1_ps(T.XAxisPX);
+    __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
+    __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
+    __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);
+
+    __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
+    __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1);
+    // Row pitch (in pixels) of one band of 4x4 chunks in the padded source.
+    __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4);
+    __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
+    __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
+    __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity);
+    __m128 OriginX = _mm_set1_ps(T.OriginX);
+    __m128 OriginY = _mm_set1_ps(T.OriginY);
+
+    __m128 One = _mm_set1_ps(1);
+    __m128 Zero = _mm_set1_ps(0);
+    __m128i Onei = _mm_set1_epi32(1);
+    __m128 Four = _mm_set1_ps(4);
+    __m128i FF = _mm_set1_epi32(0xFF);
+    __m128i BottomTwoBits = _mm_set1_epi32(0x03);
+    __m128 Reg255 = _mm_set1_ps(255.0f);
+    __m128i Int255 = _mm_set1_epi32(255);
+    __m128 Norm255 = _mm_set1_ps(1/255.0f);
+
+    // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
+
+    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+    {
+        __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x,
+                                    (real32)LayerBounds.Min.x+1,
+                                    (real32)LayerBounds.Min.x+2,
+                                    (real32)LayerBounds.Min.x+3);
+
+        __m128 PixelY = _mm_set1_ps((real32)Y);
+        __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);
+
+        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+        {
+            IACA_START;
+
+            __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
+
+            // Address of this row of 4 pixels inside the destination's 4x4 chunk.
+            uint32 XLookup = (X >> 2)*16 + (X % 4);
+            uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
+            uint32 PixelToSeek = XLookup + YLookup;
+            uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
+            __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
+            __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
+
+            __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
+                                                            _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+
+            // Skip the iteration when no lane falls inside the layer's UV range.
+            if (_mm_movemask_epi8(LayerMask))
+            {
+                // Clamp so that masked-out lanes still produce in-bounds lookups.
+                U = _mm_max_ps(_mm_min_ps(One, U), Zero);
+                V = _mm_max_ps(_mm_min_ps(One, V), Zero);
+
+                __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
+                __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
+                // The +1 neighbour is only advanced when we are not already on the last pixel.
+                __m128i TexXInt = _mm_cvttps_epi32(TexXFull);
+                __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei));
+                __m128i TexYInt = _mm_cvttps_epi32(TexYFull);
+                __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei));
+
+                // Bilinear weights.
+                __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt));
+                __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
+                __m128 TexXInv = _mm_sub_ps(One, TexX);
+                __m128 TexYInv = _mm_sub_ps(One, TexY);
+                __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY);
+                __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv);
+                __m128 TexBoth = _mm_mul_ps(TexY, TexX);
+                __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv);
+
+                // 4x4-packed addressing: (x >> 2)*16 + (x & 3) and
+                // (y >> 2)*(FullWidth*4) + (y & 3)*4. The constant multiplies
+                // are done with shifts and the variable one with SSE2_MulLo32
+                // so that nothing here needs SSE4.1.
+                __m128i XLookup = _mm_add_epi32(_mm_slli_epi32(_mm_srli_epi32(TexXInt, 2), 4),
+                                                _mm_and_si128(TexXInt, BottomTwoBits));
+                __m128i YLookup = _mm_add_epi32(SSE2_MulLo32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
+                                                _mm_slli_epi32(_mm_and_si128(TexYInt, BottomTwoBits), 2));
+                __m128i XLookupPlusOne = _mm_add_epi32(_mm_slli_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), 4),
+                                                       _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
+                __m128i YLookupPlusOne = _mm_add_epi32(SSE2_MulLo32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
+                                                       _mm_slli_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), 2));
+
+                __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup);
+                __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup);
+                __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne);
+                __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne);
+
+                // SSE lacks gathering, so we have no choice but to manually
+                // look up each pixel's four bilinear samples in scalar.
+
+                uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL);
+                uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
+                uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
+                uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
+                uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4);
+                uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
+                uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
+                uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);
+
+                uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4));
+                uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
+                uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
+                uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
+                uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
+                uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
+                uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
+                uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);
+
+                uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8));
+                uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
+                uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
+                uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
+                uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
+                uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
+                uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
+                uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);
+
+                uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12));
+                uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
+                uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
+                uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
+                uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
+                uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
+                uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
+                uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);
+
+                __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3);
+                __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
+                __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
+                __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);
+
+                // Split each sample into its four 8-bit channels (R in the low byte).
+                __m128i R_TexTL = _mm_and_si128( PixelsTL, FF);
+                __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF);
+                __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF);
+                __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF);
+
+                __m128i R_TexTR = _mm_and_si128( PixelsTR, FF);
+                __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF);
+                __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF);
+                __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF);
+
+                __m128i R_TexBL = _mm_and_si128( PixelsBL, FF);
+                __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF);
+                __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF);
+                __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF);
+
+                __m128i R_TexBR = _mm_and_si128( PixelsBR, FF);
+                __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF);
+                __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF);
+                __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF);
+
+                __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)),
+                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))),
+                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)),
+                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR))));
+                __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)),
+                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))),
+                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)),
+                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR))));
+                __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)),
+                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))),
+                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)),
+                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR))));
+                __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)),
+                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))),
+                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)),
+                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR))));
+
+                // Scale the sampled alpha down by the fraction held in T.LayerOpacity.
+                A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity));
+
+                __m128i R_Out, G_Out, B_Out, A_Out;
+                // Only do alpha blending if a pixel's value doesn't equal 255
+                if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255)))
+                {
+                    __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255);
+                    __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255);
+
+                    __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
+                    __m128i R_Dest = _mm_and_si128( DestPixel, FF);
+                    __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF);
+                    __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF);
+                    __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF);
+
+                    R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha)));
+                    G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha)));
+                    B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha)));
+                    A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
+                }
+                else
+                {
+                    R_Out = _mm_cvtps_epi32(R_PixelBlend);
+                    G_Out = _mm_cvtps_epi32(G_PixelBlend);
+                    B_Out = _mm_cvtps_epi32(B_PixelBlend);
+                    A_Out = _mm_cvtps_epi32(A_PixelBlend);
+                }
+
+                __m128i OutputPixel = _mm_or_si128(
+                        _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
+                        _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
+                // Only lanes inside the layer are written; the rest of the destination is untouched.
+                _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel);
+            }
+            PixelX = _mm_add_ps(PixelX, Four);
+        }
+    }
+}
+
+
#endif
internal void
@@ -595,25 +834,17 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
// uint32 pp2 = 3;
// bool32 real = true;
- for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2)
+ for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
{
-#if PACKEDRGB
-#else
- uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x;
-#endif
- real32 StartVectorY[2];
- StartVectorY[0] = (real32)Y - T.OriginY;
- StartVectorY[1] = (real32)(Y+1) - T.OriginY;
+ real32 StartVectorY = (real32)Y - T.OriginY;
for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
{
- for (int16 i = 0; i < 2; i++)
- {
IACA_START;
real32 StartVectorX = X - T.OriginX;
- real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY);
- real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY);
+ real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
+ real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
real32 TexXFull = U * T.LayerWidth;
@@ -631,7 +862,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
real32 TexBoth = TexY * TexX;
real32 TexBothInv = TexXInv * TexYInv;
-#if PACKEDRGB
#if 0
uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel);
uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel);
@@ -641,35 +871,34 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint32 PixelC = *(uint32 *)TexPTR1;
uint32 PixelD = *((uint32 *)TexPTR1 + 1);
#else
- uint16 LX, LY;
uint32 XLookup, YLookup, PixelToSeek;
- // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's!
- LX = TexXInt;
- LY = TexYInt;
+ // TODO(fox): Anti-aliasing on edges
+ uint16 LX = TexXInt;
+ uint16 LY = TexYInt;
+ uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
+ uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);
+
+ // TODO(fox): Be careful with the BytesPerPixel here! It's the
+ // buffer's, not the layer's (currently everything is 4 bytes
+ // per pixel).
XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt+1;
- LY = TexYInt;
- XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
+ YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt;
- LY = TexYInt+1;
XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt+1;
- LY = TexYInt+1;
- XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
+ YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
#endif
@@ -693,30 +922,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 TexAB = ((PixelB >> 24) & 0xFF);
uint8 TexAC = ((PixelC >> 24) & 0xFF);
uint8 TexAD = ((PixelD >> 24) & 0xFF);
-#else
- uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt);
- uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt);
-
- uint8 TexRA = *TexPTR0;
- uint8 TexRB = *(TexPTR0 + 1);
- uint8 TexRC = *TexPTR1;
- uint8 TexRD = *(TexPTR1 + 1);
-
- uint8 TexGA = *(TexPTR0 + Channel);
- uint8 TexGB = *(TexPTR0 + 1 + Channel);
- uint8 TexGC = *(TexPTR1 + Channel);
- uint8 TexGD = *(TexPTR1 + 1 + Channel);
-
- uint8 TexBA = *(TexPTR0 + Channel*2);
- uint8 TexBB = *(TexPTR0 + 1 + Channel*2);
- uint8 TexBC = *(TexPTR1 + Channel*2);
- uint8 TexBD = *(TexPTR1 + 1 + Channel*2);
-
- uint8 TexAA = *(TexPTR0 + Channel*3);
- uint8 TexAB = *(TexPTR0 + 1 + Channel*3);
- uint8 TexAC = *(TexPTR1 + Channel*3);
- uint8 TexAD = *(TexPTR1 + 1 + Channel*3);
-#endif
real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
+ (TexBothXInv * TexRC) + (TexBoth * TexRD);
@@ -733,9 +938,9 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 B = (uint8)PixelBlendB;
uint8 A = (uint8)PixelBlendA;
-#if PACKEDRGB
XLookup = (X >> 2)*16 + (X % 4);
- YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4;
+ YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
+
// if (real) {
// real = false;
// printf("XLook: %i, YLook: %i\n", XLookup, YLookup);
@@ -748,16 +953,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 G1 = (*Pixel >> 8);
uint8 B1 = (*Pixel >> 16);
uint8 A1 = (*Pixel >> 24);
-#else
- uint8 *RD = Pixel;
- uint8 *GD = Pixel + Buffer->Channel;
- uint8 *BD = Pixel + Buffer->Channel*2;
- uint8 *AD = Pixel + Buffer->Channel*3;
- uint8 R1 = *RD;
- uint8 G1 = *GD;
- uint8 B1 = *BD;
- uint8 A1 = *AD;
-#endif
if (A != 255) {
real32 LayerAlpha = (255 - A) / 255.0f;
@@ -767,23 +962,11 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
A = ClipAdd(A1, A);
}
-#if PACKEDRGB
*Pixel = ((A << 24) |
(B << 16) |
(G << 8) |
(R << 0));
}
- }
- }
-#else
- *RD = R;
- *GD = G;
- *BD = B;
- *AD = A;
- }
- Pixel++;
}
- Row += Buffer->Pitch*2;
-#endif
}
}
diff --git a/threading.cpp b/threading.cpp
index 07584bd..39e7b75 100644
--- a/threading.cpp
+++ b/threading.cpp
@@ -7,10 +7,6 @@ PushRect(rectangle RenderRegion)
SDL_SemPost(Semaphore);
}
-internal void
-AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
-internal void
-Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
internal bool32
CheckQueue(render_queue RenderInfo, uint16 Index)
@@ -22,20 +18,7 @@ CheckQueue(render_queue RenderInfo, uint16 Index)
if (__atomic_compare_exchange_n(&NextEntryToDo, &OriginalEntry, NextEntryToDo + 1, true, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)) {
render_entry *Entry = Entries + OriginalEntry;
Assert(Entry->RenderRegion.Max.x != 0);
- for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) {
- int16 Idx = RenderInfo.State->LayersToRender[i];
-#if ARM
- Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion);
- // RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, Entry->RenderRegion);
-#else
- // printf("(RENDERING) Thread %i, region X%i Y%i\n", Index, Entry->RenderRegion.Min.x/240, Entry->RenderRegion.Min.y/135);
- if (AVXEnabled)
- AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion);
- else
- Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, Entry->RenderRegion);
-#endif
- }
-
+ RenderLayers(&RenderInfo, Entry->RenderRegion); // RenderLayers takes a render_queue pointer; RenderInfo is held by value here.
// printf("(FINISHED) Thread %i, region X%i Y%i\n", Index, Entry->RenderRegion.Min.x/240, Entry->RenderRegion.Min.y/135);
__atomic_add_fetch(&CompletedJobs, 1, __ATOMIC_ACQ_REL);
Result = 1;
diff --git a/video.cpp b/video.cpp
index d39719e..bb3e17e 100644
--- a/video.cpp
+++ b/video.cpp
@@ -190,15 +190,10 @@ InitAV(char *filename, av_info *AV)
av_seek_frame(AV->FileFormatContext, -1, 0, AVSEEK_FLAG_BACKWARD);
};
-#if PACKEDRGB
internal void
-Store4x4Chunk(pixel_buffer *Raster);
+Convert4x4Chunk(pixel_buffer *Raster, uint8);
internal void
-SSE_ClearBuffer(pixel_buffer *Raster, uint16);
-#else
-internal void
-Libav_GBRAToRGBA(pixel_buffer *Raster);
-#endif
+ClearBuffer(pixel_buffer *Raster, void *);
internal int16
LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame)
@@ -261,11 +256,7 @@ LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame)
// NOTE(fox): This function will be replaced in the future.
AV->RGBContext = sws_getContext(AV->VideoFrame->width, AV->VideoFrame->height, (AVPixelFormat)AV->VideoFrame->format,
-#if PACKEDRGB
AV->VideoFrame->width, AV->VideoFrame->height, AV_PIX_FMT_RGBA, SWS_BILINEAR,
-#else
- AV->VideoFrame->width, AV->VideoFrame->height, AV_PIX_FMT_GBRAP, SWS_BILINEAR,
-#endif
NULL, NULL, NULL);
if(!AV->RGBContext) {
@@ -277,13 +268,9 @@ LoadVideoFrame(video_source *Source, memory *Memory, int32 TimelineFrame)
av_frame_unref(AV->VideoFrame);
-#if PACKEDRGB
- Store4x4Chunk(Buffer);
- SSE_CopyToBuffer(Buffer, 1);
- SSE_ClearBuffer(Buffer, 1);
-#else
- Libav_GBRAToRGBA(Buffer);
-#endif
+ Convert4x4Chunk(Buffer, 0);
+ CopyToBuffer(Buffer, 1);
+ ClearBuffer(Buffer, Buffer->EffectBuffer);
return 0;
}