internal void PushRect(rectangle RenderRegion); internal void RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion); internal void AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal void SSE2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal void Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal bool32 CheckQueue(render_queue RenderInfo, uint16 Index); // for the anchor point moving UI internal void CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir) { v2 Result = {}; transform_info TransformInfo; image_source *Source = (image_source *)Layer->RenderInfo; real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180)); real32 s = Layer->scale.CurrentValue.f; if (Dir == 0) { v2 XAxis = V2(cos(Rad), sin(Rad)) * (Value / s); Layer->x.CurrentValue.f += Value; Layer->ax.CurrentValue.f += XAxis.x/Source->Raster.Width; Layer->ay.CurrentValue.f -= XAxis.y/Source->Raster.Height; } else { v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Value / -s); Layer->y.CurrentValue.f += Value; Layer->ax.CurrentValue.f -= YAxis.x/Source->Raster.Width; Layer->ay.CurrentValue.f += YAxis.y/Source->Raster.Height; } } internal transform_info CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer) { transform_info TransformInfo; image_source *Source = (image_source *)Layer->RenderInfo; real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180)); real32 s = Layer->scale.CurrentValue.f; // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s}; v2 XAxis = (Source->Raster.Width * s)*V2(cos(Rad), sin(Rad)); v2 YAxis = (Source->Raster.Height * -s)*V2(sin(Rad), -cos(Rad)); real32 AnchorX = Layer->ax.CurrentValue.f; real32 AnchorY = Layer->ay.CurrentValue.f; v2 Pos = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f}; v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY); real32 XLengthSq = 1.0f 
/ LengthSq(XAxis); real32 YLengthSq = 1.0f / LengthSq(YAxis); int32 MaxX = 0; int32 MaxY = 0; int32 MinX = Buffer->Width; int32 MinY = Buffer->Height; v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis}; for (int i = 0; i < 4; i++) { if (Points[i].x < MinX) { MinX = Points[i].x; } if (Points[i].y < MinY) { MinY = Points[i].y; } if (Points[i].x > MaxX) { MaxX = Points[i].x; } if (Points[i].y > MaxY) { MaxY = Points[i].y; } } TransformInfo.XAxisPX = XLengthSq*XAxis.x; TransformInfo.XAxisPY = XLengthSq*XAxis.y; TransformInfo.YAxisPX = YLengthSq*YAxis.x; TransformInfo.YAxisPY = YLengthSq*YAxis.y; TransformInfo.LayerWidth = (real32)Source->Raster.Width; TransformInfo.LayerHeight = (real32)Source->Raster.Height; TransformInfo.FullLayerWidth = Source->Raster.FullWidth; TransformInfo.FullLayerHeight = Source->Raster.FullHeight; TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f; TransformInfo.OriginX = Origin.x; TransformInfo.OriginY = Origin.y; TransformInfo.BufferPitch = Buffer->Pitch; TransformInfo.LayerPitch = Source->Raster.Pitch; TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1}; TransformInfo.SourceBuffer = Source->Raster.EffectBuffer; return TransformInfo; } internal void EndRenderState(project_state *State) { IsRendering = false; for (int16 i = 0; i < State->NumberOfLayersToRender; i++) { State->LayersToRender[i] = 0; } State->NumberOfLayersToRender = 0; #if THREADED SDL_AtomicSet(&CurrentEntry, 0); SDL_AtomicSet(&QueuedEntries, 0); SDL_AtomicSet(&CompletedEntries, 0); #endif } internal void RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) { int16 Idx = RenderInfo->State->LayersToRender[i]; if (InstructionMode == avx_enabled) AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); else if (InstructionMode == sse_enabled) 
SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); else Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); } } internal void QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State) { IsRendering = true; render_queue RenderInfo = {File, State, CompBuffer}; for (int16 i = 0; i < File->NumberOfLayers; i++) { if (File->Layer[i]->StartFrame <= File->CurrentFrame && File->Layer[i]->EndFrame >= File->CurrentFrame) { File->Layer[i]->TransformInfo = CalculateTransforms(File->Layer[i], CompBuffer); State->LayersToRender[State->NumberOfLayersToRender] = i; State->NumberOfLayersToRender++; } } #if THREADED uint16 TileWidth = CompBuffer->Width / 4; uint16 TileHeight = CompBuffer->Height / 4; DEBUG_CycleCountStart(3); for (int y = 0; y < 4; y++) { for (int x = 0; x < 4; x++) { // if (x == y) { rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y}; if (RenderRegion.Max.x > CompBuffer->Width) RenderRegion.Max.x = CompBuffer->Width; if (RenderRegion.Max.y > CompBuffer->Height) RenderRegion.Max.y = CompBuffer->Height; PushRect(RenderRegion); // } } } #else rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height}; RenderLayers(&RenderInfo, RenderRegion); #endif } #if ARM internal void RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion) { float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x); float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y); float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x); float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y); float32x4_t LayerWidth = vdupq_n_f32(); float32x4_t LayerHeight = vdupq_n_f32(); float32x4_t LayerOpacity = vdupq_n_f32(); float32x4_t OriginX = vdupq_n_f32(Origin.x); float32x4_t OriginY = vdupq_n_f32(Origin.y); float32x4_t One = vdupq_n_f32(1); float32x4_t Zero = vdupq_n_f32(0); float32x4_t Four = vdupq_n_f32(4); int32x4_t FourInt 
= vdupq_n_s32(4);
    int32x4_t EightInt = vdupq_n_s32(8);
    int32x4_t SixteenInt = vdupq_n_s32(16);
    int32x4_t TwentyFourInt = vdupq_n_s32(24);
    float32x4_t Float255 = vdupq_n_f32(255.0f);
    int32x4_t Int255 = vdupq_n_s32(255);
    float32x4_t Norm255 = vdupq_n_f32(1/255.0f);

    // NOTE(review): LayerBounds, Row, Source, LayerPitch and BufferPitch are
    // used below but not declared in this function -- the NEON path looks
    // like work in progress.
    for(int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
    {
        uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x;

        // Four consecutive x coordinates, one per lane.
        real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x,
                                  (real32)LayerBounds.Min.x+1,
                                  (real32)LayerBounds.Min.x+2,
                                  (real32)LayerBounds.Min.x+3};
        float32x4_t PixelX = vld1q_f32(ScalarPixelX);
        float32x4_t PixelY = vdupq_n_f32((real32)Y);
        float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);

        // Advances one pixel per iteration even though PixelX carries four
        // lanes; see the lane-0 mask before the store below.
        for(int16 XI = LayerBounds.Min.x; XI < LayerBounds.Max.x; XI += 1)
        {
            float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);

            // Project the pixel offset onto the layer axes to get U/V.
            float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
            float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));

            // Per-lane mask: all ones where 0 <= U <= 1 and 0 <= V <= 1.
            uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)),
                                     vandq_u32(vcleq_f32(V, One), vcgezq_f32(V)));

            // TODO(fox): Make more efficient with some sort of truncation
            uint32 comp[4];
            vst1q_u32(comp, R);

            // Skip the work when no lane falls inside the layer.
            if (comp[0] || comp[1] || comp[2] || comp[3]) {
                U = vmaxq_f32(vminq_f32(One, U), Zero);
                V = vmaxq_f32(vminq_f32(One, V), Zero);

                float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
                float32x4_t TexYFull = vmulq_f32(V, LayerHeight);
                int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
                int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);

                // fractions
                float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt));
                float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt));
                float32x4_t TexXInv = vsubq_f32(One, TexX);
                float32x4_t TexYInv = vsubq_f32(One, TexY);

                // Bilinear weights; only referenced by `pp` and the disabled
                // #if 0 block below.
                float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
                float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv);
                float32x4_t TexBoth = vmulq_f32(TexY, TexX);
                float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv);

                // Spill the texel coordinates to scalars to form addresses.
                int32 TexXP[4];
                vst1q_s32(TexXP, TexXInt);
                int32 TexYP[4];
                vst1q_s32(TexYP, TexYInt);

                uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32));
                uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32));
                uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32));
                uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32));

                // TexRGBA = vld4_u8(TexPTR0);
                // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0);
                // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4);
                // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8);
                // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14);
                // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3);
                // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7);
                // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11);
                // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15);

                // One register set per pixel (A..D = pixels 0..3). Byte lanes
                // 0/4/8/12 receive that pixel's texel, the texel to its
                // right, the one a row below, and the one diagonally below,
                // each de-interleaved into the four val[] channel vectors.
                uint8x16x4_t TexRGBA_A = {};
                uint8x16x4_t TexRGBA_B = {};
                uint8x16x4_t TexRGBA_C = {};
                uint8x16x4_t TexRGBA_D = {};

                TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0);
                TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0);
                TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0);
                TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0);

                TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4);

                TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8);

                TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12);
                TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12);
                TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12);
                TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12);

                // NOTE(review): test, asd, pp and test2 are never read again;
                // they look like debugging leftovers.
                uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0];
                float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
                float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
                                           vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0])));
                uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0];

                // Bilinear blend, currently compiled out.
#if 0
                float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
                                                              vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))),
                                                    vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])),
                                                              vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0]))));
                float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])),
                                                              vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))),
                                                    vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])),
                                                              vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1]))));
                float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])),
                                                              vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))),
                                                    vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])),
                                                              vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2]))));
                float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])),
                                                              vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))),
                                                    vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])),
                                                              vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3]))));
#endif

                // Active path: take the channels from TexRGBA_A without
                // interpolating.
                float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
                float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]);
                float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]);
                float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]);

                // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4),
                // _mm_mul_ps(TexBothYInv, TexBRx4)),
                // _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4),
                // _mm_mul_ps(TexBoth, TexDRx4)));

                // Scale alpha by the layer opacity (LayerOpacity is expected
                // to hold 1 - opacity, as in the x86 paths).
                PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity));

                // Repack the channels: A<<24 | R<<16 | G<<8 | B.
                uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16),
                                                        vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)),
                                              (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8),
                                                         vcvtq_u32_f32(PixelBlendB))));

                // Keep lane 0 only.
                // NOTE(review): the store still writes four pixels, so the
                // three pixels after Pixel are overwritten with 0 until later
                // iterations rewrite them -- confirm this is intended.
                uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0};
                uint32x4_t mask = vld1q_u32(ma);
                Output = vandq_u32(Output, mask);
                vst1q_u32(Pixel, Output);
            }
            Pixel++;
            PixelX = vaddq_f32(PixelX, One);
        }
        Row += BufferPitch;
    }
}
#else

// Renders one layer into Buffer with AVX2, limited to the intersection of the
// layer's ClipRect and RenderRegion. Eight pixels (4 wide by 2 high) are
// processed per inner iteration.
internal void
AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion );

    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
LayerBounds.Min.x -= LayerBounds.Min.x % 4; // snap the start down to a multiple of 4
    LayerBounds.Min.y -= LayerBounds.Min.y % 4;

    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    // Broadcast the per-layer constants to all eight lanes.
    __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX);
    __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY);
    __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX);
    __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
    __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
    __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
    __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
    __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
    __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
    __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
    __m256 OriginX = _mm256_set1_ps(T.OriginX);
    __m256 OriginY = _mm256_set1_ps(T.OriginY);

    __m256 One = _mm256_set1_ps(1);
    __m256 Zero = _mm256_set1_ps(0);
    __m256i Zeroi = _mm256_set1_epi32(0);
    __m256i Onei = _mm256_set1_epi32(1);
    __m256 Four = _mm256_set1_ps(4);
    __m256 Sixteen = _mm256_set1_ps(16);
    __m256i FF = _mm256_set1_epi32(0xFF);
    __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
    __m256i Fouri = _mm256_set1_epi32(4);
    __m256i Sixteeni = _mm256_set1_epi32(16);
    __m256 Reg255 = _mm256_set1_ps(255.0f);
    __m256i Int255 = _mm256_set1_epi32(255);
    __m256 Norm255 = _mm256_set1_ps(1/255.0f);

    // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
    // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);

    // NOTE(fox): Each loop operates on 8 pixels, 4 horizontal by 2 vertical,
    // as per the bitmap packing scheme in memory.
    // NOTE(review): nothing masks lanes at or beyond LayerBounds.Max; only the
    // UV containment test below limits which pixels are written.
    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
    {
        // Lanes 0-3 cover row Y, lanes 4-7 cover row Y+1, same four x values.
        __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
                                       (real32)LayerBounds.Min.x+1,
                                       (real32)LayerBounds.Min.x+2,
                                       (real32)LayerBounds.Min.x+3,
                                       (real32)LayerBounds.Min.x,
                                       (real32)LayerBounds.Min.x+1,
                                       (real32)LayerBounds.Min.x+2,
                                       (real32)LayerBounds.Min.x+3);
        __m256 PixelY = _mm256_setr_ps((real32)Y, (real32)Y, (real32)Y, (real32)Y,
                                       (real32)Y+1, (real32)Y+1, (real32)Y+1, (real32)Y+1);
        __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);

        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
        {
            IACA_START;
            __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);

            // Destination index in the 4x4-packed buffer: each 4x4 block holds
            // 16 consecutive pixels, a row of blocks spans FullWidth*4 pixels,
            // and each row inside a block is 4 pixels long. Rows Y and Y+1 of
            // one block are therefore 8 consecutive pixels.
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;

            // Project the pixel offset onto the layer axes to get U/V.
            __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
            __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));

            // Compare predicates: 13 is _CMP_GE_OS and 2 is _CMP_LE_OS, so a
            // lane is all ones when 0 <= U <= 1 and 0 <= V <= 1.
            __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
                                                                  _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));

            // If all of the pixels are zeroed in the mask (aka fall outside
            // the UV lookup), we can skip the iteration.
            if (_mm256_movemask_epi8(LayerMask))
            {
                // Clamp so lanes outside the layer still produce valid texel
                // addresses; their results are dropped by the masked store.
                U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
                V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);

                __m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
                __m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
                __m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
                __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
                __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
                __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
                // NOTE(fox): The comparison is for when we're on the last pixel.

                // Fractional parts and the four bilinear weights.
                __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
                __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
                __m256 TexXInv = _mm256_sub_ps(One, TexX);
                __m256 TexYInv = _mm256_sub_ps(One, TexY);
                __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY);    // weight of bottom-left
                __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv);    // weight of top-right
                __m256 TexBoth = _mm256_mul_ps(TexY, TexX);           // weight of bottom-right
                __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv);  // weight of top-left

                // Same 4x4-packed indexing as the destination, applied to the
                // texel coordinates (these shadow the scalar XLookup/YLookup).
                __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
                                                   _mm256_and_si256(TexXInt, BottomTwoBits));
                __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
                                                   _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
                __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
                                                          _mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
                __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
                                                          _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));

                __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
                __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
                __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne);
                __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne);

                // The big feature of AVX2: gathering.
                __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4);
                __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4);
                __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
                __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);

                // Unpack the channels: R in bits 0-7, G 8-15, B 16-23, A 24-31.
                __m256i R_TexTL = _mm256_and_si256( PixelsTL, FF);
                __m256i G_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF);
                __m256i B_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF);
                __m256i A_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF);

                __m256i R_TexTR = _mm256_and_si256( PixelsTR, FF);
                __m256i G_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF);
                __m256i B_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF);
                __m256i A_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF);

                __m256i R_TexBL = _mm256_and_si256( PixelsBL, FF);
                __m256i G_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF);
                __m256i B_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF);
                __m256i A_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF);

                __m256i R_TexBR = _mm256_and_si256( PixelsBR, FF);
                __m256i G_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF);
                __m256i B_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF);
                __m256i A_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF);

                // Weighted sum of the four samples, per channel.
                __m256 R_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(R_TexTL)),
                                                                  _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(R_TexTR))),
                                                    _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(R_TexBL)),
                                                                  _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(R_TexBR))));
                __m256 G_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(G_TexTL)),
                                                                  _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(G_TexTR))),
                                                    _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(G_TexBL)),
                                                                  _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(G_TexBR))));
                __m256 B_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(B_TexTL)),
                                                                  _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(B_TexTR))),
                                                    _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(B_TexBL)),
                                                                  _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(B_TexBR))));
                __m256 A_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(A_TexTL)),
                                                                  _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(A_TexTR))),
                                                    _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(A_TexBL)),
                                                                  _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(A_TexBR))));

                // Scale alpha by the layer opacity (T.LayerOpacity holds
                // 1 - opacity).
                A_PixelBlend = _mm256_sub_ps(A_PixelBlend, _mm256_mul_ps(A_PixelBlend, LayerOpacity));

                __m256i R_Out, G_Out, B_Out, A_Out;

                // Only do alpha blending if a pixel's value doesn't equal 255
                // (alpha - 255 is non-zero in some lane; all eight lanes then
                // take the blend path together).
                if (_mm256_movemask_epi8(_mm256_sub_epi32(_mm256_cvtps_epi32(A_PixelBlend), Int255)))
                {
                    __m256 LayerAlpha = _mm256_mul_ps(A_PixelBlend, Norm255);
                    __m256 LayerAlphaInv = _mm256_mul_ps(_mm256_sub_ps(Reg255, A_PixelBlend), Norm255);

                    __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
                    __m256i R_Dest = _mm256_and_si256( DestPixel, FF);
                    __m256i G_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF);
                    __m256i B_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF);
                    __m256i A_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF);

                    // Color: dest*(1 - a) + src*a. Alpha: min(dest + src, 255).
                    R_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm256_mul_ps(R_PixelBlend, LayerAlpha)));
                    G_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm256_mul_ps(G_PixelBlend, LayerAlpha)));
                    B_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm256_mul_ps(B_PixelBlend, LayerAlpha)));
                    A_Out = _mm256_cvtps_epi32(_mm256_min_ps(_mm256_add_ps(_mm256_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
                }
                else
                {
                    R_Out = _mm256_cvtps_epi32(R_PixelBlend);
                    G_Out = _mm256_cvtps_epi32(G_PixelBlend);
                    B_Out = _mm256_cvtps_epi32(B_PixelBlend);
                    A_Out = _mm256_cvtps_epi32(A_PixelBlend);
                }

                // Repack R | G<<8 | B<<16 | A<<24.
                __m256i OutputPixel = _mm256_or_si256(
                                                      _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                                      _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));

                // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
                // Write only the lanes that passed the UV test.
                _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
            }
            PixelX = _mm256_add_ps(PixelX, Four);
        }
    }
}

// Same algorithm as AVX2_RenderLayer, four pixels (one row of a 4x4 block)
// per inner iteration, with scalar texel fetches in place of a gather.
internal void
SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion );

    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
    LayerBounds.Min.y -= LayerBounds.Min.y % 4;

    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    // Broadcast the per-layer constants to all four lanes.
    __m128 XAxisPX = _mm_set1_ps(T.XAxisPX);
    __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
    __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
    __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);
    __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
    __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1);
    __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4);
    __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
    __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
    __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity);
    __m128 OriginX = _mm_set1_ps(T.OriginX);
    __m128 OriginY = _mm_set1_ps(T.OriginY);

    __m128 One = _mm_set1_ps(1);
    __m128 Zero = _mm_set1_ps(0);
    __m128i Zeroi = _mm_set1_epi32(0);
    __m128i Onei = _mm_set1_epi32(1);
    __m128 Four = _mm_set1_ps(4);
    __m128 Sixteen = _mm_set1_ps(16);
    __m128i FF = _mm_set1_epi32(0xFF);
    __m128i BottomTwoBits = _mm_set1_epi32(0x03);
    __m128i Fouri = _mm_set1_epi32(4);
    __m128i Sixteeni = _mm_set1_epi32(16);
    __m128 Reg255 = _mm_set1_ps(255.0f);
    __m128i Int255 = _mm_set1_epi32(255);
    __m128 Norm255 = _mm_set1_ps(1/255.0f);

    // NOTE(fox):
// Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
    {
        __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x,
                                    (real32)LayerBounds.Min.x+1,
                                    (real32)LayerBounds.Min.x+2,
                                    (real32)LayerBounds.Min.x+3);
        __m128 PixelY = _mm_set1_ps((real32)Y);
        __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);

        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
        {
            IACA_START;
            __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);

            // Destination index in the 4x4-packed buffer: 16 pixels per
            // block, FullWidth*4 pixels per row of blocks, 4 pixels per row
            // inside a block.
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;

            // Project the pixel offset onto the layer axes to get U/V.
            __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
            __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));

            // A lane is all ones when 0 <= U <= 1 and 0 <= V <= 1.
            __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
                                                            _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));

            // Skip the iteration when no lane falls inside the layer.
            if (_mm_movemask_epi8(LayerMask))
            {
                // Clamp so lanes outside the layer still produce valid texel
                // addresses; their results are dropped by the masked store.
                U = _mm_max_ps(_mm_min_ps(One, U), Zero);
                V = _mm_max_ps(_mm_min_ps(One, V), Zero);

                __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
                __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
                // The +1 neighbour is only taken while below the last
                // column/row, so the sample never steps past the layer edge.
                __m128i TexXInt = _mm_cvttps_epi32(TexXFull);
                __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei));
                __m128i TexYInt = _mm_cvttps_epi32(TexYFull);
                __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei));

                // Fractional parts and the four bilinear weights.
                __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt));
                __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
                __m128 TexXInv = _mm_sub_ps(One, TexX);
                __m128 TexYInv = _mm_sub_ps(One, TexY);
                __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY);    // weight of bottom-left
                __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv);    // weight of top-right
                __m128 TexBoth = _mm_mul_ps(TexY, TexX);           // weight of bottom-right
                __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv);  // weight of top-left

                // NOTE(review): _mm_mullo_epi32 is an SSE4.1 instruction, so
                // this path needs more than plain SSE2 despite its name.
                __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni),
                                                _mm_and_si128(TexXInt, BottomTwoBits));
                __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
                                                _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri));
                __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
                                                       _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
                __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
                                                       _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri));

                __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup);
                __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup);
                __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne);
                __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne);

                // SSE lacks gathering, so we have no choice but to manually
                // look up each pixel's four bilinear samples in scalar.
                // Lane 0:
                uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL);
                uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
                uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
                uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
                uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4);
                uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
                uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
                uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);

                // Lane 1 (byte-shift the register right by 4 to bring it to lane 0):
                uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4));
                uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
                uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
                uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
                uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
                uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
                uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
                uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);

                // Lane 2:
                uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8));
                uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
                uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
                uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
                uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
                uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
                uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
                uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);

                // Lane 3:
                uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12));
                uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
                uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
                uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
                uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
                uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
                uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
                uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);

                // Back into vectors, one register per bilinear corner.
                __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3);
                __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
                __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
                __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);

                // Unpack the channels: R in bits 0-7, G 8-15, B 16-23, A 24-31.
                __m128i R_TexTL = _mm_and_si128( PixelsTL, FF);
                __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF);
                __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF);
                __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF);

                __m128i R_TexTR = _mm_and_si128( PixelsTR, FF);
                __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF);
                __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF);
                __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF);

                __m128i R_TexBL = _mm_and_si128( PixelsBL, FF);
                __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF);
                __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF);
                __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF);

                __m128i R_TexBR = _mm_and_si128( PixelsBR, FF);
                __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF);
                __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF);
                __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF);

                // Weighted sum of the four samples, per channel.
                __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)),
                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))),
                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)),
                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR))));
                __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)),
                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))),
                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)),
                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR))));
                __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)),
                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))),
                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)),
                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR))));
                __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)),
                                                            _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))),
                                                 _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)),
                                                            _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR))));

                // Scale alpha by the layer opacity (T.LayerOpacity holds
                // 1 - opacity).
                A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity));

                __m128i R_Out, G_Out, B_Out, A_Out;

                // Only do alpha blending if a pixel's value doesn't equal 255
                // (alpha - 255 is non-zero in some lane; all four lanes then
                // take the blend path together).
                if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255)))
                {
                    __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255);
                    __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255);

                    __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
                    __m128i R_Dest = _mm_and_si128( DestPixel, FF);
                    __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF);
                    __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF);
                    __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF);

                    // Color: dest*(1 - a) + src*a. Alpha: min(dest + src, 255).
                    R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha)));
                    G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha)));
                    B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha)));
                    A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
                }
                else
                {
                    R_Out = _mm_cvtps_epi32(R_PixelBlend);
                    G_Out = _mm_cvtps_epi32(G_PixelBlend);
                    B_Out = _mm_cvtps_epi32(B_PixelBlend);
                    A_Out = _mm_cvtps_epi32(A_PixelBlend);
                }

                // Repack R | G<<8 | B<<16 | A<<24.
                __m128i OutputPixel = _mm_or_si128(
                                                   _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
                                                   _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));

                // Byte-masked store: only lanes that passed the UV test are
                // written.
                _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel);
            }
            PixelX = _mm_add_ps(PixelX, Four);
        }
    }
}
#endif

internal void
Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
{
    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);

    Assert(LayerBounds.Max.x <= Buffer->Width);
    Assert(LayerBounds.Max.y <= Buffer->Height);

    uint8 *Row = ((uint8 *)Buffer->OriginalBuffer + Buffer->Pitch*(int16)(LayerBounds.Min.y) );
    uint32 Channel = (T.LayerWidth * T.LayerHeight);

    // uint32 pp1 = 2;
    // uint32 pp2 = 3;
    // bool32 real = true;

    for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
    {
        real32 StartVectorY = (real32)Y - T.OriginY;

        for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
        {
            IACA_START;
            real32 StartVectorX = X - T.OriginX;
            real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
            real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY *
T.YAxisPY); if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { real32 TexXFull = U * T.LayerWidth; uint32 TexXInt = (uint32)TexXFull; real32 TexX = TexXFull - TexXInt; real32 TexYFull = V * T.LayerHeight; uint32 TexYInt = (uint32)TexYFull; real32 TexY = TexYFull - TexYInt; real32 TexXInv = 1 - TexX; real32 TexYInv = 1 - TexY; real32 TexBothXInv = TexXInv * TexY; real32 TexBothYInv = TexX * TexYInv; real32 TexBoth = TexY * TexX; real32 TexBothInv = TexXInv * TexYInv; #if 0 uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel); uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel); uint32 PixelA = *(uint32 *)TexPTR0; uint32 PixelB = *((uint32 *)TexPTR0 + 1); uint32 PixelC = *(uint32 *)TexPTR1; uint32 PixelD = *((uint32 *)TexPTR1 + 1); #else uint32 XLookup, YLookup, PixelToSeek; // TODO(fox): Anti-aliasing on edges uint16 LX = TexXInt; uint16 LY = TexYInt; uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1); uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1); // TODO(fox): Be careful with the BytesPerPixel here! It's the // buffer's, not the layer's (currently everything is 4 bytes // per pixel). 
XLookup = (LX >> 2)*16 + (LX % 4); YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); XLookup = (LX >> 2)*16 + (LX % 4); YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); #endif uint8 TexRA = (PixelA & 0xFF); uint8 TexRB = (PixelB & 0xFF); uint8 TexRC = (PixelC & 0xFF); uint8 TexRD = (PixelD & 0xFF); uint8 TexGA = ((PixelA >> 8) & 0xFF); uint8 TexGB = ((PixelB >> 8) & 0xFF); uint8 TexGC = ((PixelC >> 8) & 0xFF); uint8 TexGD = ((PixelD >> 8) & 0xFF); uint8 TexBA = ((PixelA >> 16) & 0xFF); uint8 TexBB = ((PixelB >> 16) & 0xFF); uint8 TexBC = ((PixelC >> 16) & 0xFF); uint8 TexBD = ((PixelD >> 16) & 0xFF); uint8 TexAA = ((PixelA >> 24) & 0xFF); uint8 TexAB = ((PixelB >> 24) & 0xFF); uint8 TexAC = ((PixelC >> 24) & 0xFF); uint8 TexAD = ((PixelD >> 24) & 0xFF); real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB) + (TexBothXInv * TexRC) + (TexBoth * TexRD); real32 PixelBlendG = (TexBothInv * TexGA) + (TexBothYInv * TexGB) + (TexBothXInv * TexGC) + (TexBoth * TexGD); real32 PixelBlendB = (TexBothInv * TexBA) + (TexBothYInv * TexBB) + (TexBothXInv * TexBC) + (TexBoth * TexBD); real32 PixelBlendA = (TexBothInv * TexAA) + (TexBothYInv * TexAB) + (TexBothXInv * TexAC) + (TexBoth * TexAD); PixelBlendA = PixelBlendA - (PixelBlendA * T.LayerOpacity); uint8 R = 
(uint8)PixelBlendR; uint8 G = (uint8)PixelBlendG; uint8 B = (uint8)PixelBlendB; uint8 A = (uint8)PixelBlendA; XLookup = (X >> 2)*16 + (X % 4); YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; // if (real) { // real = false; // printf("XLook: %i, YLook: %i\n", XLookup, YLookup); // printf("X: %i, Y: %i\n", X, Y); // } PixelToSeek = XLookup + YLookup; uint32 *Pixel = (uint32 *)((uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel); uint8 R1 = (*Pixel >> 0); uint8 G1 = (*Pixel >> 8); uint8 B1 = (*Pixel >> 16); uint8 A1 = (*Pixel >> 24); if (A != 255) { real32 LayerAlpha = (255 - A) / 255.0f; R = (R1 * LayerAlpha) - (R * LayerAlpha) + R; G = (G1 * LayerAlpha) - (G * LayerAlpha) + G; B = (B1 * LayerAlpha) - (B * LayerAlpha) + B; A = ClipAdd(A1, A); } *Pixel = ((A << 24) | (B << 16) | (G << 8) | (R << 0)); } } } }