序言
很早之前就看HDRP裡面ColorPyramid的生成方式不爽了:明明有現成的ColorPyramid.compute放著不用,卻還是用Pixel Shader生成。DrawCall數量多,效率也不如Compute的方式。
這篇文章主要解析ColorPyramid.compute怎麼用LDS最佳化Gaussian Blur,以及不同方式Copy mip 0的效能對比。
Color Pyramid
ColorPyramid主要用來模擬毛玻璃折射(Refraction)效果,物體表面越粗糙,折射越模糊。(並不物理正確)
HDRP中會根據物體表面的PerceptualRoughness,透過一個經驗公式對映出計算透射時取樣ColorPyramid所用的Mip Level。
// Excerpt of GetPreLightData ("..." marks code omitted from the listing).
// The part shown fills the refraction fields of preLightData when HAS_REFRACTION is defined:
// the refracted ray, the refracted position, the transmittance and the color pyramid mip to sample.
PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData bsdfData)
{
...
// refraction (forward only)
#if HAS_REFRACTION
RefractionModelResult refraction = REFRACTION_MODEL(V, posInput, bsdfData);
preLightData.transparentRefractV = refraction.rayWS;
preLightData.transparentPositionWS = refraction.positionWS;
// Transmittance falls off exponentially with absorptionCoefficient * refraction distance.
preLightData.transparentTransmittance = exp(-bsdfData.absorptionCoefficient * refraction.dist);
// Empirical remap to try to match a bit the refraction probe blurring for the fallback
// Use IblPerceptualRoughness so we can handle approx of clear coat.
// Mip level = roughness^1.3 scaled by (_ColorPyramidLodCount - 1), with that factor floored at 0.
preLightData.transparentSSMipLevel = PositivePow(preLightData.iblPerceptualRoughness, 1.3) * uint(max(_ColorPyramidLodCount - 1, 0));
#endif
}
透過trilinear三線性過濾,就能夠在不同Mip之間做插值。
// Excerpt of the screen-space refraction evaluation ("..." marks code omitted from the listing;
// the #if that pairs with the #else below is inside the omitted part).
// The part shown samples the color pyramid at the mip chosen in GetPreLightData and turns it
// into the transmitted specular lighting term.
IndirectLighting EvaluateBSDF_ScreenspaceRefraction(LightLoopContext lightLoopContext,
float3 V, PositionInputs posInput,
PreLightData preLightData, BSDFData bsdfData,
EnvLightData envLightData,
inout float hierarchyWeight)
{
...
float2 samplingPositionNDC = lerp(posInput.positionNDC, hit.positionNDC, refractionOffsetMultiplier);
// Trilinear filtering: a fractional transparentSSMipLevel blends between two pyramid mips.
float3 preLD = SAMPLE_TEXTURE2D_X_LOD(_ColorPyramidTexture, s_trilinear_clamp_sampler, samplingPositionNDC * _RTHandleScaleHistory.xy, preLightData.transparentSSMipLevel).rgb;
// Offset by half a texel to properly interpolate between this pixel and its mips
// Inverse pre-exposure
preLD *= GetInverseCurrentExposureMultiplier();
// We use specularFGD as an approximation of the fresnel effect (that also handle smoothness)
float3 F = preLightData.specularFGD;
// (1 - F) is the fraction of light that is not reflected, i.e. the transmitted part.
lighting.specularTransmitted = (1.0 - F) * preLD.rgb * preLightData.transparentTransmittance * weight;
UpdateLightingHierarchyWeights(hierarchyWeight, weight); // Shouldn't be needed, but safer in case we decide to change hierarchy priority
#else // HAS_REFRACTION
// No refraction, no need to go further
hierarchyWeight = 1.0;
#endif
return lighting;
}
一般在透明佇列渲染之前就需要生成mip level越高越模糊的Color Pyramid留著後續透明佇列模擬折射時取樣。
如果還有Distortion Pass的話,後處理之後還要再生成一次Color Pyramid。
HDRP生成Color Pyramid流程比較簡單,
首先申請兩張臨時RT:一張用於存放DownSample的結果,另一張用於存放Horizontal Blur的輸出,
接著讀取DownSample的RT,依序進行Horizontal Blur與Vertical Blur,最後輸出到ColorPyramid上面,
(Vertical Blur直接輸出到ColorPyramid當前Downsample Size所對應的mip中)
mip0不需要模糊直接複製即可。
//MipGenerator.cs
/// <summary>
/// Pixel-shader path (excerpt; "..." marks code omitted from the listing). Copies the source into
/// mip 0 of the pyramid, then issues three draws for every further mip: downsample, horizontal
/// blur and vertical blur.
/// </summary>
/// <param name="cmd">Command buffer the draws are recorded into.</param>
/// <param name="size">Not read in the visible part; presumably consumed by the omitted setup - confirm.</param>
/// <param name="source">Texture copied into mip 0.</param>
/// <param name="destination">Pyramid render texture; also read back as the downsample input.</param>
/// <returns>srcMipLevel + 1 once the loop ends; srcMipLevel is initialised in the omitted setup.</returns>
public int RenderColorGaussianPyramid(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
...
// Mip 0 is a plain copy of the source, no blur.
m_PropertyBlock.SetTexture(Blitter.BlitShaderIDs._BlitTexture, source);
m_PropertyBlock.SetVector(Blitter.BlitShaderIDs._BlitScaleBias, new Vector4(sourceScaleX, sourceScaleY, 0f, 0f));
m_PropertyBlock.SetFloat(Blitter.BlitShaderIDs._BlitMipLevel, 0f);
cmd.SetRenderTarget(destination, 0, CubemapFace.Unknown, -1);
cmd.SetViewport(new Rect(0, 0, srcMipWidth, srcMipHeight));
cmd.DrawProcedural(Matrix4x4.identity, Blitter.GetBlitMaterial(source.dimension), 0, MeshTopology.Triangles, 3, 1, m_PropertyBlock);
int finalTargetMipWidth = destination.width;
int finalTargetMipHeight = destination.height;
// Note: smaller mips are excluded as we don't need them and the gaussian compute works
// on 8x8 blocks
while (srcMipWidth >= 8 || srcMipHeight >= 8)
{
int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);
// Scale for downsample
float scaleX = ((float) srcMipWidth / finalTargetMipWidth);
float scaleY = ((float) srcMipHeight / finalTargetMipHeight);
// Draw 1/3: downsample mip srcMipLevel of the pyramid into the temporary downsample RT.
m_PropertyBlock.SetTexture(Blitter.BlitShaderIDs._BlitTexture, destination);
m_PropertyBlock.SetVector(Blitter.BlitShaderIDs._BlitScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
m_PropertyBlock.SetFloat(Blitter.BlitShaderIDs._BlitMipLevel, srcMipLevel);
cmd.SetRenderTarget(m_TempDownsamplePyramid0[rtIndex], 0, CubemapFace.Unknown, -1);
cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
cmd.DrawProcedural(Matrix4x4.identity, Blitter.GetBlitMaterial(source.dimension), 1, MeshTopology.Triangles, 3, 1, m_PropertyBlock);
...
// Blur horizontal
// Draw 2/3: _SrcUvLimits.zw = (1 / width, 0) steps one texel along X.
m_PropertyBlock.SetTexture(ShaderIDs._Source, m_TempDownsamplePyramid0[rtIndex]);
m_PropertyBlock.SetVector(ShaderIDs._SrcScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
m_PropertyBlock.SetVector(ShaderIDs._SrcUvLimits,
new Vector4((dstMipWidth - 0.5f) / blurSourceTextureWidth, (dstMipHeight - 0.5f) / blurSourceTextureHeight, 1.0f / blurSourceTextureWidth, 0f));
m_PropertyBlock.SetFloat(ShaderIDs._SourceMip, 0);
// The horizontal result goes to a second temporary RT.
cmd.SetRenderTarget(m_TempColorTargets[rtIndex], 0, CubemapFace.Unknown, -1);
cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
cmd.DrawProcedural(Matrix4x4.identity, m_ColorPyramidPSMat, rtIndex, MeshTopology.Triangles, 3, 1, m_PropertyBlock);
// Blur vertical
// Draw 3/3: _SrcUvLimits.zw = (0, 1 / height) steps one texel along Y.
m_PropertyBlock.SetTexture(ShaderIDs._Source, m_TempColorTargets[rtIndex]);
m_PropertyBlock.SetVector(ShaderIDs._SrcScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
m_PropertyBlock.SetVector(ShaderIDs._SrcUvLimits,
new Vector4((dstMipWidth - 0.5f) / blurSourceTextureWidth, (dstMipHeight - 0.5f) / blurSourceTextureHeight, 0f, 1.0f / blurSourceTextureHeight));
m_PropertyBlock.SetFloat(ShaderIDs._SourceMip, 0);
// The vertical result is written straight into pyramid mip srcMipLevel + 1, the mip that
// matches the downsampled size.
cmd.SetRenderTarget(destination, srcMipLevel + 1, CubemapFace.Unknown, -1);
cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
cmd.DrawProcedural(Matrix4x4.identity, m_ColorPyramidPSMat, rtIndex, MeshTopology.Triangles, 3, 1, m_PropertyBlock);
// Advance to the next mip: every tracked size is halved.
srcMipLevel++;
srcMipWidth = srcMipWidth >> 1;
srcMipHeight = srcMipHeight >> 1;
finalTargetMipWidth = finalTargetMipWidth >> 1;
finalTargetMipHeight = finalTargetMipHeight >> 1;
}
return srcMipLevel + 1;
}
Blur時透過_SrcUvLimits.zw控制取樣Offset的方向(Horizontal/Vertical)。
// Separable 9-tap Gaussian blur along one axis, evaluated with 5 bilinear fetches.
//   _SrcUvLimits.xy : largest UV the positive-side taps may reach.
//   _SrcUvLimits.zw : UV step of one texel along the blur axis,
//                     (1 / width, 0) for the horizontal pass, (0, 1 / height) for the vertical pass.
// Returns the blurred color of the pixel at input.texcoord.
half4 Frag(Varyings input) : SV_Target
{
    // Gaussian weights for 9 texel kernel from center textel to furthest texel. Keep in sync with ColorPyramid.compute
    const float gaussWeights[] = {0.27343750, 0.21875000, 0.10937500, 0.03125000, 0.00390625};

    // UV math is kept in 32-bit float. Where half is a real 16-bit type, its 10-bit mantissa
    // cannot place the sub-texel offsets below on render targets of typical size.
    const float2 offset = _SrcUvLimits.zw;

    // Two neighbouring taps are fetched at once: sampling between them at
    // w_far / (w_near + w_far) makes the linear sampler return their weighted average,
    // which is then scaled by (w_near + w_far) in the sum below.
    const float2 offset1 = offset * (1.0 + (gaussWeights[2] / (gaussWeights[1] + gaussWeights[2])));
    const float2 offset2 = offset * (3.0 + (gaussWeights[4] / (gaussWeights[3] + gaussWeights[4])));

    const float2 uv = input.texcoord.xy;
    // Only the positive side is clamped explicitly against _SrcUvLimits.xy; the negative side
    // is left to the clamp sampler.
    const float2 uv_m2 = uv - offset2;
    const float2 uv_m1 = uv - offset1;
    const float2 uv_p0 = uv;
    const float2 uv_p1 = min(_SrcUvLimits.xy, uv + offset1);
    const float2 uv_p2 = min(_SrcUvLimits.xy, uv + offset2);

    const float weightFar = gaussWeights[3] + gaussWeights[4];
    const float weightNear = gaussWeights[1] + gaussWeights[2];

    return SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_m2, _SourceMip) * weightFar
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_m1, _SourceMip) * weightNear
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p0, _SourceMip) * gaussWeights[0]
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p1, _SourceMip) * weightNear
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p2, _SourceMip) * weightFar;
}
Downsample
顯然上面除了Mip0之外一個Mip就要呼叫三次DrawProcedural(Downsample+Horizontal Blur+Vertical Blur)
其中的Horizontal Blur以及Vertical Blur可以在Compute Shader中利用LDS一次Dispatch完成,從而節省DrawCall的消耗。
首先還是得先Downsample,4個畫素Down Sample成一個,在第一次Downsample的時候可以順便把取樣的四個畫素複製到Color Pyramid Mip0中(不需要Blur)
管線呼叫
/// <summary>
/// Compute-shader path (excerpt; "..." marks code omitted from the listing). Each mip costs two
/// dispatches: a 2x2 downsample into a temporary RT, then a Gaussian blur that writes directly
/// into the matching mip of <paramref name="destination"/>. The first downsample runs with the
/// copyMip0 keyword enabled and has mip 0 of <paramref name="destination"/> bound as "_Mip0".
/// </summary>
/// <remarks>
/// Later downsamples read the previous temporary (unblurred) downsample through a ping-pong pair,
/// whereas the pixel-shader path downsamples from the already blurred pyramid mip.
/// NOTE(review): the first dispatch is sized from the halved resolution. With an odd source
/// width/height whose half is a multiple of 8, the dispatched threads may not reach the last
/// source column/row, so the mip 0 copy could miss it - confirm against the downsample kernel.
/// </remarks>
/// <param name="cmd">Command buffer the dispatches are recorded into.</param>
/// <param name="size">Not read in the visible part; presumably consumed by the omitted setup - confirm.</param>
/// <param name="source">Texture read by the first downsample.</param>
/// <param name="destination">Pyramid render texture; iteration N writes mip N + 1.</param>
/// <returns>srcMipLevel + 1 once the loop ends; srcMipLevel is initialised in the omitted setup.</returns>
public int RenderColorGaussianPyramidCS(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
...
    var cs = m_ColorPyramidCS;
    bool isFirstLoop = true;
    bool switchFlag = false;

    // Note: smaller mips are excluded as we don't need them and the gaussian compute works
    // on 8x8 blocks
    while (srcMipWidth >= 8 || srcMipHeight >= 8)
    {
        int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
        int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);

        // Both kernels are dispatched over the destination mip in 8x8 groups.
        int groupsX = HQUtils.DivRoundUp(dstMipWidth, 8);
        int groupsY = HQUtils.DivRoundUp(dstMipHeight, 8);

        RenderTargetIdentifier sourceRTI, destinationRTI;
        if (isFirstLoop)
        {
            sourceRTI = source;
            destinationRTI = m_TempDownsamplePyramid0[rtIndex];

            // First iteration only: enable the keyword and bind pyramid mip 0 as "_Mip0" for this dispatch.
            cmd.EnableKeyword(cs, this.copyMip0);
            cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, "_Mip0", destination, 0);
        }
        else
        {
            // Ping-pong between the two temporary downsample targets.
            if (switchFlag)
            {
                sourceRTI = m_TempDownsamplePyramid1[rtIndex];
                destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            }
            else
            {
                sourceRTI = m_TempDownsamplePyramid0[rtIndex];
                destinationRTI = m_TempDownsamplePyramid1[rtIndex];
            }

            switchFlag = !switchFlag;
        }

        // Downsample: _Size carries the size of the mip being read.
        this.size[0] = srcMipWidth;
        this.size[1] = srcMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Source, sourceRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Destination, destinationRTI);
        cmd.DispatchCompute(cs, m_ColorPyramidDownSampleKernel, groupsX, groupsY, 1);

        if (isFirstLoop)
        {
            cmd.DisableKeyword(cs, this.copyMip0);
            isFirstLoop = false;
        }

        // Blur: _Size now carries the size of the downsampled image; the result is written
        // straight into pyramid mip srcMipLevel + 1.
        this.size[0] = dstMipWidth;
        this.size[1] = dstMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Source, destinationRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Destination, destination, srcMipLevel + 1);
        cmd.DispatchCompute(cs, m_ColorPyramidGaussianKernel, groupsX, groupsY, 1);

        srcMipLevel++;
        srcMipWidth = srcMipWidth >> 1;
        srcMipHeight = srcMipHeight >> 1;
    }

    return srcMipLevel + 1;
}
DownSample Compute Shader
COPY_MIP_0的Keyword控制是否複製到Mip0中
// Downsample kernel resources.
// With COPY_MIP_0 the source is declared as a plain texture and an extra read-write texture
// (_Mip0) receives the texels that were read; otherwise the source is declared read-write.
#if COPY_MIP_0
TEXTURE2D(_Source);
RW_TEXTURE2D(float4, _Mip0);
#else
RW_TEXTURE2D(float4, _Source);
#endif
RW_TEXTURE2D(float4, _Destination);
SamplerState sampler_LinearClamp;

CBUFFER_START(cb)
float4 _Size; // x: src width, y: src height, zw: unused
CBUFFER_END

// 2x2 box downsample: each thread averages one quad of _Source into one texel of _Destination.
// Quad coordinates are clamped to the last valid source texel (_Size.xy - 1).
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_DOWNSAMPLE(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint2 dstCoord = dispatchThreadId.xy;
    const uint2 quadOrigin = dstCoord * 2u;
    const uint2 lastTexel = uint2(_Size.xy) - 1u;

    // Clamped coordinates of the four texels in the quad.
    const uint2 coord00 = min(quadOrigin, lastTexel);
    const uint2 coord10 = min(quadOrigin + uint2(1u, 0u), lastTexel);
    const uint2 coord11 = min(quadOrigin + uint2(1u, 1u), lastTexel);
    const uint2 coord01 = min(quadOrigin + uint2(0u, 1u), lastTexel);

    const float4 texel00 = _Source[coord00];
    const float4 texel10 = _Source[coord10];
    const float4 texel11 = _Source[coord11];
    const float4 texel01 = _Source[coord01];

#if COPY_MIP_0
    // Forward the unfiltered texels to _Mip0 at the same coordinates.
    _Mip0[coord00] = texel00;
    _Mip0[coord10] = texel10;
    _Mip0[coord11] = texel11;
    _Mip0[coord01] = texel01;
#endif

    const float4 quadSum = texel00 + texel01 + texel11 + texel10;
    _Destination[dstCoord] = quadSum * 0.25;
}
Downsample完之後就可以對DownSample的結果進行Blur處理
Gaussian Blur
Store Pixel Into LDS
這裡的threadUL的命名其實有點誤導,因為unity (0,0)是左下角。這裡應該是LL才對,但是影響不大。
可以看到這裡以threadUL為起點讀取了2×2共四個畫素:threadUL自身(0,0)、右邊(1,0)、斜上角(1,1)以及上方(0,1)。
讀取完之後將float32轉成16位,透過位運算將兩個畫素的r,g,b,a分別塞進gs_cacheR/gs_cacheG/gs_cacheB/gs_cacheA的LDS中。
// LDS cache for the blur: a 16x16 pixel tile whose 8x8 center is what gets blurred and written
// out. Every uint holds the same color channel of two pixels, packed as two 16-bit floats.
// Channels live in separate arrays to reduce bank conflicts in the local data memory
// controller: a large stride makes more threads collide on the same memory bank.
groupshared uint gs_cacheR[128];
groupshared uint gs_cacheG[128];
groupshared uint gs_cacheB[128];
groupshared uint gs_cacheA[128];

// Packs two pixels into one LDS entry: pixel1 goes to the low 16 bits of each channel,
// pixel2 to the high 16 bits.
void Store2Pixels(uint index, float4 pixel1, float4 pixel2)
{
    const uint4 lowHalves = f32tof16(pixel1);
    const uint4 highHalves = f32tof16(pixel2) << 16;
    const uint4 packed = lowHalves | highHalves;

    gs_cacheR[index] = packed.r;
    gs_cacheG[index] = packed.g;
    gs_cacheB[index] = packed.b;
    gs_cacheA[index] = packed.a;
}
// Load phase of the Gaussian kernel (excerpt; "..." marks code omitted from the listing).
// Every thread reads a 2x2 quad of _Source and stores it, packed, into LDS.
// KERNEL_SIZE = 8
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
// Upper-left pixel coordinate of quad that this thread will read
// groupId << 3 is groupId * 8, the first pixel of this group's 8x8 output tile.
// groupThreadId << 1 is groupThreadId * 2, one 2x2 quad per thread.
// The -4 adds 4 pixels of padding on each side, so a group reads an 8 + 4 + 4 = 16 pixel wide tile.
// 16 * 16 pixels at two pixels per entry = 128 entries, the size of each gs_cache array.
int2 threadUL = (groupThreadId << 1) + (groupId << 3) - 4;
// Border handling: negative coordinates are clamped to 0. For groupId 0 on an axis,
// groupThreadId 0, 1 and 2 therefore all start reading at texel 0 on that axis.
uint2 uthreadUL = uint2(max(0, threadUL));
// Last valid texel; the min() below keeps reads inside the source.
uint2 size = uint2(_Size.xy) - 1u;
float4 p00 = _Source[(min(uthreadUL + uint2(0u, 0u), size))];
float4 p10 = _Source[(min(uthreadUL + uint2(1u, 0u), size))];
float4 p11 = _Source[(min(uthreadUL + uint2(1u, 1u), size))];
float4 p01 = _Source[(min(uthreadUL + uint2(0u, 1u), size))];
// Store the 4 downsampled pixels in LDS
// Each groupThreadId.y owns 16 consecutive entries: the first 8 hold the quad's y + 0 row
// (p00, p10), the next 8 hold its y + 1 row (p01, p11).
uint destIdx = groupThreadId.x + (groupThreadId.y << 4u);
Store2Pixels(destIdx, p00, p10);
Store2Pixels(destIdx + 8u, p01, p11);
// Wait until every thread of the group has finished writing LDS.
GroupMemoryBarrierWithGroupSync();
...
}
LDS對映關係(寫得太抽象了,建議自己理一遍Index)
上面的LDS對映關係(destIdx):
2*2的取樣Quad中
p00, p10(offsetY=0)儲存在destIdx中,而p01, p11(offsetY=1)儲存在destIdx+8的位置中。
也就是說在LDS中(以下以groupId=(0,0)為例,y為Source中的畫素行;每個Group讀取的Tile為16*16,中間的8*8才是輸出區域):
groupThreadId.y=0:(邊界畫素,threadUL.y=-4,被max(0, threadUL)限制為0)
(destIdx=0,1....,7儲存y=0的畫素)
(destIdx=8,9....,15儲存y=1的畫素)
groupThreadId.y=1:(邊界畫素,threadUL.y=-2,同樣被限制為0)
(destIdx=16,....,23儲存y=0的畫素)
(destIdx=24,....,31儲存y=1的畫素)
groupThreadId.y=2:(threadUL.y=0)
(destIdx=32,....,39儲存y=0的畫素)
(destIdx=40,....,47儲存y=1的畫素)
groupThreadId.y=3:
(destIdx=48,....,55儲存y=2的畫素)
(destIdx=56,....,63儲存y=3的畫素)
groupThreadId.y=4:
(destIdx=64,....,71儲存y=4的畫素)
(destIdx=72,....,79儲存y=5的畫素)
...
groupThreadId.y=7:(threadUL.y=10;若超出RT範圍,min(uthreadUL + uint2(0u, 0u), size)會將其限制在RT的size之內)
(destIdx=112,....,119儲存y=10的畫素)
(destIdx=120,....,127儲存y=11的畫素)
對於groupThreadId.x來說也同樣有類似的邊界限制。
groupThreadId.x=0/1/2依舊以Tile中的(0,0)畫素作為左下角進行取樣
BlurHorizontally
水平方向上的模糊計算,讀取LDS上的畫素,計算高斯模糊的結果,然後把Blur結果存在Gaussian Kernel中心點(e)對應LDS中Index位置上[outIndex]。
leftMostIndex為讀取LDS的起始Index。
一共讀取5次LDS,獲得10個畫素,以9個畫素計算一次Blur。
ex:
groupThreadId.y=0
groupThreadId.x:0~3 BlurHorizontally tile.y=0的畫素 儲存Blur的結果在LDS中,Index範圍為(0~7)
groupThreadId.x:4~7 BlurHorizontally tile.y=1的畫素 Index範圍為(8~15)
以此類推
// 9-tap Gaussian approximation. The weights are the binomial coefficients
// (1, 8, 28, 56, 70, 56, 28, 8, 1) / 256, so they sum to exactly 1. 'e' is the center tap;
// the kernel is symmetric, so mirrored taps share a weight.
float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
{
    float4 blurred = 0.27343750 * (e);   // 70 / 256
    blurred += 0.21875000 * (d + f);     // 56 / 256
    blurred += 0.10937500 * (c + g);     // 28 / 256
    blurred += 0.03125000 * (b + h);     //  8 / 256
    blurred += 0.00390625 * (a + i);     //  1 / 256
    return blurred;
}
// Reads one packed LDS entry and expands it back into two pixels:
// pixel1 comes from the low 16 bits of each channel, pixel2 from the high 16 bits.
void Load2Pixels(uint index, out float4 pixel1, out float4 pixel2)
{
    const uint4 packed = uint4(gs_cacheR[index], gs_cacheG[index], gs_cacheB[index], gs_cacheA[index]);

    // f16tof32 converts the half stored in the low 16 bits of each component.
    pixel1 = f16tof32(packed);
    pixel2 = f16tof32(packed >> 16);
}
// Writes a single pixel to LDS at full 32-bit precision: each channel's bit pattern is stored
// as-is in the matching cache array (no half packing).
void Store1Pixel(uint index, float4 pixel)
{
    const uint4 bits = asuint(pixel);

    gs_cacheR[index] = bits.r;
    gs_cacheG[index] = bits.g;
    gs_cacheB[index] = bits.b;
    gs_cacheA[index] = bits.a;
}
// Blurs two horizontally adjacent pixels per call, which reduces LDS reads and unpacking.
// Five packed entries starting at leftMostIndex give ten consecutive pixels: the first nine
// feed the result stored at outIndex, the last nine feed the result stored at outIndex + 1.
// NOTE(review): results are written back into the same LDS arrays that neighbouring threads
// read from, with no barrier between the loads and the stores. This looks like it relies on
// those threads executing in lockstep - confirm.
void BlurHorizontally(uint outIndex, uint leftMostIndex)
{
    float4 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

    // All loads happen before any store.
    Load2Pixels(leftMostIndex, t0, t1);
    Load2Pixels(leftMostIndex + 1, t2, t3);
    Load2Pixels(leftMostIndex + 2, t4, t5);
    Load2Pixels(leftMostIndex + 3, t6, t7);
    Load2Pixels(leftMostIndex + 4, t8, t9);

    const float4 blurredFirst = BlurPixels(t0, t1, t2, t3, t4, t5, t6, t7, t8);
    const float4 blurredSecond = BlurPixels(t1, t2, t3, t4, t5, t6, t7, t8, t9);

    Store1Pixel(outIndex, blurredFirst);
    Store1Pixel(outIndex + 1, blurredSecond);
}
// Horizontal phase of the Gaussian kernel (excerpt; "..." marks code omitted from the listing).
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
...
// Horizontally blur the pixels in LDS
// row is groupThreadId.y * 16, the first of the 16 LDS entries belonging to this thread row.
uint row = groupThreadId.y << 4u;
// Output: two results per thread at row + 2x and row + 2x + 1.
// Input: threads 0-3 start reading at row + x (entries row .. row + 7); for threads 4-7 the
// "& 4u" term adds 4, so they start at row + x + 4 (entries row + 8 .. row + 15).
BlurHorizontally(row + (groupThreadId.x << 1u), row + groupThreadId.x + (groupThreadId.x & 4u));
// Wait for the whole group before LDS is read again.
GroupMemoryBarrierWithGroupSync();
...
}
BlurVertically
豎直方向上Blur就用剛剛水平方向上Blur完的LDS繼續模糊即可。
以s4的畫素作為中心點進行模糊,並把模糊的結果寫入到畫素對應的位置上。
// 9-tap Gaussian approximation with binomial weights (1, 8, 28, 56, 70, 56, 28, 8, 1) / 256.
// 'e' is the center tap; mirrored taps share a weight.
float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
{
    float4 blurred = 0.27343750 * (e);   // 70 / 256
    blurred += 0.21875000 * (d + f);     // 56 / 256
    blurred += 0.10937500 * (c + g);     // 28 / 256
    blurred += 0.03125000 * (b + h);     //  8 / 256
    blurred += 0.00390625 * (a + i);     //  1 / 256
    return blurred;
}
// Reads back a pixel written by Store1Pixel: each channel's 32-bit pattern is fetched from its
// cache array and reinterpreted as a float.
void Load1Pixel(uint index, out float4 pixel)
{
    uint4 bits;
    bits.r = gs_cacheR[index];
    bits.g = gs_cacheG[index];
    bits.b = gs_cacheB[index];
    bits.a = gs_cacheA[index];

    pixel = asfloat(bits);
}
// Vertical pass: gathers nine horizontally blurred pixels of one column from LDS, starting at
// topMostIndex and stepping 8 entries per row. The fifth tap (topMostIndex + 32) is the center.
// The blurred result is written to _Destination at pixelCoord.
void BlurVertically(uint2 pixelCoord, uint topMostIndex)
{
    const uint rowStride = 8u;
    float4 tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8;

    Load1Pixel(topMostIndex, tap0);
    Load1Pixel(topMostIndex + rowStride, tap1);
    Load1Pixel(topMostIndex + rowStride * 2u, tap2);
    Load1Pixel(topMostIndex + rowStride * 3u, tap3);
    Load1Pixel(topMostIndex + rowStride * 4u, tap4);
    Load1Pixel(topMostIndex + rowStride * 5u, tap5);
    Load1Pixel(topMostIndex + rowStride * 6u, tap6);
    Load1Pixel(topMostIndex + rowStride * 7u, tap7);
    Load1Pixel(topMostIndex + rowStride * 8u, tap8);

    // Write to the final target
    _Destination[pixelCoord] = BlurPixels(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
// Blur phases of the Gaussian kernel (excerpt; "..." marks code omitted from the listing):
// a horizontal pass inside LDS, a group barrier, then a vertical pass that writes _Destination.
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
...
// Horizontally blur the pixels in LDS
// row is groupThreadId.y * 16, the first of the 16 LDS entries belonging to this thread row.
uint row = groupThreadId.y << 4u;
BlurHorizontally(row + (groupThreadId.x << 1u), row + groupThreadId.x + (groupThreadId.x & 4u));
// Wait for the whole group before the vertical pass reads the horizontal results.
GroupMemoryBarrierWithGroupSync();
// Vertically blur the pixels in LDS and write the result to memory
// (groupThreadId.y << 3u) + groupThreadId.x is the LDS index of the top-most tap for this
// thread's column. BlurVertically reads nine entries from there in steps of 8, so the center
// tap is 32 entries further. The output pixel is dispatchThreadId.xy.
BlurVertically(dispatchThreadId.xy, (groupThreadId.y << 3u) + groupThreadId.x);
}
Copy Mip 0
在Copy Mip 0上面,除了可以用上面提到的第一次DownSample時複製的方式之外,還能夠用Cmd.CopyTexture的方式。
/// <summary>
/// Compute-shader path, CopyTexture variant (excerpt; "..." marks code omitted from the listing).
/// Mip 0 is filled with a single <c>cmd.CopyTexture</c>; every further mip then costs two
/// dispatches: a 2x2 downsample into a temporary RT and a Gaussian blur that writes directly
/// into the matching mip of <paramref name="destination"/>.
/// </summary>
/// <param name="cmd">Command buffer the copy and the dispatches are recorded into.</param>
/// <param name="size">Width and height of the region copied from <paramref name="source"/> into mip 0.</param>
/// <param name="source">Texture copied into mip 0 and read by the first downsample.</param>
/// <param name="destination">Pyramid render texture; iteration N writes mip N + 1.</param>
/// <returns>srcMipLevel + 1 once the loop ends; srcMipLevel is initialised in the omitted setup.</returns>
public int RenderColorGaussianPyramidCS(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
...
    // Copies src mip0 to dst mip0
    cmd.CopyTexture(source, 0, 0, 0, 0, size.x, size.y, destination, 0, 0, 0, 0);

    var cs = m_ColorPyramidCS;
    bool isFirstLoop = true;
    bool switchFlag = false;

    // Note: smaller mips are excluded as we don't need them and the gaussian compute works
    // on 8x8 blocks
    while (srcMipWidth >= 8 || srcMipHeight >= 8)
    {
        int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
        int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);

        // Both kernels are dispatched over the destination mip in 8x8 groups.
        int groupsX = HQUtils.DivRoundUp(dstMipWidth, 8);
        int groupsY = HQUtils.DivRoundUp(dstMipHeight, 8);

        // Downsample.
        RenderTargetIdentifier sourceRTI, destinationRTI;
        if (isFirstLoop)
        {
            // The first iteration reads the original source; later ones read a temporary RT.
            sourceRTI = source;
            destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            isFirstLoop = false;
        }
        else
        {
            // Ping-pong between the two temporary downsample targets.
            if (switchFlag)
            {
                sourceRTI = m_TempDownsamplePyramid1[rtIndex];
                destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            }
            else
            {
                sourceRTI = m_TempDownsamplePyramid0[rtIndex];
                destinationRTI = m_TempDownsamplePyramid1[rtIndex];
            }

            switchFlag = !switchFlag;
        }

        // _Size carries the size of the mip being read.
        this.size[0] = srcMipWidth;
        this.size[1] = srcMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Source, sourceRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Destination, destinationRTI);
        cmd.DispatchCompute(cs, m_ColorPyramidDownSampleKernel, groupsX, groupsY, 1);

        // Blur: _Size now carries the size of the downsampled image; the result is written
        // straight into pyramid mip srcMipLevel + 1.
        this.size[0] = dstMipWidth;
        this.size[1] = dstMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Source, destinationRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Destination, destination, srcMipLevel + 1);
        cmd.DispatchCompute(cs, m_ColorPyramidGaussianKernel, groupsX, groupsY, 1);

        srcMipLevel++;
        srcMipWidth = srcMipWidth >> 1;
        srcMipHeight = srcMipHeight >> 1;
    }

    return srcMipLevel + 1;
}
效能對比
上面是HDRP原來用PixelShader 生成ColorPyramid的耗時
下面是使用cmd.CopyTexture+Compute Shader Blur生成ColorPyramid的耗時
這個是不用cmd.CopyTexture生成ColorPyramid的耗時
測試用的顯示卡是RTX3080,一通操作下來減少DrawCall的呼叫最佳化了大概0.01ms左右,可以看到相比於不使用Cmd.CopyTexture的方式還能夠提升0.004ms左右。以我目前貧瘠的硬體知識,我猜測透過Cmd.CopyTexture的方式減輕了Compute pipeline的壓力,從而有所提升。