序言
看完上一節基本上HDRP的光照剔除資料的流程就寫完了,這一節主要是解析適用於透明佇列的Cluster光照剔除。
為了讓Cluster的劃分更加合理(讓有限數量的Cluster盡量均勻地覆蓋實際有物體的深度範圍),Cluster的光照剔除同樣藉助了PreDepth的深度圖。
RenderGraph Dispatch
這裡的buildPerVoxelLightListKernel有很多不同的變體,取決於:是否開啟Big Tile Prepass、是否讀取Depth(用來為每個Tile制定suggestedBase以劃分Cluster)、MSAA是否開啟,以及投影矩陣是否為Oblique。
這裡主要解析的Kernel是TileLightListGen_DepthRT_SrcBigTile。
// Tile size used by the clustered light list: 32, the same value the shader excerpts define as TILE_SIZE_CLUSTERED.
public static int s_TileSizeClustered = 32;
/// <summary>
/// Number of clustered tiles along the X axis for the given camera:
/// the screen width divided by the clustered tile size, rounded up via HDUtils.DivRoundUp.
/// </summary>
static int GetNumTileClusteredX(HDCamera hdCamera)
{
    int screenWidth = (int)hdCamera.screenSize.x;
    int tileSize = LightDefinitions.s_TileSizeClustered;
    return HDUtils.DivRoundUp(screenWidth, tileSize);
}
// Excerpt ("..." marks elided code): fills the clustered-lighting fields of passData —
// the compute shaders, the kernel variant to run, and the cluster tile counts.
unsafe void PrepareBuildGPULightListPassData(
RenderGraph renderGraph,
RenderGraphBuilder builder,
HDCamera hdCamera,
TileAndClusterData tileAndClusterData,
ref ShaderVariablesLightList constantBuffer,
int totalLightCount,
TextureHandle depthStencilBuffer,
TextureHandle stencilBufferCopy,
GBufferOutput gBuffer,
BuildGPULightListPassData passData)
{
...
// Cluster
bool msaa = hdCamera.msaaEnabled;
// The prepass source is BigTile when the BigTilePrepass frame setting is enabled, otherwise None.
var clustPrepassSourceIdx = hdCamera.frameSettings.IsEnabled(FrameSettingsField.BigTilePrepass) ? ClusterPrepassSource.BigTile : ClusterPrepassSource.None;
// The depth source defaults to NoDepth; when the cluster needs depth, the MSAA or regular depth variant is chosen.
var clustDepthSourceIdx = ClusterDepthSource.NoDepth;
if (tileAndClusterData.clusterNeedsDepth)
clustDepthSourceIdx = msaa ? ClusterDepthSource.MSAA_Depth : ClusterDepthSource.Depth;
passData.buildPerVoxelLightListShader = buildPerVoxelLightListShader;
passData.clearClusterAtomicIndexShader = clearClusterAtomicIndexShader;
// The kernel variant is looked up in a 2D table indexed by (prepass source, depth source); oblique projections
// use a separate table. Author's note: this resembles the way UE toggles shader macro permutations.
passData.buildPerVoxelLightListKernel = isProjectionOblique ? s_ClusterObliqueKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx] : s_ClusterKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx];
passData.numTilesClusterX = GetNumTileClusteredX(hdCamera);
passData.numTilesClusterY = GetNumTileClusteredY(hdCamera);
passData.clusterNeedsDepth = tileAndClusterData.clusterNeedsDepth;
...
}
/// <summary>
/// Records the compute commands that build the per-cluster (voxel) light lists:
/// clears the atomic allocation counter, binds the build kernel's buffers and
/// dispatches one thread group per clustered tile and per view.
/// Does nothing when <c>data.runLightList</c> is false.
/// </summary>
/// <param name="data">Pass data holding the shaders, kernel index, buffers and tile counts.</param>
/// <param name="cmd">Command buffer the work is recorded into.</param>
static void VoxelLightListGeneration(BuildGPULightListPassData data, CommandBuffer cmd)
{
    if (!data.runLightList)
    {
        return;
    }

    var clearShader = data.clearClusterAtomicIndexShader;
    var buildShader = data.buildPerVoxelLightListShader;
    var buildKernel = data.buildPerVoxelLightListKernel;

    // Clear the atomic offset index before the build kernel allocates from it.
    cmd.SetComputeBufferParam(clearShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
    cmd.DispatchCompute(clearShader, s_ClearVoxelAtomicKernel, 1, 1, 1);

    // NOTE(review): this binding uses the clear shader's kernel index on the build shader; the same buffer is
    // bound again below with the build kernel, so this call looks redundant — confirm before removing.
    cmd.SetComputeBufferParam(buildShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // Outputs of the build kernel plus the atomic allocation counter.
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vLayeredLightList, data.output.perVoxelLightLists);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_LayeredOffset, data.output.perVoxelOffset);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // The big-tile light list is only bound when the big-tile prepass ran.
    if (data.runBigTilePrepass)
    {
        cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);
    }

    // When the cluster needs depth, bind the depth texture and the per-tile log-base buffer.
    if (data.clusterNeedsDepth)
    {
        cmd.SetComputeTextureParam(buildShader, buildKernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
        cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_logBaseBuffer, data.output.perTileLogBaseTweak);
    }

    // Light bounds and volume data read by the kernel.
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    ConstantBuffer.Push(cmd, data.lightListCB, buildShader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per clustered tile (X, Y) and per view (Z).
    cmd.DispatchCompute(buildShader, buildKernel, data.numTilesClusterX, data.numTilesClusterY, data.viewCount);
}
Initialize
首先依舊是跟之前的TileLightListGen類似,計算當前執行緒的Tile的對映關係要用到的資料.(Tile的X/Y軸上的數量,當前執行緒組對應的TileID)
// Excerpt ("..." marks elided code): kernel setup — tile counts, this group's tile bounds,
// and reset of the group-shared counters.
#define TILE_SIZE_CLUSTERED (32)
// Author's note: when data.clusterNeedsDepth == true the kernel variant used is
// TileLightListGen_DepthRT_SrcBigTile (LIGHTLISTGEN=TileLightListGen_DepthRT_SrcBigTile ENABLE_DEPTH_TEXTURE_BACKPLANE),
// which amounts to having ENABLE_DEPTH_TEXTURE_BACKPLANE defined, as written out below.
#define ENABLE_DEPTH_TEXTURE_BACKPLANE
groupshared uint lightOffs;
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
groupshared uint ldsZMax;
#endif
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
// Group ID: xy selects the tile, z selects the eye/view.
uint eyeIndex = u3GroupID.z;
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
// firstbithigh(32) = 5, i.e. log2 of the tile size.
const uint log2TileSize = firstbithigh(TILE_SIZE_CLUSTERED);
uint nrTilesX = ((uint)g_screenSize.x +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.x,32)
uint nrTilesY = ((uint)g_screenSize.y +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.y,32)
// Screen space coordinates of clustered tile
// Minimum corner of this tile in screen coordinates ("LL" = lower-left).
uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
// Maximum corner of this tile in screen coordinates ("UR" = upper-right), clamped to the screen size.
uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
// Thread 0 resets the group-shared lightOffs and ldsZMax
// (author's note: as in TileLightListGen, the tile's max Z has to be computed).
if(t==0)
{
lightOffs = 0;
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
ldsZMax = 0;
#endif
}
// Barrier after the reset; compiled out when NR_THREADS <= PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
ldsZMax
透過遍歷Tile內的深度,得到linMaDist,最後再InterlockedMax Resolve得到ldsZMax(Tile內的Max Z)
#define TILE_SIZE_CLUSTERED (32)
#define VIEWPORT_SCALE_Z (1)
// Converts a depth-buffer value at a screen position into linear depth using the
// inverse screen projection of the given eye.
// Author's note: same approach as lightlistbuild.compute; the result is in [Near, Far].
//   pixXY        - screen-space pixel position (only used in oblique mode)
//   zDptBufSpace - depth-buffer value, 0 is near 1 is far
//   eyeIndex     - index into g_mInvScrProjectionArr
float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex)
{
    const float4x4 invScrProj = g_mInvScrProjectionArr[eyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Oblique projection: run the full inverse transform and divide z by w.
    const float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    // however this function must also work for orthographic projection so we keep it like this.
    const float2 zRow = invScrProj[2].zw; // (m22, m23)
    const float2 wRow = invScrProj[3].zw; // (m32, m33)
    const float viewZ = zRow.x * zDptBufSpace + zRow.y;
    const float viewW = wRow.x * zDptBufSpace + wRow.y;
    return viewZ / viewW;
#endif
}
// Excerpt ("..." marks elided code): computes the tile's maximum linear depth and reduces it
// across the thread group into the group-shared ldsZMax.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// establish max depth first
float linMaDist = 0.0;
// TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED = 32 * 32 pixels per tile.
// Each thread walks the tile's pixels in steps of NR_THREADS and reads their depth.
for (int idx = t; idx < (TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED); idx += NR_THREADS)
{
// Pixel coordinate inside the tile, clamped to the last valid screen pixel.
uint2 uPixCrd = min(uint2(viTilLL.x + (idx & (TILE_SIZE_CLUSTERED - 1)), viTilLL.y + (idx >> log2TileSize)), uint2(g_screenSize.x - 1, g_screenSize.y - 1));
//#ifdef MSAA_ENABLED
//for(int i=0; i<g_iNumSamplesMSAA; i++)
//{
//const float fDpth = FetchDepthMSAA(uPixCrd, i);
//const float2 fracSampleCoord = g_depth_tex.GetSamplePosition(i).xy; // this is optimized away when USE_OBLIQUE_MODE is NOT set.
//#else
const float fDpth = FetchDepth(uPixCrd);
const float2 fracSampleCoord = float2(0.5, 0.5);
//#endif
if (fDpth < VIEWPORT_SCALE_Z) // if not skydome
{
float linZ = GetLinearDepth(uPixCrd + fracSampleCoord, fDpth, eyeIndex);
#if USE_LEFT_HAND_CAMERA_SPACE
float linDistZ = linZ;
#else
float linDistZ = -linZ;
#endif
// Track the largest linear depth seen by this thread.
linMaDist = max(linDistZ, linMaDist);
}
//#ifdef MSAA_ENABLED
//}
//#endif
}
// Clamp to non-negative, then reduce across the group with InterlockedMax into ldsZMax.
// Non-negative floats keep their ordering when reinterpreted as uint, so asuint() can feed an integer max.
linMaDist = max(linMaDist, 0.0);
InterlockedMax(ldsZMax, asuint(linMaDist));
// This barrier separates the atomic max above from the read of ldsZMax below, so every thread reads the
// group-wide maximum. It is compiled out when NR_THREADS <= PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
linMaDist = asfloat(ldsZMax);
//if (fDpth < VIEWPORT_SCALE_Z)
// No pixel passed the depth test above (the max stayed at 0): fall back to the far plane.
if (linMaDist <= 0.0)
linMaDist = g_fFarPlane; // assume sky pixel
#endif
...
}
Build coarse list,SphericalIntersectionTests
跟lightlistbuild.compute一樣,這裡也同樣可以借用Big Tile的計算結果(g_vBigTileLightList),只遍歷Big Tile內的燈光列表來Build coarseList
然後SphericalIntersectionTests Tile內的燈光,剔除掉並沒有與Tile相交的燈光(DoesSphereOverlapTile)
// Excerpt ("..." marks elided code): builds the tile's coarse light list from screen-space AABBs,
// optionally starting from the big-tile light list, then optionally refines it with sphere tests.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
float2 vTileLL = float2(viTilLL.x / g_screenSize.x, viTilLL.y / g_screenSize.y);
float2 vTileUR = float2(viTilUR.x / g_screenSize.x, viTilUR.y / g_screenSize.y);
// build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
// Map this clustered tile's index to the index of the big tile that contains it.
const uint log2BigTileToClustTileRatio = firstbithigh(TILE_SIZE_BIG_TILE) - log2TileSize;
int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
const int bigTileIdx = bigTileBase + ((tileIDX.y >> log2BigTileToClustTileRatio) * NrBigTilesX) + (tileIDX.x >> log2BigTileToClustTileRatio); // map the idx to 64x64 tiles
// Entry 0 of a big tile's list is its light count; the light indices follow from entry 1.
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + 0];
for (int l0 = (int)t; l0 < (int)nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + l0 + 1];
#else
for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
{
#endif
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;
// The light's 2D bounds overlap this tile.
if (all(vMa > vTileLL) && all(vMi < vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
// lightOffs keeps counting past the capacity; only the first MAX_NR_COARSE_ENTRIES lights are stored.
if (uIndex < MAX_NR_COARSE_ENTRIES)
coarseList[uIndex] = l; // add to light list
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// The sphere test is given the tile's centre pixel, clamped to the last valid screen pixel.
iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(TILE_SIZE_CLUSTERED / 2,TILE_SIZE_CLUSTERED / 2), uint2(g_screenSize.x - 1, g_screenSize.y - 1))),
eyeIndex);
#endif
...
}
根據Tile內的linMaDist分割Cluster
以Tile內的linMaDist(fTileFarPlane)作為劃分Cluster的依據,即根據Tile內最遠的深度來決定對數底suggestedBase:fTileFarPlane離相機越近,suggestedBase越大,Cluster的Index就越集中分佈在靠前的深度。(提高Cluster的利用率)
函式影像:SuggestLogBase50
float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);
令d=rangeFittedDistance,由於\(1-4d(1-d)=(1-2d)^2\),開方後為\(|1-2d|\),且C=64時\(\frac{2}{C}=\frac{1}{32}\),可以化簡為:
\(\begin{cases}
suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.5 \\
suggestedBase=1 & \text{ if } d\ge0.5
\end{cases}\)
再取max(g_fClustBase, suggested_base)之後(g_fClustBase=1.02f),由\((\frac{1}{d}-1)^{\frac{1}{32}}=1.02\)解得\(d\approx0.3466\),suggestedBase變為:
\(\begin{cases}
suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.3466 \\
suggestedBase=1.02 & \text{ if } d\ge0.3466
\end{cases}\)
函式影像:SnapToClusterIdxFlex
f1(x,t)被限制在了[1.02,1.68]
1.68是代入rangeFittedDistance的最小值FLT_EPS計算得到的suggestedBase。
f2(x),f3(x)就是在演示suggestedBase在[1.02,1.68]之間滑動對SnapToClusterIdxFlex的影響。
可以看到當f1(x,t)從1.02變化到1.68的時候,Index的分佈由原本接近線性,變成了類似log曲線,使得更多的Index分佈在了靠前的深度。
// Logarithm of x in base b, via change of base: log_b(x) = log2(x) / log2(b).
float LogBase(float x, float b)
{
    const float logX = log2(x);
    const float logB = log2(b);
    return logX / logB;
}
// Maps a view-space depth to a cluster index in [0, C - 1], where C = 1 << g_iLog2NumClusters.
// The depth is normalised to the [near, far] range and pushed through a log curve whose base is
// 'suggestedBase'; this is the inverse of ClusterIdxToZFlex.
//   z_in           - view-space z (negated first unless USE_LEFT_HAND_CAMERA_SPACE is set)
//   suggestedBase  - log base controlling how clusters are distributed along depth
//   logBasePerTile - not read by this implementation
int SnapToClusterIdxFlex(float z_in, float suggestedBase, bool logBasePerTile)
{
#if USE_LEFT_HAND_CAMERA_SPACE
    const float viewDepth = z_in;
#else
    const float viewDepth = -z_in;
#endif

    const int clusterCount = 1 << g_iLog2NumClusters;

    // Distance past the near plane as a fraction of the near..far range (never negative).
    const float depthPastNear = max(0, viewDepth - g_fNearPlane);
    const float rangeFitted = depthPastNear / (g_fFarPlane - g_fNearPlane);

    // Index k satisfies base^k = lerp(1, base^C, rangeFitted).
    const float basePowCount = PositivePow(suggestedBase, (float)clusterCount);
    const float fractionalIdx = LogBase(lerp(1.0, basePowCount, rangeFitted), suggestedBase);

    return (int)clamp(fractionalIdx, 0.0, (float)(clusterCount - 1));
}
// Convenience wrapper around SnapToClusterIdxFlex: the per-tile log-base flag is decided at
// compile time by ENABLE_DEPTH_TEXTURE_BACKPLANE.
int SnapToClusterIdx(float z_in, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const bool perTileBase = true; // resolved compile time
#else
    const bool perTileBase = false;
#endif

    return SnapToClusterIdxFlex(z_in, suggestedBase, perTileBase);
}
// generate a log-base value such that half of the clusters are consumed from near plane to max. opaque depth of tile.
// With d = the tile far plane normalised to [FLT_EPS, 1] over the near..far range, the term under the
// square root equals (1 - 2d)^2, so the ratio below is (1 - d) / d for d < 0.5 and 1 otherwise.
// The result never drops below g_fClustBase.
float SuggestLogBase50(float tileFarPlane)
{
    const float clusterCount = (float)(1 << g_iLog2NumClusters);

    const float depthRange = g_fFarPlane - g_fNearPlane;
    const float d = clamp((tileFarPlane - g_fNearPlane) / depthRange, FLT_EPS, 1.0);

    const float discriminant = max(0.0, 1.0 - 4.0 * d * (1.0 - d));
    const float ratio = (1.0 + sqrt(discriminant)) / (2.0 * d);
    const float suggested = pow(ratio, 2.0 / clusterCount);

    // Author's note: g_fClustBase = 1.02f.
    return max(g_fClustBase, suggested);
}
// Excerpt ("..." marks elided code): picks the tile's log base, sorts the coarse list and
// converts every coarse light's depth bounds into a packed cluster index range.
#define MAX_NR_COARSE_ENTRIES 128
// Two lights share one entry (each light's min and max cluster index take 8 bits), hence 128 / 2 entries.
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES / 2];
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// With depth available, the tile far plane is the tile's max linear depth and drives the log base.
float fTileFarPlane = linMaDist;
float suggestedBase = SuggestLogBase50(fTileFarPlane);
#else // ENABLE_DEPTH_TEXTURE_BACKPLANE
float fTileFarPlane = g_fFarPlane;
float suggestedBase = g_fClustBase;
#endif
// Author's note: EXACT_EDGE_TESTS is not enabled (its #define is commented out in the original shader).
#ifdef EXACT_EDGE_TESTS
iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
#endif
// Author's note: the light-index sort is the same bitonic sort described in the previous article section.
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
#endif
//////////// cell specific code
// Convert the .w of each light's min/max bounds entries into a cluster index using suggestedBase
// (.w presumably holds the view-space depth of the bound — confirm against the bounds generation pass).
{
// Each iteration handles a pair of lights; for an odd count the last pair repeats the final light.
for (int l = (int)t; l < ((iNrCoarseLights + 1) >> 1); l += NR_THREADS)
{
const int l0 = coarseList[2 * l + 0], l1 = coarseList[min(2 * l + 1, iNrCoarseLights - 1)];
const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);
const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.min].w, suggestedBase));
const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.max].w, suggestedBase));
const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.min].w, suggestedBase));
const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.max].w, suggestedBase));
// Pack both lights' (min, max) cluster indices, 8 bits each, into one 32-bit entry.
clusterIdxs[l] = (clustIdxMa1 << 24) | (clustIdxMi1 << 16) | (clustIdxMa0 << 8) | (clustIdxMi0 << 0);
}
}
// Group sync after the clusterIdxs writes; compiled out when NR_THREADS <= PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
統計各個Cluster內的燈光數量[iSpaceAvail]
上面只是簡單地用Cluster Index範圍Test了燈光是否在Cluster內(CheckIntersectionBasic),這是不夠精準的,還需要檢測構成Cluster的頂點是否跟燈光Volume相交(CheckIntersection)。
如果相交了,燈光的Index才最終加入到g_vLayeredLightList裡。
相對應的lightCategory也要計數+1
#define NR_THREADS 64
#define LIGHTCATEGORY_COUNT 5
// One counter per (thread, light category). Each thread handles one cluster, so this holds every
// cluster's light count per category.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Per (thread, category) index shift into LightData/LightVolumeData
// (_EnvLightIndexShift / _DecalIndexShift / _LocalVolumetricFogIndexShift).
// NOTE(review): in the visible code these slots are only filled from constant-buffer values and read back,
// so reading the constants directly may be equivalent — confirm before simplifying.
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Planes for 4 lights, 6 planes per light; each plane is float4(vN.xyz, -dot(vN, p0)).
groupshared float4 lightPlanes[4 * 6];// Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)
// Cheap depth-range test: true when cluster index k lies inside the [min, max] cluster range
// stored for coarse light l. Two lights share each clusterIdxs entry (16 bits per light).
bool CheckIntersectionBasic(int l, int k)
{
    const uint packedPair = clusterIdxs[l >> 1];
    const uint range = (packedPair >> (16 * (l & 1))) & 0xffff; // low half for even l, high half for odd l
    const uint minCluster = range & 0xff;
    const uint maxCluster = (range >> 8) & 0xff;
    const uint cluster = (uint)k;

    return minCluster <= cluster && cluster <= maxCluster;
}
// Clears every per-category counter and index shift owned by the given thread.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    const uint firstSlot = threadIdx * LIGHTCATEGORY_COUNT;

    for (uint category = 0; category < LIGHTCATEGORY_COUNT; ++category)
    {
        const uint slot = firstSlot + category;
        categoryListCountScratch[slot] = 0;
        shiftIndexScratch[slot] = 0;
    }
}
// Stores the index shift for category 'index' in the slot owned by thread 'threadIdx'.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    shiftIndexScratch[slot] = value;
}
// Adds one to the light counter of category 'index' in the slot owned by thread 'threadIdx'.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}
// Excerpt ("..." marks elided code): each thread owns one cluster, reserves space for it in the
// global light list, then appends every coarse light that intersects the cluster.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// Author's note: g_iLog2NumClusters = 6, so nrClusters = 64.
int nrClusters = (1 << g_iLog2NumClusters);
//////////////////////////////////////////////////////////
uint start = 0;
int i = (int)t;
int iSpaceAvail = 0;
int iSum = 0;
if (i < nrClusters)
{
// Each thread checks it's respective cluster against all coarse lights for intersection.
// At the end, 'iSum' represents the number of lights that intersect this cluster!
for (int l = 0; l < iNrCoarseLights; l++)
{
iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
}
// We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
// want to allocate out of g_LayeredSingleIdxBuffer.
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
// 'start' receives the counter value from before the add, i.e. the base offset of this cluster's slice.
// iSpaceAvail is the (capped) number of entries reserved for this cluster.
// g_LayeredSingleIdxBuffer[0] therefore acts as the running allocation counter that every cluster adds to.
}
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
ZeroCategoryListCountAndShiftIndex(t);
WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG, _LocalVolumetricFogIndexShift);
uint offs = start;
// Walk the tile's coarse light list and append each intersecting light's index to this cluster's slice.
// iNrCoarseLights = min(lightOffs, MAX_NR_COARSE_ENTRIES), so it is at most 128.
for (int ll = 0; ll < iNrCoarseLights; ll += 4)
{
// Only the first 24 threads fetch planes: each iteration loads 6 planes for each of 4 lights.
int p = i >> 2;
int m = i & 3;
if (i < 24)
lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);
// Group sync after the lightPlanes writes; compiled out when NR_THREADS <= PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// Test this cluster against the (up to 4) lights whose planes were just fetched; CheckIntersection
// tests the cluster's 8 corners against each light plane.
for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
{
if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
{
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
// Bump this cluster's counter for the light's category; these LDS counters hold the
// per-category light counts of the cluster.
IncrementCategoryListCount(t, lightCategory);
// Subtract the category's index shift so the stored value indexes that category's own data
// (author's note: same as the end of BuildPerTileLightList in the previous article section).
g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
}
...
}
Fetch Plane
這裡的FetchPlane函式依舊是使用LightingConvexHullUtils.hlsl裡面的,
[GetHullPlane]根據不同面序號返回對應的平面上的一點以及平面的法向。
[GetHullPlaneEq]後續為了判斷點與平面的朝向(ToLeftTest)就預先構成float4(vN, -dot(vN,p0))這樣的表示平面方式。
//LightingConvexHullUtils.hlsl
// Returns one face of the hull described by (boxX, boxY, boxZ, center, scaleXY):
//   p0 - a point on face 'sideIndex'
//   n0 - the face normal, flipped if needed so that dot(n0, p0 - center) >= 0
void GetHullPlane(out float3 p0, out float3 n0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
//const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
// Axis group of the face: sides 0/1 -> 0, sides 2/3 -> 1, sides 4 and above -> 2.
const int iAbsSide = min(sideIndex>>1, 2);
// Sign selecting which of the two faces in the group: +1 for odd sideIndex, -1 for even.
const float fS = (sideIndex & 1) != 0 ? 1 : (-1);
float3 vA = fS*(iAbsSide == 0 ? boxX : (iAbsSide == 1 ? (-boxY) : boxZ));
float3 vB = fS*(iAbsSide == 0 ? (-boxY) : (iAbsSide == 1 ? (-boxX) : (-boxY)));
float3 vC = iAbsSide == 0 ? boxZ : (iAbsSide == 1 ? boxZ : (-boxX));
// Author's note: the top-quad condition is rather abstract; replaying it on the CPU (e.g. in C#) is an
// easy way to see which faces it selects.
bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0; // in this case all 4 verts get scaled.
bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1); // if side quad only two verts get scaled (impacts q1 and q2)
if (bIsTopQuad)
{
vB *= scaleXY.y;
vC *= scaleXY.x;
}
float3 vA2 = vA;
float3 vB2 = vB;
if (bIsSideQuad)
{
vA2 *= (iAbsSide == 0 ? scaleXY.x : scaleXY.y);
vB2 *= (iAbsSide == 0 ? scaleXY.y : scaleXY.x);
}
float3 vN = cross(vB2, 0.5 * (vA - vA2) - vC); // +/- normal
float3 v0 = vA + vB - vC; // vector from center to p0
p0 = center + v0; // center + vA is center of face when scaleXY is 1.0
// Flip vN when it points against v0, so the returned normal faces away from the center (outward).
n0 = dot(vN,v0) < 0.0 ? (-vN) : vN;
}
// Plane equation of hull face 'sideIndex' as float4(n, d) with d = -dot(n, p0), so that
// dot(plane, float4(P, 1)) is the signed distance-like value of P relative to the face.
float4 GetHullPlaneEq(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    float3 pointOnPlane;
    float3 normal;
    GetHullPlane(pointOnPlane, normal, boxX, boxY, boxZ, center, scaleXY, sideIndex);

    const float planeD = -dot(normal, pointOnPlane);
    return float4(normal, planeD);
}
CheckIntersection
判斷Cluster與燈光是否相交,有兩個判斷方式,
一個就是用前面計算的Cluster Index範圍做簡單的判斷,
第二個就是用Cluster ID計算出構成Cluster的八個頂點與燈光平面的幾何關係。
// Returns the view-space z of the near boundary of cluster k for the given log base
// (author's note: the base comes from each tile's ldsZMax). The near boundary of cluster k + 1
// is the far boundary of cluster k.
//   k              - cluster index
//   suggestedBase  - log base controlling how clusters are distributed along depth
//   logBasePerTile - not read by this implementation
float ClusterIdxToZFlex(int k, float suggestedBase, bool logBasePerTile)
{
    const float clusterCount = (float)(1 << g_iLog2NumClusters);

    // Fraction of the near..far range covered by the first k clusters: (b^k - 1) / (b^C - 1).
    const float numerator = PositivePow(suggestedBase, (float)k) - 1.0;
    const float denominator = PositivePow(suggestedBase, clusterCount) - 1.0;
    const float viewDepth = lerp(g_fNearPlane, g_fFarPlane, numerator / denominator);

#if USE_LEFT_HAND_CAMERA_SPACE
    return viewDepth;
#else
    return -viewDepth;
#endif
}
// Convenience wrapper around ClusterIdxToZFlex: the per-tile log-base flag is decided at
// compile time by ENABLE_DEPTH_TEXTURE_BACKPLANE.
float ClusterIdxToZ(int k, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const bool perTileBase = true; // resolved compile time
#else
    const bool perTileBase = false;
#endif

    return ClusterIdxToZFlex(k, suggestedBase, perTileBase);
}
// Returns true when coarse light l is considered to intersect cluster k of the tile spanning
// [viTilLL, viTilUR] in screen space.
// Step 1 is the cheap cluster-index range test. Step 2 (only with CONV_HULL_TEST_ENABLED) rejects
// the light if all 8 cluster corners lie on the positive side of any one of the light's 6 planes.
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
{
    // If this light's screen space depth bounds intersect this cluster...simple cluster test
    const uint packedRange = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    const uint firstCluster = packedRange & 0xff;
    const uint lastCluster = (packedRange >> 8) & 0xff;
    const uint cluster = (uint)k;
    bool overlaps = (firstCluster <= cluster) && (cluster <= lastCluster);

#ifdef CONV_HULL_TEST_ENABLED
    if (overlaps)
    {
        const float clusterNearZ = ClusterIdxToZ(k, suggestedBase);
        const float clusterFarZ = ClusterIdxToZ(k + 1, suggestedBase);

        for (int planeIdx = 0; planeIdx < 6; planeIdx++)
        {
            // lightPlanes holds 6 planes for each of the 4 lights of the current batch.
            const float4 plane = lightPlanes[6 * (l & 3) + planeIdx];
            bool allCornersOutside = true;

            for (int corner = 0; corner < 8; corner++)
            {
                // Bits 0/1/2 of 'corner' pick min or max along x, y and depth.
                const float cornerX = (corner & 1) == 0 ? viTilLL.x : viTilUR.x;
                const float cornerY = (corner & 2) == 0 ? viTilLL.y : viTilUR.y;
                const float cornerZ = (corner & 4) == 0 ? clusterNearZ : clusterFarZ;

                // View-space position of the corner from its screen position and linear depth.
                const float3 cornerVS = GetViewPosFromLinDepth(float2(cornerX, cornerY), cornerZ, eyeIndex);

                // Test each corner of the cluster against the light bounding box planes
                // plane = float4(vN, -dot(vN, p0)), so a positive dot means dot(vN, P) > dot(vN, p0).
                allCornersOutside = allCornersOutside && dot(plane, float4(cornerVS, 1.0)) > 0;
            }

            // A single plane separating the whole cluster from the light is enough to reject it.
            if (allCornersOutside)
                overlaps = false;
        }
    }
#endif

    return overlaps;
}
Final Resolve
上面的start值記錄的是當前Cluster在g_vLayeredLightList記錄LightData的起始Index。
categoryListCountScratch也記錄了Cluster不同的Category的LightData Count,
所以透過start以及各Category的LightData Count,就可以在g_vLayeredLightList中定址得到對應Category的LightData Index
// Index into the layered offset buffer for (eye, category, cluster, tile).
// Layout, outermost to innermost: eye, light category, cluster, tile y, tile x.
uint GenerateLayeredOffsetBufferIndex(uint lightCategory, uint2 tileIndex, uint clusterIndex, uint numTilesX, uint numTilesY, int numClusters, uint eyeIndex)
{
    // Each eye is split into category, cluster, x, y
    uint eyeOffset = eyeIndex * LIGHTCATEGORY_COUNT * numClusters * numTilesX * numTilesY;

    const uint layer = lightCategory * numClusters + clusterIndex;
    const uint row = layer * numTilesY + tileIndex.y;
    int lightOffset = row * numTilesX + tileIndex.x;

    return (eyeOffset + lightOffset);
}
// 67108863 = (1 << 26) - 1: the low 26 bits hold the offset, the remaining 6 bits hold the count (max 63).
#define LIGHT_CLUSTER_PACKING_OFFSET_MASK (67108863)
#define LIGHT_CLUSTER_PACKING_COUNT_MASK (63)
#define LIGHT_CLUSTER_PACKING_OFFSET_BITS (26)
// Packs a list offset and a light count into one uint: the offset is masked to 26 bits and the
// count is clamped to 63 before being shifted above the offset bits.
uint PackClusterLayeredOffset(uint offset, uint count)
{
    const uint offsetBits = offset & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
    const uint countBits = min(count, LIGHT_CLUSTER_PACKING_COUNT_MASK) << LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    return offsetBits | countBits;
}
// Per-category light counters for the clusters of the current tile (author's note: 64 clusters),
// one group of LIGHTCATEGORY_COUNT counters per thread.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Adds one to the light counter of category 'index' in the slot owned by thread 'threadIdx'.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}
// Reads the light counter of category 'index' from the slot owned by thread 'threadIdx'.
int ReadCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return categoryListCountScratch[slot];
}
// Index of a tile's entry in the log-base buffer (suggestedBase is stored once per tile):
// tiles are laid out row by row, with one full numTilesX * numTilesY block per eye.
uint GenerateLogBaseBufferIndex(uint2 tileIndex, uint numTilesX, uint numTilesY, uint eyeIndex)
{
    const uint tilesPerEye = numTilesX * numTilesY;
    const uint eyeOffset = eyeIndex * tilesPerEye;
    const uint rowOffset = tileIndex.y * numTilesX;
    return (eyeOffset + rowOffset + tileIndex.x);
}
// Excerpt ("..." marks elided code): reserves list space per cluster, fills it with intersecting
// lights, then writes one packed (offset, count) entry per category plus the tile's log base.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
uint start = 0;
int i = (int)t;
int iSpaceAvail = 0;
int iSum = 0;
if (i < nrClusters)
{
// Each thread checks it's respective cluster against all coarse lights for intersection.
// At the end, 'iSum' represents the number of lights that intersect this cluster!
for (int l = 0; l < iNrCoarseLights; l++)
{
iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
}
// We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
// want to allocate out of g_LayeredSingleIdxBuffer.
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
//start = g_LayeredSingleIdxBuffer[0];
//InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail);
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
// 'start' receives the counter value from before the add, i.e. the base offset of this cluster's slice.
// iSpaceAvail is the (capped) number of entries reserved for this cluster.
// g_LayeredSingleIdxBuffer[0] therefore acts as the running allocation counter that every cluster adds to.
}
...
// start is this cluster's base offset into g_vLayeredLightList; it is also the offset that gets packed
// into g_LayeredOffset further down.
uint offs = start;
for (int ll = 0; ll < iNrCoarseLights; ll += 4)
{
int p = i >> 2;
int m = i & 3;
if (i < 24)
lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
{
// iSpaceAvail is the number of lights that passed CheckIntersectionBasic (capped at MAX_NR_COARSE_ENTRIES).
if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
{
// Author's note: lights of different categories are stored back to back; the per-category counts
// read later through ReadCategoryListCount are what separates the categories.
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
IncrementCategoryListCount(t, lightCategory);
g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
}
...
uint localOffs = 0;
// Start at this cluster's entry for category 0 (LIGHTCATEGORY_PUNCTUAL per the author); each loop
// iteration then advances by one category stride, nrClusters * nrTilesX * nrTilesY.
// Author's note: nrClusters = 64.
offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
for (int category = 0; category < LIGHTCATEGORY_COUNT; category++)
{
// Light count of this category in the current cluster.
int numLights = ReadCategoryListCount(t, category);
if (i < nrClusters)
{
// start + localOffs is where this category's indices begin in g_vLayeredLightList and numLights is
// how many there are. g_vLayeredLightList stores the light indices; g_LayeredOffset is the lookup
// table that points into it.
g_LayeredOffset[offs] = PackClusterLayeredOffset((start + localOffs), (uint)numLights);
offs += (nrClusters * nrTilesX * nrTilesY);
localOffs += numLights; // use unclamped count for localOffs
}
}
// Store the tile's cluster-partition parameter (suggestedBase) so the light loop can map depth to a
// cluster index later.
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
if (threadID == 0)
g_logBaseBuffer[logBaseIndex] = suggestedBase;
#endif
...
}
LightLoop
類似FPTL,Cluster這裡也同樣呼叫的是GetCountAndStart介面獲取當前Cluster的lightData Count以及start的Index,
FetchIndex獲取最終的LightData Index。
最終的流程如下:
1.用posInput.tileCoord + posInput.linearDepth(GetLightClusterIndex)計算出當前片元歸屬的clusterIdx
2.clusterIdx + 當前要計算的category (GenerateLayeredOffsetBufferIndex)計算當前cluster在g_vLayeredOffsetsBuffer的idx
3.g_vLayeredOffsetsBuffer[idx]即為上面最後記錄的用來跳轉用的dataPair(PackClusterLayeredOffset)
4.UnpackClusterLayeredOffset得到start和lightCount
5.Lighting的時候從start開始FetchIndex就可以得到lightData的真正Index。
//LightLoopDef.hlsl
...
#elif defined(USE_CLUSTERED_LIGHTLIST)
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ClusteredUtils.hlsl"
// Tile size used by the clustered light list path.
uint GetTileSize()
{
    const uint tileSize = TILE_SIZE_CLUSTERED;
    return tileSize;
}
// Cluster index of a fragment in the given tile at the given linear depth.
// Uses the tile's own log base from g_logBaseBuffer when the log-base buffer is enabled,
// otherwise the global g_fClustBase.
uint GetLightClusterIndex(uint2 tileIndex, float linearDepth)
{
    const bool perTileBase = (g_isLogBaseBufferEnabled != 0);

    float clusterBase = g_fClustBase;
    if (perTileBase)
    {
        const uint tileSlot = GenerateLogBaseBufferIndex(tileIndex, _NumTileClusteredX, _NumTileClusteredY, unity_StereoEyeIndex);
        clusterBase = g_logBaseBuffer[tileSlot];
    }

    return SnapToClusterIdxFlex(linearDepth, clusterBase, perTileBase);
}
// Splits a packed value into its two fields: 'count' is everything above the offset bits,
// 'offset' is the value masked with the offset mask.
void UnpackClusterLayeredOffset(uint packedValue, out uint offset, out uint count)
{
    count = packedValue >> LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    offset = packedValue & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
}
// Looks up the packed (offset, count) entry of the given tile, cluster and light category for the
// current eye and unpacks it into 'start' and 'lightCount'.
void GetCountAndStartCluster(uint2 tileIndex, uint clusterIndex, uint lightCategory, out uint start, out uint lightCount)
{
    const int clusterCount = (1 << g_iLog2NumClusters);
    const int entryIndex = GenerateLayeredOffsetBufferIndex(lightCategory, tileIndex, clusterIndex, _NumTileClusteredX, _NumTileClusteredY, clusterCount, unity_StereoEyeIndex);

    const uint packedEntry = g_vLayeredOffsetsBuffer[entryIndex];
    UnpackClusterLayeredOffset(packedEntry, start, lightCount);
}
// Overload taking PositionInputs: derives the tile from posInput.tileCoord and the cluster from
// posInput.linearDepth, then forwards to the tile/cluster overload.
void GetCountAndStartCluster(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    // Note: XR depends on unity_StereoEyeIndex already being defined,
    // which means ShaderVariables.hlsl needs to be defined ahead of this!
    const uint2 tile = posInput.tileCoord;
    const uint cluster = GetLightClusterIndex(tile, posInput.linearDepth);

    GetCountAndStartCluster(tile, cluster, lightCategory, start, lightCount);
}
// Entry point used by the light loop on the clustered path; forwards to GetCountAndStartCluster.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    GetCountAndStartCluster(
        posInput,
        lightCategory,
        start,
        lightCount);
}
// Reads the light index stored at position lightStart + lightOffset of the clustered light list.
uint FetchIndex(uint lightStart, uint lightOffset)
{
    const uint listPosition = lightStart + lightOffset;
    return g_vLightListCluster[listPosition];
}
...