Unity的Forward+ FPTL光照剔除解析(四)

凶恶的真实發表於2024-04-08

序言

看完上一節基本上HDRP的光照剔除資料的流程就寫完了,這一節主要是解析適用於透明佇列的Cluster光照剔除。
為了讓Cluster在深度方向上分佈得更合理(提高Cluster的利用率),Cluster的光照剔除同樣藉助了PreDepth的深度圖。

RenderGraph Dispatch

這裡的buildPerVoxelLightListKernel有很多不同的變體,取決於:是否使用Big Tile的預處理結果、是否讀取Depth(用來計算suggestedBase以劃分Cluster)、MSAA是否開啟,以及投影是否為Oblique。
這裡主要解析的Kernel是TileLightListGen_DepthRT_SrcBigTile

//Cluster的TileSize
public static int s_TileSizeClustered = 32;

/// <summary>
/// Number of clustered-lighting tiles along X for the given camera:
/// the screen width divided by the clustered tile size, via HDUtils.DivRoundUp.
/// </summary>
static int GetNumTileClusteredX(HDCamera hdCamera)
{
    int screenWidth = (int)hdCamera.screenSize.x;
    int tileSize = LightDefinitions.s_TileSizeClustered;
    return HDUtils.DivRoundUp(screenWidth, tileSize);
}

unsafe void PrepareBuildGPULightListPassData(
    RenderGraph renderGraph,
    RenderGraphBuilder builder,
    HDCamera hdCamera,
    TileAndClusterData tileAndClusterData,
    ref ShaderVariablesLightList constantBuffer,
    int totalLightCount,
    TextureHandle depthStencilBuffer,
    TextureHandle stencilBufferCopy,
    GBufferOutput gBuffer,
    BuildGPULightListPassData passData)
{
    ...
    // Cluster
    bool msaa = hdCamera.msaaEnabled;
    var clustPrepassSourceIdx = hdCamera.frameSettings.IsEnabled(FrameSettingsField.BigTilePrepass) ? ClusterPrepassSource.BigTile : ClusterPrepassSource.None;
    var clustDepthSourceIdx = ClusterDepthSource.NoDepth;
    if (tileAndClusterData.clusterNeedsDepth)
        clustDepthSourceIdx = msaa ? ClusterDepthSource.MSAA_Depth : ClusterDepthSource.Depth;

    passData.buildPerVoxelLightListShader = buildPerVoxelLightListShader;
    passData.clearClusterAtomicIndexShader = clearClusterAtomicIndexShader;
    //類似UE的Shader宏開啟寫法
    passData.buildPerVoxelLightListKernel = isProjectionOblique ? s_ClusterObliqueKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx] : s_ClusterKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx];
    passData.numTilesClusterX = GetNumTileClusteredX(hdCamera);
    passData.numTilesClusterY = GetNumTileClusteredY(hdCamera);
    passData.clusterNeedsDepth = tileAndClusterData.clusterNeedsDepth;

    ...
}


/// <summary>
/// Records the compute commands that build the per-cluster (voxel) light lists:
/// clears the global atomic offset index, binds the inputs/outputs of the selected
/// clustered kernel and dispatches one thread group per clustered tile and per view.
/// Does nothing when <c>data.runLightList</c> is false.
/// </summary>
/// <param name="data">Pass data providing the shaders, kernel index, buffers and tile counts.</param>
/// <param name="cmd">Command buffer the commands are recorded into.</param>
static void VoxelLightListGeneration(BuildGPULightListPassData data, CommandBuffer cmd)
{
    if (!data.runLightList)
        return;

    // Clear the atomic offset index before the clustered kernel allocates list space from it.
    cmd.SetComputeBufferParam(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
    cmd.DispatchCompute(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, 1, 1, 1);

    var shader = data.buildPerVoxelLightListShader;
    var kernel = data.buildPerVoxelLightListKernel;

    // Every binding below targets the kernel that is actually dispatched.
    // s_ClearVoxelAtomicKernel is used above as a kernel index of clearClusterAtomicIndexShader,
    // so it is deliberately not used with the build shader.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLayeredLightList, data.output.perVoxelLightLists);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_LayeredOffset, data.output.perVoxelOffset);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // Big-tile prepass enabled: the kernel reads the big-tile light list.
    if (data.runBigTilePrepass)
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);

    // Cluster needs depth: bind the depth texture and the per-tile log-base buffer.
    if (data.clusterNeedsDepth)
    {
        cmd.SetComputeTextureParam(shader, kernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_logBaseBuffer, data.output.perTileLogBaseTweak);
    }

    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    ConstantBuffer.Push(cmd, data.lightListCB, shader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per clustered tile (X, Y) and per view (Z).
    cmd.DispatchCompute(shader, kernel, data.numTilesClusterX, data.numTilesClusterY, data.viewCount);
}

Initialize

首先依舊是跟之前的TileLightListGen類似,計算當前執行緒的Tile的對映關係要用到的資料.(Tile的X/Y軸上的數量,當前執行緒組對應的TileID)

#define TILE_SIZE_CLUSTERED (32)
//若data.clusterNeedsDepth==true,
//就使用TileLightListGen_DepthRT_SrcBigTile  LIGHTLISTGEN=TileLightListGen_DepthRT_SrcBigTile  ENABLE_DEPTH_TEXTURE_BACKPLANE
//即#define ENABLE_DEPTH_TEXTURE_BACKPLANE
#define ENABLE_DEPTH_TEXTURE_BACKPLANE

groupshared uint lightOffs;

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
groupshared uint ldsZMax;
#endif

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint eyeIndex = u3GroupID.z;

    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    //firstbithigh(32)=5
    const uint log2TileSize = firstbithigh(TILE_SIZE_CLUSTERED);
    uint nrTilesX = ((uint)g_screenSize.x +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.x,32)
    uint nrTilesY = ((uint)g_screenSize.y +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.y,32)

    // Screen space coordinates of clustered tile
    //當前Tile的左下角螢幕座標
    uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
    //當前Tile的右上角螢幕座標
    uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) );       // not width and height minus 1 since viTilUR represents the end of the tile corner.
    
    //重置lightOffs,ldsZMax(跟TileLightListGen類似需要求Tile內的ZMax)
    if(t==0)
    {
        lightOffs = 0;

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
        ldsZMax = 0;
#endif
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    ...
}

ldsZMax

透過遍歷Tile內的深度,得到linMaDist,最後再InterlockedMax Resolve得到ldsZMax(Tile內的Max Z)

#define TILE_SIZE_CLUSTERED (32)
#define VIEWPORT_SCALE_Z (1)

// Same approach as lightlistbuild.compute: converts a depth-buffer value at a screen
// position into linear depth using the per-eye inverse screen projection.
//   pixXY        - screen-space pixel position
//   zDptBufSpace - depth-buffer value (0 is near, 1 is far)
//   eyeIndex     - index into g_mInvScrProjectionArr
float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex) // 0 is near 1 is far
{
    float4x4 invScrProj = g_mInvScrProjectionArr[eyeIndex];

    #ifdef USE_OBLIQUE_MODE
    // Oblique mode: full transform, then divide z by w.
    float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
    #else
    // For a perspective projection m22 is zero and m23 is +1/-1 (left/right handed),
    // but the general form is kept so orthographic projections work as well.
    float zNumerator   = invScrProj[2].z * zDptBufSpace + invScrProj[2].w;
    float wDenominator = invScrProj[3].z * zDptBufSpace + invScrProj[3].w;

    return zNumerator / wDenominator;
    #endif
}

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    // establish max depth first
    float linMaDist = 0.0;

    //TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED=32*32
    //遍歷Cluster Tile內對應的Depth
    for (int idx = t; idx < (TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED); idx += NR_THREADS)
    {
        uint2 uPixCrd = min(uint2(viTilLL.x + (idx & (TILE_SIZE_CLUSTERED - 1)), viTilLL.y + (idx >> log2TileSize)), uint2(g_screenSize.x - 1, g_screenSize.y - 1));

        //#ifdef MSAA_ENABLED
        //for(int i=0; i<g_iNumSamplesMSAA; i++)
        //{
        //const float fDpth = FetchDepthMSAA(uPixCrd, i);
        //const float2 fracSampleCoord = g_depth_tex.GetSamplePosition(i).xy;     // this is optimized away when USE_OBLIQUE_MODE is NOT set.
        //#else
        const float fDpth = FetchDepth(uPixCrd);
        const float2 fracSampleCoord = float2(0.5, 0.5);
        //#endif

        if (fDpth < VIEWPORT_SCALE_Z) // if not skydome
        {
            float linZ = GetLinearDepth(uPixCrd + fracSampleCoord, fDpth, eyeIndex);
            #if USE_LEFT_HAND_CAMERA_SPACE
            float linDistZ = linZ;
            #else
            float linDistZ = -linZ;
            #endif

            //求ZMax
            linMaDist = max(linDistZ, linMaDist);
        }
        //#ifdef MSAA_ENABLED
        //}
        //#endif
    }

    //Resolve linMaDist併執行緒同步(InterlockedMax)賦值給ldsZMax
    linMaDist = max(linMaDist, 0.0);
    InterlockedMax(ldsZMax, asuint(linMaDist));

    //這個GroupMemoryBarrierWithGroupSync是必要的:要等執行緒組內所有執行緒的InterlockedMax都完成後,下面才能讀到最終的ldsZMax(執行緒組不超過一個Wave時則不需要)
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    linMaDist = asfloat(ldsZMax);
    //if (fDpth < VIEWPORT_SCALE_Z)
    if (linMaDist <= 0.0) 
        linMaDist = g_fFarPlane; // assume sky pixel
    #endif
    ...
}

Build coarse list,SphericalIntersectionTests

跟lightlistbuild.compute一樣,這裡也同樣可以借用Big Tile的計算結果(g_vBigTileLightList),只遍歷Big Tile內的燈光列表來Build coarseList
然後SphericalIntersectionTests Tile內的燈光,剔除掉並沒有與Tile相交的燈光(DoesSphereOverlapTile)

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
    float2 vTileLL = float2(viTilLL.x / g_screenSize.x, viTilLL.y / g_screenSize.y);
    float2 vTileUR = float2(viTilUR.x / g_screenSize.x, viTilUR.y / g_screenSize.y);

    // build coarse list using AABB
    #ifdef USE_TWO_PASS_TILED_LIGHTING

    //tileIDX對映bigTileIdx
    const uint log2BigTileToClustTileRatio = firstbithigh(TILE_SIZE_BIG_TILE) - log2TileSize;

    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
    const int bigTileIdx = bigTileBase + ((tileIDX.y >> log2BigTileToClustTileRatio) * NrBigTilesX) + (tileIDX.x >> log2BigTileToClustTileRatio); // map the idx to 64x64 tiles

    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + 0];
    for (int l0 = (int)t; l0 < (int)nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + l0 + 1];

    #else

    for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
    {
        #endif
        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;

        //在Tile內
        if (all(vMa > vTileLL) && all(vMi < vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            InterlockedAdd(lightOffs, uInc, uIndex);
            if (uIndex < MAX_NR_COARSE_ENTRIES) 
                coarseList[uIndex] = l; // add to light list
        }
    }

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);

    #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(TILE_SIZE_CLUSTERED / 2,TILE_SIZE_CLUSTERED / 2), uint2(g_screenSize.x - 1, g_screenSize.y - 1))),
                                                 eyeIndex);
    #endif


    ...
}

根據Tile內的linMaDist分割Cluster

以Tile內的linMaDist(fTileFarPlane)作為分割Cluster的依據,即根據Tile內最遠的深度進行劃分:fTileFarPlane離得越近,Cluster的Index在靠前的深度分佈得越多。(提高Cluster的利用率)

函式影像:SuggestLogBase50
float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);
可以化簡為:令d=rangeFittedDistance,C=64(g_iLog2NumClusters=6,故2/C=1/32),並注意到\(\sqrt{1-4d(1-d)}=|1-2d|\):
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.5 \\ suggestedBase=1 & \text{ if } d\ge0.5 \end{cases}\)
再經過max(g_fClustBase, suggested_base)之後(g_fClustBase=1.02f,臨界值\(d=\frac{1}{1+1.02^{32}}\approx0.3466\)),suggestedBase為:
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.3466 \\ suggestedBase=1.02 & \text{ if } d\ge0.3466 \end{cases}\)
函式影像:SnapToClusterIdxFlex
f1(x,t)被限制在了[1.02,1.68]
1.68是代入rangeFittedDistance的最小值FLT_EPS計算得到的suggestedBase。
f2(x),f3(x)就是在演示suggestedBase在[1.02,1.68]之間滑動對SnapToClusterIdxFlex的影響。
可以看到當f1(x,t)從1.02變化到1.68的時候,曲線由原本接近線性的分佈,變成了類似log曲線的形狀,使得更多的Index分佈在了前面的深度。


// Logarithm of x in base b, computed as log2(x) / log2(b).
float LogBase(float x, float b)
{
    float numerator = log2(x);
    float denominator = log2(b);
    return numerator / denominator;
}

// Maps a view-space z to a cluster slice index in [0, C - 1], where C = 1 << g_iLog2NumClusters.
//   z_in           - view-space z; negated first unless USE_LEFT_HAND_CAMERA_SPACE is set
//   suggestedBase  - base of the geometric series that sizes the slices
//   logBasePerTile - not read by this formulation (kept for interface compatibility)
int SnapToClusterIdxFlex(float z_in, float suggestedBase, bool logBasePerTile)
{
#if USE_LEFT_HAND_CAMERA_SPACE
    float viewDepth = z_in;
#else
    float viewDepth = -z_in;
#endif

    const int numClusters = 1 << g_iLog2NumClusters;
    const float lastSlice = (float)(numClusters - 1);

    // Position of the depth inside the [near, far] range, never below 0.
    const float depthFraction = max(0, viewDepth - g_fNearPlane) / (g_fFarPlane - g_fNearPlane);

    // Inverse of the geometric series: blend between base^0 and base^C, then take the log in that base.
    const float seriesValue = lerp(1.0, PositivePow(suggestedBase, (float) numClusters), depthFraction);
    const float slice = LogBase(seriesValue, suggestedBase);

    return (int)clamp(slice, 0.0, lastSlice);
}

// Convenience wrapper around SnapToClusterIdxFlex: the per-tile flag is fixed at compile
// time by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
int SnapToClusterIdx(float z_in, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const bool usePerTileBase = true;
#else
    const bool usePerTileBase = false;
#endif

    return SnapToClusterIdxFlex(z_in, suggestedBase, usePerTileBase);
}

// Generates a log-base value such that half of the clusters are consumed between the near
// plane and the tile's max opaque depth. The result is never smaller than g_fClustBase.
float SuggestLogBase50(float tileFarPlane)
{
    const float numClusters = (float)(1 << g_iLog2NumClusters);

    // Tile far plane as a fraction of the camera depth range, kept within [FLT_EPS, 1].
    float d = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPS, 1.0);

    float discriminant = max(0.0, 1.0 - 4.0 * d * (1.0 - d));
    float root = (1.0 + sqrt(discriminant)) / (2.0 * d);
    float candidate = pow(root, 2.0 / numClusters);

    return max(g_fClustBase, candidate);
}

#define MAX_NR_COARSE_ENTRIES       128
//兩盞燈的ClusterId Min MaxID合併成一個clusterIdxs 128/2
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES / 2];

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = linMaDist;
    float suggestedBase = SuggestLogBase50(fTileFarPlane);
    #else // ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = g_fFarPlane;
    float suggestedBase = g_fClustBase;
    #endif

    // //#define EXACT_EDGE_TESTS EXACT_EDGE_TESTS沒有啟用
    #ifdef EXACT_EDGE_TESTS
    iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
    #endif

    //這裡的燈光Index排序依舊是上一節的雙調排序
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
    #if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
    #endif

    //////////// cell specific code
    //根據BoundBuffer.w(viewPos.z),結合上面求的suggestedBase用SnapToClusterIdx指定Cluster ID
    {
        for (int l = (int)t; l < ((iNrCoarseLights + 1) >> 1); l += NR_THREADS)
        {
            const int l0 = coarseList[2 * l + 0], l1 = coarseList[min(2 * l + 1, iNrCoarseLights - 1)];
            const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
            const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);

            const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.max].w, suggestedBase));
            const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.max].w, suggestedBase));
            //這裡兩盞燈的 clustIdxMin,clustIdxMax合併成一個ClusterIdx 
            clusterIdxs[l] = (clustIdxMa1 << 24) | (clustIdxMi1 << 16) | (clustIdxMa0 << 8) | (clustIdxMi0 << 0);
        }
    }

    //執行緒同步
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif
    ...
}

統計各個Cluster內的燈光數量[iSpaceAvail]

上面只是根據Cluster Index的範圍簡單地Test了燈光是否在Cluster內,這是不夠精準的,還需要檢測構成Cluster的頂點是否與燈光Volume相交(CheckIntersection)
如果相交了,燈光的Index才最終加入到g_vLayeredLightList裡。
相對應的lightCategory也要計數+1

#define NR_THREADS       64
#define LIGHTCATEGORY_COUNT       5

//每個執行緒對應一個Cluster,即categoryListCountScratch記錄每個Cluster的Light Count
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
//記錄不同Category的在LightData/LightVolumeData中的Index偏移量(_EnvLightIndexShift/_DecalIndexShift/_LocalVolumetricFogIndexShift)
//這個值沒有相關操作,感覺是多餘的,不如直接用ConstantBuffer的變數。
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

//4盞燈,每盞燈記錄6個平面,每個平面用float4描述,float4(vN.xyz,-dot(vN,p0))
groupshared float4 lightPlanes[4 * 6];// Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)

// Coarse depth-range test: true when cluster slice k lies inside the [min, max] slice range
// stored for coarse light l. clusterIdxs holds two lights per uint, 16 bits each:
// low byte = min slice, next byte = max slice.
bool CheckIntersectionBasic(int l, int k)
{
    const unsigned int packedPair = clusterIdxs[l >> 1];
    const unsigned int lightBits = (packedPair >> (16 * (l & 1))) & 0xffff;

    const unsigned int sliceMin = (lightBits >> 0) & 0xff;
    const unsigned int sliceMax = (lightBits >> 8) & 0xff;
    const uint slice = (uint)k;

    return sliceMin <= slice && slice <= sliceMax;
}

// Clears the LIGHTCATEGORY_COUNT light counters and index shifts owned by thread threadIdx.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    const uint firstSlot = threadIdx * LIGHTCATEGORY_COUNT;

    for (int category = 0; category < LIGHTCATEGORY_COUNT; ++category)
    {
        const uint slot = firstSlot + category;
        categoryListCountScratch[slot] = 0;
        shiftIndexScratch[slot] = 0;
    }
}

// Stores the index shift for category `index` in the scratch slot of thread threadIdx.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    shiftIndexScratch[slot] = value;
}

// Adds one to the light counter of category `index` owned by thread threadIdx.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    //g_iLog2NumClusters=6 nrClusters=64
    int nrClusters = (1 << g_iLog2NumClusters);


    //////////////////////////////////////////////////////////

    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory

        //Start記錄前面所有Cluster包含的Light數量
        //iSpaceAvail記錄當前Cluster裡包含的Light數量
        //g_LayeredSingleIdxBuffer[0]是全域性的原子計數器(分配游標):InterlockedAdd返回累加前的值作為start,累加後的值即目前已分配的LightIndex總數
    }

    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    ZeroCategoryListCountAndShiftIndex(t);

    WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG, _LocalVolumetricFogIndexShift);

    uint offs = start;
    //遍歷Tile內燈光列表(CoarseLights),把通過相交測試的燈光Index放到對應Cluster的光照列表(g_vLayeredLightList)中
    //int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);最大值為128
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        //只有執行緒組前24執行緒FetchPlane,每次迴圈只fetch 4盞燈的24個平面
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

        //執行緒同步
        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif

        //對剛剛Fetch過平面的燈光,檢測Cluster的8個頂點是否與燈光的6個平面相交。
        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;

                //當前Cluster對應categoryListCount計數器Index++,
                //categoryListCountScratch LDS裡面的count才是不同Category Cluster內的lightDataCount
                IncrementCategoryListCount(t, lightCategory);

                //跟上一篇的BuildPerTileLightList末尾輸出LightIndex類似,也需要減去對應lightCategory偏移得到對應Category的Data Index
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif
    }
    ...
}

Fetch Plane

這裡的FetchPlane函式依舊是使用LightingConvexHullUtils.hlsl裡面的,
[GetHullPlane]根據不同面序號返回對應的平面上的一點以及平面的法向
[GetHullPlaneEq]後續為了判斷點與平面的朝向(ToLeftTest)預先構成float4(vN, -dot(vN,p0))這樣的表示平面方式

_D9F6E79F-8A33-47ed-B15D-A01A967A5788_.png

圖中標紅Cube的是Hull的頂點,紅線是Hull的平面法向
//LightingConvexHullUtils.hlsl

// Returns, for the hull face selected by sideIndex, a point on that face (p0) and the
// face normal (n0). The normal is not normalized here; its sign is chosen so that
// dot(n0, p0 - center) >= 0.
//   boxX, boxY, boxZ - box axis vectors
//   center           - box center
//   scaleXY          - scale applied to part of the face vertices (see bIsTopQuad / bIsSideQuad)
//   sideIndex        - face index; indices 0/1, 2/3 and 4+ map to iAbsSide 0, 1 and 2
void GetHullPlane(out float3 p0, out float3 n0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    //const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
    const int iAbsSide = min(sideIndex>>1, 2);
    // Odd face index -> +1, even face index -> -1.
    const float fS = (sideIndex & 1) != 0 ? 1 : (-1);

    float3 vA = fS*(iAbsSide == 0 ? boxX : (iAbsSide == 1 ? (-boxY) : boxZ));
    float3 vB = fS*(iAbsSide == 0 ? (-boxY) : (iAbsSide == 1 ? (-boxX) : (-boxY)));
    float3 vC = iAbsSide == 0 ? boxZ : (iAbsSide == 1 ? boxZ : (-boxX));

    // The bIsTopQuad condition is fairly abstract; stepping through it once in a small
    // C# simulation is the easiest way to understand it.
    bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0;        // in this case all 4 verts get scaled.
    bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1);        // if side quad only two verts get scaled (impacts q1 and q2)

    if (bIsTopQuad) 
    { 
        vB *= scaleXY.y; 
        vC *= scaleXY.x; 
    }

    float3 vA2 = vA;
    float3 vB2 = vB;

    if (bIsSideQuad) 
    {
        vA2 *= (iAbsSide == 0 ? scaleXY.x : scaleXY.y); 
        vB2 *= (iAbsSide == 0 ? scaleXY.y : scaleXY.x); 
    }

    float3 vN = cross(vB2, 0.5 * (vA - vA2) - vC);  // +/- normal
    float3 v0 = vA + vB - vC;   // vector from center to p0
    p0 = center + v0;           // center + vA is center of face when scaleXY is 1.0
    // Flip vN when dot(vN, v0) < 0.0 so the returned normal points away from the center.
    n0 = dot(vN,v0) < 0.0 ? (-vN) : vN;
}

// Plane equation of one hull face as float4(n, -dot(n, p)), where p and n come from
// GetHullPlane. With this form, dot(plane, float4(q, 1.0)) equals dot(n, q - p).
float4 GetHullPlaneEq(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    float3 pointOnPlane, normal;
    GetHullPlane(pointOnPlane, normal, boxX, boxY, boxZ, center, scaleXY, sideIndex);

    const float planeOffset = -dot(normal, pointOnPlane);
    return float4(normal, planeOffset);
}

CheckIntersection

判斷Cluster與燈光是否相交,有兩個判斷方式,
一個就是用前面計算的Cluster Index範圍做簡單的判斷,
第二個就是用Cluster ID計算出構成Cluster的八個頂點與燈光平面的幾何關係

// Returns the view-space z of the near plane of cluster slice k, using the suggestedBase
// derived from the tile's ldsZMax. The near plane of slice k + 1 is the far plane of slice k.
// logBasePerTile is not read by this formulation.
float ClusterIdxToZFlex(int k, float suggestedBase, bool logBasePerTile)
{
    const float numClusters = (float)(1 << g_iLog2NumClusters);

    // Geometric series up to slice k, relative to the series over all slices.
    const float partialSeries = PositivePow(suggestedBase, (float)k) - 1.0;
    const float fullSeries = PositivePow(suggestedBase, numClusters) - 1.0;
    const float depthFraction = partialSeries / fullSeries;

    const float linearDepth = lerp(g_fNearPlane, g_fFarPlane, depthFraction);

#if USE_LEFT_HAND_CAMERA_SPACE
    return linearDepth;
#else
    return -linearDepth;
#endif
}

// Convenience wrapper around ClusterIdxToZFlex: the per-tile flag is fixed at compile
// time by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
float ClusterIdxToZ(int k, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const bool usePerTileBase = true;
#else
    const bool usePerTileBase = false;
#endif

    return ClusterIdxToZFlex(k, suggestedBase, usePerTileBase);
}

// Tests whether coarse light l intersects cluster slice k of the current tile.
//   l                - index into the coarse light list (also selects the packed slice range in clusterIdxs)
//   k                - cluster slice index
//   viTilLL, viTilUR - lower-left / upper-right screen coordinates of the tile
//   suggestedBase    - log base used to convert slice indices to depths
//   eyeIndex         - view index
// The plane test reads lightPlanes[6 * (l & 3) + p], so the caller must have stored the
// six planes of light l in that slot beforehand. It is compiled only when
// CONV_HULL_TEST_ENABLED is defined; otherwise only the slice-range test is performed.
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
{
    // First do the cheap test: is cluster slice k inside the light's [min, max] slice range?
    // If not, return without running the plane test.
    // If this light's screen space depth bounds intersect this cluster...simple cluster test
    unsigned int val = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    bool bIsHit = ((val >> 0) & 0xff) <= ((uint)k) && ((uint)k) <= ((val >> 8) & 0xff);
    if (bIsHit)
    {
        #ifdef CONV_HULL_TEST_ENABLED
        float depthAtNearZ = ClusterIdxToZ(k, suggestedBase);
        float depthAtFarZ = ClusterIdxToZ(k + 1, suggestedBase);

        // If all corners of the cluster lie on the outer side of any single light plane,
        // the cluster and the light do not intersect.
        for (int p = 0; p < 6; p++)
        {
            float4 plane = lightPlanes[6 * (l & 3) + p];

            bool bAllInvisib = true;

            for (int i = 0; i < 8; i++)
            {
                // Bits 0/1/2 of i pick the low or high x / y / depth of the cluster corner.
                float x = (i & 1) == 0 ? viTilLL.x : viTilUR.x;
                float y = (i & 2) == 0 ? viTilLL.y : viTilUR.y;
                float z = (i & 4) == 0 ? depthAtNearZ : depthAtFarZ;

                // Reconstruct the view-space position from the screen coordinate and linear depth.
                float3 vP = GetViewPosFromLinDepth(float2(x, y), z, eyeIndex);

                // plane = float4(vN.xyz, -dot(vN, p0)), so
                // dot(plane, float4(vP, 1.0)) > 0  <=>  dot(vN, vP) > dot(vN, p0),
                // i.e. vP is on the side of the plane that the normal vN points to.

                // Test each corner of the cluster against the light bounding box planes
                bAllInvisib = bAllInvisib && dot(plane, float4(vP, 1.0)) > 0;
            }
            
            // Found a plane that fully separates the cluster from the light.
            if (bAllInvisib) 
                bIsHit = false;
        }
        #endif
    }

    return bIsHit;
}

Final Resolve

上面的start值記錄的是當前Cluster在g_vLayeredLightList記錄LightData的起始Index
categoryListCountScratch也記錄了Cluster不同的Category的LightData Count,
所以我們可以透過start以及對應的Category的LightData Count就可以在g_vLayeredLightList中定址得到對應的Category LightData Index


// Index into the layered offset buffer for one (eye, category, cluster, tile) entry.
// Within one eye the layout is [category][cluster][tile y][tile x].
uint GenerateLayeredOffsetBufferIndex(uint lightCategory, uint2 tileIndex, uint clusterIndex, uint numTilesX, uint numTilesY, int numClusters, uint eyeIndex)
{
    // Start of this eye's block.
    uint eyeBase = eyeIndex * LIGHTCATEGORY_COUNT * numClusters * numTilesX * numTilesY;

    // Position inside the eye's block, built from the outermost dimension inwards.
    uint layer = lightCategory * numClusters + clusterIndex;
    uint row = layer * numTilesY + tileIndex.y;
    int withinEye = row * numTilesX + tileIndex.x;

    return (eyeBase + withinEye);
}


//67108863=(1<<26)-1
#define LIGHT_CLUSTER_PACKING_OFFSET_MASK (67108863)

#define LIGHT_CLUSTER_PACKING_COUNT_MASK (63)
#define LIGHT_CLUSTER_PACKING_OFFSET_BITS (26)

// Packs a list offset and a light count into one uint: the offset is masked with
// LIGHT_CLUSTER_PACKING_OFFSET_MASK and the count, capped at LIGHT_CLUSTER_PACKING_COUNT_MASK,
// is stored above LIGHT_CLUSTER_PACKING_OFFSET_BITS.
uint PackClusterLayeredOffset(uint offset, uint count)
{
    const uint offsetBits = offset & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
    const uint countBits = min(count, LIGHT_CLUSTER_PACKING_COUNT_MASK) << LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    return offsetBits | countBits;
}

//統計當前Tile內的Cluster(64個)不同category計數器
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Adds one to the light counter of category `index` owned by thread threadIdx.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}

// Reads the light counter of category `index` owned by thread threadIdx.
int ReadCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return categoryListCountScratch[slot];
}

// Index into the log-base buffer; suggestedBase is stored once per tile and per eye.
// Layout: [eye][tile y][tile x].
uint GenerateLogBaseBufferIndex(uint2 tileIndex, uint numTilesX, uint numTilesY, uint eyeIndex)
{
    const uint eyeBase = eyeIndex * numTilesX * numTilesY;
    const uint rowBase = tileIndex.y * numTilesX;
    return (eyeBase + rowBase + tileIndex.x);
}


[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        
        //start = g_LayeredSingleIdxBuffer[0];
        //InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail);
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory

        //start記錄前面所有Cluster包含的Light數量
        //iSpaceAvail記錄當前Cluster裡包含的Light數量
        //g_LayeredSingleIdxBuffer[0]是全域性的原子計數器(分配游標):InterlockedAdd返回累加前的值作為start,累加後的值即目前已分配的LightIndex總數
    }

    ...
    //start記錄前面所有Cluster包含的Light數量,也是後續跳錶List(g_LayeredOffset)儲存的
    uint offs = start;
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif

        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            //iSpaceAvail是透過CheckIntersectionBasic測試的燈光數量
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                //不同lightCategory是連續儲存的,後續透過ReadCategoryListCount讀取到不同Category的LightData的數量
                //從而分開不同Category.
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
                IncrementCategoryListCount(t, lightCategory);
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif
    }

    ...
    uint localOffs = 0;

    //每個Cluster起始offset由LIGHTCATEGORY_PUNCTUAL(0)為標準
    //在loop裡面再累計[offs += (nrClusters * nrTilesX * nrTilesY);]

    //nrClusters=64
    offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);

    for (int category = 0; category < LIGHTCATEGORY_COUNT; category++)
    {
        //讀取當前Cluster中category對應的light Count
        int numLights = ReadCategoryListCount(t, category);
        if (i < nrClusters)
        {
            //(讀取g_vLayeredLightList的起始Index) start + localOffs 
            //(Cluster內category對應的light Count) numLights
            //g_vLayeredLightList裡面才儲存LightIndex,g_LayeredOffset作為跳轉的List

            g_LayeredOffset[offs] = PackClusterLayeredOffset((start + localOffs), (uint)numLights);
            offs += (nrClusters * nrTilesX * nrTilesY);
            localOffs += numLights; // use unclamped count for localOffs
        }
    }

    //為了後面LightingLoop對映ClusterIdx,需要儲存每個Tile的劃分Cluster的引數(suggestedBase)
    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
    if (threadID == 0) 
        g_logBaseBuffer[logBaseIndex] = suggestedBase;
    #endif
    ...
}

LightLoop

類似FPTL,Cluster這裡也同樣呼叫的是GetCountAndStart介面獲取當前Cluster的lightData Count以及start的Index,
FetchIndex獲取最終的LightData Index
最終的流程如下:
1.positionInput的TileCoord + Depth.z計算出當前片元歸屬的clusterIdx
2.clusterIdx + 當前要計算的category (GenerateLayeredOffsetBufferIndex)計算當前cluster在g_vLayeredOffsetsBuffer的idx
3.g_vLayeredOffsetsBuffer[idx]即為上面最後記錄的用來跳轉用的dataPair(PackClusterLayeredOffset)
4.UnpackClusterLayeredOffset得到start和lightCount
5.Lighting的時候從start開始FetchIndex就可以得到lightData的真正Index

//LightLoopDef.hlsl

...

#elif defined(USE_CLUSTERED_LIGHTLIST)

#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ClusteredUtils.hlsl"

// Tile size used by the clustered light list (TILE_SIZE_CLUSTERED).
uint GetTileSize()
{
    const uint tileSize = TILE_SIZE_CLUSTERED;
    return tileSize;
}

// Cluster slice index for a linear depth inside the given tile. Uses the per-tile log base
// from g_logBaseBuffer when g_isLogBaseBufferEnabled is non-zero, g_fClustBase otherwise.
uint GetLightClusterIndex(uint2 tileIndex, float linearDepth)
{
    const bool perTileBase = (g_isLogBaseBufferEnabled != 0);

    float logBase = g_fClustBase;
    if (perTileBase)
    {
        const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIndex, _NumTileClusteredX, _NumTileClusteredY, unity_StereoEyeIndex);
        logBase = g_logBaseBuffer[logBaseIndex];
    }

    return SnapToClusterIdxFlex(linearDepth, logBase, perTileBase);
}

// Splits a packed value into its offset (bits selected by LIGHT_CLUSTER_PACKING_OFFSET_MASK)
// and its count (bits above LIGHT_CLUSTER_PACKING_OFFSET_BITS).
void UnpackClusterLayeredOffset(uint packedValue, out uint offset, out uint count)
{
    count = packedValue >> LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    offset = packedValue & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
}

// Looks up the (start, lightCount) pair of one light category for a given tile and cluster
// by reading and unpacking the matching entry of g_vLayeredOffsetsBuffer.
void GetCountAndStartCluster(uint2 tileIndex, uint clusterIndex, uint lightCategory, out uint start, out uint lightCount)
{
    const int numClusters = (1 << g_iLog2NumClusters);
    const int entryIndex = GenerateLayeredOffsetBufferIndex(lightCategory, tileIndex, clusterIndex, _NumTileClusteredX, _NumTileClusteredY, numClusters, unity_StereoEyeIndex);

    UnpackClusterLayeredOffset(g_vLayeredOffsetsBuffer[entryIndex], start, lightCount);
}

// Looks up (start, lightCount) for the cluster containing posInput: the tile comes from
// posInput.tileCoord and the cluster slice from posInput.linearDepth.
// Note: XR depends on unity_StereoEyeIndex already being defined,
// which means ShaderVariables.hlsl needs to be defined ahead of this!
void GetCountAndStartCluster(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    const uint2 tile = posInput.tileCoord;
    const uint cluster = GetLightClusterIndex(tile, posInput.linearDepth);

    GetCountAndStartCluster(tile, cluster, lightCategory, start, lightCount);
}

// Clustered light-list entry point: resolves the tile and cluster slice of posInput, then
// looks up the (start, lightCount) pair of the requested light category.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    const uint2 tile = posInput.tileCoord;
    const uint cluster = GetLightClusterIndex(tile, posInput.linearDepth);

    GetCountAndStartCluster(tile, cluster, lightCategory, start, lightCount);
}

// Reads the entry at lightStart + lightOffset from g_vLightListCluster.
uint FetchIndex(uint lightStart, uint lightOffset)
{
    const uint listIndex = lightStart + lightOffset;
    return g_vLightListCluster[listIndex];
}

...

相關文章