Unity的Forward+ FPTL光照剔除解析(三)

凶恶的真实發表於2024-04-06

序言

如果看了前面的BigTileLightList的建立,這一章會簡單一點。
因為如果啟用了BigTile之後,這裡的BuildPerTileLightList就需要從BigTileLightList裡面讀取LightList。
否則還是需要像BigTile一樣去先走同樣的燈光剔除流程(NDCAABBBoundTest,SphericalIntersectionTests)
然後才到最後的FinePruneLightsTest

LightListBuild

RenderGraph Dispatch

下面是RenderGraph中Dispatch時需要的Buffer以及ConstantBuffer

//HDRenderPipeline.LightLoop.cs
/// <summary>
/// Binds the inputs/outputs of the per-tile light list kernel and records its dispatch.
/// Nothing is recorded unless both <c>data.runLightList</c> and <c>data.runFPTL</c> are set.
/// </summary>
/// <param name="data">Pass data providing the compute shader, kernel index, buffers and tile counts.</param>
/// <param name="tileFlagsWritten">Set to true when the tile feature flags buffer is bound as an output; left untouched otherwise.</param>
/// <param name="cmd">Command buffer that receives the parameter bindings and the dispatch.</param>
static void BuildPerTileLightList(BuildGPULightListPassData data, ref bool tileFlagsWritten, CommandBuffer cmd)
{
    // optimized for opaques only
    if (!(data.runLightList && data.runFPTL))
    {
        return;
    }

    var shader = data.buildPerTileLightListShader;
    var kernel = data.buildPerTileLightListKernel;

    // Light AABB bounds buffer (described in the article as the output of the first culling step).
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);

    // Compute buffers holding the LightVolumeData and the SFiniteLightBound (convex bounds) entries.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    // Depth texture the kernel samples to derive the per-tile depth range.
    cmd.SetComputeTextureParam(shader, kernel, HDShaderIDs.g_depth_tex, data.depthBuffer);

    // Final output: the per-tile light list.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLightList, data.output.lightList);

    // The big-tile light list is only bound when the big-tile prepass ran.
    if (data.runBigTilePrepass)
    {
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);
    }

    // Work on a local copy of the constant buffer contents so the base feature flags can be patched in.
    var lightListConstants = data.lightListCB;

    // Feature variants: the kernel also emits per-tile feature flags. The base flags assembled here
    // are the ones that do not depend on which local lights touch a tile.
    if (data.enableFeatureVariants)
    {
        uint featureFlags = data.directionalLightCount > 0 ? (uint)LightFeatureFlags.Directional : 0u;
        featureFlags |= data.skyEnabled ? (uint)LightFeatureFlags.Sky : 0u;

        // When material variants are not computed, every material feature bit is enabled up front.
        if (!data.computeMaterialVariants)
        {
            featureFlags |= LightDefinitions.s_MaterialFeatureMaskFlags;
        }

        lightListConstants.g_BaseFeatureFlags = featureFlags;

        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_TileFeatureFlags, data.output.tileFeatureFlags);
        tileFlagsWritten = true;
    }

    ConstantBuffer.Push(cmd, lightListConstants, shader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per FPTL tile in X/Y, one Z slice per view.
    cmd.DispatchCompute(shader, kernel, data.numTilesFPTLX, data.numTilesFPTLY, data.viewCount);
}

Initialize

與BigTile類似,先計算當前執行緒組與Tile之間的對映關係要用到的資料(Tile在X/Y軸上的數量、當前執行緒組對應的TileID)。

//FPTL這一步的Tile Size為16*16
#define TILE_SIZE_FPTL (16)

[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    if(t<LIGHT_LIST_MAX_COARSE_ENTRIES)
        prunedList[t]=0;

    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    uint nrTilesX = (iWidth+15)/16;
    uint nrTilesY = (iHeight+15)/16;

    // build tile scr boundary
    const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint
    if(t==0)
    {
        ldsZMin = uFltMax;
        ldsZMax = 0;
        lightOffs = 0;
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

...
}

Hi-z剔除

讀取PreDepth深度圖獲取4個畫素內的Min、MaxDepth,透過執行緒同步計算得出一個執行緒組內的Min/MaxDepth 即(64*4=16*16),建立起當前Tile的Bounds(ldsZMin/ldsZMax)

讀取深度圖,計算viewPosition.z

LinearDepth

camera.projectionMatrix是右手座標系的(OpenGL),為了統一整個剔除流程的軸向,ScrProjection翻轉了z軸,採用左手座標系

groupshared uint ldsZMin;
groupshared uint ldsZMax;

TEXTURE2D_X(g_depth_tex) : register( t0 );

// Loads the depth-buffer value at pixCoord and returns it in the "0 is near, 1 is far" convention.
float FetchDepth(uint2 pixCoord)
{
    const float rawDepth = LOAD_TEXTURE2D_X(g_depth_tex, pixCoord.xy).x;
    // Reference from the article: https://zhuanlan.zhihu.com/p/389971233
    // With reversed Z the stored value is flipped, so flip it back on read
    // to match the 0-is-near / 1-is-far convention used by the culling code.
#if UNITY_REVERSED_Z
    return 1.0 - rawDepth;
#else
    return rawDepth;
#endif
}

//USE_OBLIQUE_MODE m_LightListProjMatrices.m20!= 0 || m_LightListProjMatrices.m21 != 0;
//即投影矩陣的r+l!=0,t+b!=0
//linearDepth; // View space Z coordinate                              : [Near, Far]
//Reverse z :-z_eye=1/((n-f)/(n*f)*depth+1/n); //https://zhuanlan.zhihu.com/p/393643084
// Converts a depth-buffer value (0 is near, 1 is far) at screen position pixXY into a linear depth
// by applying the inverse screen projection of the current eye.
float GetLinearDepth(float2 pixXY, float zDptBufSpace)    // 0 is near 1 is far
{
    float4x4 invScrProj = g_mInvScrProjectionArr[unity_StereoEyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Oblique projection: x/y contribute to z/w, so a full matrix multiply is required.
    float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // Article note: the orthographic case is driven by the numerator (m22*z + m23),
    // the perspective case by the denominator (m32*z + m33).
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    // however this function must also work for orthographic projection so we keep it like this.
    float2 zRow = invScrProj[2].zw; // (m22, m23)
    float2 wRow = invScrProj[3].zw; // (m32, m33)

    return (zRow.x*zDptBufSpace+zRow.y) / (wRow.x*zDptBufSpace+wRow.y);
#endif
}
GetViewPosFromLinDepth

這裡簡單的以x軸的推導為例
由投影矩陣中的相似三角形易得
[Unity Shader入門精要]
{92766D46-7FAD-4038-AAD9-06849DBD8F6B}.png
\(\frac{(ScreenPos.x-pixWidth/2)}{pixWidth/2}=\frac{clipPos.x}{clipPos.w}\)

\(\frac{clipPos.x}{clipPos.w}=\frac{viewPos.x*\frac{cotFOV}{Aspect}}{-viewPos.z}\)

\(viewPos.x = \frac{ScreenPos.x-pixWidth/2}{pixWidth/2*\frac{cotFOV}{Aspect}}*(-viewPos.z)\)

由於之前ScrProjection已經FlipZ,所以可以直接fLinDepth * p.xy

unsafe void PrepareBuildGPULightListPassData(
    RenderGraph renderGraph,
    RenderGraphBuilder builder,
    HQCamera hqCamera,
    TileAndClusterData tileAndClusterData,
    ref ShaderVariablesLightList constantBuffer,
    int totalLightCount,
    TextureHandle depthStencilBuffer,
    TextureHandle stencilBufferCopy,
    BuildGPULightListPassData passData)
{
    ....

    // camera to screen matrix (and it's inverse)
    for (int viewIndex = 0; viewIndex < hqCamera.viewCount; ++viewIndex)
    {
        var proj = camera.projectionMatrix;
        // Note: we need to take into account the TAA jitter when indexing the light list
        proj = hqCamera.RequiresCameraJitter() ? hqCamera.GetJitteredProjectionMatrix(proj) : proj;

        m_LightListProjMatrices[viewIndex] = proj * s_FlipMatrixLHSRHS;

        var tempMatrix = temp * m_LightListProjMatrices[viewIndex];
        var invTempMatrix = tempMatrix.inverse;

        for (int i = 0; i < 16; ++i)
        {
            cb.g_mScrProjectionArr[viewIndex * 16 + i] = tempMatrix[i];
            cb.g_mInvScrProjectionArr[viewIndex * 16 + i] = invTempMatrix[i];
        }
    }
}

\(pixWidth/2*\frac{cotFOV}{Aspect}=fSx\)
\(pixHeight/2*cotFOV=fSy\)
\(pixWidth/2=fCx\)
\(pixHeight/2=fCy\)

//
// Reconstructs a view-space position from a screen position and a linear depth,
// using the scale (fSx, fSy) and centre (fCx, fCy) terms of the current eye's screen projection.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
    float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    bool ortho = g_isOrthographic != 0;

    // Scale terms sit on the diagonal; the centre terms are read from the w column for
    // orthographic projections and from the z column otherwise.
    float2 scale = float2(scrProj[0].x, scrProj[1].y);
    float2 center = ortho ? float2(scrProj[0].w, scrProj[1].w) : float2(scrProj[0].z, scrProj[1].z);

    // Sign applied to the screen position: +1 for the left-handed variant (always used for
    // orthographic projections), -1 otherwise.
    #if USE_LEFT_HAND_CAMERA_SPACE
    float handedness = 1;
    #else
    float handedness = ortho ? 1 : (-1);
    #endif

    float2 p = float2((handedness * v2ScrPos.x - center.x) / scale.x, (handedness * v2ScrPos.y - center.y) / scale.y);

    // Perspective: x/y scale with depth. Orthographic: x/y are independent of depth.
    float2 viewXY = ortho ? p.xy : (fLinDepth * p.xy);
    return float3(viewXY, fLinDepth);
}

這裡讀取深度圖並將其轉換到[Near,Far],然後計算出每個執行緒負責的4個畫素中的MinDepth,MaxDepth,
然後透過執行緒同步(InterlockedMax/InterlockedMin)計算執行緒組內(Tile內)的MinDepth,MaxDepth
注:FPTL的Tile Size為16*16,64執行緒一組,一個執行緒計算4個畫素。(16*16=64*4),這裡的同步計算不會影響到別的執行緒組(Tile)

#define NR_THREADS              64
#define TILE_SIZE_FPTL (16)
#define VIEWPORT_SCALE_Z (1)
#define PIXEL_PER_THREAD      ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
//16*16/64=4
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    uint2 viTilLL = 16*tileIDX;

    // establish min and max depth first
    float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;


#if PIXEL_PER_THREAD == 4
    float4 vLinDepths;
#else
    float vLinDepths[PIXEL_PER_THREAD];
#endif
    {

        //VIEWPORT_SCALE_Z

        // Fetch depths and calculate min/max
        UNITY_UNROLL
        for(int i = 0; i < PIXEL_PER_THREAD; i++)
        {
            int idx = i * NR_THREADS + t;
            uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
            const float fDepth = FetchDepth(uCrd);
            vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
            if(fDepth<VIEWPORT_SCALE_Z)     // if not skydome
            {
                dpt_mi = min(fDepth, dpt_mi);
                dpt_ma = max(fDepth, dpt_ma);
            }
        }

        
        InterlockedMax(ldsZMax, asuint(dpt_ma));
        InterlockedMin(ldsZMin, asuint(dpt_mi));

#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
        GroupMemoryBarrierWithGroupSync();
#endif
    }

    ...
}

NDCAABBBoundTest

若啟用了BigTile預計算,則透過對映讀取LightOffset(Big-tile內的燈光數量)以及對應的lightIndex
若沒有則按原樣直接遍歷g_vBoundsBuffer,用AABB計算當前燈光是否在Tile內,若在則加入到coarseList中

[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // build coarse list using AABB
    //若啟用了BigTile計算


    #ifdef USE_TWO_PASS_TILED_LIGHTING
    //firstbithigh(64)=6 64*64
    //firstbithigh(16)=4 16*16
    //log2BigTileToTileRatio=2 
    //#define TILE_SIZE_FPTL (16)
    //#define TILE_SIZE_BIG_TILE (64)
    //即4*4個FPTL Tile構成一個Big Tile log2_4=2
    const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);

    //計算Big Tile X/Y的數量
    //((1 << log2BigTileToTileRatio) -1 ) 1<<2-1 = 3 相當於DivRoundUp(nrTilesX,4)
    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
    //當前Eye的BigTile起始偏移(eyeIndex * 每個Eye的BigTile總數)
    const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
    //計算當前Tile對應的BigTile
    const int bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio);       // map the idx to 64x64 tiles
    //第一個元素記錄當前BigTile的燈光數量
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
    //遍歷BigTileLightList
    for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
    
    #else
    //若沒有啟用Big Tile,則直接遍歷所有的燈光,利用Scrbound計算的AABBBound計算Tile(16*16)的當前燈光列表(coarseList)
    for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
    {
        #endif
        // Skip Local Volumetric Fog (lights are sorted by category). TODO: improve data locality

        // if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG) { break; }

        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);
        const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;
        const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;

        if (all(vMa > vTileLL) && all(vMi < vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            InterlockedAdd(lightOffs, uInc, uIndex);
            if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
        }
    }
    ...
}

清空ldsDoesLightIntersect(初始化為0),並同步執行緒組

#define FINE_PRUNING_ENABLED
#define LIGHT_LIST_MAX_COARSE_ENTRIES (64)//coarseList/prunedList LDS的最大容量為64
//uint 32Bit容納不了64盞燈的燈光與當前Tile相交情況,所以加多了一個記錄另外32盞燈.
groupshared uint ldsDoesLightIntersect[2];
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifdef FINE_PRUNING_ENABLED
    if (t < 2)
        ldsDoesLightIntersect[t] = 0;
    #endif

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    //coarseList/prunedList LDS的最大容量為64
    int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);
    ...
}

SphericalIntersectionTest,FinePruneLights

SphericalIntersectionTest

這裡的SphericalIntersectionTest與BigTile中的SphericalIntersectionTest唯一的區別,就是需要先把coarseList複製到prunedList暫存,
檢測到燈光Overlap Tile之後,再把暫存在prunedList中的lightIndex寫回coarseList

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

// Filters the first iNrCoarseLights entries of coarseList with a sphere-vs-tile overlap test.
// coarseList is first backed up into prunedList; every light whose bounding sphere passes
// DoesSphereOverlapTile is then written back into coarseList, compacted through the
// lightOffsSph counter.
//   threadID         - index of the calling thread; the loops stride by NR_THREADS.
//   iNrCoarseLights  - number of valid entries in coarseList on entry.
//   screenCoordinate - screen position passed to GetViewPosFromLinDepth to build V
//                      (the caller shown in this article passes the tile centre).
// Returns lightOffsSph, i.e. the number of lights that passed the test.
// NOTE(review): lightOffsSph, coarseList and prunedList are declared outside this excerpt;
// presumably they are groupshared - confirm against the full shader.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
    // A single thread resets the counter of accepted lights.
    if (threadID == 0) lightOffsSph = 0;

    // make a copy of coarseList in prunedList.
    int l;
    for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
        prunedList[l] = coarseList[l];

    // The barrier is compiled in only when NR_THREADS exceeds PLATFORM_LANE_COUNT.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // View-space position for screenCoordinate at depth +1 or -1, depending on camera-space handedness.
    #if USE_LEFT_HAND_CAMERA_SPACE
    float3 V = GetViewPosFromLinDepth(screenCoordinate, 1.0);
    #else
    float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
    #endif

    float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
    float halfTileSizeAtZDistOne = 8 * onePixDiagDist; // scale by half a tile

    for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
    {
        // Look up the bounding data of the backed-up light for the current eye.
        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[l], g_iNrVisibLights, unity_StereoEyeIndex);
        SFiniteLightBound lightData = g_data[lightBoundIndex];

        if (DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic != 0))
        {
            // Reserve the next output slot; uIndex receives the counter value before the add.
            unsigned int uIndex;
            InterlockedAdd(lightOffsSph, 1, uIndex);
            coarseList[uIndex] = prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
        }
    }

    // Synchronize again before lightOffsSph is read back as the return value.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    return lightOffsSph;
}

[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(16 / 2, 16 / 2), uint2(iWidth - 1, iHeight - 1))));
    #endif
    ...
}

FinePruneLights

s_lightVolumesCache LDS

在開始計算FinePruneLights之前,需要預先記錄coarseList對應的LightVolume進LDS s_lightVolumesCache(StoreLightVolumeCache)中

#define FINE_PRUNING_ENABLED

[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}

//

//eyeIndex==0時,GetCoarseLightIndex=>coarseList[l]
// Maps a light index to its slot in the light cull data for the given eye.
// The index is clamped to numVisibleLights - 1, then offset by eyeIndex * numVisibleLights.
uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex)
{
    // Stay within bounds
    const uint clampedLightIndex = min(lightIndex, numVisibleLights - 1);

    // For monoscopic, there is just one set of light cull data structs.
    // In stereo, all of the left eye structs are first, followed by the right eye structs.
    return eyeIndex * numVisibleLights + clampedLightIndex;
}

// Returns the light cull data index of coarseList[l] for the current eye,
// or 0 when l is not below iNrCoarseLights.
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
    if (l >= iNrCoarseLights)
        return 0;

    return GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex);
}

//

groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];

// Packs a volume type and a coarse light index into one uint and stores it in
// s_lightVolumesCache[lightIndex].
void StoreLightVolumeCache(int lightIndex, int coarseIndex, uint volumeType)
{
    // 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement.
    const uint typeBits = volumeType & 0x7;
    // 29 bits for a coarse light index.
    const uint indexBits = (uint)(coarseIndex << 3);

    s_lightVolumesCache[lightIndex] = typeBits | indexBits;
}

// Reads s_lightVolumesCache[lightIndex] and unpacks it: the low 3 bits go to volumeType,
// the remaining bits (shifted down by 3) go to coarseIndex.
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
    int cacheEntry = s_lightVolumesCache[lightIndex];
    volumeType = cacheEntry & 0x7;
    coarseIndex = cacheEntry >> 3;
}


// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    uint t = threadID;
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;

    uint uLightsFlags[2] = {0, 0};
    int l = 0;
    // need this outer loop even on xb1 and ps4 since direct lights and
    // reflection lights are kept in separate regions.

    if (threadID < (uint)iNrCoarseLights)
    {
        int idxCoarse = GetCoarseLightIndex((int)threadID, iNrCoarseLights);
        int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
        StoreLightVolumeCache(threadID, idxCoarse, uLightVolume);
    }

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    ....
}


判交計算

讀取LDS中的volumeData,並且利用之前的DepthBound(vLinDepths)逐畫素進行判交

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

//(記錄Tile內燈光數量)
groupshared int ldsNrLightsFinal;

// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    ...

    //When using LDS to cache the volume data, this produces the best most optimal code.
    //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
    for (; l < iNrCoarseLights; ++l)
    {
        int idxCoarse;
        int uLightVolume;
        //讀取之前的LightVolumeData
        LoadLightVolumeCache(l, idxCoarse, uLightVolume);
        bool lightValid = false;
        if (uLightVolume == LIGHTVOLUMETYPE_CONE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                //先計算當前畫素的深度對應的ViewPosition
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                //LightData.lightPos是View Space
                // check pixel
                //用當前畫素到LightPos的向量fromLight以及LightAxis判斷fromLight在Cone內部
                float3 fromLight = vVPos - lightData.lightPos.xyz;
                float distSq = dot(fromLight, fromLight);
                const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz

                float2 V = abs(float2(dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz)));
                //bIsSpotDisc=true
                //即fDist2D=sqrt(dot(fromLight, lightData.lightAxisX)^2+dot(fromLight, lightData.lightAxisY)^2);
                float fDist2D = bIsSpotDisc ? length(V) : max(V.x, V.y);



                //lightData.radiusSq>distSq
                //fSclProj>fDist2D * lightData.cotan即fSclProj/fDist2D>lightData.cotan,用fromLight和Axis計算夾角的cot

                //lightVolumeData radiusSq的計算
                //lightVolumeData.radiusSq = range * range;

                //lightVolumeData cotan的計算
                // var sa = light.spotAngle;
                // var cs = Mathf.Cos(0.5f * sa * Mathf.Deg2Rad);
                // var si = Mathf.Sin(0.5f * sa * Mathf.Deg2Rad);
                //  if (gpuLightType == GPULightType.ProjectorPyramid)
                //    {
                //        Vector3 lightPosToProjWindowCorner = (0.5f * lightDimensions.x) * vx + (0.5f * lightDimensions.y) * vy + 1.0f * vz;
                //        cs = Vector3.Dot(vz, Vector3.Normalize(lightPosToProjWindowCorner));
                //        si = Mathf.Sqrt(1.0f - cs * cs);
                //   }
                //   const float FltMax = 3.402823466e+38F;
                //   var ta = cs > 0.0f ? (si / cs) : FltMax;
                //   var cota = si > 0.0f ? (cs / si) : FltMax;
                //   lightVolumeData.cotan = cota;


                bool validInPixel = all(float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D * lightData.cotan));
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                //a wave is on the same tile, and the loop is uniform for the wave.
                // thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                //先計算當前畫素的深度對應的ViewPosition
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                //簡單的球形距離場
                // check pixel
                float3 vLp = lightData.lightPos.xyz;
                float3 toLight = vLp - vVPos;
                float distSq = dot(toLight, toLight);

                bool validInPixel = lightData.radiusSq > distSq;
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                //先計算當前畫素的深度對應的ViewPosition
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                // check pixel
                float3 toLight = lightData.lightPos.xyz - vVPos;

                //用toLight計算有向距離場判斷當前畫素是否在Box內部
                float3 dist = float3(dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ));
                dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
                bool validInPixel = max(max(dist.x, dist.y), dist.z) < 1; // but allows us to not write out OuterDists
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else
            break;

        //lightValid記錄判交結果,前32盞燈記錄到uLightsFlags[0],剩下的記錄到uLightsFlags[1]
        uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
    }

    ...
}



[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}
Resolve Pruned List

遍歷ldsDoesLightIntersect的Flag,重新結算Tile內的燈光數量,並把對應燈光Index(coarseList)加入到prunedList

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

//(記錄Tile內燈光數量)
groupshared int ldsNrLightsFinal;

// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    ....

    //When using LDS to cache the volume data, this produces the best most optimal code.
    //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
    for (; l < iNrCoarseLights; ++l)
    {
        //判交計算
        ....
        //lightValid記錄判交結果,前32盞燈記錄到uLightsFlags[0],剩下的記錄到uLightsFlags[1]
        uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
    }

    //執行緒同步uLightsFlags
    InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
    InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
    //重置ldsNrLightsFinal(記錄Tile內燈光數量)
    if (t == 0) 
        ldsNrLightsFinal = 0;

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    //遍歷ldsDoesLightIntersect的Flag
    if (t < (uint)iNrCoarseLights && (ldsDoesLightIntersect[t < 32 ? 0 : 1] & (1 << (t & 31))) != 0)
    {
        unsigned int uInc = 1;
        unsigned int uIndex;
        InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
        if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) 
            prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
    }
}



[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}

遍歷prunedList,根據不同的LightCategory進行劃分,Resolve FeatureFlag

根據上一步FinePruneLights得到的prunedList,可以透過遍歷prunedList對應的燈光,拿到燈光的lightCategory以及featureFlags
這樣就可以計算出Tile內CategoryCount以及光照涉及的Light Feature Flag(Punctual/Env/Decal....)

groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES]; 

//LightCategory計數器
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];

//Light Feature Flag
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif

[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    //重置LightCategory計數器
    if (t < CATEGORY_LIST_SIZE)
        ldsCategoryListCount[t] = 0;
    //重置ldsFeatureFlags
    #ifdef USE_FEATURE_FLAGS
        if(t==0)
            ldsFeatureFlags=0;
    #endif

    //執行緒同步
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    //ldsNrLightsFinal為上一步FinePruneLights計算得出的Tile內燈光數量
    int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
    //遍歷prunedList,用InterlockedAdd累計不同的LightCategory到ldsCategoryListCount LDS計數器中
    //InterlockedOr合計Tile內所有的Light Feature Flag
    for (int i = t; i < nrLightsCombinedList; i += NR_THREADS)
    {
        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);

        InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
        #ifdef USE_FEATURE_FLAGS
            InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
        #endif
    }

    //排序prunedList
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
    #if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
    //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
    #endif

    //初始化每個Tile對應的g_TileFeatureFlags
    #ifdef USE_FEATURE_FLAGS
        if(t == 0)
        {
            uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
            // In case of back
            if(ldsZMax < ldsZMin)   // is background pixel
                {
                // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. 
                // It will still execute but will do nothing
                featureFlags = 0;
                }

            g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
        }
    #endif

    ...
}

根據不同的LightCategory使用對應的Offset壓入到g_vLightList

這最後一步就是把prunedList放進g_vLightList(大象裝進冰箱)
其中由於Index用不了UInt32那麼多的位數,所以需要將兩個Index合併成一個,用的時候再Unpack出來.


/////////HDRenderPipeline.LightLoop.cs PrepareBuildGPULightListPassData
//燈光數量作為EnvLightIndex起始點
//cb._EnvLightIndexShift = (uint)m_GpuLightsBuilder.lightsCount;

//燈光數量以及反射探針數量之和作為DecalIndex起始點
//cb._DecalIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count);

//燈光數量,反射探針數量以及貼花數量之和作為Local Volumetric Fog Index起始點
//cb._LocalVolumetricFogIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count + decalDatasCount);
/////////End of HDRenderPipeline.LightLoop.cs

// Excerpt of the light-list constant buffer: the per-category index shifts.
// Per the C# assignments quoted from PrepareBuildGPULightListPassData:
//   _EnvLightIndexShift           = light count
//   _DecalIndexShift              = light count + env light count
//   _LocalVolumetricFogIndexShift = light count + env light count + decal count
CBUFFER_START(ShaderVariablesLightList)
    ...
    uint _EnvLightIndexShift;
    uint _DecalIndexShift;
    uint _LocalVolumetricFogIndexShift;
    ...
CBUFFER_END
// A tile originally holds 64 entries; because two indices are packed into one DWORD,
// the per-tile storage becomes 32 DWORDs.
#define LIGHT_DWORD_PER_FPTL_TILE (32)

// ShaderConfig.cs.hlsl
// Generated by the GenerateHLSL script; upper bound on the number of light indices in one FPTL tile.
// NOTE(review): 63 indices + 1 leading count slot presumably fill the 64 entries mentioned above - confirm.
#define SHADEROPTIONS_FPTLMAX_LIGHT_COUNT (63)

// Excerpt of the per-tile light list kernel: the final step that writes the pruned,
// per-category light indices of one tile into g_vLightList.
// Sections elided by the article are marked with "...".
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // Compute the tile's base index into the output list.
    // write lights to global buffers
    int localOffs = 0;
    int offs = tileIDX.y * nrTilesX + tileIDX.x;

    #if defined(UNITY_STEREO_INSTANCING_ENABLED)
        // Eye base offset must match code in GetCountAndStartTile()
        offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
    #endif

    // Initialize shiftIndex from the constant-buffer variables.
    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    int shiftIndex[CATEGORY_LIST_SIZE];
    ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
    
    shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
    shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;

    // Read the number of entries of each category (Punctual/Area/Env/Decal) from ldsCategoryListCount.
    for (int category = 0; category < CATEGORY_LIST_SIZE; category++)
    {
        int nrLightsFinal = ldsCategoryListCount[category];
        int nrLightsFinalClamped = nrLightsFinal < SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;

        // A light index does not need all 32 bits of a uint, so every two entries of the list
        // are packed (uLow / uHigh) into a single DWORD.
        // The list holds nrLightsFinalClamped indices plus one leading slot that stores the count itself,
        // hence ((nrLightsFinalClamped + 1) + 1) >> 1, i.e. DivRoundUp(nrLightsFinalClamped + 1, 2).
        const int nrDWords = ((nrLightsFinalClamped + 1) + 1) >> 1;
        for (int l = (int)t; l < (int)nrDWords; l += NR_THREADS)
        {
            // prunedList stores indices into the combined LightData / LightVolumeData buffer shared by
            // all categories (Punctual/Area/Env/Decal), so the category's shiftIndex is subtracted to
            // recover the index inside that category's own buffer.

            // We remap the prunedList index to the original LightData / EnvLightData indices
            // For l == 0 the low half is the light count instead of an index.
            uint uLow = l == 0 ? nrLightsFinalClamped : prunedList[max(0, 2 * l - 1 + localOffs)] - shiftIndex[category];
            uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];

            // Low half is kept in bits 0..15, high half is shifted into bits 16..31.
            g_vLightList[LIGHT_DWORD_PER_FPTL_TILE * offs + l] = (uLow & 0xffff) | (uHigh << 16);
        }

        // Slide localOffs to the start of the next category inside prunedList.
        localOffs += nrLightsFinal;
        // Consecutive categories are stored one full tile grid (nrTilesX * nrTilesY) apart.
        // NOTE(review): the article states this layout improves cache hits during traversal - not verifiable from this excerpt.
        offs += (nrTilesX * nrTilesY);
    }

    ...
}

Lighting Loop

最終來到了LightLoop中應用g_vLightList計算的結果。
應用的流程如下:
1.首先需要根據當前畫素PositionSS計算出Tile的Index
2.根據Tile的Index以及當前計算的Category得到g_vLightList的偏移tileOffset
3.Tile List的起始點的Index(start)就是tileOffset,用&0xffff取出第一個元素即為LightCount
4.後續獲取LightData的Index時,只需要從start所在第一個DWORD的uHigh(高16bit)開始遍歷就行了

#define LIGHT_DWORD_PER_FPTL_TILE (32)

//渲染不透明佇列時啟用
#ifdef USE_FPTL_LIGHTLIST

//計算PositionInputs的tileIndex
//uint2 tileIndex = uint2(fragInput.positionSS.xy) / TILE_SIZE_FPTL;
//PositionInputs posInput = GetPositionInput(fragInput.positionSS.xy, _ScreenSize.zw, fragInput.positionSS.z, fragInput.positionSS.w, input.positionWS.xyz, tileIndex);

// Computes, from the pixel's tile coordinate, the offset index of the tile that belongs to
// lightCategory inside the g_vLightListTile buffer.
// Each category occupies its own _NumTileFtplX * _NumTileFtplY block of tiles, laid out row by row.
int GetTileOffset(PositionInputs posInput, uint lightCategory)
{
    uint2 tileIndex = posInput.tileCoord;
    return (tileIndex.y + lightCategory * _NumTileFtplY) * _NumTileFtplX + tileIndex.x;
}

// Looks up the light list of the tile covering posInput for the given lightCategory.
//   start      (out): tile offset, to be passed to FetchIndex as tileOffset.
//   lightCount (out): number of lights stored for this tile and category.
void GetCountAndStartTile(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    int tileOffset = GetTileOffset(posInput, lightCategory);

    #if defined(UNITY_STEREO_INSTANCING_ENABLED)
    // Eye base offset must match code in lightlistbuild.compute
    tileOffset += unity_StereoEyeIndex * _NumTileFtplX * _NumTileFtplY * LIGHTCATEGORY_COUNT;
    #endif

    // The first element of the list is the light count, stored in the low 16 bits of the first DWORD.
    // The first entry inside a tile is the number of light for lightCategory (thus the +0)
    lightCount = g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + 0] & 0xffff;
    start = tileOffset;
}

// Returns the tile size constant used by the FPTL light list (TILE_SIZE_FPTL).
// NOTE(review): presumably the tile edge length in pixels, since tileIndex is computed as positionSS.xy / TILE_SIZE_FPTL - confirm.
uint GetTileSize()
{
    return TILE_SIZE_FPTL;
}

// Entry point used by the light loop; in this USE_FPTL_LIGHTLIST variant it simply forwards
// to GetCountAndStartTile with the same arguments.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    GetCountAndStartTile(posInput, lightCategory, start, lightCount);
}

// Reads one light index from a tile's packed list inside the light loop.
//   tileOffset:  tile offset as returned in 'start' by GetCountAndStart.
//   lightOffset: zero-based position of the light within the tile's list.
// Returns the 16-bit light index stored at that position.
uint FetchIndex(uint tileOffset, uint lightOffset)
{
    // The first element of the list is the light count, so iteration effectively begins at
    // the high 16 bits (uHigh) of the first DWORD.
    const uint lightOffsetPlusOne = lightOffset + 1; // Add +1 as first slot is reserved to store number of light

    // Each 32-bit DWORD stores two indices: (lightOffsetPlusOne >> 1) selects the DWORD,
    // (lightOffsetPlusOne & 1) * 16 selects the low or high half.
    // Light index are store on 16bit
    return (g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + (lightOffsetPlusOne >> 1)] >> ((lightOffsetPlusOne & 1) * 16)) & 0xffff;
}

//渲染透明佇列時啟用
#elif defined(USE_CLUSTERED_LIGHTLIST)
...
    //LightingLoop.hlsl

    // This struct is define in the material. the Lightloop must not access it
    // PostEvaluateBSDF call at the end will convert Lighting to diffuse and specular lighting
    AggregateLighting aggregateLighting;
    ZERO_INITIALIZE(AggregateLighting, aggregateLighting); // LightLoop is in charge of initializing the struct

    
    // Punctual light pass: only runs when the feature flags contain LIGHTFEATUREFLAGS_PUNCTUAL.
    if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
    {
        uint lightCount, lightStart;

// Enabled by default (the tile/cluster path below is the active one).
//#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        GetCountAndStart(posInput, LIGHTCATEGORY_PUNCTUAL, lightStart, lightCount);
//#else   // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        //lightCount = _PunctualLightCount;
        //lightStart = 0;
//#endif

        bool fastPath = false;

        // SCALARIZE_LIGHT_LOOP relies on wave intrinsics; for a detailed introduction see https://zhuanlan.zhihu.com/p/469436345
        #if SCALARIZE_LIGHT_LOOP
            uint lightStartLane0;
            fastPath = IsFastPath(lightStart, lightStartLane0);

            if (fastPath)
            {
                lightStart = lightStartLane0;
            }
        #endif

        // Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
        // For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
        // v_ are variables that might have different value for each thread in the wave (meant for vector registers).
        // This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
        // Note that the above is valid only if wave intrinsics are supported.
        uint v_lightListOffset = 0;
        uint v_lightIdx = lightStart;

        while (v_lightListOffset < lightCount)
        {
            // Read the light index stored at the current list position of this tile.
            v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
#if SCALARIZE_LIGHT_LOOP
            uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
#else
            uint s_lightIdx = v_lightIdx;
#endif
            // -1 is treated as the termination value for the loop.
            if (s_lightIdx == -1)
                break;

            // Fetch the LightData for this index.
            LightData s_lightData = FetchLight(s_lightIdx);

            ...
        }
    }

相關文章