序言
如果看了前面的BigTileLightList的建立,這一章會簡單一點。
因為如果啟用了BigTile之後,這裡的BuildPerTileLightList就需要從BigTileLightList裡面讀取LightList。
否則還是需要像BigTile一樣去先走同樣的燈光剔除流程(NDCAABBBoundTest,SphericalIntersectionTests),
然後才到最後的FinePruneLightsTest。
LightListBuild
RenderGraph Dispatch
下面是RenderGraph中Dispatch時需要的Buffer以及ConstantBuffer
//HDRenderPipeline.LightLoop.cs
static void BuildPerTileLightList(BuildGPULightListPassData data, ref bool tileFlagsWritten, CommandBuffer cmd)
{
// optimized for opaques only
if (data.runLightList && data.runFPTL)
{
//第一步計算的燈光AABB Buffer
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
//LightVolumeData與SFiniteLightBound的ComputeBuffer
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);
//用作Hiz剔除的深度圖
cmd.SetComputeTextureParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
//最終輸出的lightList
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vLightList, data.output.lightList);
//Big Tile Light List
if (data.runBigTilePrepass)
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);
var localLightListCB = data.lightListCB;
//計算Tile內需要計算的FeatureVariant
//LightLoop中用於控制著色光照計算流程,baseFeatureFlags就是最基礎的Flag
//LightLoop的時候獲取TileFeatureFlag,就可以知道當前Tile是否需要計算Punctual/Area/Directional/Env的光照
if (data.enableFeatureVariants)
{
uint baseFeatureFlags = 0;
if (data.directionalLightCount > 0)
{
baseFeatureFlags |= (uint)LightFeatureFlags.Directional;
}
if (data.skyEnabled)
{
baseFeatureFlags |= (uint)LightFeatureFlags.Sky;
}
if (!data.computeMaterialVariants)
{
baseFeatureFlags |= LightDefinitions.s_MaterialFeatureMaskFlags;
}
localLightListCB.g_BaseFeatureFlags = baseFeatureFlags;
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_TileFeatureFlags, data.output.tileFeatureFlags);
tileFlagsWritten = true;
}
ConstantBuffer.Push(cmd, localLightListCB, data.buildPerTileLightListShader, HDShaderIDs._ShaderVariablesLightList);
cmd.DispatchCompute(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, data.numTilesFPTLX, data.numTilesFPTLY, data.viewCount);
}
}
Initialize
跟BigTile類似,計算當前執行緒的Tile的對映關係要用到的資料.(Tile的X/Y軸上的數量,當前執行緒組對應的TileID)
//FPTL這一步的Tile Size為16*16
#define TILE_SIZE_FPTL (16)
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
if(t<LIGHT_LIST_MAX_COARSE_ENTRIES)
prunedList[t]=0;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
if(t==0)
{
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
Hi-z剔除
讀取PreDepth深度圖獲取4個畫素內的Min、MaxDepth,透過執行緒同步計算得出一個執行緒組內的Min/MaxDepth 即(64*4=16*16),建立起當前Tile的Bounds(ldsZMin/ldsZMax)。
讀取深度圖圖,計算viewPostion.z
LinearDepth
camera.projectMatrix是右手座標系的(OpenGL),為了統一整個剔除流程的軸向,ScrProjection翻轉了z軸,採用左手座標系。
groupshared uint ldsZMin;
groupshared uint ldsZMax;
TEXTURE2D_X(g_depth_tex) : register( t0 );
float FetchDepth(uint2 pixCoord)
{
float zdpth = LOAD_TEXTURE2D_X(g_depth_tex, pixCoord.xy).x;
//https://zhuanlan.zhihu.com/p/389971233
//投影矩陣的Z Flip了,讀取時也要翻轉過來(匹配 0 is near 1 is far)
#if UNITY_REVERSED_Z
zdpth = 1.0 - zdpth;
#endif
return zdpth;
}
//USE_OBLIQUE_MODE m_LightListProjMatrices.m20!= 0 || m_LightListProjMatrices.m21 != 0;
//即投影矩陣的r+l!=0,t+b!=0
//linearDepth; // View space Z coordinate : [Near, Far]
//Reverse z :-z_eye=1/((n-f)/(n*f)*depth+1/n); //https://zhuanlan.zhihu.com/p/393643084
float GetLinearDepth(float2 pixXY, float zDptBufSpace) // 0 is near 1 is far
{
float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[unity_StereoEyeIndex];
#ifdef USE_OBLIQUE_MODE
float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw;
return res2.x / res2.y;
#else
//正交矩陣用(m22*zDptBufSpace+m23),透視用(m32*zDptBufSpace+m33)
// for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
// however this function must also work for orthographic projection so we keep it like this.
float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w;
float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w;
return (m22*zDptBufSpace+m23) / (m32*zDptBufSpace+m33);
#endif
}
GetViewPosFromLinDepth
這裡簡單的以x軸的推導為例
由投影矩陣中的相似三角形易得
[Unity Shader入門精要]
\(\frac{(ScreenPos.x-pixWidth/2)}{pixWidth/2}=\frac{clipPos.x}{clipPos.w}\)
\(\frac{clipPos.x}{clipPos.w}=\frac{viewPos.x*\frac{cotFOV}{Aspect}}{-viewPos.z}\)
\(viewPos.x = \frac{Screen.x-pixWidth/2}{pixWidth/2*\frac{cotFOV}{Aspect}}*-viewPos.z\)
由於之前ScrProjection已經FlipZ,所以可以直接fLinDepth * p.xy
unsafe void PrepareBuildGPULightListPassData(
RenderGraph renderGraph,
RenderGraphBuilder builder,
HQCamera hqCamera,
TileAndClusterData tileAndClusterData,
ref ShaderVariablesLightList constantBuffer,
int totalLightCount,
TextureHandle depthStencilBuffer,
TextureHandle stencilBufferCopy,
BuildGPULightListPassData passData)
{
....
// camera to screen matrix (and it's inverse)
for (int viewIndex = 0; viewIndex < hqCamera.viewCount; ++viewIndex)
{
var proj = camera.projectionMatrix;
// Note: we need to take into account the TAA jitter when indexing the light list
proj = hqCamera.RequiresCameraJitter() ? hqCamera.GetJitteredProjectionMatrix(proj) : proj;
m_LightListProjMatrices[viewIndex] = proj * s_FlipMatrixLHSRHS;
var tempMatrix = temp * m_LightListProjMatrices[viewIndex];
var invTempMatrix = tempMatrix.inverse;
for (int i = 0; i < 16; ++i)
{
cb.g_mScrProjectionArr[viewIndex * 16 + i] = tempMatrix[i];
cb.g_mInvScrProjectionArr[viewIndex * 16 + i] = invTempMatrix[i];
}
}
}
\(pixWidth/2*\frac{cotFOV}{Aspect}=fSx\)
\(pixHeight/2*\frac{cotFOV}{Aspect}=fSy\)
\(pixWidth/2=fCx\)
\(pixHeight/2=fCy\)
//
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
float4x4 g_mScrProjection = g_mScrProjectionArr[unity_StereoEyeIndex];
bool isOrthographic = g_isOrthographic != 0;
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;
#if USE_LEFT_HAND_CAMERA_SPACE
bool useLeftHandVersion = true;
#else
bool useLeftHandVersion = isOrthographic;
#endif
float s = useLeftHandVersion ? 1 : (-1);
float2 p = float2((s * v2ScrPos.x - fCx) / fSx, (s * v2ScrPos.y - fCy) / fSy);
return float3(isOrthographic ? p.xy : (fLinDepth * p.xy), fLinDepth);
}
這裡讀取深度圖並將其轉換到[Near,Far],然後計算出2*2畫素中的MinDepth,MaxDepth,
然後透過執行緒同步(InterlockedMax/InterlockedMin)計算執行緒組內(Tile內)的MinDepth,MaxDepth。
注:FPTL的Tile Size為16*16,64執行緒一組,一個執行緒計算4個畫素。(16*16=64*4),這裡的同步計算不會影響到別的執行緒組(Tile)
#define NR_THREADS 64
#define TILE_SIZE_FPTL (16)
#define VIEWPORT_SCALE_Z (1)
#define PIXEL_PER_THREAD ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
//16*16/64=4
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
#if PIXEL_PER_THREAD == 4
float4 vLinDepths;
#else
float vLinDepths[PIXEL_PER_THREAD];
#endif
{
//VIEWPORT_SCALE_Z
// Fetch depths and calculate min/max
UNITY_UNROLL
for(int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = i * NR_THREADS + t;
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(uCrd);
vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
}
}
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
GroupMemoryBarrierWithGroupSync();
#endif
}
...
}
NDCAABBBoundTest
若啟用了BigTile預計算,則透過對映讀取LightOffset(Big-tile內的燈光數量)以及對應的lightIndex
若沒有則按原樣直接遍歷g_vBoundBuffer,用AABB計算當前燈光是否在Tile內,若在則加入到CoareList中
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// build coarse list using AABB
//若啟用了BigTile計算
#ifdef USE_TWO_PASS_TILED_LIGHTING
//firstbithigh(64)=6 64*64
//firstbithigh(16)=4 16*16
//log2BigTileToTileRatio=2
//#define TILE_SIZE_FPTL (16)
//#define TILE_SIZE_BIG_TILE (64)
//即4*4個FTPL Tile構成一個Big Tile log2_4=2
const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);
//計算Big Tile X/Y的數量
//((1 << log2BigTileToTileRatio) -1 ) 1<<2-1 = 3 相當於DivRoundUp(nrTilesX,4)
int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
//BigTile總數
const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
//計算當前Tile對應的BigTile
const int bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio); // map the idx to 64x64 tiles
//第一位記錄當前Tile的燈光數量
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
//遍歷BigTileLightList
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
#else
//若沒有啟用Big Tile,則直接遍歷所有的燈光,利用Scrbound計算的AABBBound計算Tile(16*16)的當前燈光列表(coarseList)
for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
{
#endif
// Skip Local Volumetric Fog (lights are sorted by category). TODO: improve data locality
// if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG) { break; }
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);
const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;
const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;
if (all(vMa > vTileLL) && all(vMi < vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
}
}
...
}
清空ldsDoesLightInterset初始化,並同步執行緒組
#define FINE_PRUNING_ENABLED
#define LIGHT_LIST_MAX_COARSE_ENTRIES (64)//coarseList/prunedList LDS的最大容量為64
//uint 32Bit容納不了64盞燈的燈光與當前Tile相交情況,所以加多了一個記錄另外32盞燈.
groupshared uint ldsDoesLightIntersect[2];
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef FINE_PRUNING_ENABLED
if (t < 2)
ldsDoesLightIntersect[t] = 0;
#endif
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//coarseList/prunedList LDS的最大容量為64
int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);
...
}
SphericalIntersectionTest,FinePruneLights
SphericalIntersectionTest
這裡的SphericalIntersectionTest與BigTile中的SphericalIntersectionTest唯一不同的區別是就是需要把coarseList複製到prunedList暫存。
檢測到燈光Overlap Tile之後再暫存在coarseList的lightIndex加入到prunedList中。
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
if (threadID == 0) lightOffsSph = 0;
// make a copy of coarseList in prunedList.
int l;
for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
prunedList[l] = coarseList[l];
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
#if USE_LEFT_HAND_CAMERA_SPACE
float3 V = GetViewPosFromLinDepth(screenCoordinate, 1.0);
#else
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
#endif
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 8 * onePixDiagDist; // scale by half a tile
for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
{
const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[l], g_iNrVisibLights, unity_StereoEyeIndex);
SFiniteLightBound lightData = g_data[lightBoundIndex];
if (DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic != 0))
{
unsigned int uIndex;
InterlockedAdd(lightOffsSph, 1, uIndex);
coarseList[uIndex] = prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
return lightOffsSph;
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(16 / 2, 16 / 2), uint2(iWidth - 1, iHeight - 1))));
#endif
...
}
FinePruneLights
s_lightVolumesCache LDS
在開始計算FinePruneLights之前,需要預先記錄coarseList對應的LightVolume進LDS s_lightVolumesCache(StoreLightVolumeCache)中
#define FINE_PRUNING_ENABLED
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
//
//eyeIndex==0時,GetCoarseLightIndex=>coarseList[l]
uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex)
{
lightIndex = min(lightIndex, numVisibleLights - 1); // Stay within bounds
// For monoscopic, there is just one set of light cull data structs.
// In stereo, all of the left eye structs are first, followed by the right eye structs.
const uint perEyeBaseIndex = eyeIndex * numVisibleLights;
return (perEyeBaseIndex + lightIndex);
}
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
return l < iNrCoarseLights ? GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex) : 0;
}
//
groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];
void StoreLightVolumeCache(int lightIndex, int coarseIndex, uint volumeType)
{
// 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement.
// 29 bits for a coarse light index.
s_lightVolumesCache[lightIndex] = (volumeType & 0x7) | (uint)(coarseIndex << 3);
}
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
int data = s_lightVolumesCache[lightIndex];
coarseIndex = data >> 3;
volumeType = data & 0x7;
}
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint uLightsFlags[2] = {0, 0};
int l = 0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
if (threadID < (uint)iNrCoarseLights)
{
int idxCoarse = GetCoarseLightIndex((int)threadID, iNrCoarseLights);
int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
StoreLightVolumeCache(threadID, idxCoarse, uLightVolume);
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
....
}
判交計算
讀取LDS中的volumeData,並且利用之前的DepthBound(vLinDepths)逐畫素進行判交。
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
//(記錄Tile內燈光數量)
groupshared int ldsNrLightsFinal;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
...
//When using LDS to cache the volume data, this produces the best most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
int idxCoarse;
int uLightVolume;
//讀取之前的LightVolumeData
LoadLightVolumeCache(l, idxCoarse, uLightVolume);
bool lightValid = false;
if (uLightVolume == LIGHTVOLUMETYPE_CONE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先計算當前畫素的深度對應的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
//LightData.lightPos是View Space
// check pixel
//用當前畫素到LightPos的向量fromLight以及LightAxis判斷fromLight在Cone內部
float3 fromLight = vVPos - lightData.lightPos.xyz;
float distSq = dot(fromLight, fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs(float2(dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz)));
//bIsSpotDisc=true
//即fDist2D=dot(fromLight, lightData.lightAxisX)^2+dot(fromLight, lightData.lightAxisX)^2;
float fDist2D = bIsSpotDisc ? length(V) : max(V.x, V.y);
//lightData.radiusSq>distSq
//fSclProj>fDist2D * lightData.cotan即fSclProj/fDist2D>lightData.cotan,用fromLight和Axis計算夾角的cot
//lightVolumeData radiusSq的計算
//lightVolumeData.radiusSq = range * range;
//lightVolumeData cotan的計算
// var sa = light.spotAngle;
// var cs = Mathf.Cos(0.5f * sa * Mathf.Deg2Rad);
// var si = Mathf.Sin(0.5f * sa * Mathf.Deg2Rad);
// if (gpuLightType == GPULightType.ProjectorPyramid)
// {
// Vector3 lightPosToProjWindowCorner = (0.5f * lightDimensions.x) * vx + (0.5f * lightDimensions.y) * vy + 1.0f * vz;
// cs = Vector3.Dot(vz, Vector3.Normalize(lightPosToProjWindowCorner));
// si = Mathf.Sqrt(1.0f - cs * cs);
// }
// const float FltMax = 3.402823466e+38F;
// var ta = cs > 0.0f ? (si / cs) : FltMax;
// var cota = si > 0.0f ? (cs / si) : FltMax;
// lightVolumeData.cotan = cota;
bool validInPixel = all(float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D * lightData.cotan));
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
//a wave is on the same tile, and the loop is uniform for the wave.
// thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先計算當前畫素的深度對應的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
//簡單的球形距離場
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight, toLight);
bool validInPixel = lightData.radiusSq > distSq;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先計算當前畫素的深度對應的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
//用toLight計算有向距離場判斷當前畫素是否在Box內部
float3 dist = float3(dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ));
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
bool validInPixel = max(max(dist.x, dist.y), dist.z) < 1; // but allows us to not write out OuterDists
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else
break;
//lightValid記錄判交結果,前32盞燈記錄到uLightsFlags[0],剩下的記錄到uLightsFlags[1]
uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
}
...
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
Resolve Pruned List
遍歷ldsDoesLightIntersect的Flag,重新結算Tile內的燈光數量,並把對應燈光Index(coarseList)加入到prunedList中
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
//(記錄Tile內燈光數量)
groupshared int ldsNrLightsFinal;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
....
//When using LDS to cache the volume data, this produces the best most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
//判交計算
....
//lightValid記錄判交結果,前32盞燈記錄到uLightsFlags[0],剩下的記錄到uLightsFlags[1]
uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
}
//執行緒同步uLightsFlags
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
//重置ldsNrLightsFinal(記錄Tile內燈光數量)
if (t == 0)
ldsNrLightsFinal = 0;
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//遍歷ldsDoesLightIntersect的Flag
if (t < (uint)iNrCoarseLights && (ldsDoesLightIntersect[t < 32 ? 0 : 1] & (1 << (t & 31))) != 0)
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
}
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
遍歷PruneList根據不同的LightCategory進行劃分,Resolve FeatureFlag
根據上一步FinePruneLights得到的prunedList,可以透過遍歷prunedList對應的燈光,拿到燈光的lightCategory以及featureFlags。
這樣就可以計算出Tile內CategoryCount以及光照涉及的Light Feature Flag(Punctual/Env/Decal....)
groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES];
//LightCategory計數器
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];
//Light Feature Flag
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
//重置LightCategory計數器
if (t < CATEGORY_LIST_SIZE)
ldsCategoryListCount[t] = 0;
//重置ldsFeatureFlags
#ifdef USE_FEATURE_FLAGS
if(t==0)
ldsFeatureFlags=0;
#endif
//執行緒同步
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//ldsNrLightsFinal為上一步FinePruneLights計算得出的Tile內燈光數量
int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
//遍歷prunedList,用InterlockedAdd累計不同的LightCategory到ldsCategoryListCount LDS計數器中
//InterlockedOr合計Tile內所有的Light Feature Flag
for (int i = t; i < nrLightsCombinedList; i += NR_THREADS)
{
const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);
InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
#endif
}
//排序prunedList
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif
//初始化每個Tile對應的g_TileFeatureFlags
#ifdef USE_FEATURE_FLAGS
if(t == 0)
{
uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
// In case of back
if(ldsZMax < ldsZMin) // is background pixel
{
// There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case.
// It will still execute but will do nothing
featureFlags = 0;
}
g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
}
#endif
...
}
根據不同的LightCategory使用對應的Offset壓入到g_vLightList
這最後一步就是把prunedList放進g_vLightList(大象裝進冰箱)
其中由於Index用不了那麼UInt32那麼大的精度,所以需要將兩個Index合併成一個,用的時候再Unpack出來.
/////////HDRenderPipeline.LightLoop.cs PrepareBuildGPULightListPassData
//燈光數量作為EnvLightIndex起始點
//cb._EnvLightIndexShift = (uint)m_GpuLightsBuilder.lightsCount;
//燈光數量以及反射探針數量之和作為DecalIndex起始點
//cb._DecalIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count);
//燈光數量,反射探針數量以及貼花數量之和作為Local Volumetric Fog Index起始點
//cb._LocalVolumetricFogIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count + decalDatasCount);
/////////End of HDRenderPipeline.LightLoop.cs
CBUFFER_START(ShaderVariablesLightList)
...
uint _EnvLightIndexShift;
uint _DecalIndexShift;
uint _LocalVolumetricFogIndexShift;
...
CBUFFER_END
//原本每個Tile內的元素數量64,由於兩個Index合併成一個,所以Tile內元素數量就變成32
#define LIGHT_DWORD_PER_FPTL_TILE (32)
//ShaderConfig.cs.hlsl
//由指令碼GenerateHLSL生成控制 FPTL Tile內的LightIndex數量上限
#define SHADEROPTIONS_FPTLMAX_LIGHT_COUNT (63)
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
//Tile的起始Index計算
// write lights to global buffers
int localOffs = 0;
int offs = tileIDX.y * nrTilesX + tileIDX.x;
#if defined(UNITY_STEREO_INSTANCING_ENABLED)
// Eye base offset must match code in GetCountAndStartTile()
offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
#endif
//CBUFFER變數初始化shiftIndex
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
int shiftIndex[CATEGORY_LIST_SIZE];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;
//透過讀取ldsCategoryListCount獲取不同Category(Punctual/Area/Env/Decal)對應的LightData數量
for (int category = 0; category < CATEGORY_LIST_SIZE; category++)
{
int nrLightsFinal = ldsCategoryListCount[category];
int nrLightsFinalClamped = nrLightsFinal < SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;
//由於LightIndex用不了uint那麼多的位數(32Bit),所以可以對LightList中的Index每兩個合併(uLow/uHigh)合併成一個Index
//nrLightsFinalClamped + 1(記錄LightList的Index數量的nrLightsFinalClamped)
//((nrLightsFinalClamped + 1) + 1) >> 1 相當於DivRoundUp(nrLightsFinalClamped + 1,2)
const int nrDWords = ((nrLightsFinalClamped + 1) + 1) >> 1;
for (int l = (int)t; l < (int)nrDWords; l += NR_THREADS)
{
//prunedList裡儲存的Index是:所有的Category(Punctual/Area/Env/Decal)的LightData/LightVolumeData Buffer的Index,
//所以需要減去Category對應的shiftIndex,重新對映得到對應Category在各自Buffer中真正的Index
// We remap the prunedList index to the original LightData / EnvLightData indices
uint uLow = l == 0 ? nrLightsFinalClamped : prunedList[max(0, 2 * l - 1 + localOffs)] - shiftIndex[category];
uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];
//偏移16位
g_vLightList[LIGHT_DWORD_PER_FPTL_TILE * offs + l] = (uLow & 0xffff) | (uHigh << 16);
}
//localOffs滑動到下一個Category
localOffs += nrLightsFinal;
//不同Category儲存Index的偏移是Tile的數量。這樣有利於遍歷時提高Cache Hit.
offs += (nrTilesX * nrTilesY);
}
...
}
Lighting Loop
最終來到了LightLoop中應用g_vLightList計算的結果。
應用的流程如下:
1.首先需要根據當前畫素PositionSS計算出Tile的Index。
2.根據Tile的Index以及當前計算的Category得到g_vLightList的偏移tileOffset。
3.Tile List的起始點的Index(start)就是tileOffset,用&0xffff取出第一個元素即為LightCount
4.後續獲取LightData的Index時,只需要從start前面的uHigh(前16bit)開始遍歷就行了
#define LIGHT_DWORD_PER_FPTL_TILE (32)
//渲染不透明佇列時啟用
#ifdef USE_FPTL_LIGHTLIST
//計算PositionInputs的tileIndex
//uint2 tileIndex = uint2(fragInput.positionSS.xy) / TILE_SIZE_FPTL;
//PositionInputs posInput = GetPositionInput(fragInput.positionSS.xy, _ScreenSize.zw, fragInput.positionSS.z, fragInput.positionSS.w, input.positionWS.xyz, tileIndex);
//根據tileIndex計算當前lightCategory對應的Tile在g_vLightListTile Buffer中的Offset Index
int GetTileOffset(PositionInputs posInput, uint lightCategory)
{
uint2 tileIndex = posInput.tileCoord;
return (tileIndex.y + lightCategory * _NumTileFtplY) * _NumTileFtplX + tileIndex.x;
}
void GetCountAndStartTile(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
int tileOffset = GetTileOffset(posInput, lightCategory);
#if defined(UNITY_STEREO_INSTANCING_ENABLED)
// Eye base offset must match code in lightlistbuild.compute
tileOffset += unity_StereoEyeIndex * _NumTileFtplX * _NumTileFtplY * LIGHTCATEGORY_COUNT;
#endif
//List的第一個元素就是Light的數量
// The first entry inside a tile is the number of light for lightCategory (thus the +0)
lightCount = g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + 0] & 0xffff;
start = tileOffset;
}
uint GetTileSize()
{
return TILE_SIZE_FPTL;
}
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
GetCountAndStartTile(posInput, lightCategory, start, lightCount);
}
//Loop中讀取燈光Index的函式
uint FetchIndex(uint tileOffset, uint lightOffset)
{
//List的第一個元素就是Light的數量
//從start前面的uHigh開始遍歷
const uint lightOffsetPlusOne = lightOffset + 1; // Add +1 as first slot is reserved to store number of light
//用32bit存了兩個Index
// Light index are store on 16bit
return (g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + (lightOffsetPlusOne >> 1)] >> ((lightOffsetPlusOne & 1) * 16)) & 0xffff;
}
//渲染透明佇列時啟用
#elif defined(USE_CLUSTERED_LIGHTLIST)
...
//LightingLoop.hlsl
// This struct is define in the material. the Lightloop must not access it
// PostEvaluateBSDF call at the end will convert Lighting to diffuse and specular lighting
AggregateLighting aggregateLighting;
ZERO_INITIALIZE(AggregateLighting, aggregateLighting); // LightLoop is in charge of initializing the struct
if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
{
uint lightCount, lightStart;
//預設開啟
//#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
GetCountAndStart(posInput, LIGHTCATEGORY_PUNCTUAL, lightStart, lightCount);
//#else // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
//lightCount = _PunctualLightCount;
//lightStart = 0;
//#endif
bool fastPath = false;
//SCALARIZE_LIGHT_LOOP涉及到Wave相關的指令,詳細介紹可以看https://zhuanlan.zhihu.com/p/469436345
#if SCALARIZE_LIGHT_LOOP
uint lightStartLane0;
fastPath = IsFastPath(lightStart, lightStartLane0);
if (fastPath)
{
lightStart = lightStartLane0;
}
#endif
// Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
// For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
// v_ are variables that might have different value for each thread in the wave (meant for vector registers).
// This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
// Note that the above is valid only if wave intriniscs are supported.
uint v_lightListOffset = 0;
uint v_lightIdx = lightStart;
while (v_lightListOffset < lightCount)
{
v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
#if SCALARIZE_LIGHT_LOOP
uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
#else
uint s_lightIdx = v_lightIdx;
#endif
if (s_lightIdx == -1)
break;
//獲取LightData
LightData s_lightData = FetchLight(s_lightIdx);
...
}
}