1、基於反饋的Optimization Report解決單個Work-item的Kernel相關性
在許多情況下,將OpenCL™應用程式設計為單個工作項核心就足以在不執行其他優化步驟的情況下最大化效能。
建議採用以下優化單個work-item kernel的選項來按照實用性順序解決單個work-item kernel迴圈攜帶的依賴性:
removal,relaxation,simplification,transfer to local memory。
(1) Removing Loop-Carried Dependency
根據優化報告的反饋,可以通過實現更簡單的記憶體訪問的方式來消除迴圈攜帶的依賴關係。
優化前:
1 #define N 128 2 3 __kernel void unoptimized (__global int * restrict A, 4 __global int * restrict B, 5 __global int* restrict result) 6 { 7 int sum = 0; 8 9 for (unsigned i = 0; i < N; i++) { 10 for (unsigned j = 0; j < N; j++) { 11 sum += A[i*N+j]; 12 } 13 sum += B[i]; 14 } 15 16 * result = sum; 17 }
優化後:
1 #define N 128 2 3 __kernel void optimized (__global int * restrict A, 4 __global int * restrict B, 5 __global int * restrict result) 6 { 7 int sum = 0; 8 9 for (unsigned i = 0; i < N; i++) { 10 // Step 1: Definition 11 int sum2 = 0; 12 13 // Step 2: Accumulation of array A values for one outer loop iteration 14 for (unsigned j = 0; j < N; j++) { 15 sum2 += A[i*N+j]; 16 } 17 18 // Step 3: Addition of array B value for an outer loop iteration 19 sum += sum2; 20 sum += B[i]; 21 } 22 23 * result = sum; 24 }
(2) Relaxing Loop-Carried Dependency
根據優化報告的反饋,可以通過增加依賴距離的方式來relax緩解迴圈攜帶的依賴關係。
優化前:
1 #define N 128 2 3 __kernel void unoptimized (__global float * restrict A, 4 __global float * restrict result) 5 { 6 float mul = 1.0f; 7 8 for (unsigned i = 0; i < N; i++) 9 mul *= A[i]; 10 11 * result = mul; 12 }
優化後:
1 #define N 128 2 #define M 8 3 4 __kernel void optimized (__global float * restrict A, 5 __global float * restrict result) 6 { 7 float mul = 1.0f; 8 9 // Step 1: Declare multiple copies of variable mul 10 float mul_copies[M]; 11 12 // Step 2: Initialize all copies 13 for (unsigned i = 0; i < M; i++) 14 mul_copies[i] = 1.0f; 15 16 for (unsigned i = 0; i < N; i++) { 17 // Step 3: Perform multiplication on the last copy 18 float cur = mul_copies[M-1] * A[i]; 19 20 // Step 4a: Shift copies 21 #pragma unroll 22 for (unsigned j = M-1; j > 0; j--) 23 mul_copies[j] = mul_copies[j-1]; 24 25 // Step 4b: Insert updated copy at the beginning 26 mul_copies[0] = cur; 27 } 28 29 // Step 5: Perform reduction on copies 30 #pragma unroll 31 for (unsigned i = 0; i < M; i++) 32 mul *= mul_copies[i]; 33 34 * result = mul; 35 }
(3) Transferring Loop-Carried Dependency to Local Memory
對於不能消除的迴圈攜帶的依賴關係,可以通過將具有迴圈依賴的陣列從全域性記憶體global memory移動到local memory的方式來改進迴圈的啟動間隔(initiation interval, II)。
優化前:
1 #define N 128 2 3 __kernel void unoptimized( __global int* restrict A ) 4 { 5 for (unsigned i = 0; i < N; i++) 6 A[N-i] = A[i]; 7 }
優化後:
1 #define N 128 2 3 __kernel void optimized( __global int* restrict A ) 4 { 5 int B[N]; 6 7 for (unsigned i = 0; i < N; i++) 8 B[i] = A[i]; 9 10 for (unsigned i = 0; i < N; i++) 11 B[N-i] = B[i]; 12 13 for (unsigned i = 0; i < N; i++) 14 A[i] = B[i]; 15 }
(4) Relaxing Loop-Carried Dependency by Inferring Shift Registers
為了使SDK能夠有效地處理執行雙精度浮點運算的單個work-item kernels,可以通過推斷移位暫存器的方式移除迴圈攜帶的依賴性。
優化前:
1 __kernel void double_add_1 (__global double *arr, 2 int N, 3 __global double *result) 4 { 5 double temp_sum = 0; 6 7 for (int i = 0; i < N; ++i) 8 { 9 temp_sum += arr[i]; 10 } 11 12 *result = temp_sum; 13 }
優化後:
1 //Shift register size must be statically determinable 2 #define II_CYCLES 12 3 4 __kernel void double_add_2 (__global double *arr, 5 int N, 6 __global double *result) 7 { 8 //Create shift register with II_CYCLE+1 elements 9 double shift_reg[II_CYCLES+1]; 10 11 //Initialize all elements of the register to 0 12 for (int i = 0; i < II_CYCLES + 1; i++) 13 { 14 shift_reg[i] = 0; 15 } 16 17 //Iterate through every element of input array 18 for(int i = 0; i < N; ++i) 19 { 20 //Load ith element into end of shift register 21 //if N > II_CYCLE, add to shift_reg[0] to preserve values 22 shift_reg[II_CYCLES] = shift_reg[0] + arr[i]; 23 24 #pragma unroll 25 //Shift every element of shift register 26 for(int j = 0; j < II_CYCLES; ++j) 27 { 28 shift_reg[j] = shift_reg[j + 1]; 29 } 30 } 31 32 //Sum every element of shift register 33 double temp_sum = 0; 34 35 #pragma unroll 36 for(int i = 0; i < II_CYCLES; ++i) 37 { 38 temp_sum += shift_reg[i]; 39 } 40 41 *result = temp_sum; 42 }
(5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays
在單個work-item的kernel中包含ivdep pragma以保證堆記憶體array的訪問不會導致迴圈攜帶的依賴性。
如果對迴圈中的記憶體陣列的所有訪問都不會導致迴圈攜帶的依賴性,在核心程式碼中的迴圈之前新增#pragma ivdep。
// no loop-carried dependencies for A and B array accesses #pragma ivdep for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; }
要指定對迴圈內特定記憶體陣列的訪問不會導致迴圈攜帶的依賴關係,在核心程式碼中的迴圈之前新增#pragma ivdep arrayarray_name)。
ivdep pragma指定的陣列必須是local / private記憶體陣列,或者是指向global/local/private記憶體儲存的指標變數。 如果指定的陣列是指標,則ivdep pragma也適用於所有可能帶有指定指標別名的陣列。
ivdep pragma指示指定的陣列也可以是struct的陣列或指標成員。
// No loop-carried dependencies for A array accesses // The offline compiler will insert hardware that reinforces dependency constraints for B #pragma ivdep array(A) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; } // No loop-carried dependencies for array A inside struct #pragma ivdep array(S.A) for (int i = 0; i < N; i++) { S.A[i] = S.A[i - X[i]]; } // No loop-carried dependencies for array A inside the struct pointed by S #pragma ivdep array(S->X[2][3].A) for (int i = 0; i < N; i++) { S->X[2][3].A[i] = S.A[i - X[i]]; } // No loop-carried dependencies for A and B because ptr aliases // with both arrays int *ptr = select ? A : B; #pragma ivdep array(ptr) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; } // No loop-carried dependencies for A because ptr only aliases with A int *ptr = &A[10]; #pragma ivdep array(ptr) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; }
2、Single work-item kernel的設計技巧
避免指標混淆。使用restrict關鍵詞。
建立格式正確的迴圈。退出條件與整數進行比較,且每次迭代的增量為1。格式正確的巢狀迴圈也對最大化kernel效能有幫助。
最小化loop-carried迴圈攜帶依賴遵循以下原則:
避免使用指標算數;
宣告簡單的陣列索引;
儘可能在kernel中使用恆定邊界的迴圈。
避免複雜的迴圈退出條件;
將巢狀迴圈轉換為單迴圈;
避免條件迴圈; 避免if else中包含迴圈,儘量轉換為在迴圈中包含if else。
在儘可能深的範圍內宣告變數。