OpenCL 增強單work-item kernel效能策略

ZhuzhuDong發表於2020-08-03

1、基於反饋的Optimization Report解決單個Work-item的Kernel相關性

  在許多情況下,將OpenCL™應用程式設計為單個工作項核心就足以在不執行其他優化步驟的情況下最大化效能。

  建議採用以下優化單個work-item kernel的選項來按照實用性順序解決單個work-item kernel迴圈攜帶的依賴性:

    removal,relaxation,simplification,transfer to local memory。

 

(1) Removing Loop-Carried Dependency

  根據優化報告的反饋,可以通過實現更簡單的記憶體訪問的方式來消除迴圈攜帶的依賴關係。

優化前:

 1 #define N 128
 2 
 3 __kernel void unoptimized (__global int * restrict A,
 4                            __global int * restrict B,
 5                            __global int* restrict result)
 6 {
 7   int sum = 0;
 8 
 9   for (unsigned i = 0; i < N; i++) {
10     for (unsigned j = 0; j < N; j++) {
11       sum += A[i*N+j];
12     }
13     sum += B[i];
14   }
15 
16   * result = sum;
17 }

優化後:

 1 #define N 128
 2 
 3 __kernel void optimized (__global int * restrict A,
 4                          __global int * restrict B,
 5                          __global int * restrict result)
 6 {
 7   int sum = 0;
 8 
 9   for (unsigned i = 0; i < N; i++) {
10     // Step 1: Definition
11     int sum2 = 0;
12 
13     // Step 2: Accumulation of array A values for one outer loop iteration
14     for (unsigned j = 0; j < N; j++) {
15       sum2 += A[i*N+j];
16     }
17 
18     // Step 3: Addition of array B value for an outer loop iteration
19     sum += sum2;
20     sum += B[i];
21   }
22 
23   * result = sum;
24 }

(2) Relaxing Loop-Carried Dependency

  根據優化報告的反饋,可以通過增加依賴距離的方式來relax緩解迴圈攜帶的依賴關係。

優化前:

 1 #define N 128
 2 
 3 __kernel void unoptimized (__global float * restrict A,
 4                            __global float * restrict result)
 5 {
 6   float mul = 1.0f;
 7 
 8   for (unsigned i = 0; i < N; i++)
 9     mul *= A[i];
10 
11   * result = mul;
12 }

優化後:

 1 #define N 128
 2 #define M 8
 3 
 4 __kernel void optimized (__global float * restrict A,
 5                          __global float * restrict result)
 6 {
 7   float mul = 1.0f;
 8 
 9   // Step 1: Declare multiple copies of variable mul
10   float mul_copies[M];
11 
12   // Step 2: Initialize all copies
13   for (unsigned i = 0; i < M; i++)
14     mul_copies[i] = 1.0f;
15 
16   for (unsigned i = 0; i < N; i++) {
17     // Step 3: Perform multiplication on the last copy
18     float cur = mul_copies[M-1] * A[i];
19 
20     // Step 4a: Shift copies
21     #pragma unroll 
22     for (unsigned j = M-1; j > 0; j--)
23       mul_copies[j] = mul_copies[j-1];
24 
25     // Step 4b: Insert updated copy at the beginning
26     mul_copies[0] = cur;
27   }
28 
29   // Step 5: Perform reduction on copies
30   #pragma unroll 
31   for (unsigned i = 0; i < M; i++)
32     mul *= mul_copies[i];
33 
34   * result = mul;
35 }

(3) Transferring Loop-Carried Dependency to Local Memory

  對於不能消除的迴圈攜帶的依賴關係,可以通過將具有迴圈依賴的陣列從全域性記憶體global memory移動到local memory的方式來改進迴圈的啟動間隔(initiation interval, II)。

優化前:

1 #define N 128
2
3 __kernel void unoptimized( __global int* restrict A )
4 {
5     for (unsigned i = 0; i < N; i++)
6           A[N-i] = A[i];
7 }     

優化後:

 1 #define N 128
 2
 3 __kernel void optimized( __global int* restrict A )
 4 {
 5     int B[N];
 6
 7     for (unsigned i = 0; i < N; i++)
 8         B[i] = A[i];
 9
10     for (unsigned i = 0; i < N; i++)
11         B[N-i] = B[i];
12
13     for (unsigned i = 0; i < N; i++)
14         A[i] = B[i];
15 }

(4) Relaxing Loop-Carried Dependency by Inferring Shift Registers

  為了使SDK能夠有效地處理執行雙精度浮點運算的單個work-item kernels,可以通過推斷移位暫存器的方式移除迴圈攜帶的依賴性。

優化前:

 1 __kernel void double_add_1 (__global double *arr,
 2                             int N,
 3                             __global double *result)
 4 {
 5   double temp_sum = 0;
 6
 7   for (int i = 0; i < N; ++i)
 8   {
 9       temp_sum += arr[i];
10   }
11 
12   *result = temp_sum;
13 }

優化後:

 1 //Shift register size must be statically determinable
 2 #define II_CYCLES 12
 3
 4 __kernel void double_add_2 (__global double *arr,
 5                             int N,
 6                             __global double *result)
 7 {
 8     //Create shift register with II_CYCLE+1 elements
 9     double shift_reg[II_CYCLES+1];
10    
11     //Initialize all elements of the register to 0
12     for (int i = 0; i < II_CYCLES + 1; i++)
13     {
14         shift_reg[i] = 0;
15     }
16    
17     //Iterate through every element of input array
18     for(int i = 0; i < N; ++i)
19     {
20         //Load ith element into end of shift register
21         //if N > II_CYCLE, add to shift_reg[0] to preserve values
22         shift_reg[II_CYCLES] = shift_reg[0] + arr[i];
23 
24         #pragma unroll
25         //Shift every element of shift register
26         for(int j = 0; j < II_CYCLES; ++j)
27         {
28             shift_reg[j] = shift_reg[j + 1];
29         }
30     }
31 
32     //Sum every element of shift register
33     double temp_sum = 0;
34     
35     #pragma unroll 
36     for(int i = 0; i < II_CYCLES; ++i)
37     {
38         temp_sum += shift_reg[i];
39     }
40
41     *result = temp_sum;
42 }

(5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays

  在單個work-item的kernel中包含ivdep pragma以保證堆記憶體array的訪問不會導致迴圈攜帶的依賴性。

  如果對迴圈中的記憶體陣列的所有訪問都不會導致迴圈攜帶的依賴性,在核心程式碼中的迴圈之前新增#pragma ivdep。

// no loop-carried dependencies for A and B array accesses
#pragma ivdep
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

  要指定對迴圈內特定記憶體陣列的訪問不會導致迴圈攜帶的依賴關係,在核心程式碼中的迴圈之前新增#pragma ivdep arrayarray_name)。

   ivdep pragma指定的陣列必須是local / private記憶體陣列,或者是指向global/local/private記憶體儲存的指標變數。 如果指定的陣列是指標,則ivdep pragma也適用於所有可能帶有指定指標別名的陣列。

   ivdep pragma指示指定的陣列也可以是struct的陣列或指標成員。

 

// No loop-carried dependencies for A array accesses
// The offline compiler will insert hardware that reinforces dependency constraints for B
#pragma ivdep array(A)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for array A inside struct
#pragma ivdep array(S.A)
for (int i = 0; i < N; i++) {
    S.A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for array A inside the struct pointed by S
#pragma ivdep array(S->X[2][3].A)
for (int i = 0; i < N; i++) {
    S->X[2][3].A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for A and B because ptr aliases
// with both arrays
int *ptr = select ? A : B;
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for A because ptr only aliases with A
int *ptr = &A[10];
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

2、Single work-item kernel的設計技巧

  避免指標混淆。使用restrict關鍵詞。

  建立格式正確的迴圈。退出條件與整數進行比較,且每次迭代的增量為1。格式正確的巢狀迴圈也對最大化kernel效能有幫助。

  最小化loop-carried迴圈攜帶依賴遵循以下原則:

    避免使用指標算數;

    宣告簡單的陣列索引;

    儘可能在kernel中使用恆定邊界的迴圈。

  避免複雜的迴圈退出條件;

  將巢狀迴圈轉換為單迴圈;

  避免條件迴圈; 避免if else中包含迴圈,儘量轉換為在迴圈中包含if else。

  在儘可能深的範圍內宣告變數。

相關文章