OpenCL 增強單work-item kernel效能策略

ZhuzhuDong發表於2020-08-03

原文網址 : https://www.cnblogs.com/zhuzhudong/p/13403737.html

1、基於反饋的Optimization Report解決單個Work-item的Kernel相關性

　　在許多情況下，將OpenCL™應用程式設計為單個工作項核心就足以在不執行其他優化步驟的情況下最大化效能。

　　建議採用以下優化單個work-item kernel的選項來按照實用性順序解決單個work-item kernel迴圈攜帶的依賴性：

　　　　removal，relaxation，simplification，transfer to local memory。

(1) Removing Loop-Carried Dependency

　　根據優化報告的反饋，可以通過實現更簡單的記憶體訪問的方式來消除迴圈攜帶的依賴關係。

優化前：

 1 #define N 128
 2 
 3 __kernel void unoptimized (__global int * restrict A,
 4                            __global int * restrict B,
 5                            __global int* restrict result)
 6 {
 7   int sum = 0;
 8 
 9   for (unsigned i = 0; i < N; i++) {
10     for (unsigned j = 0; j < N; j++) {
11       sum += A[i*N+j];
12     }
13     sum += B[i];
14   }
15 
16   * result = sum;
17 }

優化後：

 1 #define N 128
 2 
 3 __kernel void optimized (__global int * restrict A,
 4                          __global int * restrict B,
 5                          __global int * restrict result)
 6 {
 7   int sum = 0;
 8 
 9   for (unsigned i = 0; i < N; i++) {
10     // Step 1: Definition
11     int sum2 = 0;
12 
13     // Step 2: Accumulation of array A values for one outer loop iteration
14     for (unsigned j = 0; j < N; j++) {
15       sum2 += A[i*N+j];
16     }
17 
18     // Step 3: Addition of array B value for an outer loop iteration
19     sum += sum2;
20     sum += B[i];
21   }
22 
23   * result = sum;
24 }

(2) Relaxing Loop-Carried Dependency

　　根據優化報告的反饋，可以通過增加依賴距離的方式來relax緩解迴圈攜帶的依賴關係。

優化前：

 1 #define N 128
 2 
 3 __kernel void unoptimized (__global float * restrict A,
 4                            __global float * restrict result)
 5 {
 6   float mul = 1.0f;
 7 
 8   for (unsigned i = 0; i < N; i++)
 9     mul *= A[i];
10 
11   * result = mul;
12 }

優化後：

 1 #define N 128
 2 #define M 8
 3 
 4 __kernel void optimized (__global float * restrict A,
 5                          __global float * restrict result)
 6 {
 7   float mul = 1.0f;
 8 
 9   // Step 1: Declare multiple copies of variable mul
10   float mul_copies[M];
11 
12   // Step 2: Initialize all copies
13   for (unsigned i = 0; i < M; i++)
14     mul_copies[i] = 1.0f;
15 
16   for (unsigned i = 0; i < N; i++) {
17     // Step 3: Perform multiplication on the last copy
18     float cur = mul_copies[M-1] * A[i];
19 
20     // Step 4a: Shift copies
21     #pragma unroll 
22     for (unsigned j = M-1; j > 0; j--)
23       mul_copies[j] = mul_copies[j-1];
24 
25     // Step 4b: Insert updated copy at the beginning
26     mul_copies[0] = cur;
27   }
28 
29   // Step 5: Perform reduction on copies
30   #pragma unroll 
31   for (unsigned i = 0; i < M; i++)
32     mul *= mul_copies[i];
33 
34   * result = mul;
35 }

(3) Transferring Loop-Carried Dependency to Local Memory

　　對於不能消除的迴圈攜帶的依賴關係，可以通過將具有迴圈依賴的陣列從全域性記憶體global memory移動到local memory的方式來改進迴圈的啟動間隔（initiation interval, II）。

優化前：

1 #define N 128
2
3 __kernel void unoptimized( __global int* restrict A )
4 {
5     for (unsigned i = 0; i < N; i++)
6           A[N-i] = A[i];
7 }

優化後：

 1 #define N 128
 2
 3 __kernel void optimized( __global int* restrict A )
 4 {
 5     int B[N];
 6
 7     for (unsigned i = 0; i < N; i++)
 8         B[i] = A[i];
 9
10     for (unsigned i = 0; i < N; i++)
11         B[N-i] = B[i];
12
13     for (unsigned i = 0; i < N; i++)
14         A[i] = B[i];
15 }

(4) Relaxing Loop-Carried Dependency by Inferring Shift Registers

　　為了使SDK能夠有效地處理執行雙精度浮點運算的單個work-item kernels，可以通過推斷移位暫存器的方式移除迴圈攜帶的依賴性。

優化前：

 1 __kernel void double_add_1 (__global double *arr,
 2                             int N,
 3                             __global double *result)
 4 {
 5   double temp_sum = 0;
 6
 7   for (int i = 0; i < N; ++i)
 8   {
 9       temp_sum += arr[i];
10   }
11 
12   *result = temp_sum;
13 }

優化後：

 1 //Shift register size must be statically determinable
 2 #define II_CYCLES 12
 3
 4 __kernel void double_add_2 (__global double *arr,
 5                             int N,
 6                             __global double *result)
 7 {
 8     //Create shift register with II_CYCLE+1 elements
 9     double shift_reg[II_CYCLES+1];
10    
11     //Initialize all elements of the register to 0
12     for (int i = 0; i < II_CYCLES + 1; i++)
13     {
14         shift_reg[i] = 0;
15     }
16    
17     //Iterate through every element of input array
18     for(int i = 0; i < N; ++i)
19     {
20         //Load ith element into end of shift register
21         //if N > II_CYCLE, add to shift_reg[0] to preserve values
22         shift_reg[II_CYCLES] = shift_reg[0] + arr[i];
23 
24         #pragma unroll
25         //Shift every element of shift register
26         for(int j = 0; j < II_CYCLES; ++j)
27         {
28             shift_reg[j] = shift_reg[j + 1];
29         }
30     }
31 
32     //Sum every element of shift register
33     double temp_sum = 0;
34     
35     #pragma unroll 
36     for(int i = 0; i < II_CYCLES; ++i)
37     {
38         temp_sum += shift_reg[i];
39     }
40
41     *result = temp_sum;
42 }

(5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays

　　在單個work-item的kernel中包含ivdep pragma以保證堆記憶體array的訪問不會導致迴圈攜帶的依賴性。

　　如果對迴圈中的記憶體陣列的所有訪問都不會導致迴圈攜帶的依賴性，在核心程式碼中的迴圈之前新增#pragma ivdep。

// no loop-carried dependencies for A and B array accesses
#pragma ivdep
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

　　要指定對迴圈內特定記憶體陣列的訪問不會導致迴圈攜帶的依賴關係，在核心程式碼中的迴圈之前新增#pragma ivdep arrayarray_name）。

　　ivdep pragma指定的陣列必須是local / private記憶體陣列，或者是指向global/local/private記憶體儲存的指標變數。如果指定的陣列是指標，則ivdep pragma也適用於所有可能帶有指定指標別名的陣列。

　　ivdep pragma指示指定的陣列也可以是struct的陣列或指標成員。

// No loop-carried dependencies for A array accesses
// The offline compiler will insert hardware that reinforces dependency constraints for B
#pragma ivdep array(A)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for array A inside struct
#pragma ivdep array(S.A)
for (int i = 0; i < N; i++) {
    S.A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for array A inside the struct pointed by S
#pragma ivdep array(S->X[2][3].A)
for (int i = 0; i < N; i++) {
    S->X[2][3].A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for A and B because ptr aliases
// with both arrays
int *ptr = select ? A : B;
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for A because ptr only aliases with A
int *ptr = &A[10];
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
    A[i] = A[i - X[i]];
    B[i] = B[i - Y[i]];
}

2、Single work-item kernel的設計技巧

　　避免指標混淆。使用restrict關鍵詞。

　　建立格式正確的迴圈。退出條件與整數進行比較，且每次迭代的增量為1。格式正確的巢狀迴圈也對最大化kernel效能有幫助。

　　最小化loop-carried迴圈攜帶依賴遵循以下原則：

　　　　避免使用指標算數；

　　　　宣告簡單的陣列索引；

　　　　儘可能在kernel中使用恆定邊界的迴圈。

　　避免複雜的迴圈退出條件；

　　將巢狀迴圈轉換為單迴圈；

　　避免條件迴圈；避免if else中包含迴圈，儘量轉換為在迴圈中包含if else。

　　在儘可能深的範圍內宣告變數。

MySQL 8 複製效能的增強
2024-11-15
MySql
【譯】Visual Studio 2022 - 17.10 效能增強
2024-06-25
在C#中基於Semantic Kernel的檢索增強生成（RAG）實踐
2024-10-19
C#
用歸納偏置來增強你的模型效能
2020-11-24
模型
深圳市恆訊科技分析如何增強mySQL效能？
2021-04-27
MySql
SAP訂單編排和流程增強概述
2018-12-13
帶你熟悉CCE叢集增強型CPU管理策略enhanced-static
2024-05-11
JMeter做效能測試（1）-效能壓測指令碼的生成以及完善和增強
2024-03-16
JMeter指令碼
OpenCL中的SVM使用案例
2024-06-23
基於AI排序演算法的指數增強策略【附原始碼】
2019-01-24
AI排序演算法原始碼
基於深度學習的單通道語音增強
2020-12-25
深度學習
高通SNPE - OpenCL簡介（1）
2020-11-05
一文說清OpenCL框架
2021-07-31
框架
高效能索引策略二
2019-02-22
索引
基於SAP Kyma的訂單編排增強介紹
2019-01-08
增強for 迴圈
2018-03-31
關於MNN的OPENCL和Vulkan支援
2024-08-22
OpenCL中的管道cl::Pipe的使用
2024-06-06
指數增強策略：如何透過動態調整權重擊敗滬深300？
2024-06-28
Mac右鍵選單增強工具：iBoysoft MagicMenu mac中文版
2023-02-02
Mac
桌面端前端效能優化策略
2018-07-15
前端優化
openGauss DSS功能增強
2024-03-28
影像增強（Image enhancement）
2024-10-07
TotalFinder for MacFinder增強工具
2020-11-22
Mac
Kernel Method
2020-11-08
GPU程式設計--OpenCL四大模型
2018-05-24
GPU程式設計大模型
IDEA 2024.1：Spring支援增強、GitHub Action支援增強、更新HTTP Client等
2024-02-17
IdeaSpringGithubHTTPclient
「MySQL」高效能索引優化策略
2019-03-02
MySql索引優化
MYSQL索引及高效能索引策略
2020-09-11
MySql索引
萬字長文學會對接 AI 模型：Semantic Kernel 和 Kernel Memory，工良出品，超簡單的教程
2024-03-01
AI模型
核心引數kernel.shmall和kernel.shmmax
2024-06-16
HMM
SAPS/4HANA生產訂單的BAdI增強點之Initialize方法
2019-01-31
C# 編寫一個簡單易用的 Windows 截圖增強工具
2022-05-11
C#Windows
Omdia觀察：寬頻ARPU增長策略
2021-11-10
Java排序理解增強版
2018-11-12
Java排序
Spring AOP 增強介紹
2018-08-22
Spring
Mac鍵盤增強工具
2021-12-07
Mac
TotalFinder for Mac(Finder增強工具)
2023-02-22
Mac

OpenCL 增強單work-item kernel效能策略

相關文章