1. 這是一個將影像通道由 BGRA 轉換為 RGB 的示例程式。轉換有三種實現方式:普通寫法、OpenMP SIMD 編譯指導語句,以及 NEON intrinsic 函式。
2. 原始碼
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <arm_neon.h>
#include <omp.h>
using namespace std;
/**
 * Scalar reference conversion from packed BGRA to packed RGB.
 *
 * Pixels are addressed as one tightly packed image: pixel (row, col) lives at
 * index row * w + col, occupying 4 bytes in src and 3 bytes in dst.
 * The fourth source byte of every pixel is dropped.
 *
 * @param src  input buffer, read as w * h pixels of 4 bytes each
 * @param dst  output buffer, written as w * h pixels of 3 bytes each
 * @param w    pixels per row
 * @param h    number of rows
 */
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
    for (int row = 0; row < h; ++row)
    {
        for (int col = 0; col < w; ++col)
        {
            const int pixel = row * w + col;
            const uint8_t *in = src + pixel * 4;
            uint8_t *out = dst + pixel * 3;

            // Swap the first and third channel, keep the middle one.
            out[0] = in[2];
            out[1] = in[1];
            out[2] = in[0];
        }
    }
}
// #pragma omp declare simd notinbranch simdlen(4) // uniform(w,h) not needed: w and h are constants used only for addressing, not in the computation
// Original author's note: GCC does not support mixed data.
/**
 * Packed BGRA to packed RGB conversion whose inner (per-row) loop is annotated
 * with `#pragma omp simd`.
 *
 * Pixel (i, j) lives at index i * w + j, occupying 4 bytes in src and 3 bytes
 * in dst; the fourth source byte of every pixel is dropped.
 *
 * src is taken as pointer-to-const (it is only read), matching the other two
 * converters in this file so that const image buffers can be passed.
 *
 * @param src  input buffer, read as w * h pixels of 4 bytes each
 * @param dst  output buffer, written as w * h pixels of 3 bytes each
 * @param w    pixels per row
 * @param h    number of rows
 */
void bgra2rgb_omp(const uint8_t *src, uint8_t *dst, int w, int h)
{
    for (int i = 0; i < h; ++i)
    {
        // The pragma must sit directly on the loop it annotates.
#pragma omp simd
        for (int j = 0; j < w; j++)
        {
            const int px = i * w + j;
            dst[px * 3] = src[px * 4 + 2];
            dst[px * 3 + 1] = src[px * 4 + 1];
            dst[px * 3 + 2] = src[px * 4];
        }
    }
}
/**
 * Packed BGRA to packed RGB conversion using NEON intrinsics.
 *
 * The image is treated as one contiguous run of w * h pixels (rows are stored
 * back to back, exactly as the scalar version indexes them). Full groups of
 * 16 pixels go through the vector path; any remaining pixels are converted
 * one at a time, so no byte outside the w * h pixels is read or written even
 * when the pixel count is not a multiple of 16.
 *
 * @param src  input buffer, read as w * h pixels of 4 bytes each
 * @param dst  output buffer, written as w * h pixels of 3 bytes each
 * @param w    pixels per row
 * @param h    number of rows
 */
void bgra2rgb_neon(const uint8_t *src, uint8_t *dst, int w, int h)
{
    if (w <= 0 || h <= 0)
    {
        return;
    }

    const size_t total = static_cast<size_t>(w) * static_cast<size_t>(h);
    size_t px = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Vector path: only entered while a full 16-pixel group remains.
    for (; px + 16 <= total; px += 16)
    {
        // De-interleaving load: B, G, R, A each land in their own vector.
        const uint8x16x4_t bgra = vld4q_u8(src + px * 4);
        uint8x16x3_t rgb;
        rgb.val[0] = bgra.val[2];
        rgb.val[1] = bgra.val[1];
        rgb.val[2] = bgra.val[0];
        // Interleaving store: writes R, G, B triples for 16 pixels.
        vst3q_u8(dst + px * 3, rgb);
    }
#endif

    // Scalar tail (and the whole image when NEON is unavailable).
    for (; px < total; ++px)
    {
        const uint8_t *in = src + px * 4;
        uint8_t *out = dst + px * 3;
        out[0] = in[2];
        out[1] = in[1];
        out[2] = in[0];
    }
}
/**
 * Benchmark driver.
 *
 * Usage: <program> <mode>, where mode is 0 (scalar), 1 (omp simd) or 2 (neon).
 * Fills a 480x640 BGRA image with pseudo-random bytes (fixed seed), runs the
 * selected converter 100 times, then prints one sample pixel before and after
 * conversion and the average clock() ticks per run.
 *
 * @return 0 on success, 1 when the argument count or the mode is invalid.
 */
int main(int argc,char** argv)
{
    const char *usage = "should parameter 0:original 1:omp simd 2:neon simd.\n";
    if (argc != 2)
    {
        printf("%s", usage);
        return 1;
    }

    const int mode = atoi(argv[1]);
    if (mode < 0 || mode > 2)
    {
        // Unknown mode: nothing would be converted, so the printed RGB sample
        // and the timing would be meaningless.
        printf("%s", usage);
        return 1;
    }

    const int nloop = 100;
    const int w = 480;
    const int h = 640;

    // Roughly 2 MB in total: keep the images in static storage rather than on
    // the stack.
    static uint8_t bgra_mat[w * h * 4];
    static uint8_t rgb_mat[w * h * 3];

    srand(100); // fixed seed so every run converts the same image
    for (int i = 0; i < w * h * 4; i++)
    {
        bgra_mat[i] = static_cast<uint8_t>(rand() % 256);
    }

    const clock_t start = clock();
    switch (mode)
    {
    case 0: // original
        for (int iloop = 0; iloop < nloop; iloop++)
            bgra2rgb(bgra_mat, rgb_mat, w, h);
        break;
    case 1: // omp simd
        // NOTE(review): bgra2rgb_omp only uses `omp simd` (no parallel
        // region), so this thread count presumably has no effect -- confirm.
        omp_set_num_threads(4);
        for (int iloop = 0; iloop < nloop; iloop++)
            bgra2rgb_omp(bgra_mat, rgb_mat, w, h);
        break;
    case 2: // neon
        for (int iloop = 0; iloop < nloop; iloop++)
            bgra2rgb_neon(bgra_mat, rgb_mat, w, h);
        break;
    default:
        break;
    }
    const clock_t elapsed = clock() - start;

    // Second pixel: source bytes 4..6 (B, G, R) versus converted bytes 3..5 (R, G, B).
    std::cout << "bgra[4-6] data:" << static_cast<int>(bgra_mat[4]) << "," << static_cast<int>(bgra_mat[5]) << "," << static_cast<int>(bgra_mat[6]) << '\n';
    std::cout << "rgb[3-5] data:" << static_cast<int>(rgb_mat[3]) << "," << static_cast<int>(rgb_mat[4]) << "," << static_cast<int>(rgb_mat[5]) << '\n';
    std::cout << "cost time(clock):" << elapsed / nloop << '\n';
    return 0;
}
3. 編譯命令
g++ brga2rgb.cpp -o brga2rgb -fopenmp -O1
4. 資料分析
原始碼未向量化耗時:490 時間單位(clock() 計數,取 100 次迴圈的平均值,下同)
OpenMP SIMD 耗時:250 時間單位
NEON intrinsic 函式耗時:210 時間單位
分析:三者之中,手動向量化的 NEON intrinsic 函式效率最高。