高效能運算-bgra2rgb轉換-向量化最佳化(15)

安洛8發表於2024-11-30

1. 這是一個將影像像素格式由 BGRA 轉換為 RGB 的示例程式,提供三種實現方式:普通(純量)寫法、OpenMP simd 編譯指導語句,以及 NEON intrinsic 函式。

2. 原始碼

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <arm_neon.h>
#include <omp.h>
using namespace std;

/**
 * Scalar reference conversion of a 4-byte-per-pixel image to a 3-byte-per-pixel image.
 *
 * For every pixel, source bytes 2, 1, 0 are copied to destination bytes 0, 1, 2
 * (i.e. the first three channels are reversed) and source byte 3 is dropped.
 * Per the function name the source layout is taken to be B,G,R,A and the
 * destination layout R,G,B.
 *
 * @param src  input buffer; bytes [0, w*h*4) are read
 * @param dst  output buffer; bytes [0, w*h*3) are written
 * @param w    number of pixels per row
 * @param h    number of rows
 */
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
	for (int row = 0; row < h; ++row)
	{
		for (int col = 0; col < w; ++col)
		{
			// Rows are stored back to back, so one linear pixel index addresses both buffers.
			const int pixel = row * w + col;
			const uint8_t *in = &src[pixel * 4];
			uint8_t *out = &dst[pixel * 3];

			out[0] = in[2];
			out[1] = in[1];
			out[2] = in[0];
		}
	}
}

// #pragma omp declare simd notinbranch simdlen(4) // uniform(w,h): address constants that do not take part in the computation need no uniform clause
// Original author's note: gcc does not support mixed data (kept as written; not verified here).
/**
 * Same channel conversion as the scalar version, with the inner (per-row)
 * loop annotated with `#pragma omp simd` so that the compiler is asked to
 * vectorize it. The pragma only has an effect when OpenMP (or OpenMP SIMD)
 * support is enabled at compile time; otherwise this is a plain scalar loop.
 *
 * For every pixel, source bytes 2, 1, 0 are copied to destination bytes
 * 0, 1, 2 and source byte 3 is dropped.
 *
 * @param src  input buffer (read-only); bytes [0, w*h*4) are read
 * @param dst  output buffer; bytes [0, w*h*3) are written
 * @param w    number of pixels per row
 * @param h    number of rows
 */
void bgra2rgb_omp(const uint8_t *src, uint8_t *dst, int w, int h)
{
	for (int i = 0; i < h; ++i)
	{
		// Linear index of the first pixel in this row, hoisted out of the SIMD loop.
		const int row_start = i * w;

		#pragma omp simd
		for (int j = 0; j < w; j++)
		{
			dst[(row_start + j) * 3] = src[(row_start + j) * 4 + 2];
			dst[(row_start + j) * 3 + 1] = src[(row_start + j) * 4 + 1];
			dst[(row_start + j) * 3 + 2] = src[(row_start + j) * 4];
		}
	}
}

/**
 * BGRA -> RGB conversion using NEON intrinsics, 16 pixels per iteration.
 *
 * For every pixel, source bytes 2, 1, 0 are copied to destination bytes
 * 0, 1, 2 and source byte 3 is dropped.
 *
 * Each row is processed in two phases:
 *   1. full groups of 16 pixels with vld4q_u8 / vst3q_u8;
 *   2. a scalar tail for the remaining (w % 16) pixels, so that no byte
 *      outside [0, w*h*4) of src or [0, w*h*3) of dst is ever touched,
 *      whatever the width.
 *
 * When the translation unit is built without NEON support the vector phase
 * is compiled out and the scalar tail handles every pixel.
 *
 * @param src  input buffer; bytes [0, w*h*4) are read
 * @param dst  output buffer; bytes [0, w*h*3) are written
 * @param w    number of pixels per row (need not be a multiple of 16)
 * @param h    number of rows
 */
void bgra2rgb_neon(const uint8_t *src, uint8_t *dst, int w, int h)
{
	for (int i = 0; i < h; ++i)
	{
		const uint8_t *in_row = src + (i * w) * 4;
		uint8_t *out_row = dst + (i * w) * 3;
		int j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
		// Vector phase: only while a full group of 16 pixels is still available.
		for (; w - j >= 16; j += 16)
		{
			// De-interleaving load: val[0..3] each hold one channel of 16 pixels.
			const uint8x16x4_t bgra = vld4q_u8(in_row + j * 4);

			uint8x16x3_t rgb;
			rgb.val[0] = bgra.val[2];
			rgb.val[1] = bgra.val[1];
			rgb.val[2] = bgra.val[0];

			// Interleaving store of the three kept channels.
			vst3q_u8(out_row + j * 3, rgb);
		}
#endif

		// Scalar tail: the pixels that do not fill a whole 16-pixel group.
		for (; j < w; ++j)
		{
			out_row[j * 3] = in_row[j * 4 + 2];
			out_row[j * 3 + 1] = in_row[j * 4 + 1];
			out_row[j * 3 + 2] = in_row[j * 4];
		}
	}
}

/**
 * Benchmark driver.
 *
 * Usage: <program> <mode>, where mode is 0 (scalar), 1 (omp simd) or
 * 2 (NEON intrinsics). Fills a 480x640 4-byte-per-pixel buffer with
 * pseudo-random bytes (fixed seed), runs the selected conversion 100 times,
 * then prints one sample pixel from each buffer and the average clock()
 * ticks per conversion. Any other mode runs no conversion and still prints.
 *
 * @return 0 in all cases (including the usage message).
 */
int main(int argc,char** argv)
{
	if(argc!=2)
	{
		printf("should parameter 0:original 1:omp simd 2:neon simd.");
		return 0;
	}
	const int mode = atoi(argv[1]);
	const int nloop = 100;
	const int w = 480;
	const int h = 640;

	// Static storage instead of automatic: the two buffers total about 2 MB,
	// which can overflow small thread stacks. Static storage is also
	// zero-initialized, so rgb_mat has defined contents when printed even if
	// no conversion ran (unknown mode).
	static uint8_t bgra_mat[w * h * 4];
	static uint8_t rgb_mat[w * h * 3];

	// Fixed seed so that every run converts the same input.
	srand(100);
	for (int i = 0; i < w * h * 4; i++)
	{
		bgra_mat[i] = rand() % 256;
	}

	clock_t t = clock();
	switch (mode)
	{
	case 0:	// original (scalar)
		for (int iloop = 0; iloop < nloop; iloop++)
			bgra2rgb(bgra_mat, rgb_mat, w, h);
		break;
	case 1:	// omp simd
		// Kept from the original; `omp simd` alone creates no threads, so this
		// setting is not expected to influence bgra2rgb_omp.
		omp_set_num_threads(4);
		for (int iloop = 0; iloop < nloop; iloop++)
			bgra2rgb_omp(bgra_mat, rgb_mat, w, h);
		break;
	case 2:	// neon
		for (int iloop = 0; iloop < nloop; iloop++)
			bgra2rgb_neon(bgra_mat, rgb_mat, w, h);
		break;
	default:
		break;
	}

	t = clock() - t;

	// Pixel 1: source bytes 4..6 and destination bytes 3..5 should be each
	// other's reverse after a successful conversion.
	std::cout << "bgra[4-6] data:" << static_cast<int>(bgra_mat[4]) << "," << static_cast<int>(bgra_mat[5]) << "," << static_cast<int>(bgra_mat[6]) << std::endl;
	std::cout << "rgb[3-5] data:" << static_cast<int>(rgb_mat[3]) << "," << static_cast<int>(rgb_mat[4]) << "," << static_cast<int>(rgb_mat[5]) << std::endl;
	// Average clock() ticks per conversion (integer division).
	std::cout << "cost time(clock):" << t / nloop << std::endl;
}

3. 編譯命令

g++ bgra2rgb.cpp -o bgra2rgb -fopenmp -O1

4. 資料分析

原始碼未向量化耗時:490 clock ticks
openmp-simd 耗時:250 clock ticks
neon intrinsic 函式:210 clock ticks
(以上數值為程式以 clock() 量測、執行 100 次轉換後取平均的單次耗時。)

分析:使用手動向量化的 NEON intrinsic 函式效率最高,約為未向量化版本的 2.3 倍(490/210),也比 openmp-simd 版本快約 1.2 倍(250/210)。

相關文章