高效能運算-NEON-影像旋轉

安洛8發表於2024-12-04

1. 目標:使用 NEON intrinsic 函式,對512*512 png 四通道影像順時針旋轉90度。

思路: 將影像切成 4×4 畫素的區塊,對每個區塊做轉置,再水平映象(左右翻轉)後寫入目標影像中對應的位置。影像讀寫使用 stb_image / stb_image_write 函式庫。

2. 程式碼

#include <stdio.h>
#include <arm_neon.h>

#include <stdlib.h>
#define STB_IMAGE_IMPLEMENTATION
#include "./stb/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "./stb/stb_image_write.h"

// #define DEBUG

/* Tile edge (in pixels) handled per NEON iteration, and bytes per pixel. */
enum { BLOCK = 4, RGBA_CHANNELS = 4 };

/*
 * Rotate a tightly packed 4-byte-per-pixel image 90 degrees clockwise.
 *
 * src: w*h pixels, row-major, 4 bytes each (read only).
 * dst: h*w pixels, row-major; the rotated image is h pixels wide and
 *      w pixels tall. Must not overlap src.
 *
 * Mapping: src[y][x] -> dst[x][h-1-y].
 * Full 4x4 tiles are transposed in NEON registers and stored mirrored;
 * pixels outside the last full tile row/column are copied one at a time,
 * so w and h need not be multiples of 4.
 */
static void rotate90_cw_rgba(const uint8_t *src, uint8_t *dst, int w, int h)
{
    const size_t sw = (size_t)w;
    const size_t sh = (size_t)h;
    const size_t w4 = sw - sw % BLOCK; /* columns covered by full tiles */
    const size_t h4 = sh - sh % BLOCK; /* rows covered by full tiles */

    for (size_t i = 0; i < h4; i += BLOCK) {
        for (size_t j = 0; j < w4; j += BLOCK) {
            uint32x4_t row[BLOCK];
            uint32x4_t col[BLOCK];

            /* Load the tile: each 128-bit register holds 4 pixels of a row. */
            for (int m = 0; m < BLOCK; m++) {
                row[m] = vreinterpretq_u32_u8(
                    vld1q_u8(src + ((i + m) * sw + j) * RGBA_CHANNELS));
            }

            /* Step 1: transpose the 2x2 sub-blocks of 32-bit pixels. */
            uint32x4x2_t t01 = vtrnq_u32(row[0], row[1]);
            uint32x4x2_t t23 = vtrnq_u32(row[2], row[3]);

            /* Step 2: there is no vtrnq_u64, so exchange the 64-bit halves
             * with vtrn1q_u64 / vtrn2q_u64 to finish the 4x4 transpose. */
            uint64x2_t even01 = vreinterpretq_u64_u32(t01.val[0]);
            uint64x2_t odd01  = vreinterpretq_u64_u32(t01.val[1]);
            uint64x2_t even23 = vreinterpretq_u64_u32(t23.val[0]);
            uint64x2_t odd23  = vreinterpretq_u64_u32(t23.val[1]);

            col[0] = vreinterpretq_u32_u64(vtrn1q_u64(even01, even23));
            col[1] = vreinterpretq_u32_u64(vtrn1q_u64(odd01, odd23));
            col[2] = vreinterpretq_u32_u64(vtrn2q_u64(even01, even23));
            col[3] = vreinterpretq_u32_u64(vtrn2q_u64(odd01, odd23));

            /* Step 3: mirror horizontally and store.
             * [i+m][j] -> [j][i+m] -> [j][h-1-(i+m)] */
            for (int m = 0; m < BLOCK; m++) {
                /* Reverse the four pixels: swap within each 64-bit half,
                 * then swap the two halves. */
                uint32x4_t r = vrev64q_u32(col[m]);
                r = vcombine_u32(vget_high_u32(r), vget_low_u32(r));
                vst1q_u8(dst + ((j + m) * sh + (sh - i - BLOCK)) * RGBA_CHANNELS,
                         vreinterpretq_u8_u32(r));
            }
        }
    }

    /* Scalar tail: pixels right of the last full tile column and below the
     * last full tile row. Does nothing when w and h are multiples of 4. */
    for (size_t y = 0; y < sh; y++) {
        const size_t x_begin = (y < h4) ? w4 : 0;
        for (size_t x = x_begin; x < sw; x++) {
            const uint8_t *p = src + (y * sw + x) * RGBA_CHANNELS;
            uint8_t *q = dst + (x * sh + (sh - 1 - y)) * RGBA_CHANNELS;
            for (int k = 0; k < RGBA_CHANNELS; k++) {
                q[k] = p[k];
            }
        }
    }
}

/*
 * Load ./pic.png, rotate it 90 degrees clockwise and write pic1.png.
 * With DEBUG defined, an 8x8 synthetic image is used instead and both the
 * input and the rotated output are printed.
 *
 * Returns 0 on success, EXIT_FAILURE if loading, allocation or writing fails.
 */
int main(void)
{
    int rc = EXIT_FAILURE;
    int w, h, c;

    /* Read the image. */
#ifdef DEBUG
    w = h = 8;
    c = RGBA_CHANNELS;
    uint8_t *src = calloc((size_t)w * (size_t)h, RGBA_CHANNELS);
    if (!src) {
        fprintf(stderr, "alloc src failed.\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w * c; j++) {
            src[i * w * c + j] = (uint8_t)j;
        }
    }
    /* Dump the input (the original code read from dst here, which is not
     * yet declared at this point). */
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w * c; j += 4) {
            const uint8_t *p = src + i * w * c + j;
            printf("%u%u%u%u ", (unsigned)p[0], (unsigned)p[1],
                   (unsigned)p[2], (unsigned)p[3]);
        }
        printf("\n");
    }
    printf("======\n");
#else
    /* Force 4 output channels: the NEON kernel assumes 4 bytes per pixel.
     * stbi_load still reports the file's own channel count through &c. */
    uint8_t *src = stbi_load("./pic.png", &w, &h, &c, RGBA_CHANNELS);
    if (!src) {
        fprintf(stderr, "load img failed.\n");
        return EXIT_FAILURE;
    }
    printf("int w %d h %d c %d\n", w, h, c); /* e.g. 512 512 4 */
    c = RGBA_CHANNELS; /* the decoded buffer always has 4 channels */
#endif

    uint8_t *dst = calloc((size_t)w * (size_t)h, RGBA_CHANNELS);
    if (!dst) {
        fprintf(stderr, "alloc dst failed.\n");
        goto out_src;
    }

    rotate90_cw_rgba(src, dst, w, h);

#ifdef DEBUG
    /* Dump the rotated image: w rows of h pixels. */
    for (int i = 0; i < w; i++) {
        for (int j = 0; j < h * c; j += 4) {
            const uint8_t *p = dst + i * h * c + j;
            printf("%u%u%u%u ", (unsigned)p[0], (unsigned)p[1],
                   (unsigned)p[2], (unsigned)p[3]);
        }
        printf("\n");
    }
    rc = 0;
#else
    /* The rotated image is h wide and w tall; row stride is h*c bytes. */
    if (!stbi_write_png("pic1.png", h, w, c, dst, h * c)) {
        fprintf(stderr, "write img failed.\n");
    } else {
        rc = 0;
    }
#endif

    free(dst);
out_src:
#ifdef DEBUG
    free(src);
#else
    stbi_image_free(src);
#endif
    return rc;
}

3. 測試結果

原圖

image

旋轉後影像

image

相關文章