可能是最快的alpha blend演算法彙編原始碼 (轉)
Intel官方網站有一個ablend_565的演算法,理論上是把一塊32bit RGBA渲染到16bit的buffer上,我的機器是PIII800,在system memory中進行,640*480的256級alpha blending,達到100fps,我想可以滿足絕大部分的要求了,在這裡,我提供了這個演算法的應用,希望可以對大家有所幫助。
ablend_565函式,可以直接編譯使用,無需其他庫函式,感謝intel提供這麼好的東西。
首先,我提供一些本人編寫的把32bit tga讀入pRGBABuffer的函式
檔案尺寸儲存在 width,height
//-----------------------------------------------------------------------
// Name: LoadTgaFile( TCHAR* strPathname, D** pRGBABuffer, long* width, long* height )
// Desc: 讀取32bit tga檔案到DWORD緩衝裡,返回其尺寸
// Time: 2002.06.22 00:36
// Author: RealRender
// Para:
// Return:
// Note: 這段程式碼來自 7.0 sample中的d3dtextr.cpp,我把他提取了出來
// 方便使用
//-----------------------------------------------------------------------
BOOL LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )
{
FILE* file = fopen( strPathname, "rb" );
if( NULL == file )
return false;
struct TargaHeader
{
BYTE IDLength;
BYTE ColormapType;
BYTE ImageType;
BYTE ColormapSpecification[5];
WORD XOrigin;
WORD YOrigin;
WORD ImageWidth;
WORD ImageHeight;
BYTE PixelDepth;
BYTE ImageDescriptor;
} tga;
fread( &tga, sizeof(TargaHeader), 1, file );
// Only true color, non-mapped images are supported
if( ( 0 != tga.ColormapType ) ||
( tga.ImageType != 10 && tga.ImageType != 2 ) )
{
fclose( file );
return false;
}
// Skthe ID field. The first byte of the header is the length of this field
if( tga.IDLength )
fseek( file, tga.IDLength, SEEK_CUR );
DWORD m_dwWidth = tga.ImageWidth;
DWORD m_dwHeight = tga.ImageHeight;
DWORD m_dwBPP = tga.PixelDepth;
DWORD *m_pRGBAData = new DWORD[m_dwWidth*m_dwHeight];
if( m_pRGBAData == NULL )
{
fclose(file);
return false;
}
for( DWORD y=0; y
{
DWORD dwOffset = y*m_dwWidth;
if( 0 == ( tga.ImageDescriptor & 0x0010 ) )
dwOffset = (m_dwHeight-y-1)*m_dwWidth;
for( DWORD x=0; x
{
if( tga.ImageType == 10 )
{
BYTE PacketInfo = getc( file );
WORD PacketType = 0x80 & PacketInfo;
WORD PixelCount = ( 0x007f & PacketInfo ) + 1;
if( PacketType )
{
DWORD b = getc( file );
DWORD g = getc( file );
DWORD r = getc( file );
DWORD a = 0xff;
if( m_dwBPP == 32 )
a = getc( file );
while( PixelCount-- )
{
m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
else
{
while( PixelCount-- )
{
BYTE b = getc( file );
BYTE g = getc( file );
BYTE r = getc( file );
BYTE a = 0xff;
if( m_dwBPP == 32 )
a = getc( file );
m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
}
else
{
BYTE b = getc( file );
BYTE g = getc( file );
BYTE r = getc( file );
BYTE a = 0xff;
if( m_dwBPP == 32 )
a = getc( file );
m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
}
fclose( file );
// Check for alpha content
for( DWORD i=0; i{
if( m_pRGBAData[i] & 0x000000ff != 0xff )
{
//m_bHasAlpha = TRUE;
break;
}
}
*pRGBABuffer = m_pRGBAData;
*width = m_dwWidth;
*height = m_dwHeight;
return true;
}
把32bit buffer分割為rgb和alpha的程式碼。
注意,分割後的pBitmap一定要是8位元組對齊,這是ablend_565的一個重要條件,所以,我的演算法中:
BYTE* p = new BYTE[lSize*2+8];
BYTE* pOrig = p;
// Round p UP to the next multiple of 8. Adding the remainder itself
// (p += p % 8) does not align: a remainder of 2 would become 4, not 0.
p += ( 8 - (size_t)p % 8 ) % 8;
WORD* color = (WORD*)p;
這是不規範的寫法,把指標強行改變為8位元組對齊,實際使用的時候,要記住釋放的原始指標不是p,而是pOrig,在這裡,我沒有釋放分配的記憶體,請諒解。
//-----------------------------------------------------------------------
// Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
// Desc: Builds a 16-bit 565 color buffer and an 8-bit alpha channel from
//       a 32-bit buffer.
// Time: 2002.06.22 00:36
// Author: RealRender
// Para: pRGBABuffer - lWidth*lHeight source pixels, each unpacked as
//                     R = bits 31..24, G = 23..16, B = 15..8, A = 7..0
//       pAlpha      - receives a new[]-allocated buffer with one alpha
//                     byte per pixel
//       pBitmap     - receives an 8-byte-aligned pointer to one WORD per
//                     pixel, produced by RGBTo16( r, g, b )
//       lWidth      - image width in pixels
//       lHeight     - image height in pixels
// Return: none
// Note: *pBitmap points INTO a new[] allocation and may differ from the
//       allocation's base address, which is not returned. It therefore
//       must not be passed to delete[]; this function gives the caller no
//       way to release the color buffer.
//-----------------------------------------------------------------------
void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
{
    long lSize = lWidth*lHeight;

    // ablend_565 in this file reads ahead of the pixels it blends: up to
    // 4 alpha bytes and 8 color bytes past the last pixel. Both buffers
    // get that much slack, plus up to 7 bytes consumed by the alignment
    // adjustment of the color buffer.
    BYTE* alpha = new BYTE[lSize+8];
    BYTE* p = new BYTE[lSize*2+16];

    // Round p UP to the next multiple of 8. Adding the remainder itself
    // (p += p % 8) does not align: a remainder of 2 would become 4, not 0.
    p += ( 8 - (size_t)p % 8 ) % 8;
    WORD* color = (WORD*)p;

    for( long i = 0; i < lSize; i++ )
    {
        DWORD dwPixel = pRGBABuffer[i];
        DWORD r = ((dwPixel>>24)&0x000000ff);
        DWORD g = ((dwPixel>>16)&0x000000ff);
        DWORD b = ((dwPixel>> 8)&0x000000ff);
        DWORD a = ((dwPixel>> 0)&0x000000ff);
        alpha[i] = (BYTE)a;
        // Convert 888 to 565
        color[i] = RGBTo16( r, g, b );
    }

    *pAlpha = alpha;
    *pBitmap = color;
}
//
這個是Intel官方提供的函式,函式的描述,用我的話來說就是把一個帶有256級alpha通道的565顏色資料繪製到16位目標頁面。
unsigned char *lpAlpha, // 256 級alpha通道
unsigned int iAlpPitch, // alpha通道的pitch
unsigned char *lpSrc, // 原色彩緩衝
unsigned int iSrcX, //
unsigned int iSrcY, // 原色彩位置
unsigned int iSrcPitch, // 原色彩pitch
unsigned char *lpDst, // 目標緩衝
unsigned int iDstX,
unsigned int iDstY, // 目標位置
unsigned int iDstW,
unsigned int iDstH, // 目標緩衝的尺寸
unsigned int iDstPitch // 目標緩衝的pitch
//-----------------------------------------------------------------------
// Name: ablend_565
// Desc: Alpha-blends a 16-bit RGB565 source image with a separate 8-bit
//       per-pixel alpha channel onto a 16-bit RGB565 destination, four
//       pixels at a time, using MMX (MSVC-style inline assembly with
//       32-bit registers).
// Para: lpAlpha   - alpha channel, one byte per pixel
//       iAlpPitch - bytes per line of the alpha channel
//       lpSrc     - source color buffer (565)
//       iSrcX     - source X in pixels (also used as the alpha X offset)
//       iSrcY     - source Y in lines  (also used as the alpha Y offset)
//       iSrcPitch - bytes per line of the source buffer
//       lpDst     - destination buffer (565)
//       iDstX     - destination X in pixels
//       iDstY     - destination Y in lines
//       iDstW     - width of the span to blend, in pixels
//       iDstH     - number of lines to blend
//       iDstPitch - bytes per line of the destination buffer
// Note: - If the starting source address is not 8-byte aligned the
//         function returns without drawing anything.
//       - A group of four pixels whose alphas are all 0xff is copied
//         straight from the source; a group whose alphas are all 0 is
//         skipped. Otherwise alpha is reduced to 6 bits for green and
//         5 bits for red/blue before blending.
//       - The loop reads ahead: after each group of four pixels it loads
//         the next 4 alpha bytes and 8 source bytes, and a tail of 1-3
//         pixels loads a full 8 bytes from source and destination. The
//         buffers need readable slack past the last blended pixel.
//       - Nothing is drawn when iDstW or iDstH is 0.
//-----------------------------------------------------------------------
void ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,
                unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,
                unsigned int iSrcPitch, unsigned char *lpDst,
                unsigned int iDstX, unsigned int iDstY,
                unsigned int iDstW, unsigned int iDstH,
                unsigned int iDstPitch)
{
    //Mask for isolating the red,green, and blue components
    static __int64 MASKB=0x001F001F001F001F;
    static __int64 MASKG=0x07E007E007E007E0;
    static __int64 MASKSHIFTG=0x03F003F003F003F0;
    static __int64 MASKR=0xF800F800F800F800;
    //constants used by the integer alpha blending equation
    static __int64 SIXTEEN=0x0010001000100010;
    static __int64 FIVETWELVE=0x0200020002000200;
    static __int64 SIXONES=0x003F003F003F003F;
    unsigned char *lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst; //base pointer for linear destination
    unsigned char *lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc; //base pointer for linear source
    unsigned char *lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha; //base pointer for linear alpha

    // With iDstH == 0 the line counter would be decremented from 0 and wrap,
    // walking ~2^32 lines; an empty span has nothing to blend either.
    if( iDstW == 0 || iDstH == 0 )
        return;

    _asm{
        mov esi,lpLinearSrcBp; //src
        mov edi,lpLinearDstBp; //dst
        mov eax,lpLinearAlpBp; //alpha
        mov ecx,iDstH; //ecx=number of lines to copy
        mov ebx,iDstW; //ebx=span width to copy
        test esi,6; //check if source address is qword aligned
        //since addr coming in is always word aligned(16bit)
        jnz done; //if not qword aligned we don't do anything
    primeloop: //preload alpha and source for the next group of 4 pixels
        movd mm1,[eax]; //mm1=00 00 00 00 a3 a2 a1 a0
        pxor mm2,mm2; //mm2=0;
        movq mm4,[esi]; //g1: mm4=src3 src2 src1 src0
        punpcklbw mm1,mm2; //mm1=00a3 00a2 00a1 00a0
    loopqword:
        mov edx,[eax];
        test ebx,0xFFFFFFFC; //check if only 3 pixels left
        jz checkback; //3 or less pixels left
        //early out tests
        cmp edx,0xffffffff; //test for alpha value of 1
        je copyback; //if 1's copy the source pixels to the destination
        test edx,0xffffffff; //test for alpha value of 0
        jz leavefront; //if so go to the next 4 pixels
        //the alpha blend starts
        //green
        //i=a*sg+(63-a)*dg;
        //i=(i+32)+((i+32)>>6)>>6;
        //red
        //i=a*sr+(31-a)*dr;
        //i=(i+16)+((i+16)>>5)>>5;
        movq mm5,[edi]; //g2: mm5=dst3 dst2 dst1 dst0
        psrlw mm1,2; //mm1=a?>>2 nuke out lower 2 bits
        movq mm7,MASKSHIFTG; //g3: mm7=1 bit shifted green mask
        psrlw mm4,1; //g3a: move src green down by 1 so that we won't overflow
        movq mm0,mm1; //mm0=00a3 00a2 00a1 00a0
        psrlw mm5,1; //g3b: move dst green down by 1 so that we won't overflow
        psrlw mm1,1; //mm1=a?>>1 nuke out lower 1 bits
        pand mm4,mm7; //g5: mm4=sg3 sg2 sg1 sg0
        movq mm2,SIXONES;//g4: mm2=63
        pand mm5,mm7; //g7: mm5=dg3 dg2 dg1 dg0
        movq mm3,[esi]; //b1: mm3=src3 src2 src1 src0
        psubsb mm2,mm0; //g6: mm2=63-a3 63-a2 63-a1 63-a0
        movq mm7,MASKB; //b2: mm7=BLUE MASK
        pmullw mm4,mm0; //g8: mm4=sg?*a?
        movq mm0,[edi]; //b3: mm0=dst3 dst2 dst1 dst0
        pmullw mm5,mm2; //g9: mm5=dg?*(1-a?)
        movq mm2,mm7; //b4: mm2=fiveones
        pand mm3,mm7; //b4: mm3=sb3 sb2 sb1 sb0
        pmullw mm3,mm1; //b6: mm3=sb?*a?
        pand mm0,mm7; //b5: mm0=db3 db2 db1 db0
        movq mm7,[esi]; //r1: mm7=src3 src2 src1 src0
        paddw mm4,mm5; //g10: mm4=sg?*a?+dg?*(1-a?)
        pand mm7,MASKR; //r2: mm7=sr3 sr2 sr1 sr0
        psubsb mm2,mm1; //b5a: mm2=31-a3 31-a2 31-a1 31-a0
        paddw mm4,FIVETWELVE; //g11: mm4=(mm4+512) green
        pmullw mm0,mm2; //b7: mm0=db?*(1-a?)
        movq mm5,mm4; //g12: mm5=mm4 green
        psrlw mm7,11; //r4: shift src red down to position 0
        psrlw mm4,6; //g13: mm4=mm4>>6
        paddw mm4,mm5; //g14: mm4=mm4+mm5 green
        paddw mm0,mm3; //b8: mm0=sb?*a?+db?*(1-a?)
        movq mm5,[edi]; //r3: mm5=dst3 dst2 dst1 dst0
        paddw mm0,SIXTEEN; //b9: mm0=(mm0+16) blue
        pand mm5,MASKR; //r5: mm5=dr3 dr2 dr1 dr0
        psrlw mm4,5; //g15: mm4=0?g0 0?g0 0?g0 0?g0 green
        movq mm3,mm0; //b10: mm3=mm0 blue
        psrlw mm0,5; //b11: mm0=mm0>>5 blue
        psrlw mm5,11; //r6: shift dst red down to position 0
        paddw mm0,mm3; //b12: mm0=mm3+mm0 blue
        psrlw mm0,5; //b13: mm0=000b 000b 000b 000b blue
        pmullw mm7,mm1; //mm7=sr?*a?
        pand mm4,MASKG; //g16: mm4=00g0 00g0 00g0 00g0 green
        pmullw mm5,mm2; //r7: mm5=dr?*(31-a?)
        por mm0,mm4; //mm0=00gb 00gb 00gb 00gb
        add eax,4; //move to next 4 alphas
        add esi,8; //move to next 4 pixels in src
        add edi,8; //move to next 4 pixels in dst
        movd mm1,[eax]; //mm1=00 00 00 00 a3 a2 a1 a0 (preload for next iteration)
        paddw mm5,mm7; //r8: mm5=sr?*a?+dr?*(31-a?)
        paddw mm5,SIXTEEN; //r9: mm5=(mm5+16) red
        pxor mm2,mm2; //mm2=0;
        movq mm7,mm5; //r10: mm7=mm5 red
        psrlw mm5,5; //r11: mm5=mm5>>5 red
        movq mm4,[esi]; //g1: mm4=src3 src2 src1 src0 (preload for next iteration)
        paddw mm5,mm7; //r12: mm5=mm7+mm5 red
        punpcklbw mm1,mm2; //mm1=00a3 00a2 00a1 00a0
        psrlw mm5,5; //r13: mm5=mm5>>5 red
        psllw mm5,11; //r14: mm5=mm5<<11 red
        por mm0,mm5; //mm0=0rgb 0rgb 0rgb 0rgb
        sub ebx,4; //polished off 4 pixels
        movq [edi-8],mm0; //dst=0rgb 0rgb 0rgb 0rgb
        jmp loopqword; //go back to start
    copyback:
        movq [edi],mm4; //copy source to destination
    leavefront:
        add edi,8; //advance destination by 4 pixels
        add eax,4; //advance alpha by 4
        add esi,8; //advance source by 4 pixels
        sub ebx,4; //decrease pixel count by 4
        jmp primeloop;
    checkback:
        test ebx,0xFF; //check if 0 pixels left
        jz nextline; //done with this span
        //backalign: //work out back end pixels
        movq mm5,[edi]; //g2: mm5=dst3 dst2 dst1 dst0
        psrlw mm1,2; //mm1=a?>>2 nuke out lower 2 bits
        movq mm7,MASKSHIFTG; //g3: mm7=shift 1 bit green mask
        psrlw mm4,1; //g3a: move src green down by 1 so that we won't overflow
        movq mm0,mm1; //mm0=00a3 00a2 00a1 00a0
        psrlw mm5,1; //g3b: move dst green down by 1 so that we won't overflow
        psrlw mm1,1; //mm1=a?>>1 nuke out lower 1 bits
        pand mm4,mm7; //g5: mm4=sg3 sg2 sg1 sg0
        movq mm2,SIXONES;//g4: mm2=63
        pand mm5,mm7; //g7: mm5=dg3 dg2 dg1 dg0
        movq mm3,[esi]; //b1: mm3=src3 src2 src1 src0
        psubsb mm2,mm0; //g6: mm2=63-a3 63-a2 63-a1 63-a0
        movq mm7,MASKB; //b2: mm7=BLUE MASK
        pmullw mm4,mm0; //g8: mm4=sg?*a?
        movq mm0,[edi]; //b3: mm0=dst3 dst2 dst1 dst0
        pmullw mm5,mm2; //g9: mm5=dg?*(1-a?)
        movq mm2,mm7; //b4: mm2=fiveones
        pand mm3,mm7; //b4: mm3=sb3 sb2 sb1 sb0
        pmullw mm3,mm1; //b6: mm3=sb?*a?
        pand mm0,mm7; //b5: mm0=db3 db2 db1 db0
        movq mm7,[esi]; //r1: mm7=src3 src2 src1 src0
        paddw mm4,mm5; //g10: mm4=sg?*a?+dg?*(1-a?)
        pand mm7,MASKR; //r2: mm7=sr3 sr2 sr1 sr0
        psubsb mm2,mm1; //b5a: mm2=31-a3 31-a2 31-a1 31-a0
        paddw mm4,FIVETWELVE; //g11: mm4=(i+512) green
        pmullw mm0,mm2; //b7: mm0=db?*(1-a?)
        movq mm5,mm4; //g12: mm5=(i+512) green
        psrlw mm7,11; //r4: shift src red down to position 0
        psrlw mm4,6; //g13: mm4=(i+512)>>6
        paddw mm4,mm5; //g14: mm4=(i+512)+((i+512)>>6) green
        paddw mm0,mm3; //b8: mm0=sb?*a?+db?*(1-a?)
        movq mm5,[edi]; //r3: mm5=dst3 dst2 dst1 dst0
        paddw mm0,SIXTEEN; //b9: mm0=(i+16) blue
        pand mm5,MASKR; //r5: mm5=dr3 dr2 dr1 dr0
        psrlw mm4,5; //g15: mm4=0?g0 0?g0 0?g0 0?g0 green
        movq mm3,mm0; //b10: mm3=(i+16) blue
        psrlw mm0,5; //b11: mm0=(i+16)>>5 blue
        psrlw mm5,11; //r6: shift dst red down to position 0
        paddw mm0,mm3; //b12: mm0=(i+16)+(i+16)>>5 blue
        psrlw mm0,5; //b13: mm0=000b 000b 000b 000b blue
        pmullw mm7,mm1; //mm7=sr?*a?
        pand mm4,MASKG; //g16: mm4=00g0 00g0 00g0 00g0 green
        pmullw mm5,mm2; //r7: mm5=dr?*(31-a?)
        por mm0,mm4; //mm0=00gb 00gb 00gb 00gb
        add eax,4; //move to next 4 alphas
        //stall
        paddw mm5,mm7; //r8: mm5=sr?*a?+dr?*(31-a?)
        paddw mm5,SIXTEEN; //r9: mm5=(i+16) red
        movq mm7,mm5; //r10: mm7=(i+16) red
        psrlw mm5,5; //r11: mm5=(i+16)>>5 red
        paddw mm5,mm7; //r12: mm5=(i+16)+((i+16)>>5) red
        psrlw mm5,5; //r13: mm5=(i+16)+((i+16)>>5)>>5 red
        psllw mm5,11; //r14: mm5=mm5<<11 red
        por mm0,mm5; //mm0=0rgb 0rgb 0rgb 0rgb
        test ebx,2; //check if there are 2 pixels
        jz oneendpixel; //goto one pixel if that's it
        movd [edi],mm0; //dst=0000 0000 0rgb 0rgb
        psrlq mm0,32; //mm0>>32
        add edi,4; //edi=edi+4
        sub ebx,2; //saved 2 pixels
        jz nextline; //all done goto next line
    oneendpixel: //work on last pixel
        movd edx,mm0; //edx=0rgb
        mov [edi],dx; //dst=0rgb
    nextline: //goto next line
        dec ecx; //nuke one line
        jz done; //all done
        mov eax,lpLinearAlpBp; //alpha
        mov esi,lpLinearSrcBp; //src
        mov edi,lpLinearDstBp; //dst
        add eax,iAlpPitch; //inc alpha ptr by 1 line
        add esi,iSrcPitch; //inc src ptr by 1 line
        add edi,iDstPitch; //inc dst ptr by 1 line
        mov lpLinearAlpBp,eax; //save new alpha base ptr
        mov ebx,iDstW; //ebx=span width to copy
        mov lpLinearSrcBp,esi; //save new src base ptr
        mov lpLinearDstBp,edi; //save new dst base ptr
        jmp primeloop; //start the next span
    done:
        emms
    }
}
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/10752019/viewspace-982698/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 最快的演算法alphablend彙編原始碼,Intel官方提供(轉)演算法原始碼Intel
- 彙編環境下的原始碼除錯原始碼除錯
- 16位Alpha混合的簡單演算法 (轉)演算法
- shellcode轉換成彙編程式碼
- 最快的程式碼,是不執行的程式碼
- 彙編跳轉指令
- c語言if語句是如何變成彙編程式碼的?C語言
- 彙編程式碼Helloworld
- 萬能java字串編碼轉換工具類Java字串編碼
- 【轉】MySQL原始碼編譯安裝MySql原始碼編譯
- 【轉】編譯Android系統原始碼和核心原始碼編譯Android原始碼
- DRF原始碼彙總原始碼
- Flutter Engine 編譯 —— 我是這樣讀原始碼的Flutter編譯原始碼
- №窮舉密碼演算法指要(原始碼) (轉)密碼演算法原始碼
- 64K色模式下的快速Alpha混合演算法(轉)模式演算法
- BOOT0的主要程式碼兼Unix下彙編小節(轉)boot
- 彙編debug程式跳轉指令的方法
- 編譯FFMPEG原始碼的指令碼編寫案例編譯原始碼指令碼
- CatFly【彙編程式碼還原】
- Delphi編碼標準——一般的原始碼格式規則 (轉)原始碼
- Expression BlendExpress
- 二進位制入門--動態跟蹤原始碼和反彙編程式碼原始碼
- 原始碼推薦:vb的GUID生成演算法 (轉)原始碼GUI演算法
- 反彙編器-javap.exe(轉)Java
- 彙編+qemu玩轉控制檯列印
- 原始碼閱讀系列彙總原始碼
- Win32彙編教程二 Win32彙編程式的結構和語法 (轉)Win32
- GCC內聯彙編(2)GCC生成彙編程式碼簡單例項GC單例
- PHP原始碼包編譯安裝錯誤及解決方法彙總PHP原始碼編譯
- 經典作業系統教材中的LRU演算法的自編c++實現及原始碼。 (轉)作業系統演算法C++原始碼
- openssl-3.0.0-alpha9編譯編譯
- 從原始檔到可執行檔案:原始檔的預處理、編譯、彙編、連結編譯
- 【字元編碼】字元編碼 && Base64編碼演算法字元演算法
- AOSP 原始碼整編單編原始碼
- iOS彙編入門教程(二)在Xcode工程中嵌入彙編程式碼iOSXCode
- Webpack外掛是如何編寫的——prerender-spa-plugin原始碼解析WebPlugin原始碼
- 計算機執行彙編程式碼的原理計算機
- 【資料結構與演算法】Huffman樹&&Huffman編碼(附完整原始碼)資料結構演算法原始碼