最快的演算法alphablend彙編原始碼，Intel官方提供(轉)[@more@]

　　Intel官方網站有一個ablend_565的快速彙編演算法，理論上是把一塊32bit RGBA渲染到16bit的buffer上。我的機器是PIII 800，函式在system memory中進行，640*480的256級alpha blending可以達到100fps，我想可以滿足絕大部分的要求了。在這裡，我提供了這個演算法的應用，希望可以對大家有所幫助。

　　ablend_565函式，原始碼可以直接編譯使用，無需其他庫函式，感謝intel提供這麼好的東西。

　　首先，我提供一些本人編寫的把32bit tga檔案讀入pRGBABuffer的函式

　　檔案尺寸儲存在 width,height

　　//-----------------------------------------------------------------------

　　// Name: LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )

　　// Desc: 讀取32bit tga檔案到DWORD緩衝裡，返回其尺寸

　　// Time: 2002.06.22 00:36

　　// Author: RealRender

　　// Para:

　　// Return:

　　// Note: 這段程式碼來自directx 7.0 sample中的d3dtextr.cpp，我把他提取了出來

　　// 方便使用

　　//-----------------------------------------------------------------------

　　BOOL LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )

　　{

　　FILE* file = fopen( strPathname, "rb" );

　　if( NULL == file )

　　return false;

　　struct TargaHeader

　　{

　　BYTE IDLength;

　　BYTE ColormapType;

　　BYTE ImageType;

　　BYTE ColormapSpecification[5];

　　WORD XOrigin;

　　WORD YOrigin;

　　WORD ImageWidth;

　　WORD ImageHeight;

　　BYTE PixelDepth;

　　BYTE ImageDescriptor;

　　} tga;

　　fread( &tga, sizeof(TargaHeader), 1, file );

　　// Only true color, non-mapped images are supported

　　if( ( 0 != tga.ColormapType ) ||

　　( tga.ImageType != 10 && tga.ImageType != 2 ) )

　　{

　　fclose( file );

　　return false;

　　}

　　// Skip the ID field. The first byte of the header is the length of this field

　　if( tga.IDLength )

　　fseek( file, tga.IDLength, SEEK_CUR );

　　DWORD m_dwWidth = tga.ImageWidth;

　　DWORD m_dwHeight = tga.ImageHeight;

　　DWORD m_dwBPP = tga.PixelDepth;

　　DWORD *m_pRGBAData = new DWORD[m_dwWidth*m_dwHeight];

　　if( m_pRGBAData == NULL )

　　{

　　fclose(file);

　　return false;

　　}

　　for( DWORD y=0; y　　{

　　DWORD dwOffset = y*m_dwWidth;

　　if( 0 == ( tga.ImageDescriptor & 0x0010 ) )

　　dwOffset = (m_dwHeight-y-1)*m_dwWidth;

　　for( DWORD x=0; x　　{

　　if( tga.ImageType == 10 )

　　{

　　BYTE PacketInfo = getc( file );

　　WORD PacketType = 0x80 & PacketInfo;

　　WORD PixelCount = ( 0x007f & PacketInfo ) + 1;

　　if( PacketType )

　　{

　　DWORD b = getc( file );

　　DWORD g = getc( file );

　　DWORD r = getc( file );

　　DWORD a = 0xff;

　　if( m_dwBPP == 32 )

　　a = getc( file );

　　while( PixelCount-- )

　　{

　　m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);

　　x++;

　　}

　　else

　　{

　　while( PixelCount-- )

　　{

　　BYTE b = getc( file );

　　BYTE g = getc( file );

　　BYTE r = getc( file );

　　BYTE a = 0xff;

　　if( m_dwBPP == 32 )

　　a = getc( file );

　　m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);

　　x++;

　　}

　　else

　　{

　　BYTE b = getc( file );

　　BYTE g = getc( file );

　　BYTE r = getc( file );

　　BYTE a = 0xff;

　　if( m_dwBPP == 32 )

　　a = getc( file );

　　m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);

　　x++;

　　}

　　fclose( file );

　　// Check for alpha content

　　for( DWORD i=0; i　　{

　　if( m_pRGBAData[i] & 0x000000ff != 0xff )

　　{

　　//m_bHasAlpha = TRUE;

　　break;

　　}

　　*pRGBABuffer = m_pRGBAData;

　　*width = m_dwWidth;

　　*height = m_dwHeight;

　　return true;

　　}

　　把32bit buffer分割為rgb和alpha的程式碼。

　　注意，分割後的pBitmap一定要是8位元組對齊，這是最佳化的一個重要條件，所以，我的演算法中：

　　BYTE* p = new BYTE[lSize*2+8];

　　BYTE* pOrig = p; // pointer returned by new[]; this is the one to pass to delete[]

　　p += (8 - (size_t)p%8) % 8; // advance to the next 8-byte boundary (0 to 7 bytes)

　　WORD* color = (WORD*)p;

　　這是不規範的寫法，把指標強行改變為8位元組對齊。實際使用的時候，要記住釋放的原始指標不是p，而是pOrig。在這裡，我沒有釋放分配的記憶體，請諒解。

　　//-----------------------------------------------------------------------

　　// Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )

　　// Desc:

　　// Time: 2002.06.22 00:36

　　// Author: RealRender

　　// Para:

　　// Return:

　　// Note: 把從32bit的緩衝建立16bit的565緩衝和8bit的alpha通道

　　//-----------------------------------------------------------------------

　　void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )

　　{

　　long lSize = lWidth*lHeight;

　　BYTE* alpha = new BYTE[lSize];

　　BYTE* p = new BYTE[lSize*2+8];

　　// 強行轉換為8位元組對齊

　　p += (DWORD)p%8;

　　WORD* color = (WORD*)p;

　　DWORD dwPixel;

　　DWORD r, g, b, a;

　　for( int i = 0; i < lSize; i++ )

　　{

　　dwPixel = pRGBABuffer[i];

　　r = ((dwPixel>>24)&0x000000ff);

　　g = ((dwPixel>>16)&0x000000ff);

　　b = ((dwPixel>> 8)&0x000000ff);

　　a = ((dwPixel>> 0)&0x000000ff);

　　alpha[i] = a;

　　// 888i轉化為565

　　color[i] = RGBTo16( r, g, b );

　　}

　　*pAlpha = alpha;

　　*pBitmap = color;

　　}

　　//

　　這個是intel官方提供的函式。函式的描述，用我的話來說，就是把一個帶有256級alpha通道的565顏色資料繪製到16位目標頁面。

　　函式說明：

　　unsigned char *lpAlpha, // 256 級alpha通道

　　unsigned int iAlpPitch, // alpha通道的pitch

　　unsigned char *lpSrc, // 原色彩緩衝

　　unsigned int iSrcX, //

　　unsigned int iSrcY, // 原色彩位置

　　unsigned int iSrcPitch, // 原色彩pitch

　　unsigned char *lpDst, // 目標緩衝

　　unsigned int iDstX,

　　unsigned int iDstY, // 目標位置

　　unsigned int iDstW,

　　unsigned int iDstH, // 目標緩衝的尺寸

　　unsigned int iDstPitch // 目標緩衝的pitch

　　void ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,

　　unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,

　　unsigned int iSrcPitch, unsigned char *lpDst,

　　unsigned int iDstX, unsigned int iDstY,

　　unsigned int iDstW, unsigned int iDstH,

　　unsigned int iDstPitch)

　　{

　　//Mask for isolating the red,green, and blue components

　　static __int64 MASKB=0x001F001F001F001F;

　　static __int64 MASKG=0x07E007E007E007E0;

　　static __int64 MASKSHIFTG=0x03F003F003F003F0;

　　static __int64 MASKR=0xF800F800F800F800;

　　//constants used by the integer alpha blending equation

　　static __int64 SIXTEEN=0x0010001000100010;

　　static __int64 FIVETWELVE=0x0200020002000200;

　　static __int64 SIXONES=0x003F003F003F003F;

　　unsigned char *lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst; //base pointer for linear destination

　　unsigned char *lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc; //base pointer for linear source

　　unsigned char *lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha; //base pointer for linear alpha

　　_asm{

　　mov esi,lpLinearSrcBp; //src

　　mov edi,lpLinearDstBp; //dst

　　mov eax,lpLinearAlpBp; //alpha

　　mov ecx,iDstH; //ecx=number of lines to copy

　　mov ebx,iDstW; //ebx=span width to copy

　　test esi,6; //check if source address is qword aligned

　　//since addr coming in is always word aligned(16bit)

　　jnz done; //if not qword

最快的演算法alphablend彙編原始碼，Intel官方提供(轉)

相關文章