最近打算做一個基於字串首字元(漢字取拼音的首字母)進行查詢的電話簿WEB專案,由於Web本身並不支援漢字的編碼程式設計,因此需要求助於平臺開發工具。
Google上搜到一種實現方法,是用C++實現的。以下是原始碼:
void GetFirstLetter(CString strName, CString& strFirstLetter)
{
TBYTE ucHigh, ucLow;
int nCode;
CString strRet;
strFirstLetter.Empty();
for (int i=0; i<strName.GetLength(); i++)
{
if ( (TBYTE)strName[i] < 0x80 )
continue;
ucHigh = (TBYTE)strName[i];
ucLow = (TBYTE)strName[i+1];
if ( ucHigh < 0xa1 || ucLow < 0xa1)
continue;
else
// Treat code by section-position as an int type parameter,
// so make following change to nCode.
nCode = (ucHigh - 0xa0) * 100 + ucLow - 0xa0;
FirstLetter(nCode, strRet);
strFirstLetter += strRet;
i++;
}
}
void FirstLetter(int nCode, CString& strLetter)
{
if(nCode >= 1601 && nCode < 1637) strLetter = _T("A");
if(nCode >= 1637 && nCode < 1833) strLetter = _T("B");
if(nCode >= 1833 && nCode < 2078) strLetter = _T("C");
if(nCode >= 2078 && nCode < 2274) strLetter = _T("D");
if(nCode >= 2274 && nCode < 2302) strLetter = _T("E");
if(nCode >= 2302 && nCode < 2433) strLetter = _T("F");
if(nCode >= 2433 && nCode < 2594) strLetter = _T("G");
if(nCode >= 2594 && nCode < 2787) strLetter = _T("H");
if(nCode >= 2787 && nCode < 3106) strLetter = _T("J");
if(nCode >= 3106 && nCode < 3212) strLetter = _T("K");
if(nCode >= 3212 && nCode < 3472) strLetter = _T("L");
if(nCode >= 3472 && nCode < 3635) strLetter = _T("M");
if(nCode >= 3635 && nCode < 3722) strLetter = _T("N");
if(nCode >= 3722 && nCode < 3730) strLetter = _T("O");
if(nCode >= 3730 && nCode < 3858) strLetter = _T("P");
if(nCode >= 3858 && nCode < 4027) strLetter = _T("Q");
if(nCode >= 4027 && nCode < 4086) strLetter = _T("R");
if(nCode >= 4086 && nCode < 4390) strLetter = _T("S");
if(nCode >= 4390 && nCode < 4558) strLetter = _T("T");
if(nCode >= 4558 && nCode < 4684) strLetter = _T("W");
if(nCode >= 4684 && nCode < 4925) strLetter = _T("X");
if(nCode >= 4925 && nCode < 5249) strLetter = _T("Y");
if(nCode >= 5249 && nCode < 5590) strLetter = _T("Z");
}
How to use:
// Example: collect the pinyin initials of a name that mixes ASCII and hanzi.
CString strName, strRes;
strName = _T("A李小三");
GetFirstLetter(strName, strRes);
// strRes is now "LXS": the ASCII 'A' is skipped and each hanzi contributes
// its pinyin initial.
很快應用到專案中,但很快發現,上述方法竟然不能識別一些漢字。究其原因,上述程式碼依據的是「漢字編碼按拼音排序」這一規律,而這只對GB2312的一級漢字(16–55區)成立;GB2312的二級漢字(56–87區)按部首筆畫排序,GBK大字符集擴充的漢字也不按拼音排序,對這些字上述方法就無能為力了。
再次搜尋和嘗試了好久,有一種解決辦法似乎可行。該方法把所有的漢字列在一個大陣列中,然後每次都是通過在陣列中迴圈比較的方法得出所在的拼音字母。其效率實在太低了。
難道就真的沒有辦法了嗎?終於在論壇裡找到一種完美的解決方案,是在C++Builder中實現的。其程式碼很簡潔,但實現的原理卻讓人難以理解。我修改了一下,在VC 6.0中除錯通過。
// Replaces every GB2312 hanzi in strName with the upper-case initial of its
// pinyin and returns the resulting string.
//
// strName - NUL-terminated input; a null pointer yields an empty result. The
//           byte-wise scan assumes an MBCS (non-Unicode) build in which each
//           TCHAR is one GB2312/GBK byte.
//
// Handling per character:
//   * section-position code 1601..5589: the letter is found from the range
//     boundaries in li_SecPosValue;
//   * sections 56..87: the letter is read from ls_SecondSecTable, indexed by
//     (section - 56) * 94 + (position - 1); the table content (including its
//     '[' entries) is emitted as-is;
//   * anything else (ASCII, other double-byte characters) is copied through
//     unchanged.
CString CWebEventsApp::GetFirstLetter(LPCTSTR strName)
{
    // First section-position code of each letter's range, ascending.
    static const int li_SecPosValue[] = {
        1601, 1637, 1833, 2078, 2274, 2302, 2433, 2594, 2787, 3106, 3212,
        3472, 3635, 3722, 3730, 3858, 4027, 4086, 4390, 4558, 4684, 4925, 5249
    };
    // Letter for the range starting at li_SecPosValue[same index].
    static const char* const lc_FirstLetter[] = {
        "A", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O",
        "P", "Q", "R", "S", "T", "W", "X", "Y", "Z"
    };
    // One letter per code position of sections 56..87, in code order.
    static const char ls_SecondSecTable[] =
        "CJWGNSPGCGNE[Y[BTYYZDXYKYGT[JNNJQMBSGZSCYJSYY[PGKBZGY[YWJKGKLJYWKPJQHY[W[DZLSGMRYPYWWCCKZNKYYGTTNJJNYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJTXDTTSQZYCDXXHGCK[PHFFSS[YBGXLPPBYLL[HLXS[ZM[JHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZC[J[WQJBYZPXGZNZCPWHKXHQKMWFBPBYDTJZZKQHY"
        "LYGXFPTYJYYZPSZLFCHMQSHGMXXSXJ[[DCSBBQBEFSJYHXWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBNPTSDKDYLHGYMYLCXPY[JNDQJWXQXFYYFJLEJPZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQYRBCJTHZTQFRXQHXMJJCJLXQGJMSHZKBSWYEMYLTXFSYDSWLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCLQKXLHXJRZJMFQHXHWY"
        "WSBHTRXXGLHQHFNM[YKLDYXZPYLGG[MTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZHSXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCSYDBDLLSCDDNLKJYKJSYCJLKWHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPZJOQLCDHJJYSPRCHN[NNLHLYYQYHWZPTCZGWWMZFFJQQQQYXACLBHKDJXDGMMY"
        "DJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYD[FCXYHLXCHYZJQ[[QAGMNYXPFRKSSBJLYXYSYGLNSCMHZWWMNZJJLXXHCHSY[[TTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLY[DCCWZOCWKCCSBNHCPDYZNFCYYTYCKXKYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQH[PQ[QSCFYMNDMGBWHWLGSLLYSDLMLXPTHMJHWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZP"
        "MGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCLSLDCLRPBHZHXYYFHB[GDMYCNQQWLQHJJ[YWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSL[HTZKZJECXJCJNMFBY[SFYWYBJZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJX[YZGWKYPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJB"
        "PMLMSXLZJQQHZYJCZYDJWBMYKLDDPMJEGXYHYLXHLQYQHKYCWCJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHGJLJAXFGFJZSLCFDQZLCLGJDJCSNZLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNWCZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTX"
        "YXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSBGBMMCJSSCLPQPDXCDYYKY[CJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSSLLYXQSXSBSJSBBSGGHFJLYPMZJNLYYWDQSHZXTYYWHMZYHYWDBXBTLMSYYYFSXJC[DXXLHJHF[SXZQHFZMZCZTQCXZXRTTDJHNNYZQQMNQDMMG[YDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGS"
        "QQDDJCMBKZFFXMKDMDSYYSZCMLJDSYNSBRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJ[GYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTDSSXTBNPDKLEYCJNYDDYKZDDHQHSDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKRZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLG[DJLSLYGKDZPZXJ"
        "YYZLWCXSZFGWYYDLYHCLJSCMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJY[CNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCJLZCSHLTOLJNMDDDLNGKAQHQHJGYKHEZNMSHRP[QQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMMMYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYYDYJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZK"
        "HHGFLBCSMDLDGDZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYGCTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZGSZZQLYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNNSZFDEQ"
        "FHBSAQTGYLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ";

    // Usable entries in ls_SecondSecTable: the nominal 3008 (32 sections of 94
    // positions), capped by the real literal length so a shortened table can
    // never be indexed past its end.
    const int nTableChars = static_cast<int>(sizeof(ls_SecondSecTable)) - 1;
    const int nSecondSecCount = (nTableChars < 3008) ? nTableChars : 3008;

    CString result;
    if (!strName)
        return result;

    int H, L, W;
    UINT i, stringlen = _tcslen(strName);
    int j;
    for (i = 0; i < stringlen; i++) {
        H = (UCHAR) (strName[i + 0]);
        // For the last character this reads the terminating NUL (L == 0).
        L = (UCHAR) (strName[i + 1]);

        if (H < 0xA1 || L < 0xA1) {
            // Not a GB2312 code. If the pair still forms a GBK double-byte
            // character (lead 0x81..0xFE, trail 0x40..0xFE except 0x7F), copy
            // both bytes together so the trail byte is never re-read as a lead
            // byte; otherwise copy the single byte.
            const bool bLead = (H >= 0x81 && H <= 0xFE);
            const bool bTrail = (L >= 0x40 && L <= 0xFE && L != 0x7F);
            result += strName[i];
            if (bLead && bTrail) {
                result += strName[i + 1];
                i++;
            }
            continue;
        }

        // Section-position code packed as section * 100 + position.
        W = (H - 160) * 100 + L - 160;

        if (W > 1600 && W < 5590) {
            // Range lookup: the highest boundary not above W identifies the
            // letter. W >= 1601 guarantees a match at j == 0 at the latest.
            for (j = 22; j >= 0; j--) {
                if (W >= li_SecPosValue[j]) {
                    result += lc_FirstLetter[j];
                    i++;    // consume the trail byte
                    break;
                }
            }
            continue;
        }

        i++;    // consume the trail byte
        // Zero-based index into the per-character table for sections 56..87.
        W = (H - 160 - 56) * 94 + L - 161;
        if (W >= 0 && W < nSecondSecCount)
            result += ls_SecondSecTable[W];
        else {
            // Double-byte code outside both tables: keep the original bytes.
            result += (char) H;
            result += (char) L;
        }
    }
    return result;
}
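上述程式碼的特殊之處在於,除了一級漢字的區間判斷之外,它還使用了一個二級漢字的拼音首字母對照表(嚴格來說是按下標直接查詢的表,而非雜湊表)。GB2312的二級漢字(56–87區,每區94個碼位)不按拼音排序,無法用區間判斷,因此該表按區位順序逐字列出首字母,程式以 (區號-56)×94+(位號-1) 為下標直接取值。表中的「[」可能是首字母未確定的碼位的佔位符;不在GB2312範圍內的GBK擴充漢字則會原樣輸出,並不會被轉換。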
本文來自CSDN部落格,轉載請標明出處:http://blog.csdn.net/blackoto/archive/2009/08/10/4430983.aspx