為Vertica資料庫增加自定義函式to_base64和from_base64

卡卡西村長發表於2021-04-08

Vertica提供了UDx的機制,用來擴充套件自定義函式,本文演示了在公司專案中的真實案例。

 

------------1、編寫自定義函式類---------------

自定義函式的Java類,依賴VerticaSDK.jar包,位於伺服器上/opt/vertica/bin/VerticaSDK.jar,下載到本地,讓專案依賴這個jar包。

然後就可以實現to_base64和from_base64這兩個函式了。

package tebon.vertica;

import com.vertica.sdk.*;

import java.util.TimeZone;

/**
 * 自定義vertica函式,tebon_to_base64,把指定的字串轉換為base64格式。
 *
 * @author zhanglei
 */
public class ToBase64Function extends ScalarFunctionFactory {
    public ScalarFunction createScalarFunction(ServerInterface serverInterface) {
        return new InternToBase64Function();
    }

    public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) {
        inputTypes.addVarchar();
        returnType.addVarchar();
    }
    
    @Override
    public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){
        returnType.addVarchar(argTypes.getColumnType(0).getStringLength());
    }

    public class InternToBase64Function extends ScalarFunction {
        public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation {
            TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai"));
            do {
                String inputString = argReader.getString(0);
                String returnString = null;
                if (inputString != null) {
                    try {
                        returnString = encodeToBase64(inputString.getBytes("utf-8"));
                    } catch (Exception ex) {
                        returnString = inputString;
                    }
                }
                resWriter.setString(returnString);
                resWriter.next();
            } while (argReader.next());
        }

        //Base64編碼表
        private final char[] BASE64CODE
                = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
                'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',};

        private final int HEX_255 = 0x0000ff;
        private final int HEX_16515072 = 0xfc0000;
        private final int HEX_258048 = 0x3f000;
        private final int HEX_4032 = 0xfc0;
        private final int HEX_63 = 0x3f;
        private final int NUMBER_TWO = 2;
        private final int NUMBER_THREE = 3;
        private final int NUMBER_FOUR = 4;
        private final int NUMBER_SIX = 6;
        private final int NUMBER_EIGHT = 8;
        private final int NUMBER_TWELVE = 12;
        private final int NUMBER_SIXTEEN = 16;
        private final int NUMBER_EIGHTEEN = 18;

        private String encodeToBase64(byte[] b) {
            if (b == null || b.length == 0) {
                return "";
            }

            // 按實際編碼後長度開闢記憶體,加快速度
            StringBuilder sb = new StringBuilder(((b.length - 1) / NUMBER_THREE) << NUMBER_TWO + NUMBER_FOUR);

            // 進行編碼
            int code = 0;
            for (int i = 0; i < b.length; i++) {
                code |= (b[i] << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT))
                        & (HEX_255 << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT));
                if (i % NUMBER_THREE == NUMBER_TWO || i == b.length - 1) {
                    sb.append(BASE64CODE[(code & HEX_16515072) >>> NUMBER_EIGHTEEN]);
                    sb.append(BASE64CODE[(code & HEX_258048) >>> NUMBER_TWELVE]);
                    sb.append(BASE64CODE[(code & HEX_4032) >>> NUMBER_SIX]);
                    sb.append(BASE64CODE[code & HEX_63]);
                    code = 0;
                }
            }

            // 對於長度非3的整數倍的位元組陣列,編碼前先補0,編碼後結尾處編碼用=代替,
            // =的個數和短缺的長度一致,以此來標識出資料實際長度
            if (b.length % NUMBER_THREE > 0) {
                sb.setCharAt(sb.length() - 1, '=');
            }
            if (b.length % NUMBER_THREE == 1) {
                sb.setCharAt(sb.length() - NUMBER_TWO, '=');
            }
            return sb.toString();
        }
    }
}
package tebon.vertica;

import com.vertica.sdk.*;

import java.util.TimeZone;

/**
 * 自定義vertica函式,tebon_from_base64,把指定的base64字串轉換為普通格式。
 *
 * @author zhanglei
 */
public class FromBase64Function extends ScalarFunctionFactory {
    public ScalarFunction createScalarFunction(ServerInterface serverInterface) {
        return new InternFromBase64Function();
    }

    public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) {
        inputTypes.addVarchar();
        returnType.addVarchar();
    }

    @Override
    public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){
        returnType.addVarchar(argTypes.getColumnType(0).getStringLength());
    }

    public class InternFromBase64Function extends ScalarFunction {
        public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation {
            TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai"));
            do {
                String inputString = argReader.getString(0);
                String returnString = null;
                if (inputString != null) {
                    try {
                        returnString = new String(decodeFromBase64(inputString), "utf-8");
                    } catch (Exception ex) {
                        returnString = inputString;
                    }
                }
                resWriter.setString(returnString);
                resWriter.next();
            } while (argReader.next());
        }

        //Base64解碼錶
        private final byte[] BASE64DECODE
                = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1,
                -1,
                -1, // 注意兩個63,為相容SMP,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, 63,
                -1,
                63, // “/”和“-”都翻譯成63。
                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, 0, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                12, 13,
                14, // 注意兩個0:
                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1,
                -1, // “A”和“=”都翻譯成0。
                -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
                -1, -1, -1, -1, -1,};

        private final int HEX_255 = 0x0000ff;
        private final int HEX_16711680 = 0xff0000;
        private final int HEX_65280 = 0x00ff00;
        private final int NUMBER_TWO = 2;
        private final int NUMBER_THREE = 3;
        private final int NUMBER_FOUR = 4;
        private final int NUMBER_SIX = 6;
        private final int NUMBER_EIGHT = 8;
        private final int NUMBER_TWELVE = 12;
        private final int NUMBER_SIXTEEN = 16;
        private final int NUMBER_EIGHTEEN = 18;

        private byte[] decodeFromBase64(String code) {
            if (code == null || code.length() <= 0) {
                return null;
            }

            code = code.replace("\r", "").replace("\n", "").replace(" ", "");

            int len = code.length();
            if (len % NUMBER_FOUR != 0) {
                throw new IllegalArgumentException("Base64 string length must be 4*n");
            }

            // 統計填充的等號個數
            int pad = 0;
            if (code.charAt(len - 1) == '=') {
                pad++;
            }
            if (code.charAt(len - NUMBER_TWO) == '=') {
                pad++;
            }

            // 根據填充等號的個數來計算實際資料長度
            int retLen = len / NUMBER_FOUR * NUMBER_THREE - pad;

            // 分配位元組陣列空間
            byte[] ret = new byte[retLen];

            // 查表解碼
            char ch1, ch2, ch3, ch4;
            int i;
            for (i = 0; i < len; i += NUMBER_FOUR) {
                int j = i / NUMBER_FOUR * NUMBER_THREE;
                ch1 = code.charAt(i);
                ch2 = code.charAt(i + 1);
                ch3 = code.charAt(i + NUMBER_TWO);
                ch4 = code.charAt(i + NUMBER_THREE);
                int tmp = (BASE64DECODE[ch1] << NUMBER_EIGHTEEN) | (BASE64DECODE[ch2] << NUMBER_TWELVE)
                        | (BASE64DECODE[ch3] << NUMBER_SIX) | (BASE64DECODE[ch4]);
                ret[j] = (byte) ((tmp & HEX_16711680) >> NUMBER_SIXTEEN);
                if (i < len - NUMBER_FOUR) {
                    ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT);
                    ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255));
                } else {
                    if (j + 1 < retLen) {
                        ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT);
                    }
                    if (j + NUMBER_TWO < retLen) {
                        ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255));
                    }
                }
            }
            return ret;
        }
    }
}

 

------------2、把函式類上傳到伺服器上---------------

確保java程式碼在本地是編譯透過的,然後上傳到伺服器上。

建議上傳到這個位置:/opt/vertica/sdk/   ,後面以此路徑為準。

 

------------3、配置java編譯器環境---------------

安裝Jave-devel。

透過如下命令檢視已經安裝的jdk版本:
rpm -qa|grep java

如果是Centos,一般情況下會列出兩個,例如:
java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64
java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64

透過如下命令解除安裝掉:
rpm -e --nodeps java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64
rpm -e --nodeps java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64

然後安裝新的java-devel:
yum -y install java-1.8.0-openjdk-devel.x86_64

測試一下:
java -version
javac -version

 

------------4、編譯原始碼---------------

cd /opt/vertica/sdk

javac -g -cp /opt/vertica/bin/VerticaSDK.jar /opt/vertica/sdk/BuildInfo.java -d .

javac -g -cp /opt/vertica/bin/VerticaSDK.jar FromBase64Function.java -d .

javac -g -cp /opt/vertica/bin/VerticaSDK.jar ToBase64Function.java -d .

jar cf TebonVerticaFunctions.jar ./tebon/vertica/*.class ./com/vertica/sdk/*.class

 

------------5、註冊類庫和函式---------------

用vsql登入vertica,執行如下SQL:

SELECT SET_CONFIG_PARAMETER('JavaBinaryForUDx','/usr/bin/java');

DROP
LIBRARY TebonVerticaFunctions CASCADE; CREATE LIBRARY TebonVerticaFunctions AS '/opt/vertica/sdk/TebonVerticaFunctions.jar' LANGUAGE 'JAVA'; CREATE FUNCTION tebon_to_base64 AS language 'java' NAME 'tebon.vertica.ToBase64Function' LIBRARY TebonVerticaFunctions; CREATE FUNCTION tebon_from_base64 AS language 'java' NAME 'tebon.vertica.FromBase64Function' LIBRARY TebonVerticaFunctions;

 

 

------------6、測試---------------

dbadmin=> select tebon_to_base64('hello123你好'), tebon_from_base64('aGVsbG8xMjPkvaDlpb0=');
tebon_to_base64 | tebon_from_base64
----------------------+-------------------
aGVsbG8xMjPkvaDlpb0= | hello123你好
(1 row)

 

到這裡就結束了。

如果java程式碼執行有問題,可以看錯誤日誌,位於這裡(記得把/home/dbadmin/bigdata換成你自己的資料位置):

/home/dbadmin/bigdata/v_bigdata_node0001_catalog/UDxLogs/UDxFencedProcessesJava.log

 

相關文章