Vertica提供了UDx的機制,用來擴充套件自定義函式,本文演示了在公司專案中的真實案例。
------------1、編寫自定義函式類---------------
自定義函式的Java類,依賴VerticaSDK.jar包,位於伺服器上/opt/vertica/bin/VerticaSDK.jar,下載到本地,讓專案依賴這個jar包。
然後就可以實現to_base64和from_base64這兩個函式了。
package tebon.vertica; import com.vertica.sdk.*; import java.util.TimeZone; /** * 自定義vertica函式,tebon_to_base64,把指定的字串轉換為base64格式。 * * @author zhanglei */ public class ToBase64Function extends ScalarFunctionFactory { public ScalarFunction createScalarFunction(ServerInterface serverInterface) { return new InternToBase64Function(); } public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) { inputTypes.addVarchar(); returnType.addVarchar(); } @Override public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){ returnType.addVarchar(argTypes.getColumnType(0).getStringLength()); } public class InternToBase64Function extends ScalarFunction { public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation { TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai")); do { String inputString = argReader.getString(0); String returnString = null; if (inputString != null) { try { returnString = encodeToBase64(inputString.getBytes("utf-8")); } catch (Exception ex) { returnString = inputString; } } resWriter.setString(returnString); resWriter.next(); } while (argReader.next()); } //Base64編碼表 private final char[] BASE64CODE = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',}; private final int HEX_255 = 0x0000ff; private final int HEX_16515072 = 0xfc0000; private final int HEX_258048 = 0x3f000; private final int HEX_4032 = 0xfc0; private final int HEX_63 = 0x3f; private final int NUMBER_TWO = 2; private final int NUMBER_THREE = 3; private final int NUMBER_FOUR = 4; private final int NUMBER_SIX = 6; private final int NUMBER_EIGHT = 8; private final int NUMBER_TWELVE = 12; private final int NUMBER_SIXTEEN = 16; private final int NUMBER_EIGHTEEN = 18; private String encodeToBase64(byte[] b) { if (b == null || b.length == 0) { return ""; } // 按實際編碼後長度開闢記憶體,加快速度 StringBuilder sb = new StringBuilder(((b.length - 1) / NUMBER_THREE) << NUMBER_TWO + NUMBER_FOUR); // 進行編碼 int code = 0; for (int i = 0; i < b.length; i++) { code |= (b[i] << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT)) & (HEX_255 << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT)); if (i % NUMBER_THREE == NUMBER_TWO || i == b.length - 1) { sb.append(BASE64CODE[(code & HEX_16515072) >>> NUMBER_EIGHTEEN]); sb.append(BASE64CODE[(code & HEX_258048) >>> NUMBER_TWELVE]); sb.append(BASE64CODE[(code & HEX_4032) >>> NUMBER_SIX]); sb.append(BASE64CODE[code & HEX_63]); code = 0; } } // 對於長度非3的整數倍的位元組陣列,編碼前先補0,編碼後結尾處編碼用=代替, // =的個數和短缺的長度一致,以此來標識出資料實際長度 if (b.length % NUMBER_THREE > 0) { sb.setCharAt(sb.length() - 1, '='); } if (b.length % NUMBER_THREE == 1) { sb.setCharAt(sb.length() - NUMBER_TWO, '='); } return sb.toString(); } } }
package tebon.vertica; import com.vertica.sdk.*; import java.util.TimeZone; /** * 自定義vertica函式,tebon_from_base64,把指定的base64字串轉換為普通格式。 * * @author zhanglei */ public class FromBase64Function extends ScalarFunctionFactory { public ScalarFunction createScalarFunction(ServerInterface serverInterface) { return new InternFromBase64Function(); } public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) { inputTypes.addVarchar(); returnType.addVarchar(); } @Override public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){ returnType.addVarchar(argTypes.getColumnType(0).getStringLength()); } public class InternFromBase64Function extends ScalarFunction { public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation { TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai")); do { String inputString = argReader.getString(0); String returnString = null; if (inputString != null) { try { returnString = new String(decodeFromBase64(inputString), "utf-8"); } catch (Exception ex) { returnString = inputString; } } resWriter.setString(returnString); resWriter.next(); } while (argReader.next()); } //Base64解碼錶 private final byte[] BASE64DECODE = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 注意兩個63,為相容SMP, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, 63, -1, 63, // “/”和“-”都翻譯成63。 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, 0, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 注意兩個0: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, // “A”和“=”都翻譯成0。 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,}; private final int HEX_255 = 0x0000ff; private final int HEX_16711680 = 0xff0000; private final int HEX_65280 = 0x00ff00; private final int NUMBER_TWO = 2; private final int NUMBER_THREE = 3; private final int NUMBER_FOUR = 4; private final int NUMBER_SIX = 6; private final int NUMBER_EIGHT = 8; private final int NUMBER_TWELVE = 12; private final int NUMBER_SIXTEEN = 16; private final int NUMBER_EIGHTEEN = 18; private byte[] decodeFromBase64(String code) { if (code == null || code.length() <= 0) { return null; } code = code.replace("\r", "").replace("\n", "").replace(" ", ""); int len = code.length(); if (len % NUMBER_FOUR != 0) { throw new IllegalArgumentException("Base64 string length must be 4*n"); } // 統計填充的等號個數 int pad = 0; if (code.charAt(len - 1) == '=') { pad++; } if (code.charAt(len - NUMBER_TWO) == '=') { pad++; } // 根據填充等號的個數來計算實際資料長度 int retLen = len / NUMBER_FOUR * NUMBER_THREE - pad; // 分配位元組陣列空間 byte[] ret = new byte[retLen]; // 查表解碼 char ch1, ch2, ch3, ch4; int i; for (i = 0; i < len; i += NUMBER_FOUR) { int j = i / NUMBER_FOUR * NUMBER_THREE; ch1 = code.charAt(i); ch2 = code.charAt(i + 1); ch3 = code.charAt(i + NUMBER_TWO); ch4 = code.charAt(i + NUMBER_THREE); int tmp = (BASE64DECODE[ch1] << NUMBER_EIGHTEEN) | (BASE64DECODE[ch2] << NUMBER_TWELVE) | (BASE64DECODE[ch3] << NUMBER_SIX) | (BASE64DECODE[ch4]); ret[j] = (byte) ((tmp & HEX_16711680) >> NUMBER_SIXTEEN); if (i < len - NUMBER_FOUR) { ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT); ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255)); } else { if (j + 1 < retLen) { ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT); } if (j + NUMBER_TWO < retLen) { ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255)); } } } return ret; } } }
------------2、把函式類上傳到伺服器上---------------
確保java程式碼在本地是編譯透過的,然後上傳到伺服器上。
建議上傳到這個位置:/opt/vertica/sdk/ ,後面以此路徑為準。
------------3、配置java編譯器環境---------------
安裝Jave-devel。 透過如下命令檢視已經安裝的jdk版本: rpm -qa|grep java 如果是Centos,一般情況下會列出兩個,例如: java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64 java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64 透過如下命令解除安裝掉: rpm -e --nodeps java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64 rpm -e --nodeps java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64 然後安裝新的java-devel: yum -y install java-1.8.0-openjdk-devel.x86_64 測試一下: java -version javac -version
------------4、編譯原始碼---------------
cd /opt/vertica/sdk javac -g -cp /opt/vertica/bin/VerticaSDK.jar /opt/vertica/sdk/BuildInfo.java -d . javac -g -cp /opt/vertica/bin/VerticaSDK.jar FromBase64Function.java -d . javac -g -cp /opt/vertica/bin/VerticaSDK.jar ToBase64Function.java -d . jar cf TebonVerticaFunctions.jar ./tebon/vertica/*.class ./com/vertica/sdk/*.class
------------5、註冊類庫和函式---------------
用vsql登入vertica,執行如下SQL:
SELECT SET_CONFIG_PARAMETER('JavaBinaryForUDx','/usr/bin/java');
DROP LIBRARY TebonVerticaFunctions CASCADE; CREATE LIBRARY TebonVerticaFunctions AS '/opt/vertica/sdk/TebonVerticaFunctions.jar' LANGUAGE 'JAVA'; CREATE FUNCTION tebon_to_base64 AS language 'java' NAME 'tebon.vertica.ToBase64Function' LIBRARY TebonVerticaFunctions; CREATE FUNCTION tebon_from_base64 AS language 'java' NAME 'tebon.vertica.FromBase64Function' LIBRARY TebonVerticaFunctions;
------------6、測試---------------
dbadmin=> select tebon_to_base64('hello123你好'), tebon_from_base64('aGVsbG8xMjPkvaDlpb0=');
tebon_to_base64 | tebon_from_base64
----------------------+-------------------
aGVsbG8xMjPkvaDlpb0= | hello123你好
(1 row)
到這裡就結束了。
如果java程式碼執行有問題,可以看錯誤日誌,位於這裡(記得把/home/dbadmin/bigdata換成你自己的資料位置):
/home/dbadmin/bigdata/v_bigdata_node0001_catalog/UDxLogs/UDxFencedProcessesJava.log