0. Data preparation
1. Create the index
curl -X PUT -H 'Content-Type:application/json' -d '{"settings":{"index":{"number_of_shards":2,"number_of_replicas":0}},"mappings":{"properties":{"description":{"type":"text"},"name":{"type":"keyword"},"age":{"type":"integer"}}}}' localhost:9200/user
2. View index information
(base) xxx@58deMacBook-Pro business_scf_productservice % curl localhost:9200/_cat/indices?v
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open user uWw_V1ECRbSmZxyLF0TdBg 2 0 0 0 452b 452b
3. Insert data
curl -X POST -H 'Content-Type:application/json' -d '{"description":"this is a good boy","name":"zhangsan","age":20}' localhost:9200/user/_doc/
4. Query data
curl -X GET localhost:9200/user/_search
curl -X GET "localhost:9200/user/_search" -H 'Content-Type: application/json' -d'
{
"query": {
"match": {
"description": "good boy"
}
}
}'
1. When does ES invoke analysis?
1. Indexing
Text fields being written are analyzed synchronously, and the results are then stored.
Set a breakpoint at org.apache.lucene.analysis.standard.StandardTokenizer#incrementToken and inspect the call chain: the call comes in from org.apache.lucene.index.DefaultIndexingChain#processField.
2. Querying
The query input is analyzed synchronously; the resulting terms are then looked up in the term dictionary and the postings lists (inverted index).
Set a breakpoint at org.apache.lucene.analysis.standard.StandardTokenizer#incrementToken and inspect the call chain: the call comes in from org.elasticsearch.index.search.MatchQueryParser#parse. The query-side analysis can also be observed without a debugger, as shown below.
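As a quick check against the user index created above: the _analyze API accepts a field parameter, so it runs the same analyzer that a match query on that field would use.
curl -X GET "localhost:9200/user/_analyze" -H 'Content-Type: application/json' -d'
{
  "field": "description",
  "text": "good boy"
}'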
2. What does the analysis call return?
The result is an object containing the current token, its start offset, end offset, and other attributes, which ES uses to build the postings lists (inverted index).
1. Testing with curl
curl -X GET "localhost:9200/_analyze" -H 'Content-Type: application/json' -d'
{
"analyzer": "standard",
"text": "Hello, world! This is a test. 123@example.com! 我是中國人~"
}
'
--- Result
{
"tokens": [{
"token": "hello",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
}, {
"token": "world",
"start_offset": 7,
"end_offset": 12,
"type": "<ALPHANUM>",
"position": 1
}, {
"token": "this",
"start_offset": 14,
"end_offset": 18,
"type": "<ALPHANUM>",
"position": 2
}, {
"token": "is",
"start_offset": 19,
"end_offset": 21,
"type": "<ALPHANUM>",
"position": 3
}, {
"token": "a",
"start_offset": 22,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 4
}, {
"token": "test",
"start_offset": 24,
"end_offset": 28,
"type": "<ALPHANUM>",
"position": 5
}, {
"token": "123",
"start_offset": 30,
"end_offset": 33,
"type": "<NUM>",
"position": 6
}, {
"token": "example.com",
"start_offset": 34,
"end_offset": 45,
"type": "<ALPHANUM>",
"position": 7
}, {
"token": "我",
"start_offset": 47,
"end_offset": 48,
"type": "<IDEOGRAPHIC>",
"position": 8
}, {
"token": "是",
"start_offset": 48,
"end_offset": 49,
"type": "<IDEOGRAPHIC>",
"position": 9
}, {
"token": "中",
"start_offset": 49,
"end_offset": 50,
"type": "<IDEOGRAPHIC>",
"position": 10
}, {
"token": "國",
"start_offset": 50,
"end_offset": 51,
"type": "<IDEOGRAPHIC>",
"position": 11
}, {
"token": "人",
"start_offset": 51,
"end_offset": 52,
"type": "<IDEOGRAPHIC>",
"position": 12
}]
}
2. Testing in code
package qz.es;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class AnalyzerTest {

    public static void main(String[] args) throws Exception {
        String text = "Hello, world! This is a test. 123@example.com! 我是中國人~";
        // Prepare the input text as a StringReader
        StringReader reader = new StringReader(text);
        // Create a StandardTokenizer instance
        StandardTokenizer tokenizer = new StandardTokenizer();
        // Get the CharTermAttribute, used to read each token's text
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        // Start tokenizing
        tokenizer.setReader(reader);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            String token = termAtt.toString();
            System.out.println(token);
        }
        tokenizer.end();
        tokenizer.close();
        reader.close();
    }
}
--- Result
Hello
world
This
is
a
test
123
example.com
我
是
中
國
人
Inspecting the attributes of a single returned token:
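A minimal sketch along the lines of the test above (TokenAttributeTest is a name of mine; the attribute classes are standard Lucene): the offsets, type, and position the _analyze API returned come from OffsetAttribute, TypeAttribute, and PositionIncrementAttribute.
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.StringReader;

public class TokenAttributeTest {

    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        // Each attribute is a live view into the current token's state
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = tokenizer.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posIncAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

        tokenizer.setReader(new StringReader("Hello, world! 我是中國人~"));
        tokenizer.reset();
        int position = -1;
        while (tokenizer.incrementToken()) {
            // The absolute position is accumulated from the increments, as ES does
            position += posIncAtt.getPositionIncrement();
            System.out.printf("token=%s start_offset=%d end_offset=%d type=%s position=%d%n",
                    termAtt, offsetAtt.startOffset(), offsetAtt.endOffset(), typeAtt.type(), position);
        }
        tokenizer.end();
        tokenizer.close();
    }
}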
3. Writing your own analysis plugin
Outside of ES, only two components are needed: Analyzer and Tokenizer. Analyzer is the broader concept: it covers the whole text-analysis pipeline, including but not limited to tokenization (for example token filtering and other refinement steps). Tokenizer focuses on the tokenization step itself (see the sketch below).
Integrating with ES requires two more components: AbstractIndexAnalyzerProvider and AnalysisPlugin (in my tests it also works without an AbstractTokenizerFactory).
// Reference project: https://gitee.com/Qiao-Zhi/custom_analyzer_es_plugin
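To make the Analyzer/Tokenizer split concrete, a minimal pure-Lucene sketch (MyAnalyzer is a name of mine; assumes Lucene 7+, where LowerCaseFilter lives in org.apache.lucene.analysis): the Tokenizer does only the splitting, and the Analyzer chains the refinement steps on top of it.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class MyAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Tokenizer: the splitting step only
        Tokenizer source = new StandardTokenizer();
        // Analyzer: adds refinement steps (here, lowercasing) on top
        TokenStream filtered = new LowerCaseFilter(source);
        return new TokenStreamComponents(source, filtered);
    }
}
This also explains why the raw StandardTokenizer test above kept "Hello" and "This" capitalized while the _analyze call returned lowercase tokens: the standard analyzer adds a LowerCaseFilter on top of StandardTokenizer.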
4. How can your own analysis plugin delegate to another tokenizer (extending analysis)?
1. When org.apache.lucene.analysis.Tokenizer#reset is called, you can get hold of the text that needs to be analyzed, pass it to processAnalyzer(sb.toString()), and cache the analysis results in object fields:
private BufferedReader reader;

public void reset(Reader input) throws IOException {
    if (BufferedReader.class.isAssignableFrom(input.getClass())) {
        reader = (BufferedReader) input;
    } else {
        reader = new BufferedReader(input);
    }
    // Read the entire input into a string
    CharBuffer buffer = CharBuffer.allocate(256);
    StringBuilder sb = new StringBuilder();
    while (reader.read(buffer) != -1) {
        sb.append(buffer.flip());
        buffer.clear();
    }
    // Analyze the string to be tokenized and cache the resulting terms
    terms = null;
    processAnalyzer(sb.toString());
}
2. org.apache.lucene.analysis.TokenStream#incrementToken then iterates over the cached results and emits them one at a time:
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes(); // Lucene requires clearing the attributes before emitting each token
    CustomTerm customTerm = tokenizerAdapter.nextTerm();
    if (customTerm == null) {
        return false;
    }
    String word = customTerm.word;
    int offset = customTerm.offset;
    int endOffset = offset + word.length();
    termAtt.setEmpty().append(word);
    offsetAtt.setOffset(correctOffset(offset), correctOffset(endOffset));
    return true;
}
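Putting the two overrides together, here is a runnable sketch of this delegation pattern, under the assumption that the wrapped analyzer is a plain Lucene Analyzer (StandardAnalyzer stands in for the "other" plugin; DelegatingTokenizer and the Term holder are hypothetical names of mine, in place of the tokenizerAdapter/CustomTerm above).
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public final class DelegatingTokenizer extends Tokenizer {

    // Minimal holder for one cached term (mirrors CustomTerm above)
    private static final class Term {
        final String word;
        final int start;
        final int end;
        Term(String word, int start, int end) {
            this.word = word;
            this.start = start;
            this.end = end;
        }
    }

    // StandardAnalyzer stands in for the delegated-to analyzer
    private final Analyzer delegate = new StandardAnalyzer();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private Iterator<Term> terms;

    @Override
    public void reset() throws IOException {
        super.reset();
        // Read the whole field value from the protected 'input' reader
        StringBuilder sb = new StringBuilder();
        char[] buf = new char[256];
        for (int n; (n = input.read(buf)) != -1; ) {
            sb.append(buf, 0, n);
        }
        // Run it through the delegated analyzer and cache the tokens
        List<Term> cached = new ArrayList<>();
        try (TokenStream ts = delegate.tokenStream("", new StringReader(sb.toString()))) {
            CharTermAttribute t = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute o = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                cached.add(new Term(t.toString(), o.startOffset(), o.endOffset()));
            }
            ts.end();
        }
        terms = cached.iterator();
    }

    @Override
    public boolean incrementToken() {
        clearAttributes();
        if (terms == null || !terms.hasNext()) {
            return false;
        }
        // Replay one cached token into this tokenizer's attributes
        Term term = terms.next();
        termAtt.setEmpty().append(term.word);
        offsetAtt.setOffset(correctOffset(term.start), correctOffset(term.end));
        return true;
    }
}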