Lucene, Step One: Lucene Basics and Building an Index
Adapted from: http://blog.csdn.net/wxwzy738/article/details/8799184
1. Project structure
2. Field options when building the index:
Field.Store.YES or NO (the store option)
YES stores the field's full content in the index files, so the original text can be retrieved later (e.g. via doc.get).
NO does not store the field's content; the field can still be indexed, but its value cannot be recovered from the index (doc.get returns null).
Field.Index (the index option)
Index.ANALYZED: analyze (tokenize) and index the field; suitable for titles, body text, etc.
Index.NOT_ANALYZED: index the field without analyzing it; suitable for exact-match fields such as ID numbers and names.
Index.ANALYZED_NO_NORMS: analyze the field but do not store norms; norms hold index-time scoring information such as the field boost and length normalization factor, which are used when ranking results.
Index.NOT_ANALYZED_NO_NORMS: neither analyze the field nor store norms.
Index.NO: do not index the field at all.
The sketch below shows these options in code.
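A minimal sketch, using the same Lucene 3.5 Field API as the class in section 3, of how the store and index options combine on a single Document (the field names "title", "sku", "body" and "path" are made up for illustration):
- Document doc = new Document();
- //tokenized and indexed; the original text is stored and can be recovered with doc.get("title")
- doc.add(new Field("title", "Lucene in Action", Field.Store.YES, Field.Index.ANALYZED));
- //exact-match key: indexed as a single term, not analyzed, and no norms are stored
- doc.add(new Field("sku", "A-1234", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
- //searchable full text that is not stored, so doc.get("body") returns null
- doc.add(new Field("body", "full text of the article", Field.Store.NO, Field.Index.ANALYZED));
- //stored only, never indexed: can be read back but cannot be searched
- doc.add(new Field("path", "d:/docs/a.txt", Field.Store.YES, Field.Index.NO));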
3. A class for index create/read/update/delete (CRUD) operations
- package org.itat.index;
- import java.io.IOException;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.NumericField;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.StaleReaderException;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.lucene.util.Version;
- public class IndexUtil {
- private String[] ids = {"1","2","3","4","5","6"};
- private String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- private String[] contents = {
- "welcome to visited the space,I like book",
- "hello boy, I like pingpeng ball",
- "my name is cc I like game",
- "I like football",
- "I like football and I like basketball too",
- "I like movie and swim"
- };
- private Date[] dates = null;
- private int[] attachs = {2,3,1,4,5,5};
- private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};
- private Directory directory = null;
- private Map<String,Float> scores = new HashMap<String,Float>();
- private static IndexReader reader = null;
- public IndexUtil() {
- try {
- setDates();
- scores.put("itat.org",2.0f);
- scores.put("zttc.edu", 1.5f);
- //directory = FSDirectory.open(new File("d:/lucene/index02"));
- directory = new RAMDirectory();
- index();
- reader = IndexReader.open(directory,false);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- /**
- * Repeatedly calling IndexReader.open is expensive, so normally a single IndexReader is
- * opened for the whole lifetime of the application and different IndexSearchers are built
- * from it. When the reader is shared like this (e.g. as a singleton), one issue arises:
- * 1. after the index has been modified through an IndexWriter, the shared reader does not
- * see the changes, so IndexReader.openIfChanged must be used to reopen it; and if the
- * IndexWriter has not been closed, its changes only become visible after a commit.
- * @return an IndexSearcher built on the (possibly reopened) shared reader
- */
- public IndexSearcher getSearcher() {
- try {
- if(reader==null) {
- reader = IndexReader.open(directory,false);
- } else {
- IndexReader tr = IndexReader.openIfChanged(reader);
- //openIfChanged returns null if the underlying index has not changed
- //if it has changed, the new reader replaces the old one
- if(tr!=null) {
- reader.close();
- reader = tr;
- }
- }
- return new IndexSearcher(reader);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
- private void setDates() {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- try {
- dates = new Date[ids.length];
- dates[0] = sdf.parse("2010-02-19");
- dates[1] = sdf.parse("2012-01-11");
- dates[2] = sdf.parse("2011-09-19");
- dates[3] = sdf.parse("2010-12-22");
- dates[4] = sdf.parse("2012-01-01");
- dates[5] = sdf.parse("2011-05-19");
- } catch (ParseException e) {
- e.printStackTrace();
- }
- }
- /**
- * Restore documents that were previously deleted (they remain in the index "recycle bin" until merged away).
- */
- public void undelete() {
- //the restore is performed through an IndexReader
- try {
- IndexReader reader = IndexReader.open(directory,false);
- //the reader must be opened with readOnly=false, otherwise undeleteAll is not allowed
- reader.undeleteAll();
- reader.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (StaleReaderException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- /**
- * forceMerge replaced the old optimize method as of Lucene 3.5; it is essentially a rename.
- * The rename (and the recommendation not to call it) reflects how expensive the operation is:
- * a full merge rewrites the whole index, which puts a heavy load on the system, and it also
- * permanently removes the data of documents sitting in the deletion "recycle bin".
- * Lucene already merges segments automatically as they accumulate during indexing;
- * forceMerge simply forces that merge to happen immediately.
- */
- public void merge() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //merge the index down to 2 segments; deleted documents in those segments are purged
- //note: since Lucene 3.5 this is discouraged because it is very expensive;
- //Lucene decides on its own when segments should be merged
- writer.forceMerge(2);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * To force the documents in the deletion "recycle bin" to be expunged, call
- * writer.forceMergeDeletes(). This is not recommended because it is expensive;
- * Lucene reclaims the space of deleted documents on its own as segments are merged.
- */
- public void forceDelete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- writer.forceMergeDeletes();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * Delete documents from the index. By default they are not removed physically; they are placed in the index "recycle bin".
- */
- public void delete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //the argument can be a Query or a Term; a Term matches documents by an exact value
- //the matched documents are not removed physically; they go into the recycle bin and can still be restored
- //after this runs, a file such as _0_1.del appears in the index directory, recording which documents were deleted
- writer.deleteDocuments(new Term("id","1"));
- writer.commit();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * Delete through the IndexReader (internally this still goes through a writer).
- * The advantage is that the shared reader reflects the deletion immediately.
- * In practice deletions are usually done with IndexWriter, since the reader-based
- * deletion API is deprecated.
- */
- public void delete02() {
- try {
- reader.deleteDocuments(new Term("id","1"));
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- /**
- * Update operation.
- * Lucene has no in-place update; updateDocument is really the combination of two
- * operations: delete the old document, then add the new one.
- */
- public void update() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- Document doc = new Document();
- doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- writer.updateDocument(new Term("id","1"), doc);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void query() {
- try {
- IndexReader reader = IndexReader.open(directory);
- //the reader gives cheap access to the document counts
- System.out.println("numDocs:"+reader.numDocs());//number of live documents, excluding deleted ones
- System.out.println("maxDocs:"+reader.maxDoc());//total number of document slots, including those still in the recycle bin
- System.out.println("deleteDocs:"+reader.numDeletedDocs());//number of deleted documents
- reader.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- /**
- * Index file extensions: .fnm stores the field names and field infos;
- * .fdt and .fdx store the data of Store.YES fields (the stored field values);
- * .frq stores term frequencies (which terms occur how often in which documents);
- * .nrm stores the norms used for scoring;
- * .prx stores term positions/offsets;
- * .tii and .tis store the term dictionary (the indexed terms themselves).
- * A small sketch after this class shows how to list these files in an on-disk index.
- */
- public void index() {
- IndexWriter writer = null;
- try {
- //since version 2.9 the Lucene index format is no longer fully compatible across releases, so the version must be stated explicitly
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();//clear the existing index
- Document doc = null;
- for(int i=0;i<ids.length;i++) {
- doc = new Document();
- doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("email","test"+i+"@test.com",Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //store a numeric value
- //(NumberTools.stringToLong("") has been deprecated; NumericField is used instead)
- doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //store the date as a long timestamp
- doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- String et = emails[i].substring(emails[i].lastIndexOf("@")+1);
- System.out.println(et);
- if(scores.containsKey(et)) {
- doc.setBoost(scores.get(et));
- } else {
- doc.setBoost(0.5f);//the default boost is 1.0f
- }
- writer.addDocument(doc);
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null)writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void search01() {
- try {
- IndexReader reader = IndexReader.open(directory);
- IndexSearcher searcher = new IndexSearcher(reader);
- TermQuery query = new TermQuery(new Term("email","test0@test.com"));
- TopDocs tds = searcher.search(query, 10);
- for(ScoreDoc sd:tds.scoreDocs) {
- Document doc = searcher.doc(sd.doc);
- System.out.println("("+sd.doc+"-"+doc.getBoost()+"-"+sd.score+")"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
- }
- reader.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void search02() {
- try {
- IndexSearcher searcher = getSearcher();
- TermQuery query = new TermQuery(new Term("content","like"));
- TopDocs tds = searcher.search(query, 10);
- for(ScoreDoc sd:tds.scoreDocs) {
- Document doc = searcher.doc(sd.doc);
- System.out.println(doc.get("id")+"---->"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
- }
- searcher.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
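The file extensions described in the comment above index() only appear when the index is written to disk rather than to the RAMDirectory used in the constructor. A minimal sketch of listing them, assuming an on-disk index at the example path d:/lucene/index02 (both the class name ListIndexFiles and the path are illustrative, mirroring the commented-out FSDirectory line in the constructor):
- package org.itat.index;
- import java.io.File;
- import java.io.IOException;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- public class ListIndexFiles {
- public static void main(String[] args) throws IOException {
- //open the on-disk index directory (the path is only an example)
- Directory dir = FSDirectory.open(new File("d:/lucene/index02"));
- //prints entries such as _0.fnm, _0.fdt, _0.fdx, _0.frq, _0.nrm, _0.prx, _0.tii, _0.tis
- for (String fileName : dir.listAll()) {
- System.out.println(fileName);
- }
- dir.close();
- }
- }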
- package org.itat.test;
- import org.itat.index.IndexUtil;
- import org.junit.Test;
- public class TestIndex {
- @Test
- public void testIndex() {
- IndexUtil iu = new IndexUtil();
- iu.index();
- }
- @Test
- public void testQuery() {
- IndexUtil iu = new IndexUtil();
- iu.query();
- }
- @Test
- public void testDelete() {
- IndexUtil iu = new IndexUtil();
- iu.delete();
- }
- @Test
- public void testDelete02() {
- IndexUtil iu = new IndexUtil();
- iu.delete02();
- }
- @Test
- public void testUnDelete() {
- IndexUtil iu = new IndexUtil();
- iu.undelete();
- }
- @Test
- public void testForceDelete() {
- IndexUtil iu = new IndexUtil();
- iu.forceDelete();
- }
- @Test
- public void testMerge() {
- IndexUtil iu = new IndexUtil();
- iu.merge();
- }
- @Test
- public void testUpdate() {
- IndexUtil iu = new IndexUtil();
- iu.update();
- }
- @Test
- public void testSearch01() {
- IndexUtil iu = new IndexUtil();
- iu.search01();
- }
- @Test
- public void testSearch02() {
- IndexUtil iu = new IndexUtil();
- for(int i=0;i<5;i++) {
- iu.search02();
- System.out.println("-----------------------------");
- try {
- Thread.sleep(10000);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- }
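As a closing illustration of the commit-and-reopen behaviour described in the getSearcher() comment, the following hypothetical extra test method (not in the original post) could be added to TestIndex; it assumes IndexUtil exactly as listed above:
- @Test
- public void testReopenAfterDelete() {
- IndexUtil iu = new IndexUtil();
- //six documents are indexed in the IndexUtil constructor
- System.out.println("before delete: " + iu.getSearcher().getIndexReader().numDocs());
- //delete() marks the document with id "1" as deleted and commits the change
- iu.delete();
- //getSearcher() calls IndexReader.openIfChanged, so the reopened reader sees the committed deletion
- System.out.println("after delete: " + iu.getSearcher().getIndexReader().numDocs());
- }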