lucene3.3的简单例子

论坛 期权论坛 脚本     
匿名技术用户   2021-1-12 13:51   538   0
这里主要写一个lucene3.3的简单例子:
首先,当然是helloworld程序:
Java代码 收藏代码
  1. package com.lucene.demo;
  2. import java.io.File;
  3. import java.util.List;
  4. import org.apache.lucene.analysis.Analyzer;
  5. import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
  6. import org.apache.lucene.document.Document;
  7. import org.apache.lucene.index.IndexWriter;
  8. import org.apache.lucene.index.IndexWriterConfig;
  9. import org.apache.lucene.queryParser.MultiFieldQueryParser;
  10. import org.apache.lucene.queryParser.QueryParser;
  11. import org.apache.lucene.search.IndexSearcher;
  12. import org.apache.lucene.search.Query;
  13. import org.apache.lucene.store.Directory;
  14. import org.apache.lucene.store.FSDirectory;
  15. import org.apache.lucene.util.Version;
  16. import org.junit.Test;
  17. import com.lucene.model.SerachResult;
  18. import com.lucene.utils.FileToDocument;
  19. import com.lucene.utils.SerachUtil;
  20. public class HelloWorld {
  21. File indexPath=new File("F:\\java\\Workspaces\\Lucene\\luceneIndex");//存放索引文件目录
  22. Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_33);
  23. IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
  24. Document doc=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\IndexWriter.txt"));
  25. Document doc1=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\china.txt"));
  26. Document doc2=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\开玩笑.txt"));
  27. /**
  28. * 创建索引
  29. */
  30. @Test
  31. public void createIndex() throws Exception{
  32. conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);//总是重新创建
  33. Directory fsDir =FSDirectory.open(indexPath);
  34. long startTime =System.currentTimeMillis();
  35. IndexWriter indexWriter =new IndexWriter(fsDir, conf);
  36. indexWriter.addDocument(doc);
  37. indexWriter.addDocument(doc1);
  38. indexWriter.addDocument(doc2);
  39. indexWriter.optimize();
  40. indexWriter.close();
  41. long endTime=System.currentTimeMillis();
  42. System.out.println("共有"+indexWriter.numDocs()+"条索引");
  43. System.out.println("建立索引用时: " + (endTime-startTime)+"毫秒");
  44. }
  45. /**
  46. * 搜索
  47. */
  48. @Test
  49. public void serach() throws Exception {
  50. //1 把要搜素的文本解析为QUery
  51. String queryString ="开玩笑";
  52. String [] fields = {"name","content"};
  53. QueryParser queryParser =new MultiFieldQueryParser(Version.LUCENE_33, fields, analyzer);
  54. Query query =queryParser.parse(queryString);
  55. Directory fsDir =FSDirectory.open(indexPath);
  56. IndexSearcher indexSearcher= new IndexSearcher(fsDir);
  57. SerachResult serachResult=SerachUtil.serach(1, 3, query,analyzer,indexSearcher,null);
  58. System.out.println("总共有[ "+serachResult.getTotalCount()+" ]条匹配结果");
  59. //3 打印结果
  60. for (Document doc : (List<Document>)serachResult.getRecords()) {
  61. System.out.println();
  62. FileToDocument.printDocumentInfo(doc);
  63. }
  64. indexSearcher.close();
  65. }
  66. }

其中用到了2个util类:
Java代码 收藏代码
  1. package com.lucene.utils;
  2. import java.util.ArrayList;
  3. import java.util.List;
  4. import java.io.BufferedReader;
  5. import java.io.File;
  6. import java.io.FileFilter;
  7. import java.io.FileInputStream;
  8. import java.io.InputStreamReader;
  9. import org.apache.lucene.document.Document;
  10. import org.apache.lucene.document.Field;
  11. import org.apache.lucene.document.Field.Index;
  12. import org.apache.lucene.document.Field.Store;
  13. import org.apache.lucene.util.NumericUtils;
  14. public class FileToDocument {
  15. /**
  16. * 将文件转换成document
  17. * @param file
  18. * @return
  19. */
  20. public static Document file2Document(File file) {
  21. Document doc= new Document();
  22. doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED));
  23. doc.add(new Field("content",readFile(file),Store.YES,Index.ANALYZED));
  24. doc.add(new Field("size",NumericUtils.longToPrefixCoded(file.length()),Store.YES,Index.NOT_ANALYZED));
  25. doc.add(new Field("path",file.getAbsolutePath(),Store.YES,Index.ANALYZED));
  26. return doc;
  27. }
  28. /**
  29. * 读取文件内容
  30. * @param file
  31. * @return
  32. */
  33. private static String readFile(File file) {
  34. StringBuffer content =new StringBuffer();
  35. try {
  36. BufferedReader reader =new BufferedReader(new InputStreamReader(new FileInputStream(file)));
  37. for(String line =null;(line=reader.readLine())!=null;) {
  38. content.append(line).append("\n");
  39. }
  40. } catch (Exception e) {
  41. e.printStackTrace();
  42. }
  43. return content.toString();
  44. }
  45. /**
  46. * 打印document
  47. * 获取name熟悉的值的两种方法
  48. * 1 FIeld f=doc.getField("name");
  49. * f.StringValue();
  50. * 2 doc.get("name");
  51. * @param doc
  52. */
  53. public static void printDocumentInfo(Document doc) {
  54. System.out.println("name = "+doc.get("name"));
  55. System.out.println("content = "+doc.get("content"));
  56. System.out.println("size = "+NumericUtils.prefixCodedToLong(doc.get("size")));
  57. System.out.println("path = "+doc.get("path"));
  58. System.out.println("comment = "+doc.get("comment"));
  59. }
  60. /**
  61. * 得到该目录下得所有文件(非目录,可读,非一次,存在)
  62. * @param sourceDir
  63. * @return
  64. */
  65. public List<File> index(File sourceDir) {
  66. File[] files=sourceDir.listFiles();
  67. List<File> fileList= new ArrayList<File>();
  68. for (int i = 0; i < files.length; i++) {
  69. File f=files[i];
  70. if(!f.isDirectory() && f.canRead() && !f.isHidden() && f.exists()){
  71. fileList.add(f);
  72. }
  73. }
  74. return fileList;
  75. }
  76. /**
  77. * 只接受txt文件的拦截器
  78. * @author Administrator
  79. *
  80. */
  81. private static class MyFileFilter implements FileFilter{
  82. @Override
  83. public boolean accept(File pathname) {
  84. return pathname.getName().toLowerCase().endsWith(".txt");
  85. }
  86. }
  87. }



Java代码 收藏代码
  1. package com.lucene.utils;
  2. import java.util.ArrayList;
  3. import java.util.List;
  4. import org.apache.lucene.analysis.Analyzer;
  5. import org.apache.lucene.analysis.TokenStream;
  6. import org.apache.lucene.document.Document;
  7. import org.apache.lucene.document.Field;
  8. import org.apache.lucene.document.Field.Index;
  9. import org.apache.lucene.document.Field.Store;
  10. import org.apache.lucene.queryParser.MultiFieldQueryParser;
  11. import org.apache.lucene.queryParser.QueryParser;
  12. import org.apache.lucene.search.Filter;
  13. import org.apache.lucene.search.IndexSearcher;
  14. import org.apache.lucene.search.Query;
  15. import org.apache.lucene.search.ScoreDoc;
  16. import org.apache.lucene.search.TopDocs;
  17. import org.apache.lucene.search.highlight.Formatter;
  18. import org.apache.lucene.search.highlight.Fragmenter;
  19. import org.apache.lucene.search.highlight.Highlighter;
  20. import org.apache.lucene.search.highlight.QueryScorer;
  21. import org.apache.lucene.search.highlight.Scorer;
  22. import org.apache.lucene.search.highlight.SimpleFragmenter;
  23. import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
  24. import org.apache.lucene.util.Version;
  25. import com.lucene.model.SerachResult;
  26. /**
  27. *
  28. * @author: agoni
  29. * @date: Aug 13, 2011
  30. * @function:
  31. *
  32. */
/**
 * Search helper: runs a query, paginates the hits, and decorates each hit
 * with a highlighted summary of its "content" field.
 */
public class SerachUtil {
    /**
     * Executes the query and returns one page of results.
     *
     * @param pageNo        1-based page number
     * @param pageSize      number of hits per page
     * @param query         the parsed query to execute
     * @param analyzer      analyzer the highlighter uses to re-tokenize "content"
     * @param indexSearcher open searcher over the index (the caller closes it)
     * @param filter        optional hit filter, may be null
     * @return page of matching documents plus total hit count and elapsed time
     * @throws Exception on any search or highlighting failure
     */
    public static SerachResult serach(int pageNo,int pageSize ,Query query,Analyzer analyzer,IndexSearcher indexSearcher,Filter filter) throws Exception {
        SerachResult serachResult = new SerachResult();
        long startTime =System.currentTimeMillis();
        // Run the (optionally filtered) query; 10000 caps the number of collected hits.
        TopDocs topDocs=indexSearcher.search(query, filter, 10000);
        // Prepare the highlighter: matched terms are wrapped in a red <font> tag.
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
        Scorer scorer = new QueryScorer(query);
        Highlighter highlighter = new Highlighter(formatter, scorer);
        Fragmenter fragmenter = new SimpleFragmenter(100);// length of the summary fragment
        highlighter.setTextFragmenter(fragmenter);
        // Read one page of hits.
        List<Document> recordList =new ArrayList<Document>();
        int end=Math.min(pageNo*pageSize,topDocs.totalHits);
        for(int i=(pageNo-1)*pageSize;i<end;i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];// the i-th ranked hit
            int docSn=scoreDoc.doc; // internal document number
            Document document=indexSearcher.doc(docSn);
            // Highlight the content; returns null when the keyword does not occur in it.
            String hc=highlighter.getBestFragment(analyzer, "content",document.get("content"));
            if(hc==null) {
                // No match in the body: fall back to the first 100 chars as the summary.
                String content =document.get("content");
                int endIndex=Math.min(100, content.length());
                hc=content.substring(0,endIndex);
            }
            // Replace the stored content with the highlighted summary and also
            // store it under "comment" so callers can show it as a hint.
            Field contentField=(Field) document.getFieldable("content");
            contentField.setValue(hc);
            document.add(new Field("comment",hc,Store.YES,Index.ANALYZED));
            recordList.add(document);
        }
        serachResult.setPageNo(pageNo);
        serachResult.setPageSize(pageSize);
        serachResult.setTotalCount(topDocs.totalHits);
        serachResult.setRecords(recordList);
        long endTime=System.currentTimeMillis();
        serachResult.setTime(endTime-startTime);
        return serachResult;
    }
}




对搜索结果封装的类:
Java代码 收藏代码
  1. package com.lucene.model;
  2. import java.util.List;
  3. import org.apache.lucene.document.Document;
  4. /**
  5. *
  6. * @author: agoni
  7. * @date: Aug 13, 2011
  8. * @function:
  9. *
  10. */
  11. public class SerachResult {
  12. private int totalCount; //搜索的总记录数
  13. private List<Document> records; //返回当前页的结果
  14. private long Time; //搜索用时
  15. private int pageNo; //当前页
  16. private int pageSize; //每页数量
  17. public SerachResult() {
  18. }
  19. /**
  20. * @param totalCount
  21. * @param recordList
  22. * @param time
  23. */
  24. public SerachResult(int totalCount, List<Document> recordList, long time) {
  25. super();
  26. this.totalCount = totalCount;
  27. this.setRecords(recordList);
  28. Time = time;
  29. }
  30. public int getTotalCount() {
  31. return totalCount;
  32. }
  33. public void setTotalCount(int totalCount) {
  34. this.totalCount = totalCount;
  35. }
  36. public long getTime() {
  37. return Time;
  38. }
  39. public void setTime(long time) {
  40. Time = time;
  41. }
  42. public int getPageNo() {
  43. return pageNo;
  44. }
  45. public void setPageNo(int pageNo) {
  46. this.pageNo = pageNo;
  47. }
  48. public int getPageSize() {
  49. return pageSize;
  50. }
  51. public void setPageSize(int pageSize) {
  52. this.pageSize = pageSize;
  53. }
  54. public void setRecords(List<Document> records) {
  55. this.records = records;
  56. }
  57. public List<Document> getRecords() {
  58. return records;
  59. }
  60. }




helloworld程序到此结束。

下面是测试中文分词器和英文分词器的简单例子:
Java代码 收藏代码
  1. package com.lucene.demo;
  2. import java.io.IOException;
  3. import java.io.StringReader;
  4. import org.apache.lucene.analysis.Analyzer;
  5. import org.apache.lucene.analysis.TokenStream;
  6. import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
  7. import org.apache.lucene.analysis.en.EnglishAnalyzer;
  8. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  9. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  10. import org.apache.lucene.util.Version;
  11. import org.junit.Test;
  12. /**
  13. *
  14. * @author: agoni
  15. * @date: Aug 14, 2011
  16. * @function:
  17. *
  18. */
  19. public class TestAnalyze {
  20. private String enText= "IndexWriter addDocument's a javadoc.txt menu_down up-down";
  21. private String chText="衣服 矿泉水和分词器-看不起一个人的审美";
  22. Analyzer enAnalyzer = new EnglishAnalyzer(Version.LUCENE_33);
  23. Analyzer chAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_33);
  24. @Test
  25. public void testAnalyze() throws IOException {
  26. analyze(enAnalyzer, enText);
  27. System.out.println("---------------------------------------------------------------------------------------------------");
  28. analyze(enAnalyzer, chText);
  29. System.out.println("---------------------------------------------------------------------------------------------------");
  30. analyze(chAnalyzer, enText);
  31. System.out.println("---------------------------------------------------------------------------------------------------");
  32. analyze(chAnalyzer, chText);
  33. }
  34. /**
  35. *
  36. * @param analyzer
  37. * @param text
  38. * @throws IOException
  39. */
  40. public void analyze(Analyzer analyzer ,String text) throws IOException {
  41. System.out.println("-----------分词器:"+analyzer.getClass().getName()+"------------");
  42. System.out.println("输入:"+text);
  43. TokenStream tokenStream = analyzer.tokenStream("content",new StringReader(text));
  44. displayTokenStream(tokenStream);
  45. }
  46. private static void displayTokenStream(TokenStream ts) throws IOException
  47. {
  48. CharTermAttribute termAtt = (CharTermAttribute)ts.getAttribute(CharTermAttribute.class);
  49. TypeAttribute typeAtt = (TypeAttribute)ts.getAttribute(TypeAttribute.class);
  50. while (ts.incrementToken())
  51. {
  52. System.out.println("type="+typeAtt.type()+ " "+termAtt);
  53. }
  54. System.out.println(' ');
  55. }
  56. }


下面是测试几种常用query的例子:
Java代码 收藏代码
  1. package com.lucene.demo;
  2. import java.io.File;
  3. import java.util.List;
  4. import org.apache.lucene.analysis.Analyzer;
  5. import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
  6. import org.apache.lucene.document.Document;
  7. import org.apache.lucene.index.IndexWriterConfig;
  8. import org.apache.lucene.index.Term;
  9. import org.apache.lucene.search.BooleanClause.Occur;
  10. import org.apache.lucene.search.BooleanQuery;
  11. import org.apache.lucene.search.IndexSearcher;
  12. import org.apache.lucene.search.PhraseQuery;
  13. import org.apache.lucene.search.Query;
  14. import org.apache.lucene.search.TermQuery;
  15. import org.apache.lucene.search.TermRangeQuery;
  16. import org.apache.lucene.search.WildcardQuery;
  17. import org.apache.lucene.store.Directory;
  18. import org.apache.lucene.store.FSDirectory;
  19. import org.apache.lucene.util.NumericUtils;
  20. import org.apache.lucene.util.Version;
  21. import org.junit.Test;
  22. import org.apache.lucene.search.FuzzyLikeThisQuery;
  23. import com.lucene.model.SerachResult;
  24. import com.lucene.utils.FileToDocument;
  25. import com.lucene.utils.SerachUtil;
  26. /**
  27. *
  28. * @author: agoni
  29. * @date: Aug 14, 2011
  30. * @function:
  31. *
  32. */
  33. public class TestQuery {
  34. File indexPath=new File("F:\\java\\Workspaces\\Lucene\\luceneIndex");//存放索引文件目录
  35. Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_33);
  36. IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
  37. Document doc=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\IndexWriter.txt"));
  38. Document doc1=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\china.txt"));
  39. Document doc2=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\开玩笑.txt"));
  40. /**
  41. * 搜索
  42. */
  43. public void serach(Query query) {
  44. Directory fsDir;
  45. try {
  46. fsDir = FSDirectory.open(indexPath);
  47. IndexSearcher indexSearcher= new IndexSearcher(fsDir);
  48. SerachResult serachResult=SerachUtil.serach(1, 10, query,analyzer,indexSearcher,null);
  49. System.out.println("总共有[ "+serachResult.getTotalCount()+" ]条匹配结果");
  50. //3 打印结果
  51. for (Document doc : (List<Document>)serachResult.getRecords()) {
  52. System.out.println("--------------------------------------------------------------------------------------------------------------------------------------");
  53. FileToDocument.printDocumentInfo(doc);//打印文档
  54. }
  55. indexSearcher.close();
  56. } catch (Exception e) {
  57. e.printStackTrace();
  58. }
  59. }
  60. /**
  61. * 关键字查询
  62. */
  63. @Test
  64. public void testTermQuery() {
  65. //Term term = new Term("name","开玩笑");
  66. Term term = new Term("content","缓存");
  67. Query query = new TermQuery(term);
  68. serach(query);
  69. }
  70. /**
  71. * 范围查询
  72. * 参数3.4 是否包含上下边界
  73. */
  74. @Test
  75. public void testTermRangeQuery() {
  76. TermRangeQuery query =new TermRangeQuery("size",NumericUtils.longToPrefixCoded(100),NumericUtils.longToPrefixCoded(1000)
  77. , true, true);
  78. serach(query);//查询文件大小为100到1000之间的数据
  79. }
  80. /**
  81. * 模糊查询
  82. * ?表示一位 * 表示多位
  83. */
  84. @Test
  85. public void testWildcardQuery() {
  86. //Term term = new Term("name","index*"); //Index* 将查询不到任何结果,因为建立索引已将所有大写转为小写
  87. //Term term = new Term("name","?ndex*");
  88. //Term term = new Term("name","*dex*");
  89. Term term = new Term("name","*玩?");
  90. Query query = new WildcardQuery(term);
  91. serach(query);
  92. }
  93. /**
  94. * 短语查询
  95. */
  96. @Test
  97. public void testPhraseQuery() {
  98. PhraseQuery phraseQuery = new PhraseQuery();
  99. //phraseQuery.add(new Term("content","优化"),1);
  100. //phraseQuery.add(new Term("content","缓存"),3);
  101. phraseQuery.add(new Term("content","优化"));
  102. phraseQuery.add(new Term("content","缓存"));
  103. phraseQuery.setSlop(10);//表示分词之后 两个词的相隔的次数上限
  104. serach(phraseQuery);
  105. }
  106. /**
  107. * 最常使用的就是booleanquery了,可以将各种条件组合一起形成复杂查询
  108. **/
  109. @Test
  110. public void testBooleanQuery() {
  111. //条件一
  112. PhraseQuery phraseQuery = new PhraseQuery();
  113. phraseQuery.add(new Term("content","小弟"));
  114. phraseQuery.add(new Term("content","分词"));
  115. phraseQuery.setSlop(100);//两词组间隔最大距离
  116. //条件二
  117. TermRangeQuery query =new TermRangeQuery("size",NumericUtils.longToPrefixCoded(100),NumericUtils.longToPrefixCoded(500)
  118. , true, true);
  119. BooleanQuery booleanQuery = new BooleanQuery();
  120. booleanQuery.add(phraseQuery, Occur.MUST);
  121. booleanQuery.add(query,Occur.MUST);
  122. serach(booleanQuery);
  123. }
  124. }

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:7942463
帖子:1588486
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP