这里主要写一个 Lucene 3.3 的简单例子:
首先,当然是helloworld程序:
- package com.lucene.demo;
-
- import java.io.File;
- import java.util.List;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.queryParser.MultiFieldQueryParser;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
-
- import com.lucene.model.SerachResult;
- import com.lucene.utils.FileToDocument;
- import com.lucene.utils.SerachUtil;
-
- public class HelloWorld {
-
- File indexPath=new File("F:\\java\\Workspaces\\Lucene\\luceneIndex");
- Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_33);
- IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
- Document doc=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\IndexWriter.txt"));
- Document doc1=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\china.txt"));
- Document doc2=FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\开玩笑.txt"));
-
-
-
- @Test
- public void createIndex() throws Exception{
- conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
- Directory fsDir =FSDirectory.open(indexPath);
- long startTime =System.currentTimeMillis();
- IndexWriter indexWriter =new IndexWriter(fsDir, conf);
- indexWriter.addDocument(doc);
- indexWriter.addDocument(doc1);
- indexWriter.addDocument(doc2);
- indexWriter.optimize();
- indexWriter.close();
- long endTime=System.currentTimeMillis();
- System.out.println("共有"+indexWriter.numDocs()+"条索引");
- System.out.println("建立索引用时: " + (endTime-startTime)+"毫秒");
- }
-
-
-
-
- @Test
- public void serach() throws Exception {
-
- String queryString ="开玩笑";
- String [] fields = {"name","content"};
- QueryParser queryParser =new MultiFieldQueryParser(Version.LUCENE_33, fields, analyzer);
- Query query =queryParser.parse(queryString);
-
-
- Directory fsDir =FSDirectory.open(indexPath);
-
- IndexSearcher indexSearcher= new IndexSearcher(fsDir);
- SerachResult serachResult=SerachUtil.serach(1, 3, query,analyzer,indexSearcher,null);
- System.out.println("总共有[ "+serachResult.getTotalCount()+" ]条匹配结果");
-
- for (Document doc : (List<Document>)serachResult.getRecords()) {
- System.out.println();
- FileToDocument.printDocumentInfo(doc);
- }
- indexSearcher.close();
- }
- }
其中用到了2个util类(FileToDocument 和 SerachUtil,代码依次如下):
- package com.lucene.utils;
-
- import java.util.ArrayList;
- import java.util.List;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileFilter;
- import java.io.FileInputStream;
- import java.io.InputStreamReader;
-
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.util.NumericUtils;
-
- public class FileToDocument {
-
-
-
-
-
- public static Document file2Document(File file) {
- Document doc= new Document();
- doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED));
- doc.add(new Field("content",readFile(file),Store.YES,Index.ANALYZED));
- doc.add(new Field("size",NumericUtils.longToPrefixCoded(file.length()),Store.YES,Index.NOT_ANALYZED));
- doc.add(new Field("path",file.getAbsolutePath(),Store.YES,Index.ANALYZED));
- return doc;
- }
-
-
-
-
-
-
- private static String readFile(File file) {
- StringBuffer content =new StringBuffer();
- try {
- BufferedReader reader =new BufferedReader(new InputStreamReader(new FileInputStream(file)));
- for(String line =null;(line=reader.readLine())!=null;) {
- content.append(line).append("\n");
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- return content.toString();
- }
-
-
-
-
-
-
-
-
-
-
- public static void printDocumentInfo(Document doc) {
-
- System.out.println("name = "+doc.get("name"));
- System.out.println("content = "+doc.get("content"));
- System.out.println("size = "+NumericUtils.prefixCodedToLong(doc.get("size")));
- System.out.println("path = "+doc.get("path"));
- System.out.println("comment = "+doc.get("comment"));
- }
-
-
-
-
-
-
- public List<File> index(File sourceDir) {
- File[] files=sourceDir.listFiles();
- List<File> fileList= new ArrayList<File>();
- for (int i = 0; i < files.length; i++) {
- File f=files[i];
- if(!f.isDirectory() && f.canRead() && !f.isHidden() && f.exists()){
- fileList.add(f);
- }
- }
- return fileList;
- }
-
-
-
-
-
-
-
- private static class MyFileFilter implements FileFilter{
-
- @Override
- public boolean accept(File pathname) {
- return pathname.getName().toLowerCase().endsWith(".txt");
- }
-
- }
- }
- package com.lucene.utils;
-
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.queryParser.MultiFieldQueryParser;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.Filter;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.highlight.Formatter;
- import org.apache.lucene.search.highlight.Fragmenter;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.Scorer;
- import org.apache.lucene.search.highlight.SimpleFragmenter;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import org.apache.lucene.util.Version;
-
- import com.lucene.model.SerachResult;
-
-
-
-
-
-
-
-
- /**
-  * Runs a paged search and decorates each hit with a highlighted summary of
-  * its "content" field.
-  */
- public class SerachUtil {
-
-     /** Upper bound on the number of hits fetched from the index per search. */
-     private static final int MAX_HITS = 10000;
-     /** Size of a highlight fragment, and of the fallback summary, in characters. */
-     private static final int FRAGMENT_SIZE = 100;
-
-     /**
-      * Executes the query and returns one page of results.
-      * For every returned document that has a "content" value, that value is
-      * replaced by a summary (highlighted fragment, or the first
-      * {@code FRAGMENT_SIZE} characters when nothing can be highlighted) and
-      * the same summary is added as a "comment" field.
-      *
-      * @param pageNo        1-based page number; a page outside the fetched hits yields no records
-      * @param pageSize      number of records per page
-      * @param query         the query to run
-      * @param analyzer      analyzer used to re-tokenize the content for highlighting
-      * @param indexSearcher searcher to run the query on; not closed by this method
-      * @param filter        optional filter, may be {@code null}
-      * @return the page of documents together with total hit count and timing
-      * @throws Exception if searching, document loading or highlighting fails
-      */
-     public static SerachResult serach(int pageNo, int pageSize, Query query, Analyzer analyzer,
-             IndexSearcher indexSearcher, Filter filter) throws Exception {
-         long startTime = System.currentTimeMillis();
-
-         TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS);
-         Highlighter highlighter = newHighlighter(query);
-
-         // Bound the page by the hits actually fetched: totalHits may exceed
-         // MAX_HITS, in which case scoreDocs is shorter than totalHits.
-         ScoreDoc[] hits = topDocs.scoreDocs;
-         int start = Math.max(0, (pageNo - 1) * pageSize);
-         int end = Math.min(pageNo * pageSize, hits.length);
-
-         List<Document> recordList = new ArrayList<Document>();
-         for (int i = start; i < end; i++) {
-             Document document = indexSearcher.doc(hits[i].doc);
-             String content = document.get("content");
-             if (content != null) {
-                 String summary = summarize(highlighter, analyzer, content);
-                 Field contentField = (Field) document.getFieldable("content");
-                 contentField.setValue(summary);
-                 document.add(new Field("comment", summary, Store.YES, Index.ANALYZED));
-             }
-             recordList.add(document);
-         }
-
-         SerachResult serachResult = new SerachResult();
-         serachResult.setPageNo(pageNo);
-         serachResult.setPageSize(pageSize);
-         serachResult.setTotalCount(topDocs.totalHits);
-         serachResult.setRecords(recordList);
-         serachResult.setTime(System.currentTimeMillis() - startTime);
-         return serachResult;
-     }
-
-     /** Creates a highlighter that wraps query matches in a red {@code <font>} tag. */
-     private static Highlighter newHighlighter(Query query) {
-         Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
-         Scorer scorer = new QueryScorer(query);
-         Highlighter highlighter = new Highlighter(formatter, scorer);
-         Fragmenter fragmenter = new SimpleFragmenter(FRAGMENT_SIZE);
-         highlighter.setTextFragmenter(fragmenter);
-         return highlighter;
-     }
-
-     /** Returns the best highlighted fragment, or the leading characters of the text when there is none. */
-     private static String summarize(Highlighter highlighter, Analyzer analyzer, String content) throws Exception {
-         String fragment = highlighter.getBestFragment(analyzer, "content", content);
-         if (fragment != null) {
-             return fragment;
-         }
-         return content.substring(0, Math.min(FRAGMENT_SIZE, content.length()));
-     }
- }
对搜索结果封装的类:
- package com.lucene.model;
-
- import java.util.List;
-
- import org.apache.lucene.document.Document;
-
-
-
-
-
-
-
-
- /**
-  * Value holder for one page of search results: the matching documents, the
-  * total hit count, the paging parameters and the time the search took.
-  */
- public class SerachResult {
-
-     /** Documents on the current page. */
-     private List<Document> records;
-     /** Total number of hits for the query. */
-     private int totalCount;
-     /** Page number this result was requested for. */
-     private int pageNo;
-     /** Page size this result was requested for. */
-     private int pageSize;
-     /** Elapsed search time as supplied by the caller. */
-     private long time;
-
-     /** Creates an empty result; populate it through the setters. */
-     public SerachResult() {
-     }
-
-     /**
-      * Creates a result without paging information.
-      *
-      * @param totalCount total number of hits
-      * @param recordList documents on the page
-      * @param time       elapsed search time
-      */
-     public SerachResult(int totalCount, List<Document> recordList, long time) {
-         this.totalCount = totalCount;
-         setRecords(recordList);
-         this.time = time;
-     }
-
-     public List<Document> getRecords() {
-         return this.records;
-     }
-
-     public void setRecords(List<Document> records) {
-         this.records = records;
-     }
-
-     public int getTotalCount() {
-         return this.totalCount;
-     }
-
-     public void setTotalCount(int totalCount) {
-         this.totalCount = totalCount;
-     }
-
-     public int getPageNo() {
-         return this.pageNo;
-     }
-
-     public void setPageNo(int pageNo) {
-         this.pageNo = pageNo;
-     }
-
-     public int getPageSize() {
-         return this.pageSize;
-     }
-
-     public void setPageSize(int pageSize) {
-         this.pageSize = pageSize;
-     }
-
-     public long getTime() {
-         return this.time;
-     }
-
-     public void setTime(long time) {
-         this.time = time;
-     }
-
- }
helloworld程序到此结束。
下面是测试中文分词器和英文分词器的简单例子:
- package com.lucene.demo;
-
- import java.io.IOException;
- import java.io.StringReader;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.analysis.en.EnglishAnalyzer;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
-
-
-
-
-
-
-
-
- /**
-  * Prints the tokens that the English and the smart-Chinese analyzers produce
-  * for one English and one Chinese sample text.
-  */
- public class TestAnalyze {
-     private String enText = "IndexWriter addDocument's a javadoc.txt menu_down up-down";
-     private String chText = "衣服 矿泉水和分词器-看不起一个人的审美";
-     Analyzer enAnalyzer = new EnglishAnalyzer(Version.LUCENE_33);
-     Analyzer chAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_33);
-
-     /**
-      * Runs every analyzer/text combination, separated by a ruler line.
-      *
-      * @throws IOException if tokenizing fails
-      */
-     @Test
-     public void testAnalyze() throws IOException {
-         analyze(enAnalyzer, enText);
-         System.out.println("---------------------------------------------------------------------------------------------------");
-         analyze(enAnalyzer, chText);
-         System.out.println("---------------------------------------------------------------------------------------------------");
-         analyze(chAnalyzer, enText);
-         System.out.println("---------------------------------------------------------------------------------------------------");
-         analyze(chAnalyzer, chText);
-     }
-
-     /**
-      * Prints the analyzer's class name and the input, then every token the
-      * analyzer emits for that input.
-      *
-      * @param analyzer the analyzer under test
-      * @param text     the text to tokenize
-      * @throws IOException if tokenizing fails
-      */
-     public void analyze(Analyzer analyzer, String text) throws IOException {
-         System.out.println("-----------分词器:" + analyzer.getClass().getName() + "------------");
-         System.out.println("输入:" + text);
-         TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
-         displayTokenStream(tokenStream);
-     }
-
-     /**
-      * Consumes the stream, printing the type and term of each token, and
-      * closes it afterwards.
-      *
-      * @param ts the token stream to drain; closed on return
-      * @throws IOException if the stream fails while tokenizing
-      */
-     private static void displayTokenStream(TokenStream ts) throws IOException {
-         // addAttribute returns the existing attribute or registers it, so it
-         // cannot fail the way getAttribute does when an attribute is absent.
-         CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-         TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
-
-         try {
-             // Consumer workflow: reset, iterate, end, close.
-             ts.reset();
-             while (ts.incrementToken()) {
-                 System.out.println("type=" + typeAtt.type() + " " + termAtt);
-             }
-             ts.end();
-         } finally {
-             ts.close();
-         }
-         System.out.println(' ');
-     }
- }
下面是测试几种常用query的例子:
- package com.lucene.demo;
-
-
- import java.io.File;
- import java.util.List;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.BooleanClause.Occur;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.PhraseQuery;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TermRangeQuery;
- import org.apache.lucene.search.WildcardQuery;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.NumericUtils;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
- import org.apache.lucene.search.FuzzyLikeThisQuery;
- import com.lucene.model.SerachResult;
- import com.lucene.utils.FileToDocument;
- import com.lucene.utils.SerachUtil;
-
-
-
-
-
-
-
-
- /**
-  * Exercises several Lucene query types against the index stored in
-  * {@code indexPath} and prints the first page of hits for each.
-  */
- public class TestQuery {
-     File indexPath = new File("F:\\java\\Workspaces\\Lucene\\luceneIndex");
-     Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_33);
-     // The members below are not referenced by any test in this class; they
-     // are kept so the class's package-visible members stay unchanged.
-     IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
-     Document doc = FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\IndexWriter.txt"));
-     Document doc1 = FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\china.txt"));
-     Document doc2 = FileToDocument.file2Document(new File("F:\\java\\Workspaces\\Lucene\\datasooruce\\开玩笑.txt"));
-
-     /**
-      * Runs the query, prints the total hit count and the first ten hits.
-      * Any failure is rethrown unchecked so that the calling test fails
-      * instead of silently passing.
-      *
-      * @param query the query to execute
-      * @throws IllegalStateException if opening the index or searching fails
-      */
-     public void serach(Query query) {
-         try {
-             Directory fsDir = FSDirectory.open(indexPath);
-             try {
-                 IndexSearcher indexSearcher = new IndexSearcher(fsDir);
-                 try {
-                     SerachResult serachResult = SerachUtil.serach(1, 10, query, analyzer, indexSearcher, null);
-                     System.out.println("总共有[ " + serachResult.getTotalCount() + " ]条匹配结果");
-
-                     for (Document hit : serachResult.getRecords()) {
-                         System.out.println("--------------------------------------------------------------------------------------------------------------------------------------");
-                         FileToDocument.printDocumentInfo(hit);
-                     }
-                 } finally {
-                     // Close the searcher even when the search fails.
-                     indexSearcher.close();
-                 }
-             } finally {
-                 fsDir.close();
-             }
-         } catch (Exception e) {
-             throw new IllegalStateException("Search failed for query: " + query, e);
-         }
-     }
-
-     /** Single-term lookup in the "content" field. */
-     @Test
-     public void testTermQuery() {
-         Term term = new Term("content", "缓存");
-         Query query = new TermQuery(term);
-         serach(query);
-     }
-
-     /**
-      * Inclusive range over the "size" field. The bounds are prefix-coded
-      * longs, so the indexed "size" terms are assumed to use the same encoding.
-      */
-     @Test
-     public void testTermRangeQuery() {
-         TermRangeQuery query = new TermRangeQuery("size",
-                 NumericUtils.longToPrefixCoded(100), NumericUtils.longToPrefixCoded(1000),
-                 true, true);
-         serach(query);
-     }
-
-     /** Wildcard lookup in the "name" field using the pattern "*玩?". */
-     @Test
-     public void testWildcardQuery() {
-         Term term = new Term("name", "*玩?");
-         Query query = new WildcardQuery(term);
-         serach(query);
-     }
-
-     /** Phrase lookup for two "content" terms with a slop of 10. */
-     @Test
-     public void testPhraseQuery() {
-         PhraseQuery phraseQuery = new PhraseQuery();
-         phraseQuery.add(new Term("content", "优化"));
-         phraseQuery.add(new Term("content", "缓存"));
-         phraseQuery.setSlop(10);
-
-         serach(phraseQuery);
-     }
-
-     /** Combines a sloppy phrase and a size range; both clauses are required (MUST). */
-     @Test
-     public void testBooleanQuery() {
-         PhraseQuery phraseQuery = new PhraseQuery();
-         phraseQuery.add(new Term("content", "小弟"));
-         phraseQuery.add(new Term("content", "分词"));
-         phraseQuery.setSlop(100);
-
-         TermRangeQuery query = new TermRangeQuery("size",
-                 NumericUtils.longToPrefixCoded(100), NumericUtils.longToPrefixCoded(500),
-                 true, true);
-
-         BooleanQuery booleanQuery = new BooleanQuery();
-         booleanQuery.add(phraseQuery, Occur.MUST);
-         booleanQuery.add(query, Occur.MUST);
-
-         serach(booleanQuery);
-     }
-
- }
|
|