Define the analyzer in SearchManager:
public class LuceneManager implements SearchManager {
    // Use the custom analyzer; a stock alternative would be:
    // Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_4_9);
    Analyzer analyzer = new ZCJChineseAnalyzer();
    this.settings = new LuceneSettings(analyzer);
    // ... remainder of the class omitted
}
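For context, here is a minimal sketch of how such an analyzer is handed to Lucene 4.x at indexing time. The class name IndexDemo, the in-memory directory, the field name "content", and the sample text are illustrative assumptions, not part of the original code:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IndexDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new ZCJChineseAnalyzer();
        Directory dir = new RAMDirectory(); // in-memory index, for the sketch only
        // The analyzer given here tokenizes every indexed text field
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        IndexWriter writer = new IndexWriter(dir, config);
        Document doc = new Document();
        doc.add(new TextField("content", "Lucene全文检索", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
    }
}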
// Source of ZCJChineseAnalyzer, which extends Analyzer
package com.onegrid.darj.search;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Custom analyzer.
 *
 * @author 周长江, 2015-01-07 14:10:53
 */
public class ZCJChineseAnalyzer extends Analyzer {

    // Custom stop words (declared here but not yet applied in
    // createComponents; see the StopFilter sketch after this class)
    private static final String[] stopWords = {"and", "of", "the", "to", "is", "their", "can", "all"};

    public ZCJChineseAnalyzer() {
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Create the tokenizer; TokenStreamComponents wraps the TokenStream
        // (in Lucene 2.2 this method returned a TokenStream directly)
        Tokenizer tokenizer = new China(reader);
        return new TokenStreamComponents(tokenizer);
    }
}
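The stopWords array above is declared but never applied. Assuming the intent was to filter those words, here is a hedged sketch of a drop-in replacement for createComponents using Lucene 4.x's StopFilter; this is a guess at the intended wiring, not part of the original code:

import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new China(reader);
    // Build a case-insensitive stop set from the custom stopWords array
    CharArraySet stopSet = new CharArraySet(Version.LUCENE_4_9, Arrays.asList(stopWords), true);
    // Drop stop words from the stream produced by the China tokenizer
    TokenStream filtered = new StopFilter(Version.LUCENE_4_9, tokenizer, stopSet);
    return new TokenStreamComponents(tokenizer, filtered);
}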
// Source of the China class, which extends Tokenizer
package com.onegrid.darj.search;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

/**
 * Splits on every character, much like a database LIKE query; suitable for
 * mixed Chinese and English text.
 *
 * @author 周长江, 2015-01-07 14:10:53
 */
public class China extends Tokenizer {

    public China(Reader in) {
        super(in);
    }

    public China(AttributeFactory factory, Reader in) {
        super(factory, in);
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    private int length;
    private int start;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final void push(char c) {
        if (length == 0)
            start = offset - 1; // remember the start offset of this token
        buffer[length++] = Character.toLowerCase(c); // lowercase and buffer the char
    }
    private final boolean flush() {
        if (length > 0) {
            // Expose the buffered characters as the next token
            termAtt.copyBuffer(buffer, 0, length);
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
            return true;
        }
        return false;
    }
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        length = 0;
        start = offset;
        while (true) {
            final char c;
            offset++;
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            if (dataLen == -1) {
                offset--;
                return flush();
            } else
                c = ioBuffer[bufferIndex++];
            switch (Character.getType(c)) {
            // Note: digits and Latin letters are not filtered out; they fall
            // through to the OTHER_LETTER case and are emitted one per token
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // To buffer whole runs of digits/letters instead, use:
                // push(c);
                // if (length == MAX_WORD_LEN) return flush();
                // break;
            case Character.OTHER_LETTER:
                if (length > 0) {
                    // A previous character is still buffered: emit it first and
                    // re-read the current character on the next call
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush(); // emit every character as its own token
            default:
                if (length > 0)
                    return flush();
                break;
            }
        }
    }
    @Override
    public final void end() {
        // Set the final offset once the stream is exhausted
        final int finalOffset = correctOffset(offset);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }
    @Override
    public void reset() throws IOException {
        super.reset();
        offset = bufferIndex = dataLen = 0;
    }
}
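To see what this tokenizer actually produces, here is a minimal sketch of consuming the analyzer's token stream. The class name TokenizeDemo, the field name "content", and the sample text are illustrative assumptions:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizeDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new ZCJChineseAnalyzer();
        // tokenStream(field, text) builds the analysis chain defined above
        TokenStream ts = analyzer.tokenStream("content", "Lucene全文检索");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // mandatory before the first incrementToken() call
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // one character per token
        }
        ts.end();
        ts.close();
    }
}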