elasticsearch ik 0.20.x => 0.90.x

parent a2dc3c7842
commit 5e14e3d629

pom.xml (2 lines changed)
pom.xml
@@ -31,7 +31,7 @@
     </parent>

     <properties>
-        <elasticsearch.version>0.20.2</elasticsearch.version>
+        <elasticsearch.version>0.90.0</elasticsearch.version>
     </properties>

     <repositories>
@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;
//import org.wltea.lucene.IKTokenizer;

import java.io.Reader;


public class IkAnalyzer extends Analyzer {

    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
        return new IKTokenizer(reader,true);
    }
    // private boolean isMaxWordLength = false;
    // @Override public TokenStream tokenStream(String fieldName, Reader reader) {
    //     return new IKTokenizer(reader,true);
    // }


    public IkAnalyzer() {
        super();
    }

    @Override
    protected TokenStreamComponents createComponents(String s, Reader reader) {
        // new TokenStreamComponents
        Tokenizer tokenizer = new IKTokenizer(reader, true);
        return new TokenStreamComponents(tokenizer, null); //To change body of implemented methods use File | Settings | File Templates.
    }

    // public boolean isMaxWordLength() {
    //     return isMaxWordLength;
    // }
}
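The hunk above is the standard Lucene 3.x to 4.x analyzer migration: the old `tokenStream(String, Reader)` override is retired and `createComponents(String, Reader)` takes its place. A minimal sketch of that pattern, assuming Lucene 4.0 APIs (the class name and whitespace tokenizer below are illustrative, not part of the plugin):

```java
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

// In Lucene 4.x the framework calls createComponents() and reuses the returned
// components per thread; subclasses no longer override tokenStream() directly.
public final class MigrationSketchAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // The IK plugin plugs in IKTokenizer here; any Tokenizer works for the sketch.
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
        return new TokenStreamComponents(source);
    }
}
```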
@@ -24,11 +24,16 @@
 */
package org.wltea.analyzer.core;

import org.wltea.analyzer.dic.Dictionary;

import java.io.IOException;
import java.io.Reader;
import java.util.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

/**
 *
@ -68,12 +73,12 @@ class AnalyzeContext {
|
||||
private Map<Integer , LexemePath> pathMap;
|
||||
//最终分词结果集
|
||||
private LinkedList<Lexeme> results;
|
||||
|
||||
private boolean useSmart;
|
||||
//分词器配置项
|
||||
private boolean useSmart;
|
||||
// private Configuration cfg;
|
||||
|
||||
public AnalyzeContext(boolean useSmart){
|
||||
this.useSmart = useSmart;
|
||||
this.useSmart = useSmart;
|
||||
this.segmentBuff = new char[BUFF_SIZE];
|
||||
this.charTypes = new int[BUFF_SIZE];
|
||||
this.buffLocker = new HashSet<String>();
|
||||
@ -313,7 +318,7 @@ class AnalyzeContext {
|
||||
while(result != null){
|
||||
//数量词合并
|
||||
this.compound(result);
|
||||
if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
|
||||
if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
|
||||
//是停止词继续取列表的下一个
|
||||
result = this.results.pollFirst();
|
||||
}else{
|
||||
@ -344,6 +349,7 @@ class AnalyzeContext {
|
||||
* 组合词元
|
||||
*/
|
||||
private void compound(Lexeme result){
|
||||
|
||||
if(!this.useSmart){
|
||||
return ;
|
||||
}
|
||||
|
@ -25,12 +25,12 @@
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
|
||||
/**
|
||||
* 中文-日韩文子分词器
|
||||
@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
|
||||
//处理词段队列
|
||||
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
|
||||
for(Hit hit : tmpArray){
|
||||
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||
if(hit.isMatch()){
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
|
||||
@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
|
||||
|
||||
//*********************************
|
||||
//再对当前指针位置的字符进行单字匹配
|
||||
Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
if(singleCharHit.isMatch()){//首字成词
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
|
||||
|
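The dictionary lookups above change from static calls (`Dictionary.matchInMainDictWithHit(...)`) to calls on a singleton instance (`Dictionary.getSingleton().matchWithHit(...)`), because the dictionary must now be initialized with Elasticsearch settings before use. A generic sketch of that eager-singleton-plus-deferred-init shape (the class below is illustrative, not the plugin's real class):

```java
// Illustrative shape only: the instance exists from class-load time, but its
// word lists are filled later by an explicit init(...) call from the plugin.
public final class SingletonDictSketch {
    private static final SingletonDictSketch SINGLETON = new SingletonDictSketch();

    private volatile boolean initialized = false;

    private SingletonDictSketch() { }

    public static SingletonDictSketch getSingleton() {
        return SINGLETON;
    }

    // stands in for Init(org.elasticsearch.common.settings.Settings) in the diff
    public synchronized void init(Object settings) {
        if (!initialized) {
            // load main / quantifier / stopword dictionaries here
            initialized = true;
        }
    }

    public boolean isLoaded() {
        return initialized;
    }
}
```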
@ -24,14 +24,14 @@
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
/**
|
||||
*
|
||||
* 中文数量词子分词器
|
||||
@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
//处理词段队列
|
||||
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
|
||||
for(Hit hit : tmpArray){
|
||||
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||
if(hit.isMatch()){
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
|
||||
@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
|
||||
//*********************************
|
||||
//对当前指针位置的字符进行单字匹配
|
||||
Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
if(singleCharHit.isMatch()){//首字成量词词
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
|
||||
|
@ -38,7 +38,7 @@ class IKArbitrator {
|
||||
|
||||
/**
|
||||
* 分词歧义处理
|
||||
* @param orgLexemes
|
||||
// * @param orgLexemes
|
||||
* @param useSmart
|
||||
*/
|
||||
void process(AnalyzeContext context , boolean useSmart){
|
||||
@ -87,7 +87,6 @@ class IKArbitrator {
|
||||
* 歧义识别
|
||||
* @param lexemeCell 歧义路径链表头
|
||||
* @param fullTextLength 歧义路径文本长度
|
||||
* @param option 候选结果路径
|
||||
* @return
|
||||
*/
|
||||
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
|
||||
@ -120,7 +119,7 @@ class IKArbitrator {
|
||||
|
||||
/**
|
||||
* 向前遍历,添加词元,构造一个无歧义词元组合
|
||||
* @param LexemePath path
|
||||
// * @param LexemePath path
|
||||
* @return
|
||||
*/
|
||||
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
|
||||
@ -140,7 +139,7 @@ class IKArbitrator {
|
||||
|
||||
/**
|
||||
* 回滚词元链,直到它能够接受指定的词元
|
||||
* @param lexeme
|
||||
// * @param lexeme
|
||||
* @param l
|
||||
*/
|
||||
private void backPath(Lexeme l , LexemePath option){
|
||||
|
@ -23,14 +23,15 @@
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
//import org.wltea.analyzer.cfg.DefaultConfig;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
/**
|
||||
* IK分词器主类
|
||||
*
|
||||
@ -39,16 +40,18 @@ public final class IKSegmenter {
|
||||
|
||||
//字符窜reader
|
||||
private Reader input;
|
||||
//分词器配置项
|
||||
private Configuration cfg;
|
||||
//分词器上下文
|
||||
private AnalyzeContext context;
|
||||
//分词处理器列表
|
||||
private List<ISegmenter> segmenters;
|
||||
//分词歧义裁决器
|
||||
private IKArbitrator arbitrator;
|
||||
private ESLogger logger=null;
|
||||
private final boolean useSmart;
|
||||
private boolean useSmart = false;
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* IK分词器构造函数
|
||||
* @param input
|
||||
* @param useSmart 为true,使用智能分词策略
|
||||
@ -57,16 +60,31 @@ public final class IKSegmenter {
|
||||
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
|
||||
*/
|
||||
public IKSegmenter(Reader input , boolean useSmart){
|
||||
logger = Loggers.getLogger("ik-analyzer");
|
||||
this.input = input;
|
||||
// this.cfg = DefaultConfig.getInstance();
|
||||
this.useSmart=useSmart;
|
||||
this.init();
|
||||
this.init();
|
||||
}
|
||||
|
||||
/**
|
||||
* IK分词器构造函数
|
||||
* @param input
|
||||
* @param cfg 使用自定义的Configuration构造分词器
|
||||
*
|
||||
*/
|
||||
public IKSegmenter(Reader input , Configuration cfg){
|
||||
this.input = input;
|
||||
this.cfg = cfg;
|
||||
this.init();
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化
|
||||
*/
|
||||
private void init(){
|
||||
//初始化词典单例
|
||||
// Dictionary.initial(this.cfg);
|
||||
// Dictionary.getSingleton();
|
||||
//初始化分词上下文
|
||||
this.context = new AnalyzeContext(useSmart);
|
||||
//加载子分词器
|
||||
|
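With the `Configuration`-based path commented out, `IKSegmenter` is now driven directly by the `useSmart` flag and assumes the dictionary singleton has already been initialized elsewhere (the plugin calls `Dictionary.getInstance().Init(settings)`). A hedged usage sketch of the streaming API, using only calls that appear in this diff:

```java
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class SegmenterUsageSketch {
    public static void main(String[] args) throws IOException {
        // Assumes Dictionary.getInstance().Init(...) has already been called by the plugin.
        IKSegmenter segmenter = new IKSegmenter(new StringReader("中文分词测试"), true); // true = smart mode
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}
```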
@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
|
||||
/**
|
||||
* 处理数字字母混合输出
|
||||
* 如:windos2000 | linliangyi2005@gmail.com
|
||||
* @param input
|
||||
// * @param input
|
||||
* @param context
|
||||
* @return
|
||||
*/
|
||||
|
@ -327,12 +327,4 @@ class DictSegment implements Comparable<DictSegment>{
|
||||
return this.nodeChar.compareTo(o.nodeChar);
|
||||
}
|
||||
|
||||
public int getDicNum(){
|
||||
if(charMap!=null)
|
||||
{
|
||||
return charMap.size();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,74 +1,233 @@
|
||||
/**
|
||||
* IK 中文分词 版本 5.0
|
||||
* IK Analyzer release 5.0
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 词典管理类,单子模式
|
||||
*/
|
||||
public class Dictionary {
|
||||
|
||||
public static final String PATH_DIC_MAIN = "ik/main.dic";
|
||||
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
|
||||
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
|
||||
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
|
||||
public static final String PATH_DIC_PREP = "ik/preposition.dic";
|
||||
public static final String PATH_DIC_STOP = "ik/stopword.dic";
|
||||
private static final Dictionary singleton;
|
||||
|
||||
static{
|
||||
singleton = new Dictionary();
|
||||
}
|
||||
/*
|
||||
* 词典单子实例
|
||||
*/
|
||||
private static Dictionary singleton;
|
||||
|
||||
/*
|
||||
* 主词典对象
|
||||
*/
|
||||
private DictSegment _MainDict;
|
||||
|
||||
private DictSegment _SurnameDict;
|
||||
|
||||
/*
|
||||
* 停止词词典
|
||||
*/
|
||||
private DictSegment _StopWordDict;
|
||||
/*
|
||||
* 量词词典
|
||||
*/
|
||||
private DictSegment _QuantifierDict;
|
||||
|
||||
private DictSegment _SuffixDict;
|
||||
|
||||
private DictSegment _PrepDict;
|
||||
|
||||
private DictSegment _StopWords;
|
||||
|
||||
private Environment environment;
|
||||
private Configuration configuration;
|
||||
/**
|
||||
* 配置对象
|
||||
*/
|
||||
private Configuration configuration;
|
||||
private ESLogger logger=null;
|
||||
private static boolean dictInited=false;
|
||||
private Dictionary(){
|
||||
private Environment environment;
|
||||
public static final String PATH_DIC_MAIN = "ik/main.dic";
|
||||
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
|
||||
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
|
||||
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
|
||||
public static final String PATH_DIC_PREP = "ik/preposition.dic";
|
||||
public static final String PATH_DIC_STOP = "ik/stopword.dic";
|
||||
private Dictionary(){
|
||||
logger = Loggers.getLogger("ik-analyzer");
|
||||
}
|
||||
|
||||
public Configuration getConfig(){
|
||||
return configuration;
|
||||
}
|
||||
}
|
||||
static{
|
||||
singleton = new Dictionary();
|
||||
}
|
||||
// public Configuration getConfig(){
|
||||
// return configuration;
|
||||
// }
|
||||
// private Dictionary(Configuration cfg){
|
||||
// this.cfg = cfg;
|
||||
// this.loadMainDict();
|
||||
// this.loadStopWordDict();
|
||||
// this.loadQuantifierDict();
|
||||
// }
|
||||
|
||||
public void Init(Settings indexSettings){
|
||||
|
||||
if(!dictInited){
|
||||
environment =new Environment(indexSettings);
|
||||
configuration=new Configuration(indexSettings);
|
||||
loadMainDict();
|
||||
loadSurnameDict();
|
||||
loadQuantifierDict();
|
||||
loadSuffixDict();
|
||||
loadPrepDict();
|
||||
loadStopWordDict();
|
||||
dictInited=true;
|
||||
}
|
||||
if(!dictInited){
|
||||
environment =new Environment(indexSettings);
|
||||
configuration=new Configuration(indexSettings);
|
||||
loadMainDict();
|
||||
// loadSurnameDict();
|
||||
loadQuantifierDict();
|
||||
// loadSuffixDict();
|
||||
// loadPrepDict();
|
||||
loadStopWordDict();
|
||||
dictInited=true;
|
||||
}
|
||||
}
|
||||
|
||||
private void loadMainDict(){
|
||||
_MainDict = new DictSegment((char)0);
|
||||
/**
|
||||
* 词典初始化
|
||||
* 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
|
||||
* 只有当Dictionary类被实际调用时,才会开始载入词典,
|
||||
* 这将延长首次分词操作的时间
|
||||
* 该方法提供了一个在应用加载阶段就初始化字典的手段
|
||||
* @return Dictionary
|
||||
*/
|
||||
// public static Dictionary initial(Configuration cfg){
|
||||
// if(singleton == null){
|
||||
// synchronized(Dictionary.class){
|
||||
// if(singleton == null){
|
||||
// singleton = new Dictionary();
|
||||
// return singleton;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return singleton;
|
||||
// }
|
||||
|
||||
/**
|
||||
* 获取词典单子实例
|
||||
* @return Dictionary 单例对象
|
||||
*/
|
||||
public static Dictionary getSingleton(){
|
||||
if(singleton == null){
|
||||
throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
|
||||
}
|
||||
return singleton;
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量加载新词条
|
||||
* @param words Collection<String>词条列表
|
||||
*/
|
||||
public void addWords(Collection<String> words){
|
||||
if(words != null){
|
||||
for(String word : words){
|
||||
if (word != null) {
|
||||
//批量加载词条到主内存词典中
|
||||
singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量移除(屏蔽)词条
|
||||
* @param words
|
||||
*/
|
||||
public void disableWords(Collection<String> words){
|
||||
if(words != null){
|
||||
for(String word : words){
|
||||
if (word != null) {
|
||||
//批量屏蔽词条
|
||||
singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索匹配主词典
|
||||
* @param charArray
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public Hit matchInMainDict(char[] charArray){
|
||||
return singleton._MainDict.match(charArray);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索匹配主词典
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public Hit matchInMainDict(char[] charArray , int begin, int length){
|
||||
return singleton._MainDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索匹配量词词典
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
|
||||
return singleton._QuantifierDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
|
||||
* @param charArray
|
||||
* @param currentIndex
|
||||
* @param matchedHit
|
||||
* @return Hit
|
||||
*/
|
||||
public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
|
||||
DictSegment ds = matchedHit.getMatchedDictSegment();
|
||||
return ds.match(charArray, currentIndex, 1 , matchedHit);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 判断是否是停止词
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return boolean
|
||||
*/
|
||||
public boolean isStopWord(char[] charArray , int begin, int length){
|
||||
return singleton._StopWordDict.match(charArray, begin, length).isMatch();
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载主词典及扩展词典
|
||||
*/
|
||||
private void loadMainDict(){
|
||||
//建立一个主词典实例
|
||||
_MainDict = new DictSegment((char)0);
|
||||
//读取主词典文件
|
||||
File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN);
|
||||
|
||||
InputStream is = null;
|
||||
@ -77,20 +236,17 @@ public class Dictionary {
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Main Dictionary not found!!!");
|
||||
}
|
||||
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_MainDict.fillSegment(theWord.trim().toCharArray());
|
||||
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},MainDict Size:{}",file.toString(),_MainDict.getDicNum());
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Main Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -105,37 +261,38 @@ public class Dictionary {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
//加载扩展词典
|
||||
this.loadExtDict();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 加载用户配置的扩展词典到主词库表
|
||||
*/
|
||||
private void loadExtDict(){
|
||||
//加载扩展词典配置
|
||||
List<String> extDictFiles = configuration.getExtDictionarys();
|
||||
if(extDictFiles != null){
|
||||
InputStream is = null;
|
||||
for(String extDictName : extDictFiles){
|
||||
|
||||
File tempFile=new File(environment.configFile(),extDictName);
|
||||
|
||||
try {
|
||||
is = new FileInputStream(tempFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
logger.error("[Dict Loading]",e);
|
||||
}
|
||||
|
||||
if(is == null){
|
||||
//读取扩展词典文件
|
||||
System.out.println("加载扩展词典:" + extDictName);
|
||||
is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
|
||||
//如果找不到扩展的字典,则忽略
|
||||
if(is == null){
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
|
||||
|
||||
//加载扩展词典数据到主内存词典中
|
||||
//System.out.println(theWord);
|
||||
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum());
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Extension Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -154,70 +311,78 @@ public class Dictionary {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void loadSurnameDict(){
|
||||
|
||||
_SurnameDict = new DictSegment((char)0);
|
||||
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Surname Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_SurnameDict.fillSegment(theWord.trim().toCharArray());
|
||||
/**
|
||||
* 加载用户扩展的停止词词典
|
||||
*/
|
||||
private void loadStopWordDict(){
|
||||
//建立一个主词典实例
|
||||
_StopWordDict = new DictSegment((char)0);
|
||||
//加载扩展停止词典
|
||||
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
|
||||
if(extStopWordDictFiles != null){
|
||||
InputStream is = null;
|
||||
for(String extStopWordDictName : extStopWordDictFiles){
|
||||
System.out.println("加载扩展停止词典:" + extStopWordDictName);
|
||||
//读取扩展词典文件
|
||||
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
|
||||
//如果找不到扩展的字典,则忽略
|
||||
if(is == null){
|
||||
continue;
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Surname Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
//System.out.println(theWord);
|
||||
//加载扩展停止词典数据到内存中
|
||||
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Extension Stop word Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 加载量词词典
|
||||
*/
|
||||
private void loadQuantifierDict(){
|
||||
|
||||
//建立一个量词典实例
|
||||
_QuantifierDict = new DictSegment((char)0);
|
||||
//读取量词词典文件
|
||||
File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Quantifier Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
|
||||
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},QuantifierDict Size:{}",file.toString(),_QuantifierDict.getDicNum());
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Quantifier Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -235,304 +400,8 @@ public class Dictionary {
|
||||
}
|
||||
|
||||
|
||||
private void loadSuffixDict(){
|
||||
public static Dictionary getInstance(){
|
||||
return Dictionary.singleton;
|
||||
}
|
||||
|
||||
_SuffixDict = new DictSegment((char)0);
|
||||
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Suffix Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_SuffixDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Suffix Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void loadPrepDict(){
|
||||
|
||||
_PrepDict = new DictSegment((char)0);
|
||||
File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Preposition Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
|
||||
_PrepDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Preposition Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void loadStopWordDict(){
|
||||
|
||||
_StopWords = new DictSegment((char)0);
|
||||
File file=new File(environment.configFile(),Dictionary.PATH_DIC_STOP);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if(is == null){
|
||||
throw new RuntimeException("Stopword Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},Stopwords Size:{}",file.toString(),_StopWords.getDicNum());
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Stopword Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
|
||||
if(extStopWordDictFiles != null){
|
||||
for(String extStopWordDictName : extStopWordDictFiles){
|
||||
File tempFile=new File(environment.configFile(),extStopWordDictName);
|
||||
try {
|
||||
is = new FileInputStream(tempFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
if(is == null){
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
|
||||
|
||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
logger.info("[Dict Loading] {},Stopwords Size:{}",tempFile.toString(),_StopWords.getDicNum());
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Extension Stop word Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
|
||||
}finally{
|
||||
try {
|
||||
if(is != null){
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static Dictionary getInstance(){
|
||||
return Dictionary.singleton;
|
||||
}
|
||||
|
||||
public static void loadExtendWords(Collection<String> extWords){
|
||||
if(extWords != null){
|
||||
for(String extWord : extWords){
|
||||
if (extWord != null) {
|
||||
|
||||
singleton._MainDict.fillSegment(extWord.trim().toCharArray());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void loadExtendStopWords(Collection<String> extStopWords){
|
||||
if(extStopWords != null){
|
||||
for(String extStopWord : extStopWords){
|
||||
if (extStopWord != null) {
|
||||
|
||||
singleton._StopWords.fillSegment(extStopWord.trim().toCharArray());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static Hit matchInMainDict(char[] charArray){
|
||||
return singleton._MainDict.match(charArray);
|
||||
}
|
||||
|
||||
|
||||
public static Hit matchInMainDict(char[] charArray , int begin, int length){
|
||||
return singleton._MainDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
|
||||
public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){
|
||||
DictSegment ds = matchedHit.getMatchedDictSegment();
|
||||
return ds.match(charArray, currentIndex, 1 , matchedHit);
|
||||
}
|
||||
|
||||
|
||||
public static Hit matchInSurnameDict(char[] charArray , int begin, int length){
|
||||
return singleton._SurnameDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 检索匹配量词词典
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){
|
||||
return singleton._QuantifierDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索匹配在后缀词典
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public static Hit matchInSuffixDict(char[] charArray , int begin, int length){
|
||||
return singleton._SuffixDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 检索匹配介词、副词词典
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return Hit 匹配结果描述
|
||||
*/
|
||||
public static Hit matchInPrepDict(char[] charArray , int begin, int length){
|
||||
return singleton._PrepDict.match(charArray, begin, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否是停止词
|
||||
* @param charArray
|
||||
* @param begin
|
||||
* @param length
|
||||
* @return boolean
|
||||
*/
|
||||
public static boolean isStopWord(char[] charArray , int begin, int length){
|
||||
return singleton._StopWords.match(charArray, begin, length).isMatch();
|
||||
}
|
||||
}
|
||||
|
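All of the `load*Dict()` methods above share one loop: resolve the `.dic` file under the node's config directory (`environment.configFile()` plus a path such as `ik/main.dic`), read it line by line as UTF-8, and feed each non-empty entry into a `DictSegment`. A condensed, hypothetical helper showing just that loop (it returns the words instead of filling a `DictSegment`, which appears package-private in this code):

```java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

// Hypothetical helper condensing the loading loop repeated in loadMainDict(),
// loadQuantifierDict(), loadStopWordDict() and friends.
public final class DictFileReaderSketch {
    public static List<String> readWords(File dicFile) throws IOException {
        List<String> words = new ArrayList<String>();
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(dicFile), "UTF-8"), 512);
        try {
            String theWord;
            while ((theWord = br.readLine()) != null) {
                if (!"".equals(theWord.trim())) {
                    // the diff lower-cases entries before filling the main dictionary
                    words.add(theWord.trim().toLowerCase());
                }
            }
        } finally {
            br.close();
        }
        return words;
    }
}
```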
@@ -58,7 +58,9 @@ public class Hit {
    public boolean isMatch() {
        return (this.hitState & MATCH) > 0;
    }

    /**
     *
     */
    public void setMatch() {
        this.hitState = this.hitState | MATCH;
    }
@@ -69,7 +71,9 @@ public class Hit {
    public boolean isPrefix() {
        return (this.hitState & PREFIX) > 0;
    }

    /**
     *
     */
    public void setPrefix() {
        this.hitState = this.hitState | PREFIX;
    }
@@ -79,7 +83,9 @@ public class Hit {
    public boolean isUnmatch() {
        return this.hitState == UNMATCH ;
    }

    /**
     *
     */
    public void setUnmatch() {
        this.hitState = UNMATCH;
    }
@ -1,51 +1,87 @@
|
||||
/**
|
||||
* IK 中文分词 版本 5.0.1
|
||||
* IK Analyzer release 5.0.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
import java.io.Reader;
|
||||
/**
|
||||
* IK分词器,Lucene Analyzer接口实现
|
||||
* 兼容Lucene 4.0版本
|
||||
*/
|
||||
public final class IKAnalyzer extends Analyzer{
|
||||
|
||||
public final class IKAnalyzer extends Analyzer {
|
||||
private boolean useSmart;
|
||||
|
||||
private boolean isMaxWordLength = false;
|
||||
private boolean useSmart=false;
|
||||
public boolean useSmart() {
|
||||
return useSmart;
|
||||
}
|
||||
|
||||
public IKAnalyzer(){
|
||||
public void setUseSmart(boolean useSmart) {
|
||||
this.useSmart = useSmart;
|
||||
}
|
||||
|
||||
/**
|
||||
* IK分词器Lucene Analyzer接口实现类
|
||||
*
|
||||
* 默认细粒度切分算法
|
||||
*/
|
||||
public IKAnalyzer(){
|
||||
this(false);
|
||||
}
|
||||
|
||||
|
||||
public IKAnalyzer(boolean isMaxWordLength){
|
||||
/**
|
||||
* IK分词器Lucene Analyzer接口实现类
|
||||
*
|
||||
* @param useSmart 当为true时,分词器进行智能切分
|
||||
*/
|
||||
public IKAnalyzer(boolean useSmart){
|
||||
super();
|
||||
this.setMaxWordLength(isMaxWordLength);
|
||||
this.useSmart = useSmart;
|
||||
}
|
||||
|
||||
public IKAnalyzer(Settings indexSetting,Settings settings1) {
|
||||
super();
|
||||
Dictionary.getInstance().Init(indexSetting);
|
||||
Dictionary.getInstance().Init(indexSetting);
|
||||
|
||||
if(settings1.get("use_smart", "true").equals("true")){
|
||||
useSmart=true;
|
||||
useSmart = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new IKTokenizer(reader , useSmart);
|
||||
}
|
||||
|
||||
public void setMaxWordLength(boolean isMaxWordLength) {
|
||||
this.isMaxWordLength = isMaxWordLength;
|
||||
}
|
||||
|
||||
public boolean isMaxWordLength() {
|
||||
return isMaxWordLength;
|
||||
/**
|
||||
* 重载Analyzer接口,构造分词组件
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
|
||||
Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
|
||||
return new TokenStreamComponents(_IKTokenizer);
|
||||
}
|
||||
|
||||
}
|
||||
|
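The new two-argument constructor is the entry point the plugin uses: it initializes the dictionary from the index settings and then reads the analyzer-level `use_smart` flag, which defaults to `"true"` (so smart mode is on unless explicitly disabled). A hedged sketch of constructing it by hand; `ImmutableSettings` is the 0.90-era settings builder, and the `path.conf` key and paths are assumptions for illustration:

```java
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerSettingsSketch {
    public static void main(String[] args) {
        // Index-level settings: Environment resolves the config dir (where ik/*.dic lives) from these.
        Settings indexSettings = ImmutableSettings.settingsBuilder()
                .put("path.conf", "/path/to/elasticsearch/config")   // assumption, for illustration
                .build();
        // Analyzer settings: "use_smart" = "false" selects fine-grained segmentation.
        Settings analyzerSettings = ImmutableSettings.settingsBuilder()
                .put("use_smart", "false")
                .build();

        IKAnalyzer analyzer = new IKAnalyzer(indexSettings, analyzerSettings);
        System.out.println("smart mode: " + analyzer.useSmart());
    }
}
```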
@ -26,88 +26,89 @@
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
import org.wltea.analyzer.core.IKSegmenter;
|
||||
import org.wltea.analyzer.core.Lexeme;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* IK分词器 Lucene Tokenizer适配器类
|
||||
* 兼容Lucene 4.0版本
|
||||
*/
|
||||
public final class IKTokenizer extends Tokenizer {
|
||||
|
||||
//IK分词器实现
|
||||
private IKSegmenter _IKImplement;
|
||||
//IK分词器实现
|
||||
private IKSegmenter _IKImplement;
|
||||
|
||||
//词元文本属性
|
||||
private final CharTermAttribute termAtt;
|
||||
//词元位移属性
|
||||
private final OffsetAttribute offsetAtt;
|
||||
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
private final TypeAttribute typeAtt;
|
||||
//记录最后一个词元的结束位置
|
||||
private int endPosition;
|
||||
//词元文本属性
|
||||
private final CharTermAttribute termAtt;
|
||||
//词元位移属性
|
||||
private final OffsetAttribute offsetAtt;
|
||||
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
private final TypeAttribute typeAtt;
|
||||
//记录最后一个词元的结束位置
|
||||
private int endPosition;
|
||||
|
||||
/**
|
||||
* Lucene 4.0 Tokenizer适配器类构造函数
|
||||
* @param in
|
||||
* @param useSmart
|
||||
*/
|
||||
public IKTokenizer(Reader in , boolean useSmart){
|
||||
super(in);
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
_IKImplement = new IKSegmenter(input , useSmart);
|
||||
}
|
||||
/**
|
||||
* Lucene 4.0 Tokenizer适配器类构造函数
|
||||
* @param in
|
||||
* @param useSmart
|
||||
*/
|
||||
public IKTokenizer(Reader in , boolean useSmart){
|
||||
super(in);
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
_IKImplement = new IKSegmenter(input , useSmart);
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
//清除所有的词元属性
|
||||
clearAttributes();
|
||||
Lexeme nextLexeme = _IKImplement.next();
|
||||
if(nextLexeme != null){
|
||||
//将Lexeme转成Attributes
|
||||
//设置词元文本
|
||||
termAtt.append(nextLexeme.getLexemeText());
|
||||
//设置词元长度
|
||||
termAtt.setLength(nextLexeme.getLength());
|
||||
//设置词元位移
|
||||
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
|
||||
//记录分词的最后位置
|
||||
endPosition = nextLexeme.getEndPosition();
|
||||
//记录词元分类
|
||||
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
||||
//返会true告知还有下个词元
|
||||
return true;
|
||||
}
|
||||
//返会false告知词元输出完毕
|
||||
return false;
|
||||
}
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
//清除所有的词元属性
|
||||
clearAttributes();
|
||||
Lexeme nextLexeme = _IKImplement.next();
|
||||
if(nextLexeme != null){
|
||||
//将Lexeme转成Attributes
|
||||
//设置词元文本
|
||||
termAtt.append(nextLexeme.getLexemeText());
|
||||
//设置词元长度
|
||||
termAtt.setLength(nextLexeme.getLength());
|
||||
//设置词元位移
|
||||
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
|
||||
//记录分词的最后位置
|
||||
endPosition = nextLexeme.getEndPosition();
|
||||
//记录词元分类
|
||||
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
||||
//返会true告知还有下个词元
|
||||
return true;
|
||||
}
|
||||
//返会false告知词元输出完毕
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
_IKImplement.reset(input);
|
||||
}
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
_IKImplement.reset(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(this.endPosition);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(this.endPosition);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
}
|
||||
|
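Under the Lucene 4.x consumption contract that `IKTokenizer` now follows, callers must `reset()` the stream before the first `incrementToken()`, then call `end()` and `close()` when done. A hedged usage sketch (again assuming the dictionary has already been initialized by the plugin):

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class TokenizerUsageSketch {
    public static void main(String[] args) throws IOException {
        IKTokenizer tokenizer = new IKTokenizer(new StringReader("中文分词测试"), true);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);

        tokenizer.reset();    // required before the first incrementToken() in Lucene 4
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        tokenizer.end();      // writes the final offset recorded in endPosition
        tokenizer.close();
    }
}
```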
File diff suppressed because it is too large
@ -1,153 +1,153 @@
|
||||
///**
|
||||
// * IK 中文分词 版本 5.0
|
||||
// * IK Analyzer release 5.0
|
||||
// *
|
||||
// * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
// * contributor license agreements. See the NOTICE file distributed with
|
||||
// * this work for additional information regarding copyright ownership.
|
||||
// * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
// * (the "License"); you may not use this file except in compliance with
|
||||
// * the License. You may obtain a copy of the License at
|
||||
// *
|
||||
// * http://www.apache.org/licenses/LICENSE-2.0
|
||||
// *
|
||||
// * Unless required by applicable law or agreed to in writing, software
|
||||
// * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// * See the License for the specific language governing permissions and
|
||||
// * limitations under the License.
|
||||
// *
|
||||
// * 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
// * 版权声明 2012,乌龙茶工作室
|
||||
// * provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
// *
|
||||
// */
|
||||
//package org.wltea.analyzer.query;
|
||||
//
|
||||
//import java.io.IOException;
|
||||
//import java.io.StringReader;
|
||||
//import java.util.ArrayList;
|
||||
//import java.util.List;
|
||||
//
|
||||
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
//import org.apache.lucene.queryparser.classic.ParseException;
|
||||
//import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
//import org.apache.lucene.search.Query;
|
||||
//import org.apache.lucene.util.Version;
|
||||
//import org.wltea.analyzer.core.IKSegmenter;
|
||||
//import org.wltea.analyzer.core.Lexeme;
|
||||
//
|
||||
///**
|
||||
// * Single Word Multi Char Query Builder
|
||||
// * IK分词算法专用
|
||||
// * @author linliangyi
|
||||
// *
|
||||
// */
|
||||
//public class SWMCQueryBuilder {
|
||||
//
|
||||
// /**
|
||||
// * 生成SWMCQuery
|
||||
// * @param fieldName
|
||||
// * @param keywords
|
||||
// * @param quickMode
|
||||
// * @return Lucene Query
|
||||
// */
|
||||
// public static Query create(String fieldName ,String keywords , boolean quickMode){
|
||||
// if(fieldName == null || keywords == null){
|
||||
// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
|
||||
// }
|
||||
// //1.对keywords进行分词处理
|
||||
// List<Lexeme> lexemes = doAnalyze(keywords);
|
||||
// //2.根据分词结果,生成SWMCQuery
|
||||
// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
|
||||
// return _SWMCQuery;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 分词切分,并返回结链表
|
||||
// * @param keywords
|
||||
// * @return
|
||||
// */
|
||||
// private static List<Lexeme> doAnalyze(String keywords){
|
||||
// List<Lexeme> lexemes = new ArrayList<Lexeme>();
|
||||
// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
|
||||
// try{
|
||||
// Lexeme l = null;
|
||||
// while( (l = ikSeg.next()) != null){
|
||||
// lexemes.add(l);
|
||||
// }
|
||||
// }catch(IOException e){
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// return lexemes;
|
||||
// }
|
||||
//
|
||||
//
|
||||
// /**
|
||||
// * 根据分词结果生成SWMC搜索
|
||||
// * @param fieldName
|
||||
/**
|
||||
* IK 中文分词 版本 5.0
|
||||
* IK Analyzer release 5.0
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.query;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.wltea.analyzer.core.IKSegmenter;
|
||||
import org.wltea.analyzer.core.Lexeme;
|
||||
|
||||
/**
|
||||
* Single Word Multi Char Query Builder
|
||||
* IK分词算法专用
|
||||
* @author linliangyi
|
||||
*
|
||||
*/
|
||||
public class SWMCQueryBuilder {
|
||||
|
||||
/**
|
||||
* 生成SWMCQuery
|
||||
* @param fieldName
|
||||
* @param keywords
|
||||
* @param quickMode
|
||||
* @return Lucene Query
|
||||
*/
|
||||
public static Query create(String fieldName ,String keywords , boolean quickMode){
|
||||
if(fieldName == null || keywords == null){
|
||||
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
|
||||
}
|
||||
//1.对keywords进行分词处理
|
||||
List<Lexeme> lexemes = doAnalyze(keywords);
|
||||
//2.根据分词结果,生成SWMCQuery
|
||||
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
|
||||
return _SWMCQuery;
|
||||
}
|
||||
|
||||
/**
|
||||
* 分词切分,并返回结链表
|
||||
* @param keywords
|
||||
* @return
|
||||
*/
|
||||
private static List<Lexeme> doAnalyze(String keywords){
|
||||
List<Lexeme> lexemes = new ArrayList<Lexeme>();
|
||||
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
|
||||
try{
|
||||
Lexeme l = null;
|
||||
while( (l = ikSeg.next()) != null){
|
||||
lexemes.add(l);
|
||||
}
|
||||
}catch(IOException e){
|
||||
e.printStackTrace();
|
||||
}
|
||||
return lexemes;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 根据分词结果生成SWMC搜索
|
||||
* @param fieldName
|
||||
// * @param pathOption
|
||||
// * @param quickMode
|
||||
// * @return
|
||||
// */
|
||||
// private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
|
||||
// //构造SWMC的查询表达式
|
||||
// StringBuffer keywordBuffer = new StringBuffer();
|
||||
// //精简的SWMC的查询表达式
|
||||
// StringBuffer keywordBuffer_Short = new StringBuffer();
|
||||
// //记录最后词元长度
|
||||
// int lastLexemeLength = 0;
|
||||
// //记录最后词元结束位置
|
||||
// int lastLexemeEnd = -1;
|
||||
//
|
||||
// int shortCount = 0;
|
||||
// int totalCount = 0;
|
||||
// for(Lexeme l : lexemes){
|
||||
// totalCount += l.getLength();
|
||||
// //精简表达式
|
||||
// if(l.getLength() > 1){
|
||||
// keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
||||
// shortCount += l.getLength();
|
||||
// }
|
||||
//
|
||||
// if(lastLexemeLength == 0){
|
||||
// keywordBuffer.append(l.getLexemeText());
|
||||
// }else if(lastLexemeLength == 1 && l.getLength() == 1
|
||||
// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
|
||||
// keywordBuffer.append(l.getLexemeText());
|
||||
// }else{
|
||||
// keywordBuffer.append(' ').append(l.getLexemeText());
|
||||
//
|
||||
// }
|
||||
// lastLexemeLength = l.getLength();
|
||||
// lastLexemeEnd = l.getEndPosition();
|
||||
// }
|
||||
//
|
||||
// //借助lucene queryparser 生成SWMC Query
|
||||
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
|
||||
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
// qp.setAutoGeneratePhraseQueries(true);
|
||||
//
|
||||
// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
|
||||
// try {
|
||||
// //System.out.println(keywordBuffer.toString());
|
||||
// Query q = qp.parse(keywordBuffer_Short.toString());
|
||||
// return q;
|
||||
// } catch (ParseException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
//
|
||||
// }else{
|
||||
// if(keywordBuffer.length() > 0){
|
||||
// try {
|
||||
// //System.out.println(keywordBuffer.toString());
|
||||
// Query q = qp.parse(keywordBuffer.toString());
|
||||
// return q;
|
||||
// } catch (ParseException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return null;
|
||||
// }
|
||||
//}
|
||||
* @param quickMode
|
||||
* @return
|
||||
*/
|
||||
private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
|
||||
//构造SWMC的查询表达式
|
||||
StringBuffer keywordBuffer = new StringBuffer();
|
||||
//精简的SWMC的查询表达式
|
||||
StringBuffer keywordBuffer_Short = new StringBuffer();
|
||||
//记录最后词元长度
|
||||
int lastLexemeLength = 0;
|
||||
//记录最后词元结束位置
|
||||
int lastLexemeEnd = -1;
|
||||
|
||||
int shortCount = 0;
|
||||
int totalCount = 0;
|
||||
for(Lexeme l : lexemes){
|
||||
totalCount += l.getLength();
|
||||
//精简表达式
|
||||
if(l.getLength() > 1){
|
||||
keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
||||
shortCount += l.getLength();
|
||||
}
|
||||
|
||||
if(lastLexemeLength == 0){
|
||||
keywordBuffer.append(l.getLexemeText());
|
||||
}else if(lastLexemeLength == 1 && l.getLength() == 1
|
||||
&& lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
|
||||
keywordBuffer.append(l.getLexemeText());
|
||||
}else{
|
||||
keywordBuffer.append(' ').append(l.getLexemeText());
|
||||
|
||||
}
|
||||
lastLexemeLength = l.getLength();
|
||||
lastLexemeEnd = l.getEndPosition();
|
||||
}
|
||||
|
||||
//借助lucene queryparser 生成SWMC Query
|
||||
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
|
||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
qp.setAutoGeneratePhraseQueries(true);
|
||||
|
||||
if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
|
||||
try {
|
||||
//System.out.println(keywordBuffer.toString());
|
||||
Query q = qp.parse(keywordBuffer_Short.toString());
|
||||
return q;
|
||||
} catch (ParseException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}else{
|
||||
if(keywordBuffer.length() > 0){
|
||||
try {
|
||||
//System.out.println(keywordBuffer.toString());
|
||||
Query q = qp.parse(keywordBuffer.toString());
|
||||
return q;
|
||||
} catch (ParseException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
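The previously commented-out `SWMCQueryBuilder` is re-enabled above: it segments the keywords with IK, then hands a rebuilt expression to Lucene's `QueryParser` with `AND` as the default operator and auto-generated phrase queries. A minimal call sketch (the field name and keywords are illustrative):

```java
import org.apache.lucene.search.Query;
import org.wltea.analyzer.query.SWMCQueryBuilder;

public class SWMCQuerySketch {
    public static void main(String[] args) {
        // quickMode = true prefers the shortened expression when multi-char lexemes dominate
        Query query = SWMCQueryBuilder.create("content", "中文分词工具包", true);
        System.out.println(query);
    }
}
```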
@ -1,147 +1,147 @@
|
||||
///**
|
||||
// * IK 中文分词 版本 5.0
|
||||
// * IK Analyzer release 5.0
|
||||
// *
|
||||
// * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
// * contributor license agreements. See the NOTICE file distributed with
|
||||
// * this work for additional information regarding copyright ownership.
|
||||
// * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
// * (the "License"); you may not use this file except in compliance with
|
||||
// * the License. You may obtain a copy of the License at
|
||||
// *
|
||||
// * http://www.apache.org/licenses/LICENSE-2.0
|
||||
// *
|
||||
// * Unless required by applicable law or agreed to in writing, software
|
||||
// * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// * See the License for the specific language governing permissions and
|
||||
// * limitations under the License.
|
||||
// *
|
||||
// * 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
// * 版权声明 2012,乌龙茶工作室
|
||||
// * provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
// *
|
||||
// *
|
||||
// */
|
||||
//package org.wltea.analyzer.sample;
|
||||
//
|
||||
//import java.io.IOException;
|
||||
//
|
||||
//import org.apache.lucene.analysis.Analyzer;
|
||||
//import org.apache.lucene.document.Document;
|
||||
//import org.apache.lucene.document.Field;
|
||||
//import org.apache.lucene.document.StringField;
|
||||
//import org.apache.lucene.document.TextField;
|
||||
//import org.apache.lucene.index.CorruptIndexException;
|
||||
//import org.apache.lucene.index.DirectoryReader;
|
||||
//import org.apache.lucene.index.IndexReader;
|
||||
//import org.apache.lucene.index.IndexWriter;
|
||||
//import org.apache.lucene.index.IndexWriterConfig;
|
||||
//import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
//import org.apache.lucene.queryparser.classic.ParseException;
|
||||
//import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
//import org.apache.lucene.search.IndexSearcher;
|
||||
//import org.apache.lucene.search.Query;
|
||||
//import org.apache.lucene.search.ScoreDoc;
|
||||
//import org.apache.lucene.search.TopDocs;
|
||||
//import org.apache.lucene.store.Directory;
|
||||
//import org.apache.lucene.store.LockObtainFailedException;
|
||||
//import org.apache.lucene.store.RAMDirectory;
|
||||
//import org.apache.lucene.util.Version;
|
||||
//import org.wltea.analyzer.lucene.IKAnalyzer;
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
///**
|
||||
// * 使用IKAnalyzer进行Lucene索引和查询的演示
|
||||
// * 2012-3-2
|
||||
// *
|
||||
// * 以下是结合Lucene4.0 API的写法
|
||||
// *
|
||||
// */
|
||||
//public class LuceneIndexAndSearchDemo {
|
||||
//
|
||||
//
|
||||
// /**
|
||||
// * 模拟:
|
||||
// * 创建一个单条记录的索引,并对其进行搜索
|
||||
// * @param args
|
||||
// */
|
||||
// public static void main(String[] args){
|
||||
// //Lucene Document的域名
|
||||
// String fieldName = "text";
|
||||
// //检索内容
|
||||
// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
|
||||
//
|
||||
// //实例化IKAnalyzer分词器
|
||||
// Analyzer analyzer = new IKAnalyzer(true);
|
||||
//
|
||||
// Directory directory = null;
|
||||
// IndexWriter iwriter = null;
|
||||
// IndexReader ireader = null;
|
||||
// IndexSearcher isearcher = null;
|
||||
// try {
|
||||
// //建立内存索引对象
|
||||
// directory = new RAMDirectory();
|
||||
//
|
||||
// //配置IndexWriterConfig
|
||||
// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
|
||||
// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
||||
// iwriter = new IndexWriter(directory , iwConfig);
|
||||
// //写入索引
|
||||
// Document doc = new Document();
|
||||
// doc.add(new StringField("ID", "10000", Field.Store.YES));
|
||||
// doc.add(new TextField(fieldName, text, Field.Store.YES));
|
||||
// iwriter.addDocument(doc);
|
||||
// iwriter.close();
|
||||
//
|
||||
//
|
||||
// //搜索过程**********************************
|
||||
// //实例化搜索器
|
||||
// ireader = DirectoryReader.open(directory);
|
||||
// isearcher = new IndexSearcher(ireader);
|
||||
//
|
||||
// String keyword = "中文分词工具包";
|
||||
// //使用QueryParser查询分析器构造Query对象
|
||||
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
|
||||
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
// Query query = qp.parse(keyword);
|
||||
// System.out.println("Query = " + query);
|
||||
//
|
||||
// //搜索相似度最高的5条记录
|
||||
// TopDocs topDocs = isearcher.search(query , 5);
|
||||
// System.out.println("命中:" + topDocs.totalHits);
|
||||
// //输出结果
|
||||
// ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
||||
// for (int i = 0; i < topDocs.totalHits; i++){
|
||||
// Document targetDoc = isearcher.doc(scoreDocs[i].doc);
|
||||
// System.out.println("内容:" + targetDoc.toString());
|
||||
// }
|
||||
//
|
||||
// } catch (CorruptIndexException e) {
|
||||
// e.printStackTrace();
|
||||
// } catch (LockObtainFailedException e) {
|
||||
// e.printStackTrace();
|
||||
// } catch (IOException e) {
|
||||
// e.printStackTrace();
|
||||
// } catch (ParseException e) {
|
||||
// e.printStackTrace();
|
||||
// } finally{
|
||||
// if(ireader != null){
|
||||
// try {
|
||||
// ireader.close();
|
||||
// } catch (IOException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// }
|
||||
// if(directory != null){
|
||||
// try {
|
||||
// directory.close();
|
||||
// } catch (IOException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
/**
|
||||
* IK 中文分词 版本 5.0
|
||||
* IK Analyzer release 5.0
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.sample;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.wltea.analyzer.lucene.IKAnalyzer;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 使用IKAnalyzer进行Lucene索引和查询的演示
|
||||
* 2012-3-2
|
||||
*
|
||||
* 以下是结合Lucene4.0 API的写法
|
||||
*
|
||||
*/
|
||||
public class LuceneIndexAndSearchDemo {
|
||||
|
||||
|
||||
/**
|
||||
* 模拟:
|
||||
* 创建一个单条记录的索引,并对其进行搜索
|
||||
* @param args
|
||||
*/
|
||||
public static void main(String[] args){
|
||||
//Lucene Document的域名
|
||||
String fieldName = "text";
|
||||
//检索内容
|
||||
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
|
||||
|
||||
//实例化IKAnalyzer分词器
|
||||
Analyzer analyzer = new IKAnalyzer(true);
|
||||
|
||||
Directory directory = null;
|
||||
IndexWriter iwriter = null;
|
||||
IndexReader ireader = null;
|
||||
IndexSearcher isearcher = null;
|
||||
try {
|
||||
//建立内存索引对象
|
||||
directory = new RAMDirectory();
|
||||
|
||||
//配置IndexWriterConfig
|
||||
IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
|
||||
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
||||
iwriter = new IndexWriter(directory , iwConfig);
|
||||
//写入索引
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField("ID", "10000", Field.Store.YES));
|
||||
doc.add(new TextField(fieldName, text, Field.Store.YES));
|
||||
iwriter.addDocument(doc);
|
||||
iwriter.close();
|
||||
|
||||
|
||||
//搜索过程**********************************
|
||||
//实例化搜索器
|
||||
ireader = DirectoryReader.open(directory);
|
||||
isearcher = new IndexSearcher(ireader);
|
||||
|
||||
String keyword = "中文分词工具包";
|
||||
//使用QueryParser查询分析器构造Query对象
|
||||
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
|
||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
Query query = qp.parse(keyword);
|
||||
System.out.println("Query = " + query);
|
||||
|
||||
//搜索相似度最高的5条记录
|
||||
TopDocs topDocs = isearcher.search(query , 5);
|
||||
System.out.println("命中:" + topDocs.totalHits);
|
||||
//输出结果
|
||||
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
||||
for (int i = 0; i < topDocs.totalHits; i++){
|
||||
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
|
||||
System.out.println("内容:" + targetDoc.toString());
|
||||
}
|
||||
|
||||
} catch (CorruptIndexException e) {
|
||||
e.printStackTrace();
|
||||
} catch (LockObtainFailedException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} catch (ParseException e) {
|
||||
e.printStackTrace();
|
||||
} finally{
|
||||
if(ireader != null){
|
||||
try {
|
||||
ireader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
if(directory != null){
|
||||
try {
|
||||
directory.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|