update to 1.2.3

medcl 2013-11-06 10:15:32 +08:00
parent 72718510e4
commit ce6e7fd070
17 changed files with 426837 additions and 144 deletions

View File

@ -6,7 +6,8 @@ The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support
Version
-------------
master | 0.90.0 -> master
master | 0.90.2 -> master
1.2.3 | 0.90.2
1.2.0 | 0.90.0
1.1.3 | 0.20.2
1.1.2 | 0.19.x
@ -35,7 +36,7 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly
<properties>
<comment>IK Analyzer extension configuration</comment>
<!-- Users can configure their own extension dictionaries here -->
<entry key="ext_dict">custom/mydict.dic;custom/sougou.dict</entry>
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<!-- Users can configure their own extension stop word dictionaries here -->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
</properties>

View File

@ -3,7 +3,7 @@
<properties>
<comment>IK Analyzer extension configuration</comment>
<!-- Users can configure their own extension dictionaries here -->
<entry key="ext_dict">custom/mydict.dic</entry>
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<!-- Users can configure their own extension stop word dictionaries here -->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
</properties>

View File

@ -1 +1 @@
medcl
medcl

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

config/ik/custom/sougou.dic (new file, 398716 lines)

File diff suppressed because it is too large

View File

@ -6,7 +6,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-ik</artifactId>
<version>1.2.2</version>
<version>1.2.3</version>
<packaging>jar</packaging>
<description>IK Analyzer for ElasticSearch</description>
<inceptionYear>2009</inceptionYear>
@ -31,7 +31,7 @@
</parent>
<properties>
<elasticsearch.version>0.90.0</elasticsearch.version>
<elasticsearch.version>0.90.2</elasticsearch.version>
</properties>
<repositories>

View File

@ -6,6 +6,8 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
@ -13,18 +15,21 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer
@Inject
public IkAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
Dictionary.initial(new Configuration(settings));
analyzer=new IKAnalyzer(indexSettings,settings);
}
public IkAnalyzerProvider(Index index, Settings indexSettings, String name,
Settings settings) {
super(index, indexSettings, name, settings);
Dictionary.initial(new Configuration(settings));
analyzer=new IKAnalyzer(indexSettings,settings);
}
public IkAnalyzerProvider(Index index, Settings indexSettings,
String prefixSettings, String name, Settings settings) {
super(index, indexSettings, prefixSettings, name, settings);
Dictionary.initial(new Configuration(settings));
analyzer=new IKAnalyzer(indexSettings,settings);
}

View File

@ -6,27 +6,25 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKTokenizer;
import java.io.Reader;
public class IkTokenizerFactory extends AbstractTokenizerFactory {
private boolean useSmart = false;
private Settings settings;
@Inject
public IkTokenizerFactory(Index index,@IndexSettings Settings indexSettings,@Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
Dictionary.getInstance().Init(indexSettings);
if (settings.get("use_smart", "true").equals("true")) {
useSmart = true;
}
this.settings=settings;
Dictionary.initial(new Configuration(settings));
}
@Override
public Tokenizer create(Reader reader) {
return new IKTokenizer(reader, useSmart);
return new IKTokenizer(reader, settings);
}
}

View File

@ -21,16 +21,13 @@ public class Configuration {
private static final String EXT_STOP = "ext_stopwords";
private static ESLogger logger = null;
private Properties props;
/*
* Whether to segment using the smart mode
*/
private boolean useSmart=true;
public Configuration(Settings settings){
private Environment environment;
public Configuration(Settings settings){
logger = Loggers.getLogger("ik-analyzer");
props = new Properties();
Environment environment=new Environment(settings);
environment=new Environment(settings);
File fileConfig= new File(environment.configFile(), FILE_NAME);
InputStream input = null;
@ -42,7 +39,6 @@ public class Configuration {
if(input != null){
try {
props.loadFromXML(input);
logger.info("[Dict Loading] {}",FILE_NAME);
} catch (InvalidPropertiesFormatException e) {
e.printStackTrace();
} catch (IOException e) {
@ -51,7 +47,6 @@ public class Configuration {
}
}
public List<String> getExtDictionarys(){
List<String> extDictFiles = new ArrayList<String>(2);
String extDictCfg = props.getProperty(EXT_DICT);
@ -89,4 +84,8 @@ public class Configuration {
}
return extStopWordDictFiles;
}
public File getDictRoot() {
return environment.configFile();
}
}

View File

@ -90,10 +90,6 @@ class AnalyzeContext {
int getCursor(){
return this.cursor;
}
//
// void setCursor(int cursor){
// this.cursor = cursor;
// }
char[] getSegmentBuff(){
return this.segmentBuff;
@ -115,7 +111,7 @@ class AnalyzeContext {
* Fill segmentBuff according to the current context state
* @param reader
* @return the length of the valid text to be analyzed
* @throws IOException
* @throws java.io.IOException
*/
int fillBuffer(Reader reader) throws IOException{
int readCount = 0;

View File

@ -28,8 +28,8 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.cfg.Configuration;
//import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
/**
@ -53,38 +53,33 @@ public final class IKSegmenter {
/**
* IK segmenter constructor
* @param input
* @param useSmart true to use the smart segmentation strategy
*
* Non-smart segmentation gives fine-grained output of all possible splits
* Smart segmentation merges numerals with quantifiers and disambiguates the results
*/
public IKSegmenter(Reader input , boolean useSmart){
* @param input
*/
public IKSegmenter(Reader input , Settings settings){
this.input = input;
// this.cfg = DefaultConfig.getInstance();
this.useSmart=useSmart;
this.init();
this.cfg = new Configuration(settings);
this.useSmart = settings.get("use_smart", "true").equals("true");
this.init();
}
/**
* IK segmenter constructor
* @param input
* @param cfg build the segmenter with a custom Configuration
*
*/
public IKSegmenter(Reader input , Configuration cfg){
this.input = input;
this.cfg = cfg;
this.init();
}
// /**
// * IK segmenter constructor
// * @param input
// * @param cfg build the segmenter with a custom Configuration
// *
// */
// public IKSegmenter(Reader input , Configuration cfg){
// this.input = input;
// this.cfg = cfg;
// this.init();
// }
/**
* Initialization
*/
private void init(){
//initialize the dictionary singleton
// Dictionary.initial(this.cfg);
// Dictionary.getSingleton();
Dictionary.initial(this.cfg);
//initialize the analysis context
this.context = new AnalyzeContext(useSmart);
//load the sub-segmenters
@ -111,7 +106,7 @@ public final class IKSegmenter {
/**
* Segment the text and get the next lexeme
* @return Lexeme the lexeme object
* @throws IOException
* @throws java.io.IOException
*/
public synchronized Lexeme next()throws IOException{
Lexeme l = null;
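For context, a minimal usage sketch of the new Settings-based constructor shown above. This is an illustrative assumption, not part of this commit: it presumes an elasticsearch config directory containing the ik dictionaries, and that Lexeme exposes getLexemeText(), which this diff does not show.

import java.io.StringReader;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSegmenterDemo {                                    // hypothetical demo class
    public static void main(String[] args) throws Exception {
        // fine-grained vs. smart output is now selected via the "use_smart" setting
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("use_smart", "false")
                .build();
        IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国"), settings);
        Lexeme l;
        while ((l = seg.next()) != null) {
            System.out.println(l.getLexemeText());                // assumed Lexeme accessor
        }
    }
}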

View File

@ -64,8 +64,6 @@ public class Dictionary {
*/
private Configuration configuration;
private ESLogger logger=null;
private static boolean dictInited=false;
private Environment environment;
public static final String PATH_DIC_MAIN = "ik/main.dic";
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
@ -75,33 +73,6 @@ public class Dictionary {
private Dictionary(){
logger = Loggers.getLogger("ik-analyzer");
}
static{
singleton = new Dictionary();
}
// public Configuration getConfig(){
// return configuration;
// }
// private Dictionary(Configuration cfg){
// this.cfg = cfg;
// this.loadMainDict();
// this.loadStopWordDict();
// this.loadQuantifierDict();
// }
public void Init(Settings indexSettings){
if(!dictInited){
environment =new Environment(indexSettings);
configuration=new Configuration(indexSettings);
loadMainDict();
loadSurnameDict();
loadQuantifierDict();
loadSuffixDict();
loadPrepDict();
loadStopWordDict();
dictInited=true;
}
}
/**
* Dictionary initialization
@ -111,17 +82,24 @@ public class Dictionary {
* This method provides a way to initialize the dictionaries during the application loading phase
* @return Dictionary
*/
// public static Dictionary initial(Configuration cfg){
// if(singleton == null){
// synchronized(Dictionary.class){
// if(singleton == null){
// singleton = new Dictionary();
// return singleton;
// }
// }
// }
// return singleton;
// }
public static Dictionary initial(Configuration cfg){
if(singleton == null){
synchronized(Dictionary.class){
if(singleton == null){
singleton = new Dictionary();
singleton.configuration=cfg;
singleton.loadMainDict();
singleton.loadSurnameDict();
singleton.loadQuantifierDict();
singleton.loadSuffixDict();
singleton.loadPrepDict();
singleton.loadStopWordDict();
return singleton;
}
}
}
return singleton;
}
/**
* Get the dictionary singleton instance
@ -151,7 +129,6 @@ public class Dictionary {
/**
* Disable a batch of word entries
* @param words
*/
public void disableWords(Collection<String> words){
if(words != null){
@ -166,7 +143,6 @@ public class Dictionary {
/**
* Match against the main dictionary
* @param charArray
* @return Hit description of the match result
*/
public Hit matchInMainDict(char[] charArray){
@ -175,9 +151,6 @@ public class Dictionary {
/**
* Match against the main dictionary
* @param charArray
* @param begin
* @param length
* @return Hit description of the match result
*/
public Hit matchInMainDict(char[] charArray , int begin, int length){
@ -186,9 +159,6 @@ public class Dictionary {
/**
* Match against the quantifier dictionary
* @param charArray
* @param begin
* @param length
* @return Hit description of the match result
*/
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
@ -198,9 +168,6 @@ public class Dictionary {
/**
* Take the DictSegment from an already matched Hit and continue matching downward
* @param charArray
* @param currentIndex
* @param matchedHit
* @return Hit
*/
public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
@ -211,9 +178,6 @@ public class Dictionary {
/**
* Check whether the characters form a stop word
* @param charArray
* @param begin
* @param length
* @return boolean
*/
public boolean isStopWord(char[] charArray , int begin, int length){
@ -226,8 +190,9 @@ public class Dictionary {
private void loadMainDict(){
//create the main dictionary instance
_MainDict = new DictSegment((char)0);
//read the main dictionary file
File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN);
File file= new File(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
InputStream is = null;
try {
@ -273,8 +238,8 @@ public class Dictionary {
InputStream is = null;
for(String extDictName : extDictFiles){
//read the extension dictionary file
logger.info("Loading extension dictionary: " + extDictName);
File file=new File(environment.configFile(), extDictName);
logger.info("[Dict Loading]" + extDictName);
File file=new File(configuration.getDictRoot(), extDictName);
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
@ -316,17 +281,53 @@ public class Dictionary {
* Load the user's extension stop word dictionaries
*/
private void loadStopWordDict(){
//create a main dictionary instance
//create the main dictionary instance
_StopWords = new DictSegment((char)0);
//read the main dictionary file
File file= new File(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger.error("ik-analyzer",e);
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer",e);
}
}
//load the extension stop word dictionaries
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
if(extStopWordDictFiles != null){
InputStream is = null;
is = null;
for(String extStopWordDictName : extStopWordDictFiles){
// logger.info("加载扩展停止词典:" + extStopWordDictName);
logger.info("[Dict Loading]" + extStopWordDictName);
//读取扩展词典文件
File file=new File(environment.configFile(), extStopWordDictName);
//读取扩展词典文件
file=new File(configuration.getDictRoot(), extStopWordDictName);
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
@ -371,7 +372,7 @@ public class Dictionary {
//create the quantifier dictionary instance
_QuantifierDict = new DictSegment((char)0);
//read the quantifier dictionary file
File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER);
File file=new File(configuration.getDictRoot(),Dictionary.PATH_DIC_QUANTIFIER);
InputStream is = null;
try {
is = new FileInputStream(file);
@ -407,7 +408,7 @@ public class Dictionary {
private void loadSurnameDict(){
_SurnameDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
File file=new File(configuration.getDictRoot(),Dictionary.PATH_DIC_SURNAME);
InputStream is = null;
try {
is = new FileInputStream(file);
@ -426,7 +427,6 @@ public class Dictionary {
_SurnameDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
// logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
} catch (IOException e) {
logger.error("ik-analyzer",e);
}finally{
@ -446,7 +446,7 @@ public class Dictionary {
private void loadSuffixDict(){
_SuffixDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
File file=new File(configuration.getDictRoot(),Dictionary.PATH_DIC_SUFFIX);
InputStream is = null;
try {
is = new FileInputStream(file);
@ -466,15 +466,12 @@ public class Dictionary {
_SuffixDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
// logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
} catch (IOException e) {
logger.error("ik-analyzer",e);
}finally{
try {
if(is != null){
is.close();
is = null;
}
is.close();
is = null;
} catch (IOException e) {
logger.error("ik-analyzer",e);
}
@ -485,7 +482,7 @@ public class Dictionary {
private void loadPrepDict(){
_PrepDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
File file=new File(configuration.getDictRoot(),Dictionary.PATH_DIC_PREP);
InputStream is = null;
try {
is = new FileInputStream(file);
@ -506,23 +503,17 @@ public class Dictionary {
_PrepDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
// logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
} catch (IOException e) {
logger.error("ik-analyzer",e);
}finally{
try {
if(is != null){
is.close();
is = null;
}
is.close();
is = null;
} catch (IOException e) {
logger.error("ik-analyzer",e);
}
}
}
public static Dictionary getInstance(){
return Dictionary.singleton;
}
}
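For reference, a minimal sketch of how the reworked initialization above is meant to be driven; this is an assumption drawn from the diff, not code in this commit. The analysis components build a Configuration from the index settings, Dictionary.initial() loads the main, surname, quantifier, suffix, prep and stop word dictionaries once under double-checked locking, and later callers reuse the same singleton via getInstance().

import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

public class DictionaryBootstrap {                      // hypothetical helper, for illustration only
    public static Dictionary bootstrap(Settings settings) {
        // the first call loads all dictionaries; later calls return the existing singleton
        Dictionary.initial(new Configuration(settings));
        return Dictionary.getInstance();
    }
}

This mirrors what IkAnalyzerProvider and IkTokenizerFactory now do in their constructors, replacing the old Dictionary.getInstance().Init(indexSettings) path.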

View File

@ -66,13 +66,11 @@ public final class IKAnalyzer extends Analyzer{
this.useSmart = useSmart;
}
public IKAnalyzer(Settings indexSetting,Settings settings1) {
super();
Dictionary.getInstance().Init(indexSetting);
Settings settings;
if(settings1.get("use_smart", "true").equals("true")){
useSmart = true;
}
public IKAnalyzer(Settings indexSetting,Settings settings) {
super();
this.settings=settings;
}
/**
@ -80,7 +78,7 @@ public final class IKAnalyzer extends Analyzer{
*/
@Override
protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
Tokenizer _IKTokenizer = new IKTokenizer(in , settings);
return new TokenStreamComponents(_IKTokenizer);
}

View File

@ -34,6 +34,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
@ -58,14 +60,14 @@ public final class IKTokenizer extends Tokenizer {
/**
* Lucene 4.0 Tokenizer adapter class constructor
* @param in
* @param useSmart
*/
public IKTokenizer(Reader in , boolean useSmart){
*/
public IKTokenizer(Reader in , Settings settings){
super(in);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input , useSmart);
_IKImplement = new IKSegmenter(input , settings);
}
/* (non-Javadoc)

View File

@ -34,6 +34,7 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
@ -70,7 +71,8 @@ public class SWMCQueryBuilder {
*/
private static List<Lexeme> doAnalyze(String keywords){
List<Lexeme> lexemes = new ArrayList<Lexeme>();
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
Settings settings=null;
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , settings);
try{
Lexeme l = null;
while( (l = ikSeg.next()) != null){