diff --git a/README.md b/README.md
index aa864f9..be20cb0 100644
--- a/README.md
+++ b/README.md
@@ -230,7 +230,12 @@ mvn compile
 mvn package
 ```
 
-copy & unzip file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip to your elasticsearch's folder: plugins/ik
+copy and unzip the release file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip into your elasticsearch plugin directory, e.g. plugins/ik
+restart elasticsearch
+
+3. The analysis test fails
+Test by calling the analyze API on a concrete index, not by calling the analyze API directly,
+e.g. http://localhost:9200/your_index/_analyze?text=中华人民共和国MN&tokenizer=my_ik
 
 Thanks
diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java
index 4b73fc4..ae60ce3 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java
@@ -1,7 +1,6 @@
 package org.elasticsearch.index.analysis;
 
-@Deprecated
 public class IkAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java
index 797fbd0..e8e1fe2 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java
@@ -10,17 +10,16 @@ import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKAnalyzer;
 
-@Deprecated
 public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
     private final IKAnalyzer analyzer;
-    private boolean useSmart=false;
 
     @Inject
     public IkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettingsService.getSettings(), name, settings);
-        Dictionary.initial(new Configuration(env));
-        useSmart = settings.get("use_smart", "false").equals("true");
-        analyzer=new IKAnalyzer(useSmart);
+
+        Configuration configuration=new Configuration(env,settings);
+
+        analyzer=new IKAnalyzer(configuration);
     }
 
     @Override
     public IKAnalyzer get() {
diff --git a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java
index 6d06892..59b70c1 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java
@@ -8,25 +8,18 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettingsService;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKTokenizer;
 
-@Deprecated
 public class IkTokenizerFactory extends AbstractTokenizerFactory {
-    private final Settings settings;
-    private boolean useSmart=false;
+    private Configuration configuration;
 
    @Inject
    public IkTokenizerFactory(Index index, IndexSettingsService indexSettingsService,Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);
-       this.settings=settings;
-       Dictionary.initial(new Configuration(env));
+       configuration=new Configuration(env,settings);
   }
 
-   @Override
    public Tokenizer create() {
-       this.useSmart = settings.get("use_smart", "false").equals("true");
-
-       return new IKTokenizer(useSmart);  }
+       return new IKTokenizer(configuration);  }
}
"false").equals("true"); - - return new IKTokenizer(useSmart); } + return new IKTokenizer(configuration); } } diff --git a/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java index 231cbbe..96fab83 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java @@ -3,6 +3,7 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.Tokenizer; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.analysis.AnalyzerScope; @@ -26,21 +27,20 @@ public class IKIndicesAnalysis extends AbstractComponent { public IKIndicesAnalysis(final Settings settings, IndicesAnalysisService indicesAnalysisService,Environment env) { super(settings); - Dictionary.initial(new Configuration(env)); - - this.useSmart = settings.get("use_smart", "false").equals("true"); + final Configuration configuration=new Configuration(env,settings).setUseSmart(false); + final Configuration smartConfiguration=new Configuration(env,settings).setUseSmart(true); indicesAnalysisService.analyzerProviderFactories().put("ik", new PreBuiltAnalyzerProviderFactory("ik", AnalyzerScope.GLOBAL, - new IKAnalyzer(useSmart))); + new IKAnalyzer(configuration))); indicesAnalysisService.analyzerProviderFactories().put("ik_smart", new PreBuiltAnalyzerProviderFactory("ik_smart", AnalyzerScope.GLOBAL, - new IKAnalyzer(true))); + new IKAnalyzer(smartConfiguration))); indicesAnalysisService.analyzerProviderFactories().put("ik_max_word", new PreBuiltAnalyzerProviderFactory("ik_max_word", AnalyzerScope.GLOBAL, - new IKAnalyzer(false))); + new IKAnalyzer(configuration))); indicesAnalysisService.tokenizerFactories().put("ik", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { @@ -51,7 +51,7 @@ public class IKIndicesAnalysis extends AbstractComponent { @Override public Tokenizer create() { - return new IKTokenizer(false); + return new IKTokenizer(configuration); } })); @@ -64,7 +64,7 @@ public class IKIndicesAnalysis extends AbstractComponent { @Override public Tokenizer create() { - return new IKTokenizer(true); + return new IKTokenizer(smartConfiguration); } })); @@ -77,8 +77,8 @@ public class IKIndicesAnalysis extends AbstractComponent { @Override public Tokenizer create() { - return new IKTokenizer(false); + return new IKTokenizer(configuration); } })); } -} \ No newline at end of file +} diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index 1c5ef56..a58f46a 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -7,8 +7,10 @@ import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.io.PathUtils; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin; +import org.wltea.analyzer.dic.Dictionary; import java.io.*; import java.net.URL; @@ -20,132 +22,61 @@ import java.util.Properties; public class Configuration { - private static 
String FILE_NAME = "IKAnalyzer.cfg.xml"; - private static final String EXT_DICT = "ext_dict"; - private static final String REMOTE_EXT_DICT = "remote_ext_dict"; - private static final String EXT_STOP = "ext_stopwords"; - private static final String REMOTE_EXT_STOP = "remote_ext_stopwords"; - private static ESLogger logger = Loggers.getLogger("ik-analyzer"); - private Path conf_dir; - private Properties props; private Environment environment; + private Settings settings; + + //是否启用智能分词 + private boolean useSmart; + + //是否启用远程词典加载 + private boolean enableRemoteDict=false; + + //是否启用小写处理 + private boolean enableLowercase=true; + @Inject - public Configuration(Environment env) { - props = new Properties(); - environment = env; + public Configuration(Environment env,Settings settings) { + this.environment = env; + this.settings=settings; - conf_dir = environment.configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME); - Path configFile = conf_dir.resolve(FILE_NAME); + this.useSmart = settings.get("use_smart", "false").equals("true"); + this.enableLowercase = settings.get("enable_lowercase", "true").equals("true"); + this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true"); + + Dictionary.initial(this); - InputStream input = null; - try { - logger.info("try load config from {}", configFile); - input = new FileInputStream(configFile.toFile()); - } catch (FileNotFoundException e) { - conf_dir = this.getConfigInPluginDir(); - configFile = conf_dir.resolve(FILE_NAME); - try { - logger.info("try load config from {}", configFile); - input = new FileInputStream(configFile.toFile()); - } catch (FileNotFoundException ex) { - // We should report origin exception - logger.error("ik-analyzer", e); - } - } - if (input != null) { - try { - props.loadFromXML(input); - } catch (InvalidPropertiesFormatException e) { - logger.error("ik-analyzer", e); - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } } - public List getExtDictionarys() { - List extDictFiles = new ArrayList(2); - String extDictCfg = props.getProperty(EXT_DICT); - if (extDictCfg != null) { - - String[] filePaths = extDictCfg.split(";"); - if (filePaths != null) { - for (String filePath : filePaths) { - if (filePath != null && !"".equals(filePath.trim())) { - Path file = PathUtils.get(filePath.trim()); - extDictFiles.add(file.toString()); - - } - } - } - } - return extDictFiles; - } - - public List getRemoteExtDictionarys() { - List remoteExtDictFiles = new ArrayList(2); - String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT); - if (remoteExtDictCfg != null) { - - String[] filePaths = remoteExtDictCfg.split(";"); - if (filePaths != null) { - for (String filePath : filePaths) { - if (filePath != null && !"".equals(filePath.trim())) { - remoteExtDictFiles.add(filePath); - - } - } - } - } - return remoteExtDictFiles; - } - - public List getExtStopWordDictionarys() { - List extStopWordDictFiles = new ArrayList(2); - String extStopWordDictCfg = props.getProperty(EXT_STOP); - if (extStopWordDictCfg != null) { - - String[] filePaths = extStopWordDictCfg.split(";"); - if (filePaths != null) { - for (String filePath : filePaths) { - if (filePath != null && !"".equals(filePath.trim())) { - Path file = PathUtils.get(filePath.trim()); - extStopWordDictFiles.add(file.toString()); - - } - } - } - } - return extStopWordDictFiles; - } - - public List getRemoteExtStopWordDictionarys() { - List remoteExtStopWordDictFiles = new ArrayList(2); - String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP); - if 
diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
index 1cbe0c4..a1da96f 100644
--- a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
+++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
@@ -32,6 +32,7 @@ import java.util.LinkedList;
 import java.util.Map;
 import java.util.Set;
 
+import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;
 
 /**
@@ -72,12 +73,11 @@ class AnalyzeContext {
     private Map<Integer, LexemePath> pathMap;
     // final set of segmentation results
     private LinkedList<Lexeme> results;
-    private boolean useSmart;
     // analyzer configuration
-//  private Configuration cfg;
+    private Configuration cfg;
 
-    public AnalyzeContext(boolean useSmart){
-        this.useSmart = useSmart;
+    public AnalyzeContext(Configuration configuration){
+        this.cfg = configuration;
         this.segmentBuff = new char[BUFF_SIZE];
         this.charTypes = new int[BUFF_SIZE];
         this.buffLocker = new HashSet<String>();
@@ -139,7 +139,7 @@ class AnalyzeContext {
      */
     void initCursor(){
         this.cursor = 0;
-        this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+        this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
         this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
     }
 
@@ -151,7 +151,7 @@ class AnalyzeContext {
     boolean moveCursor(){
         if(this.cursor < this.available - 1){
             this.cursor++;
-            this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+            this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
             this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
             return true;
         }else{
@@ -345,7 +345,7 @@ class AnalyzeContext {
      */
     private void compound(Lexeme result){
 
-        if(!this.useSmart){
+        if(!this.cfg.isUseSmart()){
             return ;
         }
         // merge numerals with classifiers
diff --git a/src/main/java/org/wltea/analyzer/core/CharacterUtil.java b/src/main/java/org/wltea/analyzer/core/CharacterUtil.java
index bfa8b1a..cc687ce 100644
--- a/src/main/java/org/wltea/analyzer/core/CharacterUtil.java
+++ b/src/main/java/org/wltea/analyzer/core/CharacterUtil.java
@@ -86,14 +86,14 @@ class CharacterUtil {
      * @param input
      * @return char
      */
-    static char regularize(char input){
+    static char regularize(char input,boolean lowercase){
         if (input == 12288) {
             input = (char) 32;
 
         }else if (input > 65280 && input < 65375) {
             input = (char) (input - 65248);
 
-        }else if (input >= 'A' && input <= 'Z') {
+        }else if (input >= 'A' && input <= 'Z' && lowercase) {
             input += 32;
         }
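
Note: the `regularize` change above threads the new lowercase flag through character normalization. The full-width space and full-width forms are still folded to ASCII unconditionally; only A-Z lowercasing is now conditional. A standalone restatement of the patched logic (the real method is package-private in `org.wltea.analyzer.core`):

```
static char regularize(char input, boolean lowercase) {
    if (input == 12288) {                        // full-width space -> ' '
        input = (char) 32;
    } else if (input > 65280 && input < 65375) { // full-width forms -> ASCII
        input = (char) (input - 65248);
    } else if (input >= 'A' && input <= 'Z' && lowercase) {
        input += 32;                             // lowercase only when enabled
    }
    return input;
}
// regularize('A', true) == 'a', regularize('A', false) == 'A'.
// A full-width 'Ａ' (U+FF21) folds to 'A' in either case, because the
// branches are exclusive: folding and lowercasing never combine in one call.
```
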
diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
index b813101..789a3a6 100644
--- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
@@ -23,10 +23,7 @@
  */
 package org.wltea.analyzer.core;
 
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -47,16 +44,16 @@ public final class IKSegmenter {
     private List<ISegmenter> segmenters;
     // segmentation ambiguity arbitrator
     private IKArbitrator arbitrator;
-    private boolean useSmart = false;
+    private Configuration configuration;
 
     /**
      * IK segmenter constructor
      * @param input
      */
-    public IKSegmenter(Reader input ,boolean useSmart){
+    public IKSegmenter(Reader input ,Configuration configuration){
         this.input = input;
-        this.useSmart = useSmart;
+        this.configuration = configuration;
         this.init();
     }
 
@@ -66,7 +63,7 @@ public final class IKSegmenter {
      */
     private void init(){
         // initialize the analysis context
-        this.context = new AnalyzeContext(useSmart);
+        this.context = new AnalyzeContext(configuration);
         // load the sub-segmenters
         this.segmenters = this.loadSegmenters();
         // load the ambiguity arbitrator
@@ -127,7 +124,7 @@ public final class IKSegmenter {
             }
         }
         // resolve segmentation ambiguities
-        this.arbitrator.process(context, useSmart);
+        this.arbitrator.process(context, configuration.isUseSmart());
         // write the results to the result set, handling any unsegmented single CJK characters
         context.outputToResult();
         // record the buffer offset for this segmentation pass
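
Note: with `IKSegmenter` now taking a `Configuration`, direct (non-Lucene) use of the segmenter would look roughly like the sketch below; it assumes `cfg` was built as shown earlier and relies on the existing `next()` contract of returning `null` at end of input:

```
// Sketch only; method names follow the existing IKSegmenter/Lexeme API.
IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国"), cfg);
Lexeme lex;
while ((lex = seg.next()) != null) {
    System.out.println(lex.getBeginPosition() + "-" + lex.getEndPosition()
            + " : " + lex.getLexemeText());
}
```
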
diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
index 0b4ac70..d36c8d1 100644
--- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java
+++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -33,9 +33,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
+import java.util.*;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
@@ -49,6 +47,7 @@ import org.apache.http.impl.client.HttpClients;
 import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
 
@@ -88,10 +87,53 @@ public class Dictionary {
 	public static final String PATH_DIC_PREP = "preposition.dic";
 	public static final String PATH_DIC_STOP = "stopword.dic";
 
-	private Dictionary() {
+	private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
+	private final static String EXT_DICT = "ext_dict";
+	private final static String REMOTE_EXT_DICT = "remote_ext_dict";
+	private final static String EXT_STOP = "ext_stopwords";
+	private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
 
+	private Path conf_dir;
+	private Properties props;
+
+	private Dictionary(Configuration cfg) {
+		this.configuration = cfg;
+		this.props = new Properties();
+		this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
+		Path configFile = conf_dir.resolve(FILE_NAME);
+
+		InputStream input = null;
+		try {
+			logger.info("try load config from {}", configFile);
+			input = new FileInputStream(configFile.toFile());
+		} catch (FileNotFoundException e) {
+			conf_dir = cfg.getConfigInPluginDir();
+			configFile = conf_dir.resolve(FILE_NAME);
+			try {
+				logger.info("try load config from {}", configFile);
+				input = new FileInputStream(configFile.toFile());
+			} catch (FileNotFoundException ex) {
+				// We should report origin exception
+				logger.error("ik-analyzer", e);
+			}
+		}
+		if (input != null) {
+			try {
+				props.loadFromXML(input);
+			} catch (InvalidPropertiesFormatException e) {
+				logger.error("ik-analyzer", e);
+			} catch (IOException e) {
+				logger.error("ik-analyzer", e);
+			}
+		}
 	}
 
+	public String getProperty(String key){
+		if(props!=null){
+			return props.getProperty(key);
+		}
+		return null;
+	}
 	/**
 	 * Dictionary initialization. Because IK Analyzer initializes its dictionaries via
 	 * static methods on the Dictionary class, the dictionaries are only loaded when the
 	 * class is actually used, which lengthens the first segmentation call. This method
 	 * gives applications a way to initialize the dictionaries during their own startup.
 	 */
@@ -102,8 +144,8 @@ public class Dictionary {
 		if (singleton == null) {
 			synchronized (Dictionary.class) {
 				if (singleton == null) {
-					singleton = new Dictionary();
-					singleton.configuration = cfg;
+
+					singleton = new Dictionary(cfg);
 					singleton.loadMainDict();
 					singleton.loadSurnameDict();
 					singleton.loadQuantifierDict();
@@ -111,13 +153,15 @@ public class Dictionary {
 					singleton.loadPrepDict();
 					singleton.loadStopWordDict();
 
-					// start the monitor threads
-					for (String location : cfg.getRemoteExtDictionarys()) {
-						// initial delay 10 seconds (adjustable); interval 60 seconds
-						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
-					}
-					for (String location : cfg.getRemoteExtStopWordDictionarys()) {
-						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+					if(cfg.isEnableRemoteDict()){
+						// start the monitor threads
+						for (String location : singleton.getRemoteExtDictionarys()) {
+							// initial delay 10 seconds (adjustable); interval 60 seconds
+							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+						}
+						for (String location : singleton.getRemoteExtStopWordDictionarys()) {
+							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+						}
 					}
 
 					return singleton;
@@ -127,6 +171,77 @@ public class Dictionary {
 		return singleton;
 	}
 
+	public List<String> getExtDictionarys() {
+		List<String> extDictFiles = new ArrayList<String>(2);
+		String extDictCfg = getProperty(EXT_DICT);
+		if (extDictCfg != null) {
+
+			String[] filePaths = extDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					Path file = PathUtils.get(filePath.trim());
+					extDictFiles.add(file.toString());
+
+				}
+			}
+		}
+		return extDictFiles;
+	}
+
+	public List<String> getRemoteExtDictionarys() {
+		List<String> remoteExtDictFiles = new ArrayList<String>(2);
+		String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
+		if (remoteExtDictCfg != null) {
+
+			String[] filePaths = remoteExtDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					remoteExtDictFiles.add(filePath);
+
+				}
+			}
+		}
+		return remoteExtDictFiles;
+	}
+
+	public List<String> getExtStopWordDictionarys() {
+		List<String> extStopWordDictFiles = new ArrayList<String>(2);
+		String extStopWordDictCfg = getProperty(EXT_STOP);
+		if (extStopWordDictCfg != null) {
+
+			String[] filePaths = extStopWordDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					Path file = PathUtils.get(filePath.trim());
+					extStopWordDictFiles.add(file.toString());
+
+				}
+			}
+		}
+		return extStopWordDictFiles;
+	}
+
+	public List<String> getRemoteExtStopWordDictionarys() {
+		List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
+		String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
+		if (remoteExtStopWordDictCfg != null) {
+
+			String[] filePaths = remoteExtStopWordDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					remoteExtStopWordDictFiles.add(filePath);
+
+				}
+			}
+		}
+		return remoteExtStopWordDictFiles;
+	}
+
+	public String getDictRoot() {
+		return conf_dir.toAbsolutePath().toString();
+	}
+
+
 	/**
 	 * Get the Dictionary singleton instance
 	 *
@@ -139,6 +254,7 @@ public class Dictionary {
 		return singleton;
 	}
 
+
 	/**
 	 * Batch-load new word entries
 	 *
@@ -224,7 +340,7 @@ public class Dictionary {
 		_MainDict = new DictSegment((char) 0);
 
 		// read the main dictionary file
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
 
 		InputStream is = null;
 		try {
@@ -267,13 +383,13 @@ public class Dictionary {
 	 */
 	private void loadExtDict() {
 		// load the extension dictionary configuration
-		List<String> extDictFiles = configuration.getExtDictionarys();
+		List<String> extDictFiles = getExtDictionarys();
 		if (extDictFiles != null) {
 			InputStream is = null;
 			for (String extDictName : extDictFiles) {
 				// read the extension dictionary file
 				logger.info("[Dict Loading] " + extDictName);
-				Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
+				Path file = PathUtils.get(getDictRoot(), extDictName);
 				try {
 					is = new FileInputStream(file.toFile());
 				} catch (FileNotFoundException e) {
@@ -315,7 +431,7 @@ public class Dictionary {
 	 * Load the remote extension dictionaries into the main lexicon
 	 */
 	private void loadRemoteExtDict() {
-		List<String> remoteExtDictFiles = configuration.getRemoteExtDictionarys();
+		List<String> remoteExtDictFiles = getRemoteExtDictionarys();
 		for (String location : remoteExtDictFiles) {
 			logger.info("[Dict Loading] " + location);
 			List<String> lists = getRemoteWords(location);
@@ -386,7 +502,7 @@ public class Dictionary {
 		_StopWords = new DictSegment((char) 0);
 
 		// read the main stopword dictionary file
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
 
 		InputStream is = null;
 		try {
@@ -420,14 +536,14 @@ public class Dictionary {
 		}
 
 		// load the extension stopword dictionaries
-		List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
+		List<String> extStopWordDictFiles = getExtStopWordDictionarys();
 		if (extStopWordDictFiles != null) {
 			is = null;
 			for (String extStopWordDictName : extStopWordDictFiles) {
 				logger.info("[Dict Loading] " + extStopWordDictName);
 
 				// read the extension dictionary file
-				file = PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
+				file = PathUtils.get(getDictRoot(), extStopWordDictName);
 				try {
 					is = new FileInputStream(file.toFile());
 				} catch (FileNotFoundException e) {
@@ -465,7 +581,7 @@ public class Dictionary {
 		}
 
 		// load the remote stopword dictionaries
-		List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
+		List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
 		for (String location : remoteExtStopWordDictFiles) {
 			logger.info("[Dict Loading] " + location);
 			List<String> lists = getRemoteWords(location);
@@ -492,7 +608,7 @@ public class Dictionary {
 		// build a quantifier dictionary instance
 		_QuantifierDict = new DictSegment((char) 0);
 		// read the quantifier dictionary file
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
 		InputStream is = null;
 		try {
 			is = new FileInputStream(file.toFile());
@@ -527,7 +643,7 @@ public class Dictionary {
 	private void loadSurnameDict() {
 
 		_SurnameDict = new DictSegment((char) 0);
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
 		InputStream is = null;
 		try {
 			is = new FileInputStream(file.toFile());
@@ -563,7 +679,7 @@ public class Dictionary {
 	private void loadSuffixDict() {
 
 		_SuffixDict = new DictSegment((char) 0);
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
 		InputStream is = null;
 		try {
 			is = new FileInputStream(file.toFile());
@@ -598,7 +714,7 @@ public class Dictionary {
 	private void loadPrepDict() {
 
 		_PrepDict = new DictSegment((char) 0);
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
 		InputStream is = null;
 		try {
 			is = new FileInputStream(file.toFile());
@@ -634,7 +750,7 @@ public class Dictionary {
 	public void reLoadMainDict() {
 		logger.info("重新加载词典...");
 		// load the dictionaries in a fresh instance, so reloading has less impact on the dictionary in use
-		Dictionary tmpDict = new Dictionary();
+		Dictionary tmpDict = new Dictionary(configuration);
 		tmpDict.configuration = getSingleton().configuration;
 		tmpDict.loadMainDict();
 		tmpDict.loadStopWordDict();
@@ -643,4 +759,4 @@ public class Dictionary {
 		logger.info("重新加载词典完毕...");
 	}
 
-}
\ No newline at end of file
+}
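
Note: the `IKAnalyzer.cfg.xml` parsing that used to live in `Configuration` now happens in the `Dictionary` constructor, and the remote-dictionary monitor threads are only scheduled when `enable_remote_dict` is true. A hedged sketch of the resulting call surface:

```
// Initialization is normally implicit (the Configuration constructor calls
// Dictionary.initial(this)); afterwards the singleton can be queried:
Dictionary dict = Dictionary.getSingleton();
List<String> extDicts   = dict.getExtDictionarys();        // "ext_dict" entries
List<String> remoteExts = dict.getRemoteExtDictionarys();  // polled every 60s when enabled
String dictRoot = dict.getDictRoot();                      // config dir actually in use
```
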
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
index 4bfe50b..6067c4b 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
@@ -26,6 +26,7 @@ package org.wltea.analyzer.lucene;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.wltea.analyzer.cfg.Configuration;
 
 /**
  * IK analyzer, implementation of the Lucene Analyzer interface
@@ -33,15 +34,7 @@ import org.apache.lucene.analysis.Tokenizer;
 public final class IKAnalyzer extends Analyzer{
 
-	private boolean useSmart;
-
-	public boolean useSmart() {
-		return useSmart;
-	}
-
-	public void setUseSmart(boolean useSmart) {
-		this.useSmart = useSmart;
-	}
+	private Configuration configuration;
 
 	/**
 	 * IK analyzer, Lucene Analyzer interface implementation class
@@ -54,11 +47,11 @@ public final class IKAnalyzer extends Analyzer{
 	/**
 	 * IK analyzer, Lucene Analyzer interface implementation class
 	 *
-	 * @param useSmart when true, the analyzer uses smart segmentation
+	 * @param configuration IK configuration
 	 */
-	public IKAnalyzer(boolean useSmart){
+	public IKAnalyzer(Configuration configuration){
 		super();
-		this.useSmart = useSmart;
+		this.configuration = configuration;
 	}
 
 
@@ -67,7 +60,7 @@ public final class IKAnalyzer extends Analyzer{
 	 */
 	@Override
 	protected TokenStreamComponents createComponents(String fieldName) {
-		Tokenizer _IKTokenizer = new IKTokenizer(useSmart);
+		Tokenizer _IKTokenizer = new IKTokenizer(configuration);
 		return new TokenStreamComponents(_IKTokenizer);
 	}
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
index 00b86a7..42d380a 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.core.IKSegmenter;
 import org.wltea.analyzer.core.Lexeme;
 
@@ -64,16 +65,15 @@ public final class IKTokenizer extends Tokenizer {
 
 	/**
 	 * Lucene 4.0 Tokenizer adapter class constructor
-	 * @param in
 	 */
-	public IKTokenizer(boolean useSmart){
+	public IKTokenizer(Configuration configuration){
 		super();
 		offsetAtt = addAttribute(OffsetAttribute.class);
 		termAtt = addAttribute(CharTermAttribute.class);
 		typeAtt = addAttribute(TypeAttribute.class);
 		posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-		_IKImplement = new IKSegmenter(input,useSmart);
+		_IKImplement = new IKSegmenter(input,configuration);
 	}
 
 	/* (non-Javadoc)
diff --git a/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java b/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java
deleted file mode 100644
index d22fe3c..0000000
--- a/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * IK Chinese Analyzer, version 5.0.1
- * IK Analyzer release 5.0.1
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Source code provided by Linliangyi (linliangyi2005@gmail.com)
- * Copyright 2012, Oolong Studio
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- *
- */
-package org.wltea.analyzer.sample;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-import org.wltea.analyzer.lucene.IKAnalyzer;
-
-/**
- * Demo of segmentation with IKAnalyzer
- * 2012-10-22
- *
- */
-public class IKAnalzyerDemo {
-
-	public static ESLogger logger= Loggers.getLogger("ik-analyzer");
-
-	public static void main(String[] args){
-		// build the IK analyzer, using smart segmentation mode
-		Analyzer analyzer = new IKAnalyzer(true);
-
-		// obtain Lucene's TokenStream object
-		TokenStream ts = null;
-		try {
-			ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATAHELLO"));
-//			ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
-			// token offset attribute
-			OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
-			// token text attribute
-			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
-			// token type attribute
-			TypeAttribute type = ts.addAttribute(TypeAttribute.class);
-
-
-			// reset the TokenStream (resets the StringReader)
-			ts.reset();
-			// iterate over the segmentation results
-			while (ts.incrementToken()) {
-				System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
-			}
-			// close the TokenStream (closes the StringReader)
-			ts.end();   // Perform end-of-stream operations, e.g. set the final offset.
-
-		} catch (IOException e) {
-			logger.error(e.getMessage(), e);
-		} finally {
-			// release all TokenStream resources
-			if(ts != null){
-				try {
-					ts.close();
-				} catch (IOException e) {
-					logger.error(e.getMessage(), e);
-				}
-			}
-		}
-
-	}
-
-}
\ No newline at end of file
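
Note: the deleted IKAnalzyerDemo above called `new IKAnalyzer(true)`, which no longer compiles against this patch. For reference, a minimal re-sketch of the same walkthrough using the new API; the `env`/`settings` parameters are assumptions standing in for however the host application obtains them:

```
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerConfigDemo {

    // env and settings are assumed to come from the running node.
    public static void run(Environment env, Settings settings) throws Exception {
        // smart mode is now toggled on the Configuration object
        Configuration cfg = new Configuration(env, settings).setUseSmart(true);
        Analyzer analyzer = new IKAnalyzer(cfg);

        TokenStream ts = analyzer.tokenStream("myfield",
                new StringReader("WORLD ,.. html DATAHELLO"));
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term);
        }
        ts.end();    // perform end-of-stream operations, e.g. set the final offset
        ts.close();
    }
}
```
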
diff --git a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
deleted file mode 100644
index 70bd7a5..0000000
--- a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/**
- * IK Chinese Analyzer, version 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Source code provided by Linliangyi (linliangyi2005@gmail.com)
- * Copyright 2012, Oolong Studio
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- *
- */
-package org.wltea.analyzer.sample;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.queryparser.classic.ParseException;
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-import org.wltea.analyzer.lucene.IKAnalyzer;
-
-
-
-
-/**
- * Demo of Lucene indexing and search using IKAnalyzer
- * 2012-3-2
- *
- * Written against the Lucene 4.0 API
- *
- */
-public class LuceneIndexAndSearchDemo {
-
-	public static ESLogger logger= Loggers.getLogger("ik-analyzer");
-
-	/**
-	 * Simulation:
-	 * build an index containing a single record, then search it
-	 * @param args
-	 */
-	public static void main(String[] args){
-		// Lucene document field name
-		String fieldName = "text";
-		// content to index
-		String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
-
-		// instantiate the IK analyzer (smart mode)
-		Analyzer analyzer = new IKAnalyzer(true);
-
-		Directory directory = null;
-		IndexWriter iwriter = null;
-		IndexReader ireader = null;
-		IndexSearcher isearcher = null;
-		try {
-			// build an in-memory index
-			directory = new RAMDirectory();
-
-			// configure the IndexWriterConfig
-			IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
-			iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
-			iwriter = new IndexWriter(directory , iwConfig);
-			// write the index
-			Document doc = new Document();
-			doc.add(new StringField("ID", "10000", Field.Store.YES));
-			doc.add(new TextField(fieldName, text, Field.Store.YES));
-			iwriter.addDocument(doc);
-			iwriter.close();
-
-
-			// search ************************************
-			// instantiate the searcher
-			ireader = DirectoryReader.open(directory);
-			isearcher = new IndexSearcher(ireader);
-
-			String keyword = "中文分词工具包";
-			// build the Query object with the QueryParser query analyzer
-			QueryParser qp = new QueryParser(fieldName, analyzer);
-			qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-			Query query = qp.parse(keyword);
-			System.out.println("Query = " + query);
-
-			// fetch the 5 highest-scoring records
-			TopDocs topDocs = isearcher.search(query , 5);
-			System.out.println("命中:" + topDocs.totalHits);
-			// print the results
-			ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-			for (int i = 0; i < topDocs.totalHits; i++){
-				Document targetDoc = isearcher.doc(scoreDocs[i].doc);
-				System.out.println("内容:" + targetDoc.toString());
-			}
-
-		} catch (CorruptIndexException e) {
-			logger.error(e.getMessage(), e);
-		} catch (LockObtainFailedException e) {
-			logger.error(e.getMessage(), e);
-		} catch (IOException e) {
-			logger.error(e.getMessage(), e);
-		} catch (ParseException e) {
-			logger.error(e.getMessage(), e);
-		} finally{
-			if(ireader != null){
-				try {
-					ireader.close();
-				} catch (IOException e) {
-					logger.error(e.getMessage(), e);
-				}
-			}
-			if(directory != null){
-				try {
-					directory.close();
-				} catch (IOException e) {
-					logger.error(e.getMessage(), e);
-				}
-			}
-		}
-	}
-}
\ No newline at end of file
diff --git a/src/main/uml/IKAnalysisBinderProcessor.uml b/src/main/uml/IKAnalysisBinderProcessor.uml
deleted file mode 100644
index f2baca0..0000000
--- a/src/main/uml/IKAnalysisBinderProcessor.uml
+++ /dev/null
@@ -1,83 +0,0 @@
-<!-- IDE UML class diagram (JAVA) of org.elasticsearch.index.analysis.IKAnalysisBinderProcessor
-     and the AnalysisModule.AnalysisBinderProcessor binding classes (AnalyzersBindings,
-     TokenizersBindings, TokenFiltersBindings, CharFiltersBindings), plus diagram display
-     settings for Fields, Methods, Constructors, Inner Classes and Properties. -->