From 3d47fa602139a70b6aa4d83b8400176d24027fdc Mon Sep 17 00:00:00 2001 From: medcl Date: Sat, 31 Oct 2015 20:59:13 +0800 Subject: [PATCH] update to support es 2.0 --- README.md | 111 ++++-------------- pom.xml | 26 ++-- src/main/assemblies/plugin.xml | 7 ++ src/main/config/ik.yaml | 0 .../analysis/IkAnalysisBinderProcessor.java | 11 +- .../index/analysis/IkAnalyzerProvider.java | 7 +- .../index/analysis/IkTokenizerFactory.java | 18 ++- .../indices/analysis/IKIndicesAnalysis.java | 78 ++++++++++++ .../analysis/IKIndicesAnalysisModule.java | 32 +++++ .../plugin/analysis/ik/AnalysisIkPlugin.java | 35 +++++- .../org/wltea/analyzer/cfg/Configuration.java | 23 ++-- .../org/wltea/analyzer/core/IKSegmenter.java | 26 +--- .../org/wltea/analyzer/lucene/IKAnalyzer.java | 30 ++--- .../wltea/analyzer/lucene/IKTokenizer.java | 7 +- .../analyzer/query/SWMCQueryBuilder.java | 4 +- .../sample/LuceneIndexAndSearchDemo.java | 4 +- src/main/resources/es-plugin.properties | 2 - .../resources/plugin-descriptor.properties | 80 +++++++++++++ 18 files changed, 310 insertions(+), 191 deletions(-) create mode 100644 src/main/config/ik.yaml create mode 100644 src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java create mode 100644 src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysisModule.java delete mode 100644 src/main/resources/es-plugin.properties create mode 100644 src/main/resources/plugin-descriptor.properties diff --git a/README.md b/README.md index ab9b7f3..a7f8eeb 100644 --- a/README.md +++ b/README.md @@ -3,16 +3,15 @@ IK Analysis for ElasticSearch The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary. 
-Tokenizer: `ik` - -更新:对于使用 ES 集群,用 IK 作为分词插件,经常会修改自定义词典的使用者,可以透过远程加载的方式,每次更新都会重新加载词典,不必重启 ES 服务。 +Analyzer: `ik_smart`, `ik_max_word`; Tokenizer: `ik_smart`, `ik_max_word` Versions -------- IK version | ES version -----------|----------- -master | 1.5.0 -> master +master | 2.0.0 -> master +1.5.0 | 2.0.0 1.4.1 | 1.7.2 1.4.0 | 1.6.0 1.3.0 | 1.5.0 @@ -30,108 +29,42 @@ master | 1.5.0 -> master Install ------- -you can download this plugin from RTF project(https://github.com/medcl/elasticsearch-rtf) -https://github.com/medcl/elasticsearch-rtf/tree/master/plugins/analysis-ik -https://github.com/medcl/elasticsearch-rtf/tree/master/config/ik +1.compile -also remember to download the dict files,unzip these dict file into your elasticsearch's config folder,such as: your-es-root/config/ik +`mvn package` -you need a service restart after that! +copy and unzip `target/release/ik**.zip` to `your-es-root/plugins/ik` -Configuration -------------- +2.config files: -### Analysis Configuration +download the dict files and unzip them into your Elasticsearch config folder, for example: `your-es-root/config/ik` -#### `elasticsearch.yml` +3.restart elasticsearch -```yaml -index: - analysis: - analyzer: - ik: - alias: [ik_analyzer] - type: org.elasticsearch.index.analysis.IkAnalyzerProvider - ik_max_word: - type: ik - use_smart: false - ik_smart: - type: ik - use_smart: true -``` +Tips: -Or +ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合; -```yaml -index.analysis.analyzer.ik.type : "ik" -``` +ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。 -#### 以上两种配置方式的区别: - -1、第二种方式,只定义了一个名为 ik 的 analyzer,其 use_smart 采用默认值 false - -2、第一种方式,定义了三个 analyzer,分别为:ik、ik_max_word、ik_smart,其中 ik_max_word 和 ik_smart 是基于 ik 这个 analyzer 定义的,并各自明确设置了 use_smart 的不同值。 - -3、其实,ik_max_word 等同于 ik。ik_max_word 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合;而 ik_smart 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。 
- -因此,建议,在设置 mapping 时,用 ik 这个 analyzer,以尽可能地被搜索条件匹配到。 - -不过,如果你想将 /index_name/_analyze 这个 RESTful API 做为分词器用,用来提取某段文字中的主题词,则建议使用 ik_smart 这个 analyzer: - -``` -POST /hailiang/_analyze?analyzer=ik_smart HTTP/1.1 -Host: localhost:9200 -Cache-Control: no-cache - -中华人民共和国国歌 -``` - -返回值: - -```json -{ - "tokens" : [ { - "token" : "中华人民共和国", - "start_offset" : 0, - "end_offset" : 7, - "type" : "CN_WORD", - "position" : 1 - }, { - "token" : "国歌", - "start_offset" : 7, - "end_offset" : 9, - "type" : "CN_WORD", - "position" : 2 - } ] -} -``` - -另外,可以在 elasticsearch.yml 里加上如下一行,设置默认的 analyzer 为 ik: - -```yaml -index.analysis.analyzer.default.type : "ik" -``` - - -### Mapping Configuration #### Quick Example -1. create a index +1.create an index ```bash curl -XPUT http://localhost:9200/index ``` -2. create a mapping +2.create a mapping ```bash curl -XPOST http://localhost:9200/index/fulltext/_mapping -d' { "fulltext": { "_all": { - "indexAnalyzer": "ik", - "searchAnalyzer": "ik", + "analyzer": "ik_max_word", + "search_analyzer": "ik_max_word", "term_vector": "no", "store": "false" }, @@ -140,8 +73,8 @@ curl -XPOST http://localhost:9200/index/fulltext/_mapping -d' "type": "string", "store": "no", "term_vector": "with_positions_offsets", - "indexAnalyzer": "ik", - "searchAnalyzer": "ik", + "analyzer": "ik_max_word", + "search_analyzer": "ik_max_word", "include_in_all": "true", "boost": 8 } @@ -150,7 +83,7 @@ curl -XPOST http://localhost:9200/index/fulltext/_mapping -d' }' ``` -3. index some docs +3.index some docs ```bash curl -XPOST http://localhost:9200/index/fulltext/1 -d' @@ -176,7 +109,7 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -d' ' ``` -4.
query with highlighting +4.query with highlighting ```bash curl -XPOST http://localhost:9200/index/fulltext/_search -d' @@ -193,7 +126,7 @@ curl -XPOST http://localhost:9200/index/fulltext/_search -d' ' ``` -#### Result +Result ```json { @@ -257,7 +190,7 @@ curl -XPOST http://localhost:9200/index/fulltext/_search -d' location - location + http://xxx.com/xxx.dic ``` diff --git a/pom.xml b/pom.xml index d16a554..10ae09d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,10 +6,21 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-ik - 1.4.1 + 1.5.0 jar IK Analyzer for ElasticSearch 2009 + + + 2.0.0 + + ${project.basedir}/src/main/assemblies/plugin.xml + org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin + true + false + true + + The Apache Software License, Version 2.0 @@ -17,6 +28,7 @@ repo + scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git @@ -30,10 +42,6 @@ 7 - - 1.7.2 - - oss.sonatype.org @@ -84,11 +92,6 @@ 4.10 test - - org.apache.lucene - lucene-core - 4.10.4 - @@ -137,9 +140,6 @@ fully.qualified.MainClass - - jar-with-dependencies - diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index eeb92f1..7e061d9 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -5,6 +5,13 @@ zip false + + + ${project.basedir}/src/main/resources/plugin-descriptor.properties + + true + + / diff --git a/src/main/config/ik.yaml b/src/main/config/ik.yaml new file mode 100644 index 0000000..e69de29 diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java index d4d9319..f9f2814 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java @@ -3,20 +3,21 @@ package org.elasticsearch.index.analysis; public class IkAnalysisBinderProcessor extends 
AnalysisModule.AnalysisBinderProcessor { - @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { } - @Override public void processAnalyzers(AnalyzersBindings analyzersBindings) { + @Override + public void processAnalyzers(AnalyzersBindings analyzersBindings) { analyzersBindings.processAnalyzer("ik", IkAnalyzerProvider.class); - super.processAnalyzers(analyzersBindings); } @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { - tokenizersBindings.processTokenizer("ik", IkTokenizerFactory.class); - super.processTokenizers(tokenizersBindings); + tokenizersBindings.processTokenizer("ik_tokenizer", IkTokenizerFactory.class); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java index 2dbee2e..dc40660 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java @@ -1,7 +1,6 @@ package org.elasticsearch.index.analysis; import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; @@ -12,12 +11,14 @@ import org.wltea.analyzer.lucene.IKAnalyzer; public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final IKAnalyzer analyzer; + private boolean useSmart=false; @Inject - public IkAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + public IkAnalyzerProvider(Index index, @IndexSettings Settings indexSettings,Environment env, String name, Settings settings) { super(index, indexSettings, name, settings); Dictionary.initial(new 
Configuration(env)); - analyzer=new IKAnalyzer(indexSettings, settings, env); + useSmart = settings.get("use_smart", "false").equals("true"); + analyzer=new IKAnalyzer(useSmart); } @Override public IKAnalyzer get() { diff --git a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java index 4b17f9e..96c157d 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java @@ -11,23 +11,21 @@ import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.lucene.IKTokenizer; -import java.io.Reader; - public class IkTokenizerFactory extends AbstractTokenizerFactory { - private Environment environment; - private Settings settings; + private final Settings settings; + private boolean useSmart=false; @Inject public IkTokenizerFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); - this.environment = env; - this.settings = settings; + this.settings=settings; Dictionary.initial(new Configuration(env)); } - @Override - public Tokenizer create(Reader reader) { - return new IKTokenizer(reader, settings, environment); - } + @Override + public Tokenizer create() { + this.useSmart = settings.get("use_smart", "false").equals("true"); + + return new IKTokenizer(useSmart); } } diff --git a/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java new file mode 100644 index 0000000..3223161 --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java @@ -0,0 +1,78 @@ +package org.elasticsearch.indices.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import 
org.elasticsearch.common.component.AbstractComponent; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; +import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.wltea.analyzer.lucene.IKAnalyzer; +import org.wltea.analyzer.lucene.IKTokenizer; + +/** + * Registers indices level analysis components so, if not explicitly configured, + * will be shared among all indices. + */ +public class IKIndicesAnalysis extends AbstractComponent { + + private boolean useSmart=false; + + @Inject + public IKIndicesAnalysis(final Settings settings, + IndicesAnalysisService indicesAnalysisService) { + super(settings); + this.useSmart = settings.get("use_smart", "false").equals("true"); + indicesAnalysisService.analyzerProviderFactories().put("ik", + new PreBuiltAnalyzerProviderFactory("ik", AnalyzerScope.INDICES, + new IKAnalyzer(useSmart))); + + indicesAnalysisService.analyzerProviderFactories().put("ik_smart", + new PreBuiltAnalyzerProviderFactory("ik_smart", AnalyzerScope.INDICES, + new IKAnalyzer(true))); + + indicesAnalysisService.analyzerProviderFactories().put("ik_max_word", + new PreBuiltAnalyzerProviderFactory("ik_max_word", AnalyzerScope.INDICES, + new IKAnalyzer(false))); + + indicesAnalysisService.tokenizerFactories().put("ik", + new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override + public String name() { + return "ik"; + } + + @Override + public Tokenizer create() { + return new IKTokenizer(false); + } + })); + + indicesAnalysisService.tokenizerFactories().put("ik_smart", + new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override + public String name() { + return "ik_smart"; + } + + @Override + public Tokenizer create() { + return new IKTokenizer(true); + } + })); + + 
indicesAnalysisService.tokenizerFactories().put("ik_max_word", + new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override + public String name() { + return "ik_max_word"; + } + + @Override + public Tokenizer create() { + return new IKTokenizer(false); + } + })); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysisModule.java b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysisModule.java new file mode 100644 index 0000000..27bd543 --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysisModule.java @@ -0,0 +1,32 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.indices.analysis; + +import org.elasticsearch.common.inject.AbstractModule; + +/** + */ +public class IKIndicesAnalysisModule extends AbstractModule { + + @Override + protected void configure() { + bind(IKIndicesAnalysis.class).asEagerSingleton(); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java index d9c82e5..1529c4b 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java @@ -1,12 +1,28 @@ package org.elasticsearch.plugin.analysis.ik; +import org.elasticsearch.common.inject.AbstractModule; import org.elasticsearch.common.inject.Module; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.analysis.AnalysisModule; import org.elasticsearch.index.analysis.IkAnalysisBinderProcessor; -import org.elasticsearch.plugins.AbstractPlugin; +import org.elasticsearch.indices.analysis.IKIndicesAnalysisModule; +import org.elasticsearch.plugins.Plugin; + +import java.util.Collection; +import java.util.Collections; +import java.util.logging.Logger; + +import static java.rmi.Naming.bind; -public class AnalysisIkPlugin extends AbstractPlugin { +public class AnalysisIkPlugin extends Plugin { + private final Settings settings; + + public AnalysisIkPlugin(Settings settings){ + this.settings = settings; + } @Override public String name() { return "analysis-ik"; @@ -17,11 +33,18 @@ public class AnalysisIkPlugin extends AbstractPlugin { return "ik analysis"; } + @Override + public Collection nodeModules() { + return Collections.singletonList(new IKIndicesAnalysisModule()); + } - @Override public void processModule(Module module) { - if (module instanceof AnalysisModule) 
{ - AnalysisModule analysisModule = (AnalysisModule) module; - analysisModule.addProcessor(new IkAnalysisBinderProcessor()); + public static class ConfiguredExampleModule extends AbstractModule { + @Override + protected void configure() { } } + + public void onModule(AnalysisModule module) { + module.addProcessor(new IkAnalysisBinderProcessor()); + } } diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index 58274cf..9b5ed03 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -3,16 +3,17 @@ */ package org.wltea.analyzer.cfg; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.env.Environment; + import java.io.*; import java.util.ArrayList; import java.util.InvalidPropertiesFormatException; import java.util.List; import java.util.Properties; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.Loggers; -import org.elasticsearch.env.Environment; - public class Configuration { private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml"; @@ -20,16 +21,18 @@ public class Configuration { private static final String REMOTE_EXT_DICT = "remote_ext_dict"; private static final String EXT_STOP = "ext_stopwords"; private static final String REMOTE_EXT_STOP = "remote_ext_stopwords"; - private static ESLogger logger = null; + private static ESLogger logger = Loggers.getLogger("ik-analyzer"); private Properties props; private Environment environment; + @Inject public Configuration(Environment env){ - logger = Loggers.getLogger("ik-analyzer"); props = new Properties(); environment = env; - File fileConfig= new File(environment.configFile(), FILE_NAME); + + File fileConfig= new File(environment.configFile().toFile(), FILE_NAME); + InputStream input = null; try { @@ -41,9 
+44,9 @@ public class Configuration { try { props.loadFromXML(input); } catch (InvalidPropertiesFormatException e) { - e.printStackTrace(); + logger.error("ik-analyzer", e); } catch (IOException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } } } @@ -123,6 +126,6 @@ public class Configuration { } public File getDictRoot() { - return environment.configFile(); + return environment.configFile().toFile(); } } diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java index 7672154..b813101 100644 --- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java @@ -41,8 +41,6 @@ public final class IKSegmenter { //字符窜reader private Reader input; - //分词器配置项 - private Configuration cfg; //分词器上下文 private AnalyzeContext context; //分词处理器列表 @@ -56,35 +54,17 @@ public final class IKSegmenter { * IK分词器构造函数 * @param input */ - public IKSegmenter(Reader input , Settings settings, Environment environment){ + public IKSegmenter(Reader input ,boolean useSmart){ this.input = input; - this.cfg = new Configuration(environment); - this.useSmart = settings.get("use_smart", "false").equals("true"); + this.useSmart = useSmart; this.init(); } - - public IKSegmenter(Reader input){ - new IKSegmenter(input, null,null); - } - -// /** -// * IK分词器构造函数 -// * @param input -// * @param cfg 使用自定义的Configuration构造分词器 -// * -// */ -// public IKSegmenter(Reader input , Configuration cfg){ -// this.input = input; -// this.cfg = cfg; -// this.init(); -// } + /** * 初始化 */ private void init(){ - //初始化词典单例 - Dictionary.initial(this.cfg); //初始化分词上下文 this.context = new AnalyzeContext(useSmart); //加载子分词器 diff --git a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java index c9c0ef9..4bfe50b 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java +++ 
b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java @@ -24,13 +24,8 @@ */ package org.wltea.analyzer.lucene; -import java.io.Reader; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.elasticsearch.common.settings.ImmutableSettings; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; /** * IK分词器,Lucene Analyzer接口实现 @@ -39,8 +34,8 @@ import org.elasticsearch.env.Environment; public final class IKAnalyzer extends Analyzer{ private boolean useSmart; - - public boolean useSmart() { + + public boolean useSmart() { return useSmart; } @@ -54,35 +49,26 @@ public final class IKAnalyzer extends Analyzer{ * 默认细粒度切分算法 */ public IKAnalyzer(){ - this(false); } - - /** + + /** * IK分词器Lucene Analyzer接口实现类 * * @param useSmart 当为true时,分词器进行智能切分 */ public IKAnalyzer(boolean useSmart){ super(); - this.useSmart = useSmart; + this.useSmart = useSmart; } - Settings settings=ImmutableSettings.EMPTY; - Environment environment=new Environment(); - - public IKAnalyzer(Settings indexSetting,Settings settings, Environment environment) { - super(); - this.settings=settings; - this.environment= environment; - } /** * 重载Analyzer接口,构造分词组件 */ @Override - protected TokenStreamComponents createComponents(String fieldName, final Reader in) { - Tokenizer _IKTokenizer = new IKTokenizer(in , settings, environment); + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer _IKTokenizer = new IKTokenizer(useSmart); return new TokenStreamComponents(_IKTokenizer); - } + } } diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java index d405251..00b86a7 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java @@ -66,14 +66,14 @@ public final class IKTokenizer extends Tokenizer { * Lucene 4.0 Tokenizer适配器类构造函数 * @param in */ - public 
IKTokenizer(Reader in , Settings settings, Environment environment){ - super(in); + public IKTokenizer(boolean useSmart){ + super(); offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(CharTermAttribute.class); typeAtt = addAttribute(TypeAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); - _IKImplement = new IKSegmenter(input , settings, environment); + _IKImplement = new IKSegmenter(input,useSmart); } /* (non-Javadoc) @@ -95,7 +95,6 @@ public final class IKTokenizer extends Tokenizer { //设置词元长度 termAtt.setLength(nextLexeme.getLength()); //设置词元位移 -// offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition())); //记录分词的最后位置 diff --git a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java index 1c62720..98f5e3b 100644 --- a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java +++ b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java @@ -71,7 +71,7 @@ public class SWMCQueryBuilder { private static List doAnalyze(String keywords){ List lexemes = new ArrayList(); - IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords)); + IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords),true); try{ Lexeme l = null; while( (l = ikSeg.next()) != null){ @@ -125,7 +125,7 @@ public class SWMCQueryBuilder { } //借助lucene queryparser 生成SWMC Query - QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40)); + QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer()); qp.setDefaultOperator(QueryParser.AND_OPERATOR); qp.setAutoGeneratePhraseQueries(true); diff --git a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java index 32a998d..93f32c9 100644 --- 
a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java +++ b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java @@ -86,7 +86,7 @@ public class LuceneIndexAndSearchDemo { directory = new RAMDirectory(); //配置IndexWriterConfig - IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer); + IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer); iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); iwriter = new IndexWriter(directory , iwConfig); //写入索引 @@ -104,7 +104,7 @@ public class LuceneIndexAndSearchDemo { String keyword = "中文分词工具包"; //使用QueryParser查询分析器构造Query对象 - QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); + QueryParser qp = new QueryParser(fieldName, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(keyword); System.out.println("Query = " + query); diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties deleted file mode 100644 index edd0d41..0000000 --- a/src/main/resources/es-plugin.properties +++ /dev/null @@ -1,2 +0,0 @@ -plugin=org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin -version=${project.version} \ No newline at end of file diff --git a/src/main/resources/plugin-descriptor.properties b/src/main/resources/plugin-descriptor.properties new file mode 100644 index 0000000..d722eb4 --- /dev/null +++ b/src/main/resources/plugin-descriptor.properties @@ -0,0 +1,80 @@ +# Elasticsearch plugin descriptor file +# This file must exist as 'plugin-descriptor.properties' at +# the root directory of all plugins. +# +# A plugin can be 'site', 'jvm', or both. 
+# +### example site plugin for "foo": +# +# foo.zip <-- zip file for the plugin, with this structure: +# _site/ <-- the contents that will be served +# plugin-descriptor.properties <-- example contents below: +# +# site=true +# description=My cool plugin +# version=1.0 +# +### example jvm plugin for "foo" +# +# foo.zip <-- zip file for the plugin, with this structure: +# .jar <-- classes, resources, dependencies +# .jar <-- any number of jars +# plugin-descriptor.properties <-- example contents below: +# +# jvm=true +# classname=foo.bar.BazPlugin +# description=My cool plugin +# version=2.0.0-rc1 +# elasticsearch.version=2.0 +# java.version=1.7 +# +### mandatory elements for all plugins: +# +# 'description': simple summary of the plugin +description=${project.description} +# +# 'version': plugin's version +version=${project.version} +# +# 'name': the plugin name +name=${elasticsearch.plugin.name} + +### mandatory elements for site plugins: +# +# 'site': set to true to indicate contents of the _site/ +# directory in the root of the plugin should be served. +site=${elasticsearch.plugin.site} +# +### mandatory elements for jvm plugins : +# +# 'jvm': true if the 'classname' class should be loaded +# from jar files in the root directory of the plugin. +# Note that only jar files in the root directory are +# added to the classpath for the plugin! If you need +# other resources, package them into a resources jar. +jvm=${elasticsearch.plugin.jvm} +# +# 'classname': the name of the class to load, fully-qualified. 
+classname=${elasticsearch.plugin.classname} +# +# 'java.version' version of java the code is built against +# use the system property java.specification.version +# version string must be a sequence of nonnegative decimal integers +# separated by "."'s and may have leading zeros +java.version=${maven.compiler.target} +# +# 'elasticsearch.version' version of elasticsearch compiled against +# You will have to release a new version of the plugin for each new +# elasticsearch release. This version is checked when the plugin +# is loaded so Elasticsearch will refuse to start in the presence of +# plugins with the incorrect elasticsearch.version. +elasticsearch.version=${elasticsearch.version} +# +### deprecated elements for jvm plugins : +# +# 'isolated': true if the plugin should have its own classloader. +# passing false is deprecated, and only intended to support plugins +# that have hard dependencies against each other. If this is +# not specified, then the plugin is isolated by default. +isolated=${elasticsearch.plugin.isolated} +# \ No newline at end of file