diff --git a/pom.xml b/pom.xml index 1be1ae0..1a0bbe0 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-ik - 1.2.4 + 1.2.5 jar IK Analyzer for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 0.90.6 + 0.90.2 diff --git a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java index 5867ff5..d72eaa7 100644 --- a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java @@ -25,12 +25,12 @@ */ package org.wltea.analyzer.core; -import java.util.LinkedList; -import java.util.List; - import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.dic.Hit; +import java.util.LinkedList; +import java.util.List; + /** * 中文-日韩文子分词器 @@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter { //处理词段队列 Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = Dictionary.getSingleton().matchWithHit(String.valueOf(context.getSegmentBuff()).toLowerCase().toCharArray(), context.getCursor() , hit); if(hit.isMatch()){ //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); @@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter { //********************************* //再对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); + Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(String.valueOf(context.getSegmentBuff()).toLowerCase().toCharArray(), context.getCursor(), 1); if(singleCharHit.isMatch()){//首字成词 //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java index 4275923..9a16eea 100644 --- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java @@ -23,16 +23,16 @@ */ package org.wltea.analyzer.core; -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; - import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.dic.Dictionary; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + /** * IK分词器主类 * diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java index 7e2f420..26d1993 100644 --- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -114,8 +114,8 @@ class DictSegment implements Comparable{ } //设置hit的当前处理位置 searchHit.setEnd(begin); - - Character keyChar = new Character(charArray[begin]); + + Character keyChar = new Character(charArray[begin]); DictSegment ds = null; //引用实例变量为本地变量,避免查询时遇到更新的同步问题 diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 206ffad..449ee3c 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -152,7 +152,7 @@ public class Dictionary { * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray , int begin, int length){ - return singleton._MainDict.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length); + return singleton._MainDict.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length); } /** @@ -160,7 +160,7 @@ public class Dictionary { * @return Hit 匹配结果描述 */ public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ - return singleton._QuantifierDict.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length); + return singleton._QuantifierDict.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length); } @@ -179,7 +179,7 @@ public class Dictionary { * @return boolean */ public boolean isStopWord(char[] charArray , int begin, int length){ - return singleton._StopWords.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length).isMatch(); + return singleton._StopWords.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length).isMatch(); } /** diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java index 70b382b..5083e6a 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java @@ -26,9 +26,6 @@ */ package org.wltea.analyzer.lucene; -import java.io.IOException; -import java.io.Reader; - import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -38,6 +35,9 @@ import org.elasticsearch.env.Environment; import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; +import java.io.IOException; +import java.io.Reader; + /** * IK分词器 Lucene Tokenizer适配器类 * 兼容Lucene 4.0版本 @@ -80,7 +80,7 @@ public final class IKTokenizer extends Tokenizer { if(nextLexeme != null){ //将Lexeme转成Attributes //设置词元文本 - termAtt.append(nextLexeme.getLexemeText()); + termAtt.append(nextLexeme.getLexemeText().toLowerCase()); //设置词元长度 termAtt.setLength(nextLexeme.getLength()); //设置词元位移 diff --git a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java index 1b86a35..679ec12 100644 --- a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java +++ b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java @@ -24,20 +24,16 @@ */ package org.wltea.analyzer.query; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.util.BytesRef; + import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Stack; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TermRangeQuery; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.util.BytesRef; - /** * IK简易查询表达式解析 * 结合SWMCQuery算法 @@ -66,7 +62,7 @@ public class IKQueryExpressionParser { */ public Query parseExp(String expression , boolean quickMode){ Query lucenceQuery = null; - if(expression != null && !"".equals(expression.trim())){ + if(expression != null && !"".equals(expression)){ try{ //文法解析 this.splitElements(expression);