Bug fix: a leading blank character caused an ArrayIndexOutOfBoundsException

medcl, 2013-12-13 16:39:30 +08:00
parent 35700686c9
commit e2fb31a55e
7 changed files with 27 additions and 31 deletions
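
The root cause sits in Dictionary's match helpers, which normalized the caller's buffer with String.valueOf(charArray).trim().toLowerCase().toCharArray(). trim() drops leading and trailing blanks, so the normalized array can be shorter than the original buffer, while begin and the cursor still point at positions in the untrimmed buffer; an input with a leading blank then indexes past the end of the trimmed array. The commit removes trim() everywhere and keeps only toLowerCase(), which leaves positions intact. A minimal, hypothetical sketch (not plugin code; the class and method names are invented) that reproduces the crash:

public class TrimOffsetBug {

    // Mimics the old Dictionary code path: normalize the buffer with trim(),
    // then index it with a cursor computed against the ORIGINAL buffer.
    static char oldNormalize(char[] buffer, int cursor) {
        char[] normalized = String.valueOf(buffer).trim().toLowerCase().toCharArray();
        return normalized[cursor]; // cursor is valid for buffer, not for normalized
    }

    public static void main(String[] args) {
        char[] buffer = " 中文分词".toCharArray(); // leading blank, length 5
        int cursor = buffer.length - 1;            // 4: the last valid index of buffer
        try {
            oldNormalize(buffer, cursor);          // trim() shrank the array to 4 chars
        } catch (ArrayIndexOutOfBoundsException e) {
            System.out.println("old code path: " + e);
        }
        // The fix: drop trim(). toLowerCase() keeps the original length here,
        // so every cursor that was valid for the buffer stays valid.
        char[] fixed = String.valueOf(buffer).toLowerCase().toCharArray();
        System.out.println("fixed code path: '" + fixed[cursor] + "'");
    }
}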

pom.xml

@@ -6,7 +6,7 @@
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>org.elasticsearch</groupId>
 	<artifactId>elasticsearch-analysis-ik</artifactId>
-	<version>1.2.4</version>
+	<version>1.2.5</version>
 	<packaging>jar</packaging>
 	<description>IK Analyzer for ElasticSearch</description>
 	<inceptionYear>2009</inceptionYear>
@@ -31,7 +31,7 @@
 	</parent>
 	<properties>
-		<elasticsearch.version>0.90.6</elasticsearch.version>
+		<elasticsearch.version>0.90.2</elasticsearch.version>
 	</properties>
 	<repositories>

CJKSegmenter.java

@@ -25,12 +25,12 @@
  */
 package org.wltea.analyzer.core;
 
-import java.util.LinkedList;
-import java.util.List;
-
 import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.dic.Hit;
+
+import java.util.LinkedList;
+import java.util.List;
 
 /**
  * Sub-segmenter for Chinese and Japanese/Korean text
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
 			//process the queue of pending word segments
 			Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
 			for(Hit hit : tmpArray){
-				hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+				hit = Dictionary.getSingleton().matchWithHit(String.valueOf(context.getSegmentBuff()).toLowerCase().toCharArray(), context.getCursor() , hit);
 				if(hit.isMatch()){
 					//emit the current word
 					Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
 			//*********************************
 			//then match the character at the current cursor as a single-character word
-			Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+			Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(String.valueOf(context.getSegmentBuff()).toLowerCase().toCharArray(), context.getCursor(), 1);
 			if(singleCharHit.isMatch()){//the first character forms a word on its own
 				//emit the current word
 				Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
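
With trim() removed from Dictionary, the segmenter now lowercases the whole segment buffer at each call site. The point of this trade-off: lowercasing maps the buffer character for character (for the inputs handled here), so context.getCursor() remains a valid index into the copy, whereas trimming shifts and shortens it. A small standalone check (hypothetical, not plugin code):

public class BufferNormalization {
    public static void main(String[] args) {
        char[] segmentBuff = " A中文B ".toCharArray();
        System.out.println(segmentBuff.length);                                 // 6
        System.out.println(String.valueOf(segmentBuff).toLowerCase().length()); // 6: indices still line up
        System.out.println(String.valueOf(segmentBuff).trim().length());        // 4: indices 4 and 5 now overflow
    }
}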

IKSegmenter.java

@@ -23,16 +23,16 @@
  */
 package org.wltea.analyzer.core;
 
-import java.io.IOException;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * Main class of the IK segmenter
  *

Dictionary.java

@@ -152,7 +152,7 @@ public class Dictionary {
 	 * @return Hit description of the match result
 	 */
 	public Hit matchInMainDict(char[] charArray , int begin, int length){
-		return singleton._MainDict.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length);
+		return singleton._MainDict.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length);
 	}
 
 	/**
@@ -160,7 +160,7 @@ public class Dictionary {
 	 * @return Hit description of the match result
 	 */
 	public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
-		return singleton._QuantifierDict.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length);
+		return singleton._QuantifierDict.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length);
 	}
@@ -179,7 +179,7 @@ public class Dictionary {
 	 * @return boolean
 	 */
 	public boolean isStopWord(char[] charArray , int begin, int length){
-		return singleton._StopWords.match(String.valueOf(charArray).trim().toLowerCase().toCharArray(), begin, length).isMatch();
+		return singleton._StopWords.match(String.valueOf(charArray).toLowerCase().toCharArray(), begin, length).isMatch();
 	}
 
 	/**
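The dictionary keeps toLowerCase() so lookups stay case-insensitive, but the length-changing trim() is gone, restoring the invariant that the normalized array is exactly as long as the caller's buffer. Strictly speaking, String.toLowerCase() can also change length for a few exotic code points (e.g. U+0130 lowercases to two chars in some locales), so a defensive variant might check for that too. A hypothetical sketch, not part of the plugin:

public class SafeNormalize {
    // Returns a lowercased copy only when it preserves length, so that
    // begin/length offsets computed on the original buffer stay valid.
    static char[] normalizeForMatch(char[] charArray) {
        char[] normalized = String.valueOf(charArray).toLowerCase().toCharArray();
        return normalized.length == charArray.length ? normalized : charArray;
    }

    public static void main(String[] args) {
        System.out.println(normalizeForMatch(" ABC ".toCharArray()).length); // 5, blanks kept
    }
}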

IKTokenizer.java

@@ -26,9 +26,6 @@
  */
 package org.wltea.analyzer.lucene;
 
-import java.io.IOException;
-import java.io.Reader;
-
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -38,6 +35,9 @@ import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.core.IKSegmenter;
 import org.wltea.analyzer.core.Lexeme;
 
+import java.io.IOException;
+import java.io.Reader;
+
 /**
  * Lucene Tokenizer adapter for the IK segmenter,
  * compatible with Lucene 4.0
@@ -80,7 +80,7 @@ public final class IKTokenizer extends Tokenizer {
 			if(nextLexeme != null){
 				//convert the Lexeme into Attributes
 				//set the term text
-				termAtt.append(nextLexeme.getLexemeText());
+				termAtt.append(nextLexeme.getLexemeText().toLowerCase());
 				//set the term length
 				termAtt.setLength(nextLexeme.getLength());
 				//set the term offsets
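
Lowercasing the emitted term text keeps indexed terms consistent with the lowercased dictionary matching above. Note the ordering in the adapter: append() copies the text into the CharTermAttribute, then setLength() pins the term length to the lexeme's length, which assumes lowercasing did not change that length. A minimal consumption sketch for any Lucene 4.x TokenStream, such as an IKTokenizer instance (its constructor wiring through Settings/Environment is omitted here, as it depends on the plugin setup):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class TermDump {
    static List<String> termsOf(TokenStream stream) throws IOException {
        List<String> terms = new ArrayList<String>();
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString()); // after this commit: already lowercased
        }
        stream.end();
        stream.close();
        return terms;
    }
}
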

IKQueryExpressionParser.java

@@ -24,20 +24,16 @@
  */
 package org.wltea.analyzer.query;
 
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.util.BytesRef;
+
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Stack;
-
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.search.BooleanClause.Occur;
-import org.apache.lucene.util.BytesRef;
 
 /**
  * Simple IK query expression parser,
  * combined with the SWMCQuery algorithm
@@ -66,7 +62,7 @@ public class IKQueryExpressionParser {
 	 */
 	public Query parseExp(String expression , boolean quickMode){
 		Query lucenceQuery = null;
-		if(expression != null && !"".equals(expression.trim())){
+		if(expression != null && !"".equals(expression)){
 			try{
 				//grammar parsing
 				this.splitElements(expression);
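
The guard change is subtle: the old check rejected whitespace-only expressions via trim(), while the new one rejects only the exact empty string, in line with the commit's theme of treating blanks as significant rather than silently stripping them. A tiny hypothetical demonstration of the behavioral difference:

public class GuardCheck {
    public static void main(String[] args) {
        String expression = "  ";
        System.out.println(expression != null && !"".equals(expression.trim())); // false: old guard rejected it
        System.out.println(expression != null && !"".equals(expression));        // true: new guard lets it parse
    }
}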