diff --git a/pom.xml b/pom.xml
index e5982b1..cf454ea 100644
--- a/pom.xml
+++ b/pom.xml
@@ -78,6 +78,11 @@
 			<version>4.10</version>
 			<scope>test</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-core</artifactId>
+			<version>4.9.0</version>
+		</dependency>
 	</dependencies>
 
 </project>
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
index bffd984..d405251 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
@@ -37,6 +37,7 @@ import org.wltea.analyzer.core.Lexeme;
 
 import java.io.IOException;
 import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 /**
  * Lucene Tokenizer adapter class for the IK segmenter
@@ -55,8 +56,13 @@ public final class IKTokenizer extends Tokenizer {
 	private final TypeAttribute typeAtt;
 	//end position of the last lexeme
 	private int endPosition;
-	
-	/**
+
+	private int skippedPositions;
+
+	private PositionIncrementAttribute posIncrAtt;
+
+
+	/**
 	 * Lucene 4.0 Tokenizer adapter class constructor
 	 * @param in
 	 */
@@ -65,8 +71,9 @@ public final class IKTokenizer extends Tokenizer {
 		offsetAtt = addAttribute(OffsetAttribute.class);
 		termAtt = addAttribute(CharTermAttribute.class);
 		typeAtt = addAttribute(TypeAttribute.class);
+		posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
-		_IKImplement = new IKSegmenter(input , settings, environment);
+		_IKImplement = new IKSegmenter(input , settings, environment);
 	}
 
 	/* (non-Javadoc)
@@ -76,16 +83,22 @@
 	public boolean incrementToken() throws IOException {
 		//clear all token attributes
 		clearAttributes();
-		Lexeme nextLexeme = _IKImplement.next();
+		skippedPositions = 0;
+
+		Lexeme nextLexeme = _IKImplement.next();
 		if(nextLexeme != null){
+			posIncrAtt.setPositionIncrement(skippedPositions +1 );
+
 			//convert the Lexeme into Attributes
 			//set the token text
 			termAtt.append(nextLexeme.getLexemeText());
 			//set the token length
 			termAtt.setLength(nextLexeme.getLength());
 			//set the token offsets
-			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
-			//record the end position of the last lexeme
+//			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+			offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
+
+			//record the end position of the last lexeme
 			endPosition = nextLexeme.getEndPosition();
 			//record the token type
 			typeAtt.setType(nextLexeme.getLexemeTypeString());
@@ -104,12 +117,15 @@
 	public void reset() throws IOException {
 		super.reset();
 		_IKImplement.reset(input);
+		skippedPositions = 0;
 	}
 
 	@Override
-	public final void end() {
+	public final void end() throws IOException {
+		super.end();
 		// set final offset
 		int finalOffset = correctOffset(this.endPosition);
 		offsetAtt.setOffset(finalOffset, finalOffset);
+		posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 	}
 }
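
Not part of the patch, but for context: a minimal consumer sketch of the standard Lucene TokenStream contract that this change brings IKTokenizer in line with, namely reset() before the first incrementToken() and end() after the last one (end() now calls super.end() and folds skippedPositions into the final position increment). Construction of the IKTokenizer itself is omitted here because the Settings/Environment wiring is plugin-specific; any TokenStream can be passed in.

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class TokenStreamDump {
        // Walk a TokenStream and print each term with its offsets and position increment.
        static void dump(TokenStream ts) throws IOException {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.printf("%s [%d,%d] +%d%n",
                        term, offset.startOffset(), offset.endOffset(),
                        posIncr.getPositionIncrement());
            }
            ts.end();                     // sets the final offset; relies on the super.end() call added above
            ts.close();
        }
    }

The switch to correctOffset() matters once a CharFilter (for example, HTML stripping) is chained in front of the tokenizer: raw lexeme offsets point into the filtered text, while correctOffset() maps them back to positions in the original input, which is what highlighting depends on.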