diff --git a/pom.xml b/pom.xml
index e5982b1..cf454ea 100644
--- a/pom.xml
+++ b/pom.xml
@@ -78,6 +78,12 @@
 			<version>4.10</version>
 			<scope>test</scope>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-core</artifactId>
+			<version>4.9.0</version>
+		</dependency>
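+		<!-- required at compile time by the PositionIncrementAttribute and TokenStream-contract changes in IKTokenizer below -->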
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
index bffd984..d405251 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
@@ -37,6 +37,7 @@ import org.wltea.analyzer.core.Lexeme;
 import java.io.IOException;
 import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 /**
  * IK Analyzer's Lucene Tokenizer adapter class
@@ -55,8 +56,14 @@ public final class IKTokenizer extends Tokenizer {
 	private final TypeAttribute typeAtt;
 	//records the end position of the last lexeme
 	private int endPosition;
-
-	/**
+
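+	// positions skipped before the current token; always 0 for IK, kept for parity with Lucene's standard tokenizers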
+	private int skippedPositions;
+
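+	// reports each token's position increment, which phrase and span queries rely on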
+	private PositionIncrementAttribute posIncrAtt;
+
+	/**
 	 * Constructor for the Lucene 4.0 Tokenizer adapter class
 	 * @param in
 	 */
@@ -65,8 +71,10 @@ public final class IKTokenizer extends Tokenizer {
 		offsetAtt = addAttribute(OffsetAttribute.class);
 		termAtt = addAttribute(CharTermAttribute.class);
 		typeAtt = addAttribute(TypeAttribute.class);
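+		// register the position increment attribute along with the existing offset/term/type attributes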
+		posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 		_IKImplement = new IKSegmenter(input, settings, environment);
 	}
 
 	/* (non-Javadoc)
@@ -76,16 +83,23 @@ public final class IKTokenizer extends Tokenizer {
 	public boolean incrementToken() throws IOException {
 		//clear all the token attributes
 		clearAttributes();
-		Lexeme nextLexeme = _IKImplement.next();
+		skippedPositions = 0;
+
+		Lexeme nextLexeme = _IKImplement.next();
 		if(nextLexeme != null){
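+			// one position past the previous token; skippedPositions is reset above and never incremented here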
+			posIncrAtt.setPositionIncrement(skippedPositions + 1);
+
 			//convert the Lexeme into Attributes
 			//set the token text
 			termAtt.append(nextLexeme.getLexemeText());
 			//set the token length
 			termAtt.setLength(nextLexeme.getLength());
 			//set the token offsets
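+			// correctOffset() maps offsets back to the original input in case a CharFilter has rewritten it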
-			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
-			//record the final position of the segmentation
+			offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
+
+			//record the final position of the segmentation
 			endPosition = nextLexeme.getEndPosition();
 			//record the token type
 			typeAtt.setType(nextLexeme.getLexemeTypeString());
@@ -104,12 +117,18 @@ public final class IKTokenizer extends Tokenizer {
 	public void reset() throws IOException {
 		super.reset();
 		_IKImplement.reset(input);
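+		// clear the skipped-position counter for the new stream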
+		skippedPositions = 0;
 	}
 
 	@Override
-	public final void end() {
+	public final void end() throws IOException {
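+		// the TokenStream contract requires overridden end() methods to call super.end() first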
+		super.end();
 		// set final offset
 		int finalOffset = correctOffset(this.endPosition);
 		offsetAtt.setOffset(finalOffset, finalOffset);
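+		// fold trailing skipped positions into the final position increment, as Lucene's StandardTokenizer does in end()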
+		posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 	}
 }