fix offset problem when creating a custom analyzer combined with the html_strip char filter, @xpandan

medcl 2014-09-17 17:19:30 +08:00
parent 44b6371542
commit 1f80b50a9f
2 changed files with 28 additions and 7 deletions
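
Why the change is needed: Elasticsearch's html_strip char filter is backed by Lucene's HTMLStripCharFilter, which removes tags before the text reaches the tokenizer. The IK tokenizer therefore only sees the stripped text, and the raw begin/end positions of each lexeme refer to that stripped stream. Passing them through Tokenizer.correctOffset() maps them back to positions in the original input, which is what stored offsets and highlighting expect; without the mapping, offsets shift every time a tag is removed. The sketch below is not part of this commit and only illustrates the mapping: the class name OffsetDemo and the sample input are made up, and it assumes lucene-analyzers-common is on the classpath for HTMLStripCharFilter.

import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class OffsetDemo {
    public static void main(String[] args) throws Exception {
        // sample HTML; "<b>" is three characters that html_strip removes
        String html = "<b>中文</b>分词";
        HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(html));

        char[] buf = new char[64];
        int len = filter.read(buf, 0, buf.length);   // stripped text: "中文分词"
        System.out.println(new String(buf, 0, len));

        // Offset 0 of the stripped text maps back to offset 3 of the original HTML,
        // i.e. just past "<b>"; this is the mapping correctOffset() applies, and it is
        // what the patched IKTokenizer now stores in its OffsetAttribute.
        System.out.println(filter.correctOffset(0)); // expected: 3
    }
}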


@@ -78,6 +78,11 @@
             <version>4.10</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>4.9.0</version>
+        </dependency>
     </dependencies>
     <build>


@@ -37,6 +37,7 @@ import org.wltea.analyzer.core.Lexeme;
 import java.io.IOException;
 import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 /**
  * IK segmenter: adapter class for the Lucene Tokenizer
@@ -55,8 +56,13 @@ public final class IKTokenizer extends Tokenizer {
     private final TypeAttribute typeAtt;
     //end position of the last lexeme
     private int endPosition;
+    private int skippedPositions;
+    private PositionIncrementAttribute posIncrAtt;
     /**
      * Constructor of the Lucene 4.0 Tokenizer adapter class
      * @param in
      */
@@ -65,8 +71,9 @@ public final class IKTokenizer extends Tokenizer {
         offsetAtt = addAttribute(OffsetAttribute.class);
         termAtt = addAttribute(CharTermAttribute.class);
         typeAtt = addAttribute(TypeAttribute.class);
+        posIncrAtt = addAttribute(PositionIncrementAttribute.class);
         _IKImplement = new IKSegmenter(input, settings, environment);
     }
     /* (non-Javadoc)
@@ -76,16 +83,22 @@ public final class IKTokenizer extends Tokenizer {
     public boolean incrementToken() throws IOException {
         //clear all token attributes
         clearAttributes();
+        skippedPositions = 0;
         Lexeme nextLexeme = _IKImplement.next();
         if(nextLexeme != null){
+            posIncrAtt.setPositionIncrement(skippedPositions + 1);
             //copy the Lexeme into the attributes
             //set the lexeme text
             termAtt.append(nextLexeme.getLexemeText());
             //set the lexeme length
             termAtt.setLength(nextLexeme.getLength());
             //set the lexeme offsets
-            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+            //offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+            offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
             //record the end position of the last lexeme
             endPosition = nextLexeme.getEndPosition();
             //record the lexeme type
             typeAtt.setType(nextLexeme.getLexemeTypeString());
@@ -104,12 +117,15 @@ public final class IKTokenizer extends Tokenizer {
     public void reset() throws IOException {
         super.reset();
         _IKImplement.reset(input);
+        skippedPositions = 0;
     }
     @Override
-    public final void end() {
+    public final void end() throws IOException {
+        super.end();
         // set final offset
         int finalOffset = correctOffset(this.endPosition);
         offsetAtt.setOffset(finalOffset, finalOffset);
+        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
     }
 }
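
To check the behaviour end to end, one can run a tokenizer behind the same char filter and print the offsets it reports. The sketch below is illustrative only: it uses Lucene's WhitespaceTokenizer as a stand-in for IKTokenizer (whose constructor also needs Elasticsearch Settings and Environment), the class name and sample input are made up, and the Version constant assumes the Lucene 4.9/4.10 dependencies added in the pom above. With this commit, IKTokenizer reports offsets the same way, relative to the original unstripped input.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class CorrectedOffsetDemo {
    public static void main(String[] args) throws Exception {
        String html = "<p>hello <b>world</b></p>";
        // html_strip removes the tags, then the tokenizer splits the remaining text
        Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_4_9,
                new HTMLStripCharFilter(new StringReader(html)));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tok.addAttribute(OffsetAttribute.class);

        tok.reset();
        while (tok.incrementToken()) {
            // start/end offsets point into the original HTML rather than the
            // stripped text, because the tokenizer runs them through correctOffset()
            System.out.println(term.toString() + " " + offset.startOffset() + "-" + offset.endOffset());
        }
        tok.end();
        tok.close();
    }
}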