fix offset problem when creating a custom analyzer combined with the html_strip filter, @xpandan
parent 44b6371542
commit 1f80b50a9f
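Background for the fix, as a hedged sketch (not part of the commit): when an html_strip char filter sits in front of the tokenizer, the tokenizer reads the stripped text, but the offsets it reports must point back into the original markup, which is what correctOffset() provides. The class below is a hypothetical Lucene 4.9 analyzer; the name HtmlStripAnalyzer and the use of WhitespaceTokenizer in place of IKTokenizer are illustrative only, and it assumes lucene-core plus lucene-analyzers-common 4.9 on the classpath.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical analyzer: the plain-Lucene equivalent of a custom analyzer
// that puts an html_strip char filter in front of the tokenizer.
public class HtmlStripAnalyzer extends Analyzer {

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        // Strip HTML before tokenization; the CharFilter records an offset map
        // from the stripped text back to the original input.
        return new HTMLStripCharFilter(reader);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // WhitespaceTokenizer stands in for IKTokenizer in this sketch.
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_4_9, reader);
        return new TokenStreamComponents(source);
    }
}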
pom.xml (5 additions)
@@ -78,6 +78,11 @@
 			<version>4.10</version>
 			<scope>test</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-core</artifactId>
+			<version>4.9.0</version>
+		</dependency>
 	</dependencies>
 
 	<build>
IKTokenizer.java
@@ -37,6 +37,7 @@ import org.wltea.analyzer.core.Lexeme;
 
 import java.io.IOException;
 import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 /**
  * IK segmenter adapter for the Lucene Tokenizer
@@ -55,8 +56,13 @@ public final class IKTokenizer extends Tokenizer {
 	private final TypeAttribute typeAtt;
 	//end position of the last lexeme
 	private int endPosition;
 
+	private int skippedPositions;
+
+	private PositionIncrementAttribute posIncrAtt;
+
+
 	/**
 	 * Constructor for the Lucene 4.0 Tokenizer adapter
 	 * @param in
 	 */
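The two new fields mirror the position bookkeeping used by Lucene's bundled tokenizers: skippedPositions counts positions consumed but not emitted as tokens, and posIncrAtt publishes that gap to consumers. As far as this diff shows, skippedPositions is only reset and added to the increment, never incremented, so the reported increment stays 1; the field mainly aligns IKTokenizer with the standard pattern. A minimal, hypothetical consumer (the PositionDump name and dumpPositions method are illustrative) showing how position increments are interpreted downstream:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionDump {

    // Each emitted token advances the position by its increment, so an
    // increment greater than 1 leaves holes for tokens the tokenizer skipped.
    static void dumpPositions(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        int position = -1;
        ts.reset();
        while (ts.incrementToken()) {
            position += posIncr.getPositionIncrement();
            System.out.println(position + "\t" + term);
        }
        ts.end();
        ts.close();
    }
}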
@@ -65,8 +71,9 @@ public final class IKTokenizer extends Tokenizer {
 		offsetAtt = addAttribute(OffsetAttribute.class);
 		termAtt = addAttribute(CharTermAttribute.class);
 		typeAtt = addAttribute(TypeAttribute.class);
+		posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
 		_IKImplement = new IKSegmenter(input , settings, environment);
 	}
 
 	/* (non-Javadoc)
@@ -76,16 +83,22 @@ public final class IKTokenizer extends Tokenizer {
 	public boolean incrementToken() throws IOException {
 		//clear all token attributes
 		clearAttributes();
+		skippedPositions = 0;
+
 		Lexeme nextLexeme = _IKImplement.next();
 		if(nextLexeme != null){
+			posIncrAtt.setPositionIncrement(skippedPositions + 1);
+
 			//convert the Lexeme into Attributes
 			//set the token text
 			termAtt.append(nextLexeme.getLexemeText());
 			//set the token length
 			termAtt.setLength(nextLexeme.getLength());
 			//set the token offsets
-			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+//			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+			offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
+
 			//record the end position of the last token
 			endPosition = nextLexeme.getEndPosition();
 			//record the lexeme type
 			typeAtt.setType(nextLexeme.getLexemeTypeString());
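This is the heart of the fix: the offsets produced by IKSegmenter are positions in the text the tokenizer actually read, which is the post-html_strip text when a char filter is installed. correctOffset() delegates to the upstream CharFilter and maps them back into the original input. A standalone sketch of that mapping using only HTMLStripCharFilter (the OffsetCorrectionDemo name and literal string are illustrative, not from the plugin):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class OffsetCorrectionDemo {

    public static void main(String[] args) throws IOException {
        String html = "<b>hello</b> world";
        CharFilter stripped = new HTMLStripCharFilter(new StringReader(html));

        // Read everything the tokenizer would see (tags removed).
        StringBuilder visible = new StringBuilder();
        int c;
        while ((c = stripped.read()) != -1) {
            visible.append((char) c);
        }

        // An offset computed on the stripped text must be mapped back to the
        // original markup before it is stored; Tokenizer.correctOffset() performs
        // exactly this delegation when its input is a CharFilter.
        int strippedStart = visible.indexOf("hello");
        int originalStart = stripped.correctOffset(strippedStart);
        // originalStart points into the ORIGINAL markup, past the stripped "<b>" tag.
        System.out.println(strippedStart + " -> " + originalStart);
        stripped.close();
    }
}

Without this correction, a highlighter working against the original stored field would mark the wrong characters whenever markup precedes a token, which is the symptom the commit message describes.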
@@ -104,12 +117,15 @@ public final class IKTokenizer extends Tokenizer {
 	public void reset() throws IOException {
 		super.reset();
 		_IKImplement.reset(input);
+		skippedPositions = 0;
 	}
 
 	@Override
-	public final void end() {
+	public final void end() throws IOException {
+		super.end();
 		// set final offset
 		int finalOffset = correctOffset(this.endPosition);
 		offsetAtt.setOffset(finalOffset, finalOffset);
+		posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 	}
 }
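reset() and end() complete the TokenStream contract the rest of the patch relies on: reset() before the first token, incrementToken() until it returns false, a single end() call that publishes the final corrected offset and accumulated position increment, then close(). Lucene's IndexWriter follows this same sequence and, for multi-valued fields, uses the offset reported by end() when computing the gap before the next value, which is why the final offset must also go through correctOffset(). A hedged consumer sketch (the TokenStreamContract class and consume method are illustrative, not an API of this plugin):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamContract {

    // Illustrative consumer showing where the changed reset()/end() methods fit.
    static void consume(TokenStream ts) throws IOException {
        OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
        ts.reset();   // for IKTokenizer this re-initializes IKSegmenter and zeroes skippedPositions
        while (ts.incrementToken()) {
            // startOffset()/endOffset() now point into the ORIGINAL input,
            // because incrementToken() routes them through correctOffset().
            System.out.println(offsets.startOffset() + "-" + offsets.endOffset());
        }
        ts.end();     // the final offset published here is corrected as well
        System.out.println("final offset: " + offsets.endOffset());
        ts.close();
    }
}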