Compare commits

...

1 Commits

Author SHA1 Message Date
05ebbed97c 8.8.1 2023-06-25 19:19:59 +08:00
2 changed files with 134 additions and 131 deletions

175
pom.xml
View File

@ -1,7 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<name>elasticsearch-analysis-ik</name>
<modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch</groupId>
@ -12,8 +11,8 @@
<inceptionYear>2011</inceptionYear>
<properties>
<elasticsearch.version>8.4.1</elasticsearch.version>
<maven.compiler.target>1.8</maven.compiler.target>
<elasticsearch.version>8.8.1</elasticsearch.version>
<maven.compiler.target>17</maven.compiler.target>
<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
<elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin</elasticsearch.plugin.classname>
@ -69,8 +68,12 @@
<repository>
<id>oss.sonatype.org</id>
<name>OSS Sonatype</name>
<releases><enabled>true</enabled></releases>
<snapshots><enabled>true</enabled></snapshots>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
<url>https://oss.sonatype.org/content/repositories/releases/</url>
</repository>
</repositories>
@ -87,13 +90,13 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
<version>4.5.14</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.18.0</version>
<version>2.19.0</version>
</dependency>
<dependency>
@ -187,83 +190,83 @@
<additionalparam>-Xdoclint:none</additionalparam>
</properties>
</profile>
<profile>
<id>release</id>
<build>
<plugins>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.3</version>
<extensions>true</extensions>
<configuration>
<serverId>oss</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>${maven.compiler.target}</source>
<target>${maven.compiler.target}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>release</id>
<build>
<plugins>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.3</version>
<extensions>true</extensions>
<configuration>
<serverId>oss</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>${maven.compiler.target}</source>
<target>${maven.compiler.target}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@ -30,14 +30,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
@ -45,86 +42,89 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* 兼容Lucene 4.0版本
*/
public final class IKTokenizer extends Tokenizer {
//IK分词器实现
// IK分词器实现
private IKSegmenter _IKImplement;
//词元文本属性
// 词元文本属性
private final CharTermAttribute termAtt;
//词元位移属性
// 词元位移属性
private final OffsetAttribute offsetAtt;
//词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
// 词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
private final TypeAttribute typeAtt;
//记录最后一个词元的结束位置
// 记录最后一个词元的结束位置
private int endPosition;
private int skippedPositions;
private int skippedPositions;
private PositionIncrementAttribute posIncrAtt;
private PositionIncrementAttribute posIncrAtt;
/**
/**
* Lucene 4.0 Tokenizer适配器类构造函数
*/
public IKTokenizer(Configuration configuration){
super();
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
*/
public IKTokenizer(Configuration configuration) {
super();
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
_IKImplement = new IKSegmenter(input,configuration);
_IKImplement = new IKSegmenter(input, configuration);
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//清除所有的词元属性
// 清除所有的词元属性
clearAttributes();
skippedPositions = 0;
skippedPositions = 0;
Lexeme nextLexeme = _IKImplement.next();
if(nextLexeme != null){
posIncrAtt.setPositionIncrement(skippedPositions +1 );
Lexeme nextLexeme = _IKImplement.next();
if (nextLexeme != null) {
posIncrAtt.setPositionIncrement(skippedPositions + 1);
//将Lexeme转成Attributes
//设置词元文本
// 将Lexeme转成Attributes
// 设置词元文本
termAtt.append(nextLexeme.getLexemeText());
//设置词元长度
// 设置词元长度
termAtt.setLength(nextLexeme.getLength());
//设置词元位移
offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
// 设置词元位移
offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()),
correctOffset(nextLexeme.getEndPosition()));
//记录分词的最后位置
// 记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
//记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
//返会true告知还有下个词元
// 记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
// 返会true告知还有下个词元
return true;
}
//返会false告知词元输出完毕
// 返会false告知词元输出完毕
return false;
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
skippedPositions = 0;
}
skippedPositions = 0;
}
@Override
public final void end() throws IOException {
super.end();
// set final offset
super.end();
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
}