Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
05ebbed97c |
175
pom.xml
175
pom.xml
@@ -1,7 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<name>elasticsearch-analysis-ik</name>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.elasticsearch</groupId>
|
||||
@@ -12,8 +11,8 @@
|
||||
<inceptionYear>2011</inceptionYear>
|
||||
|
||||
<properties>
|
||||
<elasticsearch.version>8.4.1</elasticsearch.version>
|
||||
<maven.compiler.target>1.8</maven.compiler.target>
|
||||
<elasticsearch.version>8.8.1</elasticsearch.version>
|
||||
<maven.compiler.target>17</maven.compiler.target>
|
||||
<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
|
||||
<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
|
||||
<elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin</elasticsearch.plugin.classname>
|
||||
@@ -69,8 +68,12 @@
|
||||
<repository>
|
||||
<id>oss.sonatype.org</id>
|
||||
<name>OSS Sonatype</name>
|
||||
<releases><enabled>true</enabled></releases>
|
||||
<snapshots><enabled>true</enabled></snapshots>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
<snapshots>
|
||||
<enabled>true</enabled>
|
||||
</snapshots>
|
||||
<url>https://oss.sonatype.org/content/repositories/releases/</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
@@ -87,13 +90,13 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.2</version>
|
||||
<version>4.5.14</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-api</artifactId>
|
||||
<version>2.18.0</version>
|
||||
<version>2.19.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
@@ -187,83 +190,83 @@
|
||||
<additionalparam>-Xdoclint:none</additionalparam>
|
||||
</properties>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>release</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.3</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>oss</serverId>
|
||||
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>2.1</version>
|
||||
<configuration>
|
||||
<autoVersionSubmodules>true</autoVersionSubmodules>
|
||||
<useReleaseProfile>false</useReleaseProfile>
|
||||
<releaseProfiles>release</releaseProfiles>
|
||||
<goals>deploy</goals>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.5.1</version>
|
||||
<configuration>
|
||||
<source>${maven.compiler.target}</source>
|
||||
<target>${maven.compiler.target}</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>1.5</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>sign-artifacts</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>sign</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>release</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.3</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>oss</serverId>
|
||||
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>2.1</version>
|
||||
<configuration>
|
||||
<autoVersionSubmodules>true</autoVersionSubmodules>
|
||||
<useReleaseProfile>false</useReleaseProfile>
|
||||
<releaseProfiles>release</releaseProfiles>
|
||||
<goals>deploy</goals>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.5.1</version>
|
||||
<configuration>
|
||||
<source>${maven.compiler.target}</source>
|
||||
<target>${maven.compiler.target}</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>1.5</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>sign-artifacts</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>sign</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
@@ -30,14 +30,11 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.core.IKSegmenter;
|
||||
import org.wltea.analyzer.core.Lexeme;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
@@ -45,86 +42,89 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
* 兼容Lucene 4.0版本
|
||||
*/
|
||||
public final class IKTokenizer extends Tokenizer {
|
||||
|
||||
//IK分词器实现
|
||||
|
||||
// IK分词器实现
|
||||
private IKSegmenter _IKImplement;
|
||||
|
||||
//词元文本属性
|
||||
|
||||
// 词元文本属性
|
||||
private final CharTermAttribute termAtt;
|
||||
//词元位移属性
|
||||
// 词元位移属性
|
||||
private final OffsetAttribute offsetAtt;
|
||||
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
// 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
private final TypeAttribute typeAtt;
|
||||
//记录最后一个词元的结束位置
|
||||
// 记录最后一个词元的结束位置
|
||||
private int endPosition;
|
||||
|
||||
private int skippedPositions;
|
||||
private int skippedPositions;
|
||||
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
|
||||
|
||||
/**
|
||||
/**
|
||||
* Lucene 4.0 Tokenizer适配器类构造函数
|
||||
*/
|
||||
public IKTokenizer(Configuration configuration){
|
||||
super();
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
*/
|
||||
public IKTokenizer(Configuration configuration) {
|
||||
super();
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
_IKImplement = new IKSegmenter(input,configuration);
|
||||
_IKImplement = new IKSegmenter(input, configuration);
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
//清除所有的词元属性
|
||||
// 清除所有的词元属性
|
||||
clearAttributes();
|
||||
skippedPositions = 0;
|
||||
skippedPositions = 0;
|
||||
|
||||
Lexeme nextLexeme = _IKImplement.next();
|
||||
if(nextLexeme != null){
|
||||
posIncrAtt.setPositionIncrement(skippedPositions +1 );
|
||||
Lexeme nextLexeme = _IKImplement.next();
|
||||
if (nextLexeme != null) {
|
||||
posIncrAtt.setPositionIncrement(skippedPositions + 1);
|
||||
|
||||
//将Lexeme转成Attributes
|
||||
//设置词元文本
|
||||
// 将Lexeme转成Attributes
|
||||
// 设置词元文本
|
||||
termAtt.append(nextLexeme.getLexemeText());
|
||||
//设置词元长度
|
||||
// 设置词元长度
|
||||
termAtt.setLength(nextLexeme.getLength());
|
||||
//设置词元位移
|
||||
offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
|
||||
// 设置词元位移
|
||||
offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()),
|
||||
correctOffset(nextLexeme.getEndPosition()));
|
||||
|
||||
//记录分词的最后位置
|
||||
// 记录分词的最后位置
|
||||
endPosition = nextLexeme.getEndPosition();
|
||||
//记录词元分类
|
||||
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
||||
//返会true告知还有下个词元
|
||||
// 记录词元分类
|
||||
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
||||
// 返会true告知还有下个词元
|
||||
return true;
|
||||
}
|
||||
//返会false告知词元输出完毕
|
||||
// 返会false告知词元输出完毕
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
_IKImplement.reset(input);
|
||||
skippedPositions = 0;
|
||||
}
|
||||
|
||||
skippedPositions = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() throws IOException {
|
||||
super.end();
|
||||
// set final offset
|
||||
super.end();
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(this.endPosition);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user