Compare commits
16 Commits
9338c19104
0fb53ac32c
b637708ba0
9c47725ea0
8e36b3240e
e0157d5f39
0fccc038e2
5a1b8c8da6
1375ca6d39
4619effa15
5f53f1a5bf
904a7493ea
06e8a23d18
a1d6ba8ca2
90c9b58354
ba8bb85f31
.github/FUNDING.yml (vendored, new file, +2)
@@ -0,0 +1,2 @@
+patreon: medcl
+custom: ["https://www.buymeacoffee.com/medcl"]
README.md (33 changes)
@@ -10,16 +10,9 @@ Versions
 
 IK version | ES version
 -----------|-----------
-master | 6.x -> master
-6.3.0| 6.3.0
-6.2.4| 6.2.4
-6.1.3| 6.1.3
-5.6.8| 5.6.8
-5.5.3| 5.5.3
-5.4.3| 5.4.3
-5.3.3| 5.3.3
-5.2.2| 5.2.2
-5.1.2| 5.1.2
+master | 7.x -> master
+6.x| 6.x
+5.x| 5.x
 1.10.6 | 2.4.6
 1.9.5 | 2.3.5
 1.8.1 | 2.2.1
@@ -64,13 +57,13 @@ curl -XPUT http://localhost:9200/index
 2.create a mapping
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
 {
         "properties": {
             "content": {
                 "type": "text",
                 "analyzer": "ik_max_word",
-                "search_analyzer": "ik_max_word"
+                "search_analyzer": "ik_smart"
             }
         }
 
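For reference, the mapping request as it reads after this patch, with the JSON completed past the hunk window (a sketch assuming a local node on port 9200 and the `index` created in step 1):

```bash
curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
{
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_smart"
        }
    }
}'
```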
@@ -80,25 +73,25 @@ curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:appli
 3.index some docs
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/1 -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
 {"content":"美国留给伊拉克的是个烂摊子吗"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/2 -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
 {"content":"公安部:各地校车将享最高路权"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/3 -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
 {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
 {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
 '
 ```
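Note that `_create` is the explicit-create endpoint: re-running any of these commands with the same id returns `409 Conflict` instead of overwriting. To overwrite an existing id, plain `_doc` indexing can be used (standard Elasticsearch behavior, not part of this diff):

```bash
# re-index doc 1, overwriting it if it already exists
curl -XPUT http://localhost:9200/index/_doc/1 -H 'Content-Type:application/json' -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
```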
@@ -106,7 +99,7 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/
 4.query with highlighting
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_search -H 'Content-Type:application/json' -d'
+curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
 {
     "query" : { "match" : { "content" : "中国" }},
     "highlight" : {
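The hunk window cuts off inside the `highlight` object. A minimal complete version of the updated step 4 (a sketch; the `fields` block is a plain Elasticsearch default, not taken from this diff):

```bash
curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
{
    "query" : { "match" : { "content" : "中国" }},
    "highlight" : {
        "fields" : { "content" : {} }
    }
}'
```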
@@ -248,13 +241,13 @@ curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: applica
 4. What is the difference between ik_max_word and ik_smart?
 
 
-ik_max_word: performs the finest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌", exhausting every possible combination;
+ik_max_word: performs the finest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌", exhausting every possible combination; suitable for Term Query;
 
-ik_smart: performs the coarsest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国,国歌".
+ik_smart: performs the coarsest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国,国歌"; suitable for Phrase queries.
 
 Changes
 ------
-*5.0.0*
+*since v5.0.0*
 
 - Removed the analyzer and tokenizer named `ik`; use `ik_smart` and `ik_max_word` instead
 
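The distinction is easy to inspect with the `_analyze` API the FAQ already uses (a sketch assuming a local node; the plugin registers both analyzers globally, so no index is required):

```bash
# finest-grained: enumerates every dictionary word covering the text
curl -XGET "http://localhost:9200/_analyze" -H 'Content-Type: application/json' -d'
{"analyzer": "ik_max_word", "text": "中华人民共和国国歌"}'

# coarsest-grained: returns only 中华人民共和国 and 国歌
curl -XGET "http://localhost:9200/_analyze" -H 'Content-Type: application/json' -d'
{"analyzer": "ik_smart", "text": "中华人民共和国国歌"}'
```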
pom.xml (14 changes, Normal file → Executable file)
@@ -12,7 +12,7 @@
 	<inceptionYear>2011</inceptionYear>
 
 	<properties>
-		<elasticsearch.version>6.5.0</elasticsearch.version>
+		<elasticsearch.version>8.4.1</elasticsearch.version>
 		<maven.compiler.target>1.8</maven.compiler.target>
 		<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
 		<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
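Bumping `elasticsearch.version` to 8.4.1 means the plugin zip must be rebuilt against the 8.x API (hence the constructor and import changes further down). A sketch of the usual Maven flow; the artifact path is an assumption based on the `analysis-ik` plugin name and the assembly descriptor above:

```bash
mvn clean package -DskipTests
# install the assembled zip into an 8.4.1 node (paths illustrative)
bin/elasticsearch-plugin install file:///path/to/target/releases/elasticsearch-analysis-ik-8.4.1.zip
```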
@@ -34,10 +34,10 @@
 
 	<developers>
 		<developer>
-			<name>Medcl</name>
-			<email>medcl@elastic.co</email>
-			<organization>elastic</organization>
-			<organizationUrl>http://www.elastic.co</organizationUrl>
+			<name>INFINI Labs</name>
+			<email>hello@infini.ltd</email>
+			<organization>INFINI Labs</organization>
+			<organizationUrl>https://infinilabs.com</organizationUrl>
 		</developer>
 	</developers>
 
@@ -71,7 +71,7 @@
 			<name>OSS Sonatype</name>
 			<releases><enabled>true</enabled></releases>
 			<snapshots><enabled>true</enabled></snapshots>
-			<url>http://oss.sonatype.org/content/repositories/releases/</url>
+			<url>https://oss.sonatype.org/content/repositories/releases/</url>
 		</repository>
 	</repositories>
 
@@ -93,7 +93,7 @@
 		<dependency>
 			<groupId>org.apache.logging.log4j</groupId>
 			<artifactId>log4j-api</artifactId>
-			<version>2.3</version>
+			<version>2.18.0</version>
 		</dependency>
 
 		<dependency>
IkAnalyzerProvider.java

@@ -10,7 +10,7 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer
 	private final IKAnalyzer analyzer;
 
 	public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
-		super(indexSettings, name, settings);
+		super(name, settings);
 
 		Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
 
IkTokenizerFactory.java

@@ -11,7 +11,7 @@ public class IkTokenizerFactory extends AbstractTokenizerFactory {
 	private Configuration configuration;
 
 	public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
-		super(indexSettings, name, settings);
+		super(indexSettings, settings, name);
 		configuration=new Configuration(env,settings);
 	}
 
src/main/java/org/wltea/analyzer/cfg/Configuration.java (2 changes, Normal file → Executable file)
@@ -4,7 +4,7 @@
 package org.wltea.analyzer.cfg;
 
 import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.core.PathUtils;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
AnalyzeContext.java

@@ -268,13 +268,13 @@ class AnalyzeContext {
 		while(l != null){
 			this.results.add(l);
 			// no single-character word in the dictionary, but lexemes overlap: emit the single chars of the lexeme preceding the overlapping one
-			int innerIndex = index + 1;
-			for (; innerIndex < index + l.getLength(); innerIndex++) {
-				Lexeme innerL = path.peekFirst();
-				if (innerL != null && innerIndex == innerL.getBegin()) {
-					this.outputSingleCJK(innerIndex - 1);
-				}
-			}
+			/*int innerIndex = index + 1;
+			for (; innerIndex < index + l.getLength(); innerIndex++) {
+				Lexeme innerL = path.peekFirst();
+				if (innerL != null && innerIndex == innerL.getBegin()) {
+					this.outputSingleCJK(innerIndex - 1);
+				}
+			}*/
 
 			// move index past this lexeme
 			index = l.getBegin() + l.getLength();
DictSegment.java

@@ -57,7 +57,7 @@ class DictSegment implements Comparable<DictSegment>{
 
 	DictSegment(Character nodeChar){
 		if(nodeChar == null){
-			throw new IllegalArgumentException("参数为空异常,字符不能为空");
+			throw new IllegalArgumentException("node char cannot be empty");
 		}
 		this.nodeChar = nodeChar;
 	}
src/main/java/org/wltea/analyzer/dic/Dictionary.java (16 changes, Normal file → Executable file)
@@ -52,7 +52,7 @@ import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.elasticsearch.SpecialPermission;
-import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.core.PathUtils;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
 import org.apache.logging.log4j.Logger;
@@ -80,7 +80,7 @@ public class Dictionary {
 	 */
 	private Configuration configuration;
 
-	private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
+	private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
 
 	private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
 
@@ -294,7 +294,7 @@ public class Dictionary {
 	 */
 	public static Dictionary getSingleton() {
 		if (singleton == null) {
-			throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
+			throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
 		}
 		return singleton;
 	}
@@ -419,7 +419,7 @@ public class Dictionary {
 				List<String> lists = getRemoteWords(location);
 				// if the extension dictionary cannot be found, ignore it
 				if (lists == null) {
-					logger.error("[Dict Loading] " + location + "加载失败");
+					logger.error("[Dict Loading] " + location + " load failed");
 					continue;
 				}
 				for (String theWord : lists) {
@@ -469,7 +469,7 @@ public class Dictionary {
 				}
 			}
 
-			if (entity.getContentLength() > 0) {
+			if (entity.getContentLength() > 0 || entity.isChunked()) {
 				in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
 				String line;
 				while ((line = in.readLine()) != null) {
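`HttpEntity.getContentLength()` returns a negative value when the length is unknown, which is exactly the case for `Transfer-Encoding: chunked` responses, so the old guard silently skipped chunked remote dictionaries; the added `entity.isChunked()` check accepts them. A quick way to see which case a dictionary endpoint falls into (hypothetical URL, not from this diff):

```bash
# a chunked response carries Transfer-Encoding: chunked and no Content-Length
curl -sI http://example.com/hot_words.txt | grep -iE 'content-length|transfer-encoding'
```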
@@ -518,7 +518,7 @@ public class Dictionary {
 				List<String> lists = getRemoteWords(location);
 				// if the extension dictionary cannot be found, ignore it
 				if (lists == null) {
-					logger.error("[Dict Loading] " + location + "加载失败");
+					logger.error("[Dict Loading] " + location + " load failed");
 					continue;
 				}
 				for (String theWord : lists) {
@@ -562,7 +562,7 @@ public class Dictionary {
 	}
 
 	void reLoadMainDict() {
-		logger.info("重新加载词典...");
+		logger.info("start to reload ik dict.");
 		// load the dictionaries into a fresh instance to reduce the impact of loading on the dictionary currently in use
 		Dictionary tmpDict = new Dictionary(configuration);
 		tmpDict.configuration = getSingleton().configuration;
@@ -570,7 +570,7 @@ public class Dictionary {
 		tmpDict.loadStopWordDict();
 		_MainDict = tmpDict._MainDict;
 		_StopWords = tmpDict._StopWords;
-		logger.info("重新加载词典完毕...");
+		logger.info("reload ik dict finished.");
 	}
 
 }
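`reLoadMainDict()` backs the hot-reload path: the scheduled pool polls each configured remote dictionary and rebuilds the word tries in a temporary `Dictionary` before swapping them in, so searches keep using the old tables during the reload. Per IK's remote-dictionary convention, the endpoint should return `Last-Modified` or `ETag` headers, since a change in either is what triggers the reload (hypothetical URL):

```bash
# headers IK's monitor compares between polls (URL illustrative)
curl -sI http://example.com/hot_words.txt | grep -iE 'last-modified|etag'
```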