Compare commits

..

1 Commit
master ... 6.x

Author SHA1 Message Date
medcl
cfdba08f46 revert to previous commit 2022-05-24 14:17:46 +08:00
9 changed files with 39 additions and 42 deletions

2
.github/FUNDING.yml vendored
View File

@ -1,2 +0,0 @@
patreon: medcl
custom: ["https://www.buymeacoffee.com/medcl"]

View File

@ -10,9 +10,16 @@ Versions
IK version | ES version
-----------|-----------
master | 7.x -> master
6.x| 6.x
5.x| 5.x
master | 6.x -> master
6.3.0| 6.3.0
6.2.4| 6.2.4
6.1.3| 6.1.3
5.6.8| 5.6.8
5.5.3| 5.5.3
5.4.3| 5.4.3
5.3.3| 5.3.3
5.2.2| 5.2.2
5.1.2| 5.1.2
1.10.6 | 2.4.6
1.9.5 | 2.3.5
1.8.1 | 2.2.1
@ -57,13 +64,13 @@ curl -XPUT http://localhost:9200/index
2.create a mapping
```bash
curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:application/json' -d'
{
"properties": {
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
"search_analyzer": "ik_max_word"
}
}
@ -73,25 +80,25 @@ curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/js
3.index some docs
```bash
curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/1 -H 'Content-Type:application/json' -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
```
```bash
curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/2 -H 'Content-Type:application/json' -d'
{"content":"公安部:各地校车将享最高路权"}
'
```
```bash
curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/3 -H 'Content-Type:application/json' -d'
{"content":"中韩渔警冲突调查韩警平均每天扣1艘中国渔船"}
'
```
```bash
curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/json' -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
'
```
@ -99,7 +106,7 @@ curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/j
4.query with highlighting
```bash
curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/fulltext/_search -H 'Content-Type:application/json' -d'
{
"query" : { "match" : { "content" : "中国" }},
"highlight" : {
@ -241,13 +248,13 @@ curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: applica
4. ik_max_word 和 ik_smart 什么区别?
ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query
ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合;
ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询
ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
Changes
------
*自 v5.0.0*
*5.0.0*
- 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word`

14
pom.xml Executable file → Normal file
View File

@ -12,7 +12,7 @@
<inceptionYear>2011</inceptionYear>
<properties>
<elasticsearch.version>8.4.1</elasticsearch.version>
<elasticsearch.version>6.5.0</elasticsearch.version>
<maven.compiler.target>1.8</maven.compiler.target>
<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
@ -34,10 +34,10 @@
<developers>
<developer>
<name>INFINI Labs</name>
<email>hello@infini.ltd</email>
<organization>INFINI Labs</organization>
<organizationUrl>https://infinilabs.com</organizationUrl>
<name>Medcl</name>
<email>medcl@elastic.co</email>
<organization>elastic</organization>
<organizationUrl>http://www.elastic.co</organizationUrl>
</developer>
</developers>
@ -71,7 +71,7 @@
<name>OSS Sonatype</name>
<releases><enabled>true</enabled></releases>
<snapshots><enabled>true</enabled></snapshots>
<url>https://oss.sonatype.org/content/repositories/releases/</url>
<url>http://oss.sonatype.org/content/repositories/releases/</url>
</repository>
</repositories>
@ -93,7 +93,7 @@
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.18.0</version>
<version>2.3</version>
</dependency>
<dependency>

View File

@ -10,7 +10,7 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer
private final IKAnalyzer analyzer;
public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
super(name, settings);
super(indexSettings, name, settings);
Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);

View File

@ -11,7 +11,7 @@ public class IkTokenizerFactory extends AbstractTokenizerFactory {
private Configuration configuration;
public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, settings,name);
super(indexSettings, name, settings);
configuration=new Configuration(env,settings);
}

View File

@ -4,7 +4,7 @@
package org.wltea.analyzer.cfg;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;

View File

@ -267,14 +267,6 @@ class AnalyzeContext {
Lexeme l = path.pollFirst();
while(l != null){
this.results.add(l);
//字典中无单字但是词元冲突了切分出相交词元的前一个词元中的单字
/*int innerIndex = index + 1;
for (; innerIndex < index + l.getLength(); innerIndex++) {
Lexeme innerL = path.peekFirst();
if (innerL != null && innerIndex == innerL.getBegin()) {
this.outputSingleCJK(innerIndex - 1);
}
}*/
//将index移至lexeme后
index = l.getBegin() + l.getLength();

View File

@ -57,7 +57,7 @@ class DictSegment implements Comparable<DictSegment>{
DictSegment(Character nodeChar){
if(nodeChar == null){
throw new IllegalArgumentException("node char cannot be empty");
throw new IllegalArgumentException("参数为空异常,字符不能为空");
}
this.nodeChar = nodeChar;
}

16
src/main/java/org/wltea/analyzer/dic/Dictionary.java Executable file → Normal file
View File

@ -52,7 +52,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.cfg.Configuration;
import org.apache.logging.log4j.Logger;
@ -80,7 +80,7 @@ public class Dictionary {
*/
private Configuration configuration;
private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
@ -294,7 +294,7 @@ public class Dictionary {
*/
public static Dictionary getSingleton() {
if (singleton == null) {
throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
throw new IllegalStateException("词典尚未初始化请先调用initial方法");
}
return singleton;
}
@ -419,7 +419,7 @@ public class Dictionary {
List<String> lists = getRemoteWords(location);
// 如果找不到扩展的字典则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + " load failed");
logger.error("[Dict Loading] " + location + "加载失败");
continue;
}
for (String theWord : lists) {
@ -469,7 +469,7 @@ public class Dictionary {
}
}
if (entity.getContentLength() > 0 || entity.isChunked()) {
if (entity.getContentLength() > 0) {
in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
String line;
while ((line = in.readLine()) != null) {
@ -518,7 +518,7 @@ public class Dictionary {
List<String> lists = getRemoteWords(location);
// 如果找不到扩展的字典则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + " load failed");
logger.error("[Dict Loading] " + location + "加载失败");
continue;
}
for (String theWord : lists) {
@ -562,7 +562,7 @@ public class Dictionary {
}
void reLoadMainDict() {
logger.info("start to reload ik dict.");
logger.info("重新加载词典...");
// 新开一个实例加载词典减少加载过程对当前词典使用的影响
Dictionary tmpDict = new Dictionary(configuration);
tmpDict.configuration = getSingleton().configuration;
@ -570,7 +570,7 @@ public class Dictionary {
tmpDict.loadStopWordDict();
_MainDict = tmpDict._MainDict;
_StopWords = tmpDict._StopWords;
logger.info("reload ik dict finished.");
logger.info("重新加载词典完毕...");
}
}