Compare commits

...

16 Commits
6.x ... master

Author SHA1 Message Date
medcl
9338c19104 update to 8.4.1 2022-09-02 18:44:03 +08:00
Medcl
0fb53ac32c
Update pom.xml
Update log4j
2022-01-19 11:59:06 +08:00
medcl
b637708ba0 update log4j 2021-12-13 09:45:53 +08:00
medcl
9c47725ea0 update for 7.14 2021-08-04 17:19:10 +08:00
Medcl
8e36b3240e
Update FUNDING.yml 2021-05-19 17:27:37 +08:00
Medcl
e0157d5f39
Update FUNDING.yml 2021-05-19 17:27:04 +08:00
Medcl
0fccc038e2
Create FUNDING.yml 2021-05-19 16:50:12 +08:00
Jack
5a1b8c8da6
Read chunked remote words (#817)
Fix chunked content that could not be read because a chunked response does not report a content length
I see there is an issue #780 and this fixes it
2020-09-06 16:34:40 +08:00
medcl
1375ca6d39 fix #789 2020-06-10 16:05:01 +08:00
Howard
4619effa15 transfer log message from chinese to english (#746) 2019-12-19 15:31:04 +08:00
medcl
5f53f1a5bf Merge branch 'master' of github.com:medcl/elasticsearch-analysis-ik 2019-10-07 19:01:51 +08:00
medcl
904a7493ea update to 7.4.0 2019-10-07 19:01:29 +08:00
zhipingpan
06e8a23d18 Update AnalyzeContext.java (#673) 2019-05-01 16:57:44 +08:00
Hongliang Wang
a1d6ba8ca2 Correct Search Analyzer (#668)
The former search analyzer `ik_max_word` will give the wrong result against the example described later in the README file.
2019-04-19 20:23:43 +08:00
medcl
90c9b58354 update example 2019-04-11 10:07:22 +08:00
medcl
ba8bb85f31 update to support 7.x 2019-04-11 09:35:19 +08:00
9 changed files with 36 additions and 41 deletions

2
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1,2 @@
patreon: medcl
custom: ["https://www.buymeacoffee.com/medcl"]

View File

@ -10,16 +10,9 @@ Versions
IK version | ES version
-----------|-----------
master | 6.x -> master
6.3.0| 6.3.0
6.2.4| 6.2.4
6.1.3| 6.1.3
5.6.8| 5.6.8
5.5.3| 5.5.3
5.4.3| 5.4.3
5.3.3| 5.3.3
5.2.2| 5.2.2
5.1.2| 5.1.2
master | 7.x -> master
6.x| 6.x
5.x| 5.x
1.10.6 | 2.4.6
1.9.5 | 2.3.5
1.8.1 | 2.2.1
@ -64,13 +57,13 @@ curl -XPUT http://localhost:9200/index
2.create a mapping
```bash
curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
{
"properties": {
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word"
"search_analyzer": "ik_smart"
}
}
@ -80,25 +73,25 @@ curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:appli
3.index some docs
```bash
curl -XPOST http://localhost:9200/index/fulltext/1 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
```
```bash
curl -XPOST http://localhost:9200/index/fulltext/2 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
{"content":"公安部:各地校车将享最高路权"}
'
```
```bash
curl -XPOST http://localhost:9200/index/fulltext/3 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
{"content":"中韩渔警冲突调查韩警平均每天扣1艘中国渔船"}
'
```
```bash
curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
'
```
@ -106,7 +99,7 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/
4.query with highlighting
```bash
curl -XPOST http://localhost:9200/index/fulltext/_search -H 'Content-Type:application/json' -d'
curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
{
"query" : { "match" : { "content" : "中国" }},
"highlight" : {
@ -248,13 +241,13 @@ curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: applica
4. ik_max_word 和 ik_smart 什么区别?
ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合;
ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query
ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询
Changes
------
*5.0.0*
*自 v5.0.0*
- 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word`

14
pom.xml Normal file → Executable file
View File

@ -12,7 +12,7 @@
<inceptionYear>2011</inceptionYear>
<properties>
<elasticsearch.version>6.5.0</elasticsearch.version>
<elasticsearch.version>8.4.1</elasticsearch.version>
<maven.compiler.target>1.8</maven.compiler.target>
<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
@ -34,10 +34,10 @@
<developers>
<developer>
<name>Medcl</name>
<email>medcl@elastic.co</email>
<organization>elastic</organization>
<organizationUrl>http://www.elastic.co</organizationUrl>
<name>INFINI Labs</name>
<email>hello@infini.ltd</email>
<organization>INFINI Labs</organization>
<organizationUrl>https://infinilabs.com</organizationUrl>
</developer>
</developers>
@ -71,7 +71,7 @@
<name>OSS Sonatype</name>
<releases><enabled>true</enabled></releases>
<snapshots><enabled>true</enabled></snapshots>
<url>http://oss.sonatype.org/content/repositories/releases/</url>
<url>https://oss.sonatype.org/content/repositories/releases/</url>
</repository>
</repositories>
@ -93,7 +93,7 @@
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.3</version>
<version>2.18.0</version>
</dependency>
<dependency>

View File

@ -10,7 +10,7 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer
private final IKAnalyzer analyzer;
public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
super(indexSettings, name, settings);
super(name, settings);
Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);

View File

@ -11,7 +11,7 @@ public class IkTokenizerFactory extends AbstractTokenizerFactory {
private Configuration configuration;
public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
super(indexSettings, settings,name);
configuration=new Configuration(env,settings);
}

View File

@ -4,7 +4,7 @@
package org.wltea.analyzer.cfg;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;

View File

@ -268,13 +268,13 @@ class AnalyzeContext {
while(l != null){
this.results.add(l);
//字典中无单字但是词元冲突了切分出相交词元的前一个词元中的单字
int innerIndex = index + 1;
/*int innerIndex = index + 1;
for (; innerIndex < index + l.getLength(); innerIndex++) {
Lexeme innerL = path.peekFirst();
if (innerL != null && innerIndex == innerL.getBegin()) {
this.outputSingleCJK(innerIndex - 1);
}
}
}*/
//将index移至lexeme后
index = l.getBegin() + l.getLength();

View File

@ -57,7 +57,7 @@ class DictSegment implements Comparable<DictSegment>{
DictSegment(Character nodeChar){
if(nodeChar == null){
throw new IllegalArgumentException("参数为空异常,字符不能为空");
throw new IllegalArgumentException("node char cannot be empty");
}
this.nodeChar = nodeChar;
}

16
src/main/java/org/wltea/analyzer/dic/Dictionary.java Normal file → Executable file
View File

@ -52,7 +52,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.cfg.Configuration;
import org.apache.logging.log4j.Logger;
@ -80,7 +80,7 @@ public class Dictionary {
*/
private Configuration configuration;
private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
@ -294,7 +294,7 @@ public class Dictionary {
*/
public static Dictionary getSingleton() {
if (singleton == null) {
throw new IllegalStateException("词典尚未初始化请先调用initial方法");
throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
}
return singleton;
}
@ -419,7 +419,7 @@ public class Dictionary {
List<String> lists = getRemoteWords(location);
// 如果找不到扩展的字典则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + "加载失败");
logger.error("[Dict Loading] " + location + " load failed");
continue;
}
for (String theWord : lists) {
@ -469,7 +469,7 @@ public class Dictionary {
}
}
if (entity.getContentLength() > 0) {
if (entity.getContentLength() > 0 || entity.isChunked()) {
in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
String line;
while ((line = in.readLine()) != null) {
@ -518,7 +518,7 @@ public class Dictionary {
List<String> lists = getRemoteWords(location);
// 如果找不到扩展的字典则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + "加载失败");
logger.error("[Dict Loading] " + location + " load failed");
continue;
}
for (String theWord : lists) {
@ -562,7 +562,7 @@ public class Dictionary {
}
void reLoadMainDict() {
logger.info("重新加载词典...");
logger.info("start to reload ik dict.");
// 新开一个实例加载词典减少加载过程对当前词典使用的影响
Dictionary tmpDict = new Dictionary(configuration);
tmpDict.configuration = getSingleton().configuration;
@ -570,7 +570,7 @@ public class Dictionary {
tmpDict.loadStopWordDict();
_MainDict = tmpDict._MainDict;
_StopWords = tmpDict._StopWords;
logger.info("重新加载词典完毕...");
logger.info("reload ik dict finished.");
}
}