Compare commits

47 Commits

9338c19104
0fb53ac32c
b637708ba0
9c47725ea0
8e36b3240e
e0157d5f39
0fccc038e2
5a1b8c8da6
1375ca6d39
4619effa15
5f53f1a5bf
904a7493ea
06e8a23d18
a1d6ba8ca2
90c9b58354
ba8bb85f31
125ac3c5e5
f0dd522e60
9eaa2b90eb
9873489ba7
949531572b
1d750a9bdd
3a7a81c29d
1422d5b96c
9fff6379ef
5190dba198
83fa2ff8b2
0222529290
5e8d0df2be
36e6d2d00b
de1da42d38
3dcedde9e4
21a859a48d
816b8ddd4b
7028b9ea05
4ab2616a96
7c9b4771b3
22de5be444
0e8ddbd749
7a1445fdda
353cefd5b8
0922152fb8
cc01c881af
eb21c796d8
5828cb1c72
dc739d2cee
2851cc2501
.github/FUNDING.yml — 2 changes (vendored, new file)

@@ -0,0 +1,2 @@
+patreon: medcl
+custom: ["https://www.buymeacoffee.com/medcl"]
.travis.yml

@@ -7,12 +7,3 @@ script:
 - java -version
 language: java
 script: mvn clean package
-deploy:
-  provider: releases
-  api_key:
-    secure: llxJZlRYBIWINl5XI42RpEe+jTxlmSP6MX+oTNZa4oFjEeN9Kdd1G8+S3HSIhCc31RoF/2zeNsM9OehRi1O6bweNSQ9vjlKZQPD8FYcHaHpYW0U7h/OMbEeC794fAghm9ZsmOTNymdvbAXL14nJTrwOW9W8VqoZT9Jx7Ejad63Y=
-  file: target/releases/elasticsearch-analysis-ik-*.zip
-  file_glob: true
-  on:
-    repo: medcl/elasticsearch-analysis-ik
-    tags: true
README.md — 54 changes

@@ -3,20 +3,16 @@ IK Analysis for Elasticsearch
 
 The IK Analysis plugin integrates Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into elasticsearch, support customized dictionary.
 
 Analyzer: `ik_smart` , `ik_max_word` , Tokenizer: `ik_smart` , `ik_max_word`
 
 Versions
 --------
 
 IK version | ES version
 -----------|-----------
-master | 5.x -> master
-5.6.4| 5.6.4
-5.5.3| 5.5.3
-5.4.3| 5.4.3
-5.3.3| 5.3.3
-5.2.2| 5.2.2
-5.1.2| 5.1.2
+master | 7.x -> master
+6.x| 6.x
+5.x| 5.x
 1.10.6 | 2.4.6
 1.9.5 | 2.3.5
 1.8.1 | 2.2.1
@@ -33,12 +29,18 @@ Install
 1.download or compile
 
 * optional 1 - download pre-build package from here: https://github.com/medcl/elasticsearch-analysis-ik/releases
 
-unzip plugin to folder `your-es-root/plugins/`
+create plugin folder `cd your-es-root/plugins/ && mkdir ik`
+
+unzip plugin to folder `your-es-root/plugins/ik`
 
-* optional 2 - use elasticsearch-plugin to install ( version > v5.5.1 ):
+* optional 2 - use elasticsearch-plugin to install ( supported from version v5.5.1 ):
 
-`./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip`
+```
+./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.3.0/elasticsearch-analysis-ik-6.3.0.zip
+```
+
+NOTE: replace `6.3.0` to your own elasticsearch version
 
 2.restart elasticsearch
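After the restart in step 2, a quick sanity check that the plugin registered (a minimal sketch; `elasticsearch-plugin list` is the stock CLI shipped with Elasticsearch, run from the ES root):

```bash
# the plugin should show up as analysis-ik after the restart
./bin/elasticsearch-plugin list
```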
@@ -55,41 +57,41 @@ curl -XPUT http://localhost:9200/index
 2.create a mapping
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_mapping -d'
+curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
 {
         "properties": {
             "content": {
                 "type": "text",
                 "analyzer": "ik_max_word",
-                "search_analyzer": "ik_max_word"
+                "search_analyzer": "ik_smart"
             }
         }
 
 }'
 ```
 
 3.index some docs
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/1 -d'
+curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
 {"content":"美国留给伊拉克的是个烂摊子吗"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/2 -d'
+curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
 {"content":"公安部:各地校车将享最高路权"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/3 -d'
+curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
 {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
 '
 ```
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/4 -d'
+curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
 {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
 '
 ```
@@ -97,7 +99,7 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -d'
 4.query with highlighting
 
 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_search -d'
+curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
 {
     "query" : { "match" : { "content" : "中国" }},
     "highlight" : {
@@ -227,19 +229,25 @@ mvn package
 
 3. Tokenization test fails
 Please test via the analyze API under a concrete index, not the bare analyze endpoint,
-e.g.: http://localhost:9200/your_index/_analyze?text=中华人民共和国MN&tokenizer=my_ik
+e.g.:
+
+```bash
+curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
+{
+   "text":"中华人民共和国MN","tokenizer": "my_ik"
+}'
+```
 
 
 4. What is the difference between ik_max_word and ik_smart?
 
 
-ik_max_word: performs the finest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 人, 民, 共和国, 共和, 和, 国国, 国歌", exhausting every possible combination;
+ik_max_word: performs the finest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 人, 民, 共和国, 共和, 和, 国国, 国歌", exhausting every possible combination; suited to Term queries;
 
-ik_smart: performs the coarsest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国, 国歌".
+ik_smart: performs the coarsest-grained segmentation; e.g. "中华人民共和国国歌" is split into "中华人民共和国, 国歌"; suited to Phrase queries.
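To see the two analyzers side by side, the standard `_analyze` API works against any index once the plugin is installed (a minimal sketch; `your_index` is a placeholder):

```bash
# finest-grained: exhausts the combinations listed above
curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
{"analyzer": "ik_max_word", "text": "中华人民共和国国歌"}'

# coarsest-grained: yields 中华人民共和国 and 国歌 only
curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
{"analyzer": "ik_smart", "text": "中华人民共和国国歌"}'
```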
 Changes
 ------
-*5.0.0*
+*Since v5.0.0*
 
 - Removed the analyzer and tokenizer named `ik`; please use `ik_smart` and `ik_max_word` instead
pom.xml — 16 changes (Normal file → Executable file)

@@ -12,7 +12,7 @@
 <inceptionYear>2011</inceptionYear>
 
 <properties>
-    <elasticsearch.version>5.6.4</elasticsearch.version>
+    <elasticsearch.version>8.4.1</elasticsearch.version>
     <maven.compiler.target>1.8</maven.compiler.target>
     <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
     <elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>

@@ -21,7 +21,7 @@
     <tests.rest.load_packaged>false</tests.rest.load_packaged>
     <skip.unit.tests>true</skip.unit.tests>
     <gpg.keyname>4E899B30</gpg.keyname>
     <gpg.useagent>true</gpg.useagent>
 </properties>
 
 <licenses>

@@ -34,10 +34,10 @@
 
 <developers>
     <developer>
-        <name>Medcl</name>
-        <email>medcl@elastic.co</email>
-        <organization>elastic</organization>
-        <organizationUrl>http://www.elastic.co</organizationUrl>
+        <name>INFINI Labs</name>
+        <email>hello@infini.ltd</email>
+        <organization>INFINI Labs</organization>
+        <organizationUrl>https://infinilabs.com</organizationUrl>
     </developer>
 </developers>

@@ -71,7 +71,7 @@
     <name>OSS Sonatype</name>
     <releases><enabled>true</enabled></releases>
     <snapshots><enabled>true</enabled></snapshots>
-    <url>http://oss.sonatype.org/content/repositories/releases/</url>
+    <url>https://oss.sonatype.org/content/repositories/releases/</url>
 </repository>
 </repositories>

@@ -93,7 +93,7 @@
 <dependency>
     <groupId>org.apache.logging.log4j</groupId>
     <artifactId>log4j-api</artifactId>
-    <version>2.3</version>
+    <version>2.18.0</version>
 </dependency>
 
 <dependency>
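To build the plugin against the bumped `elasticsearch.version`, the same command the Travis config uses still applies; the distributable lands under `target/releases/` (the path the removed Travis deploy glob referenced):

```bash
mvn clean package
ls target/releases/elasticsearch-analysis-ik-*.zip
```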
src/main/assemblies/plugin.xml

@@ -8,20 +8,25 @@
 <fileSets>
     <fileSet>
         <directory>${project.basedir}/config</directory>
-        <outputDirectory>elasticsearch/config</outputDirectory>
+        <outputDirectory>config</outputDirectory>
     </fileSet>
 </fileSets>
 
 <files>
     <file>
         <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
-        <outputDirectory>elasticsearch</outputDirectory>
+        <outputDirectory/>
+        <filtered>true</filtered>
+    </file>
+    <file>
+        <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
+        <outputDirectory/>
         <filtered>true</filtered>
     </file>
 </files>
 <dependencySets>
     <dependencySet>
-        <outputDirectory>elasticsearch</outputDirectory>
+        <outputDirectory/>
         <useProjectArtifact>true</useProjectArtifact>
         <useTransitiveFiltering>true</useTransitiveFiltering>
         <excludes>

@@ -29,7 +34,7 @@
     </excludes>
 </dependencySet>
 <dependencySet>
-    <outputDirectory>elasticsearch</outputDirectory>
+    <outputDirectory/>
     <useProjectArtifact>true</useProjectArtifact>
     <useTransitiveFiltering>true</useTransitiveFiltering>
     <includes>
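With every `<outputDirectory>` flattened to the archive root, the zip now unpacks directly into `plugins/ik` with no `elasticsearch/` wrapper folder. A quick way to confirm the layout (assuming the zip built above):

```bash
# config/, plugin-descriptor.properties and plugin-security.policy
# should sit at the top level of the archive
unzip -l target/releases/elasticsearch-analysis-ik-*.zip
```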
IkAnalyzerProvider.java

@@ -10,7 +10,7 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer
 private final IKAnalyzer analyzer;
 
 public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
-    super(indexSettings, name, settings);
+    super(name, settings);
 
     Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
IkTokenizerFactory.java

@@ -11,7 +11,7 @@ public class IkTokenizerFactory extends AbstractTokenizerFactory {
 private Configuration configuration;
 
 public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
-    super(indexSettings, name, settings);
+    super(indexSettings, settings,name);
     configuration=new Configuration(env,settings);
 }
src/main/java/org/wltea/analyzer/cfg/Configuration.java — 2 changes (Normal file → Executable file)

@@ -4,7 +4,7 @@
 package org.wltea.analyzer.cfg;
 
 import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.core.PathUtils;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
AnalyzeContext.java

@@ -48,7 +48,7 @@ class AnalyzeContext {
 private static final int BUFF_EXHAUST_CRITICAL = 100;
 
-//字符窜读取缓冲
+//字符串读取缓冲
 private char[] segmentBuff;
 //字符类型数组
 private int[] charTypes;

@@ -267,6 +267,15 @@ class AnalyzeContext {
 Lexeme l = path.pollFirst();
 while(l != null){
     this.results.add(l);
+    //字典中无单字,但是词元冲突了,切分出相交词元的前一个词元中的单字
+    /*int innerIndex = index + 1;
+    for (; innerIndex < index + l.getLength(); innerIndex++) {
+        Lexeme innerL = path.peekFirst();
+        if (innerL != null && innerIndex == innerL.getBegin()) {
+            this.outputSingleCJK(innerIndex - 1);
+        }
+    }*/
+
     //将index移至lexeme后
     index = l.getBegin() + l.getLength();
     l = path.pollFirst();
DictSegment.java

@@ -57,7 +57,7 @@ class DictSegment implements Comparable<DictSegment>{
 
 DictSegment(Character nodeChar){
     if(nodeChar == null){
-        throw new IllegalArgumentException("参数为空异常,字符不能为空");
+        throw new IllegalArgumentException("node char cannot be empty");
     }
     this.nodeChar = nodeChar;
 }
src/main/java/org/wltea/analyzer/dic/Dictionary.java — 434 changes (Normal file → Executable file)

@@ -26,29 +26,37 @@
 package org.wltea.analyzer.dic;
 
 import java.io.BufferedReader;
-import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.nio.file.Files;
+import java.nio.file.FileVisitResult;
 import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
 import java.util.*;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 
+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
 import org.apache.http.client.ClientProtocolException;
 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
-import org.elasticsearch.common.io.PathUtils;
-import org.elasticsearch.common.logging.ESLoggerFactory;
+import org.elasticsearch.SpecialPermission;
+import org.elasticsearch.core.PathUtils;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
 import org.apache.logging.log4j.Logger;
+import org.wltea.analyzer.help.ESPluginLoggerFactory;
 
 
 /**

@@ -63,14 +71,8 @@ public class Dictionary {
 
     private DictSegment _MainDict;
 
-    private DictSegment _SurnameDict;
-
     private DictSegment _QuantifierDict;
 
-    private DictSegment _SuffixDict;
-
-    private DictSegment _PrepDict;
-
     private DictSegment _StopWords;
 
     /**

@@ -78,16 +80,16 @@ public class Dictionary {
     */
     private Configuration configuration;
 
-    private static final Logger logger = ESLoggerFactory.getLogger(Monitor.class.getName());
+    private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
 
     private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
 
-    public static final String PATH_DIC_MAIN = "main.dic";
-    public static final String PATH_DIC_SURNAME = "surname.dic";
-    public static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
-    public static final String PATH_DIC_SUFFIX = "suffix.dic";
-    public static final String PATH_DIC_PREP = "preposition.dic";
-    public static final String PATH_DIC_STOP = "stopword.dic";
+    private static final String PATH_DIC_MAIN = "main.dic";
+    private static final String PATH_DIC_SURNAME = "surname.dic";
+    private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
+    private static final String PATH_DIC_SUFFIX = "suffix.dic";
+    private static final String PATH_DIC_PREP = "preposition.dic";
+    private static final String PATH_DIC_STOP = "stopword.dic";
 
     private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
     private final static String EXT_DICT = "ext_dict";

@@ -122,15 +124,13 @@ public class Dictionary {
         if (input != null) {
             try {
                 props.loadFromXML(input);
-            } catch (InvalidPropertiesFormatException e) {
-                logger.error("ik-analyzer", e);
             } catch (IOException e) {
                 logger.error("ik-analyzer", e);
             }
         }
     }
 
-    public String getProperty(String key){
+    private String getProperty(String key){
         if(props!=null){
             return props.getProperty(key);
         }

@@ -142,7 +142,7 @@ public class Dictionary {
     *
     * @return Dictionary
     */
-    public static synchronized Dictionary initial(Configuration cfg) {
+    public static synchronized void initial(Configuration cfg) {
         if (singleton == null) {
             synchronized (Dictionary.class) {
                 if (singleton == null) {

@@ -166,14 +166,57 @@ public class Dictionary {
                 }
             }
 
-                    return singleton;
                 }
             }
         }
-        return singleton;
     }
 
-    public List<String> getExtDictionarys() {
+    private void walkFileTree(List<String> files, Path path) {
+        if (Files.isRegularFile(path)) {
+            files.add(path.toString());
+        } else if (Files.isDirectory(path)) try {
+            Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+                    files.add(file.toString());
+                    return FileVisitResult.CONTINUE;
+                }
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException e) {
+                    logger.error("[Ext Loading] listing files", e);
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+        } catch (IOException e) {
+            logger.error("[Ext Loading] listing files", e);
+        } else {
+            logger.warn("[Ext Loading] file not found: " + path);
+        }
+    }
+
+    private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
+        try (InputStream is = new FileInputStream(file.toFile())) {
+            BufferedReader br = new BufferedReader(
+                    new InputStreamReader(is, "UTF-8"), 512);
+            String word = br.readLine();
+            if (word != null) {
+                if (word.startsWith("\uFEFF"))
+                    word = word.substring(1);
+                for (; word != null; word = br.readLine()) {
+                    word = word.trim();
+                    if (word.isEmpty()) continue;
+                    dict.fillSegment(word.toCharArray());
+                }
+            }
+        } catch (FileNotFoundException e) {
+            logger.error("ik-analyzer: " + name + " not found", e);
+            if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
+        } catch (IOException e) {
+            logger.error("ik-analyzer: " + name + " loading failed", e);
+        }
+    }
+
+    private List<String> getExtDictionarys() {
         List<String> extDictFiles = new ArrayList<String>(2);
         String extDictCfg = getProperty(EXT_DICT);
         if (extDictCfg != null) {

@@ -181,8 +224,8 @@ public class Dictionary {
             String[] filePaths = extDictCfg.split(";");
             for (String filePath : filePaths) {
                 if (filePath != null && !"".equals(filePath.trim())) {
-                    Path file = PathUtils.get(filePath.trim());
-                    extDictFiles.add(file.toString());
+                    Path file = PathUtils.get(getDictRoot(), filePath.trim());
+                    walkFileTree(extDictFiles, file);
 
                 }
             }

@@ -190,7 +233,7 @@ public class Dictionary {
         return extDictFiles;
     }
 
-    public List<String> getRemoteExtDictionarys() {
+    private List<String> getRemoteExtDictionarys() {
         List<String> remoteExtDictFiles = new ArrayList<String>(2);
         String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
         if (remoteExtDictCfg != null) {

@@ -206,7 +249,7 @@ public class Dictionary {
         return remoteExtDictFiles;
     }
 
-    public List<String> getExtStopWordDictionarys() {
+    private List<String> getExtStopWordDictionarys() {
         List<String> extStopWordDictFiles = new ArrayList<String>(2);
         String extStopWordDictCfg = getProperty(EXT_STOP);
         if (extStopWordDictCfg != null) {

@@ -214,8 +257,8 @@ public class Dictionary {
             String[] filePaths = extStopWordDictCfg.split(";");
             for (String filePath : filePaths) {
                 if (filePath != null && !"".equals(filePath.trim())) {
-                    Path file = PathUtils.get(filePath.trim());
-                    extStopWordDictFiles.add(file.toString());
+                    Path file = PathUtils.get(getDictRoot(), filePath.trim());
+                    walkFileTree(extStopWordDictFiles, file);
 
                 }
             }

@@ -223,7 +266,7 @@ public class Dictionary {
         return extStopWordDictFiles;
     }
 
-    public List<String> getRemoteExtStopWordDictionarys() {
+    private List<String> getRemoteExtStopWordDictionarys() {
         List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
         String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
         if (remoteExtStopWordDictCfg != null) {

@@ -239,7 +282,7 @@ public class Dictionary {
         return remoteExtStopWordDictFiles;
     }
 
-    public String getDictRoot() {
+    private String getDictRoot() {
         return conf_dir.toAbsolutePath().toString();
     }

@@ -251,7 +294,7 @@ public class Dictionary {
     */
     public static Dictionary getSingleton() {
         if (singleton == null) {
-            throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
+            throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
         }
         return singleton;
     }

@@ -343,37 +386,7 @@ public class Dictionary {
 
         // 读取主词典文件
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
+        loadDictFile(_MainDict, file, false, "Main Dict");
-
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error(e.getMessage(), e);
-        }
-
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord = null;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _MainDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-
-        } catch (IOException e) {
-            logger.error("ik-analyzer", e);
-
-        } finally {
-            try {
-                if (is != null) {
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
         // 加载扩展词典
         this.loadExtDict();
         // 加载远程自定义词库

@@ -387,44 +400,11 @@ public class Dictionary {
         // 加载扩展词典配置
         List<String> extDictFiles = getExtDictionarys();
         if (extDictFiles != null) {
-            InputStream is = null;
             for (String extDictName : extDictFiles) {
                 // 读取扩展词典文件
                 logger.info("[Dict Loading] " + extDictName);
-                Path file = PathUtils.get(getDictRoot(), extDictName);
-                try {
-                    is = new FileInputStream(file.toFile());
-                } catch (FileNotFoundException e) {
-                    logger.error("ik-analyzer", e);
-                }
-
-                // 如果找不到扩展的字典,则忽略
-                if (is == null) {
-                    continue;
-                }
-                try {
-                    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-                    String theWord = null;
-                    do {
-                        theWord = br.readLine();
-                        if (theWord != null && !"".equals(theWord.trim())) {
-                            // 加载扩展词典数据到主内存词典中
-                            _MainDict.fillSegment(theWord.trim().toCharArray());
-                        }
-                    } while (theWord != null);
-
-                } catch (IOException e) {
-                    logger.error("ik-analyzer", e);
-                } finally {
-                    try {
-                        if (is != null) {
-                            is.close();
-                            is = null;
-                        }
-                    } catch (IOException e) {
-                        logger.error("ik-analyzer", e);
-                    }
-                }
+                Path file = PathUtils.get(extDictName);
+                loadDictFile(_MainDict, file, false, "Extra Dict");
             }
         }
     }

@@ -439,7 +419,7 @@ public class Dictionary {
             List<String> lists = getRemoteWords(location);
             // 如果找不到扩展的字典,则忽略
             if (lists == null) {
-                logger.error("[Dict Loading] " + location + "加载失败");
+                logger.error("[Dict Loading] " + location + " load failed");
                 continue;
             }
             for (String theWord : lists) {

@@ -453,10 +433,17 @@ public class Dictionary {
 
     }
 
+    private static List<String> getRemoteWords(String location) {
+        SpecialPermission.check();
+        return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
+            return getRemoteWordsUnprivileged(location);
+        });
+    }
+
     /**
     * 从远程服务器上下载自定义词条
     */
-    private static List<String> getRemoteWords(String location) {
+    private static List<String> getRemoteWordsUnprivileged(String location) {
 
         List<String> buffer = new ArrayList<String>();
         RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)

@@ -472,25 +459,30 @@ public class Dictionary {
 
             String charset = "UTF-8";
             // 获取编码,默认为utf-8
-            if (response.getEntity().getContentType().getValue().contains("charset=")) {
-                String contentType = response.getEntity().getContentType().getValue();
-                charset = contentType.substring(contentType.lastIndexOf("=") + 1);
-            }
-            in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset));
-            String line;
-            while ((line = in.readLine()) != null) {
-                buffer.add(line);
-            }
-            in.close();
-            response.close();
-            return buffer;
+            HttpEntity entity = response.getEntity();
+            if(entity!=null){
+                Header contentType = entity.getContentType();
+                if(contentType!=null&&contentType.getValue()!=null){
+                    String typeValue = contentType.getValue();
+                    if(typeValue!=null&&typeValue.contains("charset=")){
+                        charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
+                    }
+                }
+
+                if (entity.getContentLength() > 0 || entity.isChunked()) {
+                    in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
+                    String line;
+                    while ((line = in.readLine()) != null) {
+                        buffer.add(line);
+                    }
+                    in.close();
+                    response.close();
+                    return buffer;
+                }
+            }
             response.close();
-        } catch (ClientProtocolException e) {
-            logger.error("getRemoteWords {} error", e, location);
-        } catch (IllegalStateException e) {
-            logger.error("getRemoteWords {} error", e, location);
-        } catch (IOException e) {
+        } catch (IllegalStateException | IOException e) {
             logger.error("getRemoteWords {} error", e, location);
         }
         return buffer;

@@ -505,80 +497,17 @@ public class Dictionary {
 
         // 读取主词典文件
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
+        loadDictFile(_StopWords, file, false, "Main Stopwords");
-
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error(e.getMessage(), e);
-        }
-
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord = null;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _StopWords.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-
-        } catch (IOException e) {
-            logger.error("ik-analyzer", e);
-
-        } finally {
-            try {
-                if (is != null) {
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
 
         // 加载扩展停止词典
         List<String> extStopWordDictFiles = getExtStopWordDictionarys();
         if (extStopWordDictFiles != null) {
-            is = null;
             for (String extStopWordDictName : extStopWordDictFiles) {
                 logger.info("[Dict Loading] " + extStopWordDictName);
 
                 // 读取扩展词典文件
-                file = PathUtils.get(getDictRoot(), extStopWordDictName);
-                try {
-                    is = new FileInputStream(file.toFile());
-                } catch (FileNotFoundException e) {
-                    logger.error("ik-analyzer", e);
-                }
-                // 如果找不到扩展的字典,则忽略
-                if (is == null) {
-                    continue;
-                }
-                try {
-                    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-                    String theWord = null;
-                    do {
-                        theWord = br.readLine();
-                        if (theWord != null && !"".equals(theWord.trim())) {
-                            // 加载扩展停止词典数据到内存中
-                            _StopWords.fillSegment(theWord.trim().toCharArray());
-                        }
-                    } while (theWord != null);
-
-                } catch (IOException e) {
-                    logger.error("ik-analyzer", e);
-
-                } finally {
-                    try {
-                        if (is != null) {
-                            is.close();
-                            is = null;
-                        }
-                    } catch (IOException e) {
-                        logger.error("ik-analyzer", e);
-                    }
-                }
+                file = PathUtils.get(extStopWordDictName);
+                loadDictFile(_StopWords, file, false, "Extra Stopwords");
             }
         }

@@ -589,7 +518,7 @@ public class Dictionary {
             List<String> lists = getRemoteWords(location);
             // 如果找不到扩展的字典,则忽略
             if (lists == null) {
-                logger.error("[Dict Loading] " + location + "加载失败");
+                logger.error("[Dict Loading] " + location + " load failed");
                 continue;
             }
             for (String theWord : lists) {

@@ -611,146 +540,29 @@ public class Dictionary {
         _QuantifierDict = new DictSegment((char) 0);
         // 读取量词词典文件
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
+        loadDictFile(_QuantifierDict, file, false, "Quantifier");
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error("ik-analyzer", e);
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord = null;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _QuantifierDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-
-        } catch (IOException ioe) {
-            logger.error("Quantifier Dictionary loading exception.");
-
-        } finally {
-            try {
-                if (is != null) {
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
     }
 
     private void loadSurnameDict() {
-        _SurnameDict = new DictSegment((char) 0);
+        DictSegment _SurnameDict = new DictSegment((char) 0);
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
+        loadDictFile(_SurnameDict, file, true, "Surname");
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error("ik-analyzer", e);
-        }
-        if (is == null) {
-            throw new RuntimeException("Surname Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _SurnameDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-        } catch (IOException e) {
-            logger.error("ik-analyzer", e);
-        } finally {
-            try {
-                if (is != null) {
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
     }
 
     private void loadSuffixDict() {
-        _SuffixDict = new DictSegment((char) 0);
+        DictSegment _SuffixDict = new DictSegment((char) 0);
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
+        loadDictFile(_SuffixDict, file, true, "Suffix");
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error("ik-analyzer", e);
-        }
-        if (is == null) {
-            throw new RuntimeException("Suffix Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _SuffixDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-        } catch (IOException e) {
-            logger.error("ik-analyzer", e);
-        } finally {
-            try {
-                is.close();
-                is = null;
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
     }
 
     private void loadPrepDict() {
-        _PrepDict = new DictSegment((char) 0);
+        DictSegment _PrepDict = new DictSegment((char) 0);
         Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
+        loadDictFile(_PrepDict, file, true, "Preposition");
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file.toFile());
-        } catch (FileNotFoundException e) {
-            logger.error("ik-analyzer", e);
-        }
-        if (is == null) {
-            throw new RuntimeException("Preposition Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _PrepDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-        } catch (IOException e) {
-            logger.error("ik-analyzer", e);
-        } finally {
-            try {
-                is.close();
-                is = null;
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
     }
 
-    public void reLoadMainDict() {
-        logger.info("重新加载词典...");
+    void reLoadMainDict() {
+        logger.info("start to reload ik dict.");
         // 新开一个实例加载词典,减少加载过程对当前词典使用的影响
         Dictionary tmpDict = new Dictionary(configuration);
         tmpDict.configuration = getSingleton().configuration;

@@ -758,7 +570,7 @@ public class Dictionary {
         tmpDict.loadStopWordDict();
         _MainDict = tmpDict._MainDict;
         _StopWords = tmpDict._StopWords;
-        logger.info("重新加载词典完毕...");
+        logger.info("reload ik dict finished.");
     }
 
 }
src/main/java/org/wltea/analyzer/dic/Monitor.java

@@ -1,6 +1,8 @@
 package org.wltea.analyzer.dic;
 
 import java.io.IOException;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
 
 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;

@@ -8,11 +10,12 @@ import org.apache.http.client.methods.HttpHead;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.logging.log4j.Logger;
-import org.elasticsearch.common.logging.ESLoggerFactory;
+import org.elasticsearch.SpecialPermission;
+import org.wltea.analyzer.help.ESPluginLoggerFactory;
 
 public class Monitor implements Runnable {
 
-    private static final Logger logger = ESLoggerFactory.getLogger(Monitor.class.getName());
+    private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
 
     private static CloseableHttpClient httpclient = HttpClients.createDefault();
 /*

@@ -34,6 +37,15 @@ public class Monitor implements Runnable {
     this.last_modified = null;
     this.eTags = null;
 }
+
+public void run() {
+    SpecialPermission.check();
+    AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
+        this.runUnprivileged();
+        return null;
+    });
+}
+
 /**
  * 监控流程:
  * ①向词库服务器发送Head请求

@@ -43,7 +55,7 @@ public class Monitor implements Runnable {
 * ⑤休眠1min,返回第①步
 */
 
-public void run() {
+public void runUnprivileged() {
 
     //超时设置
     RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
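The monitor's HEAD-then-GET cycle relies on the remote dictionary endpoint returning `Last-Modified` and/or `ETag`; without them it has nothing to compare between polls. A quick check against a dictionary URL (placeholder host):

```bash
# the plugin re-downloads only when one of these headers changes
curl -I http://example.com/ext_dict.txt
```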
src/main/java/org/wltea/analyzer/help/ESPluginLoggerFactory.java (new file)

@@ -0,0 +1,27 @@
+package org.wltea.analyzer.help;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.spi.ExtendedLogger;
+
+public class ESPluginLoggerFactory {
+
+    private ESPluginLoggerFactory() {
+    }
+
+    static public Logger getLogger(String name) {
+        return getLogger("", LogManager.getLogger(name));
+    }
+
+    static public Logger getLogger(String prefix, String name) {
+        return getLogger(prefix, LogManager.getLogger(name));
+    }
+
+    static public Logger getLogger(String prefix, Class<?> clazz) {
+        return getLogger(prefix, LogManager.getLogger(clazz.getName()));
+    }
+
+    static public Logger getLogger(String prefix, Logger logger) {
+        return (Logger)(prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger);
+    }
+}
src/main/java/org/wltea/analyzer/help/PrefixPluginLogger.java (new file)

@@ -0,0 +1,48 @@
+package org.wltea.analyzer.help;
+
+import org.apache.logging.log4j.Level;
+import org.apache.logging.log4j.Marker;
+import org.apache.logging.log4j.MarkerManager;
+import org.apache.logging.log4j.message.Message;
+import org.apache.logging.log4j.message.MessageFactory;
+import org.apache.logging.log4j.spi.ExtendedLogger;
+import org.apache.logging.log4j.spi.ExtendedLoggerWrapper;
+
+import java.util.WeakHashMap;
+
+public class PrefixPluginLogger extends ExtendedLoggerWrapper {
+    private static final WeakHashMap<String, Marker> markers = new WeakHashMap();
+    private final Marker marker;
+
+    static int markersSize() {
+        return markers.size();
+    }
+
+    public String prefix() {
+        return this.marker.getName();
+    }
+
+    PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) {
+        super(logger, name, (MessageFactory) null);
+        String actualPrefix = prefix == null ? "" : prefix;
+        MarkerManager.Log4jMarker actualMarker;
+        synchronized (markers) {
+            MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker) markers.get(actualPrefix);
+            if (maybeMarker == null) {
+                actualMarker = new MarkerManager.Log4jMarker(actualPrefix);
+                markers.put(new String(actualPrefix), actualMarker);
+            } else {
+                actualMarker = maybeMarker;
+            }
+        }
+        this.marker = (Marker) actualMarker;
+    }
+
+    public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) {
+        assert marker == null;
+        super.logMessage(fqcn, level, this.marker, message, t);
+    }
+}
src/main/java/org/wltea/analyzer/help/Sleep.java

@@ -1,36 +1,38 @@
 package org.wltea.analyzer.help;
 
 import org.apache.logging.log4j.Logger;
-import org.elasticsearch.common.logging.ESLoggerFactory;
 
 public class Sleep {
 
-    private static final Logger logger = ESLoggerFactory.getLogger(Sleep.class.getName());
+    private static final Logger logger = ESPluginLoggerFactory.getLogger(Sleep.class.getName());
 
-    public enum Type{MSEC,SEC,MIN,HOUR};
-    public static void sleep(Type type,int num){
-        try {
-            switch(type){
-                case MSEC:
-                    Thread.sleep(num);
-                    return;
-                case SEC:
-                    Thread.sleep(num*1000);
-                    return;
-                case MIN:
-                    Thread.sleep(num*60*1000);
-                    return;
-                case HOUR:
-                    Thread.sleep(num*60*60*1000);
-                    return;
-                default:
-                    System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
-                    return;
-            }
-        } catch (InterruptedException e) {
-            logger.error(e.getMessage(), e);
-        }
-    }
+    public enum Type {MSEC, SEC, MIN, HOUR}
+
+    public static void sleep(Type type, int num) {
+        try {
+            switch (type) {
+                case MSEC:
+                    Thread.sleep(num);
+                    return;
+                case SEC:
+                    Thread.sleep(num * 1000);
+                    return;
+                case MIN:
+                    Thread.sleep(num * 60 * 1000);
+                    return;
+                case HOUR:
+                    Thread.sleep(num * 60 * 60 * 1000);
+                    return;
+                default:
+                    System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
+                    return;
+            }
+        } catch (InterruptedException e) {
+            logger.error(e.getMessage(), e);
+        }
+    }
 
 }
src/main/resources/plugin-descriptor.properties

@@ -38,21 +38,6 @@ version=${project.version}
 #
 # 'name': the plugin name
 name=${elasticsearch.plugin.name}
-
-### mandatory elements for site plugins:
-#
-# 'site': set to true to indicate contents of the _site/
-#  directory in the root of the plugin should be served.
-site=${elasticsearch.plugin.site}
-#
-### mandatory elements for jvm plugins :
-#
-# 'jvm': true if the 'classname' class should be loaded
-# from jar files in the root directory of the plugin.
-# Note that only jar files in the root directory are
-# added to the classpath for the plugin! If you need
-# other resources, package them into a resources jar.
-jvm=${elasticsearch.plugin.jvm}
 #
 # 'classname': the name of the class to load, fully-qualified.
 classname=${elasticsearch.plugin.classname}

@@ -69,12 +54,3 @@ java.version=${maven.compiler.target}
 # is loaded so Elasticsearch will refuse to start in the presence of
 # plugins with the incorrect elasticsearch.version.
 elasticsearch.version=${elasticsearch.version}
-#
-### deprecated elements for jvm plugins :
-#
-# 'isolated': true if the plugin should have its own classloader.
-# passing false is deprecated, and only intended to support plugins
-# that have hard dependencies against each other. If this is
-# not specified, then the plugin is isolated by default.
-isolated=${elasticsearch.plugin.isolated}
-#
src/main/resources/plugin-security.policy — 4 changes (new file)

@@ -0,0 +1,4 @@
+grant {
+  // needed because of the hot reload functionality
+  permission java.net.SocketPermission "*", "connect,resolve";
+};
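That socket permission is what lets the hot-reload path in `Dictionary`/`Monitor` fetch remote dictionaries. A sketch of wiring one up through the plugin's `IKAnalyzer.cfg.xml` (the file is read with `Properties.loadFromXML`, so it uses the Java properties-XML format; `ext_dict` is the key shown in `Dictionary.java`, `remote_ext_dict` is assumed from the constant name there, and the URL is a placeholder):

```bash
cat > config/IKAnalyzer.cfg.xml <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <!-- local extra dictionary, relative to the plugin config dir -->
    <entry key="ext_dict">custom/mydict.dic</entry>
    <!-- remote dictionary polled by Monitor; placeholder URL -->
    <entry key="remote_ext_dict">http://example.com/ext_dict.txt</entry>
</properties>
EOF
```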