commit a207059860

README.md
@@ -1,6 +1,11 @@
 IK Analysis for ElasticSearch
 ==================================

+Update note:
+For Elasticsearch clusters that use IK as the analysis plugin, the custom dictionaries are modified frequently. This change adds remote dictionary loading: each update reloads the dictionaries, so the Elasticsearch service no longer needs to be restarted.
+
+
+
 The IK Analysis plugin integrates the Lucene IK analyzer into Elasticsearch and supports customized dictionaries.

 Tokenizer: `ik`
@@ -53,6 +58,10 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnalyzer.cfg.xml
 	<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
 	<!-- Users can configure their own extension stop-word dictionaries here -->
 	<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
+	<!-- Users can configure a remote extension dictionary here -->
+	<entry key="remote_ext_dict">location</entry>
+	<!-- Users can configure a remote extension stop-word dictionary here -->
+	<entry key="remote_ext_stopwords">location</entry>
 </properties>

 </pre>
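Editorial note (not part of the commit): the location value is expected to be a URL. Judging from the code added below (Dictionary.getRemoteWords() and the Monitor thread), the endpoint should return one word per line, encoded in UTF-8 unless the Content-Type header declares another charset, and should send Last-Modified and/or entity-tag headers so that changes can be detected. A minimal, JDK-only sketch that a deployer could use to check such an endpoint; the URL is hypothetical:

// RemoteDictCheck.java — illustrative only, not part of this commit.
// Prints the headers and words of a hypothetical dictionary endpoint so a
// deployer can verify it behaves the way getRemoteWords() and the Monitor
// thread expect: one word per line, UTF-8 by default, plus
// Last-Modified / entity-tag headers for change detection.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class RemoteDictCheck {
    public static void main(String[] args) throws Exception {
        // Hypothetical URL; replace it with the value you put in remote_ext_dict.
        URL url = new URL("http://yoursite.example/getCustomDict");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(10 * 1000);
        conn.setReadTimeout(60 * 1000);

        // The monitor thread compares these headers to decide when to reload.
        System.out.println("Last-Modified: " + conn.getHeaderField("Last-Modified"));
        System.out.println("ETag: " + conn.getHeaderField("ETag"));

        // The plugin reads the body line by line, one word per line.
        BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));
        String word;
        while ((word = in.readLine()) != null) {
            System.out.println("word: " + word);
        }
        in.close();
        conn.disconnect();
    }
}

If the printed headers are missing, the monitor added in this commit has nothing to compare against and remote updates may not be detected.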
config/ik/IKAnalyzer.cfg.xml
@@ -6,4 +6,8 @@
 	<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
 	<!-- Users can configure their own extension stop-word dictionaries here -->
 	<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
+	<!-- Users can configure a remote extension dictionary here -->
+	<entry key="remote_ext_dict">words_location</entry>
+	<!-- Users can configure a remote extension stop-word dictionary here -->
+	<entry key="remote_ext_stopwords">words_location</entry>
 </properties>
pom.xml
@@ -52,6 +52,13 @@
 			<scope>compile</scope>
 		</dependency>

+		<dependency>
+			<groupId>org.apache.httpcomponents</groupId>
+			<artifactId>httpclient</artifactId>
+			<version>4.3.5</version>
+			<scope>compile</scope>
+		</dependency>
+
 		<dependency>
 			<groupId>log4j</groupId>
 			<artifactId>log4j</artifactId>
src/main/java/org/wltea/analyzer/cfg/Configuration.java
@@ -17,7 +17,9 @@ public class Configuration {

 	private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml";
 	private static final String EXT_DICT = "ext_dict";
+	private static final String REMOTE_EXT_DICT = "remote_ext_dict";
 	private static final String EXT_STOP = "ext_stopwords";
+	private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
 	private static ESLogger logger = null;
 	private Properties props;
 	private Environment environment;
@@ -65,6 +67,24 @@ public class Configuration {
 		return extDictFiles;
 	}
+
+	public List<String> getRemoteExtDictionarys(){
+		List<String> remoteExtDictFiles = new ArrayList<String>(2);
+		String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
+		if(remoteExtDictCfg != null){
+
+			String[] filePaths = remoteExtDictCfg.split(";");
+			if(filePaths != null){
+				for(String filePath : filePaths){
+					if(filePath != null && !"".equals(filePath.trim())){
+						remoteExtDictFiles.add(filePath);
+
+					}
+				}
+			}
+		}
+		return remoteExtDictFiles;
+	}
+
 	public List<String> getExtStopWordDictionarys(){
 		List<String> extStopWordDictFiles = new ArrayList<String>(2);
 		String extStopWordDictCfg = props.getProperty(EXT_STOP);
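Editorial note (illustrative, not part of the commit): like ext_dict, the remote_ext_dict and remote_ext_stopwords values may list several locations separated by ';'. The new getters split on ';' and skip blank entries, and each resulting location is fetched and monitored on its own. A small self-contained sketch of that property format, with hypothetical URLs:

// RemoteDictConfigDemo.java — illustrative only, not part of this commit.
import java.util.Properties;

public class RemoteDictConfigDemo {
    public static void main(String[] args) {
        // Hypothetical value; in the plugin it comes from IKAnalyzer.cfg.xml.
        Properties props = new Properties();
        props.setProperty("remote_ext_dict",
                "http://dict-a.example/hotwords.dic;http://dict-b.example/brands.dic");

        // Mirrors getRemoteExtDictionarys(): split on ';' and drop blank entries.
        for (String location : props.getProperty("remote_ext_dict").split(";")) {
            if (location != null && !"".equals(location.trim())) {
                System.out.println(location); // each URL is loaded and monitored separately
            }
        }
    }
}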
@@ -84,6 +104,24 @@ public class Configuration {
 		return extStopWordDictFiles;
 	}
+
+	public List<String> getRemoteExtStopWordDictionarys(){
+		List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
+		String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
+		if(remoteExtStopWordDictCfg != null){
+
+			String[] filePaths = remoteExtStopWordDictCfg.split(";");
+			if(filePaths != null){
+				for(String filePath : filePaths){
+					if(filePath != null && !"".equals(filePath.trim())){
+						remoteExtStopWordDictFiles.add(filePath);
+
+					}
+				}
+			}
+		}
+		return remoteExtStopWordDictFiles;
+	}
+
 	public File getDictRoot() {
 		return environment.configFile();
 	}
src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -25,11 +25,18 @@
  */
 package org.wltea.analyzer.dic;

+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.wltea.analyzer.cfg.Configuration;

 import java.io.*;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;

@@ -92,6 +99,17 @@ public class Dictionary {
 					singleton.loadSuffixDict();
 					singleton.loadPrepDict();
 					singleton.loadStopWordDict();
+
+					// Start the monitor threads, one per remote dictionary location
+					for(String location : cfg.getRemoteExtDictionarys()){
+						Thread monitor = new Thread(new Monitor(location));
+						monitor.start();
+					}
+					for(String location : cfg.getRemoteExtStopWordDictionarys()){
+						Thread monitor = new Thread(new Monitor(location));
+						monitor.start();
+					}
+
 					return singleton;
 				}
 			}
@@ -224,6 +242,8 @@ public class Dictionary {
 		}
 		// Load the extension dictionaries
 		this.loadExtDict();
+		// Load the remote custom dictionaries
+		this.loadRemoteExtDict();
 	}

 	/**
@@ -275,6 +295,76 @@ public class Dictionary {
 			}
 		}

+
+	/**
+	 * Load the remote extension dictionaries into the main dictionary
+	 */
+	private void loadRemoteExtDict(){
+		List<String> remoteExtDictFiles = configuration.getRemoteExtDictionarys();
+		for(String location : remoteExtDictFiles){
+			logger.info("[Dict Loading]" + location);
+			List<String> lists = getRemoteWords(location);
+			// If the remote dictionary cannot be fetched, skip it
+			if(lists == null){
+				logger.error("[Dict Loading] " + location + " load failed");
+				continue;
+			}
+			for(String theWord : lists){
+				if (theWord != null && !"".equals(theWord.trim())) {
+					// Load the extension words into the main in-memory dictionary
+					logger.info(theWord);
+					_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+				}
+			}
+		}
+	}
+
+	/**
+	 * Download custom words from the remote server
+	 */
+	private static List<String> getRemoteWords(String location){
+
+		List<String> buffer = new ArrayList<String>();
+		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
+				.setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
+		CloseableHttpClient httpclient = HttpClients.createDefault();
+		CloseableHttpResponse response;
+		BufferedReader in;
+		HttpGet get = new HttpGet(location);
+		get.setConfig(rc);
+		try {
+			response = httpclient.execute(get);
+			if(response.getStatusLine().getStatusCode() == 200){
+
+				String charset = "UTF-8";
+				// Use the charset from the Content-Type header, defaulting to UTF-8
+				if(response.getEntity().getContentType().getValue().contains("charset=")){
+					String contentType = response.getEntity().getContentType().getValue();
+					charset = contentType.substring(contentType.lastIndexOf("=") + 1);
+				}
+				in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset));
+				String line;
+				while((line = in.readLine()) != null){
+					buffer.add(line);
+				}
+				in.close();
+				response.close();
+				return buffer;
+			}
+			response.close();
+		} catch (ClientProtocolException e) {
+			e.printStackTrace();
+		} catch (IllegalStateException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		return buffer;
+	}
+
+
 	/**
 	 * Load the user-defined extension stop-word dictionary
 	 */
@@ -361,6 +451,27 @@ public class Dictionary {
 				}
 			}
 		}
+
+		// Load the remote stop-word dictionaries
+		List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
+		for(String location : remoteExtStopWordDictFiles){
+			logger.info("[Dict Loading]" + location);
+			List<String> lists = getRemoteWords(location);
+			// If the remote dictionary cannot be fetched, skip it
+			if(lists == null){
+				logger.error("[Dict Loading] " + location + " load failed");
+				continue;
+			}
+			for(String theWord : lists){
+				if (theWord != null && !"".equals(theWord.trim())) {
+					// Load the remote words into the in-memory stop-word dictionary
+					logger.info(theWord);
+					_StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
+				}
+			}
+		}
+
 	}

 	/**
@@ -512,5 +623,10 @@ public class Dictionary {
 		}
 	}

+	public void reLoadMainDict(){
+		logger.info("Reloading the IK dictionaries...");
+		loadMainDict();
+		loadStopWordDict();
+	}

 }
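Editorial note (not part of the diff): reLoadMainDict() is the hook the Monitor thread calls when a remote dictionary changes. Because loadMainDict() now ends with loadRemoteExtDict() and loadStopWordDict() also pulls the remote stop words (see the hunks above), one call refreshes both the local and the remote dictionaries without restarting Elasticsearch. A trivial sketch of triggering it by hand, for example from a test; it assumes the plugin classes are on the classpath and Dictionary.initial(...) has already been called:

// ManualReload.java — illustrative only, not part of this commit.
// Placed in the same package as Monitor so it can use the same entry point.
package org.wltea.analyzer.dic;

public class ManualReload {
    public static void main(String[] args) {
        // The same call the Monitor thread makes when it detects a change:
        Dictionary.getSingleton().reLoadMainDict();
    }
}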
src/main/java/org/wltea/analyzer/dic/Monitor.java (new file)
@@ -0,0 +1,96 @@
+package org.wltea.analyzer.dic;
+
+import java.io.IOException;
+
+import org.apache.http.Header;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpHead;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.wltea.analyzer.help.Sleep;
+import org.wltea.analyzer.help.Sleep.Type;
+
+public class Monitor implements Runnable {
+
+	private static CloseableHttpClient httpclient = HttpClients.createDefault();
+	/*
+	 * Last modification time reported by the remote resource
+	 */
+	private String last_modified;
+	/*
+	 * Entity tag of the remote resource
+	 */
+	private String eTags;
+
+	/*
+	 * URL of the remote dictionary
+	 */
+	private String location;
+
+	public Monitor(String location) {
+		this.location = location;
+		this.last_modified = null;
+		this.eTags = null;
+	}
+	/**
+	 * Monitoring loop:
+	 * 1. Send a HEAD request to the dictionary server
+	 * 2. Read Last-Modified and ETags from the response and check whether they changed
+	 * 3. If nothing changed, sleep for one minute and go back to step 1
+	 * 4. If something changed, reload the dictionaries
+	 * 5. Sleep for one minute and go back to step 1
+	 */
+
+	public void run() {
+		// Timeout settings
+		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
+				.setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
+		while (true) {
+			HttpHead head = new HttpHead(location);
+			head.setConfig(rc);
+
+			// Set the conditional request headers
+			if (last_modified != null) {
+				head.setHeader("If-Modified-Since", last_modified);
+			}
+			if (eTags != null) {
+				head.setHeader("If-None-Match", eTags);
+			}
+
+			CloseableHttpResponse response = null;
+			try {
+				response = httpclient.execute(head);
+
+				// 304 Not Modified: the remote dictionary has not changed
+				if(response.getStatusLine().getStatusCode() == 304){
+					continue;
+				}else if(response.getStatusLine().getStatusCode() == 200){
+
+					if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)
+							|| !response.getLastHeader("ETags").getValue().equalsIgnoreCase(eTags)) {
+
+						// The remote dictionary changed: reload it and remember last_modified and eTags
+						Dictionary.getSingleton().reLoadMainDict();
+						last_modified = response.getLastHeader("Last-Modified") == null ? null : response.getLastHeader("Last-Modified").getValue();
+						eTags = response.getLastHeader("ETags") == null ? null : response.getLastHeader("ETags").getValue();
+					}
+				}
+
+			} catch (ClientProtocolException e) {
+				e.printStackTrace();
+			} catch (IOException e) {
+				e.printStackTrace();
+			} finally {
+				try {
+					response.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+				Sleep.sleep(Type.SEC, 60);
+			}
+		}
+	}
+
+}
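Editorial note (illustrative, not part of the commit): the polling loop above expects an endpoint that answers HEAD requests, reports Last-Modified and an entity tag, and returns 304 Not Modified when the conditional headers still match. A self-contained, JDK-only sketch of such an endpoint; the port, path, timestamp and words are hypothetical:

// DictEndpointSketch.java — illustrative only, not part of this commit.
// A tiny JDK HttpServer that serves a word list the way the plugin expects:
// UTF-8, one word per line, Last-Modified and ETag headers, and 304 replies
// to conditional requests so the monitor can skip unnecessary reloads.
import com.sun.net.httpserver.HttpServer;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;

public class DictEndpointSketch {
    public static void main(String[] args) throws Exception {
        final String etag = "\"v1\"";                                // bump when the list changes
        final String lastModified = "Mon, 01 Sep 2014 00:00:00 GMT"; // hypothetical timestamp
        final byte[] body = "云计算\n区块链\n".getBytes(StandardCharsets.UTF_8);

        HttpServer server = HttpServer.create(new InetSocketAddress(8080), 0);
        server.createContext("/getCustomDict", exchange -> {
            String ifNoneMatch = exchange.getRequestHeaders().getFirst("If-None-Match");
            String ifModifiedSince = exchange.getRequestHeaders().getFirst("If-Modified-Since");
            exchange.getResponseHeaders().set("ETag", etag);
            exchange.getResponseHeaders().set("Last-Modified", lastModified);
            exchange.getResponseHeaders().set("Content-Type", "text/plain; charset=UTF-8");

            if (etag.equals(ifNoneMatch) || lastModified.equals(ifModifiedSince)) {
                exchange.sendResponseHeaders(304, -1);          // unchanged: the monitor skips the reload
            } else if ("HEAD".equals(exchange.getRequestMethod())) {
                exchange.sendResponseHeaders(200, -1);          // headers only, as the monitor polls with HEAD
            } else {
                exchange.sendResponseHeaders(200, body.length); // full word list for getRemoteWords()
                exchange.getResponseBody().write(body);
            }
            exchange.close();
        });
        server.start();
    }
}

Note that the Monitor above reads a response header literally named "ETags"; an endpoint that relies on entity tags rather than Last-Modified for change detection would need to expose that exact header name as well.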
src/main/java/org/wltea/analyzer/help/Sleep.java (new file)
@@ -0,0 +1,30 @@
+package org.wltea.analyzer.help;
+
+public class Sleep {
+	public enum Type{MSEC, SEC, MIN, HOUR};
+	public static void sleep(Type type, int num){
+		try {
+			switch(type){
+			case MSEC:
+				Thread.sleep(num);
+				return;
+			case SEC:
+				Thread.sleep(num * 1000);
+				return;
+			case MIN:
+				Thread.sleep(num * 60 * 1000);
+				return;
+			case HOUR:
+				Thread.sleep(num * 60 * 60 * 1000);
+				return;
+			default:
+				System.err.println("Invalid sleep type; expected one of MSEC, SEC, MIN, HOUR");
+				return;
+			}
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+		}
+	}
+
+}