diff --git a/README.textile b/README.textile
index c0cfa74..5912f5f 100644
--- a/README.textile
+++ b/README.textile
@@ -1,6 +1,11 @@
IK Analysis for ElasticSearch
==================================
+更新说明:
+ 针对在 ES 集群中使用 ik 分词插件、且需要经常修改自定义词典的场景,新增了远程词典加载功能:远程词典每次更新后都会自动重新加载,无需重启 ES 服务。
+
+
+
The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary.
Tokenizer: `ik`
@@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly
custom/mydict.dic;custom/single_word_low_freq.dic
- custom/ext_stopword.dic
+ custom/ext_stopword.dic
+
+ <entry key="remote_ext_dict">location</entry>
+
+ <entry key="remote_ext_stopwords">location</entry>
diff --git a/config/ik/IKAnalyzer.cfg.xml b/config/ik/IKAnalyzer.cfg.xml
index f5c95b3..a6c0e0c 100644
--- a/config/ik/IKAnalyzer.cfg.xml
+++ b/config/ik/IKAnalyzer.cfg.xml
@@ -5,5 +5,9 @@
custom/mydict.dic;custom/single_word_low_freq.dic
- custom/ext_stopword.dic
-
\ No newline at end of file
+ custom/ext_stopword.dic
+
+ <entry key="remote_ext_dict">words_location</entry>
+
+ <entry key="remote_ext_stopwords">words_location</entry>
+
diff --git a/pom.xml b/pom.xml
index cf454ea..2534484 100644
--- a/pom.xml
+++ b/pom.xml
@@ -51,6 +51,13 @@
${elasticsearch.version}
compile
+
+
+ org.apache.httpcomponents
+ httpclient
+ 4.3.5
+ compile
+
log4j
diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java
index d90dfc5..58274cf 100644
--- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java
+++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java
@@ -17,7 +17,9 @@ public class Configuration {
private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml";
private static final String EXT_DICT = "ext_dict";
+ private static final String REMOTE_EXT_DICT = "remote_ext_dict";
private static final String EXT_STOP = "ext_stopwords";
+ private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
private static ESLogger logger = null;
private Properties props;
private Environment environment;
@@ -64,6 +66,24 @@ public class Configuration {
}
return extDictFiles;
}
+
+    /**
+     * Returns the remote extension dictionary locations configured under the
+     * {@code remote_ext_dict} property.
+     * <p>
+     * The property value is a semicolon-separated list. Blank entries are skipped and
+     * every remaining entry is trimmed, so whitespace surrounding a location in the
+     * configuration file does not leak into the returned value.
+     *
+     * @return the configured locations; an empty list when the property is not set
+     */
+    public List<String> getRemoteExtDictionarys(){
+        List<String> remoteExtDictFiles = new ArrayList<String>(2);
+        String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
+        if(remoteExtDictCfg == null){
+            return remoteExtDictFiles;
+        }
+        // String.split never returns null, so the array needs no null check.
+        for(String filePath : remoteExtDictCfg.split(";")){
+            String location = filePath.trim();
+            if(!location.isEmpty()){
+                // Store the trimmed value: the blank check was already made on it.
+                remoteExtDictFiles.add(location);
+            }
+        }
+        return remoteExtDictFiles;
+    }
public List getExtStopWordDictionarys(){
List extStopWordDictFiles = new ArrayList(2);
@@ -83,6 +103,24 @@ public class Configuration {
}
return extStopWordDictFiles;
}
+
+    /**
+     * Returns the remote stop-word dictionary locations configured under the
+     * {@code remote_ext_stopwords} property.
+     * <p>
+     * The property value is a semicolon-separated list. Blank entries are skipped and
+     * every remaining entry is trimmed, so whitespace surrounding a location in the
+     * configuration file does not leak into the returned value.
+     *
+     * @return the configured locations; an empty list when the property is not set
+     */
+    public List<String> getRemoteExtStopWordDictionarys(){
+        List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
+        String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
+        if(remoteExtStopWordDictCfg == null){
+            return remoteExtStopWordDictFiles;
+        }
+        // String.split never returns null, so the array needs no null check.
+        for(String filePath : remoteExtStopWordDictCfg.split(";")){
+            String location = filePath.trim();
+            if(!location.isEmpty()){
+                // Store the trimmed value: the blank check was already made on it.
+                remoteExtStopWordDictFiles.add(location);
+            }
+        }
+        return remoteExtStopWordDictFiles;
+    }
public File getDictRoot() {
return environment.configFile();
diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
index 6a76f2f..efe32d2 100644
--- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java
+++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -25,11 +25,18 @@
*/
package org.wltea.analyzer.dic;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.cfg.Configuration;
import java.io.*;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
@@ -92,6 +99,17 @@ public class Dictionary {
singleton.loadSuffixDict();
singleton.loadPrepDict();
singleton.loadStopWordDict();
+
+ //建立监控线程
+ for(String location:cfg.getRemoteExtDictionarys()){
+ Thread monitor = new Thread(new Monitor(location));
+ monitor.start();
+ }
+ for(String location:cfg.getRemoteExtStopWordDictionarys()){
+ Thread monitor = new Thread(new Monitor(location));
+ monitor.start();
+ }
+
return singleton;
}
}
@@ -224,6 +242,8 @@ public class Dictionary {
}
//加载扩展词典
this.loadExtDict();
+ //加载远程自定义词库
+ this.loadRemoteExtDict();
}
/**
@@ -275,6 +295,76 @@ public class Dictionary {
}
}
+
+    /**
+     * Loads the remote extension dictionaries into the main dictionary table.
+     * <p>
+     * For every location reported by the configuration, the word list is fetched with
+     * {@code getRemoteWords}; each non-blank entry is trimmed, lower-cased and inserted
+     * into {@code _MainDict}. A location whose fetch returns {@code null} is logged as
+     * failed and skipped.
+     */
+    private void loadRemoteExtDict(){
+        List<String> remoteLocations = configuration.getRemoteExtDictionarys();
+        for(String location : remoteLocations){
+            logger.info("[Dict Loading]" + location);
+            List<String> remoteWords = getRemoteWords(location);
+            if(remoteWords == null){
+                // Nothing could be fetched for this location: report it and move on.
+                logger.error("[Dict Loading]"+location+"加载失败");
+                continue;
+            }
+            for(String word : remoteWords){
+                if(word == null || "".equals(word.trim())){
+                    continue;
+                }
+                // Each accepted word is logged, then stored trimmed and lower-cased.
+                logger.info(word);
+                _MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+            }
+        }
+    }
+
+    /**
+     * Downloads a custom word list from a remote server.
+     * <p>
+     * Issues an HTTP GET for {@code location}. On a 200 response the body is read line by
+     * line using the charset declared in the Content-Type header, falling back to UTF-8
+     * when the header is absent or declares no charset. The HTTP client, the response and
+     * the reader are always released, whether or not the download succeeds.
+     *
+     * @param location URL of the remote word list
+     * @return the lines of the response body, or {@code null} when the URL is invalid, the
+     *         request fails, the server does not answer 200, or the body cannot be read
+     *         completely (a partially read list is never returned)
+     */
+    private static List<String> getRemoteWords(String location){
+        RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
+                .setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
+        CloseableHttpClient httpclient = HttpClients.createDefault();
+        CloseableHttpResponse response = null;
+        BufferedReader in = null;
+        try {
+            // Built inside the try block so a malformed URL is reported as a failed load
+            // instead of propagating an IllegalArgumentException to the caller.
+            HttpGet get = new HttpGet(location);
+            get.setConfig(rc);
+            response = httpclient.execute(get);
+            if(response.getStatusLine().getStatusCode() != 200 || response.getEntity() == null){
+                return null;
+            }
+            String charset = remoteCharset(response.getEntity().getContentType());
+            in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset));
+            List<String> buffer = new ArrayList<String>();
+            String line;
+            while((line = in.readLine()) != null){
+                buffer.add(line);
+            }
+            return buffer;
+        } catch (ClientProtocolException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            // Also covers an unsupported charset name (UnsupportedEncodingException).
+            e.printStackTrace();
+        } catch (IllegalStateException e) {
+            e.printStackTrace();
+        } catch (IllegalArgumentException e) {
+            e.printStackTrace();
+        } finally {
+            closeRemoteResource(in);
+            closeRemoteResource(response);
+            closeRemoteResource(httpclient);
+        }
+        return null;
+    }
+
+    /**
+     * Extracts the charset name from a Content-Type header, defaulting to UTF-8.
+     */
+    private static String remoteCharset(org.apache.http.Header contentType){
+        String fallback = "UTF-8";
+        if(contentType == null || contentType.getValue() == null){
+            return fallback;
+        }
+        String value = contentType.getValue();
+        String marker = "charset=";
+        int start = value.toLowerCase(java.util.Locale.ROOT).indexOf(marker);
+        if(start < 0){
+            return fallback;
+        }
+        String declared = value.substring(start + marker.length());
+        // Drop any parameters that follow the charset, and optional surrounding quotes.
+        int end = declared.indexOf(';');
+        if(end >= 0){
+            declared = declared.substring(0, end);
+        }
+        declared = declared.replace("\"", "").trim();
+        return declared.isEmpty() ? fallback : declared;
+    }
+
+    /**
+     * Closes a resource used by the remote download, ignoring close failures.
+     */
+    private static void closeRemoteResource(Closeable resource){
+        if(resource == null){
+            return;
+        }
+        try {
+            resource.close();
+        } catch (IOException ignored) {
+            // Best effort: a failure while releasing the resource does not change the result.
+        }
+    }
+
+
+
/**
* 加载用户扩展的停止词词典
*/
@@ -360,7 +450,28 @@ public class Dictionary {
}
}
}
- }
+ }
+
+ //加载远程停用词典
+ List remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
+ for(String location:remoteExtStopWordDictFiles){
+ logger.info("[Dict Loading]" + location);
+ List lists = getRemoteWords(location);
+ //如果找不到扩展的字典,则忽略
+ if(lists == null){
+ logger.error("[Dict Loading]"+location+"加载失败");
+ continue;
+ }
+ for(String theWord:lists){
+ if (theWord != null && !"".equals(theWord.trim())) {
+ //加载远程词典数据到主内存中
+ logger.info(theWord);
+ _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ }
+ }
+
+
}
/**
@@ -511,6 +622,11 @@ public class Dictionary {
}
}
}
-
-
+
+    /**
+     * Reloads the dictionaries by running {@code loadMainDict()} followed by
+     * {@code loadStopWordDict()}.
+     * <p>
+     * The method is synchronized because one monitor thread is started per remote
+     * location and each of them calls this method when it detects a change; without
+     * the lock two reloads could run at the same time.
+     * NOTE(review): this only serializes reloads against each other; whether readers
+     * of the dictionaries need additional protection depends on code not shown here.
+     */
+    public synchronized void reLoadMainDict(){
+        logger.info("重新加载词典...");
+        loadMainDict();
+        loadStopWordDict();
+    }
+
}
diff --git a/src/main/java/org/wltea/analyzer/dic/Monitor.java b/src/main/java/org/wltea/analyzer/dic/Monitor.java
new file mode 100644
index 0000000..c051cc0
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/dic/Monitor.java
@@ -0,0 +1,96 @@
+package org.wltea.analyzer.dic;
+
+import java.io.IOException;
+
+import org.apache.http.Header;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpHead;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.wltea.analyzer.help.Sleep;
+import org.wltea.analyzer.help.Sleep.Type;
+
+/**
+ * Polls one remote dictionary location and triggers a dictionary reload when the
+ * resource's validators (Last-Modified / ETag) change.
+ */
+public class Monitor implements Runnable {
+
+    private static final CloseableHttpClient httpclient = HttpClients.createDefault();
+    /*
+     * Last-Modified value recorded at the most recent reload.
+     */
+    private String last_modified;
+    /*
+     * ETag value recorded at the most recent reload.
+     */
+    private String eTags;
+
+    /*
+     * URL that is polled.
+     */
+    private String location;
+
+    public Monitor(String location) {
+        this.location = location;
+        this.last_modified = null;
+        this.eTags = null;
+    }
+
+    /**
+     * Monitoring loop:
+     * 1. send a HEAD request to the dictionary server, passing the recorded validators
+     *    as If-Modified-Since / If-None-Match;
+     * 2. on 200, compare the Last-Modified and ETag response headers with the recorded values;
+     * 3. if either differs, reload the dictionaries and record the new values;
+     * 4. on 304 or any other status do nothing;
+     * 5. sleep 60 seconds and start over.
+     * <p>
+     * A response that carries neither header never triggers a reload, because no change
+     * can be detected from it. The loop ends once the thread's interrupt flag is set.
+     */
+    public void run() {
+        // Timeouts for the polling request.
+        RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
+                .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
+        while (!Thread.currentThread().isInterrupted()) {
+            HttpHead head = new HttpHead(location);
+            head.setConfig(rc);
+
+            // Conditional request headers, so an unchanged resource can answer 304.
+            if (last_modified != null) {
+                head.setHeader("If-Modified-Since", last_modified);
+            }
+            if (eTags != null) {
+                head.setHeader("If-None-Match", eTags);
+            }
+
+            CloseableHttpResponse response = null;
+            try {
+                response = httpclient.execute(head);
+
+                // 304 Not Modified (and every status other than 200) means nothing to do.
+                if (response.getStatusLine().getStatusCode() == 200) {
+                    String currentLastModified = headerValue(response, "Last-Modified");
+                    // The standard header name is "ETag"; "ETags" is never sent by servers.
+                    String currentETag = headerValue(response, "ETag");
+
+                    if (!sameValue(currentLastModified, last_modified)
+                            || !sameValue(currentETag, eTags)) {
+                        // The remote dictionary changed: reload, then remember the validators.
+                        Dictionary.getSingleton().reLoadMainDict();
+                        last_modified = currentLastModified;
+                        eTags = currentETag;
+                    }
+                }
+
+            } catch (ClientProtocolException e) {
+                e.printStackTrace();
+            } catch (IOException e) {
+                e.printStackTrace();
+            } catch (RuntimeException e) {
+                // A failed reload must not terminate the monitor; try again on the next poll.
+                e.printStackTrace();
+            } finally {
+                // response stays null when execute() itself failed.
+                if (response != null) {
+                    try {
+                        response.close();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                }
+                Sleep.sleep(Type.SEC, 60);
+            }
+        }
+    }
+
+    /**
+     * Returns the value of the last header with the given name, or null when it is absent.
+     */
+    private static String headerValue(CloseableHttpResponse response, String name) {
+        Header header = response.getLastHeader(name);
+        return header == null ? null : header.getValue();
+    }
+
+    /**
+     * Null-safe, case-insensitive comparison of two header values.
+     */
+    private static boolean sameValue(String current, String recorded) {
+        return current == null ? recorded == null : current.equalsIgnoreCase(recorded);
+    }
+
+}
diff --git a/src/main/java/org/wltea/analyzer/help/Sleep.java b/src/main/java/org/wltea/analyzer/help/Sleep.java
new file mode 100644
index 0000000..703b816
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/help/Sleep.java
@@ -0,0 +1,30 @@
+package org.wltea.analyzer.help;
+
+/**
+ * Helper that pauses the current thread for a duration given as a count and a unit.
+ */
+public class Sleep {
+    /** Units accepted by {@link #sleep(Type, int)}. */
+    public enum Type{MSEC,SEC,MIN,HOUR}
+
+    /**
+     * Sleeps for {@code num} units of {@code type}.
+     * <p>
+     * The duration is computed with {@code long} arithmetic so that large values of
+     * {@code num} do not overflow. If the thread is interrupted while sleeping, the
+     * method returns early and restores the thread's interrupt flag so callers can
+     * observe the interruption.
+     *
+     * @param type unit of {@code num}
+     * @param num  number of units to sleep
+     */
+    public static void sleep(Type type,int num){
+        try {
+            switch(type){
+                case MSEC:
+                    Thread.sleep(num);
+                    return;
+                case SEC:
+                    Thread.sleep(num*1000L);
+                    return;
+                case MIN:
+                    Thread.sleep(num*60L*1000L);
+                    return;
+                case HOUR:
+                    Thread.sleep(num*60L*60L*1000L);
+                    return;
+                default:
+                    System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
+                    return;
+            }
+        } catch (InterruptedException e) {
+            // Restore the flag instead of swallowing the interruption.
+            Thread.currentThread().interrupt();
+        }
+    }
+
+
+}