From 694b68c93a3d84233157d4ce29a34423d950bb0a Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 15:36:48 +0800 Subject: [PATCH 1/8] Update IKAnalyzer.cfg.xml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加远程词典,配置项是词典的链接地址 --- config/ik/IKAnalyzer.cfg.xml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/config/ik/IKAnalyzer.cfg.xml b/config/ik/IKAnalyzer.cfg.xml index f5c95b3..a6c0e0c 100644 --- a/config/ik/IKAnalyzer.cfg.xml +++ b/config/ik/IKAnalyzer.cfg.xml @@ -5,5 +5,9 @@ custom/mydict.dic;custom/single_word_low_freq.dic - custom/ext_stopword.dic - \ No newline at end of file + custom/ext_stopword.dic + + words_location + + words_location + From 4a00e82430c1c6d9e6a8bd734b14fd2a0e0f630d Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 15:40:45 +0800 Subject: [PATCH 2/8] Update pom.xml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 远程监控使用了httpclient,增加httpclient的依赖 --- pom.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pom.xml b/pom.xml index cf454ea..2534484 100644 --- a/pom.xml +++ b/pom.xml @@ -51,6 +51,13 @@ ${elasticsearch.version} compile + + + org.apache.httpcomponents + httpclient + 4.3.5 + compile + log4j From 092a9ad76938a039e7e59818fd55f8ebee87a8e2 Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 15:46:30 +0800 Subject: [PATCH 3/8] Update Configuration.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在配置中增加远程词典和远程停用词,以及它们的配置读取 --- .../org/wltea/analyzer/cfg/Configuration.java | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index d90dfc5..58274cf 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -17,7 +17,9 @@ public class Configuration { private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml"; private static final String EXT_DICT = "ext_dict"; + private static final String REMOTE_EXT_DICT = "remote_ext_dict"; private static final String EXT_STOP = "ext_stopwords"; + private static final String REMOTE_EXT_STOP = "remote_ext_stopwords"; private static ESLogger logger = null; private Properties props; private Environment environment; @@ -64,6 +66,24 @@ public class Configuration { } return extDictFiles; } + + public List getRemoteExtDictionarys(){ + List remoteExtDictFiles = new ArrayList(2); + String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT); + if(remoteExtDictCfg != null){ + + String[] filePaths = remoteExtDictCfg.split(";"); + if(filePaths != null){ + for(String filePath : filePaths){ + if(filePath != null && !"".equals(filePath.trim())){ + remoteExtDictFiles.add(filePath); + + } + } + } + } + return remoteExtDictFiles; + } public List getExtStopWordDictionarys(){ List extStopWordDictFiles = new ArrayList(2); @@ -83,6 +103,24 @@ public class Configuration { } return extStopWordDictFiles; } + + public List getRemoteExtStopWordDictionarys(){ + List remoteExtStopWordDictFiles = new ArrayList(2); + String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP); + if(remoteExtStopWordDictCfg != null){ + + String[] filePaths = remoteExtStopWordDictCfg.split(";"); + if(filePaths != null){ + for(String filePath : filePaths){ + if(filePath != null && !"".equals(filePath.trim())){ + remoteExtStopWordDictFiles.add(filePath); + + } + } + } + } + return remoteExtStopWordDictFiles; + } public File getDictRoot() { return environment.configFile(); From 11a8947603a9b736691f138317487ed90da1daea Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 16:14:08 +0800 Subject: [PATCH 4/8] Create Monitor.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 监控线程,监控远程词典的更新 --- .../java/org/wltea/analyzer/dic/Monitor.java | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/main/java/org/wltea/analyzer/dic/Monitor.java diff --git a/src/main/java/org/wltea/analyzer/dic/Monitor.java b/src/main/java/org/wltea/analyzer/dic/Monitor.java new file mode 100644 index 0000000..c051cc0 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/dic/Monitor.java @@ -0,0 +1,96 @@ +package org.wltea.analyzer.dic; + +import java.io.IOException; + +import org.apache.http.Header; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpHead; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.wltea.analyzer.help.Sleep; +import org.wltea.analyzer.help.Sleep.Type; + +public class Monitor implements Runnable { + + private static CloseableHttpClient httpclient = HttpClients.createDefault(); + /* + * 上次更改时间 + */ + private String last_modified; + /* + * 资源属性 + */ + private String eTags; + + /* + * 请求地址 + */ + private String location; + + public Monitor(String location) { + this.location = location; + this.last_modified = null; + this.eTags = null; + } + /** + * 监控流程: + * ①向词库服务器发送Head请求 + * ②从响应中获取Last-Modify、ETags字段值,判断是否变化 + * ③如果未变化,休眠1min,返回第①步 + * ④如果有变化,重新加载词典 + * ⑤休眠1min,返回第①步 + */ + + public void run() { + //超时设置 + RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000) + .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build(); + while (true) { + HttpHead head = new HttpHead(location); + head.setConfig(rc); + + //设置请求头 + if (last_modified != null) { + head.setHeader("If-Modified-Since", last_modified); + } + if (eTags != null) { + head.setHeader("If-None-Match", eTags); + } + + CloseableHttpResponse response = null; + try { + response = httpclient.execute(head); + + //返回304 Not Modified,词库未更新 + if(response.getStatusLine().getStatusCode()==304){ + continue; + }else if(response.getStatusLine().getStatusCode()==200){ + + if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified) + ||!response.getLastHeader("ETags").getValue().equalsIgnoreCase(eTags)) { + + // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags + Dictionary.getSingleton().reLoadMainDict(); + last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue(); + eTags = response.getLastHeader("ETags")==null?null:response.getLastHeader("ETags").getValue(); + } + } + + } catch (ClientProtocolException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + }finally{ + try { + response.close(); + } catch (IOException e) { + e.printStackTrace(); + } + Sleep.sleep(Type.SEC, 60); + } + } + } + +} From f0dc55aad665285627fb7f313c4d637b1bcc427a Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 16:21:31 +0800 Subject: [PATCH 5/8] Update Dictionary.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 给远程加载建立监控 --- .../org/wltea/analyzer/dic/Dictionary.java | 122 +++++++++++++++++- 1 file changed, 119 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 6a76f2f..efe32d2 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -25,11 +25,18 @@ */ package org.wltea.analyzer.dic; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.wltea.analyzer.cfg.Configuration; import java.io.*; +import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -92,6 +99,17 @@ public class Dictionary { singleton.loadSuffixDict(); singleton.loadPrepDict(); singleton.loadStopWordDict(); + + //建立监控线程 + for(String location:cfg.getRemoteExtDictionarys()){ + Thread monitor = new Thread(new Monitor(location)); + monitor.start(); + } + for(String location:cfg.getRemoteExtStopWordDictionarys()){ + Thread monitor = new Thread(new Monitor(location)); + monitor.start(); + } + return singleton; } } @@ -224,6 +242,8 @@ public class Dictionary { } //加载扩展词典 this.loadExtDict(); + //加载远程自定义词库 + this.loadRemoteExtDict(); } /** @@ -275,6 +295,76 @@ public class Dictionary { } } + + /** + * 加载远程扩展词典到主词库表 + */ + private void loadRemoteExtDict(){ + List remoteExtDictFiles = configuration.getRemoteExtDictionarys(); + for(String location:remoteExtDictFiles){ + logger.info("[Dict Loading]" + location); + List lists = getRemoteWords(location); + //如果找不到扩展的字典,则忽略 + if(lists == null){ + logger.error("[Dict Loading]"+location+"加载失败"); + continue; + } + for(String theWord:lists){ + if (theWord != null && !"".equals(theWord.trim())) { + //加载扩展词典数据到主内存词典中 + logger.info(theWord); + _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); + } + } + } + + } + + /** + * 从远程服务器上下载自定义词条 + */ + private static List getRemoteWords(String location){ + + List buffer = new ArrayList(); + RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000) + .setConnectTimeout(10*1000).setSocketTimeout(60*1000).build(); + CloseableHttpClient httpclient = HttpClients.createDefault(); + CloseableHttpResponse response; + BufferedReader in; + HttpGet get = new HttpGet(location); + get.setConfig(rc); + try { + response = httpclient.execute(get); + if(response.getStatusLine().getStatusCode()==200){ + + String charset = "UTF-8"; + //获取编码,默认为utf-8 + if(response.getEntity().getContentType().getValue().contains("charset=")){ + String contentType=response.getEntity().getContentType().getValue(); + charset=contentType.substring(contentType.lastIndexOf("=")+1); + } + in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(),charset)); + String line ; + while((line = in.readLine())!=null){ + buffer.add(line); + } + in.close(); + response.close(); + return buffer; + } + response.close(); + } catch (ClientProtocolException e) { + e.printStackTrace(); + } catch (IllegalStateException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return buffer; + } + + + /** * 加载用户扩展的停止词词典 */ @@ -360,7 +450,28 @@ public class Dictionary { } } } - } + } + + //加载远程停用词典 + List remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys(); + for(String location:remoteExtStopWordDictFiles){ + logger.info("[Dict Loading]" + location); + List lists = getRemoteWords(location); + //如果找不到扩展的字典,则忽略 + if(lists == null){ + logger.error("[Dict Loading]"+location+"加载失败"); + continue; + } + for(String theWord:lists){ + if (theWord != null && !"".equals(theWord.trim())) { + //加载远程词典数据到主内存中 + logger.info(theWord); + _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray()); + } + } + } + + } /** @@ -511,6 +622,11 @@ public class Dictionary { } } } - - + + public void reLoadMainDict(){ + logger.info("重新加载词典..."); + loadMainDict(); + loadStopWordDict(); + } + } From ddef2a1051fcad735da991bdfe7c0623533febac Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 16:26:42 +0800 Subject: [PATCH 6/8] Create Sleep.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 设置休眠 --- .../java/org/wltea/analyzer/help/Sleep.java | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/main/java/org/wltea/analyzer/help/Sleep.java diff --git a/src/main/java/org/wltea/analyzer/help/Sleep.java b/src/main/java/org/wltea/analyzer/help/Sleep.java new file mode 100644 index 0000000..703b816 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/help/Sleep.java @@ -0,0 +1,30 @@ +package org.wltea.analyzer.help; + +public class Sleep { + public enum Type{MSEC,SEC,MIN,HOUR}; + public static void sleep(Type type,int num){ + try { + switch(type){ + case MSEC: + Thread.sleep(num); + return; + case SEC: + Thread.sleep(num*1000); + return; + case MIN: + Thread.sleep(num*60*1000); + return; + case HOUR: + Thread.sleep(num*60*60*1000); + return; + default: + System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一"); + return; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + +} From 6048982a4d50bc974bca8536d487169e7a570806 Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 16:56:04 +0800 Subject: [PATCH 7/8] Update README.textile --- README.textile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.textile b/README.textile index c0cfa74..ad6ac34 100644 --- a/README.textile +++ b/README.textile @@ -1,6 +1,11 @@ IK Analysis for ElasticSearch ================================== +更新说明: + 对于使用es集群,用ik作为分词插件,经常会修改自定义词典,增加远程加载,每次更新都会重新加载词典,不必重启es服务。 + + + The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary. Tokenizer: `ik` @@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly custom/mydict.dic;custom/single_word_low_freq.dic - custom/ext_stopword.dic + custom/ext_stopword.dic + + location + + location From 8a91d7c96686aaa94b8faca86d697a9aba5dd406 Mon Sep 17 00:00:00 2001 From: elasticsearch-ik Date: Wed, 24 Sep 2014 17:14:54 +0800 Subject: [PATCH 8/8] Update README.textile --- README.textile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.textile b/README.textile index ad6ac34..5912f5f 100644 --- a/README.textile +++ b/README.textile @@ -59,9 +59,9 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly custom/ext_stopword.dic - location + location - location + location