merge code

medcl 2016-04-10 22:17:59 +08:00
commit ca2bfe5732
6 changed files with 372 additions and 363 deletions

Changed file: org/wltea/analyzer/dic/Dictionary.java

@@ -1,7 +1,7 @@
/**
 * IK Chinese word segmentation, version 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -20,8 +20,8 @@
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 *
 */
package org.wltea.analyzer.dic;
@@ -62,37 +62,37 @@ public class Dictionary {
     */
    private static Dictionary singleton;

    private DictSegment _MainDict;
    private DictSegment _SurnameDict;
    private DictSegment _QuantifierDict;
    private DictSegment _SuffixDict;
    private DictSegment _PrepDict;
    private DictSegment _StopWords;

    /**
     * Configuration object
     */
    private Configuration configuration;
-   public static final ESLogger logger=Loggers.getLogger("ik-analyzer");
+   public static ESLogger logger=Loggers.getLogger("ik-analyzer");

    private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);

    public static final String PATH_DIC_MAIN = "ik/main.dic";
    public static final String PATH_DIC_SURNAME = "ik/surname.dic";
    public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
    public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
    public static final String PATH_DIC_PREP = "ik/preposition.dic";
    public static final String PATH_DIC_STOP = "ik/stopword.dic";

    private Dictionary(){
    }

    /**
     * Dictionary initialization
@@ -103,33 +103,34 @@ public class Dictionary {
     * @return Dictionary
     */
    public static synchronized Dictionary initial(Configuration cfg){
        if(singleton == null){
            synchronized(Dictionary.class){
                if(singleton == null){
                    singleton = new Dictionary();
                    singleton.configuration=cfg;
                    singleton.loadMainDict();
                    singleton.loadSurnameDict();
                    singleton.loadQuantifierDict();
                    singleton.loadSuffixDict();
                    singleton.loadPrepDict();
                    singleton.loadStopWordDict();

                    // Start the remote-dictionary monitoring threads
                    for(String location:cfg.getRemoteExtDictionarys()){
                        // 10 is the initial delay (adjustable); 60 is the polling interval, in seconds
                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                    }
                    for(String location:cfg.getRemoteExtStopWordDictionarys()){
                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                    }

                    return singleton;
                }
            }
        }
        return singleton;
    }
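
Note: initial() above is the standard double-checked locking singleton (on top of an already synchronized method): the outer null check skips locking once the instance exists, and the inner check guards against two threads racing past the first one. A minimal standalone sketch of the same pattern, with an illustrative class name that is not part of the plugin:

    public class LazySingleton {
        // volatile publishes the fully constructed instance to other threads
        private static volatile LazySingleton instance;

        private LazySingleton() {
        }

        public static LazySingleton getInstance() {
            if (instance == null) {                      // first check, lock-free
                synchronized (LazySingleton.class) {
                    if (instance == null) {              // second check, under the lock
                        instance = new LazySingleton();
                    }
                }
            }
            return instance;
        }
    }

The sketch declares the field volatile; the plugin's singleton field is not volatile, so it relies on the synchronized initial() method rather than on the bare double-checked idiom.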
    /**
     * Get the dictionary singleton instance
     * @return Dictionary the singleton instance
@@ -140,7 +141,7 @@ public class Dictionary {
        }
        return singleton;
    }

    /**
     * Batch-load new dictionary entries
     * @param words Collection<String> list of entries
@@ -155,7 +156,7 @@ public class Dictionary {
            }
        }
    }

    /**
     * Batch-remove (block) dictionary entries
     */
@@ -169,7 +170,7 @@ public class Dictionary {
            }
        }
    }

    /**
     * Look up a match in the main dictionary
     * @return Hit match result
@@ -177,15 +178,15 @@ public class Dictionary {
    public Hit matchInMainDict(char[] charArray){
        return singleton._MainDict.match(charArray);
    }

    /**
     * Look up a match in the main dictionary
     * @return Hit match result
     */
    public Hit matchInMainDict(char[] charArray , int begin, int length){
        return singleton._MainDict.match(charArray, begin, length);
    }

    /**
     * Look up a match in the quantifier dictionary
     * @return Hit match result
@@ -193,8 +194,8 @@ public class Dictionary {
    public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
        return singleton._QuantifierDict.match(charArray, begin, length);
    }

    /**
     * Continue matching downward from the DictSegment held by an existing Hit
     * @return Hit
@@ -203,16 +204,16 @@ public class Dictionary {
        DictSegment ds = matchedHit.getMatchedDictSegment();
        return ds.match(charArray, currentIndex, 1 , matchedHit);
    }

    /**
     * Check whether a token is a stop word
     * @return boolean
     */
    public boolean isStopWord(char[] charArray , int begin, int length){
        return singleton._StopWords.match(charArray, begin, length).isMatch();
    }

    /**
     * Load the main dictionary and the extension dictionaries
     */
@@ -223,13 +224,13 @@ public class Dictionary {
        // Read the main dictionary file
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);

        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        }

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord = null;
@@ -239,26 +240,26 @@ public class Dictionary {
                    _MainDict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);

        } catch (IOException e) {
            logger.error("ik-analyzer",e);

        }finally{
            try {
                if(is != null){
                    is.close();
                    is = null;
                }
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }
        // Load the extension dictionaries
        this.loadExtDict();
        // Load the remote custom dictionaries
        this.loadRemoteExtDict();
    }
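
Note: loadMainDict() above reads the word file line by line, skips blank lines, and closes the stream in a finally block guarded by a null check. On Java 7+ the same read loop can be written with try-with-resources; a small sketch under that assumption (class name and path handling are illustrative, this is not the plugin's code):

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.List;

    public class DictFileReader {
        // Returns one trimmed, non-empty word per line; the reader is closed automatically.
        static List<String> readWords(Path file) throws IOException {
            List<String> words = new ArrayList<String>();
            try (BufferedReader br = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (!line.trim().isEmpty()) {
                        words.add(line.trim());
                    }
                }
            }
            return words;
        }

        public static void main(String[] args) throws IOException {
            // "ik/main.dic" mirrors the PATH_DIC_MAIN constant above; point it at a real file.
            for (String word : readWords(Paths.get("ik/main.dic"))) {
                System.out.println(word);
            }
        }
    }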
    /**
     * Load the user-configured extension dictionaries into the main dictionary
     */
@@ -269,13 +270,13 @@ public class Dictionary {
        InputStream is = null;
        for(String extDictName : extDictFiles){
            // Read one extension dictionary file
            logger.info("[Dict Loading] " + extDictName);
            Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
            try {
                is = new FileInputStream(file.toFile());
            } catch (FileNotFoundException e) {
                logger.error("ik-analyzer",e);
            }

            // If the extension dictionary cannot be found, ignore it
            if(is == null){
@@ -286,27 +287,29 @@ public class Dictionary {
                String theWord = null;
                do {
                    theWord = br.readLine();
                    if (theWord != null && !"".equals(theWord.trim())) {
                        // Load the extension entry into the in-memory main dictionary
                        _MainDict.fillSegment(theWord.trim().toCharArray());
                    }
                } while (theWord != null);

            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }finally{
                try {
-                   is.close();
-                   is = null;
+                   if(is != null){
+                       is.close();
+                       is = null;
+                   }
                } catch (IOException e) {
                    logger.error("ik-analyzer",e);
                }
            }
        }
    }

    /**
     * Load the remote extension dictionaries into the main dictionary
     */
@@ -315,14 +318,11 @@ public class Dictionary {
        for(String location:remoteExtDictFiles){
            logger.info("[Dict Loading] " + location);
            List<String> lists = getRemoteWords(location);
-           /** Redundant Nullcheck as the list is initialized in the getRemoteWords method
            // If the remote dictionary cannot be loaded, ignore it
            if(lists == null){
                logger.error("[Dict Loading] "+location+"加载失败");
                continue;
-           }*/
+           }
            for(String theWord:lists){
                if (theWord != null && !"".equals(theWord.trim())) {
                    // Load the remote entry into the in-memory main dictionary
@@ -331,14 +331,14 @@ public class Dictionary {
                }
            }
        }
    }

    /**
     * Download custom entries from a remote server
     */
    private static List<String> getRemoteWords(String location){

        List<String> buffer = new ArrayList<String>();
        RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
                .setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
@@ -350,7 +350,7 @@ public class Dictionary {
        try {
            response = httpclient.execute(get);
            if(response.getStatusLine().getStatusCode()==200){

                String charset = "UTF-8";
                // Determine the response encoding; default to UTF-8
                if(response.getEntity().getContentType().getValue().contains("charset=")){
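
Note: getRemoteWords() above fetches a newline-separated word list with explicit connection and socket timeouts and only parses 200 responses. A self-contained sketch of that fetch against the same Apache HttpClient 4.x API the plugin imports; the class name, the plain UTF-8 assumption, and the error handling are illustrative rather than the plugin's exact code:

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    public class RemoteWordFetcher {
        // Returns the remote word list, or an empty list when the request fails.
        public static List<String> fetch(String location) {
            List<String> words = new ArrayList<String>();
            RequestConfig rc = RequestConfig.custom()
                    .setConnectionRequestTimeout(10 * 1000)
                    .setConnectTimeout(10 * 1000)
                    .setSocketTimeout(60 * 1000)
                    .build();
            HttpGet get = new HttpGet(location);
            get.setConfig(rc);
            try (CloseableHttpClient client = HttpClients.createDefault();
                 CloseableHttpResponse response = client.execute(get)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    // The plugin additionally honours a charset= parameter; UTF-8 is assumed here.
                    String body = EntityUtils.toString(response.getEntity(), "UTF-8");
                    for (String line : body.split("\n")) {
                        if (!line.trim().isEmpty()) {
                            words.add(line.trim());
                        }
                    }
                }
            } catch (Exception e) {
                System.err.println("[Dict Loading] " + location + " failed: " + e.getMessage());
            }
            return words;
        }
    }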
@@ -376,49 +376,49 @@ public class Dictionary {
        }
        return buffer;
    }

    /**
     * Load the user-extended stop word dictionaries
     */
    private void loadStopWordDict(){
        // Build the main dictionary instance
        _StopWords = new DictSegment((char)0);

        // Read the main dictionary file
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        }

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord = null;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    _StopWords.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);

        } catch (IOException e) {
            logger.error("ik-analyzer",e);

        }finally{
            try {
                if(is != null){
                    is.close();
                    is = null;
                }
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }

        // Load the extension stop word dictionaries
@@ -426,15 +426,15 @@ public class Dictionary {
        if(extStopWordDictFiles != null){
            is = null;
            for(String extStopWordDictName : extStopWordDictFiles){

                logger.info("[Dict Loading] " + extStopWordDictName);

                // Read one extension stop word dictionary file
                file=PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
                try {
                    is = new FileInputStream(file.toFile());
                } catch (FileNotFoundException e) {
                    logger.error("ik-analyzer",e);
                }
                // If the extension dictionary cannot be found, ignore it
                if(is == null){
                    continue;
@@ -446,37 +446,36 @@ public class Dictionary {
                        theWord = br.readLine();
                        if (theWord != null && !"".equals(theWord.trim())) {
                            // Load the extension stop word into memory
                            _StopWords.fillSegment(theWord.trim().toCharArray());
                        }
                    } while (theWord != null);

                } catch (IOException e) {
                    logger.error("ik-analyzer",e);

                }finally{
                    try {
-                       is.close();
-                       is = null;
+                       if(is != null){
+                           is.close();
+                           is = null;
+                       }
                    } catch (IOException e) {
                        logger.error("ik-analyzer",e);
                    }
                }
            }
        }

        // Load the remote stop word dictionaries
        List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
        for(String location:remoteExtStopWordDictFiles){
            logger.info("[Dict Loading] " + location);
            List<String> lists = getRemoteWords(location);
-           /** Redundant Nullcheck as the list is initialized in the getRemoteWords method
            // If the remote dictionary cannot be loaded, ignore it
            if(lists == null){
                logger.error("[Dict Loading] "+location+"加载失败");
                continue;
-           }*/
+           }
            for(String theWord:lists){
                if (theWord != null && !"".equals(theWord.trim())) {
                    // Load the remote stop word into memory
@@ -485,10 +484,10 @@ public class Dictionary {
                }
            }
        }
    }

    /**
     * Load the quantifier dictionary
     */
@@ -497,12 +496,12 @@ public class Dictionary {
        _QuantifierDict = new DictSegment((char)0);
        // Read the quantifier dictionary file
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);

        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer",e);
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord = null;
@@ -512,132 +511,134 @@ public class Dictionary {
                    _QuantifierDict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);

        } catch (IOException ioe) {
            logger.error("Quantifier Dictionary loading exception.");

        }finally{
            try {
                if(is != null){
                    is.close();
                    is = null;
                }
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }
    }

    private void loadSurnameDict(){

        _SurnameDict = new DictSegment((char)0);
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer",e);
        }
        if(is == null){
            throw new RuntimeException("Surname Dictionary not found!!!");
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    _SurnameDict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);
        } catch (IOException e) {
            logger.error("ik-analyzer",e);
        }finally{
            try {
-               is.close();
-               is = null;
+               if(is != null){
+                   is.close();
+                   is = null;
+               }
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }
    }

    private void loadSuffixDict(){

        _SuffixDict = new DictSegment((char)0);
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer",e);
        }
        if(is == null){
            throw new RuntimeException("Suffix Dictionary not found!!!");
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    _SuffixDict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);
        } catch (IOException e) {
            logger.error("ik-analyzer",e);
        }finally{
            try {
                is.close();
                is = null;
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }
    }

    private void loadPrepDict(){

        _PrepDict = new DictSegment((char)0);
        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer",e);
        }
        if(is == null){
            throw new RuntimeException("Preposition Dictionary not found!!!");
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            String theWord;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    _PrepDict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);
        } catch (IOException e) {
            logger.error("ik-analyzer",e);
        }finally{
            try {
                is.close();
                is = null;
            } catch (IOException e) {
                logger.error("ik-analyzer",e);
            }
        }
    }

    public void reLoadMainDict(){
        logger.info("重新加载词典...");
        // Load into a fresh Dictionary instance so reloading does not disturb users of the current one
        Dictionary tmpDict = new Dictionary();
        tmpDict.configuration = getSingleton().configuration;
@@ -646,6 +647,6 @@ public class Dictionary {
        _MainDict = tmpDict._MainDict;
        _StopWords = tmpDict._StopWords;
        logger.info("重新加载词典完毕...");
    }

}
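
Note: reLoadMainDict() above loads everything into a temporary Dictionary instance and then swaps the _MainDict and _StopWords references, so lookups keep using the old tries until the replacements are fully built. A minimal sketch of that build-then-swap idea, with illustrative types and names:

    import java.util.HashSet;
    import java.util.Set;

    public class ReloadableWordSet {
        // volatile so readers always see a fully built set after the swap
        private volatile Set<String> words = new HashSet<String>();

        // Lookups always run against whatever set is currently published.
        public boolean contains(String word) {
            return words.contains(word);
        }

        // Build the replacement completely, then publish it with a single reference assignment.
        public void reload(Iterable<String> newWords) {
            Set<String> fresh = new HashSet<String>();
            for (String w : newWords) {
                fresh.add(w);
            }
            words = fresh; // analogous to the _MainDict / _StopWords assignments above
        }
    }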

Changed file: Monitor.java

@@ -7,9 +7,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;

public class Monitor implements Runnable {

+   public static ESLogger logger= Loggers.getLogger("ik-analyzer");
+
    private static CloseableHttpClient httpclient = HttpClients.createDefault();
    /*
     * Last-modified time of the remote resource
@@ -19,12 +23,12 @@ public class Monitor implements Runnable {
     * Resource ETag
     */
    private String eTags;

    /*
     * Request URL
     */
    private String location;

    public Monitor(String location) {
        this.location = location;
        this.last_modified = null;
@@ -38,16 +42,16 @@ public class Monitor implements Runnable {
     * If it has changed, reload the dictionary
     * Sleep for 1 min, then go back to the first step
     */
    public void run() {

        // Timeout settings
        RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
                .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();

        HttpHead head = new HttpHead(location);
        head.setConfig(rc);

        // Conditional request headers
        if (last_modified != null) {
            head.setHeader("If-Modified-Since", last_modified);
@@ -55,17 +59,17 @@ public class Monitor implements Runnable {
        if (eTags != null) {
            head.setHeader("If-None-Match", eTags);
        }

        CloseableHttpResponse response = null;
        try {

            response = httpclient.execute(head);

            // Only act on a 200 response
            if(response.getStatusLine().getStatusCode()==200){

                if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)
                        ||!response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) {

                    // The remote dictionary changed: reload it and update last_modified / eTags
                    Dictionary.getSingleton().reLoadMainDict();
@@ -87,9 +91,9 @@ public class Monitor implements Runnable {
                response.close();
            }
        } catch (IOException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        }
    }
}
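
Note: Monitor.run() above probes the remote dictionary with an HTTP HEAD request carrying If-Modified-Since and If-None-Match, so an unchanged file can be answered with 304 Not Modified and nothing is reloaded; only a changed 200 response triggers Dictionary.getSingleton().reLoadMainDict(). A standalone sketch of that conditional probe (class name and return convention are illustrative; the HttpClient calls mirror the ones shown above):

    import org.apache.http.Header;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpHead;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    public class RemoteDictProbe {
        private String lastModified;
        private String eTag;

        // Returns true when the resource reports a change (or is seen for the first time).
        public boolean changed(String location) throws Exception {
            RequestConfig rc = RequestConfig.custom()
                    .setConnectionRequestTimeout(10 * 1000)
                    .setConnectTimeout(10 * 1000)
                    .setSocketTimeout(15 * 1000)
                    .build();
            HttpHead head = new HttpHead(location);
            head.setConfig(rc);
            if (lastModified != null) {
                head.setHeader("If-Modified-Since", lastModified);
            }
            if (eTag != null) {
                head.setHeader("If-None-Match", eTag);
            }
            try (CloseableHttpClient client = HttpClients.createDefault();
                 CloseableHttpResponse response = client.execute(head)) {
                int code = response.getStatusLine().getStatusCode();
                if (code == 304) {
                    return false; // validators matched, nothing to reload
                }
                if (code == 200) {
                    lastModified = headerValue(response.getLastHeader("Last-Modified"));
                    eTag = headerValue(response.getLastHeader("ETag"));
                    return true;
                }
                return false;
            }
        }

        private static String headerValue(Header h) {
            return h == null ? null : h.getValue();
        }
    }

The real Monitor keeps one shared CloseableHttpClient and is driven by the ScheduledExecutorService created in Dictionary.initial().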

Changed file: Sleep.java

@@ -5,8 +5,8 @@ import org.elasticsearch.common.logging.Loggers;
public class Sleep {

-   public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
+   public static ESLogger logger= Loggers.getLogger("ik-analyzer");

    public enum Type{MSEC,SEC,MIN,HOUR};

    public static void sleep(Type type,int num){
        try {
@@ -15,22 +15,22 @@ public class Sleep {
                Thread.sleep(num);
                return;
            case SEC:
-               Thread.sleep(num*1000L);
+               Thread.sleep(num*1000);
                return;
            case MIN:
-               Thread.sleep(num*60*1000L);
+               Thread.sleep(num*60*1000);
                return;
            case HOUR:
-               Thread.sleep(num*60*60*1000L);
+               Thread.sleep(num*60*60*1000);
                return;
            default:
-               logger.error("输入类型错误应为MSEC,SEC,MIN,HOUR之一");
+               System.err.println("输入类型错误应为MSEC,SEC,MIN,HOUR之一");
                return;
            }
        } catch (InterruptedException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        }
    }
}
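
Note: of the two variants above, only the 1000L form (the lines marked "-") does the multiplication in long arithmetic. With plain int literals, num*60*60*1000 is evaluated as a 32-bit int and overflows once num exceeds about 596 (2^31-1 divided by 3,600,000 is roughly 596.5), producing a negative value that makes Thread.sleep throw IllegalArgumentException. A two-line demonstration:

    public class SleepOverflowDemo {
        public static void main(String[] args) {
            int hours = 600;
            System.out.println(hours * 60 * 60 * 1000);  // int arithmetic overflows: -2134967296
            System.out.println(hours * 60 * 60 * 1000L); // long arithmetic: 2160000000
        }
    }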

Changed file: org/wltea/analyzer/query/SWMCQueryBuilder.java

@@ -1,7 +1,7 @@
/**
 * IK Chinese word segmentation, version 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -20,7 +20,7 @@
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
package org.wltea.analyzer.query;
@@ -34,6 +34,8 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
@@ -45,6 +47,8 @@ import org.wltea.analyzer.core.Lexeme;
 */
public class SWMCQueryBuilder {

+   public static ESLogger logger= Loggers.getLogger("ik-analyzer");
+
    /**
     * Build the SWMCQuery
     * @param fieldName
@@ -62,7 +66,7 @@ public class SWMCQueryBuilder {
        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
        return _SWMCQuery;
    }

    /**
     * Segment the keywords and return the lexeme list
     * @param keywords
@@ -78,16 +82,16 @@ public class SWMCQueryBuilder {
                lexemes.add(l);
            }
        }catch(IOException e){
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        }
        return lexemes;
    }

    /**
     * Build the SWMC query from the segmentation result
     * @param fieldName
    // * @param pathOption
     * @param quickMode
     * @return
     */
@@ -100,7 +104,7 @@ public class SWMCQueryBuilder {
        int lastLexemeLength = 0;
        // Records the end position of the previous lexeme
        int lastLexemeEnd = -1;

        int shortCount = 0;
        int totalCount = 0;
        for(Lexeme l : lexemes){
@@ -110,15 +114,15 @@ public class SWMCQueryBuilder {
                keywordBuffer_Short.append(' ').append(l.getLexemeText());
                shortCount += l.getLength();
            }

            if(lastLexemeLength == 0){
                keywordBuffer.append(l.getLexemeText());
            }else if(lastLexemeLength == 1 && l.getLength() == 1
                    && lastLexemeEnd == l.getBeginPosition()){// adjacent single-character lexemes are merged
                keywordBuffer.append(l.getLexemeText());
            }else{
                keywordBuffer.append(' ').append(l.getLexemeText());

            }
            lastLexemeLength = l.getLength();
            lastLexemeEnd = l.getEndPosition();
@@ -128,16 +132,16 @@ public class SWMCQueryBuilder {
        QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        qp.setAutoGeneratePhraseQueries(true);

        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
            try {
                //System.out.println(keywordBuffer.toString());
                Query q = qp.parse(keywordBuffer_Short.toString());
                return q;
            } catch (ParseException e) {
-               e.printStackTrace();
+               logger.error(e.getMessage(), e);
            }
        }else{
            if(keywordBuffer.length() > 0){
                try {
@@ -145,10 +149,10 @@ public class SWMCQueryBuilder {
                    Query q = qp.parse(keywordBuffer.toString());
                    return q;
                } catch (ParseException e) {
-                   e.printStackTrace();
+                   logger.error(e.getMessage(), e);
                }
            }
        }
        return null;
    }
}
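
Note: getSWMCQuery() above joins the segmented lexemes into a whitespace-separated keyword string and hands it to Lucene's classic QueryParser with AND as the default operator and automatic phrase queries enabled. A hedged sketch of just that parsing step, assuming the same Lucene version the plugin builds against (field name and keywords are illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;

    public class SwmcParseSketch {
        public static void main(String[] args) throws ParseException {
            QueryParser qp = new QueryParser("text", new StandardAnalyzer());
            qp.setDefaultOperator(QueryParser.AND_OPERATOR); // every term must match
            qp.setAutoGeneratePhraseQueries(true);           // multi-token terms become phrase queries
            Query q = qp.parse("中文 分词 工具包");
            System.out.println(q);
        }
    }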

Changed file: org/wltea/analyzer/sample/IKAnalzyerDemo.java

@@ -1,7 +1,7 @@
/**
 * IK Chinese word segmentation, version 5.0.1
 * IK Analyzer release 5.0.1
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -20,8 +20,8 @@
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 *
 */
package org.wltea.analyzer.sample;
@@ -44,47 +44,47 @@ import org.wltea.analyzer.lucene.IKAnalyzer;
 */
public class IKAnalzyerDemo {

-   public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
+   public static ESLogger logger= Loggers.getLogger("ik-analyzer");

    public static void main(String[] args){
        // Build the IK analyzer in smart segmentation mode
        Analyzer analyzer = new IKAnalyzer(true);

        // Obtain a Lucene TokenStream
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
//          ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子你可以直接运行它IKAnalyer can analysis english text too"));
            // Token offset attribute
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            // Token text attribute
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // Token type attribute
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);

            // Reset the TokenStream (and its StringReader)
            ts.reset();
            // Iterate over the segmentation result
            while (ts.incrementToken()) {
-               logger.info(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
+               System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
            }
            // End the TokenStream (and close the StringReader)
            ts.end();   // Perform end-of-stream operations, e.g. set the final offset.

        } catch (IOException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        } finally {
            // Release all TokenStream resources
            if(ts != null){
                try {
                    ts.close();
                } catch (IOException e) {
-                   e.printStackTrace();
+                   logger.error(e.getMessage(), e);
                }
            }
        }
    }
}

Changed file: org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java

@@ -1,7 +1,7 @@
/**
 * IK Chinese word segmentation, version 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -20,8 +20,8 @@
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 *
 */
package org.wltea.analyzer.sample;
@@ -58,14 +58,14 @@ import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Demo of Lucene indexing and searching with IKAnalyzer
 * 2012-3-2
 *
 * Written against the Lucene 4.0 API
 *
 */
public class LuceneIndexAndSearchDemo {

-   public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
+   public static ESLogger logger= Loggers.getLogger("ik-analyzer");

    /**
     * Simulation:
     * create a single-document index and search it
@@ -74,20 +74,20 @@ public class LuceneIndexAndSearchDemo {
    public static void main(String[] args){
        // Lucene Document field name
        String fieldName = "text";

        // Content to index
        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

        // Instantiate the IKAnalyzer analyzer
        Analyzer analyzer = new IKAnalyzer(true);

        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        IndexSearcher isearcher = null;
        try {
            // Build an in-memory index
            directory = new RAMDirectory();

            // Configure the IndexWriterConfig
            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
@@ -98,53 +98,53 @@ public class LuceneIndexAndSearchDemo {
            doc.add(new TextField(fieldName, text, Field.Store.YES));
            iwriter.addDocument(doc);
            iwriter.close();

            // Search **********************************
            // Instantiate the searcher
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "中文分词工具包";
            // Build the Query with the QueryParser query analyzer
            QueryParser qp = new QueryParser(fieldName, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(keyword);
-           logger.info("Query = " + query);
+           System.out.println("Query = " + query);

            // Retrieve the 5 highest-scoring documents
            TopDocs topDocs = isearcher.search(query , 5);
-           logger.info("命中:" + topDocs.totalHits);
+           System.out.println("命中:" + topDocs.totalHits);
            // Print the results
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (int i = 0; i < topDocs.totalHits; i++){
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
-               logger.info("内容:" + targetDoc.toString());
+               System.out.println("内容:" + targetDoc.toString());
            }

        } catch (CorruptIndexException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        } catch (LockObtainFailedException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        } catch (IOException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        } catch (ParseException e) {
-           e.printStackTrace();
+           logger.error(e.getMessage(), e);
        } finally{
            if(ireader != null){
                try {
                    ireader.close();
                } catch (IOException e) {
-                   e.printStackTrace();
+                   logger.error(e.getMessage(), e);
                }
            }
            if(directory != null){
                try {
                    directory.close();
                } catch (IOException e) {
-                   e.printStackTrace();
+                   logger.error(e.getMessage(), e);
                }
            }
        }
    }
}