merge code
This commit is contained in:
commit
ca2bfe5732
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* IK 中文分词 版本 5.0
|
* IK 中文分词 版本 5.0
|
||||||
* IK Analyzer release 5.0
|
* IK Analyzer release 5.0
|
||||||
*
|
*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
@ -20,8 +20,8 @@
|
|||||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||||
* 版权声明 2012,乌龙茶工作室
|
* 版权声明 2012,乌龙茶工作室
|
||||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.dic;
|
package org.wltea.analyzer.dic;
|
||||||
|
|
||||||
@ -62,37 +62,37 @@ public class Dictionary {
|
|||||||
*/
|
*/
|
||||||
private static Dictionary singleton;
|
private static Dictionary singleton;
|
||||||
|
|
||||||
private DictSegment _MainDict;
|
private DictSegment _MainDict;
|
||||||
|
|
||||||
private DictSegment _SurnameDict;
|
private DictSegment _SurnameDict;
|
||||||
|
|
||||||
private DictSegment _QuantifierDict;
|
private DictSegment _QuantifierDict;
|
||||||
|
|
||||||
private DictSegment _SuffixDict;
|
private DictSegment _SuffixDict;
|
||||||
|
|
||||||
private DictSegment _PrepDict;
|
private DictSegment _PrepDict;
|
||||||
|
|
||||||
|
private DictSegment _StopWords;
|
||||||
|
|
||||||
private DictSegment _StopWords;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 配置对象
|
* 配置对象
|
||||||
*/
|
*/
|
||||||
private Configuration configuration;
|
private Configuration configuration;
|
||||||
public static final ESLogger logger=Loggers.getLogger("ik-analyzer");
|
public static ESLogger logger=Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
|
|
||||||
|
|
||||||
public static final String PATH_DIC_MAIN = "ik/main.dic";
|
|
||||||
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
|
|
||||||
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
|
|
||||||
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
|
|
||||||
public static final String PATH_DIC_PREP = "ik/preposition.dic";
|
|
||||||
public static final String PATH_DIC_STOP = "ik/stopword.dic";
|
|
||||||
|
|
||||||
private Dictionary(){
|
|
||||||
|
|
||||||
}
|
private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
|
||||||
|
|
||||||
|
public static final String PATH_DIC_MAIN = "ik/main.dic";
|
||||||
|
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
|
||||||
|
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
|
||||||
|
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
|
||||||
|
public static final String PATH_DIC_PREP = "ik/preposition.dic";
|
||||||
|
public static final String PATH_DIC_STOP = "ik/stopword.dic";
|
||||||
|
|
||||||
|
private Dictionary(){
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 词典初始化
|
* 词典初始化
|
||||||
@ -103,33 +103,34 @@ public class Dictionary {
|
|||||||
* @return Dictionary
|
* @return Dictionary
|
||||||
*/
|
*/
|
||||||
public static synchronized Dictionary initial(Configuration cfg){
|
public static synchronized Dictionary initial(Configuration cfg){
|
||||||
|
if(singleton == null){
|
||||||
synchronized(Dictionary.class){
|
synchronized(Dictionary.class){
|
||||||
if(singleton == null){
|
if(singleton == null){
|
||||||
singleton = new Dictionary();
|
singleton = new Dictionary();
|
||||||
singleton.configuration=cfg;
|
singleton.configuration=cfg;
|
||||||
singleton.loadMainDict();
|
singleton.loadMainDict();
|
||||||
singleton.loadSurnameDict();
|
singleton.loadSurnameDict();
|
||||||
singleton.loadQuantifierDict();
|
singleton.loadQuantifierDict();
|
||||||
singleton.loadSuffixDict();
|
singleton.loadSuffixDict();
|
||||||
singleton.loadPrepDict();
|
singleton.loadPrepDict();
|
||||||
singleton.loadStopWordDict();
|
singleton.loadStopWordDict();
|
||||||
|
|
||||||
//建立监控线程
|
//建立监控线程
|
||||||
for(String location:cfg.getRemoteExtDictionarys()){
|
for(String location:cfg.getRemoteExtDictionarys()){
|
||||||
//10 秒是初始延迟可以修改的 60是间隔时间 单位秒
|
//10 秒是初始延迟可以修改的 60是间隔时间 单位秒
|
||||||
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
|
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
|
||||||
|
}
|
||||||
|
for(String location:cfg.getRemoteExtStopWordDictionarys()){
|
||||||
|
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
return singleton;
|
||||||
}
|
}
|
||||||
for(String location:cfg.getRemoteExtStopWordDictionarys()){
|
|
||||||
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
|
|
||||||
}
|
|
||||||
|
|
||||||
return singleton;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return singleton;
|
return singleton;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词典单子实例
|
* 获取词典单子实例
|
||||||
* @return Dictionary 单例对象
|
* @return Dictionary 单例对象
|
||||||
@ -140,7 +141,7 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
return singleton;
|
return singleton;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 批量加载新词条
|
* 批量加载新词条
|
||||||
* @param words Collection<String>词条列表
|
* @param words Collection<String>词条列表
|
||||||
@ -155,7 +156,7 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 批量移除(屏蔽)词条
|
* 批量移除(屏蔽)词条
|
||||||
*/
|
*/
|
||||||
@ -169,7 +170,7 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 检索匹配主词典
|
* 检索匹配主词典
|
||||||
* @return Hit 匹配结果描述
|
* @return Hit 匹配结果描述
|
||||||
@ -177,15 +178,15 @@ public class Dictionary {
|
|||||||
public Hit matchInMainDict(char[] charArray){
|
public Hit matchInMainDict(char[] charArray){
|
||||||
return singleton._MainDict.match(charArray);
|
return singleton._MainDict.match(charArray);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 检索匹配主词典
|
* 检索匹配主词典
|
||||||
* @return Hit 匹配结果描述
|
* @return Hit 匹配结果描述
|
||||||
*/
|
*/
|
||||||
public Hit matchInMainDict(char[] charArray , int begin, int length){
|
public Hit matchInMainDict(char[] charArray , int begin, int length){
|
||||||
return singleton._MainDict.match(charArray, begin, length);
|
return singleton._MainDict.match(charArray, begin, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 检索匹配量词词典
|
* 检索匹配量词词典
|
||||||
* @return Hit 匹配结果描述
|
* @return Hit 匹配结果描述
|
||||||
@ -193,8 +194,8 @@ public class Dictionary {
|
|||||||
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
|
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
|
||||||
return singleton._QuantifierDict.match(charArray, begin, length);
|
return singleton._QuantifierDict.match(charArray, begin, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
|
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
|
||||||
* @return Hit
|
* @return Hit
|
||||||
@ -203,16 +204,16 @@ public class Dictionary {
|
|||||||
DictSegment ds = matchedHit.getMatchedDictSegment();
|
DictSegment ds = matchedHit.getMatchedDictSegment();
|
||||||
return ds.match(charArray, currentIndex, 1 , matchedHit);
|
return ds.match(charArray, currentIndex, 1 , matchedHit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 判断是否是停止词
|
* 判断是否是停止词
|
||||||
* @return boolean
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
public boolean isStopWord(char[] charArray , int begin, int length){
|
public boolean isStopWord(char[] charArray , int begin, int length){
|
||||||
return singleton._StopWords.match(charArray, begin, length).isMatch();
|
return singleton._StopWords.match(charArray, begin, length).isMatch();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载主词典及扩展词典
|
* 加载主词典及扩展词典
|
||||||
*/
|
*/
|
||||||
@ -223,13 +224,13 @@ public class Dictionary {
|
|||||||
//读取主词典文件
|
//读取主词典文件
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
|
||||||
|
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
String theWord = null;
|
String theWord = null;
|
||||||
@ -239,26 +240,26 @@ public class Dictionary {
|
|||||||
_MainDict.fillSegment(theWord.trim().toCharArray());
|
_MainDict.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
|
|
||||||
}finally{
|
} catch (IOException e) {
|
||||||
|
logger.error("ik-analyzer",e);
|
||||||
|
|
||||||
|
}finally{
|
||||||
try {
|
try {
|
||||||
if(is != null){
|
if(is != null){
|
||||||
is.close();
|
is.close();
|
||||||
is = null;
|
is = null;
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//加载扩展词典
|
//加载扩展词典
|
||||||
this.loadExtDict();
|
this.loadExtDict();
|
||||||
//加载远程自定义词库
|
//加载远程自定义词库
|
||||||
this.loadRemoteExtDict();
|
this.loadRemoteExtDict();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载用户配置的扩展词典到主词库表
|
* 加载用户配置的扩展词典到主词库表
|
||||||
*/
|
*/
|
||||||
@ -269,13 +270,13 @@ public class Dictionary {
|
|||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
for(String extDictName : extDictFiles){
|
for(String extDictName : extDictFiles){
|
||||||
//读取扩展词典文件
|
//读取扩展词典文件
|
||||||
logger.info("[Dict Loading] " + extDictName);
|
logger.info("[Dict Loading] " + extDictName);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
|
Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
|
|
||||||
//如果找不到扩展的字典,则忽略
|
//如果找不到扩展的字典,则忽略
|
||||||
if(is == null){
|
if(is == null){
|
||||||
@ -286,27 +287,29 @@ public class Dictionary {
|
|||||||
String theWord = null;
|
String theWord = null;
|
||||||
do {
|
do {
|
||||||
theWord = br.readLine();
|
theWord = br.readLine();
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
//加载扩展词典数据到主内存词典中
|
//加载扩展词典数据到主内存词典中
|
||||||
_MainDict.fillSegment(theWord.trim().toCharArray());
|
_MainDict.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
is.close();
|
if(is != null){
|
||||||
is = null;
|
is.close();
|
||||||
|
is = null;
|
||||||
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载远程扩展词典到主词库表
|
* 加载远程扩展词典到主词库表
|
||||||
*/
|
*/
|
||||||
@ -315,14 +318,11 @@ public class Dictionary {
|
|||||||
for(String location:remoteExtDictFiles){
|
for(String location:remoteExtDictFiles){
|
||||||
logger.info("[Dict Loading] " + location);
|
logger.info("[Dict Loading] " + location);
|
||||||
List<String> lists = getRemoteWords(location);
|
List<String> lists = getRemoteWords(location);
|
||||||
|
|
||||||
/** Redundant Nullcheck as the list is initialized in the getRemoteWords method
|
|
||||||
//如果找不到扩展的字典,则忽略
|
//如果找不到扩展的字典,则忽略
|
||||||
if(lists == null){
|
if(lists == null){
|
||||||
logger.error("[Dict Loading] "+location+"加载失败");
|
logger.error("[Dict Loading] "+location+"加载失败");
|
||||||
continue;
|
continue;
|
||||||
}*/
|
}
|
||||||
|
|
||||||
for(String theWord:lists){
|
for(String theWord:lists){
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
//加载扩展词典数据到主内存词典中
|
//加载扩展词典数据到主内存词典中
|
||||||
@ -331,14 +331,14 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 从远程服务器上下载自定义词条
|
* 从远程服务器上下载自定义词条
|
||||||
*/
|
*/
|
||||||
private static List<String> getRemoteWords(String location){
|
private static List<String> getRemoteWords(String location){
|
||||||
|
|
||||||
List<String> buffer = new ArrayList<String>();
|
List<String> buffer = new ArrayList<String>();
|
||||||
RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
|
RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
|
||||||
.setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
|
.setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
|
||||||
@ -350,7 +350,7 @@ public class Dictionary {
|
|||||||
try {
|
try {
|
||||||
response = httpclient.execute(get);
|
response = httpclient.execute(get);
|
||||||
if(response.getStatusLine().getStatusCode()==200){
|
if(response.getStatusLine().getStatusCode()==200){
|
||||||
|
|
||||||
String charset = "UTF-8";
|
String charset = "UTF-8";
|
||||||
//获取编码,默认为utf-8
|
//获取编码,默认为utf-8
|
||||||
if(response.getEntity().getContentType().getValue().contains("charset=")){
|
if(response.getEntity().getContentType().getValue().contains("charset=")){
|
||||||
@ -376,49 +376,49 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载用户扩展的停止词词典
|
* 加载用户扩展的停止词词典
|
||||||
*/
|
*/
|
||||||
private void loadStopWordDict(){
|
private void loadStopWordDict(){
|
||||||
//建立主词典实例
|
//建立主词典实例
|
||||||
_StopWords = new DictSegment((char)0);
|
_StopWords = new DictSegment((char)0);
|
||||||
|
|
||||||
//读取主词典文件
|
//读取主词典文件
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
|
||||||
|
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
String theWord = null;
|
String theWord = null;
|
||||||
do {
|
do {
|
||||||
theWord = br.readLine();
|
theWord = br.readLine();
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
|
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
if(is != null){
|
if(is != null){
|
||||||
is.close();
|
is.close();
|
||||||
is = null;
|
is = null;
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//加载扩展停止词典
|
//加载扩展停止词典
|
||||||
@ -426,15 +426,15 @@ public class Dictionary {
|
|||||||
if(extStopWordDictFiles != null){
|
if(extStopWordDictFiles != null){
|
||||||
is = null;
|
is = null;
|
||||||
for(String extStopWordDictName : extStopWordDictFiles){
|
for(String extStopWordDictName : extStopWordDictFiles){
|
||||||
logger.info("[Dict Loading] " + extStopWordDictName);
|
logger.info("[Dict Loading] " + extStopWordDictName);
|
||||||
|
|
||||||
//读取扩展词典文件
|
//读取扩展词典文件
|
||||||
file=PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
|
file=PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
//如果找不到扩展的字典,则忽略
|
//如果找不到扩展的字典,则忽略
|
||||||
if(is == null){
|
if(is == null){
|
||||||
continue;
|
continue;
|
||||||
@ -446,37 +446,36 @@ public class Dictionary {
|
|||||||
theWord = br.readLine();
|
theWord = br.readLine();
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
//加载扩展停止词典数据到内存中
|
//加载扩展停止词典数据到内存中
|
||||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
|
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
is.close();
|
if(is != null){
|
||||||
is = null;
|
is.close();
|
||||||
|
is = null;
|
||||||
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//加载远程停用词典
|
//加载远程停用词典
|
||||||
List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
|
List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
|
||||||
for(String location:remoteExtStopWordDictFiles){
|
for(String location:remoteExtStopWordDictFiles){
|
||||||
logger.info("[Dict Loading] " + location);
|
logger.info("[Dict Loading] " + location);
|
||||||
List<String> lists = getRemoteWords(location);
|
List<String> lists = getRemoteWords(location);
|
||||||
|
|
||||||
/** Redundant Nullcheck as the list is initialized in the getRemoteWords method
|
|
||||||
//如果找不到扩展的字典,则忽略
|
//如果找不到扩展的字典,则忽略
|
||||||
if(lists == null){
|
if(lists == null){
|
||||||
logger.error("[Dict Loading] "+location+"加载失败");
|
logger.error("[Dict Loading] "+location+"加载失败");
|
||||||
continue;
|
continue;
|
||||||
}*/
|
}
|
||||||
|
|
||||||
for(String theWord:lists){
|
for(String theWord:lists){
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
//加载远程词典数据到主内存中
|
//加载远程词典数据到主内存中
|
||||||
@ -485,10 +484,10 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载量词词典
|
* 加载量词词典
|
||||||
*/
|
*/
|
||||||
@ -497,12 +496,12 @@ public class Dictionary {
|
|||||||
_QuantifierDict = new DictSegment((char)0);
|
_QuantifierDict = new DictSegment((char)0);
|
||||||
//读取量词词典文件
|
//读取量词词典文件
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
String theWord = null;
|
String theWord = null;
|
||||||
@ -512,132 +511,134 @@ public class Dictionary {
|
|||||||
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
|
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
|
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
logger.error("Quantifier Dictionary loading exception.");
|
logger.error("Quantifier Dictionary loading exception.");
|
||||||
|
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
if(is != null){
|
if(is != null){
|
||||||
is.close();
|
is.close();
|
||||||
is = null;
|
is = null;
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void loadSurnameDict(){
|
private void loadSurnameDict(){
|
||||||
|
|
||||||
_SurnameDict = new DictSegment((char)0);
|
_SurnameDict = new DictSegment((char)0);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
if(is == null){
|
if(is == null){
|
||||||
throw new RuntimeException("Surname Dictionary not found!!!");
|
throw new RuntimeException("Surname Dictionary not found!!!");
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
String theWord;
|
String theWord;
|
||||||
do {
|
do {
|
||||||
theWord = br.readLine();
|
theWord = br.readLine();
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
_SurnameDict.fillSegment(theWord.trim().toCharArray());
|
_SurnameDict.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
|
if(is != null){
|
||||||
|
is.close();
|
||||||
|
is = null;
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("ik-analyzer",e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void loadSuffixDict(){
|
||||||
|
|
||||||
|
_SuffixDict = new DictSegment((char)0);
|
||||||
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
|
||||||
|
InputStream is = null;
|
||||||
|
try {
|
||||||
|
is = new FileInputStream(file.toFile());
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
logger.error("ik-analyzer",e);
|
||||||
|
}
|
||||||
|
if(is == null){
|
||||||
|
throw new RuntimeException("Suffix Dictionary not found!!!");
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
|
String theWord;
|
||||||
|
do {
|
||||||
|
theWord = br.readLine();
|
||||||
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
|
_SuffixDict.fillSegment(theWord.trim().toCharArray());
|
||||||
|
}
|
||||||
|
} while (theWord != null);
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("ik-analyzer",e);
|
||||||
|
}finally{
|
||||||
|
try {
|
||||||
is.close();
|
is.close();
|
||||||
is = null;
|
is = null;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void loadSuffixDict(){
|
private void loadPrepDict(){
|
||||||
|
|
||||||
_SuffixDict = new DictSegment((char)0);
|
_PrepDict = new DictSegment((char)0);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
|
|
||||||
InputStream is = null;
|
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
if(is == null){
|
|
||||||
throw new RuntimeException("Suffix Dictionary not found!!!");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
_SuffixDict.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void loadPrepDict(){
|
|
||||||
|
|
||||||
_PrepDict = new DictSegment((char)0);
|
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
|
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
try {
|
try {
|
||||||
is = new FileInputStream(file.toFile());
|
is = new FileInputStream(file.toFile());
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
if(is == null){
|
if(is == null){
|
||||||
throw new RuntimeException("Preposition Dictionary not found!!!");
|
throw new RuntimeException("Preposition Dictionary not found!!!");
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
||||||
String theWord;
|
String theWord;
|
||||||
do {
|
do {
|
||||||
theWord = br.readLine();
|
theWord = br.readLine();
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
|
|
||||||
_PrepDict.fillSegment(theWord.trim().toCharArray());
|
_PrepDict.fillSegment(theWord.trim().toCharArray());
|
||||||
}
|
}
|
||||||
} while (theWord != null);
|
} while (theWord != null);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}finally{
|
}finally{
|
||||||
try {
|
try {
|
||||||
is.close();
|
is.close();
|
||||||
is = null;
|
is = null;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("ik-analyzer",e);
|
logger.error("ik-analyzer",e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reLoadMainDict(){
|
public void reLoadMainDict(){
|
||||||
logger.info("重新加载词典...");
|
logger.info("重新加载词典...");
|
||||||
// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
|
// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
|
||||||
Dictionary tmpDict = new Dictionary();
|
Dictionary tmpDict = new Dictionary();
|
||||||
tmpDict.configuration = getSingleton().configuration;
|
tmpDict.configuration = getSingleton().configuration;
|
||||||
@ -646,6 +647,6 @@ public class Dictionary {
|
|||||||
_MainDict = tmpDict._MainDict;
|
_MainDict = tmpDict._MainDict;
|
||||||
_StopWords = tmpDict._StopWords;
|
_StopWords = tmpDict._StopWords;
|
||||||
logger.info("重新加载词典完毕...");
|
logger.info("重新加载词典完毕...");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -7,9 +7,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
|
|||||||
import org.apache.http.client.methods.HttpHead;
|
import org.apache.http.client.methods.HttpHead;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.elasticsearch.common.logging.ESLogger;
|
||||||
|
import org.elasticsearch.common.logging.Loggers;
|
||||||
|
|
||||||
public class Monitor implements Runnable {
|
public class Monitor implements Runnable {
|
||||||
|
|
||||||
|
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
private static CloseableHttpClient httpclient = HttpClients.createDefault();
|
private static CloseableHttpClient httpclient = HttpClients.createDefault();
|
||||||
/*
|
/*
|
||||||
* 上次更改时间
|
* 上次更改时间
|
||||||
@ -19,12 +23,12 @@ public class Monitor implements Runnable {
|
|||||||
* 资源属性
|
* 资源属性
|
||||||
*/
|
*/
|
||||||
private String eTags;
|
private String eTags;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 请求地址
|
* 请求地址
|
||||||
*/
|
*/
|
||||||
private String location;
|
private String location;
|
||||||
|
|
||||||
public Monitor(String location) {
|
public Monitor(String location) {
|
||||||
this.location = location;
|
this.location = location;
|
||||||
this.last_modified = null;
|
this.last_modified = null;
|
||||||
@ -38,16 +42,16 @@ public class Monitor implements Runnable {
|
|||||||
* ④如果有变化,重新加载词典
|
* ④如果有变化,重新加载词典
|
||||||
* ⑤休眠1min,返回第①步
|
* ⑤休眠1min,返回第①步
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
|
|
||||||
//超时设置
|
//超时设置
|
||||||
RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
|
RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
|
||||||
.setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
|
.setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
|
||||||
|
|
||||||
HttpHead head = new HttpHead(location);
|
HttpHead head = new HttpHead(location);
|
||||||
head.setConfig(rc);
|
head.setConfig(rc);
|
||||||
|
|
||||||
//设置请求头
|
//设置请求头
|
||||||
if (last_modified != null) {
|
if (last_modified != null) {
|
||||||
head.setHeader("If-Modified-Since", last_modified);
|
head.setHeader("If-Modified-Since", last_modified);
|
||||||
@ -55,17 +59,17 @@ public class Monitor implements Runnable {
|
|||||||
if (eTags != null) {
|
if (eTags != null) {
|
||||||
head.setHeader("If-None-Match", eTags);
|
head.setHeader("If-None-Match", eTags);
|
||||||
}
|
}
|
||||||
|
|
||||||
CloseableHttpResponse response = null;
|
CloseableHttpResponse response = null;
|
||||||
try {
|
try {
|
||||||
|
|
||||||
response = httpclient.execute(head);
|
response = httpclient.execute(head);
|
||||||
|
|
||||||
//返回200 才做操作
|
//返回200 才做操作
|
||||||
if(response.getStatusLine().getStatusCode()==200){
|
if(response.getStatusLine().getStatusCode()==200){
|
||||||
|
|
||||||
if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)
|
if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)
|
||||||
||!response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) {
|
||!response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) {
|
||||||
|
|
||||||
// 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
|
// 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
|
||||||
Dictionary.getSingleton().reLoadMainDict();
|
Dictionary.getSingleton().reLoadMainDict();
|
||||||
@ -87,9 +91,9 @@ public class Monitor implements Runnable {
|
|||||||
response.close();
|
response.close();
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -5,8 +5,8 @@ import org.elasticsearch.common.logging.Loggers;
|
|||||||
|
|
||||||
public class Sleep {
|
public class Sleep {
|
||||||
|
|
||||||
public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
|
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
public enum Type{MSEC,SEC,MIN,HOUR};
|
public enum Type{MSEC,SEC,MIN,HOUR};
|
||||||
public static void sleep(Type type,int num){
|
public static void sleep(Type type,int num){
|
||||||
try {
|
try {
|
||||||
@ -15,22 +15,22 @@ public class Sleep {
|
|||||||
Thread.sleep(num);
|
Thread.sleep(num);
|
||||||
return;
|
return;
|
||||||
case SEC:
|
case SEC:
|
||||||
Thread.sleep(num*1000L);
|
Thread.sleep(num*1000);
|
||||||
return;
|
return;
|
||||||
case MIN:
|
case MIN:
|
||||||
Thread.sleep(num*60*1000L);
|
Thread.sleep(num*60*1000);
|
||||||
return;
|
return;
|
||||||
case HOUR:
|
case HOUR:
|
||||||
Thread.sleep(num*60*60*1000L);
|
Thread.sleep(num*60*60*1000);
|
||||||
return;
|
return;
|
||||||
default:
|
default:
|
||||||
logger.error("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
|
System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* IK 中文分词 版本 5.0
|
* IK 中文分词 版本 5.0
|
||||||
* IK Analyzer release 5.0
|
* IK Analyzer release 5.0
|
||||||
*
|
*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
@ -20,7 +20,7 @@
|
|||||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||||
* 版权声明 2012,乌龙茶工作室
|
* 版权声明 2012,乌龙茶工作室
|
||||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.query;
|
package org.wltea.analyzer.query;
|
||||||
|
|
||||||
@ -34,6 +34,8 @@ import org.apache.lucene.queryparser.classic.ParseException;
|
|||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.elasticsearch.common.logging.ESLogger;
|
||||||
|
import org.elasticsearch.common.logging.Loggers;
|
||||||
import org.wltea.analyzer.core.IKSegmenter;
|
import org.wltea.analyzer.core.IKSegmenter;
|
||||||
import org.wltea.analyzer.core.Lexeme;
|
import org.wltea.analyzer.core.Lexeme;
|
||||||
|
|
||||||
@ -45,6 +47,8 @@ import org.wltea.analyzer.core.Lexeme;
|
|||||||
*/
|
*/
|
||||||
public class SWMCQueryBuilder {
|
public class SWMCQueryBuilder {
|
||||||
|
|
||||||
|
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 生成SWMCQuery
|
* 生成SWMCQuery
|
||||||
* @param fieldName
|
* @param fieldName
|
||||||
@ -62,7 +66,7 @@ public class SWMCQueryBuilder {
|
|||||||
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
|
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
|
||||||
return _SWMCQuery;
|
return _SWMCQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 分词切分,并返回结链表
|
* 分词切分,并返回结链表
|
||||||
* @param keywords
|
* @param keywords
|
||||||
@ -78,16 +82,16 @@ public class SWMCQueryBuilder {
|
|||||||
lexemes.add(l);
|
lexemes.add(l);
|
||||||
}
|
}
|
||||||
}catch(IOException e){
|
}catch(IOException e){
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
return lexemes;
|
return lexemes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 根据分词结果生成SWMC搜索
|
* 根据分词结果生成SWMC搜索
|
||||||
* @param fieldName
|
* @param fieldName
|
||||||
// * @param pathOption
|
// * @param pathOption
|
||||||
* @param quickMode
|
* @param quickMode
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
@ -100,7 +104,7 @@ public class SWMCQueryBuilder {
|
|||||||
int lastLexemeLength = 0;
|
int lastLexemeLength = 0;
|
||||||
//记录最后词元结束位置
|
//记录最后词元结束位置
|
||||||
int lastLexemeEnd = -1;
|
int lastLexemeEnd = -1;
|
||||||
|
|
||||||
int shortCount = 0;
|
int shortCount = 0;
|
||||||
int totalCount = 0;
|
int totalCount = 0;
|
||||||
for(Lexeme l : lexemes){
|
for(Lexeme l : lexemes){
|
||||||
@ -110,15 +114,15 @@ public class SWMCQueryBuilder {
|
|||||||
keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
||||||
shortCount += l.getLength();
|
shortCount += l.getLength();
|
||||||
}
|
}
|
||||||
|
|
||||||
if(lastLexemeLength == 0){
|
if(lastLexemeLength == 0){
|
||||||
keywordBuffer.append(l.getLexemeText());
|
keywordBuffer.append(l.getLexemeText());
|
||||||
}else if(lastLexemeLength == 1 && l.getLength() == 1
|
}else if(lastLexemeLength == 1 && l.getLength() == 1
|
||||||
&& lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
|
&& lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
|
||||||
keywordBuffer.append(l.getLexemeText());
|
keywordBuffer.append(l.getLexemeText());
|
||||||
}else{
|
}else{
|
||||||
keywordBuffer.append(' ').append(l.getLexemeText());
|
keywordBuffer.append(' ').append(l.getLexemeText());
|
||||||
|
|
||||||
}
|
}
|
||||||
lastLexemeLength = l.getLength();
|
lastLexemeLength = l.getLength();
|
||||||
lastLexemeEnd = l.getEndPosition();
|
lastLexemeEnd = l.getEndPosition();
|
||||||
@ -128,16 +132,16 @@ public class SWMCQueryBuilder {
|
|||||||
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
|
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
|
||||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||||
qp.setAutoGeneratePhraseQueries(true);
|
qp.setAutoGeneratePhraseQueries(true);
|
||||||
|
|
||||||
if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
|
if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
|
||||||
try {
|
try {
|
||||||
//System.out.println(keywordBuffer.toString());
|
//System.out.println(keywordBuffer.toString());
|
||||||
Query q = qp.parse(keywordBuffer_Short.toString());
|
Query q = qp.parse(keywordBuffer_Short.toString());
|
||||||
return q;
|
return q;
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
if(keywordBuffer.length() > 0){
|
if(keywordBuffer.length() > 0){
|
||||||
try {
|
try {
|
||||||
@ -145,10 +149,10 @@ public class SWMCQueryBuilder {
|
|||||||
Query q = qp.parse(keywordBuffer.toString());
|
Query q = qp.parse(keywordBuffer.toString());
|
||||||
return q;
|
return q;
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* IK 中文分词 版本 5.0.1
|
* IK 中文分词 版本 5.0.1
|
||||||
* IK Analyzer release 5.0.1
|
* IK Analyzer release 5.0.1
|
||||||
*
|
*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
@ -20,8 +20,8 @@
|
|||||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||||
* 版权声明 2012,乌龙茶工作室
|
* 版权声明 2012,乌龙茶工作室
|
||||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.sample;
|
package org.wltea.analyzer.sample;
|
||||||
|
|
||||||
@ -44,47 +44,47 @@ import org.wltea.analyzer.lucene.IKAnalyzer;
|
|||||||
*/
|
*/
|
||||||
public class IKAnalzyerDemo {
|
public class IKAnalzyerDemo {
|
||||||
|
|
||||||
public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
|
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
public static void main(String[] args){
|
public static void main(String[] args){
|
||||||
//构建IK分词器,使用smart分词模式
|
//构建IK分词器,使用smart分词模式
|
||||||
Analyzer analyzer = new IKAnalyzer(true);
|
Analyzer analyzer = new IKAnalyzer(true);
|
||||||
|
|
||||||
//获取Lucene的TokenStream对象
|
//获取Lucene的TokenStream对象
|
||||||
TokenStream ts = null;
|
TokenStream ts = null;
|
||||||
try {
|
try {
|
||||||
ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
|
ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
|
||||||
// ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
|
// ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
|
||||||
//获取词元位置属性
|
//获取词元位置属性
|
||||||
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
|
||||||
//获取词元文本属性
|
//获取词元文本属性
|
||||||
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
|
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
|
||||||
//获取词元文本属性
|
//获取词元文本属性
|
||||||
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
|
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
|
||||||
|
|
||||||
|
|
||||||
//重置TokenStream(重置StringReader)
|
//重置TokenStream(重置StringReader)
|
||||||
ts.reset();
|
ts.reset();
|
||||||
//迭代获取分词结果
|
//迭代获取分词结果
|
||||||
while (ts.incrementToken()) {
|
while (ts.incrementToken()) {
|
||||||
logger.info(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
|
System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
|
||||||
}
|
}
|
||||||
//关闭TokenStream(关闭StringReader)
|
//关闭TokenStream(关闭StringReader)
|
||||||
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
|
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
} finally {
|
} finally {
|
||||||
//释放TokenStream的所有资源
|
//释放TokenStream的所有资源
|
||||||
if(ts != null){
|
if(ts != null){
|
||||||
try {
|
try {
|
||||||
ts.close();
|
ts.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* IK 中文分词 版本 5.0
|
* IK 中文分词 版本 5.0
|
||||||
* IK Analyzer release 5.0
|
* IK Analyzer release 5.0
|
||||||
*
|
*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
@ -20,8 +20,8 @@
|
|||||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||||
* 版权声明 2012,乌龙茶工作室
|
* 版权声明 2012,乌龙茶工作室
|
||||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.sample;
|
package org.wltea.analyzer.sample;
|
||||||
|
|
||||||
@ -58,14 +58,14 @@ import org.wltea.analyzer.lucene.IKAnalyzer;
|
|||||||
/**
|
/**
|
||||||
* 使用IKAnalyzer进行Lucene索引和查询的演示
|
* 使用IKAnalyzer进行Lucene索引和查询的演示
|
||||||
* 2012-3-2
|
* 2012-3-2
|
||||||
*
|
*
|
||||||
* 以下是结合Lucene4.0 API的写法
|
* 以下是结合Lucene4.0 API的写法
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class LuceneIndexAndSearchDemo {
|
public class LuceneIndexAndSearchDemo {
|
||||||
|
|
||||||
public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
|
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 模拟:
|
* 模拟:
|
||||||
* 创建一个单条记录的索引,并对其进行搜索
|
* 创建一个单条记录的索引,并对其进行搜索
|
||||||
@ -74,20 +74,20 @@ public class LuceneIndexAndSearchDemo {
|
|||||||
public static void main(String[] args){
|
public static void main(String[] args){
|
||||||
//Lucene Document的域名
|
//Lucene Document的域名
|
||||||
String fieldName = "text";
|
String fieldName = "text";
|
||||||
//检索内容
|
//检索内容
|
||||||
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
|
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
|
||||||
|
|
||||||
//实例化IKAnalyzer分词器
|
//实例化IKAnalyzer分词器
|
||||||
Analyzer analyzer = new IKAnalyzer(true);
|
Analyzer analyzer = new IKAnalyzer(true);
|
||||||
|
|
||||||
Directory directory = null;
|
Directory directory = null;
|
||||||
IndexWriter iwriter = null;
|
IndexWriter iwriter = null;
|
||||||
IndexReader ireader = null;
|
IndexReader ireader = null;
|
||||||
IndexSearcher isearcher = null;
|
IndexSearcher isearcher = null;
|
||||||
try {
|
try {
|
||||||
//建立内存索引对象
|
//建立内存索引对象
|
||||||
directory = new RAMDirectory();
|
directory = new RAMDirectory();
|
||||||
|
|
||||||
//配置IndexWriterConfig
|
//配置IndexWriterConfig
|
||||||
IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
|
IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
|
||||||
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
||||||
@ -98,53 +98,53 @@ public class LuceneIndexAndSearchDemo {
|
|||||||
doc.add(new TextField(fieldName, text, Field.Store.YES));
|
doc.add(new TextField(fieldName, text, Field.Store.YES));
|
||||||
iwriter.addDocument(doc);
|
iwriter.addDocument(doc);
|
||||||
iwriter.close();
|
iwriter.close();
|
||||||
|
|
||||||
|
|
||||||
//搜索过程**********************************
|
//搜索过程**********************************
|
||||||
//实例化搜索器
|
//实例化搜索器
|
||||||
ireader = DirectoryReader.open(directory);
|
ireader = DirectoryReader.open(directory);
|
||||||
isearcher = new IndexSearcher(ireader);
|
isearcher = new IndexSearcher(ireader);
|
||||||
|
|
||||||
String keyword = "中文分词工具包";
|
String keyword = "中文分词工具包";
|
||||||
//使用QueryParser查询分析器构造Query对象
|
//使用QueryParser查询分析器构造Query对象
|
||||||
QueryParser qp = new QueryParser(fieldName, analyzer);
|
QueryParser qp = new QueryParser(fieldName, analyzer);
|
||||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||||
Query query = qp.parse(keyword);
|
Query query = qp.parse(keyword);
|
||||||
logger.info("Query = " + query);
|
System.out.println("Query = " + query);
|
||||||
|
|
||||||
//搜索相似度最高的5条记录
|
//搜索相似度最高的5条记录
|
||||||
TopDocs topDocs = isearcher.search(query , 5);
|
TopDocs topDocs = isearcher.search(query , 5);
|
||||||
logger.info("命中:" + topDocs.totalHits);
|
System.out.println("命中:" + topDocs.totalHits);
|
||||||
//输出结果
|
//输出结果
|
||||||
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
||||||
for (int i = 0; i < topDocs.totalHits; i++){
|
for (int i = 0; i < topDocs.totalHits; i++){
|
||||||
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
|
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
|
||||||
logger.info("内容:" + targetDoc.toString());
|
System.out.println("内容:" + targetDoc.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (CorruptIndexException e) {
|
} catch (CorruptIndexException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
} catch (LockObtainFailedException e) {
|
} catch (LockObtainFailedException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
} finally{
|
} finally{
|
||||||
if(ireader != null){
|
if(ireader != null){
|
||||||
try {
|
try {
|
||||||
ireader.close();
|
ireader.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(directory != null){
|
if(directory != null){
|
||||||
try {
|
try {
|
||||||
directory.close();
|
directory.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
x
Reference in New Issue
Block a user