fix dict loading

This commit is contained in:
medcl 2013-05-31 14:32:22 +08:00
parent 6fc30fe600
commit 07ba4ece55
4 changed files with 177 additions and 56 deletions

View File

@ -6,7 +6,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch</groupId> <groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-ik</artifactId> <artifactId>elasticsearch-analysis-ik</artifactId>
<version>1.2.0</version> <version>1.2.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<description>IK Analyzer for ElasticSearch</description> <description>IK Analyzer for ElasticSearch</description>
<inceptionYear>2009</inceptionYear> <inceptionYear>2009</inceptionYear>

View File

@ -37,7 +37,7 @@ public class Configuration {
try { try {
input = new FileInputStream(fileConfig); input = new FileInputStream(fileConfig);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
if(input != null){ if(input != null){
try { try {

View File

@ -26,8 +26,8 @@
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/** /**
* 词典树分段表示词典树的一个分枝 * 词典树分段表示词典树的一个分枝
@ -35,7 +35,7 @@ import java.util.Map;
class DictSegment implements Comparable<DictSegment>{ class DictSegment implements Comparable<DictSegment>{
//公用字典表存储汉字 //公用字典表存储汉字
private static final Map<Character , Character> charMap = new HashMap<Character , Character>(16 , 0.95f); private static final Map<Character , Character> charMap = new ConcurrentHashMap<Character , Character>(16 , 0.95f);
//数组大小上限 //数组大小上限
private static final int ARRAY_LENGTH_LIMIT = 3; private static final int ARRAY_LENGTH_LIMIT = 3;
@ -298,7 +298,7 @@ class DictSegment implements Comparable<DictSegment>{
if(this.childrenMap == null){ if(this.childrenMap == null){
synchronized(this){ synchronized(this){
if(this.childrenMap == null){ if(this.childrenMap == null){
this.childrenMap = new HashMap<Character , DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f); this.childrenMap = new ConcurrentHashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f);
} }
} }
} }

View File

@ -25,16 +25,16 @@
*/ */
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
import java.io.*;
import java.util.Collection;
import java.util.List;
import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.cfg.Configuration;
import java.io.*;
import java.util.Collection;
import java.util.List;
/** /**
* 词典管理类,单子模式 * 词典管理类,单子模式
*/ */
@ -45,20 +45,19 @@ public class Dictionary {
* 词典单子实例 * 词典单子实例
*/ */
private static Dictionary singleton; private static Dictionary singleton;
/* private DictSegment _MainDict;
* 主词典对象
*/ private DictSegment _SurnameDict;
private DictSegment _MainDict;
private DictSegment _QuantifierDict;
/*
* 停止词词典 private DictSegment _SuffixDict;
*/
private DictSegment _StopWordDict; private DictSegment _PrepDict;
/*
* 量词词典 private DictSegment _StopWords;
*/
private DictSegment _QuantifierDict;
/** /**
* 配置对象 * 配置对象
@ -95,10 +94,10 @@ public class Dictionary {
environment =new Environment(indexSettings); environment =new Environment(indexSettings);
configuration=new Configuration(indexSettings); configuration=new Configuration(indexSettings);
loadMainDict(); loadMainDict();
// loadSurnameDict(); loadSurnameDict();
loadQuantifierDict(); loadQuantifierDict();
// loadSuffixDict(); loadSuffixDict();
// loadPrepDict(); loadPrepDict();
loadStopWordDict(); loadStopWordDict();
dictInited=true; dictInited=true;
} }
@ -218,7 +217,7 @@ public class Dictionary {
* @return boolean * @return boolean
*/ */
public boolean isStopWord(char[] charArray , int begin, int length){ public boolean isStopWord(char[] charArray , int begin, int length){
return singleton._StopWordDict.match(charArray, begin, length).isMatch(); return singleton._StopWords.match(charArray, begin, length).isMatch();
} }
/** /**
@ -247,18 +246,17 @@ public class Dictionary {
} }
} while (theWord != null); } while (theWord != null);
} catch (IOException ioe) { } catch (IOException e) {
System.err.println("Main Dictionary loading exception."); logger.error("ik-analyzer",e);
ioe.printStackTrace();
}finally{
}finally{
try { try {
if(is != null){ if(is != null){
is.close(); is.close();
is = null; is = null;
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
} }
//加载扩展词典 //加载扩展词典
@ -275,8 +273,14 @@ public class Dictionary {
InputStream is = null; InputStream is = null;
for(String extDictName : extDictFiles){ for(String extDictName : extDictFiles){
//读取扩展词典文件 //读取扩展词典文件
System.out.println("加载扩展词典:" + extDictName); logger.info("加载扩展词典:" + extDictName);
is = this.getClass().getClassLoader().getResourceAsStream(extDictName); File file=new File(environment.configFile(), extDictName);
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
logger.error("ik-analyzer",e);
}
//如果找不到扩展的字典则忽略 //如果找不到扩展的字典则忽略
if(is == null){ if(is == null){
continue; continue;
@ -288,24 +292,21 @@ public class Dictionary {
theWord = br.readLine(); theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) { if (theWord != null && !"".equals(theWord.trim())) {
//加载扩展词典数据到主内存词典中 //加载扩展词典数据到主内存词典中
//System.out.println(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
} }
} while (theWord != null); } while (theWord != null);
} catch (IOException ioe) { } catch (IOException e) {
System.err.println("Extension Dictionary loading exception."); logger.error("ik-analyzer",e);
ioe.printStackTrace(); }finally{
}finally{
try { try {
if(is != null){ if(is != null){
is.close(); is.close();
is = null; is = null;
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
} }
} }
} }
@ -316,15 +317,21 @@ public class Dictionary {
*/ */
private void loadStopWordDict(){ private void loadStopWordDict(){
//建立一个主词典实例 //建立一个主词典实例
_StopWordDict = new DictSegment((char)0); _StopWords = new DictSegment((char)0);
//加载扩展停止词典 //加载扩展停止词典
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys(); List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
if(extStopWordDictFiles != null){ if(extStopWordDictFiles != null){
InputStream is = null; InputStream is = null;
for(String extStopWordDictName : extStopWordDictFiles){ for(String extStopWordDictName : extStopWordDictFiles){
System.out.println("加载扩展停止词典:" + extStopWordDictName); // logger.info("加载扩展停止词典:" + extStopWordDictName);
//读取扩展词典文件 //读取扩展词典文件
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName); File file=new File(environment.configFile(), extStopWordDictName);
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
logger.error("ik-analyzer",e);
}
//如果找不到扩展的字典则忽略 //如果找不到扩展的字典则忽略
if(is == null){ if(is == null){
continue; continue;
@ -335,15 +342,13 @@ public class Dictionary {
do { do {
theWord = br.readLine(); theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) { if (theWord != null && !"".equals(theWord.trim())) {
//System.out.println(theWord);
//加载扩展停止词典数据到内存中 //加载扩展停止词典数据到内存中
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
} }
} while (theWord != null); } while (theWord != null);
} catch (IOException ioe) { } catch (IOException e) {
System.err.println("Extension Stop word Dictionary loading exception."); logger.error("ik-analyzer",e);
ioe.printStackTrace();
}finally{ }finally{
try { try {
@ -352,7 +357,7 @@ public class Dictionary {
is = null; is = null;
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
} }
} }
@ -371,7 +376,7 @@ public class Dictionary {
try { try {
is = new FileInputStream(file); is = new FileInputStream(file);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
try { try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
@ -384,8 +389,7 @@ public class Dictionary {
} while (theWord != null); } while (theWord != null);
} catch (IOException ioe) { } catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception."); logger.error("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
}finally{ }finally{
try { try {
@ -394,12 +398,129 @@ public class Dictionary {
is = null; is = null;
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); logger.error("ik-analyzer",e);
} }
} }
} }
/**
 * Loads the surname dictionary into {@code _SurnameDict}.
 *
 * <p>The file is resolved as {@code Dictionary.PATH_DIC_SURNAME} relative to
 * {@code environment.configFile()} and decoded as UTF-8. Every non-blank line is
 * trimmed and added to the dictionary as one entry (case is left unchanged).
 * A read error part-way through the file is logged and loading stops, keeping
 * whatever entries were read up to that point.
 *
 * @throws RuntimeException if the dictionary file cannot be opened; the
 *         underlying {@link FileNotFoundException} is attached as the cause
 */
private void loadSurnameDict(){
    _SurnameDict = new DictSegment((char)0);
    File file = new File(environment.configFile(), Dictionary.PATH_DIC_SURNAME);

    InputStream is;
    try {
        is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // Keep the original failure as the cause instead of throwing a bare exception.
        throw new RuntimeException("Surname Dictionary not found!!!", e);
    }

    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        String line;
        while ((line = br.readLine()) != null) {
            String word = line.trim();
            if (!"".equals(word)) {
                _SurnameDict.fillSegment(word.toCharArray());
            }
        }
    } catch (IOException e) {
        // Best effort: a read failure is logged, not propagated.
        logger.error("ik-analyzer", e);
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            logger.error("ik-analyzer", e);
        }
    }
}
/**
 * Loads the suffix dictionary into {@code _SuffixDict}.
 *
 * <p>The file is resolved as {@code Dictionary.PATH_DIC_SUFFIX} relative to
 * {@code environment.configFile()} and decoded as UTF-8. Every non-blank line is
 * trimmed and added to the dictionary as one entry (case is left unchanged).
 * A read error part-way through the file is logged and loading stops, keeping
 * whatever entries were read up to that point.
 *
 * @throws RuntimeException if the dictionary file cannot be opened; the
 *         underlying {@link FileNotFoundException} is attached as the cause
 */
private void loadSuffixDict(){
    _SuffixDict = new DictSegment((char)0);
    File file = new File(environment.configFile(), Dictionary.PATH_DIC_SUFFIX);

    InputStream is;
    try {
        is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // Keep the original failure as the cause instead of throwing a bare exception.
        throw new RuntimeException("Suffix Dictionary not found!!!", e);
    }

    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        String line;
        while ((line = br.readLine()) != null) {
            String word = line.trim();
            if (!"".equals(word)) {
                _SuffixDict.fillSegment(word.toCharArray());
            }
        }
    } catch (IOException e) {
        // Best effort: a read failure is logged, not propagated.
        logger.error("ik-analyzer", e);
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            logger.error("ik-analyzer", e);
        }
    }
}
/**
 * Loads the preposition dictionary into {@code _PrepDict}.
 *
 * <p>The file is resolved as {@code Dictionary.PATH_DIC_PREP} relative to
 * {@code environment.configFile()} and decoded as UTF-8. Every non-blank line is
 * trimmed and added to the dictionary as one entry (case is left unchanged).
 * A read error part-way through the file is logged and loading stops, keeping
 * whatever entries were read up to that point.
 *
 * @throws RuntimeException if the dictionary file cannot be opened; the
 *         underlying {@link FileNotFoundException} is attached as the cause
 */
private void loadPrepDict(){
    _PrepDict = new DictSegment((char)0);
    File file = new File(environment.configFile(), Dictionary.PATH_DIC_PREP);

    InputStream is;
    try {
        is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // Keep the original failure as the cause instead of throwing a bare exception.
        throw new RuntimeException("Preposition Dictionary not found!!!", e);
    }

    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        String line;
        while ((line = br.readLine()) != null) {
            String word = line.trim();
            if (!"".equals(word)) {
                _PrepDict.fillSegment(word.toCharArray());
            }
        }
    } catch (IOException e) {
        // Best effort: a read failure is logged, not propagated.
        logger.error("ik-analyzer", e);
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            logger.error("ik-analyzer", e);
        }
    }
}
public static Dictionary getInstance(){ public static Dictionary getInstance(){
return Dictionary.singleton; return Dictionary.singleton;
} }