fix dict loading
parent 6fc30fe600
commit 07ba4ece55

pom.xml (2 lines changed)
@@ -6,7 +6,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.elasticsearch</groupId>
     <artifactId>elasticsearch-analysis-ik</artifactId>
-    <version>1.2.0</version>
+    <version>1.2.1</version>
     <packaging>jar</packaging>
     <description>IK Analyzer for ElasticSearch</description>
     <inceptionYear>2009</inceptionYear>
@@ -37,7 +37,7 @@ public class Configuration {
        try {
            input = new FileInputStream(fileConfig);
        } catch (FileNotFoundException e) {
-           e.printStackTrace();
+           logger.error("ik-analyzer",e);
        }
        if(input != null){
            try {
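The new call assumes an ESLogger named logger is already in scope in Configuration; its declaration is not part of this diff. A minimal sketch of how such a field is typically obtained with the Elasticsearch 1.x logging helpers (the field name and the "ik-analyzer" logger name are illustrative):

import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

public class Configuration {

    // Illustrative declaration so the catch block above can call
    // logger.error("ik-analyzer", e) instead of e.printStackTrace().
    private static final ESLogger logger = Loggers.getLogger("ik-analyzer");

    // ... rest of the class unchanged ...
}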
@@ -26,8 +26,8 @@
 package org.wltea.analyzer.dic;
 
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 /**
  * 词典树分段,表示词典树的一个分枝
@@ -35,7 +35,7 @@ import java.util.Map;
 class DictSegment implements Comparable<DictSegment>{
 
    //公用字典表,存储汉字
-   private static final Map<Character , Character> charMap = new HashMap<Character , Character>(16 , 0.95f);
+   private static final Map<Character , Character> charMap = new ConcurrentHashMap<Character , Character>(16 , 0.95f);
    //数组大小上限
    private static final int ARRAY_LENGTH_LIMIT = 3;
 
@@ -298,7 +298,7 @@ class DictSegment implements Comparable<DictSegment>{
        if(this.childrenMap == null){
            synchronized(this){
                if(this.childrenMap == null){
-                   this.childrenMap = new HashMap<Character , DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f);
+                   this.childrenMap = new ConcurrentHashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f);
                }
            }
        }
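Both DictSegment hunks swap HashMap for ConcurrentHashMap: charMap and childrenMap are read by analysis threads while dictionary loading may still be inserting entries, and a plain HashMap offers no guarantees under that mix. A standalone sketch of the pattern the hunk above settles on, double-checked lazy creation of a ConcurrentHashMap; the volatile modifier is an extra safeguard added for this sketch and is not shown in the diff:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class LazyChildren {

    // volatile added for this sketch only; the DictSegment field is not shown as volatile.
    private volatile Map<Character, LazyChildren> childrenMap;

    Map<Character, LazyChildren> getChildrenMap() {
        if (childrenMap == null) {
            synchronized (this) {
                if (childrenMap == null) {
                    // ConcurrentHashMap tolerates concurrent lookups while other
                    // threads keep inserting new child nodes.
                    childrenMap = new ConcurrentHashMap<Character, LazyChildren>(6, 0.8f);
                }
            }
        }
        return childrenMap;
    }
}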
@@ -25,16 +25,16 @@
  */
 package org.wltea.analyzer.dic;
 
-import java.io.*;
-import java.util.Collection;
-import java.util.List;
-
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
 
+import java.io.*;
+import java.util.Collection;
+import java.util.List;
+
 /**
  * 词典管理类,单子模式
  */
@@ -45,20 +45,19 @@ public class Dictionary {
     * 词典单子实例
     */
    private static Dictionary singleton;
 
-   /*
-    * 主词典对象
-    */
-   private DictSegment _MainDict;
-
-   /*
-    * 停止词词典
-    */
-   private DictSegment _StopWordDict;
-   /*
-    * 量词词典
-    */
-   private DictSegment _QuantifierDict;
+   private DictSegment _MainDict;
+
+   private DictSegment _SurnameDict;
+
+   private DictSegment _QuantifierDict;
+
+   private DictSegment _SuffixDict;
+
+   private DictSegment _PrepDict;
+
+   private DictSegment _StopWords;
 
    /**
     * 配置对象
@@ -95,10 +94,10 @@ public class Dictionary {
            environment =new Environment(indexSettings);
            configuration=new Configuration(indexSettings);
            loadMainDict();
-//         loadSurnameDict();
+           loadSurnameDict();
            loadQuantifierDict();
-//         loadSuffixDict();
-//         loadPrepDict();
+           loadSuffixDict();
+           loadPrepDict();
            loadStopWordDict();
            dictInited=true;
        }
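Enabling loadSurnameDict(), loadSuffixDict() and loadPrepDict() also changes the failure mode at startup: the extension and stop-word loaders skip a missing file, but the three loaders added at the end of this commit treat a missing dictionary as fatal, so the files referenced by the PATH_DIC_SURNAME, PATH_DIC_SUFFIX and PATH_DIC_PREP constants now have to exist under the node's config directory. The guard they share, quoted from the loadSurnameDict() hunk below:

// loadSuffixDict() and loadPrepDict() throw the analogous RuntimeException for their files.
if (is == null) {
    throw new RuntimeException("Surname Dictionary not found!!!");
}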
@@ -218,7 +217,7 @@ public class Dictionary {
     * @return boolean
     */
    public boolean isStopWord(char[] charArray , int begin, int length){
-       return singleton._StopWordDict.match(charArray, begin, length).isMatch();
+       return singleton._StopWords.match(charArray, begin, length).isMatch();
    }
 
    /**
@@ -247,18 +246,17 @@ public class Dictionary {
                }
            } while (theWord != null);
 
-       } catch (IOException ioe) {
-           System.err.println("Main Dictionary loading exception.");
-           ioe.printStackTrace();
+       } catch (IOException e) {
+           logger.error("ik-analyzer",e);
        }finally{
            try {
                if(is != null){
                    is.close();
                    is = null;
                }
            } catch (IOException e) {
-               e.printStackTrace();
+               logger.error("ik-analyzer",e);
            }
        }
    }
    //加载扩展词典
@@ -275,8 +273,14 @@ public class Dictionary {
            InputStream is = null;
            for(String extDictName : extDictFiles){
                //读取扩展词典文件
-               System.out.println("加载扩展词典:" + extDictName);
-               is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
+               logger.info("加载扩展词典:" + extDictName);
+               File file=new File(environment.configFile(), extDictName);
+               try {
+                   is = new FileInputStream(file);
+               } catch (FileNotFoundException e) {
+                   logger.error("ik-analyzer",e);
+               }
+
                //如果找不到扩展的字典,则忽略
                if(is == null){
                    continue;
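This is the heart of the fix: extension dictionaries are now resolved against the node's config directory via environment.configFile() instead of being loaded from the plugin classpath, so user-supplied files can sit next to elasticsearch.yml without repackaging the plugin. A sketch of that resolution step in isolation, assuming an Environment built from the index settings as in the constructor hunk earlier; the name "custom/mydict.dic" is a made-up example:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;

import org.elasticsearch.env.Environment;

class DictStreams {

    // Opens e.g. $ES_HOME/config/custom/mydict.dic; returns null when the file is
    // absent so the caller can skip it, mirroring the loop above.
    static InputStream open(Environment environment, String dictName) {
        File file = new File(environment.configFile(), dictName);
        try {
            return new FileInputStream(file);
        } catch (FileNotFoundException e) {
            return null;
        }
    }
}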
@@ -288,24 +292,21 @@ public class Dictionary {
                        theWord = br.readLine();
                        if (theWord != null && !"".equals(theWord.trim())) {
                            //加载扩展词典数据到主内存词典中
-                           //System.out.println(theWord);
                            _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
                        }
                    } while (theWord != null);
 
-               } catch (IOException ioe) {
-                   System.err.println("Extension Dictionary loading exception.");
-                   ioe.printStackTrace();
-
+               } catch (IOException e) {
+                   logger.error("ik-analyzer",e);
                }finally{
                    try {
                        if(is != null){
                            is.close();
                            is = null;
                        }
                    } catch (IOException e) {
-                       e.printStackTrace();
+                       logger.error("ik-analyzer",e);
                    }
                }
            }
        }
    }
@@ -316,15 +317,21 @@ public class Dictionary {
     */
    private void loadStopWordDict(){
        //建立一个主词典实例
-       _StopWordDict = new DictSegment((char)0);
+       _StopWords = new DictSegment((char)0);
        //加载扩展停止词典
        List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
        if(extStopWordDictFiles != null){
            InputStream is = null;
            for(String extStopWordDictName : extStopWordDictFiles){
-               System.out.println("加载扩展停止词典:" + extStopWordDictName);
+//             logger.info("加载扩展停止词典:" + extStopWordDictName);
+
                //读取扩展词典文件
-               is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
+               File file=new File(environment.configFile(), extStopWordDictName);
+               try {
+                   is = new FileInputStream(file);
+               } catch (FileNotFoundException e) {
+                   logger.error("ik-analyzer",e);
+               }
                //如果找不到扩展的字典,则忽略
                if(is == null){
                    continue;
@@ -335,15 +342,13 @@ public class Dictionary {
                    do {
                        theWord = br.readLine();
                        if (theWord != null && !"".equals(theWord.trim())) {
-                           //System.out.println(theWord);
                            //加载扩展停止词典数据到内存中
-                           _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+                           _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
                        }
                    } while (theWord != null);
 
-               } catch (IOException ioe) {
-                   System.err.println("Extension Stop word Dictionary loading exception.");
-                   ioe.printStackTrace();
+               } catch (IOException e) {
+                   logger.error("ik-analyzer",e);
 
                }finally{
                    try {
@@ -352,7 +357,7 @@ public class Dictionary {
                        is = null;
                    }
                } catch (IOException e) {
-                   e.printStackTrace();
+                   logger.error("ik-analyzer",e);
                }
            }
        }
@@ -371,7 +376,7 @@ public class Dictionary {
        try {
            is = new FileInputStream(file);
        } catch (FileNotFoundException e) {
-           e.printStackTrace();
+           logger.error("ik-analyzer",e);
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
@@ -384,8 +389,7 @@ public class Dictionary {
            } while (theWord != null);
 
        } catch (IOException ioe) {
-           System.err.println("Quantifier Dictionary loading exception.");
-           ioe.printStackTrace();
+           logger.error("Quantifier Dictionary loading exception.");
 
        }finally{
            try {
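Unlike the other rewritten catch blocks, this one logs only the message string and drops ioe, so the stack trace that printStackTrace() used to show is lost. If that is not intended, the Throwable-accepting ESLogger overload (the same shape as the logger.error("ik-analyzer", e) calls elsewhere in this commit) would keep it; a hedged sketch, not part of the commit:

import java.io.IOException;

import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

class QuantifierLoadReporting {

    private static final ESLogger logger = Loggers.getLogger("ik-analyzer");

    // Passes the cause along so the stack trace is preserved in the log.
    static void report(IOException ioe) {
        logger.error("Quantifier Dictionary loading exception.", ioe);
    }
}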
@@ -394,12 +398,129 @@ public class Dictionary {
                    is = null;
                }
            } catch (IOException e) {
-               e.printStackTrace();
+               logger.error("ik-analyzer",e);
            }
        }
    }
 
 
+   private void loadSurnameDict(){
+
+       _SurnameDict = new DictSegment((char)0);
+       File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
+       InputStream is = null;
+       try {
+           is = new FileInputStream(file);
+       } catch (FileNotFoundException e) {
+           logger.error("ik-analyzer",e);
+       }
+       if(is == null){
+           throw new RuntimeException("Surname Dictionary not found!!!");
+       }
+       try {
+           BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
+           String theWord;
+           do {
+               theWord = br.readLine();
+               if (theWord != null && !"".equals(theWord.trim())) {
+                   _SurnameDict.fillSegment(theWord.trim().toCharArray());
+               }
+           } while (theWord != null);
+//         logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
+       } catch (IOException e) {
+           logger.error("ik-analyzer",e);
+       }finally{
+           try {
+               if(is != null){
+                   is.close();
+                   is = null;
+               }
+           } catch (IOException e) {
+               logger.error("ik-analyzer",e);
+           }
+       }
+   }
+
+
+
+   private void loadSuffixDict(){
+
+       _SuffixDict = new DictSegment((char)0);
+       File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
+       InputStream is = null;
+       try {
+           is = new FileInputStream(file);
+       } catch (FileNotFoundException e) {
+           logger.error("ik-analyzer",e);
+       }
+       if(is == null){
+           throw new RuntimeException("Suffix Dictionary not found!!!");
+       }
+       try {
+
+           BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
+           String theWord;
+           do {
+               theWord = br.readLine();
+               if (theWord != null && !"".equals(theWord.trim())) {
+                   _SuffixDict.fillSegment(theWord.trim().toCharArray());
+               }
+           } while (theWord != null);
+//         logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
+       } catch (IOException e) {
+           logger.error("ik-analyzer",e);
+       }finally{
+           try {
+               if(is != null){
+                   is.close();
+                   is = null;
+               }
+           } catch (IOException e) {
+               logger.error("ik-analyzer",e);
+           }
+       }
+   }
+
+
+   private void loadPrepDict(){
+
+       _PrepDict = new DictSegment((char)0);
+       File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
+       InputStream is = null;
+       try {
+           is = new FileInputStream(file);
+       } catch (FileNotFoundException e) {
+           logger.error("ik-analyzer",e);
+       }
+       if(is == null){
+           throw new RuntimeException("Preposition Dictionary not found!!!");
+       }
+       try {
+
+           BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
+           String theWord;
+           do {
+               theWord = br.readLine();
+               if (theWord != null && !"".equals(theWord.trim())) {
+
+                   _PrepDict.fillSegment(theWord.trim().toCharArray());
+               }
+           } while (theWord != null);
+//         logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
+       } catch (IOException e) {
+           logger.error("ik-analyzer",e);
+       }finally{
+           try {
+               if(is != null){
+                   is.close();
+                   is = null;
+               }
+           } catch (IOException e) {
+               logger.error("ik-analyzer",e);
+           }
+       }
+   }
+
    public static Dictionary getInstance(){
        return Dictionary.singleton;
    }
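loadSurnameDict(), loadSuffixDict() and loadPrepDict() repeat the same open/read/close sequence with different target fields, path constants and error strings. Not part of this commit, but a sketch of a shared helper they could be collapsed into inside Dictionary; it relies on the environment and logger fields and the DictSegment API already used above, and the method name loadRequiredDict is made up:

    // Refactoring sketch only: one loader for any required dictionary file under the config directory.
    private DictSegment loadRequiredDict(String dicPath, String dictLabel) {
        DictSegment dict = new DictSegment((char) 0);
        InputStream is = null;
        try {
            is = new FileInputStream(new File(environment.configFile(), dicPath));
        } catch (FileNotFoundException e) {
            logger.error("ik-analyzer", e);
        }
        if (is == null) {
            throw new RuntimeException(dictLabel + " Dictionary not found!!!");
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
            String theWord;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    dict.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);
        } catch (IOException e) {
            logger.error("ik-analyzer", e);
        } finally {
            try {
                if (is != null) {
                    is.close();
                }
            } catch (IOException e) {
                logger.error("ik-analyzer", e);
            }
        }
        return dict;
    }

With that helper, loadSurnameDict() would reduce to _SurnameDict = loadRequiredDict(Dictionary.PATH_DIC_SURNAME, "Surname"); and the suffix and preposition loaders would follow the same one-liner.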