From 07ba4ece55eca3f15858227463458b6b15cb4a77 Mon Sep 17 00:00:00 2001 From: medcl Date: Fri, 31 May 2013 14:32:22 +0800 Subject: [PATCH] fix dict loading --- pom.xml | 2 +- .../org/wltea/analyzer/cfg/Configuration.java | 2 +- .../org/wltea/analyzer/dic/DictSegment.java | 6 +- .../org/wltea/analyzer/dic/Dictionary.java | 223 ++++++++++++++---- 4 files changed, 177 insertions(+), 56 deletions(-) diff --git a/pom.xml b/pom.xml index 8a7b5b4..1b51527 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-ik - 1.2.0 + 1.2.1 jar IK Analyzer for ElasticSearch 2009 diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index 51343b4..a74f346 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -37,7 +37,7 @@ public class Configuration { try { input = new FileInputStream(fileConfig); } catch (FileNotFoundException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } if(input != null){ try { diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java index c34c5e2..7e2f420 100644 --- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -26,8 +26,8 @@ package org.wltea.analyzer.dic; import java.util.Arrays; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; /** * 词典树分段,表示词典树的一个分枝 @@ -35,7 +35,7 @@ import java.util.Map; class DictSegment implements Comparable{ //公用字典表,存储汉字 - private static final Map charMap = new HashMap(16 , 0.95f); + private static final Map charMap = new ConcurrentHashMap(16 , 0.95f); //数组大小上限 private static final int ARRAY_LENGTH_LIMIT = 3; @@ -298,7 +298,7 @@ class DictSegment implements Comparable{ if(this.childrenMap == null){ synchronized(this){ if(this.childrenMap == null){ - this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); + this.childrenMap = new ConcurrentHashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); } } } diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 36ea8e3..a5bf8ae 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -25,16 +25,16 @@ */ package org.wltea.analyzer.dic; -import java.io.*; -import java.util.Collection; -import java.util.List; - import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.wltea.analyzer.cfg.Configuration; +import java.io.*; +import java.util.Collection; +import java.util.List; + /** * 词典管理类,单子模式 */ @@ -45,20 +45,19 @@ public class Dictionary { * 词典单子实例 */ private static Dictionary singleton; - - /* - * 主词典对象 - */ - private DictSegment _MainDict; - - /* - * 停止词词典 - */ - private DictSegment _StopWordDict; - /* - * 量词词典 - */ - private DictSegment _QuantifierDict; + + private DictSegment _MainDict; + + private DictSegment _SurnameDict; + + private DictSegment _QuantifierDict; + + private DictSegment _SuffixDict; + + private DictSegment _PrepDict; + + private DictSegment _StopWords; + /** * 配置对象 @@ -95,10 +94,10 @@ public class Dictionary { environment =new Environment(indexSettings); configuration=new Configuration(indexSettings); loadMainDict(); -// loadSurnameDict(); + loadSurnameDict(); loadQuantifierDict(); -// loadSuffixDict(); -// loadPrepDict(); + loadSuffixDict(); + loadPrepDict(); loadStopWordDict(); dictInited=true; } @@ -218,7 +217,7 @@ public class Dictionary { * @return boolean */ public boolean isStopWord(char[] charArray , int begin, int length){ - return singleton._StopWordDict.match(charArray, begin, length).isMatch(); + return singleton._StopWords.match(charArray, begin, length).isMatch(); } /** @@ -247,18 +246,17 @@ public class Dictionary { } } while (theWord != null); - } catch (IOException ioe) { - System.err.println("Main Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ + } catch (IOException e) { + logger.error("ik-analyzer",e); + + }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } } //加载扩展词典 @@ -275,8 +273,14 @@ public class Dictionary { InputStream is = null; for(String extDictName : extDictFiles){ //读取扩展词典文件 - System.out.println("加载扩展词典:" + extDictName); - is = this.getClass().getClassLoader().getResourceAsStream(extDictName); + logger.info("加载扩展词典:" + extDictName); + File file=new File(environment.configFile(), extDictName); + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + logger.error("ik-analyzer",e); + } + //如果找不到扩展的字典,则忽略 if(is == null){ continue; @@ -288,24 +292,21 @@ public class Dictionary { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { //加载扩展词典数据到主内存词典中 - //System.out.println(theWord); _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - } catch (IOException ioe) { - System.err.println("Extension Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ + } catch (IOException e) { + logger.error("ik-analyzer",e); + }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { - e.printStackTrace(); - } + logger.error("ik-analyzer",e); + } } } } @@ -316,15 +317,21 @@ public class Dictionary { */ private void loadStopWordDict(){ //建立一个主词典实例 - _StopWordDict = new DictSegment((char)0); + _StopWords = new DictSegment((char)0); //加载扩展停止词典 List extStopWordDictFiles = configuration.getExtStopWordDictionarys(); if(extStopWordDictFiles != null){ InputStream is = null; for(String extStopWordDictName : extStopWordDictFiles){ - System.out.println("加载扩展停止词典:" + extStopWordDictName); +// logger.info("加载扩展停止词典:" + extStopWordDictName); + //读取扩展词典文件 - is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName); + File file=new File(environment.configFile(), extStopWordDictName); + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + logger.error("ik-analyzer",e); + } //如果找不到扩展的字典,则忽略 if(is == null){ continue; @@ -335,15 +342,13 @@ public class Dictionary { do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { - //System.out.println(theWord); //加载扩展停止词典数据到内存中 - _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); + _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - } catch (IOException ioe) { - System.err.println("Extension Stop word Dictionary loading exception."); - ioe.printStackTrace(); + } catch (IOException e) { + logger.error("ik-analyzer",e); }finally{ try { @@ -352,7 +357,7 @@ public class Dictionary { is = null; } } catch (IOException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } } } @@ -371,7 +376,7 @@ public class Dictionary { try { is = new FileInputStream(file); } catch (FileNotFoundException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); @@ -384,8 +389,7 @@ public class Dictionary { } while (theWord != null); } catch (IOException ioe) { - System.err.println("Quantifier Dictionary loading exception."); - ioe.printStackTrace(); + logger.error("Quantifier Dictionary loading exception."); }finally{ try { @@ -394,12 +398,129 @@ public class Dictionary { is = null; } } catch (IOException e) { - e.printStackTrace(); + logger.error("ik-analyzer",e); } } } + private void loadSurnameDict(){ + + _SurnameDict = new DictSegment((char)0); + File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME); + InputStream is = null; + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + logger.error("ik-analyzer",e); + } + if(is == null){ + throw new RuntimeException("Surname Dictionary not found!!!"); + } + try { + BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); + String theWord; + do { + theWord = br.readLine(); + if (theWord != null && !"".equals(theWord.trim())) { + _SurnameDict.fillSegment(theWord.trim().toCharArray()); + } + } while (theWord != null); +// logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum()); + } catch (IOException e) { + logger.error("ik-analyzer",e); + }finally{ + try { + if(is != null){ + is.close(); + is = null; + } + } catch (IOException e) { + logger.error("ik-analyzer",e); + } + } + } + + + + private void loadSuffixDict(){ + + _SuffixDict = new DictSegment((char)0); + File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX); + InputStream is = null; + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + logger.error("ik-analyzer",e); + } + if(is == null){ + throw new RuntimeException("Suffix Dictionary not found!!!"); + } + try { + + BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); + String theWord; + do { + theWord = br.readLine(); + if (theWord != null && !"".equals(theWord.trim())) { + _SuffixDict.fillSegment(theWord.trim().toCharArray()); + } + } while (theWord != null); +// logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum()); + } catch (IOException e) { + logger.error("ik-analyzer",e); + }finally{ + try { + if(is != null){ + is.close(); + is = null; + } + } catch (IOException e) { + logger.error("ik-analyzer",e); + } + } + } + + + private void loadPrepDict(){ + + _PrepDict = new DictSegment((char)0); + File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP); + InputStream is = null; + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + logger.error("ik-analyzer",e); + } + if(is == null){ + throw new RuntimeException("Preposition Dictionary not found!!!"); + } + try { + + BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); + String theWord; + do { + theWord = br.readLine(); + if (theWord != null && !"".equals(theWord.trim())) { + + _PrepDict.fillSegment(theWord.trim().toCharArray()); + } + } while (theWord != null); +// logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum()); + } catch (IOException e) { + logger.error("ik-analyzer",e); + }finally{ + try { + if(is != null){ + is.close(); + is = null; + } + } catch (IOException e) { + logger.error("ik-analyzer",e); + } + } + } + public static Dictionary getInstance(){ return Dictionary.singleton; }