From 7028b9ea05aed580a815c4085d55ff34f5b51245 Mon Sep 17 00:00:00 2001 From: Figroc Chen Date: Mon, 2 Apr 2018 13:29:22 +0800 Subject: [PATCH] BOM handling of dict file (#517) Signed-off-by: Peng Chen --- .../org/wltea/analyzer/dic/Dictionary.java | 282 ++---------------- 1 file changed, 30 insertions(+), 252 deletions(-) diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index a43087c..4e077ce 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -26,7 +26,6 @@ package org.wltea.analyzer.dic; import java.io.BufferedReader; -import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; @@ -201,6 +200,28 @@ public class Dictionary { return files; } + private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) { + try (InputStream is = new FileInputStream(file.toFile())) { + BufferedReader br = new BufferedReader( + new InputStreamReader(is, "UTF-8"), 512); + String word = br.readLine(); + if (word != null) { + if (word.startsWith("\uFEFF")) + word = word.substring(1); + for (; word != null; word = br.readLine()) { + word = word.trim(); + if (word.isEmpty()) continue; + dict.fillSegment(word.toCharArray()); + } + } + } catch (FileNotFoundException e) { + logger.error("ik-analyzer: " + name + " not found", e); + if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e); + } catch (IOException e) { + logger.error("ik-analyzer: " + name + " loading failed", e); + } + } + public List getExtDictionarys() { List extDictFiles = new ArrayList(2); String extDictCfg = getProperty(EXT_DICT); @@ -371,37 +392,7 @@ public class Dictionary { // 读取主词典文件 Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN); - - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error(e.getMessage(), e); - } - - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _MainDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - - } catch (IOException e) { - logger.error("ik-analyzer", e); - - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_MainDict, file, false, "Main Dict"); // 加载扩展词典 this.loadExtDict(); // 加载远程自定义词库 @@ -415,44 +406,11 @@ public class Dictionary { // 加载扩展词典配置 List extDictFiles = getExtDictionarys(); if (extDictFiles != null) { - InputStream is = null; for (String extDictName : extDictFiles) { // 读取扩展词典文件 logger.info("[Dict Loading] " + extDictName); Path file = PathUtils.get(extDictName); - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - - // 如果找不到扩展的字典,则忽略 - if (is == null) { - continue; - } - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - // 加载扩展词典数据到主内存词典中 - _MainDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - - } catch (IOException e) { - logger.error("ik-analyzer", e); - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_MainDict, file, false, "Extra Dict"); } } } @@ -533,80 +491,17 @@ public class Dictionary { // 读取主词典文件 Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP); - - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error(e.getMessage(), e); - } - - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _StopWords.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - - } catch (IOException e) { - logger.error("ik-analyzer", e); - - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_StopWords, file, false, "Main Stopwords"); // 加载扩展停止词典 List extStopWordDictFiles = getExtStopWordDictionarys(); if (extStopWordDictFiles != null) { - is = null; for (String extStopWordDictName : extStopWordDictFiles) { logger.info("[Dict Loading] " + extStopWordDictName); // 读取扩展词典文件 file = PathUtils.get(extStopWordDictName); - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - // 如果找不到扩展的字典,则忽略 - if (is == null) { - continue; - } - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - // 加载扩展停止词典数据到内存中 - _StopWords.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - - } catch (IOException e) { - logger.error("ik-analyzer", e); - - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_StopWords, file, false, "Extra Stopwords"); } } @@ -639,142 +534,25 @@ public class Dictionary { _QuantifierDict = new DictSegment((char) 0); // 读取量词词典文件 Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER); - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _QuantifierDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - - } catch (IOException ioe) { - logger.error("Quantifier Dictionary loading exception."); - - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_QuantifierDict, file, false, "Quantifier"); } private void loadSurnameDict() { - _SurnameDict = new DictSegment((char) 0); Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME); - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - if (is == null) { - throw new RuntimeException("Surname Dictionary not found!!!"); - } - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _SurnameDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - } catch (IOException e) { - logger.error("ik-analyzer", e); - } finally { - try { - if (is != null) { - is.close(); - is = null; - } - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_SurnameDict, file, true, "Surname"); } private void loadSuffixDict() { - _SuffixDict = new DictSegment((char) 0); Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX); - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - if (is == null) { - throw new RuntimeException("Suffix Dictionary not found!!!"); - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _SuffixDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - } catch (IOException e) { - logger.error("ik-analyzer", e); - } finally { - try { - is.close(); - is = null; - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_SuffixDict, file, true, "Suffix"); } private void loadPrepDict() { - _PrepDict = new DictSegment((char) 0); Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP); - InputStream is = null; - try { - is = new FileInputStream(file.toFile()); - } catch (FileNotFoundException e) { - logger.error("ik-analyzer", e); - } - if (is == null) { - throw new RuntimeException("Preposition Dictionary not found!!!"); - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - - _PrepDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - } catch (IOException e) { - logger.error("ik-analyzer", e); - } finally { - try { - is.close(); - is = null; - } catch (IOException e) { - logger.error("ik-analyzer", e); - } - } + loadDictFile(_PrepDict, file, true, "Preposition"); } public void reLoadMainDict() {