BOM handling of dict file (#517)
Signed-off-by: Peng Chen <figroc@gmail.com>
This commit is contained in:
parent
4ab2616a96
commit
7028b9ea05
@ -26,7 +26,6 @@
|
||||
package org.wltea.analyzer.dic;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
@ -201,6 +200,28 @@ public class Dictionary {
|
||||
return files;
|
||||
}
|
||||
|
||||
private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
|
||||
try (InputStream is = new FileInputStream(file.toFile())) {
|
||||
BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(is, "UTF-8"), 512);
|
||||
String word = br.readLine();
|
||||
if (word != null) {
|
||||
if (word.startsWith("\uFEFF"))
|
||||
word = word.substring(1);
|
||||
for (; word != null; word = br.readLine()) {
|
||||
word = word.trim();
|
||||
if (word.isEmpty()) continue;
|
||||
dict.fillSegment(word.toCharArray());
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer: " + name + " not found", e);
|
||||
if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer: " + name + " loading failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getExtDictionarys() {
|
||||
List<String> extDictFiles = new ArrayList<String>(2);
|
||||
String extDictCfg = getProperty(EXT_DICT);
|
||||
@ -371,37 +392,7 @@ public class Dictionary {
|
||||
|
||||
// 读取主词典文件
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
|
||||
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_MainDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_MainDict, file, false, "Main Dict");
|
||||
// 加载扩展词典
|
||||
this.loadExtDict();
|
||||
// 加载远程自定义词库
|
||||
@ -415,44 +406,11 @@ public class Dictionary {
|
||||
// 加载扩展词典配置
|
||||
List<String> extDictFiles = getExtDictionarys();
|
||||
if (extDictFiles != null) {
|
||||
InputStream is = null;
|
||||
for (String extDictName : extDictFiles) {
|
||||
// 读取扩展词典文件
|
||||
logger.info("[Dict Loading] " + extDictName);
|
||||
Path file = PathUtils.get(extDictName);
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
|
||||
// 如果找不到扩展的字典,则忽略
|
||||
if (is == null) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
// 加载扩展词典数据到主内存词典中
|
||||
_MainDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_MainDict, file, false, "Extra Dict");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -533,80 +491,17 @@ public class Dictionary {
|
||||
|
||||
// 读取主词典文件
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
|
||||
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_StopWords, file, false, "Main Stopwords");
|
||||
|
||||
// 加载扩展停止词典
|
||||
List<String> extStopWordDictFiles = getExtStopWordDictionarys();
|
||||
if (extStopWordDictFiles != null) {
|
||||
is = null;
|
||||
for (String extStopWordDictName : extStopWordDictFiles) {
|
||||
logger.info("[Dict Loading] " + extStopWordDictName);
|
||||
|
||||
// 读取扩展词典文件
|
||||
file = PathUtils.get(extStopWordDictName);
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
// 如果找不到扩展的字典,则忽略
|
||||
if (is == null) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
// 加载扩展停止词典数据到内存中
|
||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_StopWords, file, false, "Extra Stopwords");
|
||||
}
|
||||
}
|
||||
|
||||
@ -639,142 +534,25 @@ public class Dictionary {
|
||||
_QuantifierDict = new DictSegment((char) 0);
|
||||
// 读取量词词典文件
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord = null;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
logger.error("Quantifier Dictionary loading exception.");
|
||||
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_QuantifierDict, file, false, "Quantifier");
|
||||
}
|
||||
|
||||
private void loadSurnameDict() {
|
||||
|
||||
_SurnameDict = new DictSegment((char) 0);
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Surname Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_SurnameDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
} finally {
|
||||
try {
|
||||
if (is != null) {
|
||||
is.close();
|
||||
is = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_SurnameDict, file, true, "Surname");
|
||||
}
|
||||
|
||||
private void loadSuffixDict() {
|
||||
|
||||
_SuffixDict = new DictSegment((char) 0);
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Suffix Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_SuffixDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
} finally {
|
||||
try {
|
||||
is.close();
|
||||
is = null;
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_SuffixDict, file, true, "Suffix");
|
||||
}
|
||||
|
||||
private void loadPrepDict() {
|
||||
|
||||
_PrepDict = new DictSegment((char) 0);
|
||||
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file.toFile());
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Preposition Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
|
||||
_PrepDict.fillSegment(theWord.trim().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
} finally {
|
||||
try {
|
||||
is.close();
|
||||
is = null;
|
||||
} catch (IOException e) {
|
||||
logger.error("ik-analyzer", e);
|
||||
}
|
||||
}
|
||||
loadDictFile(_PrepDict, file, true, "Preposition");
|
||||
}
|
||||
|
||||
public void reLoadMainDict() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user