refactor DictTrie, and expose function: insertUserWord

This commit is contained in:
yanyiwu 2015-06-26 11:49:35 +08:00
parent ee255baf56
commit 1d27559209

View File

@ -46,10 +46,10 @@ class DictTrie {
loadDict_(dictPath); loadDict_(dictPath);
calculateWeight_(nodeInfos_); calculateWeight_(nodeInfos_);
minWeight_ = findMinWeight_(nodeInfos_); minWeight_ = findMinWeight_(nodeInfos_);
maxWeight_ = findMaxWeight_(nodeInfos_);
if(userDictPath.size()) { if(userDictPath.size()) {
double maxWeight = findMaxWeight_(nodeInfos_); loadUserDict_(userDictPath);
loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
} }
shrink_(nodeInfos_); shrink_(nodeInfos_);
trie_ = createTrie_(nodeInfos_); trie_ = createTrie_(nodeInfos_);
@ -62,13 +62,25 @@ class DictTrie {
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const { bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
return trie_->find(begin, end, dag, offset); return trie_->find(begin, end, dag, offset);
} }
void find( void find(Unicode::const_iterator begin,
Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<SegmentChar>& res vector<SegmentChar>& res) const {
) const {
trie_->find(begin, end, res); trie_->find(begin, end, res);
} }
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
DictUnit nodeInfo;
if(!TransCode::decode(word, nodeInfo.word)) {
LogError("decode %s failed.", word.c_str());
return false;
}
if(nodeInfo.word.size() == 1) {
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
}
nodeInfo.weight = maxWeight_;
nodeInfo.tag = tag;
nodeInfos_.push_back(nodeInfo);
return true;
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const { bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
return isIn(userDictSingleChineseWord_, word); return isIn(userDictSingleChineseWord_, word);
} }
@ -90,7 +102,7 @@ class DictTrie {
Trie * trie = new Trie(words, valuePointers); Trie * trie = new Trie(words, valuePointers);
return trie; return trie;
} }
void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) { void loadUserDict_(const string& filePath) {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs.is_open()) { if(!ifs.is_open()) {
LogFatal("file %s open failed.", filePath.c_str()); LogFatal("file %s open failed.", filePath.c_str());
@ -105,19 +117,21 @@ class DictTrie {
if(buf.size() < 1) { if(buf.size() < 1) {
LogFatal("split [%s] result illegal", line.c_str()); LogFatal("split [%s] result illegal", line.c_str());
} }
if(!TransCode::decode(buf[0], nodeInfo.word)) { insertUserWord(buf[0], (buf.size() == 2 ? buf[1] : UNKNOWN_TAG));
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
if(nodeInfo.word.size() == 1) {
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
}
nodeInfo.weight = defaultWeight;
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
nodeInfos_.push_back(nodeInfo);
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
bool insertWord_(const string& word, double weight, const string& tag) {
DictUnit nodeInfo;
if(!TransCode::decode(word, nodeInfo.word)) {
LogError("decode %s failed.", word.c_str());
return false;
}
nodeInfo.weight = weight;
nodeInfo.tag = tag;
nodeInfos_.push_back(nodeInfo);
return true;
}
void loadDict_(const string& filePath) { void loadDict_(const string& filePath) {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs.is_open()) { if(!ifs.is_open()) {
@ -132,15 +146,7 @@ class DictTrie {
if(buf.size() != DICT_COLUMN_NUM) { if(buf.size() != DICT_COLUMN_NUM) {
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size()); LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
} }
insertWord_(buf[0], atof(buf[1].c_str()), buf[2]);
if(!TransCode::decode(buf[0], nodeInfo.word)) {
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
nodeInfos_.push_back(nodeInfo);
} }
} }
double findMinWeight_(const vector<DictUnit>& nodeInfos) const { double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
@ -180,6 +186,7 @@ class DictTrie {
Trie * trie_; Trie * trie_;
double minWeight_; double minWeight_;
double maxWeight_;
unordered_set<Unicode::value_type> userDictSingleChineseWord_; unordered_set<Unicode::value_type> userDictSingleChineseWord_;
}; };
} }