mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
refactor DictTrie, and expose function: insertUserWord
This commit is contained in:
parent
ee255baf56
commit
1d27559209
@ -46,10 +46,10 @@ class DictTrie {
|
|||||||
loadDict_(dictPath);
|
loadDict_(dictPath);
|
||||||
calculateWeight_(nodeInfos_);
|
calculateWeight_(nodeInfos_);
|
||||||
minWeight_ = findMinWeight_(nodeInfos_);
|
minWeight_ = findMinWeight_(nodeInfos_);
|
||||||
|
maxWeight_ = findMaxWeight_(nodeInfos_);
|
||||||
|
|
||||||
if(userDictPath.size()) {
|
if(userDictPath.size()) {
|
||||||
double maxWeight = findMaxWeight_(nodeInfos_);
|
loadUserDict_(userDictPath);
|
||||||
loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
|
|
||||||
}
|
}
|
||||||
shrink_(nodeInfos_);
|
shrink_(nodeInfos_);
|
||||||
trie_ = createTrie_(nodeInfos_);
|
trie_ = createTrie_(nodeInfos_);
|
||||||
@ -62,13 +62,25 @@ class DictTrie {
|
|||||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
|
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
|
||||||
return trie_->find(begin, end, dag, offset);
|
return trie_->find(begin, end, dag, offset);
|
||||||
}
|
}
|
||||||
void find(
|
void find(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator begin,
|
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<SegmentChar>& res
|
vector<SegmentChar>& res) const {
|
||||||
) const {
|
|
||||||
trie_->find(begin, end, res);
|
trie_->find(begin, end, res);
|
||||||
}
|
}
|
||||||
|
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
|
DictUnit nodeInfo;
|
||||||
|
if(!TransCode::decode(word, nodeInfo.word)) {
|
||||||
|
LogError("decode %s failed.", word.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if(nodeInfo.word.size() == 1) {
|
||||||
|
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
|
||||||
|
}
|
||||||
|
nodeInfo.weight = maxWeight_;
|
||||||
|
nodeInfo.tag = tag;
|
||||||
|
nodeInfos_.push_back(nodeInfo);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
||||||
return isIn(userDictSingleChineseWord_, word);
|
return isIn(userDictSingleChineseWord_, word);
|
||||||
}
|
}
|
||||||
@ -90,7 +102,7 @@ class DictTrie {
|
|||||||
Trie * trie = new Trie(words, valuePointers);
|
Trie * trie = new Trie(words, valuePointers);
|
||||||
return trie;
|
return trie;
|
||||||
}
|
}
|
||||||
void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) {
|
void loadUserDict_(const string& filePath) {
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs.is_open()) {
|
if(!ifs.is_open()) {
|
||||||
LogFatal("file %s open failed.", filePath.c_str());
|
LogFatal("file %s open failed.", filePath.c_str());
|
||||||
@ -105,19 +117,21 @@ class DictTrie {
|
|||||||
if(buf.size() < 1) {
|
if(buf.size() < 1) {
|
||||||
LogFatal("split [%s] result illegal", line.c_str());
|
LogFatal("split [%s] result illegal", line.c_str());
|
||||||
}
|
}
|
||||||
if(!TransCode::decode(buf[0], nodeInfo.word)) {
|
insertUserWord(buf[0], (buf.size() == 2 ? buf[1] : UNKNOWN_TAG));
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if(nodeInfo.word.size() == 1) {
|
|
||||||
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
|
|
||||||
}
|
|
||||||
nodeInfo.weight = defaultWeight;
|
|
||||||
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
|
|
||||||
nodeInfos_.push_back(nodeInfo);
|
|
||||||
}
|
}
|
||||||
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||||
}
|
}
|
||||||
|
bool insertWord_(const string& word, double weight, const string& tag) {
|
||||||
|
DictUnit nodeInfo;
|
||||||
|
if(!TransCode::decode(word, nodeInfo.word)) {
|
||||||
|
LogError("decode %s failed.", word.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
nodeInfo.weight = weight;
|
||||||
|
nodeInfo.tag = tag;
|
||||||
|
nodeInfos_.push_back(nodeInfo);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
void loadDict_(const string& filePath) {
|
void loadDict_(const string& filePath) {
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs.is_open()) {
|
if(!ifs.is_open()) {
|
||||||
@ -132,15 +146,7 @@ class DictTrie {
|
|||||||
if(buf.size() != DICT_COLUMN_NUM) {
|
if(buf.size() != DICT_COLUMN_NUM) {
|
||||||
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
|
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
|
||||||
}
|
}
|
||||||
|
insertWord_(buf[0], atof(buf[1].c_str()), buf[2]);
|
||||||
if(!TransCode::decode(buf[0], nodeInfo.word)) {
|
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
nodeInfo.weight = atof(buf[1].c_str());
|
|
||||||
nodeInfo.tag = buf[2];
|
|
||||||
|
|
||||||
nodeInfos_.push_back(nodeInfo);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
|
double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
|
||||||
@ -180,6 +186,7 @@ class DictTrie {
|
|||||||
Trie * trie_;
|
Trie * trie_;
|
||||||
|
|
||||||
double minWeight_;
|
double minWeight_;
|
||||||
|
double maxWeight_;
|
||||||
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
|
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user