diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp
index 1e22ecc..25aa5cf 100644
--- a/include/cppjieba/DictTrie.hpp
+++ b/include/cppjieba/DictTrie.hpp
@@ -50,6 +50,22 @@ class DictTrie {
     return true;
   }
 
+  // Inserts one user word into the trie with a weight derived from `freq`.
+  // A positive freq becomes log(freq / freq_sum_); a non-positive freq uses
+  // user_word_default_weight_, because log() of a non-positive ratio is not a
+  // usable weight. Returns false when MakeNodeInfo fails for `word`.
+  bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    const double weight = freq > 0 ? log(1.0 * freq / freq_sum_) : user_word_default_weight_;
+    if (!MakeNodeInfo(node_info, word, weight, tag)) {
+      return false;
+    }
+    active_node_infos_.push_back(node_info);
+    // The trie keeps a pointer to the element that was just appended.
+    trie_->InsertNode(node_info.word, &active_node_infos_.back());
+    return true;
+  }
+
   const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
     return trie_->Find(begin, end);
   }
@@ -69,6 +85,79 @@ class DictTrie {
     return min_weight_;
   }
 
+  // Parses one user-dictionary line and appends the entry to
+  // static_node_infos_. The line is split on " " into fields:
+  //   "word"           -> default weight, UNKNOWN_TAG
+  //   "word tag"       -> default weight, given tag
+  //   "word freq tag"  -> weight log(freq / freq_sum_), given tag
+  // Lines with any other field count, or for which MakeNodeInfo fails, are
+  // skipped rather than stored as an empty entry.
+  void InserUserDictNode(const string& line) {
+    vector<string> buf;
+    Split(line, buf, " ");
+    if (buf.empty() || buf.size() > 3) {
+      return;
+    }
+    double weight = user_word_default_weight_;
+    string tag = UNKNOWN_TAG;
+    if (buf.size() == 2) {
+      tag = buf[1];
+    } else if (buf.size() == 3) {
+      const int freq = atoi(buf[1].c_str());
+      assert(freq_sum_ > 0.0);
+      // A zero, negative or unparsable frequency keeps the default weight;
+      // log() of a non-positive ratio would produce -inf or NaN.
+      if (freq > 0) {
+        weight = log(1.0 * freq / freq_sum_);
+      }
+      tag = buf[2];
+    }
+    DictUnit node_info;
+    if (!MakeNodeInfo(node_info, buf[0], weight, tag)) {
+      return;
+    }
+    static_node_infos_.push_back(node_info);
+    if (node_info.word.size() == 1) {
+      user_dict_single_chinese_word_.insert(node_info.word[0]);
+    }
+  }
+
+  // Loads user-dictionary entries from in-memory lines, one per element.
+  // NOTE(review): entries are only appended to static_node_infos_ here;
+  // nothing in this path inserts them into trie_. Confirm what callers that
+  // invoke this after the trie has been built are expected to do.
+  void LoadUserDict(const vector<string>& buf) {
+    for (size_t i = 0; i < buf.size(); i++) {
+      InserUserDictNode(buf[i]);
+    }
+  }
+
+  // Loads user-dictionary entries from a set of lines, one per element.
+  void LoadUserDict(const set<string>& buf) {
+    std::set<string>::const_iterator iter;
+    for (iter = buf.begin(); iter != buf.end(); ++iter) {
+      InserUserDictNode(*iter);
+    }
+  }
+
+  // Loads user-dictionary entries from files. `filePaths` is split with
+  // limonp::Split using "|;", so it may name several files. Empty lines are
+  // skipped.
+  void LoadUserDict(const string& filePaths) {
+    vector<string> files = limonp::Split(filePaths, "|;");
+    for (size_t i = 0; i < files.size(); i++) {
+      ifstream ifs(files[i].c_str());
+      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
+      string line;
+      while (getline(ifs, line)) {
+        if (line.empty()) {
+          continue;
+        }
+        InserUserDictNode(line);
+      }
+    }
+  }
+
  private:
   void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
     LoadDict(dict_path);
@@ -95,45 +184,6 @@ class DictTrie {
     trie_ = new Trie(words, valuePointers);
   }
 
-  void LoadUserDict(const string& filePaths) {
-    vector<string> files = limonp::Split(filePaths, "|;");
-    size_t lineno = 0;
-    for (size_t i = 0; i < files.size(); i++) {
-      ifstream ifs(files[i].c_str());
-      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
-      string line;
-      DictUnit node_info;
-      vector<string> buf;
-      for (; getline(ifs, line); lineno++) {
-        if (line.size() == 0) {
-          continue;
-        }
-        buf.clear();
-        Split(line, buf, " ");
-        DictUnit node_info;
-        if(buf.size() == 1){
-          MakeNodeInfo(node_info,
-                buf[0],
-                user_word_default_weight_,
-                UNKNOWN_TAG);
-        } else if (buf.size() == 2) {
-          MakeNodeInfo(node_info,
-                buf[0],
-                user_word_default_weight_,
-                buf[1]);
-        } else if (buf.size() == 3) {
-          int freq = atoi(buf[1].c_str());
-          assert(freq_sum_ > 0.0);
-          double weight = log(1.0 * freq / freq_sum_);
-          MakeNodeInfo(node_info, buf[0], weight, buf[2]);
-        }
-        static_node_infos_.push_back(node_info);
-        if (node_info.word.size() == 1) {
-          user_dict_single_chinese_word_.insert(node_info.word[0]);
-        }
-      }
-    }
-  }
 
   bool MakeNodeInfo(DictUnit& node_info,
         const string& word,
diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp
index ef5cb45..a8f6751 100644
--- a/include/cppjieba/Jieba.hpp
+++ b/include/cppjieba/Jieba.hpp
@@ -84,10 +84,26 @@ class Jieba {
   const DictTrie* GetDictTrie() const {
     return &dict_trie_;
   }
+
   const HMMModel* GetHMMModel() const {
     return &model_;
   }
 
+  // Forwards in-memory user-dictionary lines to dict_trie_.
+  void LoadUserDict(const vector<string>& buf) {
+    dict_trie_.LoadUserDict(buf);
+  }
+
+  // Forwards a set of user-dictionary lines to dict_trie_.
+  void LoadUserDict(const set<string>& buf) {
+    dict_trie_.LoadUserDict(buf);
+  }
+
+  // Forwards a user-dictionary file path string to dict_trie_.
+  void LoadUserDict(const string& path) {
+    dict_trie_.LoadUserDict(path);
+  }
+
  private:
   DictTrie dict_trie_;
   HMMModel model_;