From d56e5c0659e40add9f1f9832422de4e240eb2d82 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 00:44:33 +0800 Subject: [PATCH 1/3] InsertUserWord with freq arg,expose InserUserDictNode with vector arg --- include/cppjieba/DictTrie.hpp | 60 ++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index 1e22ecc..aaefb04 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -50,6 +50,17 @@ class DictTrie { return true; } + bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + DictUnit node_info; + double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ; + if (!MakeNodeInfo(node_info, word, weight , tag)) { + return false; + } + active_node_infos_.push_back(node_info); + trie_->InsertNode(node_info.word, &active_node_infos_.back()); + return true; + } + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } @@ -69,6 +80,30 @@ class DictTrie { return min_weight_; } + void InserUserDictNode(vector& buf){ + DictUnit node_info; + if(buf.size() == 1){ + MakeNodeInfo(node_info, + buf[0], + user_word_default_weight_, + UNKNOWN_TAG); + } else if (buf.size() == 2) { + MakeNodeInfo(node_info, + buf[0], + user_word_default_weight_, + buf[1]); + } else if (buf.size() == 3) { + int freq = atoi(buf[1].c_str()); + assert(freq_sum_ > 0.0); + double weight = log(1.0 * freq / freq_sum_); + MakeNodeInfo(node_info, buf[0], weight, buf[2]); + } + static_node_infos_.push_back(node_info); + if (node_info.word.size() == 1) { + user_dict_single_chinese_word_.insert(node_info.word[0]); + } + } + private: void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { LoadDict(dict_path); @@ -95,6 +130,8 @@ class DictTrie { trie_ = new Trie(words, valuePointers); } + + void LoadUserDict(const string& filePaths) { vector files = limonp::Split(filePaths, "|;"); size_t lineno = 0; @@ -102,7 +139,6 @@ class DictTrie { ifstream ifs(files[i].c_str()); XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; string line; - DictUnit node_info; vector buf; for (; getline(ifs, line); lineno++) { if (line.size() == 0) { @@ -110,27 +146,7 @@ class DictTrie { } buf.clear(); Split(line, buf, " "); - DictUnit node_info; - if(buf.size() == 1){ - MakeNodeInfo(node_info, - buf[0], - user_word_default_weight_, - UNKNOWN_TAG); - } else if (buf.size() == 2) { - MakeNodeInfo(node_info, - buf[0], - user_word_default_weight_, - buf[1]); - } else if (buf.size() == 3) { - int freq = atoi(buf[1].c_str()); - assert(freq_sum_ > 0.0); - double weight = log(1.0 * freq / freq_sum_); - MakeNodeInfo(node_info, buf[0], weight, buf[2]); - } - static_node_infos_.push_back(node_info); - if (node_info.word.size() == 1) { - user_dict_single_chinese_word_.insert(node_info.word[0]); - } + InserUserDictNode(buf); } } } From 1066bc085ed637ce42f2e96d449086ad24ce277b Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 01:32:47 +0800 Subject: [PATCH 2/3] fix input type ,expose to Jieba --- include/cppjieba/DictTrie.hpp | 16 +++++++++++----- include/cppjieba/Jieba.hpp | 4 ++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index aaefb04..d4d2043 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -80,8 +80,10 @@ class DictTrie { return min_weight_; } - void InserUserDictNode(vector& buf){ + void InserUserDictNode(const string& line){ + vector buf; DictUnit node_info; + Split(line, buf, " "); if(buf.size() == 1){ MakeNodeInfo(node_info, buf[0], @@ -103,6 +105,12 @@ class DictTrie { user_dict_single_chinese_word_.insert(node_info.word[0]); } } + + void LoadUserDict(vector& buf){ + for (size_t i = 0; i < buf.size(); i++) { + InserUserDictNode(buf[i]); + } + } private: void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { @@ -139,14 +147,12 @@ class DictTrie { ifstream ifs(files[i].c_str()); XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; string line; - vector buf; + for (; getline(ifs, line); lineno++) { if (line.size() == 0) { continue; } - buf.clear(); - Split(line, buf, " "); - InserUserDictNode(buf); + InserUserDictNode(line); } } } diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index ef5cb45..2062a75 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -88,6 +88,10 @@ class Jieba { return &model_; } + void LoadUserDict(vector& buf) { + dict_trie_.LoadUserDict(buf); + } + private: DictTrie dict_trie_; HMMModel model_; From 1e1e585194d0816b369464890db7779436d5da21 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 14:23:01 +0800 Subject: [PATCH 3/3] LoadUserDict by set,vector --- include/cppjieba/DictTrie.hpp | 45 +++++++++++++++++++++-------------- include/cppjieba/Jieba.hpp | 11 ++++++++- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index d4d2043..25aa5cf 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -80,7 +80,7 @@ class DictTrie { return min_weight_; } - void InserUserDictNode(const string& line){ + void InserUserDictNode(const string& line) { vector buf; DictUnit node_info; Split(line, buf, " "); @@ -106,12 +106,37 @@ class DictTrie { } } - void LoadUserDict(vector& buf){ + void LoadUserDict(const vector& buf) { for (size_t i = 0; i < buf.size(); i++) { InserUserDictNode(buf[i]); } } + void LoadUserDict(const set& buf) { + std::set::const_iterator iter; + for (iter = buf.begin(); iter != buf.end(); iter++){ + InserUserDictNode(*iter); + } + } + + void LoadUserDict(const string& filePaths) { + vector files = limonp::Split(filePaths, "|;"); + size_t lineno = 0; + for (size_t i = 0; i < files.size(); i++) { + ifstream ifs(files[i].c_str()); + XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; + string line; + + for (; getline(ifs, line); lineno++) { + if (line.size() == 0) { + continue; + } + InserUserDictNode(line); + } + } + } + + private: void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { LoadDict(dict_path); @@ -140,22 +165,6 @@ class DictTrie { - void LoadUserDict(const string& filePaths) { - vector files = limonp::Split(filePaths, "|;"); - size_t lineno = 0; - for (size_t i = 0; i < files.size(); i++) { - ifstream ifs(files[i].c_str()); - XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; - string line; - - for (; getline(ifs, line); lineno++) { - if (line.size() == 0) { - continue; - } - InserUserDictNode(line); - } - } - } bool MakeNodeInfo(DictUnit& node_info, const string& word, diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 2062a75..a8f6751 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -84,14 +84,23 @@ class Jieba { const DictTrie* GetDictTrie() const { return &dict_trie_; } + const HMMModel* GetHMMModel() const { return &model_; } - void LoadUserDict(vector& buf) { + void LoadUserDict(const vector& buf) { dict_trie_.LoadUserDict(buf); } + void LoadUserDict(const set& buf) { + dict_trie_.LoadUserDict(buf); + } + + void LoadUserDict(const string& path) { + dict_trie_.LoadUserDict(path); + } + private: DictTrie dict_trie_; HMMModel model_;