diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index 97b4643..ea8916c 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -1,15 +1,15 @@ #ifndef CPPJIEBA_DICT_TRIE_HPP #define CPPJIEBA_DICT_TRIE_HPP -#include +#include #include -#include -#include #include #include -#include #include -#include +#include +#include +#include +#include #include "limonp/StringUtil.hpp" #include "limonp/Logging.hpp" #include "Unicode.hpp" @@ -17,8 +17,6 @@ namespace cppjieba { -using namespace limonp; - const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100; const size_t DICT_COLUMN_NUM = 3; @@ -32,7 +30,7 @@ class DictTrie { WordWeightMax, }; // enum UserWordWeightOption - DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) { + DictTrie(const std::string& dict_path, const std::string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) { Init(dict_path, user_dict_paths, user_word_weight_opt); } @@ -40,7 +38,7 @@ class DictTrie { delete trie_; } - bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + bool InsertUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) { DictUnit node_info; if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { return false; @@ -50,7 +48,7 @@ class DictTrie { return true; } - bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + bool InsertUserWord(const std::string& word,int freq, const std::string& tag = UNKNOWN_TAG) { DictUnit node_info; double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ; if (!MakeNodeInfo(node_info, word, weight , tag)) { @@ -61,7 +59,7 @@ class DictTrie { return true; } - bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + bool DeleteUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) { DictUnit node_info; if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { return false; @@ -69,19 +67,19 @@ class DictTrie { trie_->DeleteNode(node_info.word, &node_info); return true; } - + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } - void Find(RuneStrArray::const_iterator begin, - RuneStrArray::const_iterator end, - vector&res, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + std::vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { trie_->Find(begin, end, res, max_word_len); } - bool Find(const string& word) + bool Find(const std::string& word) { const DictUnit *tmp = NULL; RuneStrArray runes; @@ -108,18 +106,18 @@ class DictTrie { return min_weight_; } - void InserUserDictNode(const string& line) { - vector buf; + void InserUserDictNode(const std::string& line) { + std::vector buf; DictUnit node_info; - Split(line, buf, " "); + limonp::Split(line, buf, " "); if(buf.size() == 1){ - MakeNodeInfo(node_info, - buf[0], + MakeNodeInfo(node_info, + buf[0], user_word_default_weight_, UNKNOWN_TAG); } else if (buf.size() == 2) { - MakeNodeInfo(node_info, - buf[0], + MakeNodeInfo(node_info, + buf[0], user_word_default_weight_, buf[1]); } else if (buf.size() == 3) { @@ -133,27 +131,27 @@ class DictTrie { user_dict_single_chinese_word_.insert(node_info.word[0]); } } - - void LoadUserDict(const vector& buf) { + + void LoadUserDict(const std::vector& buf) { for (size_t i = 0; i < buf.size(); i++) { InserUserDictNode(buf[i]); } } - void LoadUserDict(const set& buf) { - std::set::const_iterator iter; + void LoadUserDict(const std::set& buf) { + std::set::const_iterator iter; for (iter = buf.begin(); iter != buf.end(); iter++){ InserUserDictNode(*iter); } } - void LoadUserDict(const string& filePaths) { - vector files = limonp::Split(filePaths, "|;"); + void LoadUserDict(const std::string& filePaths) { + std::vector files = limonp::Split(filePaths, "|;"); for (size_t i = 0; i < files.size(); i++) { - ifstream ifs(files[i].c_str()); - XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; - string line; - + std::ifstream ifs(files[i].c_str()); + XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; + std::string line; + while(getline(ifs, line)) { if (line.size() == 0) { continue; @@ -165,7 +163,7 @@ class DictTrie { private: - void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { + void Init(const std::string& dict_path, const std::string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { LoadDict(dict_path); freq_sum_ = CalcFreqSum(static_node_infos_); CalculateWeight(static_node_infos_, freq_sum_); @@ -177,11 +175,11 @@ class DictTrie { Shrink(static_node_infos_); CreateTrie(static_node_infos_); } - - void CreateTrie(const vector& dictUnits) { + + void CreateTrie(const std::vector& dictUnits) { assert(dictUnits.size()); - vector words; - vector valuePointers; + std::vector words; + std::vector valuePointers; for (size_t i = 0 ; i < dictUnits.size(); i ++) { words.push_back(dictUnits[i].word); valuePointers.push_back(&dictUnits[i]); @@ -190,13 +188,10 @@ class DictTrie { trie_ = new Trie(words, valuePointers); } - - - bool MakeNodeInfo(DictUnit& node_info, - const string& word, - double weight, - const string& tag) { + const std::string& word, + double weight, + const std::string& tag) { if (!DecodeUTF8RunesInString(word, node_info.word)) { XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word; return false; @@ -206,19 +201,19 @@ class DictTrie { return true; } - void LoadDict(const string& filePath) { - ifstream ifs(filePath.c_str()); + void LoadDict(const std::string& filePath) { + std::ifstream ifs(filePath.c_str()); XCHECK(ifs.is_open()) << "open " << filePath << " failed."; - string line; - vector buf; + std::string line; + std::vector buf; DictUnit node_info; while (getline(ifs, line)) { - Split(line, buf, " "); + limonp::Split(line, buf, " "); XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line; - MakeNodeInfo(node_info, - buf[0], - atof(buf[1].c_str()), + MakeNodeInfo(node_info, + buf[0], + atof(buf[1].c_str()), buf[2]); static_node_infos_.push_back(node_info); } @@ -230,8 +225,8 @@ class DictTrie { void SetStaticWordWeights(UserWordWeightOption option) { XCHECK(!static_node_infos_.empty()); - vector x = static_node_infos_; - sort(x.begin(), x.end(), WeightCompare); + std::vector x = static_node_infos_; + std::sort(x.begin(), x.end(), WeightCompare); min_weight_ = x[0].weight; max_weight_ = x[x.size() - 1].weight; median_weight_ = x[x.size() / 2].weight; @@ -248,7 +243,7 @@ class DictTrie { } } - double CalcFreqSum(const vector& node_infos) const { + double CalcFreqSum(const std::vector& node_infos) const { double sum = 0.0; for (size_t i = 0; i < node_infos.size(); i++) { sum += node_infos[i].weight; @@ -256,7 +251,7 @@ class DictTrie { return sum; } - void CalculateWeight(vector& node_infos, double sum) const { + void CalculateWeight(std::vector& node_infos, double sum) const { assert(sum > 0.0); for (size_t i = 0; i < node_infos.size(); i++) { DictUnit& node_info = node_infos[i]; @@ -265,12 +260,12 @@ class DictTrie { } } - void Shrink(vector& units) const { - vector(units.begin(), units.end()).swap(units); + void Shrink(std::vector& units) const { + std::vector(units.begin(), units.end()).swap(units); } - vector static_node_infos_; - deque active_node_infos_; // must not be vector + std::vector static_node_infos_; + std::deque active_node_infos_; // must not be std::vector Trie * trie_; double freq_sum_; @@ -278,7 +273,7 @@ class DictTrie { double max_weight_; double median_weight_; double user_word_default_weight_; - unordered_set user_dict_single_chinese_word_; + std::unordered_set user_dict_single_chinese_word_; }; }