diff --git a/ChangeLog.md b/ChangeLog.md index 094f747..a55ff04 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,6 +4,7 @@ + 在 Trie 中去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计, 该设计的主要问题是前提 unicode 每个字符必须是 uint16 ,则无法更全面得支持 unicode 多国字符。 ++ Rune 类型从 16bit 更改为 32bit ,支持更多 Unicode 字符,包括一些罕见汉字。 ## v4.4.1 diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index 05f2b72..e14b956 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -2,11 +2,12 @@ #define CPPJIEBA_HMMMODEL_H #include "limonp/StringUtil.hpp" +#include "Trie.hpp" namespace cppjieba { using namespace limonp; -typedef unordered_map<uint16_t, double> EmitProbMap; +typedef unordered_map<Rune, double> EmitProbMap; struct HMMModel { /* @@ -70,7 +71,7 @@ struct HMMModel { CHECK(GetLine(ifile, line)); CHECK(LoadEmitProb(line, emitProbS)); } - double GetEmitProb(const EmitProbMap* ptMp, uint16_t key, + double GetEmitProb(const EmitProbMap* ptMp, Rune key, double defVal)const { EmitProbMap::const_iterator cit = ptMp->find(key); if (cit == ptMp->end()) { diff --git a/include/cppjieba/TransCode.hpp b/include/cppjieba/TransCode.hpp index 3e7690a..6320beb 100644 --- a/include/cppjieba/TransCode.hpp +++ b/include/cppjieba/TransCode.hpp @@ -13,7 +13,7 @@ namespace cppjieba { using namespace limonp; -typedef uint16_t Rune; +typedef uint32_t Rune; typedef limonp::LocalVector<Rune> Unicode; namespace TransCode { @@ -21,7 +21,7 @@ inline bool Decode(const string& str, Unicode& res) { #ifdef CPPJIEBA_GBK return gbkTrans(str, res); #else - return Utf8ToUnicode(str, res); + return Utf8ToUnicode32(str, res); #endif } @@ -29,7 +29,7 @@ inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, s #ifdef CPPJIEBA_GBK gbkTrans(begin, end, res); #else - UnicodeToUtf8(begin, end, res); + Unicode32ToUtf8(begin, end, res); #endif } diff --git a/include/cppjieba/Trie.hpp b/include/cppjieba/Trie.hpp index fb5b9ff..6d1350a 100644 --- a/include/cppjieba/Trie.hpp +++ b/include/cppjieba/Trie.hpp @@ -4,6 +4,7 @@ 
#include #include #include "limonp/StdExtension.hpp" +#include "Trie.hpp" namespace cppjieba { @@ -25,7 +26,7 @@ inline ostream & operator << (ostream& os, const DictUnit& unit) { } struct Dag { - uint16_t rune; + Rune rune; LocalVector > nexts; const DictUnit * pInfo; double weight;