Change Rune type from uint16_t to uint32_t to support more Chinese characters

This commit is contained in:
yanyiwu 2016-02-18 14:54:03 +08:00
parent 8d66b1f1fa
commit 14e09290c2
4 changed files with 9 additions and 6 deletions

View File

@ -4,6 +4,7 @@
+ 在 Trie 中去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计, + 在 Trie 中去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计,
该设计的主要问题是前提 unicode 每个字符必须是 uint16 ,则无法更全面地支持 unicode 多国字符。 该设计的主要问题是前提 unicode 每个字符必须是 uint16 ,则无法更全面地支持 unicode 多国字符。
+ Rune 类型从 16bit 更改为 32bit ,支持更多 Unicode 字符,包括一些罕见汉字。
## v4.4.1 ## v4.4.1

View File

@ -2,11 +2,12 @@
#define CPPJIEBA_HMMMODEL_H #define CPPJIEBA_HMMMODEL_H
#include "limonp/StringUtil.hpp" #include "limonp/StringUtil.hpp"
#include "Trie.hpp"
namespace cppjieba { namespace cppjieba {
using namespace limonp; using namespace limonp;
typedef unordered_map<uint16_t, double> EmitProbMap; typedef unordered_map<Rune, double> EmitProbMap;
struct HMMModel { struct HMMModel {
/* /*
@ -70,7 +71,7 @@ struct HMMModel {
CHECK(GetLine(ifile, line)); CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbS)); CHECK(LoadEmitProb(line, emitProbS));
} }
double GetEmitProb(const EmitProbMap* ptMp, uint16_t key, double GetEmitProb(const EmitProbMap* ptMp, Rune key,
double defVal)const { double defVal)const {
EmitProbMap::const_iterator cit = ptMp->find(key); EmitProbMap::const_iterator cit = ptMp->find(key);
if (cit == ptMp->end()) { if (cit == ptMp->end()) {

View File

@ -13,7 +13,7 @@ namespace cppjieba {
using namespace limonp; using namespace limonp;
typedef uint16_t Rune; typedef uint32_t Rune;
typedef limonp::LocalVector<Rune> Unicode; typedef limonp::LocalVector<Rune> Unicode;
namespace TransCode { namespace TransCode {
@ -21,7 +21,7 @@ inline bool Decode(const string& str, Unicode& res) {
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(str, res); return gbkTrans(str, res);
#else #else
return Utf8ToUnicode(str, res); return Utf8ToUnicode32(str, res);
#endif #endif
} }
@ -29,7 +29,7 @@ inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, s
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
gbkTrans(begin, end, res); gbkTrans(begin, end, res);
#else #else
UnicodeToUtf8(begin, end, res); Unicode32ToUtf8(begin, end, res);
#endif #endif
} }

View File

@ -4,6 +4,7 @@
#include <vector> #include <vector>
#include <queue> #include <queue>
#include "limonp/StdExtension.hpp" #include "limonp/StdExtension.hpp"
#include "Trie.hpp"
namespace cppjieba { namespace cppjieba {
@ -25,7 +26,7 @@ inline ostream & operator << (ostream& os, const DictUnit& unit) {
} }
struct Dag { struct Dag {
uint16_t rune; Rune rune;
LocalVector<pair<size_t, const DictUnit*> > nexts; LocalVector<pair<size_t, const DictUnit*> > nexts;
const DictUnit * pInfo; const DictUnit * pInfo;
double weight; double weight;