Change Rune type from uint16_t to uint32_t to support more Chinese characters

This commit is contained in:
yanyiwu 2016-02-18 14:54:03 +08:00
parent 8d66b1f1fa
commit 14e09290c2
4 changed files with 9 additions and 6 deletions

View File

@ -4,6 +4,7 @@
+ 在 Trie 中去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计,
该设计的主要问题是其前提为 Unicode 每个字符必须是 uint16,因此无法更全面地支持 Unicode 多国字符。
+ Rune 类型从 16bit 更改为 32bit ,支持更多 Unicode 字符,包括一些罕见汉字。
## v4.4.1

View File

@ -2,11 +2,12 @@
#define CPPJIEBA_HMMMODEL_H
#include "limonp/StringUtil.hpp"
#include "Trie.hpp"
namespace cppjieba {
using namespace limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;
typedef unordered_map<Rune, double> EmitProbMap;
struct HMMModel {
/*
@ -70,7 +71,7 @@ struct HMMModel {
CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbS));
}
double GetEmitProb(const EmitProbMap* ptMp, uint16_t key,
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
double defVal)const {
EmitProbMap::const_iterator cit = ptMp->find(key);
if (cit == ptMp->end()) {

View File

@ -13,7 +13,7 @@ namespace cppjieba {
using namespace limonp;
typedef uint16_t Rune;
typedef uint32_t Rune;
typedef limonp::LocalVector<Rune> Unicode;
namespace TransCode {
@ -21,7 +21,7 @@ inline bool Decode(const string& str, Unicode& res) {
#ifdef CPPJIEBA_GBK
return gbkTrans(str, res);
#else
return Utf8ToUnicode(str, res);
return Utf8ToUnicode32(str, res);
#endif
}
@ -29,7 +29,7 @@ inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, s
#ifdef CPPJIEBA_GBK
gbkTrans(begin, end, res);
#else
UnicodeToUtf8(begin, end, res);
Unicode32ToUtf8(begin, end, res);
#endif
}

View File

@ -4,6 +4,7 @@
#include <vector>
#include <queue>
#include "limonp/StdExtension.hpp"
#include "Trie.hpp"
namespace cppjieba {
@ -25,7 +26,7 @@ inline ostream & operator << (ostream& os, const DictUnit& unit) {
}
struct Dag {
uint16_t rune;
Rune rune;
LocalVector<pair<size_t, const DictUnit*> > nexts;
const DictUnit * pInfo;
double weight;