mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
规范Unicode的相关命名,使用Rune代表一个中文字符
This commit is contained in:
parent
0e16e000ea
commit
78e41e5fd0
@ -71,7 +71,7 @@ class DictTrie {
|
||||
vector<Dag>& res) const {
|
||||
trie_->find(begin, end, res);
|
||||
}
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
||||
bool isUserDictSingleChineseWord(const Rune& word) const {
|
||||
return isIn(userDictSingleChineseWord_, word);
|
||||
}
|
||||
double getMinWeight() const {
|
||||
@ -198,7 +198,7 @@ class DictTrie {
|
||||
|
||||
double minWeight_;
|
||||
double maxWeight_;
|
||||
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
|
||||
unordered_set<Rune> userDictSingleChineseWord_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
|
||||
private:
|
||||
// sequential letters rule
|
||||
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::value_type x = *begin;
|
||||
Rune x = *begin;
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||
begin ++;
|
||||
} else {
|
||||
@ -92,7 +92,7 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
//
|
||||
Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::value_type x = *begin;
|
||||
Rune x = *begin;
|
||||
if('0' <= x && x <= '9') {
|
||||
begin ++;
|
||||
} else {
|
||||
|
@ -28,7 +28,7 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
|
||||
bool isUserDictSingleChineseWord(const Rune & value) const {
|
||||
return dictTrie_->isUserDictSingleChineseWord(value);
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ class MPSegment: public SegmentBase {
|
||||
res.push_back(p->word);
|
||||
i += p->word.size();
|
||||
} else { //single chinese word
|
||||
res.push_back(Unicode(1, dags[i].uniCh));
|
||||
res.push_back(Unicode(1, dags[i].rune));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
@ -14,9 +14,9 @@ using namespace Limonp;
|
||||
|
||||
//const char* const SPECIAL_CHARS = " \t\n";
|
||||
#ifndef CPPJIEBA_GBK
|
||||
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
||||
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
||||
#else
|
||||
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
||||
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
||||
#endif
|
||||
|
||||
class SegmentBase: public ISegment, public NonCopyable {
|
||||
@ -63,7 +63,7 @@ class SegmentBase: public ISegment, public NonCopyable {
|
||||
assert(specialSymbols_.size());
|
||||
}
|
||||
private:
|
||||
unordered_set<UnicodeValueType> specialSymbols_;
|
||||
unordered_set<Rune> specialSymbols_;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -12,8 +12,10 @@
|
||||
namespace CppJieba {
|
||||
|
||||
using namespace Limonp;
|
||||
typedef uint16_t UnicodeValueType;
|
||||
typedef Limonp::LocalVector<UnicodeValueType> Unicode;
|
||||
|
||||
typedef uint16_t Rune;
|
||||
typedef Limonp::LocalVector<Rune> Unicode;
|
||||
|
||||
namespace TransCode {
|
||||
inline bool decode(const string& str, Unicode& res) {
|
||||
#ifdef CPPJIEBA_GBK
|
||||
|
10
src/Trie.hpp
10
src/Trie.hpp
@ -22,16 +22,16 @@ inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
||||
}
|
||||
|
||||
struct Dag {
|
||||
uint16_t uniCh;
|
||||
uint16_t rune;
|
||||
LocalVector<pair<size_t, const DictUnit*> > nexts;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
size_t nextPos;
|
||||
Dag():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {
|
||||
Dag():rune(0), pInfo(NULL), weight(0.0), nextPos(0) {
|
||||
}
|
||||
};
|
||||
|
||||
typedef Unicode::value_type TrieKey;
|
||||
typedef Rune TrieKey;
|
||||
|
||||
class TrieNode {
|
||||
public :
|
||||
@ -90,9 +90,9 @@ class Trie {
|
||||
const TrieNode *ptNode = NULL;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||
Unicode::value_type ch = *(begin + i);
|
||||
Rune ch = *(begin + i);
|
||||
ptNode = _base + ch;
|
||||
res[i].uniCh = ch;
|
||||
res[i].rune = ch;
|
||||
assert(res[i].nexts.empty());
|
||||
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
|
Loading…
x
Reference in New Issue
Block a user