mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
规范Unicode的相关命名,使用Rune代表一个中文字符
This commit is contained in:
parent
0e16e000ea
commit
78e41e5fd0
@ -71,7 +71,7 @@ class DictTrie {
|
|||||||
vector<Dag>& res) const {
|
vector<Dag>& res) const {
|
||||||
trie_->find(begin, end, res);
|
trie_->find(begin, end, res);
|
||||||
}
|
}
|
||||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
bool isUserDictSingleChineseWord(const Rune& word) const {
|
||||||
return isIn(userDictSingleChineseWord_, word);
|
return isIn(userDictSingleChineseWord_, word);
|
||||||
}
|
}
|
||||||
double getMinWeight() const {
|
double getMinWeight() const {
|
||||||
@ -198,7 +198,7 @@ class DictTrie {
|
|||||||
|
|
||||||
double minWeight_;
|
double minWeight_;
|
||||||
double maxWeight_;
|
double maxWeight_;
|
||||||
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
|
unordered_set<Rune> userDictSingleChineseWord_;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
private:
|
private:
|
||||||
// sequential letters rule
|
// sequential letters rule
|
||||||
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
Unicode::value_type x = *begin;
|
Rune x = *begin;
|
||||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||||
begin ++;
|
begin ++;
|
||||||
} else {
|
} else {
|
||||||
@ -92,7 +92,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
//
|
//
|
||||||
Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
Unicode::value_type x = *begin;
|
Rune x = *begin;
|
||||||
if('0' <= x && x <= '9') {
|
if('0' <= x && x <= '9') {
|
||||||
begin ++;
|
begin ++;
|
||||||
} else {
|
} else {
|
||||||
|
@ -28,7 +28,7 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
|
bool isUserDictSingleChineseWord(const Rune & value) const {
|
||||||
return dictTrie_->isUserDictSingleChineseWord(value);
|
return dictTrie_->isUserDictSingleChineseWord(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,7 +101,7 @@ class MPSegment: public SegmentBase {
|
|||||||
res.push_back(p->word);
|
res.push_back(p->word);
|
||||||
i += p->word.size();
|
i += p->word.size();
|
||||||
} else { //single chinese word
|
} else { //single chinese word
|
||||||
res.push_back(Unicode(1, dags[i].uniCh));
|
res.push_back(Unicode(1, dags[i].rune));
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,9 +14,9 @@ using namespace Limonp;
|
|||||||
|
|
||||||
//const char* const SPECIAL_CHARS = " \t\n";
|
//const char* const SPECIAL_CHARS = " \t\n";
|
||||||
#ifndef CPPJIEBA_GBK
|
#ifndef CPPJIEBA_GBK
|
||||||
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
||||||
#else
|
#else
|
||||||
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class SegmentBase: public ISegment, public NonCopyable {
|
class SegmentBase: public ISegment, public NonCopyable {
|
||||||
@ -63,7 +63,7 @@ class SegmentBase: public ISegment, public NonCopyable {
|
|||||||
assert(specialSymbols_.size());
|
assert(specialSymbols_.size());
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
unordered_set<UnicodeValueType> specialSymbols_;
|
unordered_set<Rune> specialSymbols_;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -12,8 +12,10 @@
|
|||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
|
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
typedef uint16_t UnicodeValueType;
|
|
||||||
typedef Limonp::LocalVector<UnicodeValueType> Unicode;
|
typedef uint16_t Rune;
|
||||||
|
typedef Limonp::LocalVector<Rune> Unicode;
|
||||||
|
|
||||||
namespace TransCode {
|
namespace TransCode {
|
||||||
inline bool decode(const string& str, Unicode& res) {
|
inline bool decode(const string& str, Unicode& res) {
|
||||||
#ifdef CPPJIEBA_GBK
|
#ifdef CPPJIEBA_GBK
|
||||||
|
10
src/Trie.hpp
10
src/Trie.hpp
@ -22,16 +22,16 @@ inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct Dag {
|
struct Dag {
|
||||||
uint16_t uniCh;
|
uint16_t rune;
|
||||||
LocalVector<pair<size_t, const DictUnit*> > nexts;
|
LocalVector<pair<size_t, const DictUnit*> > nexts;
|
||||||
const DictUnit * pInfo;
|
const DictUnit * pInfo;
|
||||||
double weight;
|
double weight;
|
||||||
size_t nextPos;
|
size_t nextPos;
|
||||||
Dag():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {
|
Dag():rune(0), pInfo(NULL), weight(0.0), nextPos(0) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef Unicode::value_type TrieKey;
|
typedef Rune TrieKey;
|
||||||
|
|
||||||
class TrieNode {
|
class TrieNode {
|
||||||
public :
|
public :
|
||||||
@ -90,9 +90,9 @@ class Trie {
|
|||||||
const TrieNode *ptNode = NULL;
|
const TrieNode *ptNode = NULL;
|
||||||
TrieNode::NextMap::const_iterator citer;
|
TrieNode::NextMap::const_iterator citer;
|
||||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||||
Unicode::value_type ch = *(begin + i);
|
Rune ch = *(begin + i);
|
||||||
ptNode = _base + ch;
|
ptNode = _base + ch;
|
||||||
res[i].uniCh = ch;
|
res[i].rune = ch;
|
||||||
assert(res[i].nexts.empty());
|
assert(res[i].nexts.empty());
|
||||||
|
|
||||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user