mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
little adjustment
This commit is contained in:
parent
00f738a617
commit
10e9b32258
@ -92,10 +92,11 @@ curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
|
|||||||
["南京市", "长江大桥"]
|
["南京市", "长江大桥"]
|
||||||
```
|
```
|
||||||
|
|
||||||
如果有需要**安装使用**的,可以按照如下操作:
|
因为 HTTP GET 请求有长度限制,如果需要请求长文的,请使用POST请求。
|
||||||
|
|
||||||
### 安装服务
|
### 安装服务
|
||||||
|
|
||||||
|
如果有需要**安装使用**的,可以按照如下操作:
|
||||||
```
|
```
|
||||||
sudo make install
|
sudo make install
|
||||||
```
|
```
|
||||||
|
@ -25,22 +25,8 @@ namespace CppJieba
|
|||||||
|
|
||||||
class DictTrie
|
class DictTrie
|
||||||
{
|
{
|
||||||
private:
|
public:
|
||||||
vector<DictUnit> _nodeInfos;
|
|
||||||
Trie * _trie;
|
|
||||||
|
|
||||||
double _minWeight;
|
|
||||||
private:
|
|
||||||
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
|
||||||
public:
|
|
||||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
|
||||||
{
|
|
||||||
return isIn(_userDictSingleChineseWord, word);
|
|
||||||
}
|
|
||||||
public:
|
|
||||||
double getMinWeight() const {return _minWeight;};
|
|
||||||
|
|
||||||
public:
|
|
||||||
DictTrie()
|
DictTrie()
|
||||||
{
|
{
|
||||||
_trie = NULL;
|
_trie = NULL;
|
||||||
@ -59,7 +45,6 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
bool init(const string& dictPath, const string& userDictPath = "")
|
bool init(const string& dictPath, const string& userDictPath = "")
|
||||||
{
|
{
|
||||||
assert(!_trie);
|
assert(!_trie);
|
||||||
@ -78,7 +63,6 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||||
{
|
{
|
||||||
return _trie->find(begin, end);
|
return _trie->find(begin, end);
|
||||||
@ -95,6 +79,11 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
_trie->find(begin, end, res);
|
_trie->find(begin, end, res);
|
||||||
}
|
}
|
||||||
|
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
||||||
|
{
|
||||||
|
return isIn(_userDictSingleChineseWord, word);
|
||||||
|
}
|
||||||
|
double getMinWeight() const {return _minWeight;};
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -204,7 +193,12 @@ namespace CppJieba
|
|||||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<DictUnit> _nodeInfos;
|
||||||
|
Trie * _trie;
|
||||||
|
|
||||||
|
double _minWeight;
|
||||||
|
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,9 +14,6 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
class FullSegment: public SegmentBase
|
class FullSegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
const DictTrie* _dictTrie;
|
|
||||||
bool _isBorrowed;
|
|
||||||
public:
|
public:
|
||||||
FullSegment()
|
FullSegment()
|
||||||
{
|
{
|
||||||
@ -41,7 +38,6 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
public:
|
|
||||||
bool init(const string& dictPath)
|
bool init(const string& dictPath)
|
||||||
{
|
{
|
||||||
assert(_dictTrie == NULL);
|
assert(_dictTrie == NULL);
|
||||||
@ -58,10 +54,7 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
|
|
||||||
public:
|
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||||
{
|
{
|
||||||
assert(_dictTrie);
|
assert(_dictTrie);
|
||||||
@ -147,6 +140,9 @@ namespace CppJieba
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
const DictTrie* _dictTrie;
|
||||||
|
bool _isBorrowed;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,15 +24,6 @@ namespace CppJieba
|
|||||||
* 0:B, 1:E, 2:M, 3:S
|
* 0:B, 1:E, 2:M, 3:S
|
||||||
* */
|
* */
|
||||||
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
||||||
private:
|
|
||||||
char _statMap[STATUS_SUM];
|
|
||||||
double _startProb[STATUS_SUM];
|
|
||||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
|
||||||
EmitProbMap _emitProbB;
|
|
||||||
EmitProbMap _emitProbE;
|
|
||||||
EmitProbMap _emitProbM;
|
|
||||||
EmitProbMap _emitProbS;
|
|
||||||
vector<EmitProbMap* > _emitProbVec;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HMMSegment(){}
|
HMMSegment(){}
|
||||||
@ -101,6 +92,30 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
public:
|
||||||
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||||
|
{
|
||||||
|
if(begin == end)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
vector<Unicode> words;
|
||||||
|
words.reserve(end - begin);
|
||||||
|
if(!cut(begin, end, words))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
size_t offset = res.size();
|
||||||
|
res.resize(res.size() + words.size());
|
||||||
|
for(size_t i = 0; i < words.size(); i++)
|
||||||
|
{
|
||||||
|
if(!TransCode::encode(words[i], res[offset + i]))
|
||||||
|
{
|
||||||
|
LogError("encode failed.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
// sequential letters rule
|
// sequential letters rule
|
||||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||||
@ -168,32 +183,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
public:
|
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
|
||||||
{
|
|
||||||
if(begin == end)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
vector<Unicode> words;
|
|
||||||
words.reserve(end - begin);
|
|
||||||
if(!cut(begin, end, words))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
size_t offset = res.size();
|
|
||||||
res.resize(res.size() + words.size());
|
|
||||||
for(size_t i = 0; i < words.size(); i++)
|
|
||||||
{
|
|
||||||
if(!TransCode::encode(words[i], res[offset + i]))
|
|
||||||
{
|
|
||||||
LogError("encode failed.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
|
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
|
||||||
{
|
{
|
||||||
if(begin == end)
|
if(begin == end)
|
||||||
@ -384,6 +374,15 @@ namespace CppJieba
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
char _statMap[STATUS_SUM];
|
||||||
|
double _startProb[STATUS_SUM];
|
||||||
|
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||||
|
EmitProbMap _emitProbB;
|
||||||
|
EmitProbMap _emitProbE;
|
||||||
|
EmitProbMap _emitProbM;
|
||||||
|
EmitProbMap _emitProbS;
|
||||||
|
vector<EmitProbMap* > _emitProbVec;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,6 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
virtual ~ISegment(){};
|
virtual ~ISegment(){};
|
||||||
public:
|
|
||||||
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
||||||
};
|
};
|
||||||
|
@ -12,13 +12,6 @@ namespace CppJieba
|
|||||||
/*utf8*/
|
/*utf8*/
|
||||||
class KeywordExtractor
|
class KeywordExtractor
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
MixSegment _segment;
|
|
||||||
private:
|
|
||||||
unordered_map<string, double> _idfMap;
|
|
||||||
double _idfAverage;
|
|
||||||
|
|
||||||
unordered_set<string> _stopWords;
|
|
||||||
public:
|
public:
|
||||||
KeywordExtractor(){};
|
KeywordExtractor(){};
|
||||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||||
@ -27,14 +20,12 @@ namespace CppJieba
|
|||||||
};
|
};
|
||||||
~KeywordExtractor(){};
|
~KeywordExtractor(){};
|
||||||
|
|
||||||
public:
|
|
||||||
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||||
{
|
{
|
||||||
_loadIdfDict(idfPath);
|
_loadIdfDict(idfPath);
|
||||||
_loadStopWordDict(stopWordPath);
|
_loadStopWordDict(stopWordPath);
|
||||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||||
};
|
};
|
||||||
public:
|
|
||||||
|
|
||||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
||||||
{
|
{
|
||||||
@ -148,7 +139,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
assert(_stopWords.size());
|
assert(_stopWords.size());
|
||||||
}
|
}
|
||||||
private:
|
|
||||||
bool _isSingleWord(const string& str) const
|
bool _isSingleWord(const string& str) const
|
||||||
{
|
{
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
@ -158,12 +149,17 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
|
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
|
||||||
{
|
{
|
||||||
return lhs.second > rhs.second;
|
return lhs.second > rhs.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
MixSegment _segment;
|
||||||
|
unordered_map<string, double> _idfMap;
|
||||||
|
double _idfAverage;
|
||||||
|
|
||||||
|
unordered_set<string> _stopWords;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,8 +14,6 @@ namespace CppJieba
|
|||||||
|
|
||||||
class MPSegment: public SegmentBase
|
class MPSegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
DictTrie _dictTrie;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
MPSegment(){};
|
MPSegment(){};
|
||||||
@ -24,7 +22,7 @@ namespace CppJieba
|
|||||||
LIMONP_CHECK(init(dictPath, userDictPath));
|
LIMONP_CHECK(init(dictPath, userDictPath));
|
||||||
};
|
};
|
||||||
virtual ~MPSegment(){};
|
virtual ~MPSegment(){};
|
||||||
public:
|
|
||||||
bool init(const string& dictPath, const string& userDictPath = "")
|
bool init(const string& dictPath, const string& userDictPath = "")
|
||||||
{
|
{
|
||||||
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
||||||
@ -35,7 +33,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return _dictTrie.isUserDictSingleChineseWord(value);
|
return _dictTrie.isUserDictSingleChineseWord(value);
|
||||||
}
|
}
|
||||||
public:
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||||
{
|
{
|
||||||
@ -141,6 +139,8 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DictTrie _dictTrie;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -10,17 +10,17 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
class MixSegment: public SegmentBase
|
class MixSegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
MPSegment _mpSeg;
|
|
||||||
HMMSegment _hmmSeg;
|
|
||||||
public:
|
public:
|
||||||
MixSegment(){};
|
MixSegment()
|
||||||
|
{
|
||||||
|
}
|
||||||
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||||
{
|
{
|
||||||
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
|
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
|
||||||
}
|
}
|
||||||
virtual ~MixSegment(){}
|
virtual ~MixSegment()
|
||||||
public:
|
{
|
||||||
|
}
|
||||||
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||||
{
|
{
|
||||||
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
||||||
@ -28,9 +28,7 @@ namespace CppJieba
|
|||||||
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
public:
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
public:
|
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||||
{
|
{
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
@ -115,6 +113,9 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return _mpSeg.getDictTrie();
|
return _mpSeg.getDictTrie();
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
MPSegment _mpSeg;
|
||||||
|
HMMSegment _hmmSeg;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,13 +15,10 @@ namespace CppJieba
|
|||||||
|
|
||||||
class PosTagger
|
class PosTagger
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
MixSegment _segment;
|
|
||||||
const DictTrie * _dictTrie;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PosTagger()
|
PosTagger()
|
||||||
{}
|
{
|
||||||
|
}
|
||||||
PosTagger(
|
PosTagger(
|
||||||
const string& dictPath,
|
const string& dictPath,
|
||||||
const string& hmmFilePath,
|
const string& hmmFilePath,
|
||||||
@ -29,9 +26,10 @@ namespace CppJieba
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
init(dictPath, hmmFilePath, userDictPath);
|
init(dictPath, hmmFilePath, userDictPath);
|
||||||
};
|
}
|
||||||
~PosTagger(){};
|
~PosTagger()
|
||||||
public:
|
{
|
||||||
|
}
|
||||||
void init(
|
void init(
|
||||||
const string& dictPath,
|
const string& dictPath,
|
||||||
const string& hmmFilePath,
|
const string& hmmFilePath,
|
||||||
@ -103,6 +101,9 @@ namespace CppJieba
|
|||||||
// the ascii chars contain english letter
|
// the ascii chars contain english letter
|
||||||
return POS_ENG;
|
return POS_ENG;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
MixSegment _segment;
|
||||||
|
const DictTrie * _dictTrie;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,11 +17,6 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
class QuerySegment: public SegmentBase
|
class QuerySegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
MixSegment _mixSeg;
|
|
||||||
FullSegment _fullSeg;
|
|
||||||
size_t _maxWordLen;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
QuerySegment(){};
|
QuerySegment(){};
|
||||||
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
||||||
@ -29,7 +24,6 @@ namespace CppJieba
|
|||||||
init(dict, model, maxWordLen, userDict);
|
init(dict, model, maxWordLen, userDict);
|
||||||
};
|
};
|
||||||
virtual ~QuerySegment(){};
|
virtual ~QuerySegment(){};
|
||||||
public:
|
|
||||||
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
||||||
{
|
{
|
||||||
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
|
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
|
||||||
@ -38,11 +32,7 @@ namespace CppJieba
|
|||||||
_maxWordLen = maxWordLen;
|
_maxWordLen = maxWordLen;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
|
|
||||||
public:
|
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||||
{
|
{
|
||||||
if (begin >= end)
|
if (begin >= end)
|
||||||
@ -117,6 +107,11 @@ namespace CppJieba
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
MixSegment _mixSeg;
|
||||||
|
FullSegment _fullSeg;
|
||||||
|
size_t _maxWordLen;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,19 +25,6 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
SegmentBase(){_loadSpecialSymbols();};
|
SegmentBase(){_loadSpecialSymbols();};
|
||||||
virtual ~SegmentBase(){};
|
virtual ~SegmentBase(){};
|
||||||
private:
|
|
||||||
unordered_set<UnicodeValueType> _specialSymbols;
|
|
||||||
private:
|
|
||||||
void _loadSpecialSymbols()
|
|
||||||
{
|
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
|
||||||
for(size_t i = 0; i < size; i ++)
|
|
||||||
{
|
|
||||||
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
|
||||||
}
|
|
||||||
assert(_specialSymbols.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res) const
|
virtual bool cut(const string& str, vector<string>& res) const
|
||||||
@ -72,6 +59,19 @@ namespace CppJieba
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
void _loadSpecialSymbols()
|
||||||
|
{
|
||||||
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
|
for(size_t i = 0; i < size; i ++)
|
||||||
|
{
|
||||||
|
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
||||||
|
}
|
||||||
|
assert(_specialSymbols.size());
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
unordered_set<UnicodeValueType> _specialSymbols;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
17
src/Trie.hpp
17
src/Trie.hpp
@ -43,12 +43,6 @@ namespace CppJieba
|
|||||||
|
|
||||||
class TrieNode
|
class TrieNode
|
||||||
{
|
{
|
||||||
public:
|
|
||||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
|
||||||
public:
|
|
||||||
TrieNode * fail;
|
|
||||||
NextMap * next;
|
|
||||||
const DictUnit * ptValue;
|
|
||||||
public:
|
public:
|
||||||
TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
|
TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
|
||||||
{}
|
{}
|
||||||
@ -65,12 +59,15 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return iter->second;
|
return iter->second;
|
||||||
}
|
}
|
||||||
|
public:
|
||||||
|
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||||
|
TrieNode * fail;
|
||||||
|
NextMap * next;
|
||||||
|
const DictUnit * ptValue;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Trie
|
class Trie
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
TrieNode* _root;
|
|
||||||
public:
|
public:
|
||||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
||||||
{
|
{
|
||||||
@ -230,7 +227,6 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
|
||||||
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
||||||
{
|
{
|
||||||
if(valuePointers.empty() || keys.empty())
|
if(valuePointers.empty() || keys.empty())
|
||||||
@ -244,7 +240,6 @@ namespace CppJieba
|
|||||||
_insertNode(keys[i], valuePointers[i]);
|
_insertNode(keys[i], valuePointers[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
|
||||||
void _insertNode(const Unicode& key, const DictUnit* ptValue)
|
void _insertNode(const Unicode& key, const DictUnit* ptValue)
|
||||||
{
|
{
|
||||||
TrieNode* ptNode = _root;
|
TrieNode* ptNode = _root;
|
||||||
@ -291,6 +286,8 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
delete node;
|
delete node;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
TrieNode* _root;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user