mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
little adjustment
This commit is contained in:
parent
00f738a617
commit
10e9b32258
@ -92,10 +92,11 @@ curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
|
||||
["南京市", "长江大桥"]
|
||||
```
|
||||
|
||||
如果有需要**安装使用**的,可以按照如下操作:
|
||||
因为 HTTP GET 请求有长度限制,如果需要请求长文的,请使用POST请求。
|
||||
|
||||
### 安装服务
|
||||
|
||||
如果有需要**安装使用**的,可以按照如下操作:
|
||||
```
|
||||
sudo make install
|
||||
```
|
||||
|
@ -25,22 +25,8 @@ namespace CppJieba
|
||||
|
||||
class DictTrie
|
||||
{
|
||||
private:
|
||||
vector<DictUnit> _nodeInfos;
|
||||
Trie * _trie;
|
||||
public:
|
||||
|
||||
double _minWeight;
|
||||
private:
|
||||
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
||||
public:
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
||||
{
|
||||
return isIn(_userDictSingleChineseWord, word);
|
||||
}
|
||||
public:
|
||||
double getMinWeight() const {return _minWeight;};
|
||||
|
||||
public:
|
||||
DictTrie()
|
||||
{
|
||||
_trie = NULL;
|
||||
@ -59,7 +45,6 @@ namespace CppJieba
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
assert(!_trie);
|
||||
@ -78,7 +63,6 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||
{
|
||||
return _trie->find(begin, end);
|
||||
@ -95,6 +79,11 @@ namespace CppJieba
|
||||
{
|
||||
_trie->find(begin, end, res);
|
||||
}
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
||||
{
|
||||
return isIn(_userDictSingleChineseWord, word);
|
||||
}
|
||||
double getMinWeight() const {return _minWeight;};
|
||||
|
||||
|
||||
private:
|
||||
@ -204,7 +193,12 @@ namespace CppJieba
|
||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||
}
|
||||
|
||||
private:
|
||||
vector<DictUnit> _nodeInfos;
|
||||
Trie * _trie;
|
||||
|
||||
double _minWeight;
|
||||
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -14,9 +14,6 @@ namespace CppJieba
|
||||
{
|
||||
class FullSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
const DictTrie* _dictTrie;
|
||||
bool _isBorrowed;
|
||||
public:
|
||||
FullSegment()
|
||||
{
|
||||
@ -41,7 +38,6 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
};
|
||||
public:
|
||||
bool init(const string& dictPath)
|
||||
{
|
||||
assert(_dictTrie == NULL);
|
||||
@ -58,10 +54,7 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
using SegmentBase::cut;
|
||||
|
||||
public:
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
assert(_dictTrie);
|
||||
@ -147,6 +140,9 @@ namespace CppJieba
|
||||
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
const DictTrie* _dictTrie;
|
||||
bool _isBorrowed;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -24,15 +24,6 @@ namespace CppJieba
|
||||
* 0:B, 1:E, 2:M, 3:S
|
||||
* */
|
||||
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
||||
private:
|
||||
char _statMap[STATUS_SUM];
|
||||
double _startProb[STATUS_SUM];
|
||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||
EmitProbMap _emitProbB;
|
||||
EmitProbMap _emitProbE;
|
||||
EmitProbMap _emitProbM;
|
||||
EmitProbMap _emitProbS;
|
||||
vector<EmitProbMap* > _emitProbVec;
|
||||
|
||||
public:
|
||||
HMMSegment(){}
|
||||
@ -101,6 +92,30 @@ namespace CppJieba
|
||||
}
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
vector<Unicode> words;
|
||||
words.reserve(end - begin);
|
||||
if(!cut(begin, end, words))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
size_t offset = res.size();
|
||||
res.resize(res.size() + words.size());
|
||||
for(size_t i = 0; i < words.size(); i++)
|
||||
{
|
||||
if(!TransCode::encode(words[i], res[offset + i]))
|
||||
{
|
||||
LogError("encode failed.");
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||
@ -168,32 +183,7 @@ namespace CppJieba
|
||||
}
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
vector<Unicode> words;
|
||||
words.reserve(end - begin);
|
||||
if(!cut(begin, end, words))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
size_t offset = res.size();
|
||||
res.resize(res.size() + words.size());
|
||||
for(size_t i = 0; i < words.size(); i++)
|
||||
{
|
||||
if(!TransCode::encode(words[i], res[offset + i]))
|
||||
{
|
||||
LogError("encode failed.");
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
|
||||
{
|
||||
if(begin == end)
|
||||
@ -384,6 +374,15 @@ namespace CppJieba
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
char _statMap[STATUS_SUM];
|
||||
double _startProb[STATUS_SUM];
|
||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||
EmitProbMap _emitProbB;
|
||||
EmitProbMap _emitProbE;
|
||||
EmitProbMap _emitProbM;
|
||||
EmitProbMap _emitProbS;
|
||||
vector<EmitProbMap* > _emitProbVec;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -8,7 +8,6 @@ namespace CppJieba
|
||||
{
|
||||
public:
|
||||
virtual ~ISegment(){};
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
||||
};
|
||||
|
@ -12,13 +12,6 @@ namespace CppJieba
|
||||
/*utf8*/
|
||||
class KeywordExtractor
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
private:
|
||||
unordered_map<string, double> _idfMap;
|
||||
double _idfAverage;
|
||||
|
||||
unordered_set<string> _stopWords;
|
||||
public:
|
||||
KeywordExtractor(){};
|
||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||
@ -27,14 +20,12 @@ namespace CppJieba
|
||||
};
|
||||
~KeywordExtractor(){};
|
||||
|
||||
public:
|
||||
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||
{
|
||||
_loadIdfDict(idfPath);
|
||||
_loadStopWordDict(stopWordPath);
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||
};
|
||||
public:
|
||||
|
||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
||||
{
|
||||
@ -148,7 +139,7 @@ namespace CppJieba
|
||||
}
|
||||
assert(_stopWords.size());
|
||||
}
|
||||
private:
|
||||
|
||||
bool _isSingleWord(const string& str) const
|
||||
{
|
||||
Unicode unicode;
|
||||
@ -158,12 +149,17 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
|
||||
{
|
||||
return lhs.second > rhs.second;
|
||||
}
|
||||
|
||||
private:
|
||||
MixSegment _segment;
|
||||
unordered_map<string, double> _idfMap;
|
||||
double _idfAverage;
|
||||
|
||||
unordered_set<string> _stopWords;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -14,8 +14,6 @@ namespace CppJieba
|
||||
|
||||
class MPSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
DictTrie _dictTrie;
|
||||
|
||||
public:
|
||||
MPSegment(){};
|
||||
@ -24,7 +22,7 @@ namespace CppJieba
|
||||
LIMONP_CHECK(init(dictPath, userDictPath));
|
||||
};
|
||||
virtual ~MPSegment(){};
|
||||
public:
|
||||
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
||||
@ -35,7 +33,7 @@ namespace CppJieba
|
||||
{
|
||||
return _dictTrie.isUserDictSingleChineseWord(value);
|
||||
}
|
||||
public:
|
||||
|
||||
using SegmentBase::cut;
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
@ -141,6 +139,8 @@ namespace CppJieba
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
DictTrie _dictTrie;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -10,17 +10,17 @@ namespace CppJieba
|
||||
{
|
||||
class MixSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
MPSegment _mpSeg;
|
||||
HMMSegment _hmmSeg;
|
||||
public:
|
||||
MixSegment(){};
|
||||
MixSegment()
|
||||
{
|
||||
}
|
||||
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
|
||||
}
|
||||
virtual ~MixSegment(){}
|
||||
public:
|
||||
virtual ~MixSegment()
|
||||
{
|
||||
}
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
||||
@ -28,9 +28,7 @@ namespace CppJieba
|
||||
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
using SegmentBase::cut;
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
vector<Unicode> words;
|
||||
@ -115,6 +113,9 @@ namespace CppJieba
|
||||
{
|
||||
return _mpSeg.getDictTrie();
|
||||
}
|
||||
private:
|
||||
MPSegment _mpSeg;
|
||||
HMMSegment _hmmSeg;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -15,13 +15,10 @@ namespace CppJieba
|
||||
|
||||
class PosTagger
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
const DictTrie * _dictTrie;
|
||||
|
||||
public:
|
||||
PosTagger()
|
||||
{}
|
||||
{
|
||||
}
|
||||
PosTagger(
|
||||
const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
@ -29,9 +26,10 @@ namespace CppJieba
|
||||
)
|
||||
{
|
||||
init(dictPath, hmmFilePath, userDictPath);
|
||||
};
|
||||
~PosTagger(){};
|
||||
public:
|
||||
}
|
||||
~PosTagger()
|
||||
{
|
||||
}
|
||||
void init(
|
||||
const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
@ -103,6 +101,9 @@ namespace CppJieba
|
||||
// the ascii chars contain english letter
|
||||
return POS_ENG;
|
||||
}
|
||||
private:
|
||||
MixSegment _segment;
|
||||
const DictTrie * _dictTrie;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -17,11 +17,6 @@ namespace CppJieba
|
||||
{
|
||||
class QuerySegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
MixSegment _mixSeg;
|
||||
FullSegment _fullSeg;
|
||||
size_t _maxWordLen;
|
||||
|
||||
public:
|
||||
QuerySegment(){};
|
||||
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
||||
@ -29,7 +24,6 @@ namespace CppJieba
|
||||
init(dict, model, maxWordLen, userDict);
|
||||
};
|
||||
virtual ~QuerySegment(){};
|
||||
public:
|
||||
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
|
||||
{
|
||||
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
|
||||
@ -38,11 +32,7 @@ namespace CppJieba
|
||||
_maxWordLen = maxWordLen;
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
using SegmentBase::cut;
|
||||
|
||||
public:
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
if (begin >= end)
|
||||
@ -117,6 +107,11 @@ namespace CppJieba
|
||||
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
MixSegment _mixSeg;
|
||||
FullSegment _fullSeg;
|
||||
size_t _maxWordLen;
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -25,19 +25,6 @@ namespace CppJieba
|
||||
public:
|
||||
SegmentBase(){_loadSpecialSymbols();};
|
||||
virtual ~SegmentBase(){};
|
||||
private:
|
||||
unordered_set<UnicodeValueType> _specialSymbols;
|
||||
private:
|
||||
void _loadSpecialSymbols()
|
||||
{
|
||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||
for(size_t i = 0; i < size; i ++)
|
||||
{
|
||||
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
||||
}
|
||||
assert(_specialSymbols.size());
|
||||
}
|
||||
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||
virtual bool cut(const string& str, vector<string>& res) const
|
||||
@ -72,6 +59,19 @@ namespace CppJieba
|
||||
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
void _loadSpecialSymbols()
|
||||
{
|
||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||
for(size_t i = 0; i < size; i ++)
|
||||
{
|
||||
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
||||
}
|
||||
assert(_specialSymbols.size());
|
||||
}
|
||||
private:
|
||||
unordered_set<UnicodeValueType> _specialSymbols;
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
|
17
src/Trie.hpp
17
src/Trie.hpp
@ -43,12 +43,6 @@ namespace CppJieba
|
||||
|
||||
class TrieNode
|
||||
{
|
||||
public:
|
||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||
public:
|
||||
TrieNode * fail;
|
||||
NextMap * next;
|
||||
const DictUnit * ptValue;
|
||||
public:
|
||||
TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
|
||||
{}
|
||||
@ -65,12 +59,15 @@ namespace CppJieba
|
||||
}
|
||||
return iter->second;
|
||||
}
|
||||
public:
|
||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||
TrieNode * fail;
|
||||
NextMap * next;
|
||||
const DictUnit * ptValue;
|
||||
};
|
||||
|
||||
class Trie
|
||||
{
|
||||
private:
|
||||
TrieNode* _root;
|
||||
public:
|
||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
||||
{
|
||||
@ -230,7 +227,6 @@ namespace CppJieba
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
||||
{
|
||||
if(valuePointers.empty() || keys.empty())
|
||||
@ -244,7 +240,6 @@ namespace CppJieba
|
||||
_insertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
private:
|
||||
void _insertNode(const Unicode& key, const DictUnit* ptValue)
|
||||
{
|
||||
TrieNode* ptNode = _root;
|
||||
@ -291,6 +286,8 @@ namespace CppJieba
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
private:
|
||||
TrieNode* _root;
|
||||
};
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user