little adjustment

This commit is contained in:
yanyiwu 2015-01-31 12:58:49 +08:00
parent 00f738a617
commit 10e9b32258
12 changed files with 103 additions and 124 deletions

View File

@ -92,10 +92,11 @@ curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
["南京市", "长江大桥"] ["南京市", "长江大桥"]
``` ```
如果有需要**安装使用**的,可以按照如下操作: 因为 HTTP GET 请求有长度限制如果需要请求长文的请使用POST请求。
### 安装服务 ### 安装服务
如果有需要**安装使用**的,可以按照如下操作:
``` ```
sudo make install sudo make install
``` ```

View File

@ -25,22 +25,8 @@ namespace CppJieba
class DictTrie class DictTrie
{ {
private: public:
vector<DictUnit> _nodeInfos;
Trie * _trie;
double _minWeight;
private:
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
public:
// Reports whether `word` is found in _userDictSingleChineseWord by delegating
// to isIn(). isIn() is defined outside this listing; presumably it is a
// container-membership test -- confirm against its definition.
// Declared const: it only reads member state.
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
return isIn(_userDictSingleChineseWord, word);
}
public:
// Accessor: returns the current value of _minWeight.
double getMinWeight() const {return _minWeight;};
public:
DictTrie() DictTrie()
{ {
_trie = NULL; _trie = NULL;
@ -59,7 +45,6 @@ namespace CppJieba
} }
} }
public:
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "")
{ {
assert(!_trie); assert(!_trie);
@ -78,7 +63,6 @@ namespace CppJieba
return true; return true;
} }
public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{ {
return _trie->find(begin, end); return _trie->find(begin, end);
@ -95,6 +79,11 @@ namespace CppJieba
{ {
_trie->find(begin, end, res); _trie->find(begin, end, res);
} }
// Reports whether `word` is found in _userDictSingleChineseWord by delegating
// to isIn(). isIn() is defined outside this listing; presumably it is a
// container-membership test -- confirm against its definition.
// Declared const: it only reads member state.
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
return isIn(_userDictSingleChineseWord, word);
}
// Accessor: returns the current value of _minWeight.
double getMinWeight() const {return _minWeight;};
private: private:
@ -204,7 +193,12 @@ namespace CppJieba
vector<DictUnit>(units.begin(), units.end()).swap(units); vector<DictUnit>(units.begin(), units.end()).swap(units);
} }
private:
vector<DictUnit> _nodeInfos;
Trie * _trie;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
}; };
} }

View File

@ -14,9 +14,6 @@ namespace CppJieba
{ {
class FullSegment: public SegmentBase class FullSegment: public SegmentBase
{ {
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
public: public:
FullSegment() FullSegment()
{ {
@ -41,7 +38,6 @@ namespace CppJieba
} }
}; };
public:
bool init(const string& dictPath) bool init(const string& dictPath)
{ {
assert(_dictTrie == NULL); assert(_dictTrie == NULL);
@ -58,10 +54,7 @@ namespace CppJieba
return true; return true;
} }
public:
using SegmentBase::cut; using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_dictTrie); assert(_dictTrie);
@ -147,6 +140,9 @@ namespace CppJieba
return true; return true;
} }
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
}; };
} }

View File

@ -24,15 +24,6 @@ namespace CppJieba
* 0:B, 1:E, 2:M, 3:S * 0:B, 1:E, 2:M, 3:S
* */ * */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
public: public:
HMMSegment(){} HMMSegment(){}
@ -101,6 +92,30 @@ namespace CppJieba
} }
return true; return true;
} }
public:
// Segments the range [begin, end) and appends the resulting words, encoded as
// strings, to `res`.
//
// begin, end: the input range to segment.
// res: output vector; existing entries are kept and one new entry is appended
//      per word produced.
//
// Returns false when the range is empty or when the vector<Unicode> overload
// of cut() fails; returns true otherwise. An encode failure is only logged and
// does not change the return value.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
// An empty range is reported as failure, not as an empty result.
if(begin == end)
{
return false;
}
vector<Unicode> words;
// Reserve one slot per element of the input range.
words.reserve(end - begin);
// Delegate the actual segmentation to the vector<Unicode> overload.
if(!cut(begin, end, words))
{
return false;
}
// Grow res once up front; the newly added entries start at `offset`.
size_t offset = res.size();
res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++)
{
// Encode words[i] directly into its destination slot in res.
if(!TransCode::encode(words[i], res[offset + i]))
{
// The failure is logged and the loop continues with the next word.
LogError("encode failed.");
}
}
return true;
}
private: private:
// sequential letters rule // sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
@ -168,32 +183,7 @@ namespace CppJieba
} }
return true; return true;
} }
public:
// Segments the range [begin, end) and appends the resulting words, encoded as
// strings, to `res`.
//
// begin, end: the input range to segment.
// res: output vector; existing entries are kept and one new entry is appended
//      per word produced.
//
// Returns false when the range is empty or when the vector<Unicode> overload
// of cut() fails; returns true otherwise. An encode failure is only logged and
// does not change the return value.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
// An empty range is reported as failure, not as an empty result.
if(begin == end)
{
return false;
}
vector<Unicode> words;
// Reserve one slot per element of the input range.
words.reserve(end - begin);
// Delegate the actual segmentation to the vector<Unicode> overload.
if(!cut(begin, end, words))
{
return false;
}
// Grow res once up front; the newly added entries start at `offset`.
size_t offset = res.size();
res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++)
{
// Encode words[i] directly into its destination slot in res.
if(!TransCode::encode(words[i], res[offset + i]))
{
// The failure is logged and the loop continues with the next word.
LogError("encode failed.");
}
}
return true;
}
private:
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
{ {
if(begin == end) if(begin == end)
@ -384,6 +374,15 @@ namespace CppJieba
} }
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
}; };
} }

View File

@ -8,7 +8,6 @@ namespace CppJieba
{ {
public: public:
virtual ~ISegment(){}; virtual ~ISegment(){};
public:
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0; virtual bool cut(const string& str, vector<string>& res) const = 0;
}; };

View File

@ -12,13 +12,6 @@ namespace CppJieba
/*utf8*/ /*utf8*/
class KeywordExtractor class KeywordExtractor
{ {
private:
MixSegment _segment;
private:
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
public: public:
KeywordExtractor(){}; KeywordExtractor(){};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
@ -27,14 +20,12 @@ namespace CppJieba
}; };
~KeywordExtractor(){}; ~KeywordExtractor(){};
public:
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{ {
_loadIdfDict(idfPath); _loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath); _loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
}; };
public:
bool extract(const string& str, vector<string>& keywords, size_t topN) const bool extract(const string& str, vector<string>& keywords, size_t topN) const
{ {
@ -148,7 +139,7 @@ namespace CppJieba
} }
assert(_stopWords.size()); assert(_stopWords.size());
} }
private:
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const
{ {
Unicode unicode; Unicode unicode;
@ -158,12 +149,17 @@ namespace CppJieba
return false; return false;
} }
private:
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
{ {
return lhs.second > rhs.second; return lhs.second > rhs.second;
} }
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
}; };
} }

View File

@ -14,8 +14,6 @@ namespace CppJieba
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
private:
DictTrie _dictTrie;
public: public:
MPSegment(){}; MPSegment(){};
@ -24,7 +22,7 @@ namespace CppJieba
LIMONP_CHECK(init(dictPath, userDictPath)); LIMONP_CHECK(init(dictPath, userDictPath));
}; };
virtual ~MPSegment(){}; virtual ~MPSegment(){};
public:
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "")
{ {
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath)); LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
@ -35,7 +33,7 @@ namespace CppJieba
{ {
return _dictTrie.isUserDictSingleChineseWord(value); return _dictTrie.isUserDictSingleChineseWord(value);
} }
public:
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
@ -141,6 +139,8 @@ namespace CppJieba
} }
} }
private:
DictTrie _dictTrie;
}; };
} }

View File

@ -10,17 +10,17 @@ namespace CppJieba
{ {
class MixSegment: public SegmentBase class MixSegment: public SegmentBase
{ {
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
public: public:
MixSegment(){}; MixSegment()
{
}
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{ {
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict)); LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
} }
virtual ~MixSegment(){} virtual ~MixSegment()
public: {
}
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{ {
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict)); LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
@ -28,9 +28,7 @@ namespace CppJieba
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
return true; return true;
} }
public:
using SegmentBase::cut; using SegmentBase::cut;
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
vector<Unicode> words; vector<Unicode> words;
@ -115,6 +113,9 @@ namespace CppJieba
{ {
return _mpSeg.getDictTrie(); return _mpSeg.getDictTrie();
} }
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
}; };
} }

View File

@ -15,13 +15,10 @@ namespace CppJieba
class PosTagger class PosTagger
{ {
private:
MixSegment _segment;
const DictTrie * _dictTrie;
public: public:
PosTagger() PosTagger()
{} {
}
PosTagger( PosTagger(
const string& dictPath, const string& dictPath,
const string& hmmFilePath, const string& hmmFilePath,
@ -29,9 +26,10 @@ namespace CppJieba
) )
{ {
init(dictPath, hmmFilePath, userDictPath); init(dictPath, hmmFilePath, userDictPath);
}; }
~PosTagger(){}; ~PosTagger()
public: {
}
void init( void init(
const string& dictPath, const string& dictPath,
const string& hmmFilePath, const string& hmmFilePath,
@ -103,6 +101,9 @@ namespace CppJieba
// the ascii chars contain english letter // the ascii chars contain english letter
return POS_ENG; return POS_ENG;
} }
private:
MixSegment _segment;
const DictTrie * _dictTrie;
}; };
} }

View File

@ -17,11 +17,6 @@ namespace CppJieba
{ {
class QuerySegment: public SegmentBase class QuerySegment: public SegmentBase
{ {
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
size_t _maxWordLen;
public: public:
QuerySegment(){}; QuerySegment(){};
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
@ -29,7 +24,6 @@ namespace CppJieba
init(dict, model, maxWordLen, userDict); init(dict, model, maxWordLen, userDict);
}; };
virtual ~QuerySegment(){}; virtual ~QuerySegment(){};
public:
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
{ {
LIMONP_CHECK(_mixSeg.init(dict, model, userDict)); LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
@ -38,11 +32,7 @@ namespace CppJieba
_maxWordLen = maxWordLen; _maxWordLen = maxWordLen;
return true; return true;
} }
public:
using SegmentBase::cut; using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
if (begin >= end) if (begin >= end)
@ -117,6 +107,11 @@ namespace CppJieba
return true; return true;
} }
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
size_t _maxWordLen;
}; };
} }

View File

@ -25,19 +25,6 @@ namespace CppJieba
public: public:
SegmentBase(){_loadSpecialSymbols();}; SegmentBase(){_loadSpecialSymbols();};
virtual ~SegmentBase(){}; virtual ~SegmentBase(){};
private:
unordered_set<UnicodeValueType> _specialSymbols;
private:
// Fills _specialSymbols with every element of SPECIAL_SYMBOL.
// SPECIAL_SYMBOL is declared outside this listing; the sizeof-based element
// count below assumes it is a true array (not a pointer) at this point --
// confirm against its declaration.
void _loadSpecialSymbols()
{
// Element count = total array size divided by the size of one element.
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++)
{
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
}
// Sanity check: at least one symbol must have been inserted.
assert(_specialSymbols.size());
}
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const virtual bool cut(const string& str, vector<string>& res) const
@ -72,6 +59,19 @@ namespace CppJieba
return true; return true;
} }
private:
// Fills _specialSymbols with every element of SPECIAL_SYMBOL.
// SPECIAL_SYMBOL is declared outside this listing; the sizeof-based element
// count below assumes it is a true array (not a pointer) at this point --
// confirm against its declaration.
void _loadSpecialSymbols()
{
// Element count = total array size divided by the size of one element.
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++)
{
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
}
// Sanity check: at least one symbol must have been inserted.
assert(_specialSymbols.size());
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
}; };
} }

View File

@ -43,12 +43,6 @@ namespace CppJieba
class TrieNode class TrieNode
{ {
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
public:
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
public: public:
TrieNode(): fail(NULL), next(NULL), ptValue(NULL) TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
{} {}
@ -65,12 +59,15 @@ namespace CppJieba
} }
return iter->second; return iter->second;
} }
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
}; };
class Trie class Trie
{ {
private:
TrieNode* _root;
public: public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{ {
@ -230,7 +227,6 @@ namespace CppJieba
} }
} }
} }
private:
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{ {
if(valuePointers.empty() || keys.empty()) if(valuePointers.empty() || keys.empty())
@ -244,7 +240,6 @@ namespace CppJieba
_insertNode(keys[i], valuePointers[i]); _insertNode(keys[i], valuePointers[i]);
} }
} }
private:
void _insertNode(const Unicode& key, const DictUnit* ptValue) void _insertNode(const Unicode& key, const DictUnit* ptValue)
{ {
TrieNode* ptNode = _root; TrieNode* ptNode = _root;
@ -291,6 +286,8 @@ namespace CppJieba
} }
delete node; delete node;
} }
private:
TrieNode* _root;
}; };
} }