little adjustment

This commit is contained in:
yanyiwu 2015-01-31 12:58:49 +08:00
parent 00f738a617
commit 10e9b32258
12 changed files with 103 additions and 124 deletions

View File

@ -92,10 +92,11 @@ curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
["南京市", "长江大桥"]
```
如果有需要**安装使用**的,可以按照如下操作:
因为 HTTP GET 请求有长度限制，如果需要请求长文，请使用 POST 请求。
### 安装服务
如果有需要**安装使用**的,可以按照如下操作:
```
sudo make install
```

View File

@ -25,22 +25,8 @@ namespace CppJieba
class DictTrie
{
private:
vector<DictUnit> _nodeInfos;
Trie * _trie;
public:
double _minWeight;
private:
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
public:
// Reports whether `word` is present in _userDictSingleChineseWord.
// The membership test itself is delegated to the isIn() helper.
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
  const bool known = isIn(_userDictSingleChineseWord, word);
  return known;
}
public:
// Read-only accessor for _minWeight.
double getMinWeight() const
{
  return _minWeight;
}
public:
DictTrie()
{
_trie = NULL;
@ -59,7 +45,6 @@ namespace CppJieba
}
}
public:
bool init(const string& dictPath, const string& userDictPath = "")
{
assert(!_trie);
@ -78,7 +63,6 @@ namespace CppJieba
return true;
}
public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
return _trie->find(begin, end);
@ -95,6 +79,11 @@ namespace CppJieba
{
_trie->find(begin, end, res);
}
// True when `word` is a member of _userDictSingleChineseWord
// (lookup performed through isIn()).
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
  if(isIn(_userDictSingleChineseWord, word))
  {
    return true;
  }
  return false;
}
// Returns the current value of _minWeight.
double getMinWeight() const
{
  return _minWeight;
}
private:
@ -204,7 +193,12 @@ namespace CppJieba
vector<DictUnit>(units.begin(), units.end()).swap(units);
}
private:
vector<DictUnit> _nodeInfos;
Trie * _trie;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
}

View File

@ -14,9 +14,6 @@ namespace CppJieba
{
class FullSegment: public SegmentBase
{
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
public:
FullSegment()
{
@ -41,7 +38,6 @@ namespace CppJieba
}
};
public:
bool init(const string& dictPath)
{
assert(_dictTrie == NULL);
@ -58,10 +54,7 @@ namespace CppJieba
return true;
}
public:
using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
assert(_dictTrie);
@ -147,6 +140,9 @@ namespace CppJieba
return true;
}
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
};
}

View File

@ -24,15 +24,6 @@ namespace CppJieba
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
public:
// Default constructor with an empty body.
HMMSegment()
{
}
@ -101,6 +92,30 @@ namespace CppJieba
}
return true;
}
public:
// Segments [begin, end) and appends one encoded string per word to `res`.
// Returns false for an empty range or when the vector<Unicode> overload of
// cut() fails; returns true otherwise. A failed TransCode::encode() is only
// logged and does not change the return value.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
  if(begin == end)
  {
    return false;
  }

  vector<Unicode> pieces;
  pieces.reserve(end - begin);
  if(!cut(begin, end, pieces))
  {
    return false;
  }

  // Grow `res` once, then encode every piece directly into its own slot.
  const size_t base = res.size();
  res.resize(base + pieces.size());
  for(size_t idx = 0; idx != pieces.size(); ++idx)
  {
    if(!TransCode::encode(pieces[idx], res[base + idx]))
    {
      LogError("encode failed.");
    }
  }
  return true;
}
private:
// sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
@ -168,32 +183,7 @@ namespace CppJieba
}
return true;
}
public:
// String-producing cut: runs the vector<Unicode> overload of cut() on
// [begin, end) and appends each resulting word, encoded via
// TransCode::encode(), to the end of `res`.
// Returns true only when the range is non-empty and the inner cut succeeded;
// encode failures are logged but still yield true.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
  if(begin != end)
  {
    vector<Unicode> unicodeWords;
    unicodeWords.reserve(end - begin);
    if(cut(begin, end, unicodeWords))
    {
      // New entries are written after whatever `res` already holds.
      const size_t firstNew = res.size();
      res.resize(firstNew + unicodeWords.size());
      size_t i = 0;
      while(i < unicodeWords.size())
      {
        if(!TransCode::encode(unicodeWords[i], res[firstNew + i]))
        {
          LogError("encode failed.");
        }
        ++i;
      }
      return true;
    }
  }
  return false;
}
private:
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
{
if(begin == end)
@ -384,6 +374,15 @@ namespace CppJieba
}
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
};
}

View File

@ -8,7 +8,6 @@ namespace CppJieba
{
public:
// Virtual destructor with an empty body, so deleting through an ISegment
// pointer runs the derived class's destructor.
virtual ~ISegment()
{
}
public:
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0;
};

View File

@ -12,13 +12,6 @@ namespace CppJieba
/*utf8*/
class KeywordExtractor
{
private:
MixSegment _segment;
private:
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
public:
// Default constructor with an empty body; it does not call init().
KeywordExtractor()
{
}
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
@ -27,14 +20,12 @@ namespace CppJieba
};
// Destructor with an empty body.
~KeywordExtractor()
{
}
public:
// Loads the IDF dictionary and the stop-word list, then initialises _segment.
//   dictPath, hmmFilePath, userDict - forwarded unchanged to _segment.init().
//   idfPath                         - passed to _loadIdfDict().
//   stopWordPath                    - passed to _loadStopWordDict().
// NOTE(review): LIMONP_CHECK is defined outside this view; presumably it
// aborts when _segment.init() reports failure - confirm.
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{
_loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
};
public:
bool extract(const string& str, vector<string>& keywords, size_t topN) const
{
@ -148,7 +139,7 @@ namespace CppJieba
}
assert(_stopWords.size());
}
private:
bool _isSingleWord(const string& str) const
{
Unicode unicode;
@ -158,12 +149,17 @@ namespace CppJieba
return false;
}
private:
// Ordering predicate on (string, weight) pairs: true when lhs carries the
// strictly larger weight (pair::second), i.e. it orders by descending weight.
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
{
  const double lhsWeight = lhs.second;
  const double rhsWeight = rhs.second;
  return rhsWeight < lhsWeight;
}
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
};
}

View File

@ -14,8 +14,6 @@ namespace CppJieba
class MPSegment: public SegmentBase
{
private:
DictTrie _dictTrie;
public:
// Default constructor with an empty body; it does not call init().
MPSegment()
{
}
@ -24,7 +22,7 @@ namespace CppJieba
LIMONP_CHECK(init(dictPath, userDictPath));
};
// Virtual destructor with an empty body.
virtual ~MPSegment()
{
}
public:
bool init(const string& dictPath, const string& userDictPath = "")
{
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
@ -35,7 +33,7 @@ namespace CppJieba
{
return _dictTrie.isUserDictSingleChineseWord(value);
}
public:
using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
@ -141,6 +139,8 @@ namespace CppJieba
}
}
private:
DictTrie _dictTrie;
};
}

View File

@ -10,17 +10,17 @@ namespace CppJieba
{
class MixSegment: public SegmentBase
{
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
public:
// Default constructor with an empty body; it does not call init().
MixSegment()
{
}
// Default constructor with an empty body; it does not call init().
MixSegment() {}
// Constructs the segmenter and immediately runs init() with the same three
// arguments (userDict defaults to the empty string).
// NOTE(review): LIMONP_CHECK is defined outside this view; presumably it
// aborts when init() returns false - confirm.
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
}
// Virtual destructor with an empty body.
virtual ~MixSegment()
{
}
public:
// Virtual destructor with an empty body.
virtual ~MixSegment() {}
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
@ -28,9 +28,7 @@ namespace CppJieba
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
return true;
}
public:
using SegmentBase::cut;
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
vector<Unicode> words;
@ -115,6 +113,9 @@ namespace CppJieba
{
return _mpSeg.getDictTrie();
}
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
};
}

View File

@ -15,13 +15,10 @@ namespace CppJieba
class PosTagger
{
private:
MixSegment _segment;
const DictTrie * _dictTrie;
public:
PosTagger()
{}
{
}
PosTagger(
const string& dictPath,
const string& hmmFilePath,
@ -29,9 +26,10 @@ namespace CppJieba
)
{
init(dictPath, hmmFilePath, userDictPath);
};
// Destructor with an empty body.
~PosTagger()
{
}
public:
}
// Destructor with an empty body.
~PosTagger() {}
void init(
const string& dictPath,
const string& hmmFilePath,
@ -103,6 +101,9 @@ namespace CppJieba
// the ascii chars contain english letter
return POS_ENG;
}
private:
MixSegment _segment;
const DictTrie * _dictTrie;
};
}

View File

@ -17,11 +17,6 @@ namespace CppJieba
{
class QuerySegment: public SegmentBase
{
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
size_t _maxWordLen;
public:
// Default constructor with an empty body; it does not call init().
QuerySegment()
{
}
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
@ -29,7 +24,6 @@ namespace CppJieba
init(dict, model, maxWordLen, userDict);
};
// Virtual destructor with an empty body.
virtual ~QuerySegment()
{
}
public:
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
{
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
@ -38,11 +32,7 @@ namespace CppJieba
_maxWordLen = maxWordLen;
return true;
}
public:
using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
if (begin >= end)
@ -117,6 +107,11 @@ namespace CppJieba
return true;
}
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
size_t _maxWordLen;
};
}

View File

@ -25,19 +25,6 @@ namespace CppJieba
public:
// Fills the special-symbol set at construction time by calling
// _loadSpecialSymbols().
SegmentBase()
{
  _loadSpecialSymbols();
}
// Virtual destructor with an empty body.
virtual ~SegmentBase()
{
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
private:
// Copies every element of the SPECIAL_SYMBOL array into _specialSymbols and
// asserts that the set is non-empty afterwards.
void _loadSpecialSymbols()
{
  const size_t count = sizeof(SPECIAL_SYMBOL) / sizeof(SPECIAL_SYMBOL[0]);
  // Range insert: equivalent to inserting the elements one by one.
  _specialSymbols.insert(SPECIAL_SYMBOL, SPECIAL_SYMBOL + count);
  assert(_specialSymbols.size());
}
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const
@ -72,6 +59,19 @@ namespace CppJieba
return true;
}
private:
// Inserts each entry of SPECIAL_SYMBOL into _specialSymbols, then asserts
// that at least one symbol ended up in the set.
void _loadSpecialSymbols()
{
  const size_t total = sizeof(SPECIAL_SYMBOL) / sizeof(*SPECIAL_SYMBOL);
  size_t pos = 0;
  while(pos < total)
  {
    _specialSymbols.insert(SPECIAL_SYMBOL[pos]);
    ++pos;
  }
  assert(_specialSymbols.size());
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
};
}

View File

@ -43,12 +43,6 @@ namespace CppJieba
class TrieNode
{
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
public:
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
public:
// Creates a node whose three pointers (fail, next, ptValue) all start as NULL.
TrieNode()
  : fail(NULL),
    next(NULL),
    ptValue(NULL)
{
}
@ -65,12 +59,15 @@ namespace CppJieba
}
return iter->second;
}
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
};
class Trie
{
private:
TrieNode* _root;
public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{
@ -230,7 +227,6 @@ namespace CppJieba
}
}
}
private:
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{
if(valuePointers.empty() || keys.empty())
@ -244,7 +240,6 @@ namespace CppJieba
_insertNode(keys[i], valuePointers[i]);
}
}
private:
void _insertNode(const Unicode& key, const DictUnit* ptValue)
{
TrieNode* ptNode = _root;
@ -291,6 +286,8 @@ namespace CppJieba
}
delete node;
}
private:
TrieNode* _root;
};
}