astyle --style=google --indent=spaces=2

This commit is contained in:
yanyiwu 2015-05-06 17:53:20 +08:00
parent b70875f412
commit bb32234654
21 changed files with 1727 additions and 2071 deletions

View File

@ -12,80 +12,70 @@
using namespace Husky; using namespace Husky;
using namespace CppJieba; using namespace CppJieba;
class ReqHandler: public IRequestHandler class ReqHandler: public IRequestHandler {
{ public:
public: ReqHandler(const string& dictPath, const string& modelPath, const string& userDictPath): _segment(dictPath, modelPath, userDictPath) {};
ReqHandler(const string& dictPath, const string& modelPath, const string& userDictPath): _segment(dictPath, modelPath, userDictPath){}; virtual ~ReqHandler() {};
virtual ~ReqHandler(){}; public:
public: virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) const {
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) const string sentence, tmp;
{ vector<string> words;
string sentence, tmp; httpReq.GET("key", tmp);
vector<string> words; URLDecode(tmp, sentence);
httpReq.GET("key", tmp); _segment.cut(sentence, words);
URLDecode(tmp, sentence); if(httpReq.GET("format", tmp) && tmp == "simple") {
_segment.cut(sentence, words); join(words.begin(), words.end(), strSnd, " ");
if(httpReq.GET("format", tmp) && tmp == "simple") return true;
{ }
join(words.begin(), words.end(), strSnd, " "); strSnd << words;
return true; return true;
} }
strSnd << words; virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const {
return true; vector<string> words;
} _segment.cut(httpReq.getBody(), words);
virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const strSnd << words;
{ return true;
vector<string> words; }
_segment.cut(httpReq.getBody(), words); private:
strSnd << words; MixSegment _segment;
return true;
}
private:
MixSegment _segment;
}; };
bool run(int argc, char** argv) bool run(int argc, char** argv) {
{ if(argc < 2) {
if(argc < 2) return false;
{ }
return false; Config conf(argv[1]);
} if(!conf) {
Config conf(argv[1]); return false;
if(!conf) }
{ int port = 0;
return false; int threadNumber = 0;
} int queueMaxSize = 0;
int port = 0; string dictPath;
int threadNumber = 0; string modelPath;
int queueMaxSize = 0; string userDictPath;
string dictPath; LIMONP_CHECK(conf.get("port", port));
string modelPath; LIMONP_CHECK(conf.get("thread_number", threadNumber));
string userDictPath; LIMONP_CHECK(conf.get("queue_max_size", queueMaxSize));
LIMONP_CHECK(conf.get("port", port)); LIMONP_CHECK(conf.get("dict_path", dictPath));
LIMONP_CHECK(conf.get("thread_number", threadNumber)); LIMONP_CHECK(conf.get("model_path", modelPath));
LIMONP_CHECK(conf.get("queue_max_size", queueMaxSize)); if(!conf.get("user_dict_path", userDictPath)) { //optional
LIMONP_CHECK(conf.get("dict_path", dictPath)); userDictPath = "";
LIMONP_CHECK(conf.get("model_path", modelPath)); }
if(!conf.get("user_dict_path", userDictPath)) //optional
{
userDictPath = "";
}
LogInfo("config info: %s", conf.getConfigInfo().c_str()); LogInfo("config info: %s", conf.getConfigInfo().c_str());
ReqHandler reqHandler(dictPath, modelPath, userDictPath); ReqHandler reqHandler(dictPath, modelPath, userDictPath);
ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler); ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler);
return sf.start(); return sf.start();
} }
int main(int argc, char* argv[]) int main(int argc, char* argv[]) {
{ if(!run(argc, argv)) {
if(!run(argc, argv)) printf("usage: %s <config_file>\n", argv[0]);
{ return EXIT_FAILURE;
printf("usage: %s <config_file>\n", argv[0]); }
return EXIT_FAILURE; return EXIT_SUCCESS;
}
return EXIT_SUCCESS;
} }

View File

@ -15,206 +15,174 @@
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp; const double MIN_DOUBLE = -3.14e+100;
const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100;
const double MAX_DOUBLE = 3.14e+100; const size_t DICT_COLUMN_NUM = 3;
const size_t DICT_COLUMN_NUM = 3; const char* const UNKNOWN_TAG = "";
const char* const UNKNOWN_TAG = "";
class DictTrie class DictTrie {
{ public:
public:
DictTrie() DictTrie() {
{ _trie = NULL;
_trie = NULL; _minWeight = MAX_DOUBLE;
_minWeight = MAX_DOUBLE; }
} DictTrie(const string& dictPath, const string& userDictPath = "") {
DictTrie(const string& dictPath, const string& userDictPath = "") new (this) DictTrie();
{ init(dictPath, userDictPath);
new (this) DictTrie(); }
init(dictPath, userDictPath); ~DictTrie() {
} if(_trie) {
~DictTrie() delete _trie;
{ }
if(_trie) }
{
delete _trie;
}
}
bool init(const string& dictPath, const string& userDictPath = "")
{
if(_trie != NULL)
{
LogFatal("trie already initted");
}
_loadDict(dictPath);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size())
{
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
}
_shrink(_nodeInfos);
_trie = _createTrie(_nodeInfos);
assert(_trie);
return true;
}
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const bool init(const string& dictPath, const string& userDictPath = "") {
{ if(_trie != NULL) {
return _trie->find(begin, end); LogFatal("trie already initted");
} }
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const _loadDict(dictPath);
{ _calculateWeight(_nodeInfos);
return _trie->find(begin, end, dag, offset); _minWeight = _findMinWeight(_nodeInfos);
}
void find( if(userDictPath.size()) {
Unicode::const_iterator begin, double maxWeight = _findMaxWeight(_nodeInfos);
Unicode::const_iterator end, _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
vector<SegmentChar>& res }
) const _shrink(_nodeInfos);
{ _trie = _createTrie(_nodeInfos);
_trie->find(begin, end, res); assert(_trie);
} return true;
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const }
{
return isIn(_userDictSingleChineseWord, word); const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
} return _trie->find(begin, end);
double getMinWeight() const {return _minWeight;}; }
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
return _trie->find(begin, end, dag, offset);
}
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<SegmentChar>& res
) const {
_trie->find(begin, end, res);
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
return isIn(_userDictSingleChineseWord, word);
}
double getMinWeight() const {
return _minWeight;
};
private: private:
Trie * _createTrie(const vector<DictUnit>& dictUnits) Trie * _createTrie(const vector<DictUnit>& dictUnits) {
{ assert(dictUnits.size());
assert(dictUnits.size()); vector<Unicode> words;
vector<Unicode> words; vector<const DictUnit*> valuePointers;
vector<const DictUnit*> valuePointers; for(size_t i = 0 ; i < dictUnits.size(); i ++) {
for(size_t i = 0 ; i < dictUnits.size(); i ++) words.push_back(dictUnits[i].word);
{ valuePointers.push_back(&dictUnits[i]);
words.push_back(dictUnits[i].word); }
valuePointers.push_back(&dictUnits[i]);
}
Trie * trie = new Trie(words, valuePointers); Trie * trie = new Trie(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
{ ifstream ifs(filePath.c_str());
ifstream ifs(filePath.c_str()); if(!ifs.is_open()) {
if(!ifs.is_open()) LogFatal("file %s open failed.", filePath.c_str());
{ }
LogFatal("file %s open failed.", filePath.c_str()); string line;
} DictUnit nodeInfo;
string line; vector<string> buf;
DictUnit nodeInfo; size_t lineno;
vector<string> buf; for(lineno = 0; getline(ifs, line); lineno++) {
size_t lineno; buf.clear();
for(lineno = 0; getline(ifs, line); lineno++) split(line, buf, " ");
{ if(buf.size() < 1) {
buf.clear(); LogFatal("split [%s] result illegal", line.c_str());
split(line, buf, " "); }
if(buf.size() < 1) if(!TransCode::decode(buf[0], nodeInfo.word)) {
{ LogError("line[%u:%s] illegal.", lineno, line.c_str());
LogFatal("split [%s] result illegal", line.c_str()); continue;
} }
if(!TransCode::decode(buf[0], nodeInfo.word)) if(nodeInfo.word.size() == 1) {
{ _userDictSingleChineseWord.insert(nodeInfo.word[0]);
LogError("line[%u:%s] illegal.", lineno, line.c_str()); }
continue; nodeInfo.weight = defaultWeight;
} nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
if(nodeInfo.word.size() == 1) _nodeInfos.push_back(nodeInfo);
{ }
_userDictSingleChineseWord.insert(nodeInfo.word[0]); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
nodeInfo.weight = defaultWeight; void _loadDict(const string& filePath) {
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); ifstream ifs(filePath.c_str());
_nodeInfos.push_back(nodeInfo); if(!ifs.is_open()) {
} LogFatal("file %s open failed.", filePath.c_str());
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); }
} string line;
void _loadDict(const string& filePath) vector<string> buf;
{
ifstream ifs(filePath.c_str());
if(!ifs.is_open())
{
LogFatal("file %s open failed.", filePath.c_str());
}
string line;
vector<string> buf;
DictUnit nodeInfo; DictUnit nodeInfo;
for(size_t lineno = 0; getline(ifs, line); lineno++) for(size_t lineno = 0; getline(ifs, line); lineno++) {
{ split(line, buf, " ");
split(line, buf, " "); if(buf.size() != DICT_COLUMN_NUM) {
if(buf.size() != DICT_COLUMN_NUM) LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
{ }
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
}
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo);
}
}
double _findMinWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
void _calculateWeight(vector<DictUnit>& nodeInfos) const if(!TransCode::decode(buf[0], nodeInfo.word)) {
{ LogError("line[%u:%s] illegal.", lineno, line.c_str());
double sum = 0.0; continue;
for(size_t i = 0; i < nodeInfos.size(); i++) }
{ nodeInfo.weight = atof(buf[1].c_str());
sum += nodeInfos[i].weight; nodeInfo.tag = buf[2];
}
assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++)
{
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
}
}
void _shrink(vector<DictUnit>& units) const _nodeInfos.push_back(nodeInfo);
{ }
vector<DictUnit>(units.begin(), units.end()).swap(units); }
} double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
private: void _calculateWeight(vector<DictUnit>& nodeInfos) const {
vector<DictUnit> _nodeInfos; double sum = 0.0;
Trie * _trie; for(size_t i = 0; i < nodeInfos.size(); i++) {
sum += nodeInfos[i].weight;
}
assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++) {
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
}
}
double _minWeight; void _shrink(vector<DictUnit>& units) const {
unordered_set<Unicode::value_type> _userDictSingleChineseWord; vector<DictUnit>(units.begin(), units.end()).swap(units);
}; }
private:
vector<DictUnit> _nodeInfos;
Trie * _trie;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
} }
#endif #endif

View File

@ -10,140 +10,116 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
namespace CppJieba namespace CppJieba {
{ class FullSegment: public SegmentBase {
class FullSegment: public SegmentBase public:
{ FullSegment() {
public: _dictTrie = NULL;
FullSegment() _isBorrowed = false;
{ }
_dictTrie = NULL; explicit FullSegment(const string& dictPath) {
_isBorrowed = false; _dictTrie = NULL;
} init(dictPath);
explicit FullSegment(const string& dictPath) }
{ explicit FullSegment(const DictTrie* dictTrie) {
_dictTrie = NULL; _dictTrie = NULL;
init(dictPath); init(dictTrie);
} }
explicit FullSegment(const DictTrie* dictTrie) virtual ~FullSegment() {
{ if(_dictTrie && ! _isBorrowed) {
_dictTrie = NULL; delete _dictTrie;
init(dictTrie); }
}
virtual ~FullSegment()
{
if(_dictTrie && ! _isBorrowed)
{
delete _dictTrie;
}
}; };
bool init(const string& dictPath) bool init(const string& dictPath) {
{ assert(_dictTrie == NULL);
assert(_dictTrie == NULL); _dictTrie = new DictTrie(dictPath);
_dictTrie = new DictTrie(dictPath); _isBorrowed = false;
_isBorrowed = false; return true;
return true; }
} bool init(const DictTrie* dictTrie) {
bool init(const DictTrie* dictTrie) assert(_dictTrie == NULL);
{ assert(dictTrie);
assert(_dictTrie == NULL); _dictTrie = dictTrie;
assert(dictTrie); _isBorrowed = true;
_dictTrie = dictTrie; return true;
_isBorrowed = true; }
return true;
}
using SegmentBase::cut; using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{ assert(_dictTrie);
assert(_dictTrie); if (begin >= end) {
if (begin >= end) LogError("begin >= end");
{ return false;
LogError("begin >= end"); }
return false;
}
//resut of searching in trie tree //resut of searching in trie tree
DagType tRes; DagType tRes;
//max index of res's words //max index of res's words
int maxIdx = 0; int maxIdx = 0;
// always equals to (uItr - begin) // always equals to (uItr - begin)
int uIdx = 0; int uIdx = 0;
//tmp variables //tmp variables
int wordLen = 0; int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
{ //find word start from uItr
//find word start from uItr if (_dictTrie->find(uItr, end, tRes, 0)) {
if (_dictTrie->find(uItr, end, tRes, 0)) for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) {
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) wordLen = itr->second->word.size();
{ if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
wordLen = itr->second->word.size(); res.push_back(itr->second->word);
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) }
{ maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
res.push_back(itr->second->word); }
} tRes.clear();
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; } else { // not found word start from uItr
} if (maxIdx <= uIdx) { // never exist in prev results
tRes.clear(); //put itr itself in res
} res.push_back(Unicode(1, *uItr));
else // not found word start from uItr
{
if (maxIdx <= uIdx) // never exist in prev results
{
//put itr itself in res
res.push_back(Unicode(1, *uItr));
//mark it exits //mark it exits
++maxIdx; ++maxIdx;
} }
} }
++uIdx; ++uIdx;
} }
return true; return true;
} }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
{ assert(_dictTrie);
assert(_dictTrie); if (begin >= end) {
if (begin >= end) LogError("begin >= end");
{ return false;
LogError("begin >= end"); }
return false;
}
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{ LogError("get unicode cut result error.");
LogError("get unicode cut result error."); return false;
return false; }
}
string tmp; string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
{ if (TransCode::encode(*uItr, tmp)) {
if (TransCode::encode(*uItr, tmp)) res.push_back(tmp);
{ } else {
res.push_back(tmp); LogError("encode failed.");
} }
else }
{
LogError("encode failed.");
}
}
return true; return true;
} }
private: private:
const DictTrie* _dictTrie; const DictTrie* _dictTrie;
bool _isBorrowed; bool _isBorrowed;
}; };
} }
#endif #endif

View File

@ -12,387 +12,315 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp; typedef unordered_map<uint16_t, double> EmitProbMap;
typedef unordered_map<uint16_t, double> EmitProbMap; class HMMSegment: public SegmentBase {
class HMMSegment: public SegmentBase public:
{ /*
public: * STATUS:
/* * 0:B, 1:E, 2:M, 3:S
* STATUS: * */
* 0:B, 1:E, 2:M, 3:S enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
public: public:
HMMSegment(){} HMMSegment() {}
explicit HMMSegment(const string& filePath) explicit HMMSegment(const string& filePath) {
{ LIMONP_CHECK(init(filePath));
LIMONP_CHECK(init(filePath)); }
} virtual ~HMMSegment() {}
virtual ~HMMSegment(){} public:
public: bool init(const string& filePath) {
bool init(const string& filePath) memset(_startProb, 0, sizeof(_startProb));
{ memset(_transProb, 0, sizeof(_transProb));
memset(_startProb, 0, sizeof(_startProb)); _statMap[0] = 'B';
memset(_transProb, 0, sizeof(_transProb)); _statMap[1] = 'E';
_statMap[0] = 'B'; _statMap[2] = 'M';
_statMap[1] = 'E'; _statMap[3] = 'S';
_statMap[2] = 'M'; _emitProbVec.push_back(&_emitProbB);
_statMap[3] = 'S'; _emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbB); _emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbE); _emitProbVec.push_back(&_emitProbS);
_emitProbVec.push_back(&_emitProbM); LIMONP_CHECK(_loadModel(filePath.c_str()));
_emitProbVec.push_back(&_emitProbS); LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
LIMONP_CHECK(_loadModel(filePath.c_str())); return true;
LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); }
return true; public:
} using SegmentBase::cut;
public: public:
using SegmentBase::cut; bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
public: Unicode::const_iterator left = begin;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const Unicode::const_iterator right = begin;
{ while(right != end) {
Unicode::const_iterator left = begin; if(*right < 0x80) {
Unicode::const_iterator right = begin; if(left != right && !_cut(left, right, res)) {
while(right != end) return false;
{ }
if(*right < 0x80) left = right;
{ do {
if(left != right && !_cut(left, right, res)) right = _sequentialLetterRule(left, end);
{ if(right != left) {
return false; break;
} }
left = right; right = _numbersRule(left, end);
do { if(right != left) {
right = _sequentialLetterRule(left, end); break;
if(right != left) }
{ right ++;
break; } while(false);
} res.push_back(Unicode(left, right));
right = _numbersRule(left, end); left = right;
if(right != left) } else {
{ right++;
break; }
} }
right ++; if(left != right && !_cut(left, right, res)) {
} while(false); return false;
res.push_back(Unicode(left, right)); }
left = right; return true;
} }
else public:
{ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
right++; if(begin == end) {
} return false;
} }
if(left != right && !_cut(left, right, res)) vector<Unicode> words;
{ words.reserve(end - begin);
return false; if(!cut(begin, end, words)) {
} return false;
return true; }
} size_t offset = res.size();
public: res.resize(res.size() + words.size());
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const for(size_t i = 0; i < words.size(); i++) {
{ if(!TransCode::encode(words[i], res[offset + i])) {
if(begin == end) LogError("encode failed.");
{ }
return false; }
} return true;
vector<Unicode> words; }
words.reserve(end - begin); private:
if(!cut(begin, end, words)) // sequential letters rule
{ Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
return false; Unicode::value_type x = *begin;
} if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
size_t offset = res.size(); begin ++;
res.resize(res.size() + words.size()); } else {
for(size_t i = 0; i < words.size(); i++) return begin;
{ }
if(!TransCode::encode(words[i], res[offset + i])) while(begin != end) {
{ x = *begin;
LogError("encode failed."); if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
} begin ++;
} } else {
return true; break;
} }
private: }
// sequential letters rule return begin;
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const }
{ //
Unicode::value_type x = *begin; Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) Unicode::value_type x = *begin;
{ if('0' <= x && x <= '9') {
begin ++; begin ++;
} } else {
else return begin;
{ }
return begin; while(begin != end) {
} x = *begin;
while(begin != end) if( ('0' <= x && x <= '9') || x == '.') {
{ begin++;
x = *begin; } else {
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) break;
{ }
begin ++; }
} return begin;
else }
{ bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
break; vector<size_t> status;
} if(!_viterbi(begin, end, status)) {
} LogError("_viterbi failed.");
return begin; return false;
} }
//
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
Unicode::value_type x = *begin;
if('0' <= x && x <= '9')
{
begin ++;
}
else
{
return begin;
}
while(begin != end)
{
x = *begin;
if( ('0' <= x && x <= '9') || x == '.')
{
begin++;
}
else
{
break;
}
}
return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
vector<size_t> status;
if(!_viterbi(begin, end, status))
{
LogError("_viterbi failed.");
return false;
}
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right; Unicode::const_iterator right;
for(size_t i = 0; i < status.size(); i++) for(size_t i = 0; i < status.size(); i++) {
{ if(status[i] % 2) { //if(E == status[i] || S == status[i])
if(status[i] % 2) //if(E == status[i] || S == status[i]) right = begin + i + 1;
{ res.push_back(Unicode(left, right));
right = begin + i + 1; left = right;
res.push_back(Unicode(left, right)); }
left = right; }
} return true;
} }
return true;
}
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
{ if(begin == end) {
if(begin == end) return false;
{ }
return false;
}
size_t Y = STATUS_SUM; size_t Y = STATUS_SUM;
size_t X = end - begin; size_t X = end - begin;
size_t XYSize = X * Y; size_t XYSize = X * Y;
size_t now, old, stat; size_t now, old, stat;
double tmp, endE, endS; double tmp, endE, endS;
vector<int> path(XYSize); vector<int> path(XYSize);
vector<double> weight(XYSize); vector<double> weight(XYSize);
//start //start
for(size_t y = 0; y < Y; y++) for(size_t y = 0; y < Y; y++) {
{ weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); path[0 + y * X] = -1;
path[0 + y * X] = -1; }
}
double emitProb; double emitProb;
for(size_t x = 1; x < X; x++) for(size_t x = 1; x < X; x++) {
{ for(size_t y = 0; y < Y; y++) {
for(size_t y = 0; y < Y; y++) now = x + y*X;
{ weight[now] = MIN_DOUBLE;
now = x + y*X; path[now] = E; // warning
weight[now] = MIN_DOUBLE; emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
path[now] = E; // warning for(size_t preY = 0; preY < Y; preY++) {
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); old = x - 1 + preY * X;
for(size_t preY = 0; preY < Y; preY++) tmp = weight[old] + _transProb[preY][y] + emitProb;
{ if(tmp > weight[now]) {
old = x - 1 + preY * X; weight[now] = tmp;
tmp = weight[old] + _transProb[preY][y] + emitProb; path[now] = preY;
if(tmp > weight[now]) }
{ }
weight[now] = tmp; }
path[now] = preY; }
}
}
}
}
endE = weight[X-1+E*X]; endE = weight[X-1+E*X];
endS = weight[X-1+S*X]; endS = weight[X-1+S*X];
stat = 0; stat = 0;
if(endE >= endS) if(endE >= endS) {
{ stat = E;
stat = E; } else {
} stat = S;
else }
{
stat = S;
}
status.resize(X); status.resize(X);
for(int x = X -1 ; x >= 0; x--) for(int x = X -1 ; x >= 0; x--) {
{ status[x] = stat;
status[x] = stat; stat = path[x + stat*X];
stat = path[x + stat*X]; }
}
return true; return true;
} }
bool _loadModel(const char* const filePath) bool _loadModel(const char* const filePath) {
{ ifstream ifile(filePath);
ifstream ifile(filePath); string line;
string line; vector<string> tmp;
vector<string> tmp; vector<string> tmp2;
vector<string> tmp2; //load _startProb
//load _startProb if(!_getLine(ifile, line)) {
if(!_getLine(ifile, line)) return false;
{ }
return false; split(line, tmp, " ");
} if(tmp.size() != STATUS_SUM) {
split(line, tmp, " "); LogError("start_p illegal");
if(tmp.size() != STATUS_SUM) return false;
{ }
LogError("start_p illegal"); for(size_t j = 0; j< tmp.size(); j++) {
return false; _startProb[j] = atof(tmp[j].c_str());
} }
for(size_t j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
}
//load _transProb //load _transProb
for(size_t i = 0; i < STATUS_SUM; i++) for(size_t i = 0; i < STATUS_SUM; i++) {
{ if(!_getLine(ifile, line)) {
if(!_getLine(ifile, line)) return false;
{ }
return false; split(line, tmp, " ");
} if(tmp.size() != STATUS_SUM) {
split(line, tmp, " "); LogError("trans_p illegal");
if(tmp.size() != STATUS_SUM) return false;
{ }
LogError("trans_p illegal"); for(size_t j =0; j < STATUS_SUM; j++) {
return false; _transProb[i][j] = atof(tmp[j].c_str());
} }
for(size_t j =0; j < STATUS_SUM; j++) }
{
_transProb[i][j] = atof(tmp[j].c_str());
}
}
//load _emitProbB //load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
{ return false;
return false; }
}
//load _emitProbE //load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
{ return false;
return false; }
}
//load _emitProbM //load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
{ return false;
return false; }
}
//load _emitProbS //load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
{ return false;
return false; }
}
return true; return true;
} }
bool _getLine(ifstream& ifile, string& line) bool _getLine(ifstream& ifile, string& line) {
{ while(getline(ifile, line)) {
while(getline(ifile, line)) trim(line);
{ if(line.empty()) {
trim(line); continue;
if(line.empty()) }
{ if(startsWith(line, "#")) {
continue; continue;
} }
if(startsWith(line, "#")) return true;
{ }
continue; return false;
} }
return true; bool _loadEmitProb(const string& line, EmitProbMap& mp) {
} if(line.empty()) {
return false; return false;
} }
bool _loadEmitProb(const string& line, EmitProbMap& mp) vector<string> tmp, tmp2;
{ Unicode unicode;
if(line.empty()) split(line, tmp, ",");
{ for(size_t i = 0; i < tmp.size(); i++) {
return false; split(tmp[i], tmp2, ":");
} if(2 != tmp2.size()) {
vector<string> tmp, tmp2; LogError("_emitProb illegal.");
Unicode unicode; return false;
split(line, tmp, ","); }
for(size_t i = 0; i < tmp.size(); i++) if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
{ LogError("TransCode failed.");
split(tmp[i], tmp2, ":"); return false;
if(2 != tmp2.size()) }
{ mp[unicode[0]] = atof(tmp2[1].c_str());
LogError("_emitProb illegal."); }
return false; return true;
} }
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
{ EmitProbMap::const_iterator cit = ptMp->find(key);
LogError("TransCode failed."); if(cit == ptMp->end()) {
return false; return defVal;
} }
mp[unicode[0]] = atof(tmp2[1].c_str()); return cit->second;
}
return true;
}
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
{
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end())
{
return defVal;
}
return cit->second;
} }
private: private:
char _statMap[STATUS_SUM]; char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM]; double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM]; double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB; EmitProbMap _emitProbB;
EmitProbMap _emitProbE; EmitProbMap _emitProbE;
EmitProbMap _emitProbM; EmitProbMap _emitProbM;
EmitProbMap _emitProbS; EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec; vector<EmitProbMap* > _emitProbVec;
}; };
} }
#endif #endif

View File

@ -2,15 +2,13 @@
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H
namespace CppJieba namespace CppJieba {
{ class ISegment {
class ISegment public:
{ virtual ~ISegment() {};
public: virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual ~ISegment(){}; virtual bool cut(const string& str, vector<string>& res) const = 0;
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; };
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
} }
#endif #endif

View File

@ -5,160 +5,134 @@
#include <cmath> #include <cmath>
#include <set> #include <set>
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp;
/*utf8*/ /*utf8*/
class KeywordExtractor class KeywordExtractor {
{ public:
public: KeywordExtractor() {};
KeywordExtractor(){}; KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
{ };
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); ~KeywordExtractor() {};
};
~KeywordExtractor(){};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{ _loadIdfDict(idfPath);
_loadIdfDict(idfPath); _loadStopWordDict(stopWordPath);
_loadStopWordDict(stopWordPath); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); };
};
bool extract(const string& str, vector<string>& keywords, size_t topN) const bool extract(const string& str, vector<string>& keywords, size_t topN) const {
{ vector<pair<string, double> > topWords;
vector<pair<string, double> > topWords; if(!extract(str, topWords, topN)) {
if(!extract(str, topWords, topN)) return false;
{ }
return false; for(size_t i = 0; i < topWords.size(); i++) {
} keywords.push_back(topWords[i].first);
for(size_t i = 0; i < topWords.size(); i++) }
{ return true;
keywords.push_back(topWords[i].first); }
}
return true;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
{ vector<string> words;
vector<string> words; if(!_segment.cut(str, words)) {
if(!_segment.cut(str, words)) LogError("segment cut(%s) failed.", str.c_str());
{ return false;
LogError("segment cut(%s) failed.", str.c_str()); }
return false;
}
map<string, double> wordmap; map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
{ if(_isSingleWord(*iter)) {
if(_isSingleWord(*iter)) continue;
{ }
continue; wordmap[*iter] += 1.0;
} }
wordmap[*iter] += 1.0;
}
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
{ if(_stopWords.end() != _stopWords.find(itr->first)) {
if(_stopWords.end() != _stopWords.find(itr->first)) wordmap.erase(itr++);
{ continue;
wordmap.erase(itr++); }
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) if(cit != _idfMap.end()) {
{ itr->second *= cit->second;
itr->second *= cit->second; } else {
} itr->second *= _idfAverage;
else }
{ itr ++;
itr->second *= _idfAverage; }
}
itr ++;
}
keywords.clear(); keywords.clear();
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
topN = min(topN, keywords.size()); topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
keywords.resize(topN); keywords.resize(topN);
return true; return true;
} }
private: private:
void _loadIdfDict(const string& idfPath) void _loadIdfDict(const string& idfPath) {
{ ifstream ifs(idfPath.c_str());
ifstream ifs(idfPath.c_str()); if(!ifs.is_open()) {
if(!ifs.is_open()) LogFatal("open %s failed.", idfPath.c_str());
{ }
LogFatal("open %s failed.", idfPath.c_str()); string line ;
} vector<string> buf;
string line ; double idf = 0.0;
vector<string> buf; double idfSum = 0.0;
double idf = 0.0; size_t lineno = 0;
double idfSum = 0.0; for(; getline(ifs, line); lineno++) {
size_t lineno = 0; buf.clear();
for(;getline(ifs, line); lineno++) if(line.empty()) {
{ LogError("line[%d] empty. skipped.", lineno);
buf.clear(); continue;
if(line.empty()) }
{ if(!split(line, buf, " ") || buf.size() != 2) {
LogError("line[%d] empty. skipped.", lineno); LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue; continue;
} }
if(!split(line, buf, " ") || buf.size() != 2) idf = atof(buf[1].c_str());
{ _idfMap[buf[0]] = idf;
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); idfSum += idf;
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
} }
assert(lineno); assert(lineno);
_idfAverage = idfSum / lineno; _idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0); assert(_idfAverage > 0.0);
} }
void _loadStopWordDict(const string& filePath) void _loadStopWordDict(const string& filePath) {
{ ifstream ifs(filePath.c_str());
ifstream ifs(filePath.c_str()); if(!ifs.is_open()) {
if(!ifs.is_open()) LogFatal("open %s failed.", filePath.c_str());
{ }
LogFatal("open %s failed.", filePath.c_str()); string line ;
} while(getline(ifs, line)) {
string line ; _stopWords.insert(line);
while(getline(ifs, line)) }
{ assert(_stopWords.size());
_stopWords.insert(line); }
}
assert(_stopWords.size());
}
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const {
{ Unicode unicode;
Unicode unicode; TransCode::decode(str, unicode);
TransCode::decode(str, unicode); if(unicode.size() == 1)
if(unicode.size() == 1) return true;
return true; return false;
return false; }
}
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
{ return lhs.second > rhs.second;
return lhs.second > rhs.second; }
}
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords; private:
}; MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
};
} }
#endif #endif

View File

@ -9,140 +9,114 @@
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
namespace CppJieba namespace CppJieba {
{
class MPSegment: public SegmentBase class MPSegment: public SegmentBase {
{
public: public:
MPSegment(){}; MPSegment() {};
MPSegment(const string& dictPath, const string& userDictPath = "") MPSegment(const string& dictPath, const string& userDictPath = "") {
{ LIMONP_CHECK(init(dictPath, userDictPath));
LIMONP_CHECK(init(dictPath, userDictPath)); };
}; virtual ~MPSegment() {};
virtual ~MPSegment(){};
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "") {
{ LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath)); LogInfo("MPSegment init(%s) ok", dictPath.c_str());
LogInfo("MPSegment init(%s) ok", dictPath.c_str()); return true;
return true; }
} bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const return _dictTrie.isUserDictSingleChineseWord(value);
{ }
return _dictTrie.isUserDictSingleChineseWord(value);
}
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end) return false;
{ }
return false;
}
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
if(!cut(begin, end, words)) if(!cut(begin, end, words)) {
{ return false;
return false; }
} size_t offset = res.size();
size_t offset = res.size(); res.resize(res.size() + words.size());
res.resize(res.size() + words.size()); for(size_t i = 0; i < words.size(); i++) {
for(size_t i = 0; i < words.size(); i++) if(!TransCode::encode(words[i], res[i + offset])) {
{ LogError("encode failed.");
if(!TransCode::encode(words[i], res[i + offset])) res[i + offset].clear();
{ }
LogError("encode failed."); }
res[i + offset].clear(); return true;
} }
}
return true;
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
{ if(end == begin) {
if(end == begin) return false;
{ }
return false; vector<SegmentChar> segmentChars;
}
vector<SegmentChar> segmentChars;
_dictTrie.find(begin, end, segmentChars); _dictTrie.find(begin, end, segmentChars);
_calcDP(segmentChars); _calcDP(segmentChars);
_cut(segmentChars, res); _cut(segmentChars, res);
return true; return true;
} }
const DictTrie* getDictTrie() const const DictTrie* getDictTrie() const {
{ return &_dictTrie;
return &_dictTrie; }
}
private: private:
void _calcDP(vector<SegmentChar>& segmentChars) const void _calcDP(vector<SegmentChar>& segmentChars) const {
{ size_t nextPos;
size_t nextPos; const DictUnit* p;
const DictUnit* p; double val;
double val;
for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) {
{ segmentChars[i].pInfo = NULL;
segmentChars[i].pInfo = NULL; segmentChars[i].weight = MIN_DOUBLE;
segmentChars[i].weight = MIN_DOUBLE; assert(!segmentChars[i].dag.empty());
assert(!segmentChars[i].dag.empty()); for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) {
for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) nextPos = it->first;
{ p = it->second;
nextPos = it->first; val = 0.0;
p = it->second; if(nextPos + 1 < segmentChars.size()) {
val = 0.0; val += segmentChars[nextPos + 1].weight;
if(nextPos + 1 < segmentChars.size()) }
{
val += segmentChars[nextPos + 1].weight;
}
if(p) if(p) {
{ val += p->weight;
val += p->weight; } else {
} val += _dictTrie.getMinWeight();
else }
{ if(val > segmentChars[i].weight) {
val += _dictTrie.getMinWeight(); segmentChars[i].pInfo = p;
} segmentChars[i].weight = val;
if(val > segmentChars[i].weight) }
{ }
segmentChars[i].pInfo = p; }
segmentChars[i].weight = val; }
} void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
} size_t i = 0;
} while(i < segmentChars.size()) {
} const DictUnit* p = segmentChars[i].pInfo;
void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const if(p) {
{ res.push_back(p->word);
size_t i = 0; i += p->word.size();
while(i < segmentChars.size()) } else { //single chinese word
{ res.push_back(Unicode(1, segmentChars[i].uniCh));
const DictUnit* p = segmentChars[i].pInfo; i++;
if(p) }
{ }
res.push_back(p->word); }
i += p->word.size();
}
else//single chinese word
{
res.push_back(Unicode(1, segmentChars[i].uniCh));
i++;
}
}
}
private: private:
DictTrie _dictTrie; DictTrie _dictTrie;
}; };
} }
#endif #endif

View File

@ -6,117 +6,98 @@
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
namespace CppJieba namespace CppJieba {
{ class MixSegment: public SegmentBase {
class MixSegment: public SegmentBase public:
{ MixSegment() {
public: }
MixSegment() MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
{ LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
} }
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") virtual ~MixSegment() {
{ }
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict)); bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
} LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
virtual ~MixSegment() LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
{ LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
} return true;
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") }
{ using SegmentBase::cut;
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict)); virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
LIMONP_CHECK(_hmmSeg.init(hmmSegDict)); vector<Unicode> words;
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); words.reserve(end - begin);
return true; if(!_mpSeg.cut(begin, end, words)) {
} LogError("mpSeg cutDAG failed.");
using SegmentBase::cut; return false;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const }
{
vector<Unicode> words;
words.reserve(end - begin);
if(!_mpSeg.cut(begin, end, words))
{
LogError("mpSeg cutDAG failed.");
return false;
}
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
hmmRes.reserve(end - begin); hmmRes.reserve(end - begin);
Unicode piece; Unicode piece;
piece.reserve(end - begin); piece.reserve(end - begin);
for (size_t i = 0, j = 0; i < words.size(); i++) for (size_t i = 0, j = 0; i < words.size(); i++) {
{ //if mp get a word, it's ok, put it into result
//if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) res.push_back(words[i]);
{ continue;
res.push_back(words[i]); }
continue;
}
// if mp get a single one and it is not in userdict, collect it in sequence // if mp get a single one and it is not in userdict, collect it in sequence
j = i; j = i;
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
{ piece.push_back(words[j][0]);
piece.push_back(words[j][0]); j++;
j++; }
}
// cut the sequence with hmm // cut the sequence with hmm
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
{ LogError("_hmmSeg cut failed.");
LogError("_hmmSeg cut failed."); return false;
return false; }
}
//put hmm result to result //put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) for (size_t k = 0; k < hmmRes.size(); k++) {
{ res.push_back(hmmRes[k]);
res.push_back(hmmRes[k]); }
}
//clear tmp vars //clear tmp vars
piece.clear(); piece.clear();
hmmRes.clear(); hmmRes.clear();
//let i jump over this piece //let i jump over this piece
i = j - 1; i = j - 1;
} }
return true; return true;
} }
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end) return false;
{ }
return false;
}
vector<Unicode> uRes; vector<Unicode> uRes;
uRes.reserve(end - begin); uRes.reserve(end - begin);
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{ return false;
return false; }
}
size_t offset = res.size(); size_t offset = res.size();
res.resize(res.size() + uRes.size()); res.resize(res.size() + uRes.size());
for(size_t i = 0; i < uRes.size(); i ++, offset++) for(size_t i = 0; i < uRes.size(); i ++, offset++) {
{ if(!TransCode::encode(uRes[i], res[offset])) {
if(!TransCode::encode(uRes[i], res[offset])) LogError("encode failed.");
{ }
LogError("encode failed."); }
} return true;
} }
return true;
}
const DictTrie* getDictTrie() const const DictTrie* getDictTrie() const {
{ return _mpSeg.getDictTrie();
return _mpSeg.getDictTrie(); }
} private:
private: MPSegment _mpSeg;
MPSegment _mpSeg; HMMSegment _hmmSeg;
HMMSegment _hmmSeg; };
};
} }
#endif #endif

View File

@ -5,106 +5,87 @@
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp;
static const char* const POS_M = "m"; static const char* const POS_M = "m";
static const char* const POS_ENG = "eng"; static const char* const POS_ENG = "eng";
static const char* const POS_X = "x"; static const char* const POS_X = "x";
class PosTagger class PosTagger {
{ public:
public: PosTagger() {
PosTagger() }
{ PosTagger(
} const string& dictPath,
PosTagger( const string& hmmFilePath,
const string& dictPath, const string& userDictPath = ""
const string& hmmFilePath, ) {
const string& userDictPath = "" init(dictPath, hmmFilePath, userDictPath);
) }
{ ~PosTagger() {
init(dictPath, hmmFilePath, userDictPath); }
} void init(
~PosTagger() const string& dictPath,
{ const string& hmmFilePath,
} const string& userDictPath = ""
void init( ) {
const string& dictPath, LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
const string& hmmFilePath, _dictTrie = _segment.getDictTrie();
const string& userDictPath = "" LIMONP_CHECK(_dictTrie);
) };
{
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
_dictTrie = _segment.getDictTrie();
LIMONP_CHECK(_dictTrie);
};
bool tag(const string& src, vector<pair<string, string> >& res) const
{
vector<string> cutRes;
if (!_segment.cut(src, cutRes))
{
LogError("_mixSegment cut failed");
return false;
}
const DictUnit *tmp = NULL; bool tag(const string& src, vector<pair<string, string> >& res) const {
Unicode unico; vector<string> cutRes;
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) if (!_segment.cut(src, cutRes)) {
{ LogError("_mixSegment cut failed");
if (!TransCode::decode(*itr, unico)) return false;
{ }
LogError("decode failed.");
return false; const DictUnit *tmp = NULL;
} Unicode unico;
tmp = _dictTrie->find(unico.begin(), unico.end()); for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
if(tmp == NULL || tmp->tag.empty()) if (!TransCode::decode(*itr, unico)) {
{ LogError("decode failed.");
res.push_back(make_pair(*itr, _specialRule(unico))); return false;
} }
else tmp = _dictTrie->find(unico.begin(), unico.end());
{ if(tmp == NULL || tmp->tag.empty()) {
res.push_back(make_pair(*itr, tmp->tag)); res.push_back(make_pair(*itr, _specialRule(unico)));
} } else {
} res.push_back(make_pair(*itr, tmp->tag));
return !res.empty(); }
} }
private: return !res.empty();
const char* _specialRule(const Unicode& unicode) const }
{ private:
size_t m = 0; const char* _specialRule(const Unicode& unicode) const {
size_t eng = 0; size_t m = 0;
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) size_t eng = 0;
{ for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
if(unicode[i] < 0x80) if(unicode[i] < 0x80) {
{ eng ++;
eng ++; if('0' <= unicode[i] && unicode[i] <= '9') {
if('0' <= unicode[i] && unicode[i] <= '9') m++;
{ }
m++; }
} }
} // ascii char is not found
} if(eng == 0) {
// ascii char is not found return POS_X;
if(eng == 0) }
{ // all the ascii is number char
return POS_X; if(m == eng) {
} return POS_M;
// all the ascii is number char }
if(m == eng) // the ascii chars contain english letter
{ return POS_ENG;
return POS_M; }
} private:
// the ascii chars contain english letter MixSegment _segment;
return POS_ENG; const DictTrie * _dictTrie;
} };
private:
MixSegment _segment;
const DictTrie * _dictTrie;
};
} }
#endif #endif

View File

@ -13,106 +13,86 @@
#include "TransCode.hpp" #include "TransCode.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{ class QuerySegment: public SegmentBase {
class QuerySegment: public SegmentBase public:
{ QuerySegment() {};
public: QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
QuerySegment(){}; init(dict, model, maxWordLen, userDict);
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") };
{ virtual ~QuerySegment() {};
init(dict, model, maxWordLen, userDict); bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
}; LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
virtual ~QuerySegment(){}; LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") assert(maxWordLen);
{ _maxWordLen = maxWordLen;
LIMONP_CHECK(_mixSeg.init(dict, model, userDict)); return true;
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie())); }
assert(maxWordLen); using SegmentBase::cut;
_maxWordLen = maxWordLen; bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
return true; if (begin >= end) {
LogError("begin >= end");
return false;
}
//use mix cut first
vector<Unicode> mixRes;
if (!_mixSeg.cut(begin, end, mixRes)) {
LogError("_mixSeg cut failed.");
return false;
}
vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
// if it's too long, cut with _fullSeg, put fullRes in res
if (mixResItr->size() > _maxWordLen) {
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
res.push_back(*fullResItr);
}
//clear tmp res
fullRes.clear();
} }
using SegmentBase::cut; } else { // just use the mix result
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const res.push_back(*mixResItr);
{ }
if (begin >= end) }
{
LogError("begin >= end");
return false;
}
//use mix cut first return true;
vector<Unicode> mixRes; }
if (!_mixSeg.cut(begin, end, mixRes))
{
LogError("_mixSeg cut failed.");
return false;
}
vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
{
// if it's too long, cut with _fullSeg, put fullRes in res
if (mixResItr->size() > _maxWordLen)
{
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
{
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
{
res.push_back(*fullResItr);
}
//clear tmp res
fullRes.clear();
}
}
else // just use the mix result
{
res.push_back(*mixResItr);
}
}
return true;
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
{ if (begin >= end) {
if (begin >= end) LogError("begin >= end");
{ return false;
LogError("begin >= end"); }
return false;
}
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{ LogError("get unicode cut result error.");
LogError("get unicode cut result error."); return false;
return false; }
}
string tmp; string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
{ if (TransCode::encode(*uItr, tmp)) {
if (TransCode::encode(*uItr, tmp)) res.push_back(tmp);
{ } else {
res.push_back(tmp); LogError("encode failed.");
} }
else }
{
LogError("encode failed.");
}
}
return true; return true;
} }
private: private:
MixSegment _mixSeg; MixSegment _mixSeg;
FullSegment _fullSeg; FullSegment _fullSeg;
size_t _maxWordLen; size_t _maxWordLen;
}; };
} }
#endif #endif

View File

@ -9,70 +9,63 @@
#include <cassert> #include <cassert>
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp;
//const char* const SPECIAL_CHARS = " \t\n"; //const char* const SPECIAL_CHARS = " \t\n";
#ifndef CPPJIEBA_GBK #ifndef CPPJIEBA_GBK
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
#else #else
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u}; const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
#endif #endif
class SegmentBase: public ISegment, public NonCopyable class SegmentBase: public ISegment, public NonCopyable {
{ public:
public: SegmentBase() {
SegmentBase(){_loadSpecialSymbols();}; _loadSpecialSymbols();
virtual ~SegmentBase(){}; };
public: virtual ~SegmentBase() {};
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0; public:
virtual bool cut(const string& str, vector<string>& res) const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
{ virtual bool cut(const string& str, vector<string>& res) const {
res.clear(); res.clear();
Unicode unicode; Unicode unicode;
unicode.reserve(str.size()); unicode.reserve(str.size());
TransCode::decode(str, unicode); TransCode::decode(str, unicode);
Unicode::const_iterator left = unicode.begin();
Unicode::const_iterator right;
for(right = unicode.begin(); right != unicode.end(); right++)
{
if(isIn(_specialSymbols, *right))
{
if(left != right)
{
cut(left, right, res);
}
res.resize(res.size() + 1);
TransCode::encode(right, right + 1, res.back());
left = right + 1;
}
}
if(left != right)
{
cut(left, right, res);
}
return true;
}
private:
void _loadSpecialSymbols()
{
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++)
{
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
}
assert(_specialSymbols.size());
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
}; Unicode::const_iterator left = unicode.begin();
Unicode::const_iterator right;
for(right = unicode.begin(); right != unicode.end(); right++) {
if(isIn(_specialSymbols, *right)) {
if(left != right) {
cut(left, right, res);
}
res.resize(res.size() + 1);
TransCode::encode(right, right + 1, res.back());
left = right + 1;
}
}
if(left != right) {
cut(left, right, res);
}
return true;
}
private:
void _loadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
}
assert(_specialSymbols.size());
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
};
} }
#endif #endif

View File

@ -9,55 +9,48 @@
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
#include "Limonp/LocalVector.hpp" #include "Limonp/LocalVector.hpp"
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
typedef uint16_t UnicodeValueType; typedef uint16_t UnicodeValueType;
typedef Limonp::LocalVector<UnicodeValueType> Unicode; typedef Limonp::LocalVector<UnicodeValueType> Unicode;
namespace TransCode namespace TransCode {
{ inline bool decode(const string& str, Unicode& res) {
inline bool decode(const string& str, Unicode& res)
{
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(str, res); return gbkTrans(str, res);
#else #else
return utf8ToUnicode(str, res); return utf8ToUnicode(str, res);
#endif #endif
} }
inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) {
{
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(begin, end, res); return gbkTrans(begin, end, res);
#else #else
return unicodeToUtf8(begin, end, res); return unicodeToUtf8(begin, end, res);
#endif #endif
} }
inline bool encode(const Unicode& uni, string& res)
{
return encode(uni.begin(), uni.end(), res);
}
// compiler is expected to optimized this function to avoid return value copy inline bool encode(const Unicode& uni, string& res) {
inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) return encode(uni.begin(), uni.end(), res);
{ }
string res;
res.reserve(end - begin);
encode(begin, end, res);
return res;
}
// compiler is expected to optimized this function to avoid return value copy // compiler is expected to optimized this function to avoid return value copy
inline Unicode decode(const string& str) inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) {
{ string res;
Unicode unicode; res.reserve(end - begin);
unicode.reserve(str.size()); encode(begin, end, res);
decode(str, unicode); return res;
return unicode; }
}
} // compiler is expected to optimized this function to avoid return value copy
inline Unicode decode(const string& str) {
Unicode unicode;
unicode.reserve(str.size());
decode(str, unicode);
return unicode;
}
}
} }
#endif #endif

View File

@ -5,290 +5,241 @@
#include <vector> #include <vector>
#include <queue> #include <queue>
namespace CppJieba namespace CppJieba {
{ using namespace std;
using namespace std;
struct DictUnit struct DictUnit {
{ Unicode word;
Unicode word; double weight;
double weight; string tag;
string tag; };
};
// for debugging // for debugging
inline ostream & operator << (ostream& os, const DictUnit& unit) inline ostream & operator << (ostream& os, const DictUnit& unit) {
{ string s;
string s; s << unit.word;
s << unit.word; return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); }
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
struct SegmentChar {
uint16_t uniCh;
DagType dag;
const DictUnit * pInfo;
double weight;
size_t nextPos;
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {
}
~SegmentChar() {
}
};
typedef Unicode::value_type TrieKey;
class TrieNode {
public:
TrieNode(): fail(NULL), next(NULL), ptValue(NULL) {
}
const TrieNode * findNext(TrieKey key) const {
if(next == NULL) {
return NULL;
} }
NextMap::const_iterator iter = next->find(key);
if(iter == next->end()) {
return NULL;
}
return iter->second;
}
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
};
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType; class Trie {
public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
_root = new TrieNode;
_createTrie(keys, valuePointers);
_build();// build automation
}
~Trie() {
if(_root) {
_deleteNode(_root);
}
}
public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
TrieNode::NextMap::const_iterator citer;
const TrieNode* ptNode = _root;
for(Unicode::const_iterator it = begin; it != end; it++) {
// build automation
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) {
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
// aho-corasick-automation
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<struct SegmentChar>& res
) const {
res.resize(end - begin);
const TrieNode * now = _root;
const TrieNode* node;
// compiler will complain warnings if only "i < end - begin" .
for (size_t i = 0; i < size_t(end - begin); i++) {
Unicode::value_type ch = *(begin + i);
res[i].uniCh = ch;
assert(res[i].dag.empty());
res[i].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, NULL));
bool flag = false;
struct SegmentChar // rollback
{ while( now != _root ) {
uint16_t uniCh; node = now->findNext(ch);
DagType dag; if (node != NULL) {
const DictUnit * pInfo; flag = true;
double weight; break;
size_t nextPos; } else {
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) now = now->fail;
{} }
~SegmentChar() }
{}
};
typedef Unicode::value_type TrieKey; if(!flag) {
node = now->findNext(ch);
class TrieNode }
{ if(node == NULL) {
public: now = _root;
TrieNode(): fail(NULL), next(NULL), ptValue(NULL) } else {
{} now = node;
const TrieNode * findNext(TrieKey key) const const TrieNode * temp = now;
{ while(temp != _root) {
if(next == NULL) if (temp->ptValue) {
{ size_t pos = i - temp->ptValue->word.size() + 1;
return NULL; res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
} if(pos == i) {
NextMap::const_iterator iter = next->find(key); res[pos].dag[0].second = temp->ptValue;
if(iter == next->end())
{
return NULL;
}
return iter->second;
} }
public: }
typedef unordered_map<TrieKey, TrieNode*> NextMap; temp = temp->fail;
TrieNode * fail; assert(temp);
NextMap * next; }
const DictUnit * ptValue; }
}; }
}
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const {
const TrieNode * ptNode = _root;
TrieNode::NextMap::const_iterator citer;
for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) {
break;
}
ptNode = citer->second;
if(ptNode->ptValue) {
if(itr == begin && res.size() == 1) { // first singleword
res[0].second = ptNode->ptValue;
} else {
res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
}
}
}
return !res.empty();
}
private:
void _build() {
queue<TrieNode*> que;
assert(_root->ptValue == NULL);
assert(_root->next);
_root->fail = NULL;
for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
iter->second->fail = _root;
que.push(iter->second);
}
TrieNode* back = NULL;
TrieNode::NextMap::iterator backiter;
while(!que.empty()) {
TrieNode * now = que.front();
que.pop();
if(now->next == NULL) {
continue;
}
for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
back = now->fail;
while(back != NULL) {
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) {
iter->second->fail = backiter->second;
break;
}
back = back->fail;
}
if(back == NULL) {
iter->second->fail = _root;
}
que.push(iter->second);
}
}
}
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
if(valuePointers.empty() || keys.empty()) {
return;
}
assert(keys.size() == valuePointers.size());
class Trie for(size_t i = 0; i < keys.size(); i++) {
{ _insertNode(keys[i], valuePointers[i]);
public: }
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) }
{ void _insertNode(const Unicode& key, const DictUnit* ptValue) {
_root = new TrieNode; TrieNode* ptNode = _root;
_createTrie(keys, valuePointers);
_build();// build automation
}
~Trie()
{
if(_root)
{
_deleteNode(_root);
}
}
public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
TrieNode::NextMap::const_iterator citer;
const TrieNode* ptNode = _root;
for(Unicode::const_iterator it = begin; it != end; it++)
{// build automation
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
{
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
// aho-corasick-automation
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<struct SegmentChar>& res
) const
{
res.resize(end - begin);
const TrieNode * now = _root;
const TrieNode* node;
// compiler will complain warnings if only "i < end - begin" .
for (size_t i = 0; i < size_t(end - begin); i++)
{
Unicode::value_type ch = *(begin + i);
res[i].uniCh = ch;
assert(res[i].dag.empty());
res[i].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, NULL));
bool flag = false;
// rollback TrieNode::NextMap::const_iterator kmIter;
while( now != _root )
{
node = now->findNext(ch);
if (node != NULL)
{
flag = true;
break;
}
else
{
now = now->fail;
}
}
if(!flag) for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) {
{ if(NULL == ptNode->next) {
node = now->findNext(ch); ptNode->next = new TrieNode::NextMap;
} }
if(node == NULL) kmIter = ptNode->next->find(*citer);
{ if(ptNode->next->end() == kmIter) {
now = _root; TrieNode * nextNode = new TrieNode;
} nextNode->next = NULL;
else nextNode->ptValue = NULL;
{
now = node;
const TrieNode * temp = now;
while(temp != _root)
{
if (temp->ptValue)
{
size_t pos = i - temp->ptValue->word.size() + 1;
res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
if(pos == i)
{
res[pos].dag[0].second = temp->ptValue;
}
}
temp = temp->fail;
assert(temp);
}
}
}
}
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const
{
const TrieNode * ptNode = _root;
TrieNode::NextMap::const_iterator citer;
for(Unicode::const_iterator itr = begin; itr != end ; itr++)
{
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
{
break;
}
ptNode = citer->second;
if(ptNode->ptValue)
{
if(itr == begin && res.size() == 1) // first singleword
{
res[0].second = ptNode->ptValue;
}
else
{
res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
}
}
}
return !res.empty();
}
private:
void _build()
{
queue<TrieNode*> que;
assert(_root->ptValue == NULL);
assert(_root->next);
_root->fail = NULL;
for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
iter->second->fail = _root;
que.push(iter->second);
}
TrieNode* back = NULL;
TrieNode::NextMap::iterator backiter;
while(!que.empty()) {
TrieNode * now = que.front();
que.pop();
if(now->next == NULL) {
continue;
}
for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
back = now->fail;
while(back != NULL) {
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end())
{
iter->second->fail = backiter->second;
break;
}
back = back->fail;
}
if(back == NULL) {
iter->second->fail = _root;
}
que.push(iter->second);
}
}
}
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{
if(valuePointers.empty() || keys.empty())
{
return;
}
assert(keys.size() == valuePointers.size());
for(size_t i = 0; i < keys.size(); i++) (*ptNode->next)[*citer] = nextNode;
{ ptNode = nextNode;
_insertNode(keys[i], valuePointers[i]); } else {
} ptNode = kmIter->second;
} }
void _insertNode(const Unicode& key, const DictUnit* ptValue) }
{ ptNode->ptValue = ptValue;
TrieNode* ptNode = _root; }
void _deleteNode(TrieNode* node) {
TrieNode::NextMap::const_iterator kmIter; if(!node) {
return;
for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) }
{ if(node->next) {
if(NULL == ptNode->next) TrieNode::NextMap::iterator it;
{ for(it = node->next->begin(); it != node->next->end(); it++) {
ptNode->next = new TrieNode::NextMap; _deleteNode(it->second);
} }
kmIter = ptNode->next->find(*citer); delete node->next;
if(ptNode->next->end() == kmIter) }
{ delete node;
TrieNode * nextNode = new TrieNode; }
nextNode->next = NULL; private:
nextNode->ptValue = NULL; TrieNode* _root;
};
(*ptNode->next)[*citer] = nextNode;
ptNode = nextNode;
}
else
{
ptNode = kmIter->second;
}
}
ptNode->ptValue = ptValue;
}
void _deleteNode(TrieNode* node)
{
if(!node)
{
return;
}
if(node->next)
{
TrieNode::NextMap::iterator it;
for(it = node->next->begin(); it != node->next->end(); it++)
{
_deleteNode(it->second);
}
delete node->next;
}
delete node;
}
private:
TrieNode* _root;
};
} }
#endif #endif

View File

@ -1,17 +1,16 @@
#include "../src/KeywordExtractor.hpp" #include "../src/KeywordExtractor.hpp"
using namespace CppJieba; using namespace CppJieba;
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{ KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); //KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
//KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");
string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。"); vector<pair<string, double> > wordweights;
vector<pair<string, double> > wordweights; vector<string> words;
vector<string> words; size_t topN = 5;
size_t topN = 5; extractor.extract(s, wordweights, topN);
extractor.extract(s, wordweights, topN); cout<< s << '\n' << wordweights << endl;
cout<< s << '\n' << wordweights << endl; extractor.extract(s, words, topN);
extractor.extract(s, words, topN); cout<< s << '\n' << words << endl;
cout<< s << '\n' << words << endl; return EXIT_SUCCESS;
return EXIT_SUCCESS;
} }

View File

@ -9,51 +9,46 @@
using namespace CppJieba; using namespace CppJieba;
void cut(size_t times = 50) void cut(size_t times = 50) {
{ MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); vector<string> res;
vector<string> res; string doc;
string doc; ifstream ifs("../test/testdata/weicheng.utf8");
ifstream ifs("../test/testdata/weicheng.utf8"); assert(ifs);
assert(ifs); doc << ifs;
doc << ifs; long beginTime = clock();
long beginTime = clock(); for(size_t i = 0; i < times; i ++) {
for(size_t i = 0; i < times; i ++) printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
{ fflush(stdout);
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); res.clear();
fflush(stdout); seg.cut(doc, res);
res.clear(); }
seg.cut(doc, res); printf("\n");
} long endTime = clock();
printf("\n"); ColorPrintln(GREEN, "cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
long endTime = clock();
ColorPrintln(GREEN, "cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
} }
void extract(size_t times = 400) void extract(size_t times = 400) {
{ KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); vector<string> words;
vector<string> words; string doc;
string doc; ifstream ifs("../test/testdata/review.100");
ifstream ifs("../test/testdata/review.100"); assert(ifs);
assert(ifs); doc << ifs;
doc << ifs; long beginTime = clock();
long beginTime = clock(); for(size_t i = 0; i < times; i ++) {
for(size_t i = 0; i < times; i ++) printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
{ fflush(stdout);
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); words.clear();
fflush(stdout); extractor.extract(doc, words, 5);
words.clear(); }
extractor.extract(doc, words, 5); printf("\n");
} long endTime = clock();
printf("\n"); ColorPrintln(GREEN, "extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
long endTime = clock();
ColorPrintln(GREEN, "extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
} }
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{ cut();
cut(); extract();
extract(); return EXIT_SUCCESS;
return EXIT_SUCCESS;
} }

View File

@ -14,46 +14,42 @@ const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8"; const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
const char * const USER_DICT_FILE = "../dict/user.dict.utf8"; const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
void cut(const ISegment& seg, const char * const filePath) void cut(const ISegment& seg, const char * const filePath) {
{ ifstream ifile(filePath);
ifstream ifile(filePath); vector<string> words;
vector<string> words; string line;
string line; string res;
string res; while(getline(ifile, line)) {
while(getline(ifile, line)) if(!line.empty()) {
{ words.clear();
if(!line.empty()) seg.cut(line, words);
{ join(words.begin(), words.end(), res, "/");
words.clear(); cout<< res <<endl;
seg.cut(line, words);
join(words.begin(), words.end(), res, "/");
cout<< res <<endl;
}
} }
}
} }
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{ {
{ printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful
printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful MPSegment seg(JIEBA_DICT_FILE);
MPSegment seg(JIEBA_DICT_FILE); cut(seg, TEST_FILE);
cut(seg, TEST_FILE); }
} {
{ printf("\e[32m%s\e[0m\n", "[demo] HMMSegment"); // colorful
printf("\e[32m%s\e[0m\n", "[demo] HMMSegment"); // colorful HMMSegment seg(HMM_DICT_FILE);
HMMSegment seg(HMM_DICT_FILE); cut(seg, TEST_FILE);
cut(seg, TEST_FILE); }
} {
{ printf("\e[32m%s\e[0m\n", "[demo] MixSegment"); // colorful
printf("\e[32m%s\e[0m\n", "[demo] MixSegment"); // colorful MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE);
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE); cut(seg, TEST_FILE);
cut(seg, TEST_FILE); }
} {
{ printf("\e[32m%s\e[0m\n", "[demo] MixSegment with UserDict"); // colorful
printf("\e[32m%s\e[0m\n", "[demo] MixSegment with UserDict"); // colorful MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE, USER_DICT_FILE);
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE, USER_DICT_FILE); cut(seg, TEST_FILE);
cut(seg, TEST_FILE); }
} return EXIT_SUCCESS;
return EXIT_SUCCESS;
} }

View File

@ -1,12 +1,11 @@
#include "../src/PosTagger.hpp" #include "../src/PosTagger.hpp"
using namespace CppJieba; using namespace CppJieba;
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{ PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。");
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。"); vector<pair<string, string> > res;
vector<pair<string, string> > res; tagger.tag(s, res);
tagger.tag(s, res); cout << res << endl;
cout << res << endl; return EXIT_SUCCESS;
return EXIT_SUCCESS;
} }

View File

@ -5,52 +5,50 @@ using namespace CppJieba;
TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test1) {
{ KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
{ {
string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");
string res; string res;
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;
size_t topN = 5; size_t topN = 5;
extractor.extract(s, wordweights, topN); extractor.extract(s, wordweights, topN);
res << wordweights; res << wordweights;
ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]"); ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
} }
{ {
string s("一部iPhone6"); string s("一部iPhone6");
string res; string res;
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;
size_t topN = 5; size_t topN = 5;
extractor.extract(s, wordweights, topN); extractor.extract(s, wordweights, topN);
res << wordweights; res << wordweights;
ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]"); ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
} }
} }
TEST(KeywordExtractorTest, Test2) TEST(KeywordExtractorTest, Test2) {
{ KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
{ {
string s("蓝翔优秀毕业生"); string s("蓝翔优秀毕业生");
string res; string res;
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;
size_t topN = 5; size_t topN = 5;
extractor.extract(s, wordweights, topN); extractor.extract(s, wordweights, topN);
res << wordweights; res << wordweights;
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]"); ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
} }
{ {
string s("一部iPhone6"); string s("一部iPhone6");
string res; string res;
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;
size_t topN = 5; size_t topN = 5;
extractor.extract(s, wordweights, topN); extractor.extract(s, wordweights, topN);
res << wordweights; res << wordweights;
ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]"); ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
} }
} }

View File

@ -12,32 +12,30 @@ static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容
static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]"; static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
//static const char * const ANS_TEST3 = ""; //static const char * const ANS_TEST3 = "";
TEST(PosTaggerTest, Test) TEST(PosTaggerTest, Test) {
{ PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); {
{ vector<pair<string, string> > res;
vector<pair<string, string> > res; tagger.tag(QUERY_TEST1, res);
tagger.tag(QUERY_TEST1, res); string s;
string s; s << res;
s << res; ASSERT_TRUE(s == ANS_TEST1);
ASSERT_TRUE(s == ANS_TEST1); }
}
} }
TEST(PosTagger, TestUserDict) TEST(PosTagger, TestUserDict) {
{ PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); {
{ vector<pair<string, string> > res;
vector<pair<string, string> > res; tagger.tag(QUERY_TEST2, res);
tagger.tag(QUERY_TEST2, res); string s;
string s; s << res;
s << res; ASSERT_EQ(s, ANS_TEST2);
ASSERT_EQ(s, ANS_TEST2); }
} {
{ vector<pair<string, string> > res;
vector<pair<string, string> > res; tagger.tag(QUERY_TEST3, res);
tagger.tag(QUERY_TEST3, res); string s;
string s; s << res;
s << res; ASSERT_EQ(s, ANS_TEST3);
ASSERT_EQ(s, ANS_TEST3); }
}
} }

View File

@ -9,170 +9,176 @@
using namespace CppJieba; using namespace CppJieba;
TEST(MixSegmentTest, Test1) TEST(MixSegmentTest, Test1) {
{ MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");; const char* str = "我来自北京邮电大学。。。学号123456用AK47";
const char* str = "我来自北京邮电大学。。。学号123456用AK47"; const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456","","","AK47"};
const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456","","","AK47"}; const char* str2 = "B超 T恤";
const char* str2 = "B超 T恤"; const char* res2[] = {"B超"," ", "T恤"};
const char* res2[] = {"B超"," ", "T恤"}; vector<string> words;
vector<string> words; ASSERT_TRUE(segment.cut(str, words));
ASSERT_TRUE(segment.cut(str, words)); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_TRUE(segment.cut(str2, words));
ASSERT_TRUE(segment.cut(str2, words)); ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
} }
TEST(MixSegmentTest, NoUserDict) TEST(MixSegmentTest, NoUserDict) {
{ MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8"); const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words);
}
TEST(MixSegmentTest, UserDict) {
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
{
const char* str = "令狐冲是云计算方面的专家"; const char* str = "令狐冲是云计算方面的专家";
vector<string> words; vector<string> words;
ASSERT_TRUE(segment.cut(str, words)); ASSERT_TRUE(segment.cut(str, words));
string res; string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words); ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"IBM\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
}
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
res << words;
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
}
} }
TEST(MixSegmentTest, UserDict) TEST(MixSegmentTest, UserDict2) {
{ MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); {
{ const char* str = "令狐冲是云计算方面的专家";
const char* str = "令狐冲是云计算方面的专家"; vector<string> words;
vector<string> words; ASSERT_TRUE(segment.cut(str, words));
ASSERT_TRUE(segment.cut(str, words)); string res;
string res; ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words); }
} {
{ const char* str = "小明先就职于IBM,后在日本京都大学深造";
const char* str = "小明先就职于IBM,后在日本京都大学深造"; vector<string> words;
vector<string> words; ASSERT_TRUE(segment.cut(str, words));
ASSERT_TRUE(segment.cut(str, words)); string res;
string res; res << words;
res << words; ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"I\", \"B\", \"M\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"IBM\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res); }
} {
{ const char* str = "IBM,3.14";
const char* str = "IBM,3.14"; vector<string> words;
vector<string> words; ASSERT_TRUE(segment.cut(str, words));
ASSERT_TRUE(segment.cut(str, words)); string res;
string res; res << words;
res << words; ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res); }
}
}
TEST(MixSegmentTest, UserDict2)
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"I\", \"B\", \"M\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
}
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
res << words;
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
}
} }
TEST(MPSegmentTest, Test1) TEST(MPSegmentTest, Test1) {
{ MPSegment segment("../dict/jieba.dict.utf8");;
MPSegment segment("../dict/jieba.dict.utf8");; const char* str = "我来自北京邮电大学。";
const char* str = "我来自北京邮电大学。"; const char* res[] = {"", "来自", "北京邮电大学", ""};
const char* res[] = {"", "来自", "北京邮电大学", ""}; vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
{
const char* str = "B超 T恤";
const char * res[] = {"B超", " ", "T恤"};
vector<string> words; vector<string> words;
ASSERT_TRUE(segment.cut(str, words)); ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
{
const char* str = "B超 T恤";
const char * res[] = {"B超", " ", "T恤"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
} }
TEST(MPSegmentTest, Test2) TEST(MPSegmentTest, Test2) {
{ MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8"); string line;
string line; ifstream ifs("../test/testdata/review.100");
ifstream ifs("../test/testdata/review.100"); vector<string> words;
vector<string> words;
string eRes; string eRes;
{ {
ifstream ifs("../test/testdata/review.100.res"); ifstream ifs("../test/testdata/review.100.res");
ASSERT_TRUE(!!ifs); ASSERT_TRUE(!!ifs);
eRes << ifs; eRes << ifs;
} }
string res; string res;
while(getline(ifs, line))
{
res += line;
res += '\n';
segment.cut(line, words);
string s;
s << words;
res += s;
res += '\n';
}
ofstream ofs("../test/testdata/review.100.res");
ASSERT_TRUE(!!ofs);
ofs << res;
}
TEST(HMMSegmentTest, Test1)
{
HMMSegment segment("../dict/hmm_model.utf8");;
{
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
{
const char* str = "IBM,1.2,123";
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
}
TEST(FullSegment, Test1) while(getline(ifs, line)) {
{ res += line;
FullSegment segment("../dict/extra_dict/jieba.dict.small.utf8"); res += '\n';
const char* str = "我来自北京邮电大学";
vector<string> words;
ASSERT_EQ(segment.cut(str, words), true);
segment.cut(line, words);
string s; string s;
s << words; s << words;
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]"); res += s;
res += '\n';
}
ofstream ofs("../test/testdata/review.100.res");
ASSERT_TRUE(!!ofs);
ofs << res;
}
TEST(HMMSegmentTest, Test1) {
HMMSegment segment("../dict/hmm_model.utf8");;
{
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
{
const char* str = "IBM,1.2,123";
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
} }
TEST(QuerySegment, Test1) TEST(FullSegment, Test1) {
{ FullSegment segment("../dict/extra_dict/jieba.dict.small.utf8");
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3); const char* str = "我来自北京邮电大学";
vector<string> words;
ASSERT_EQ(segment.cut(str, words), true);
string s;
s << words;
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
}
TEST(QuerySegment, Test1) {
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3);
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string s1, s2;
s1 << words;
s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"\", \"\", \"\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";
ASSERT_EQ(s1, s2);
}
TEST(QuerySegment, Test2) {
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8");
{
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words; vector<string> words;
@ -182,35 +188,18 @@ TEST(QuerySegment, Test1)
s1 << words; s1 << words;
s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"\", \"\", \"\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]"; s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"\", \"\", \"\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";
ASSERT_EQ(s1, s2); ASSERT_EQ(s1, s2);
}
{
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string s1, s2;
s1 << words;
s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]";
ASSERT_EQ(s1, s2);
}
} }
TEST(QuerySegment, Test2)
{
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8");
{
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string s1, s2;
s1 << words;
s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"\", \"\", \"\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";
ASSERT_EQ(s1, s2);
}
{
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string s1, s2;
s1 << words;
s2 = "[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]";
ASSERT_EQ(s1, s2);
}
}

View File

@ -6,75 +6,70 @@ using namespace CppJieba;
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
TEST(DictTrieTest, NewAndDelete) TEST(DictTrieTest, NewAndDelete) {
{ DictTrie * trie;
DictTrie * trie; trie = new DictTrie(DICT_FILE);
trie = new DictTrie(DICT_FILE); delete trie;
delete trie; trie = new DictTrie();
trie = new DictTrie(); delete trie;
delete trie;
} }
TEST(DictTrieTest, Test1) TEST(DictTrieTest, Test1) {
{
string s1, s2; string s1, s2;
DictTrie trie; DictTrie trie;
ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_TRUE(trie.init(DICT_FILE));
ASSERT_LT(trie.getMinWeight() + 15.6479, 0.001); ASSERT_LT(trie.getMinWeight() + 15.6479, 0.001);
string word("来到"); string word("来到");
Unicode uni; Unicode uni;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
DictUnit nodeInfo; DictUnit nodeInfo;
nodeInfo.word = uni; nodeInfo.word = uni;
nodeInfo.tag = "v"; nodeInfo.tag = "v";
nodeInfo.weight = -8.87033; nodeInfo.weight = -8.87033;
s1 << nodeInfo; s1 << nodeInfo;
s2 << (*trie.find(uni.begin(), uni.end())); s2 << (*trie.find(uni.begin(), uni.end()));
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
word = "清华大学";
LocalVector<pair<size_t, const DictUnit*> > res;
//vector<pair<size_t, const DictUnit* > resMap;
LocalVector<pair<size_t, const DictUnit*> > res2;
const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
{
ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
}
//DictUnit
//res.push_back(make_pair(0, ))
vector<pair<size_t, const DictUnit*> > vec; EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
ASSERT_TRUE(TransCode::decode(word, uni)); word = "清华大学";
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); LocalVector<pair<size_t, const DictUnit*> > res;
s1 << res; //vector<pair<size_t, const DictUnit* > resMap;
s2 << res; LocalVector<pair<size_t, const DictUnit*> > res2;
ASSERT_EQ(s1, s2); const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
}
//DictUnit
//res.push_back(make_pair(0, ))
vector<pair<size_t, const DictUnit*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni));
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
s1 << res;
s2 << res;
ASSERT_EQ(s1, s2);
} }
TEST(DictTrieTest, UserDict) TEST(DictTrieTest, UserDict) {
{ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算";
string word = "云计算"; Unicode unicode;
Unicode unicode; ASSERT_TRUE(TransCode::decode(word, unicode));
ASSERT_TRUE(TransCode::decode(word, unicode)); const DictUnit * unit = trie.find(unicode.begin(), unicode.end());
const DictUnit * unit = trie.find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit);
ASSERT_TRUE(unit); string res ;
string res ; res << *unit;
res << *unit; ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
} }
TEST(DictTrieTest, automation) TEST(DictTrieTest, automation) {
{ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); //string word = "yasherhs";
//string word = "yasherhs"; string word = "abcderf";
string word = "abcderf"; Unicode unicode;
Unicode unicode; ASSERT_TRUE(TransCode::decode(word, unicode));
ASSERT_TRUE(TransCode::decode(word, unicode)); vector<struct SegmentChar> res;
vector<struct SegmentChar> res; trie.find(unicode.begin(), unicode.end(), res);
trie.find(unicode.begin(), unicode.end(), res);
} }