astyle --style=google --indent=spaces=2

This commit is contained in:
yanyiwu 2015-05-06 17:53:20 +08:00
parent b70875f412
commit bb32234654
21 changed files with 1727 additions and 2071 deletions

View File

@ -12,29 +12,25 @@
using namespace Husky; using namespace Husky;
using namespace CppJieba; using namespace CppJieba;
class ReqHandler: public IRequestHandler class ReqHandler: public IRequestHandler {
{
public: public:
ReqHandler(const string& dictPath, const string& modelPath, const string& userDictPath): _segment(dictPath, modelPath, userDictPath) {}; ReqHandler(const string& dictPath, const string& modelPath, const string& userDictPath): _segment(dictPath, modelPath, userDictPath) {};
virtual ~ReqHandler() {}; virtual ~ReqHandler() {};
public: public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) const virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) const {
{
string sentence, tmp; string sentence, tmp;
vector<string> words; vector<string> words;
httpReq.GET("key", tmp); httpReq.GET("key", tmp);
URLDecode(tmp, sentence); URLDecode(tmp, sentence);
_segment.cut(sentence, words); _segment.cut(sentence, words);
if(httpReq.GET("format", tmp) && tmp == "simple") if(httpReq.GET("format", tmp) && tmp == "simple") {
{
join(words.begin(), words.end(), strSnd, " "); join(words.begin(), words.end(), strSnd, " ");
return true; return true;
} }
strSnd << words; strSnd << words;
return true; return true;
} }
virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const {
{
vector<string> words; vector<string> words;
_segment.cut(httpReq.getBody(), words); _segment.cut(httpReq.getBody(), words);
strSnd << words; strSnd << words;
@ -44,15 +40,12 @@ class ReqHandler: public IRequestHandler
MixSegment _segment; MixSegment _segment;
}; };
bool run(int argc, char** argv) bool run(int argc, char** argv) {
{ if(argc < 2) {
if(argc < 2)
{
return false; return false;
} }
Config conf(argv[1]); Config conf(argv[1]);
if(!conf) if(!conf) {
{
return false; return false;
} }
int port = 0; int port = 0;
@ -66,8 +59,7 @@ bool run(int argc, char** argv)
LIMONP_CHECK(conf.get("queue_max_size", queueMaxSize)); LIMONP_CHECK(conf.get("queue_max_size", queueMaxSize));
LIMONP_CHECK(conf.get("dict_path", dictPath)); LIMONP_CHECK(conf.get("dict_path", dictPath));
LIMONP_CHECK(conf.get("model_path", modelPath)); LIMONP_CHECK(conf.get("model_path", modelPath));
if(!conf.get("user_dict_path", userDictPath)) //optional if(!conf.get("user_dict_path", userDictPath)) { //optional
{
userDictPath = ""; userDictPath = "";
} }
@ -79,10 +71,8 @@ bool run(int argc, char** argv)
} }
int main(int argc, char* argv[]) int main(int argc, char* argv[]) {
{ if(!run(argc, argv)) {
if(!run(argc, argv))
{
printf("usage: %s <config_file>\n", argv[0]); printf("usage: %s <config_file>\n", argv[0]);
return EXIT_FAILURE; return EXIT_FAILURE;
} }

View File

@ -15,48 +15,39 @@
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
const double MIN_DOUBLE = -3.14e+100; const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100; const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3; const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = ""; const char* const UNKNOWN_TAG = "";
class DictTrie class DictTrie {
{
public: public:
DictTrie() DictTrie() {
{
_trie = NULL; _trie = NULL;
_minWeight = MAX_DOUBLE; _minWeight = MAX_DOUBLE;
} }
DictTrie(const string& dictPath, const string& userDictPath = "") DictTrie(const string& dictPath, const string& userDictPath = "") {
{
new (this) DictTrie(); new (this) DictTrie();
init(dictPath, userDictPath); init(dictPath, userDictPath);
} }
~DictTrie() ~DictTrie() {
{ if(_trie) {
if(_trie)
{
delete _trie; delete _trie;
} }
} }
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "") {
{ if(_trie != NULL) {
if(_trie != NULL)
{
LogFatal("trie already initted"); LogFatal("trie already initted");
} }
_loadDict(dictPath); _loadDict(dictPath);
_calculateWeight(_nodeInfos); _calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos); _minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size()) if(userDictPath.size()) {
{
double maxWeight = _findMaxWeight(_nodeInfos); double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
} }
@ -66,37 +57,33 @@ namespace CppJieba
return true; return true;
} }
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
return _trie->find(begin, end); return _trie->find(begin, end);
} }
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
{
return _trie->find(begin, end, dag, offset); return _trie->find(begin, end, dag, offset);
} }
void find( void find(
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<SegmentChar>& res vector<SegmentChar>& res
) const ) const {
{
_trie->find(begin, end, res); _trie->find(begin, end, res);
} }
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
{
return isIn(_userDictSingleChineseWord, word); return isIn(_userDictSingleChineseWord, word);
} }
double getMinWeight() const {return _minWeight;}; double getMinWeight() const {
return _minWeight;
};
private: private:
Trie * _createTrie(const vector<DictUnit>& dictUnits) Trie * _createTrie(const vector<DictUnit>& dictUnits) {
{
assert(dictUnits.size()); assert(dictUnits.size());
vector<Unicode> words; vector<Unicode> words;
vector<const DictUnit*> valuePointers; vector<const DictUnit*> valuePointers;
for(size_t i = 0 ; i < dictUnits.size(); i ++) for(size_t i = 0 ; i < dictUnits.size(); i ++) {
{
words.push_back(dictUnits[i].word); words.push_back(dictUnits[i].word);
valuePointers.push_back(&dictUnits[i]); valuePointers.push_back(&dictUnits[i]);
} }
@ -104,32 +91,26 @@ namespace CppJieba
Trie * trie = new Trie(words, valuePointers); Trie * trie = new Trie(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs.is_open()) if(!ifs.is_open()) {
{
LogFatal("file %s open failed.", filePath.c_str()); LogFatal("file %s open failed.", filePath.c_str());
} }
string line; string line;
DictUnit nodeInfo; DictUnit nodeInfo;
vector<string> buf; vector<string> buf;
size_t lineno; size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++) for(lineno = 0; getline(ifs, line); lineno++) {
{
buf.clear(); buf.clear();
split(line, buf, " "); split(line, buf, " ");
if(buf.size() < 1) if(buf.size() < 1) {
{
LogFatal("split [%s] result illegal", line.c_str()); LogFatal("split [%s] result illegal", line.c_str());
} }
if(!TransCode::decode(buf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word)) {
{
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
} }
if(nodeInfo.word.size() == 1) if(nodeInfo.word.size() == 1) {
{
_userDictSingleChineseWord.insert(nodeInfo.word[0]); _userDictSingleChineseWord.insert(nodeInfo.word[0]);
} }
nodeInfo.weight = defaultWeight; nodeInfo.weight = defaultWeight;
@ -138,27 +119,22 @@ namespace CppJieba
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
void _loadDict(const string& filePath) void _loadDict(const string& filePath) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs.is_open()) if(!ifs.is_open()) {
{
LogFatal("file %s open failed.", filePath.c_str()); LogFatal("file %s open failed.", filePath.c_str());
} }
string line; string line;
vector<string> buf; vector<string> buf;
DictUnit nodeInfo; DictUnit nodeInfo;
for(size_t lineno = 0; getline(ifs, line); lineno++) for(size_t lineno = 0; getline(ifs, line); lineno++) {
{
split(line, buf, " "); split(line, buf, " ");
if(buf.size() != DICT_COLUMN_NUM) if(buf.size() != DICT_COLUMN_NUM) {
{
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size()); LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
} }
if(!TransCode::decode(buf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word)) {
{
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
} }
@ -168,43 +144,35 @@ namespace CppJieba
_nodeInfos.push_back(nodeInfo); _nodeInfos.push_back(nodeInfo);
} }
} }
double _findMinWeight(const vector<DictUnit>& nodeInfos) const double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
{
double ret = MAX_DOUBLE; double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
ret = min(nodeInfos[i].weight, ret); ret = min(nodeInfos[i].weight, ret);
} }
return ret; return ret;
} }
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
{
double ret = MIN_DOUBLE; double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
ret = max(nodeInfos[i].weight, ret); ret = max(nodeInfos[i].weight, ret);
} }
return ret; return ret;
} }
void _calculateWeight(vector<DictUnit>& nodeInfos) const void _calculateWeight(vector<DictUnit>& nodeInfos) const {
{
double sum = 0.0; double sum = 0.0;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
sum += nodeInfos[i].weight; sum += nodeInfos[i].weight;
} }
assert(sum); assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
DictUnit& nodeInfo = nodeInfos[i]; DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight); assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
} }
} }
void _shrink(vector<DictUnit>& units) const void _shrink(vector<DictUnit>& units) const {
{
vector<DictUnit>(units.begin(), units.end()).swap(units); vector<DictUnit>(units.begin(), units.end()).swap(units);
} }

View File

@ -10,43 +10,34 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
namespace CppJieba namespace CppJieba {
{ class FullSegment: public SegmentBase {
class FullSegment: public SegmentBase
{
public: public:
FullSegment() FullSegment() {
{
_dictTrie = NULL; _dictTrie = NULL;
_isBorrowed = false; _isBorrowed = false;
} }
explicit FullSegment(const string& dictPath) explicit FullSegment(const string& dictPath) {
{
_dictTrie = NULL; _dictTrie = NULL;
init(dictPath); init(dictPath);
} }
explicit FullSegment(const DictTrie* dictTrie) explicit FullSegment(const DictTrie* dictTrie) {
{
_dictTrie = NULL; _dictTrie = NULL;
init(dictTrie); init(dictTrie);
} }
virtual ~FullSegment() virtual ~FullSegment() {
{ if(_dictTrie && ! _isBorrowed) {
if(_dictTrie && ! _isBorrowed)
{
delete _dictTrie; delete _dictTrie;
} }
}; };
bool init(const string& dictPath) bool init(const string& dictPath) {
{
assert(_dictTrie == NULL); assert(_dictTrie == NULL);
_dictTrie = new DictTrie(dictPath); _dictTrie = new DictTrie(dictPath);
_isBorrowed = false; _isBorrowed = false;
return true; return true;
} }
bool init(const DictTrie* dictTrie) bool init(const DictTrie* dictTrie) {
{
assert(_dictTrie == NULL); assert(_dictTrie == NULL);
assert(dictTrie); assert(dictTrie);
_dictTrie = dictTrie; _dictTrie = dictTrie;
@ -55,11 +46,9 @@ namespace CppJieba
} }
using SegmentBase::cut; using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{
assert(_dictTrie); assert(_dictTrie);
if (begin >= end) if (begin >= end) {
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
@ -75,27 +64,21 @@ namespace CppJieba
//tmp variables //tmp variables
int wordLen = 0; int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
{
//find word start from uItr //find word start from uItr
if (_dictTrie->find(uItr, end, tRes, 0)) if (_dictTrie->find(uItr, end, tRes, 0)) {
{
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {
wordLen = itr->second->word.size(); wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
{
res.push_back(itr->second->word); res.push_back(itr->second->word);
} }
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
} }
tRes.clear(); tRes.clear();
} } else { // not found word start from uItr
else // not found word start from uItr if (maxIdx <= uIdx) { // never exist in prev results
{
if (maxIdx <= uIdx) // never exist in prev results
{
//put itr itself in res //put itr itself in res
res.push_back(Unicode(1, *uItr)); res.push_back(Unicode(1, *uItr));
@ -109,31 +92,24 @@ namespace CppJieba
return true; return true;
} }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
{
assert(_dictTrie); assert(_dictTrie);
if (begin >= end) if (begin >= end) {
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{
LogError("get unicode cut result error."); LogError("get unicode cut result error.");
return false; return false;
} }
string tmp; string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
{ if (TransCode::encode(*uItr, tmp)) {
if (TransCode::encode(*uItr, tmp))
{
res.push_back(tmp); res.push_back(tmp);
} } else {
else
{
LogError("encode failed."); LogError("encode failed.");
} }
} }

View File

@ -12,12 +12,10 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap; typedef unordered_map<uint16_t, double> EmitProbMap;
class HMMSegment: public SegmentBase class HMMSegment: public SegmentBase {
{
public: public:
/* /*
* STATUS: * STATUS:
@ -27,14 +25,12 @@ namespace CppJieba
public: public:
HMMSegment() {} HMMSegment() {}
explicit HMMSegment(const string& filePath) explicit HMMSegment(const string& filePath) {
{
LIMONP_CHECK(init(filePath)); LIMONP_CHECK(init(filePath));
} }
virtual ~HMMSegment() {} virtual ~HMMSegment() {}
public: public:
bool init(const string& filePath) bool init(const string& filePath) {
{
memset(_startProb, 0, sizeof(_startProb)); memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb)); memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B'; _statMap[0] = 'B';
@ -52,65 +48,51 @@ namespace CppJieba
public: public:
using SegmentBase::cut; using SegmentBase::cut;
public: public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
{
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin; Unicode::const_iterator right = begin;
while(right != end) while(right != end) {
{ if(*right < 0x80) {
if(*right < 0x80) if(left != right && !_cut(left, right, res)) {
{
if(left != right && !_cut(left, right, res))
{
return false; return false;
} }
left = right; left = right;
do { do {
right = _sequentialLetterRule(left, end); right = _sequentialLetterRule(left, end);
if(right != left) if(right != left) {
{
break; break;
} }
right = _numbersRule(left, end); right = _numbersRule(left, end);
if(right != left) if(right != left) {
{
break; break;
} }
right ++; right ++;
} while(false); } while(false);
res.push_back(Unicode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
} } else {
else
{
right++; right++;
} }
} }
if(left != right && !_cut(left, right, res)) if(left != right && !_cut(left, right, res)) {
{
return false; return false;
} }
return true; return true;
} }
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
if(!cut(begin, end, words)) if(!cut(begin, end, words)) {
{
return false; return false;
} }
size_t offset = res.size(); size_t offset = res.size();
res.resize(res.size() + words.size()); res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++) for(size_t i = 0; i < words.size(); i++) {
{ if(!TransCode::encode(words[i], res[offset + i])) {
if(!TransCode::encode(words[i], res[offset + i]))
{
LogError("encode failed."); LogError("encode failed.");
} }
} }
@ -118,72 +100,52 @@ namespace CppJieba
} }
private: private:
// sequential letters rule // sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
Unicode::value_type x = *begin; Unicode::value_type x = *begin;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
{
begin ++; begin ++;
} } else {
else
{
return begin; return begin;
} }
while(begin != end) while(begin != end) {
{
x = *begin; x = *begin;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
{
begin ++; begin ++;
} } else {
else
{
break; break;
} }
} }
return begin; return begin;
} }
// //
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
Unicode::value_type x = *begin; Unicode::value_type x = *begin;
if('0' <= x && x <= '9') if('0' <= x && x <= '9') {
{
begin ++; begin ++;
} } else {
else
{
return begin; return begin;
} }
while(begin != end) while(begin != end) {
{
x = *begin; x = *begin;
if( ('0' <= x && x <= '9') || x == '.') if( ('0' <= x && x <= '9') || x == '.') {
{
begin++; begin++;
} } else {
else
{
break; break;
} }
} }
return begin; return begin;
} }
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{
vector<size_t> status; vector<size_t> status;
if(!_viterbi(begin, end, status)) if(!_viterbi(begin, end, status)) {
{
LogError("_viterbi failed."); LogError("_viterbi failed.");
return false; return false;
} }
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right; Unicode::const_iterator right;
for(size_t i = 0; i < status.size(); i++) for(size_t i = 0; i < status.size(); i++) {
{ if(status[i] % 2) { //if(E == status[i] || S == status[i])
if(status[i] % 2) //if(E == status[i] || S == status[i])
{
right = begin + i + 1; right = begin + i + 1;
res.push_back(Unicode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
@ -192,10 +154,8 @@ namespace CppJieba
return true; return true;
} }
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
@ -210,8 +170,7 @@ namespace CppJieba
vector<double> weight(XYSize); vector<double> weight(XYSize);
//start //start
for(size_t y = 0; y < Y; y++) for(size_t y = 0; y < Y; y++) {
{
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1; path[0 + y * X] = -1;
} }
@ -219,20 +178,16 @@ namespace CppJieba
double emitProb; double emitProb;
for(size_t x = 1; x < X; x++) for(size_t x = 1; x < X; x++) {
{ for(size_t y = 0; y < Y; y++) {
for(size_t y = 0; y < Y; y++)
{
now = x + y*X; now = x + y*X;
weight[now] = MIN_DOUBLE; weight[now] = MIN_DOUBLE;
path[now] = E; // warning path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) for(size_t preY = 0; preY < Y; preY++) {
{
old = x - 1 + preY * X; old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + emitProb; tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now]) if(tmp > weight[now]) {
{
weight[now] = tmp; weight[now] = tmp;
path[now] = preY; path[now] = preY;
} }
@ -243,127 +198,102 @@ namespace CppJieba
endE = weight[X-1+E*X]; endE = weight[X-1+E*X];
endS = weight[X-1+S*X]; endS = weight[X-1+S*X];
stat = 0; stat = 0;
if(endE >= endS) if(endE >= endS) {
{
stat = E; stat = E;
} } else {
else
{
stat = S; stat = S;
} }
status.resize(X); status.resize(X);
for(int x = X -1 ; x >= 0; x--) for(int x = X -1 ; x >= 0; x--) {
{
status[x] = stat; status[x] = stat;
stat = path[x + stat*X]; stat = path[x + stat*X];
} }
return true; return true;
} }
bool _loadModel(const char* const filePath) bool _loadModel(const char* const filePath) {
{
ifstream ifile(filePath); ifstream ifile(filePath);
string line; string line;
vector<string> tmp; vector<string> tmp;
vector<string> tmp2; vector<string> tmp2;
//load _startProb //load _startProb
if(!_getLine(ifile, line)) if(!_getLine(ifile, line)) {
{
return false; return false;
} }
split(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM) {
{
LogError("start_p illegal"); LogError("start_p illegal");
return false; return false;
} }
for(size_t j = 0; j< tmp.size(); j++) for(size_t j = 0; j< tmp.size(); j++) {
{
_startProb[j] = atof(tmp[j].c_str()); _startProb[j] = atof(tmp[j].c_str());
} }
//load _transProb //load _transProb
for(size_t i = 0; i < STATUS_SUM; i++) for(size_t i = 0; i < STATUS_SUM; i++) {
{ if(!_getLine(ifile, line)) {
if(!_getLine(ifile, line))
{
return false; return false;
} }
split(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM) {
{
LogError("trans_p illegal"); LogError("trans_p illegal");
return false; return false;
} }
for(size_t j =0; j < STATUS_SUM; j++) for(size_t j =0; j < STATUS_SUM; j++) {
{
_transProb[i][j] = atof(tmp[j].c_str()); _transProb[i][j] = atof(tmp[j].c_str());
} }
} }
//load _emitProbB //load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
{
return false; return false;
} }
//load _emitProbE //load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
{
return false; return false;
} }
//load _emitProbM //load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
{
return false; return false;
} }
//load _emitProbS //load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
{
return false; return false;
} }
return true; return true;
} }
bool _getLine(ifstream& ifile, string& line) bool _getLine(ifstream& ifile, string& line) {
{ while(getline(ifile, line)) {
while(getline(ifile, line))
{
trim(line); trim(line);
if(line.empty()) if(line.empty()) {
{
continue; continue;
} }
if(startsWith(line, "#")) if(startsWith(line, "#")) {
{
continue; continue;
} }
return true; return true;
} }
return false; return false;
} }
bool _loadEmitProb(const string& line, EmitProbMap& mp) bool _loadEmitProb(const string& line, EmitProbMap& mp) {
{ if(line.empty()) {
if(line.empty())
{
return false; return false;
} }
vector<string> tmp, tmp2; vector<string> tmp, tmp2;
Unicode unicode; Unicode unicode;
split(line, tmp, ","); split(line, tmp, ",");
for(size_t i = 0; i < tmp.size(); i++) for(size_t i = 0; i < tmp.size(); i++) {
{
split(tmp[i], tmp2, ":"); split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) if(2 != tmp2.size()) {
{
LogError("_emitProb illegal."); LogError("_emitProb illegal.");
return false; return false;
} }
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
{
LogError("TransCode failed."); LogError("TransCode failed.");
return false; return false;
} }
@ -371,11 +301,9 @@ namespace CppJieba
} }
return true; return true;
} }
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
{
EmitProbMap::const_iterator cit = ptMp->find(key); EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) if(cit == ptMp->end()) {
{
return defVal; return defVal;
} }
return cit->second; return cit->second;

View File

@ -2,10 +2,8 @@
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H
namespace CppJieba namespace CppJieba {
{ class ISegment {
class ISegment
{
public: public:
virtual ~ISegment() {}; virtual ~ISegment() {};
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;

View File

@ -5,76 +5,60 @@
#include <cmath> #include <cmath>
#include <set> #include <set>
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
/*utf8*/ /*utf8*/
class KeywordExtractor class KeywordExtractor {
{
public: public:
KeywordExtractor() {}; KeywordExtractor() {};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
}; };
~KeywordExtractor() {}; ~KeywordExtractor() {};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{
_loadIdfDict(idfPath); _loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath); _loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
}; };
bool extract(const string& str, vector<string>& keywords, size_t topN) const bool extract(const string& str, vector<string>& keywords, size_t topN) const {
{
vector<pair<string, double> > topWords; vector<pair<string, double> > topWords;
if(!extract(str, topWords, topN)) if(!extract(str, topWords, topN)) {
{
return false; return false;
} }
for(size_t i = 0; i < topWords.size(); i++) for(size_t i = 0; i < topWords.size(); i++) {
{
keywords.push_back(topWords[i].first); keywords.push_back(topWords[i].first);
} }
return true; return true;
} }
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
{
vector<string> words; vector<string> words;
if(!_segment.cut(str, words)) if(!_segment.cut(str, words)) {
{
LogError("segment cut(%s) failed.", str.c_str()); LogError("segment cut(%s) failed.", str.c_str());
return false; return false;
} }
map<string, double> wordmap; map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
{ if(_isSingleWord(*iter)) {
if(_isSingleWord(*iter))
{
continue; continue;
} }
wordmap[*iter] += 1.0; wordmap[*iter] += 1.0;
} }
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
{ if(_stopWords.end() != _stopWords.find(itr->first)) {
if(_stopWords.end() != _stopWords.find(itr->first))
{
wordmap.erase(itr++); wordmap.erase(itr++);
continue; continue;
} }
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) if(cit != _idfMap.end()) {
{
itr->second *= cit->second; itr->second *= cit->second;
} } else {
else
{
itr->second *= _idfAverage; itr->second *= _idfAverage;
} }
itr ++; itr ++;
@ -88,11 +72,9 @@ namespace CppJieba
return true; return true;
} }
private: private:
void _loadIdfDict(const string& idfPath) void _loadIdfDict(const string& idfPath) {
{
ifstream ifs(idfPath.c_str()); ifstream ifs(idfPath.c_str());
if(!ifs.is_open()) if(!ifs.is_open()) {
{
LogFatal("open %s failed.", idfPath.c_str()); LogFatal("open %s failed.", idfPath.c_str());
} }
string line ; string line ;
@ -100,16 +82,13 @@ namespace CppJieba
double idf = 0.0; double idf = 0.0;
double idfSum = 0.0; double idfSum = 0.0;
size_t lineno = 0; size_t lineno = 0;
for(;getline(ifs, line); lineno++) for(; getline(ifs, line); lineno++) {
{
buf.clear(); buf.clear();
if(line.empty()) if(line.empty()) {
{
LogError("line[%d] empty. skipped.", lineno); LogError("line[%d] empty. skipped.", lineno);
continue; continue;
} }
if(!split(line, buf, " ") || buf.size() != 2) if(!split(line, buf, " ") || buf.size() != 2) {
{
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue; continue;
} }
@ -123,23 +102,19 @@ namespace CppJieba
_idfAverage = idfSum / lineno; _idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0); assert(_idfAverage > 0.0);
} }
void _loadStopWordDict(const string& filePath) void _loadStopWordDict(const string& filePath) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs.is_open()) if(!ifs.is_open()) {
{
LogFatal("open %s failed.", filePath.c_str()); LogFatal("open %s failed.", filePath.c_str());
} }
string line ; string line ;
while(getline(ifs, line)) while(getline(ifs, line)) {
{
_stopWords.insert(line); _stopWords.insert(line);
} }
assert(_stopWords.size()); assert(_stopWords.size());
} }
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const {
{
Unicode unicode; Unicode unicode;
TransCode::decode(str, unicode); TransCode::decode(str, unicode);
if(unicode.size() == 1) if(unicode.size() == 1)
@ -147,8 +122,7 @@ namespace CppJieba
return false; return false;
} }
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
{
return lhs.second > rhs.second; return lhs.second > rhs.second;
} }

View File

@ -9,51 +9,41 @@
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
namespace CppJieba namespace CppJieba {
{
class MPSegment: public SegmentBase class MPSegment: public SegmentBase {
{
public: public:
MPSegment() {}; MPSegment() {};
MPSegment(const string& dictPath, const string& userDictPath = "") MPSegment(const string& dictPath, const string& userDictPath = "") {
{
LIMONP_CHECK(init(dictPath, userDictPath)); LIMONP_CHECK(init(dictPath, userDictPath));
}; };
virtual ~MPSegment() {}; virtual ~MPSegment() {};
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "") {
{
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath)); LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
LogInfo("MPSegment init(%s) ok", dictPath.c_str()); LogInfo("MPSegment init(%s) ok", dictPath.c_str());
return true; return true;
} }
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
{
return _dictTrie.isUserDictSingleChineseWord(value); return _dictTrie.isUserDictSingleChineseWord(value);
} }
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
if(!cut(begin, end, words)) if(!cut(begin, end, words)) {
{
return false; return false;
} }
size_t offset = res.size(); size_t offset = res.size();
res.resize(res.size() + words.size()); res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++) for(size_t i = 0; i < words.size(); i++) {
{ if(!TransCode::encode(words[i], res[i + offset])) {
if(!TransCode::encode(words[i], res[i + offset]))
{
LogError("encode failed."); LogError("encode failed.");
res[i + offset].clear(); res[i + offset].clear();
} }
@ -61,10 +51,8 @@ namespace CppJieba
return true; return true;
} }
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
{ if(end == begin) {
if(end == begin)
{
return false; return false;
} }
vector<SegmentChar> segmentChars; vector<SegmentChar> segmentChars;
@ -77,62 +65,48 @@ namespace CppJieba
return true; return true;
} }
const DictTrie* getDictTrie() const const DictTrie* getDictTrie() const {
{
return &_dictTrie; return &_dictTrie;
} }
private: private:
void _calcDP(vector<SegmentChar>& segmentChars) const void _calcDP(vector<SegmentChar>& segmentChars) const {
{
size_t nextPos; size_t nextPos;
const DictUnit* p; const DictUnit* p;
double val; double val;
for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) {
{
segmentChars[i].pInfo = NULL; segmentChars[i].pInfo = NULL;
segmentChars[i].weight = MIN_DOUBLE; segmentChars[i].weight = MIN_DOUBLE;
assert(!segmentChars[i].dag.empty()); assert(!segmentChars[i].dag.empty());
for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) {
{
nextPos = it->first; nextPos = it->first;
p = it->second; p = it->second;
val = 0.0; val = 0.0;
if(nextPos + 1 < segmentChars.size()) if(nextPos + 1 < segmentChars.size()) {
{
val += segmentChars[nextPos + 1].weight; val += segmentChars[nextPos + 1].weight;
} }
if(p) if(p) {
{
val += p->weight; val += p->weight;
} } else {
else
{
val += _dictTrie.getMinWeight(); val += _dictTrie.getMinWeight();
} }
if(val > segmentChars[i].weight) if(val > segmentChars[i].weight) {
{
segmentChars[i].pInfo = p; segmentChars[i].pInfo = p;
segmentChars[i].weight = val; segmentChars[i].weight = val;
} }
} }
} }
} }
void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
{
size_t i = 0; size_t i = 0;
while(i < segmentChars.size()) while(i < segmentChars.size()) {
{
const DictUnit* p = segmentChars[i].pInfo; const DictUnit* p = segmentChars[i].pInfo;
if(p) if(p) {
{
res.push_back(p->word); res.push_back(p->word);
i += p->word.size(); i += p->word.size();
} } else { //single chinese word
else//single chinese word
{
res.push_back(Unicode(1, segmentChars[i].uniCh)); res.push_back(Unicode(1, segmentChars[i].uniCh));
i++; i++;
} }

View File

@ -6,35 +6,27 @@
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
namespace CppJieba namespace CppJieba {
{ class MixSegment: public SegmentBase {
class MixSegment: public SegmentBase
{
public: public:
MixSegment() MixSegment() {
{
} }
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
{
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict)); LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
} }
virtual ~MixSegment() virtual ~MixSegment() {
{
} }
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
{
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict)); LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
LIMONP_CHECK(_hmmSeg.init(hmmSegDict)); LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
return true; return true;
} }
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
if(!_mpSeg.cut(begin, end, words)) if(!_mpSeg.cut(begin, end, words)) {
{
LogError("mpSeg cutDAG failed."); LogError("mpSeg cutDAG failed.");
return false; return false;
} }
@ -43,33 +35,28 @@ namespace CppJieba
hmmRes.reserve(end - begin); hmmRes.reserve(end - begin);
Unicode piece; Unicode piece;
piece.reserve(end - begin); piece.reserve(end - begin);
for (size_t i = 0, j = 0; i < words.size(); i++) for (size_t i = 0, j = 0; i < words.size(); i++) {
{
//if mp get a word, it's ok, put it into result //if mp get a word, it's ok, put it into result
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
{
res.push_back(words[i]); res.push_back(words[i]);
continue; continue;
} }
// if mp get a single one and it is not in userdict, collect it in sequence // if mp get a single one and it is not in userdict, collect it in sequence
j = i; j = i;
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
{
piece.push_back(words[j][0]); piece.push_back(words[j][0]);
j++; j++;
} }
// cut the sequence with hmm // cut the sequence with hmm
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
{
LogError("_hmmSeg cut failed."); LogError("_hmmSeg cut failed.");
return false; return false;
} }
//put hmm result to result //put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) for (size_t k = 0; k < hmmRes.size(); k++) {
{
res.push_back(hmmRes[k]); res.push_back(hmmRes[k]);
} }
@ -83,34 +70,28 @@ namespace CppJieba
return true; return true;
} }
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
uRes.reserve(end - begin); uRes.reserve(end - begin);
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{
return false; return false;
} }
size_t offset = res.size(); size_t offset = res.size();
res.resize(res.size() + uRes.size()); res.resize(res.size() + uRes.size());
for(size_t i = 0; i < uRes.size(); i ++, offset++) for(size_t i = 0; i < uRes.size(); i ++, offset++) {
{ if(!TransCode::encode(uRes[i], res[offset])) {
if(!TransCode::encode(uRes[i], res[offset]))
{
LogError("encode failed."); LogError("encode failed.");
} }
} }
return true; return true;
} }
const DictTrie* getDictTrie() const const DictTrie* getDictTrie() const {
{
return _mpSeg.getDictTrie(); return _mpSeg.getDictTrie();
} }
private: private:

View File

@ -5,97 +5,78 @@
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
static const char* const POS_M = "m"; static const char* const POS_M = "m";
static const char* const POS_ENG = "eng"; static const char* const POS_ENG = "eng";
static const char* const POS_X = "x"; static const char* const POS_X = "x";
class PosTagger class PosTagger {
{
public: public:
PosTagger() PosTagger() {
{
} }
PosTagger( PosTagger(
const string& dictPath, const string& dictPath,
const string& hmmFilePath, const string& hmmFilePath,
const string& userDictPath = "" const string& userDictPath = ""
) ) {
{
init(dictPath, hmmFilePath, userDictPath); init(dictPath, hmmFilePath, userDictPath);
} }
~PosTagger() ~PosTagger() {
{
} }
void init( void init(
const string& dictPath, const string& dictPath,
const string& hmmFilePath, const string& hmmFilePath,
const string& userDictPath = "" const string& userDictPath = ""
) ) {
{
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath)); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
_dictTrie = _segment.getDictTrie(); _dictTrie = _segment.getDictTrie();
LIMONP_CHECK(_dictTrie); LIMONP_CHECK(_dictTrie);
}; };
bool tag(const string& src, vector<pair<string, string> >& res) const bool tag(const string& src, vector<pair<string, string> >& res) const {
{
vector<string> cutRes; vector<string> cutRes;
if (!_segment.cut(src, cutRes)) if (!_segment.cut(src, cutRes)) {
{
LogError("_mixSegment cut failed"); LogError("_mixSegment cut failed");
return false; return false;
} }
const DictUnit *tmp = NULL; const DictUnit *tmp = NULL;
Unicode unico; Unicode unico;
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
{ if (!TransCode::decode(*itr, unico)) {
if (!TransCode::decode(*itr, unico))
{
LogError("decode failed."); LogError("decode failed.");
return false; return false;
} }
tmp = _dictTrie->find(unico.begin(), unico.end()); tmp = _dictTrie->find(unico.begin(), unico.end());
if(tmp == NULL || tmp->tag.empty()) if(tmp == NULL || tmp->tag.empty()) {
{
res.push_back(make_pair(*itr, _specialRule(unico))); res.push_back(make_pair(*itr, _specialRule(unico)));
} } else {
else
{
res.push_back(make_pair(*itr, tmp->tag)); res.push_back(make_pair(*itr, tmp->tag));
} }
} }
return !res.empty(); return !res.empty();
} }
private: private:
const char* _specialRule(const Unicode& unicode) const const char* _specialRule(const Unicode& unicode) const {
{
size_t m = 0; size_t m = 0;
size_t eng = 0; size_t eng = 0;
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
{ if(unicode[i] < 0x80) {
if(unicode[i] < 0x80)
{
eng ++; eng ++;
if('0' <= unicode[i] && unicode[i] <= '9') if('0' <= unicode[i] && unicode[i] <= '9') {
{
m++; m++;
} }
} }
} }
// ascii char is not found // ascii char is not found
if(eng == 0) if(eng == 0) {
{
return POS_X; return POS_X;
} }
// all the ascii is number char // all the ascii is number char
if(m == eng) if(m == eng) {
{
return POS_M; return POS_M;
} }
// the ascii chars contain english letter // the ascii chars contain english letter

View File

@ -13,19 +13,15 @@
#include "TransCode.hpp" #include "TransCode.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{ class QuerySegment: public SegmentBase {
class QuerySegment: public SegmentBase
{
public: public:
QuerySegment() {}; QuerySegment() {};
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
{
init(dict, model, maxWordLen, userDict); init(dict, model, maxWordLen, userDict);
}; };
virtual ~QuerySegment() {}; virtual ~QuerySegment() {};
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
{
LIMONP_CHECK(_mixSeg.init(dict, model, userDict)); LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie())); LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
assert(maxWordLen); assert(maxWordLen);
@ -33,42 +29,33 @@ namespace CppJieba
return true; return true;
} }
using SegmentBase::cut; using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{ if (begin >= end) {
if (begin >= end)
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
//use mix cut first //use mix cut first
vector<Unicode> mixRes; vector<Unicode> mixRes;
if (!_mixSeg.cut(begin, end, mixRes)) if (!_mixSeg.cut(begin, end, mixRes)) {
{
LogError("_mixSeg cut failed."); LogError("_mixSeg cut failed.");
return false; return false;
} }
vector<Unicode> fullRes; vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
{
// if it's too long, cut with _fullSeg, put fullRes in res // if it's too long, cut with _fullSeg, put fullRes in res
if (mixResItr->size() > _maxWordLen) if (mixResItr->size() > _maxWordLen) {
{ if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
{
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
{
res.push_back(*fullResItr); res.push_back(*fullResItr);
} }
//clear tmp res //clear tmp res
fullRes.clear(); fullRes.clear();
} }
} } else { // just use the mix result
else // just use the mix result
{
res.push_back(*mixResItr); res.push_back(*mixResItr);
} }
} }
@ -77,30 +64,23 @@ namespace CppJieba
} }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
{ if (begin >= end) {
if (begin >= end)
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{
LogError("get unicode cut result error."); LogError("get unicode cut result error.");
return false; return false;
} }
string tmp; string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
{ if (TransCode::encode(*uItr, tmp)) {
if (TransCode::encode(*uItr, tmp))
{
res.push_back(tmp); res.push_back(tmp);
} } else {
else
{
LogError("encode failed."); LogError("encode failed.");
} }
} }

View File

@ -9,8 +9,7 @@
#include <cassert> #include <cassert>
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
//const char* const SPECIAL_CHARS = " \t\n"; //const char* const SPECIAL_CHARS = " \t\n";
@ -20,15 +19,15 @@ namespace CppJieba
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u}; const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
#endif #endif
class SegmentBase: public ISegment, public NonCopyable class SegmentBase: public ISegment, public NonCopyable {
{
public: public:
SegmentBase(){_loadSpecialSymbols();}; SegmentBase() {
_loadSpecialSymbols();
};
virtual ~SegmentBase() {}; virtual ~SegmentBase() {};
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const virtual bool cut(const string& str, vector<string>& res) const {
{
res.clear(); res.clear();
Unicode unicode; Unicode unicode;
@ -39,12 +38,9 @@ namespace CppJieba
Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator left = unicode.begin();
Unicode::const_iterator right; Unicode::const_iterator right;
for(right = unicode.begin(); right != unicode.end(); right++) for(right = unicode.begin(); right != unicode.end(); right++) {
{ if(isIn(_specialSymbols, *right)) {
if(isIn(_specialSymbols, *right)) if(left != right) {
{
if(left != right)
{
cut(left, right, res); cut(left, right, res);
} }
res.resize(res.size() + 1); res.resize(res.size() + 1);
@ -52,19 +48,16 @@ namespace CppJieba
left = right + 1; left = right + 1;
} }
} }
if(left != right) if(left != right) {
{
cut(left, right, res); cut(left, right, res);
} }
return true; return true;
} }
private: private:
void _loadSpecialSymbols() void _loadSpecialSymbols() {
{
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) for(size_t i = 0; i < size; i ++) {
{
_specialSymbols.insert(SPECIAL_SYMBOL[i]); _specialSymbols.insert(SPECIAL_SYMBOL[i]);
} }
assert(_specialSymbols.size()); assert(_specialSymbols.size());

View File

@ -9,16 +9,13 @@
#include "Limonp/StringUtil.hpp" #include "Limonp/StringUtil.hpp"
#include "Limonp/LocalVector.hpp" #include "Limonp/LocalVector.hpp"
namespace CppJieba namespace CppJieba {
{
using namespace Limonp; using namespace Limonp;
typedef uint16_t UnicodeValueType; typedef uint16_t UnicodeValueType;
typedef Limonp::LocalVector<UnicodeValueType> Unicode; typedef Limonp::LocalVector<UnicodeValueType> Unicode;
namespace TransCode namespace TransCode {
{ inline bool decode(const string& str, Unicode& res) {
inline bool decode(const string& str, Unicode& res)
{
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(str, res); return gbkTrans(str, res);
#else #else
@ -26,8 +23,7 @@ namespace CppJieba
#endif #endif
} }
inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) {
{
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(begin, end, res); return gbkTrans(begin, end, res);
#else #else
@ -35,14 +31,12 @@ namespace CppJieba
#endif #endif
} }
inline bool encode(const Unicode& uni, string& res) inline bool encode(const Unicode& uni, string& res) {
{
return encode(uni.begin(), uni.end(), res); return encode(uni.begin(), uni.end(), res);
} }
// compiler is expected to optimized this function to avoid return value copy // compiler is expected to optimized this function to avoid return value copy
inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) {
{
string res; string res;
res.reserve(end - begin); res.reserve(end - begin);
encode(begin, end, res); encode(begin, end, res);
@ -50,8 +44,7 @@ namespace CppJieba
} }
// compiler is expected to optimized this function to avoid return value copy // compiler is expected to optimized this function to avoid return value copy
inline Unicode decode(const string& str) inline Unicode decode(const string& str) {
{
Unicode unicode; Unicode unicode;
unicode.reserve(str.size()); unicode.reserve(str.size());
decode(str, unicode); decode(str, unicode);

View File

@ -5,20 +5,17 @@
#include <vector> #include <vector>
#include <queue> #include <queue>
namespace CppJieba namespace CppJieba {
{
using namespace std; using namespace std;
struct DictUnit struct DictUnit {
{
Unicode word; Unicode word;
double weight; double weight;
string tag; string tag;
}; };
// for debugging // for debugging
inline ostream & operator << (ostream& os, const DictUnit& unit) inline ostream & operator << (ostream& os, const DictUnit& unit) {
{
string s; string s;
s << unit.word; s << unit.word;
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
@ -26,35 +23,30 @@ namespace CppJieba
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType; typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
struct SegmentChar struct SegmentChar {
{
uint16_t uniCh; uint16_t uniCh;
DagType dag; DagType dag;
const DictUnit * pInfo; const DictUnit * pInfo;
double weight; double weight;
size_t nextPos; size_t nextPos;
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {
{} }
~SegmentChar() ~SegmentChar() {
{} }
}; };
typedef Unicode::value_type TrieKey; typedef Unicode::value_type TrieKey;
class TrieNode class TrieNode {
{
public: public:
TrieNode(): fail(NULL), next(NULL), ptValue(NULL) TrieNode(): fail(NULL), next(NULL), ptValue(NULL) {
{} }
const TrieNode * findNext(TrieKey key) const const TrieNode * findNext(TrieKey key) const {
{ if(next == NULL) {
if(next == NULL)
{
return NULL; return NULL;
} }
NextMap::const_iterator iter = next->find(key); NextMap::const_iterator iter = next->find(key);
if(iter == next->end()) if(iter == next->end()) {
{
return NULL; return NULL;
} }
return iter->second; return iter->second;
@ -66,32 +58,26 @@ namespace CppJieba
const DictUnit * ptValue; const DictUnit * ptValue;
}; };
class Trie class Trie {
{
public: public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
{
_root = new TrieNode; _root = new TrieNode;
_createTrie(keys, valuePointers); _createTrie(keys, valuePointers);
_build();// build automation _build();// build automation
} }
~Trie() ~Trie() {
{ if(_root) {
if(_root)
{
_deleteNode(_root); _deleteNode(_root);
} }
} }
public: public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
TrieNode::NextMap::const_iterator citer; TrieNode::NextMap::const_iterator citer;
const TrieNode* ptNode = _root; const TrieNode* ptNode = _root;
for(Unicode::const_iterator it = begin; it != end; it++) for(Unicode::const_iterator it = begin; it != end; it++) {
{// build automation // build automation
assert(ptNode); assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) {
{
return NULL; return NULL;
} }
ptNode = citer->second; ptNode = citer->second;
@ -103,14 +89,12 @@ namespace CppJieba
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<struct SegmentChar>& res vector<struct SegmentChar>& res
) const ) const {
{
res.resize(end - begin); res.resize(end - begin);
const TrieNode * now = _root; const TrieNode * now = _root;
const TrieNode* node; const TrieNode* node;
// compiler will complain warnings if only "i < end - begin" . // compiler will complain warnings if only "i < end - begin" .
for (size_t i = 0; i < size_t(end - begin); i++) for (size_t i = 0; i < size_t(end - begin); i++) {
{
Unicode::value_type ch = *(begin + i); Unicode::value_type ch = *(begin + i);
res[i].uniCh = ch; res[i].uniCh = ch;
assert(res[i].dag.empty()); assert(res[i].dag.empty());
@ -118,40 +102,29 @@ namespace CppJieba
bool flag = false; bool flag = false;
// rollback // rollback
while( now != _root ) while( now != _root ) {
{
node = now->findNext(ch); node = now->findNext(ch);
if (node != NULL) if (node != NULL) {
{
flag = true; flag = true;
break; break;
} } else {
else
{
now = now->fail; now = now->fail;
} }
} }
if(!flag) if(!flag) {
{
node = now->findNext(ch); node = now->findNext(ch);
} }
if(node == NULL) if(node == NULL) {
{
now = _root; now = _root;
} } else {
else
{
now = node; now = node;
const TrieNode * temp = now; const TrieNode * temp = now;
while(temp != _root) while(temp != _root) {
{ if (temp->ptValue) {
if (temp->ptValue)
{
size_t pos = i - temp->ptValue->word.size() + 1; size_t pos = i - temp->ptValue->word.size() + 1;
res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue)); res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
if(pos == i) if(pos == i) {
{
res[pos].dag[0].second = temp->ptValue; res[pos].dag[0].second = temp->ptValue;
} }
} }
@ -165,26 +138,19 @@ namespace CppJieba
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
DagType & res, DagType & res,
size_t offset = 0) const size_t offset = 0) const {
{
const TrieNode * ptNode = _root; const TrieNode * ptNode = _root;
TrieNode::NextMap::const_iterator citer; TrieNode::NextMap::const_iterator citer;
for(Unicode::const_iterator itr = begin; itr != end ; itr++) for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
{
assert(ptNode); assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) {
{
break; break;
} }
ptNode = citer->second; ptNode = citer->second;
if(ptNode->ptValue) if(ptNode->ptValue) {
{ if(itr == begin && res.size() == 1) { // first singleword
if(itr == begin && res.size() == 1) // first singleword
{
res[0].second = ptNode->ptValue; res[0].second = ptNode->ptValue;
} } else {
else
{
res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue)); res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
} }
} }
@ -192,8 +158,7 @@ namespace CppJieba
return !res.empty(); return !res.empty();
} }
private: private:
void _build() void _build() {
{
queue<TrieNode*> que; queue<TrieNode*> que;
assert(_root->ptValue == NULL); assert(_root->ptValue == NULL);
assert(_root->next); assert(_root->next);
@ -213,8 +178,7 @@ namespace CppJieba
for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
back = now->fail; back = now->fail;
while(back != NULL) { while(back != NULL) {
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) {
{
iter->second->fail = backiter->second; iter->second->fail = backiter->second;
break; break;
} }
@ -227,59 +191,46 @@ namespace CppJieba
} }
} }
} }
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
{ if(valuePointers.empty() || keys.empty()) {
if(valuePointers.empty() || keys.empty())
{
return; return;
} }
assert(keys.size() == valuePointers.size()); assert(keys.size() == valuePointers.size());
for(size_t i = 0; i < keys.size(); i++) for(size_t i = 0; i < keys.size(); i++) {
{
_insertNode(keys[i], valuePointers[i]); _insertNode(keys[i], valuePointers[i]);
} }
} }
void _insertNode(const Unicode& key, const DictUnit* ptValue) void _insertNode(const Unicode& key, const DictUnit* ptValue) {
{
TrieNode* ptNode = _root; TrieNode* ptNode = _root;
TrieNode::NextMap::const_iterator kmIter; TrieNode::NextMap::const_iterator kmIter;
for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) {
{ if(NULL == ptNode->next) {
if(NULL == ptNode->next)
{
ptNode->next = new TrieNode::NextMap; ptNode->next = new TrieNode::NextMap;
} }
kmIter = ptNode->next->find(*citer); kmIter = ptNode->next->find(*citer);
if(ptNode->next->end() == kmIter) if(ptNode->next->end() == kmIter) {
{
TrieNode * nextNode = new TrieNode; TrieNode * nextNode = new TrieNode;
nextNode->next = NULL; nextNode->next = NULL;
nextNode->ptValue = NULL; nextNode->ptValue = NULL;
(*ptNode->next)[*citer] = nextNode; (*ptNode->next)[*citer] = nextNode;
ptNode = nextNode; ptNode = nextNode;
} } else {
else
{
ptNode = kmIter->second; ptNode = kmIter->second;
} }
} }
ptNode->ptValue = ptValue; ptNode->ptValue = ptValue;
} }
void _deleteNode(TrieNode* node) void _deleteNode(TrieNode* node) {
{ if(!node) {
if(!node)
{
return; return;
} }
if(node->next) if(node->next) {
{
TrieNode::NextMap::iterator it; TrieNode::NextMap::iterator it;
for(it = node->next->begin(); it != node->next->end(); it++) for(it = node->next->begin(); it != node->next->end(); it++) {
{
_deleteNode(it->second); _deleteNode(it->second);
} }
delete node->next; delete node->next;

View File

@ -1,8 +1,7 @@
#include "../src/KeywordExtractor.hpp" #include "../src/KeywordExtractor.hpp"
using namespace CppJieba; using namespace CppJieba;
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
//KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8"); //KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");

View File

@ -9,8 +9,7 @@
using namespace CppJieba; using namespace CppJieba;
void cut(size_t times = 50) void cut(size_t times = 50) {
{
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
vector<string> res; vector<string> res;
string doc; string doc;
@ -18,8 +17,7 @@ void cut(size_t times = 50)
assert(ifs); assert(ifs);
doc << ifs; doc << ifs;
long beginTime = clock(); long beginTime = clock();
for(size_t i = 0; i < times; i ++) for(size_t i = 0; i < times; i ++) {
{
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
fflush(stdout); fflush(stdout);
res.clear(); res.clear();
@ -30,8 +28,7 @@ void cut(size_t times = 50)
ColorPrintln(GREEN, "cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC); ColorPrintln(GREEN, "cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
} }
void extract(size_t times = 400) void extract(size_t times = 400) {
{
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
vector<string> words; vector<string> words;
string doc; string doc;
@ -39,8 +36,7 @@ void extract(size_t times = 400)
assert(ifs); assert(ifs);
doc << ifs; doc << ifs;
long beginTime = clock(); long beginTime = clock();
for(size_t i = 0; i < times; i ++) for(size_t i = 0; i < times; i ++) {
{
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
fflush(stdout); fflush(stdout);
words.clear(); words.clear();
@ -51,8 +47,7 @@ void extract(size_t times = 400)
ColorPrintln(GREEN, "extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC); ColorPrintln(GREEN, "extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
} }
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{
cut(); cut();
extract(); extract();
return EXIT_SUCCESS; return EXIT_SUCCESS;

View File

@ -14,16 +14,13 @@ const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8"; const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
const char * const USER_DICT_FILE = "../dict/user.dict.utf8"; const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
void cut(const ISegment& seg, const char * const filePath) void cut(const ISegment& seg, const char * const filePath) {
{
ifstream ifile(filePath); ifstream ifile(filePath);
vector<string> words; vector<string> words;
string line; string line;
string res; string res;
while(getline(ifile, line)) while(getline(ifile, line)) {
{ if(!line.empty()) {
if(!line.empty())
{
words.clear(); words.clear();
seg.cut(line, words); seg.cut(line, words);
join(words.begin(), words.end(), res, "/"); join(words.begin(), words.end(), res, "/");
@ -33,8 +30,7 @@ void cut(const ISegment& seg, const char * const filePath)
} }
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{
{ {
printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful
MPSegment seg(JIEBA_DICT_FILE); MPSegment seg(JIEBA_DICT_FILE);

View File

@ -1,8 +1,7 @@
#include "../src/PosTagger.hpp" #include "../src/PosTagger.hpp"
using namespace CppJieba; using namespace CppJieba;
int main(int argc, char ** argv) int main(int argc, char ** argv) {
{
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。"); string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。");
vector<pair<string, string> > res; vector<pair<string, string> > res;

View File

@ -5,8 +5,7 @@ using namespace CppJieba;
TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test1) {
{
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
{ {
@ -30,8 +29,7 @@ TEST(KeywordExtractorTest, Test1)
} }
} }
TEST(KeywordExtractorTest, Test2) TEST(KeywordExtractorTest, Test2) {
{
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8"); KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
{ {

View File

@ -12,8 +12,7 @@ static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容
static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]"; static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
//static const char * const ANS_TEST3 = ""; //static const char * const ANS_TEST3 = "";
TEST(PosTaggerTest, Test) TEST(PosTaggerTest, Test) {
{
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
{ {
vector<pair<string, string> > res; vector<pair<string, string> > res;
@ -23,8 +22,7 @@ TEST(PosTaggerTest, Test)
ASSERT_TRUE(s == ANS_TEST1); ASSERT_TRUE(s == ANS_TEST1);
} }
} }
TEST(PosTagger, TestUserDict) TEST(PosTagger, TestUserDict) {
{
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{ {
vector<pair<string, string> > res; vector<pair<string, string> > res;

View File

@ -9,8 +9,7 @@
using namespace CppJieba; using namespace CppJieba;
TEST(MixSegmentTest, Test1) TEST(MixSegmentTest, Test1) {
{
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");; MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
const char* str = "我来自北京邮电大学。。。学号123456用AK47"; const char* str = "我来自北京邮电大学。。。学号123456用AK47";
const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456","","","AK47"}; const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456","","","AK47"};
@ -23,8 +22,7 @@ TEST(MixSegmentTest, Test1)
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0]))); ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
} }
TEST(MixSegmentTest, NoUserDict) TEST(MixSegmentTest, NoUserDict) {
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8"); MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
const char* str = "令狐冲是云计算方面的专家"; const char* str = "令狐冲是云计算方面的专家";
vector<string> words; vector<string> words;
@ -33,8 +31,7 @@ TEST(MixSegmentTest, NoUserDict)
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words); ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words);
} }
TEST(MixSegmentTest, UserDict) TEST(MixSegmentTest, UserDict) {
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
{ {
const char* str = "令狐冲是云计算方面的专家"; const char* str = "令狐冲是云计算方面的专家";
@ -60,8 +57,7 @@ TEST(MixSegmentTest, UserDict)
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res); ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
} }
} }
TEST(MixSegmentTest, UserDict2) TEST(MixSegmentTest, UserDict2) {
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{ {
const char* str = "令狐冲是云计算方面的专家"; const char* str = "令狐冲是云计算方面的专家";
@ -88,8 +84,7 @@ TEST(MixSegmentTest, UserDict2)
} }
} }
TEST(MPSegmentTest, Test1) TEST(MPSegmentTest, Test1) {
{
MPSegment segment("../dict/jieba.dict.utf8");; MPSegment segment("../dict/jieba.dict.utf8");;
const char* str = "我来自北京邮电大学。"; const char* str = "我来自北京邮电大学。";
const char* res[] = {"", "来自", "北京邮电大学", ""}; const char* res[] = {"", "来自", "北京邮电大学", ""};
@ -106,8 +101,7 @@ TEST(MPSegmentTest, Test1)
} }
} }
TEST(MPSegmentTest, Test2) TEST(MPSegmentTest, Test2) {
{
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8"); MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");
string line; string line;
ifstream ifs("../test/testdata/review.100"); ifstream ifs("../test/testdata/review.100");
@ -121,8 +115,7 @@ TEST(MPSegmentTest, Test2)
} }
string res; string res;
while(getline(ifs, line)) while(getline(ifs, line)) {
{
res += line; res += line;
res += '\n'; res += '\n';
@ -137,8 +130,7 @@ TEST(MPSegmentTest, Test2)
ofs << res; ofs << res;
} }
TEST(HMMSegmentTest, Test1) TEST(HMMSegmentTest, Test1) {
{
HMMSegment segment("../dict/hmm_model.utf8");; HMMSegment segment("../dict/hmm_model.utf8");;
{ {
const char* str = "我来自北京邮电大学。。。学号123456"; const char* str = "我来自北京邮电大学。。。学号123456";
@ -157,8 +149,7 @@ TEST(HMMSegmentTest, Test1)
} }
} }
TEST(FullSegment, Test1) TEST(FullSegment, Test1) {
{
FullSegment segment("../dict/extra_dict/jieba.dict.small.utf8"); FullSegment segment("../dict/extra_dict/jieba.dict.small.utf8");
const char* str = "我来自北京邮电大学"; const char* str = "我来自北京邮电大学";
vector<string> words; vector<string> words;
@ -170,8 +161,7 @@ TEST(FullSegment, Test1)
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]"); ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
} }
TEST(QuerySegment, Test1) TEST(QuerySegment, Test1) {
{
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3); QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3);
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words; vector<string> words;
@ -185,8 +175,7 @@ TEST(QuerySegment, Test1)
} }
TEST(QuerySegment, Test2) TEST(QuerySegment, Test2) {
{
QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8"); QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8");
{ {

View File

@ -6,8 +6,7 @@ using namespace CppJieba;
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
TEST(DictTrieTest, NewAndDelete) TEST(DictTrieTest, NewAndDelete) {
{
DictTrie * trie; DictTrie * trie;
trie = new DictTrie(DICT_FILE); trie = new DictTrie(DICT_FILE);
delete trie; delete trie;
@ -15,8 +14,7 @@ TEST(DictTrieTest, NewAndDelete)
delete trie; delete trie;
} }
TEST(DictTrieTest, Test1) TEST(DictTrieTest, Test1) {
{
string s1, s2; string s1, s2;
DictTrie trie; DictTrie trie;
@ -38,8 +36,7 @@ TEST(DictTrieTest, Test1)
//vector<pair<size_t, const DictUnit* > resMap; //vector<pair<size_t, const DictUnit* > resMap;
LocalVector<pair<size_t, const DictUnit*> > res2; LocalVector<pair<size_t, const DictUnit*> > res2;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
{
ASSERT_TRUE(TransCode::decode(words[i], uni)); ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
@ -55,8 +52,7 @@ TEST(DictTrieTest, Test1)
ASSERT_EQ(s1, s2); ASSERT_EQ(s1, s2);
} }
TEST(DictTrieTest, UserDict) TEST(DictTrieTest, UserDict) {
{
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
string word = "云计算"; string word = "云计算";
Unicode unicode; Unicode unicode;
@ -68,8 +64,7 @@ TEST(DictTrieTest, UserDict)
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res); ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
} }
TEST(DictTrieTest, automation) TEST(DictTrieTest, automation) {
{
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
//string word = "yasherhs"; //string word = "yasherhs";
string word = "abcderf"; string word = "abcderf";