From 313e05da1b337c3751d9a62d24f0c941af5ae88f Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 7 Dec 2013 20:25:28 -0800 Subject: [PATCH 1/5] ci for lunch --- src/MPSegment.hpp | 23 +++++++++---------- src/Trie.hpp | 56 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 288353f..cc60dc0 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -158,21 +158,22 @@ namespace CppJieba return false; } - vector > vp; + //vector > vp; for(Unicode::const_iterator it = begin; it != end; it++) { segContext.push_back(SegmentChar(*it)); SegmentChar& back = segContext.back(); - int i = it - begin; - vp.clear(); - if(_trie.find(it, end, vp)) - { - for(uint j = 0; j < vp.size(); j++) - { - uint nextp = vp[j].first + i; - back.dag[nextp] = vp[j].second; - } - } + uint i = it - begin; + _trie.find(it, end, i, back.dag); + //vp.clear(); + //if(_trie.find(it, end, vp)) + //{ + // for(uint j = 0; j < vp.size(); j++) + // { + // uint nextp = vp[j].first + i; + // back.dag[nextp] = vp[j].second; + // } + //} if(back.dag.end() == back.dag.find(i)) { back.dag[i] = NULL; diff --git a/src/Trie.hpp b/src/Trie.hpp index c39277a..e7956f7 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -157,15 +157,15 @@ namespace CppJieba bool _getInitFlag()const{return _initFlag;}; public: - const TrieNodeInfo* find(const string& str)const - { - Unicode uintVec; - if(!TransCode::decode(str, uintVec)) - { - return NULL; - } - return find(uintVec.begin(), uintVec.end()); - } + //const TrieNodeInfo* find(const string& str)const + //{ + // Unicode uintVec; + // if(!TransCode::decode(str, uintVec)) + // { + // return NULL; + // } + // return find(uintVec.begin(), uintVec.end()); + //} const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const { @@ -244,6 +244,44 @@ namespace CppJieba return !res.empty(); } + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, uint offset, unordered_map & res) const + { + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return false; + } + if (begin >= end) + { + LogFatal("begin >= end"); + return false; + } + TrieNode* p = _root; + for (Unicode::const_iterator itr = begin; itr != end; itr++) + { + if(p->hmap.find(*itr) == p-> hmap.end()) + { + break; + } + p = p->hmap[*itr]; + if(p->isLeaf) + { + uint pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + //res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos])); + res[itr-begin + offset] = &_nodeInfoVec[pos]; + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return false; + } + } + } + return !res.empty(); + } + //bool find(const Unicode& unico, vector >& res)const //{ // if (!unico.empty()) From acb4150e3cdbcaa01c5502fc2a156739532e0ab3 Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 8 Dec 2013 03:29:28 -0800 Subject: [PATCH 2/5] remove some unused code --- src/ChineseFilter.hpp | 93 ------------------------------------------- src/MPSegment.hpp | 41 +++---------------- src/Trie.hpp | 18 --------- 3 files changed, 6 insertions(+), 146 deletions(-) diff --git a/src/ChineseFilter.hpp b/src/ChineseFilter.hpp index 6a5dd66..1b359a8 100644 --- a/src/ChineseFilter.hpp +++ b/src/ChineseFilter.hpp @@ -6,99 +6,6 @@ namespace CppJieba { - //enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1}; - //typedef Unicode::const_iterator UniConIter; - //class ChineseFilter; - //class ChFilterIterator - //{ - // public: - // const Unicode * ptUnico; - // UniConIter begin; - // UniConIter end; - // CHAR_TYPE charType; - // ChFilterIterator& operator++() - // { - // return *this = _get(end); - // } - // ChFilterIterator operator++(int) - // { - // ChFilterIterator res = *this; - // *this = _get(end); - // return res; - // } - // bool operator==(const ChFilterIterator& iter) - // { - // return begin == iter.begin && end == iter.end; - // } - // bool operator!=(const ChFilterIterator& iter) - // { - // return !(*this == iter); - // } - // ChFilterIterator& operator=(const ChFilterIterator& iter) - // { - // ptUnico = iter.ptUnico; - // begin = iter.begin; - // end = iter.end; - // charType = iter.charType; - // return *this; - // } - // - // public: - // ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){}; - // ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());}; - // private: - // ChFilterIterator(){} - // private: - // CHAR_TYPE _charType(uint16_t x)const - // { - // if(x < 0x0080) - // { - // return DIGIT_OR_LETTER; - // } - // return CHWORD; - // } - // ChFilterIterator _get(UniConIter iter) - // { - // UniConIter _begin = iter; - // const UniConIter& _end = ptUnico->end(); - // if(iter == _end) - // { - // return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER); - // } - // CHAR_TYPE charType = _charType(*iter); - // iter ++; - // while(iter != _end &&charType == _charType(*iter)) - // { - // iter++; - // } - // return ChFilterIterator(ptUnico, _begin, iter, charType); - // } - - //}; - //class ChineseFilter - //{ - // private: - // Unicode _unico; - // public: - // typedef ChFilterIterator iterator; - // public: - // ChineseFilter(){}; - // ~ChineseFilter(){}; - // public: - // bool feed(const string& str) - // { - // return TransCode::decode(str, _unico); - // } - // iterator begin() - // { - // return iterator(&_unico); - // } - // iterator end() - // { - // return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER); - // } - //}; - /* * if char is ascii, count the ascii string's length and return 0; * else count the nonascii string's length and return 1; diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index cc60dc0..09d10b5 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -97,27 +97,7 @@ namespace CppJieba } return true; } - bool cut(const string& str, vector& segWordInfos)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(str.empty()) - { - return false; - } - Unicode sentence; - if(!TransCode::decode(str, sentence)) - { - LogError("TransCode::decode failed."); - return false; - } - return cut(sentence.begin(), sentence.end(), segWordInfos); - - } bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const { if(!_getInitFlag()) @@ -158,26 +138,17 @@ namespace CppJieba return false; } - //vector > vp; for(Unicode::const_iterator it = begin; it != end; it++) { - segContext.push_back(SegmentChar(*it)); - SegmentChar& back = segContext.back(); + SegmentChar schar(*it); uint i = it - begin; - _trie.find(it, end, i, back.dag); - //vp.clear(); - //if(_trie.find(it, end, vp)) - //{ - // for(uint j = 0; j < vp.size(); j++) - // { - // uint nextp = vp[j].first + i; - // back.dag[nextp] = vp[j].second; - // } - //} - if(back.dag.end() == back.dag.find(i)) + _trie.find(it, end, i, schar.dag); + //DagType::iterator dagIter; + if(schar.dag.end() == schar.dag.find(i)) { - back.dag[i] = NULL; + schar.dag[i] = NULL; } + segContext.push_back(schar); } return true; } diff --git a/src/Trie.hpp b/src/Trie.hpp index e7956f7..9e3d58b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -157,15 +157,6 @@ namespace CppJieba bool _getInitFlag()const{return _initFlag;}; public: - //const TrieNodeInfo* find(const string& str)const - //{ - // Unicode uintVec; - // if(!TransCode::decode(str, uintVec)) - // { - // return NULL; - // } - // return find(uintVec.begin(), uintVec.end()); - //} const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const { @@ -282,15 +273,6 @@ namespace CppJieba return !res.empty(); } - //bool find(const Unicode& unico, vector >& res)const - //{ - // if (!unico.empty()) - // { - // return find(unico.begin(), unico.end(), res); - // } - // return false; - //} - public: double getMinLogFreq()const{return _minLogFreq;}; From 1e29d25855491b4b145f82daaf971ebaecb9856c Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 11 Dec 2013 04:52:33 -0800 Subject: [PATCH 3/5] use assert for getinitflag --- src/FullSegment.hpp | 16 +++++++++------- src/HMMSegment.hpp | 12 +++++++----- src/MPSegment.hpp | 13 ++++++++----- src/MixSegment.hpp | 12 +++++++----- src/QuerySegment.hpp | 16 +++++++++------- 5 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 91c5fdc..d67f2f3 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "Limonp/logger.hpp" #include "Trie.hpp" #include "ISegment.hpp" @@ -63,15 +64,16 @@ namespace CppJieba public: bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_getInitFlag()); #ifndef NO_CODING_LOG - if (!_getInitFlag()) + //if (!_getInitFlag()) + //{ + // LogError("not inited."); + // return false; + //} + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } #endif diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index c9839a4..6e2b7b7 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "Limonp/str_functs.hpp" #include "Limonp/logger.hpp" #include "TransCode.hpp" @@ -93,11 +94,12 @@ namespace CppJieba public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } + //if(!_getInitFlag()) + //{ + // LogError("not inited."); + // return false; + //} + assert(_getInitFlag()); if(begin == end) { return false; diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 09d10b5..14b8f21 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -7,6 +7,7 @@ #include #include +#include #include "Limonp/logger.hpp" #include "Trie.hpp" #include "ISegment.hpp" @@ -73,11 +74,13 @@ namespace CppJieba public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } + //if(!_getInitFlag()) + //{ + // LogError("not inited."); + // return false; + //} + assert(_getInitFlag()); + vector segWordInfos; if(!cut(begin, end, segWordInfos)) { diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 8914256..5f35031 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -1,6 +1,7 @@ #ifndef CPPJIEBA_MIXSEGMENT_H #define CPPJIEBA_MIXSEGMENT_H +#include #include "MPSegment.hpp" #include "HMMSegment.hpp" #include "Limonp/str_functs.hpp" @@ -56,11 +57,12 @@ namespace CppJieba public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } + //if(!_getInitFlag()) + //{ + // LogError("not inited."); + // return false; + //} + assert(_getInitFlag()); if(begin == end) { return false; diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 58cf1a2..4487339 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "Limonp/logger.hpp" #include "Trie.hpp" #include "ISegment.hpp" @@ -64,15 +65,16 @@ namespace CppJieba public: bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_getInitFlag()); #ifndef NO_CODING_LOG - if (!_getInitFlag()) + //if (!_getInitFlag()) + //{ + // LogError("not inited."); + // return false; + //} + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } #endif From f3e0df12f72d39594302b1a45847fb8980a03508 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 12 Dec 2013 23:21:27 -0800 Subject: [PATCH 4/5] modify test --- README.md | 4 ++-- test/Makefile | 2 +- test/segment.cpp | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) mode change 100755 => 100644 test/Makefile diff --git a/README.md b/README.md index 3db03fd..682c7a8 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ make sudo make install ``` -#### 验证 +#### 测试 ```sh /usr/bin/cjseg.sh ../test/testlines.utf8 @@ -41,7 +41,7 @@ sudo /etc/init.d/cjserver start sudo /etc/init.d/cjserver stop ``` -#### 验证服务 +#### 测试服务 然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥` (用chrome的原因是chrome的默认编码就是utf-8) diff --git a/test/Makefile b/test/Makefile old mode 100755 new mode 100644 index be026b6..4ca9c91 --- a/test/Makefile +++ b/test/Makefile @@ -2,4 +2,4 @@ all: testp demo testp: g++ -o test_performance test_performance.cpp -std=c++0x -O3 demo: - g++ -o segment.demo segment.cpp -std=c++0x -L/usr/lib/CppJieba -lcppjieba + g++ -o segment.demo segment.cpp -std=c++0x diff --git a/test/segment.cpp b/test/segment.cpp index e9c4a1b..f3d77d7 100644 --- a/test/segment.cpp +++ b/test/segment.cpp @@ -1,9 +1,9 @@ #include #include -#include -#include -#include -#include +#include "../src/Limonp/ArgvContext.hpp" +#include "../src/MPSegment.hpp" +#include "../src/HMMSegment.hpp" +#include "../src/MixSegment.hpp" using namespace CppJieba; @@ -27,8 +27,8 @@ int main(int argc, char ** argv) { //demo { - HMMSegment seg; - if(!seg.init("../dicts/hmm_model.utf8")) + HMMSegment seg("../dicts/hmm_model.utf8"); + if(!seg.init()) { cout<<"seg init failed."< Date: Thu, 12 Dec 2013 23:24:04 -0800 Subject: [PATCH 5/5] modify test --- test/segment.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/segment.cpp b/test/segment.cpp index f3d77d7..b2a501f 100644 --- a/test/segment.cpp +++ b/test/segment.cpp @@ -1,6 +1,5 @@ #include #include -#include "../src/Limonp/ArgvContext.hpp" #include "../src/MPSegment.hpp" #include "../src/HMMSegment.hpp" #include "../src/MixSegment.hpp"