diff --git a/CMakeLists.txt b/CMakeLists.txt index 79247e2..1f4176b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,4 +22,3 @@ ADD_SUBDIRECTORY(test) ENABLE_TESTING() ADD_TEST(NAME test.run COMMAND test.run) -ADD_TEST(NAME load_test COMMAND load_test) diff --git a/ChangeLog.md b/ChangeLog.md index 4263582..e52b3b2 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,12 +2,15 @@ 1. 适配更低级版本的`g++`和`cmake`,已在`g++ 4.1.2`和`cmake 2.6`上测试通过。 2. 修改一些测试用例的文件,减少测试时编译的时间。 +3. 修复`make install`相关的问题。 +4. 增加HTTP服务的POST请求接口。 +5. 修改Trie这个类潜在的bug并完善单元测试 ## v2.3.4 1. 修改了设计上的问题,删除了`TrieManager`这个类,以避免造成一些可能的隐患。 2. 增加`stop_words.utf8`词典,并修改`KeywordExtractor`的初始化函数用以使用此词典。 -3. 优化了Trie树相关部分代码结构。 +3. 优化了`Trie`树相关部分代码结构。 ## v2.3.3 diff --git a/README.md b/README.md index 7fe6dfc..9c59fbd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ #CppJieba是"结巴"中文分词的C++版本 -功能性的代码全写成hpp文件,此处的hpp文件是将cpp和h两个文件全都写在hpp文件里面(当然需要遵守相关约束) +功能性的代码全写成hpp文件,文件依赖一直是很让人讨厌的东西,全做成hpp头文件形式的目的就是为了省去链接的依赖。 -之所以全写成hpp文件,是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。 +**没有依赖,就没有伤害。** 实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用,和包装成`Node.js`的扩展[NodeJieba]都特别顺利。 @@ -57,8 +57,37 @@ sudo /etc/init.d/cjserver stop 然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥` (用chrome的原因是chrome的默认编码就是utf-8) -或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`) +或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`) +你可以看到返回的结果如下:(返回结果是json格式) + +``` +["南京市", "长江大桥"] +``` + +如果你使用如下调用方式: + +``` +curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple" +``` + +则返回结果如下:(返回结果按空格隔开) + +``` +南京市 长江大桥 +``` + +同时,也支持HTTP POST模式,使用如下调用: + +``` +curl -d "南京市长江大桥" "http://127.0.0.1:11200/" +``` + +返回结果如下: + +``` +["南京市", "长江大桥"] +``` ### 卸载 ```sh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9cc83df..5fc6ee9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,4 @@ TARGET_LINK_LIBRARIES(cjserver pthread) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) -INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba) -ADD_SUBDIRECTORY(Husky) -ADD_SUBDIRECTORY(Limonp) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 902c652..ec97067 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -48,7 +48,7 @@ namespace CppJieba } //resut of searching in trie tree - vector > tRes; + DagType tRes; //max index of res's words int maxIdx = 0; @@ -61,9 +61,10 @@ namespace CppJieba for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { //find word start from uItr - if (_trie.find(uItr, end, tRes)) + if (_trie.find(uItr, end, tRes, 0)) { - for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { wordLen = itr->second->word.size(); if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index bd3a1df..41c5a77 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -131,15 +131,11 @@ namespace CppJieba size_t Y = STATUS_SUM; size_t X = end - begin; size_t XYSize = X * Y; - int * path; - double * weight; size_t now, old, stat; double tmp, endE, endS; - path = new int [XYSize]; - assert(path); - weight = new double [XYSize]; - assert(weight); + vector path(XYSize); + vector weight(XYSize); //start for(size_t y = 0; y < Y; y++) @@ -147,8 +143,10 @@ namespace CppJieba weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); path[0 + y * X] = -1; } - //process - //for(; begin != end; begin++) + + + double emitProb; + for(size_t x = 1; x < X; x++) { for(size_t y = 0; y < Y; y++) @@ -156,10 +154,11 @@ namespace CppJieba now = x + y*X; weight[now] = MIN_DOUBLE; path[now] = E; // warning + emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); for(size_t preY = 0; preY < Y; preY++) { old = x - 1 + preY * X; - tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); + tmp = weight[old] + _transProb[preY][y] + emitProb; if(tmp > weight[now]) { weight[now] = tmp; @@ -188,8 +187,6 @@ namespace CppJieba stat = path[x + stat*X]; } - delete [] path; - delete [] weight; return true; } bool _loadModel(const char* const filePath) diff --git a/src/Husky/CMakeLists.txt b/src/Husky/CMakeLists.txt deleted file mode 100644 index 2c86c04..0000000 --- a/src/Husky/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) -SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) - -FILE(GLOB SRCS *.hpp) -INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky) diff --git a/src/Husky/EpollServer.hpp b/src/Husky/EpollServer.hpp index 77a47ce..e300326 100644 --- a/src/Husky/EpollServer.hpp +++ b/src/Husky/EpollServer.hpp @@ -37,13 +37,14 @@ namespace Husky virtual ~IRequestHandler(){}; public: virtual bool do_GET(const HttpReqInfo& httpReq, string& res) const = 0; + virtual bool do_POST(const HttpReqInfo& httpReq, string& res) const = 0; }; class EpollServer { private: static const size_t LISTEN_QUEUE_LEN = 1024; - static const size_t RECV_BUFFER_SIZE = 1024 * 8; + static const size_t RECV_BUFFER_SIZE = 1024*4; static const int MAXEPOLLSIZE = 512; private: @@ -59,10 +60,10 @@ namespace Husky bool _setInitFlag(bool flag) {return _isInited = flag;} public: explicit EpollServer(uint port, const IRequestHandler* pHandler): _reqHandler(pHandler), _host_socket(-1), _isShutDown(false), _epollSize(0) - { - assert(_reqHandler); - _setInitFlag(_init_epoll(port)); - }; + { + assert(_reqHandler); + _setInitFlag(_init_epoll(port)); + }; ~EpollServer(){};// unfinished; public: operator bool() const @@ -75,10 +76,9 @@ namespace Husky //int clientSock; sockaddr_in clientaddr; socklen_t nSize = sizeof(clientaddr); - //char recvBuf[RECV_BUFFER_SIZE]; struct epoll_event events[MAXEPOLLSIZE]; int nfds, clientSock; - + while(!_isShutDown) { if(-1 == (nfds = epoll_wait(_epoll_fd, events, _epollSize, -1))) @@ -88,7 +88,7 @@ namespace Husky } //LogDebug("epoll_wait return event sum[%d]", nfds); - + for(int i = 0; i < nfds; i++) { if(events[i].data.fd == _host_socket) /*new connect coming.*/ @@ -106,7 +106,7 @@ namespace Husky } //LogInfo("connecting from: %d:%d, client socket: %d\n", inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port), clientSock); - + /* inet_ntoa is not thread safety at some version */ //_sockIpMap[clientSock] = inet_ntoa(clientaddr.sin_addr); @@ -119,7 +119,7 @@ namespace Husky _closesocket(events[i].data.fd); } } - + } return true; } @@ -190,26 +190,42 @@ namespace Husky } string strRec, strSnd, strRetByHandler; - strRec.resize(RECV_BUFFER_SIZE); - int nRetCode = recv(sockfd, (char*)strRec.c_str(), strRec.size(), 0); - if(-1 == nRetCode) + char recvBuf[RECV_BUFFER_SIZE]; + int nRetCode = -1; + while(true) { - LogDebug(strerror(errno)); - return false; - } - if(0 == nRetCode) - { - LogDebug("client socket closed gracefully."); - return false; + memset(recvBuf, 0, sizeof(recvBuf)); + nRetCode = recv(sockfd, recvBuf, sizeof(recvBuf) - 1, 0); + if(-1 == nRetCode) + { + LogDebug(strerror(errno)); + return false; + } + if(0 == nRetCode) + { + LogDebug("client socket orderly shut down"); + return false; + } + strRec += recvBuf; + if(nRetCode != sizeof(recvBuf) - 1) + { + break; + } } HttpReqInfo httpReq(strRec); - if(!_reqHandler->do_GET(httpReq, strRetByHandler)) + if("GET" == httpReq.getMethod() && !_reqHandler->do_GET(httpReq, strRetByHandler)) { LogError("do_GET failed."); return false; } + if("POST" == httpReq.getMethod() && !_reqHandler->do_POST(httpReq, strRetByHandler)) + { + LogError("do_POST failed."); + return false; + } string_format(strSnd, HTTP_FORMAT, CHARSET_UTF8, strRetByHandler.length(), strRetByHandler.c_str()); + if(-1 == send(sockfd, strSnd.c_str(), strSnd.length(), 0)) { LogError(strerror(errno)); diff --git a/src/Husky/HttpReqInfo.hpp b/src/Husky/HttpReqInfo.hpp index 7585a28..5358773 100644 --- a/src/Husky/HttpReqInfo.hpp +++ b/src/Husky/HttpReqInfo.hpp @@ -135,7 +135,8 @@ namespace Husky //message header end //body begin - + _body.assign(headerStr.substr(rpos)); + trim(_body); } public: string& operator[] (const string& key) @@ -150,14 +151,23 @@ namespace Husky { return _find(_methodGetMap, argKey, res); } - bool POST(const string& argKey, string& res)const + //bool POST(const string& argKey, string& res)const + //{ + // return _find(_methodPostMap, argKey, res); + //} + const string& getMethod() const { - return _find(_methodPostMap, argKey, res); + return _headerMap.find(KEY_METHOD)->second; + } + const string& getBody() const + { + return _body; } private: std::unordered_map _headerMap; std::unordered_map _methodGetMap; - std::unordered_map _methodPostMap; + //std::unordered_map _methodPostMap; + string _body; //public: friend ostream& operator<<(ostream& os, const HttpReqInfo& obj); private: @@ -215,7 +225,7 @@ namespace Husky inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj) { - return os << obj._headerMap << obj._methodGetMap << obj._methodPostMap; + return os << obj._headerMap << obj._methodGetMap/* << obj._methodPostMap*/ << obj._body; } } diff --git a/src/Limonp/CMakeLists.txt b/src/Limonp/CMakeLists.txt deleted file mode 100644 index 8d57ecb..0000000 --- a/src/Limonp/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -INSTALL(FILES ArgvContext.hpp InitOnOff.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp - cast_functs.hpp io_functs.hpp map_functs.hpp NonCopyable.hpp - Config.hpp logger.hpp md5.hpp std_outbound.hpp - DESTINATION include/CppJieba/Limonp) diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index a70e8b2..8a648a9 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -24,7 +24,7 @@ namespace CppJieba const TrieNodeInfo * pInfo; double weight; - SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) + SegmentChar():uniCh(0), pInfo(NULL), weight(0.0) {} }; typedef vector SegmentContext; @@ -59,18 +59,22 @@ namespace CppJieba virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { assert(_getInitFlag()); - - vector segWordInfos; - if(!cut(begin, end, segWordInfos)) + if(begin == end) { return false; } - string tmp; - for(size_t i = 0; i < segWordInfos.size(); i++) + + vector words; + if(!cut(begin, end, words)) { - if(TransCode::encode(segWordInfos[i].word, tmp)) + return false; + } + string word; + for(size_t i = 0; i < words.size(); i++) + { + if(TransCode::encode(words[i], word)) { - res.push_back(tmp); + res.push_back(word); } else { @@ -80,7 +84,7 @@ namespace CppJieba return true; } - bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const + bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const { if(!_getInitFlag()) { @@ -88,7 +92,6 @@ namespace CppJieba return false; } SegmentContext segContext; - //calc DAG if(!_calcDAG(begin, end, segContext)) { @@ -102,7 +105,7 @@ namespace CppJieba return false; } - if(!_cut(segContext, segWordInfos)) + if(!_cut(segContext, res)) { LogError("_cut failed."); return false; @@ -114,21 +117,17 @@ namespace CppJieba private: bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, SegmentContext& segContext) const { - if(begin >= end) - { - LogError("begin >= end."); - return false; - } - + SegmentChar schar; + size_t offset; for(Unicode::const_iterator it = begin; it != end; it++) { - SegmentChar schar(*it); - size_t i = it - begin; - _trie.find(it, end, i, schar.dag); - //DagType::iterator dagIter; - if(schar.dag.end() == schar.dag.find(i)) + schar.uniCh = *it; + offset = it - begin; + schar.dag.clear(); + _trie.find(it, end, schar.dag, offset); + if(!isIn(schar.dag, offset)) { - schar.dag[i] = NULL; + schar.dag[offset] = NULL; } segContext.push_back(schar); } @@ -142,15 +141,19 @@ namespace CppJieba return false; } + size_t nextPos; + const TrieNodeInfo* p; + double val; + for(int i = segContext.size() - 1; i >= 0; i--) { segContext[i].pInfo = NULL; segContext[i].weight = MIN_DOUBLE; for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) { - size_t nextPos = it->first; - const TrieNodeInfo* p = it->second; - double val = 0.0; + nextPos = it->first; + p = it->second; + val = 0.0; if(nextPos + 1 < segContext.size()) { val += segContext[nextPos + 1].weight; @@ -174,7 +177,7 @@ namespace CppJieba return true; } - bool _cut(SegmentContext& segContext, vector& res)const + bool _cut(SegmentContext& segContext, vector& res)const { size_t i = 0; while(i < segContext.size()) @@ -182,16 +185,12 @@ namespace CppJieba const TrieNodeInfo* p = segContext[i].pInfo; if(p) { - res.push_back(*p); + res.push_back(p->word); i += p->word.size(); } else//single chinese word { - TrieNodeInfo nodeInfo; - nodeInfo.word.push_back(segContext[i].uniCh); - nodeInfo.freq = 0; - nodeInfo.logFreq = _trie.getMinLogFreq(); - res.push_back(nodeInfo); + res.push_back(Unicode(1, segContext[i].uniCh)); i++; } } diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index b5a205f..36fcb08 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -44,14 +44,8 @@ namespace CppJieba virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - if(begin >= end) - { - LogError("begin >= end"); - return false; - } - - vector infos; - if(!_mpSeg.cut(begin, end, infos)) + vector words; + if(!_mpSeg.cut(begin, end, words)) { LogError("mpSeg cutDAG failed."); return false; @@ -59,20 +53,20 @@ namespace CppJieba vector hmmRes; Unicode piece; - for (size_t i = 0, j = 0; i < infos.size(); i++) + for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result - if (1 != infos[i].word.size()) + if (1 != words[i].size()) { - res.push_back(infos[i].word); + res.push_back(words[i]); continue; } // if mp get a single one, collect it in sequence j = i; - while (j < infos.size() && infos[j].word.size() == 1) + while (j < words.size() && words[j].size() == 1) { - piece.push_back(infos[j].word[0]); + piece.push_back(words[j][0]); j++; } diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index e67ee04..3d82e69 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -22,12 +22,8 @@ namespace CppJieba virtual bool cut(const string& str, vector& res)const { assert(_getInitFlag()); - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} Unicode unico; + res.clear(); #ifdef NO_FILTER if(!TransCode::decode(str, unico)) { diff --git a/src/TransCode.hpp b/src/TransCode.hpp index e3bb891..aa05341 100644 --- a/src/TransCode.hpp +++ b/src/TransCode.hpp @@ -15,7 +15,7 @@ namespace CppJieba typedef std::vector Unicode; namespace TransCode { - inline bool decode(const string& str, vector& vec) + inline bool decode(const string& str, Unicode& vec) { #ifdef CPPJIEBA_GBK return gbkTrans(str, vec); @@ -24,7 +24,7 @@ namespace CppJieba #endif } - inline bool encode(vector::const_iterator begin, vector::const_iterator end, string& res) + inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) { #ifdef CPPJIEBA_GBK return gbkTrans(begin, end, res); @@ -33,7 +33,7 @@ namespace CppJieba #endif } - inline bool encode(const vector& uni, string& res) + inline bool encode(const Unicode& uni, string& res) { return encode(uni.begin(), uni.end(), res); } diff --git a/src/Trie.hpp b/src/Trie.hpp index 925f5e4..166cd5b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -26,16 +26,13 @@ namespace CppJieba const double MAX_DOUBLE = 3.14e+100; const size_t DICT_COLUMN_NUM = 3; typedef map TrieNodeMap; + struct TrieNodeInfo; struct TrieNode { TrieNodeMap hmap; - bool isLeaf; - size_t nodeInfoPos; - TrieNode() - { - isLeaf = false; - nodeInfoPos = 0; - } + const TrieNodeInfo * ptTrieNodeInfo; + TrieNode(): ptTrieNodeInfo(NULL) + {} }; struct TrieNodeInfo @@ -44,12 +41,6 @@ namespace CppJieba size_t freq; string tag; double logFreq; //logFreq = log(freq/sum(freq)); - TrieNodeInfo():freq(0),logFreq(0.0) - {} - TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) - {} - TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) - {} }; inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo) @@ -72,33 +63,32 @@ namespace CppJieba public: Trie() { - _root = NULL; + _root = new TrieNode; _freqSum = 0; _minLogFreq = MAX_DOUBLE; _setInitFlag(false); } Trie(const string& filePath) { - Trie(); + new (this) Trie(); _setInitFlag(init(filePath)); } ~Trie() { _deleteNode(_root); } + private: + + public: bool init(const string& filePath) { assert(!_getInitFlag()); - - _root = new TrieNode; - assert(_root); - if(!_trieInsert(filePath)) - { - LogError("_trieInsert failed."); - return false; - } - _countWeight(); + _loadDict(filePath, _nodeInfos); + _createTrie(_nodeInfos, _root); + _freqSum = _calculateFreqSum(_nodeInfos); + assert(_freqSum); + _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); return _setInitFlag(true); } @@ -106,47 +96,22 @@ namespace CppJieba const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const { TrieNodeMap::const_iterator citer; - TrieNode* p = _root; + const TrieNode* p = _root; for(Unicode::const_iterator it = begin; it != end; it++) { - uint16_t chUni = *it; - citer = p->hmap.find(chUni); - if(p-> hmap.end() == citer) + citer = p->hmap.find(*it); + if(p->hmap.end() == citer) { return NULL; } p = citer->second; } - if(p->isLeaf) - { - return &(_nodeInfos[p->nodeInfoPos]); - } - return NULL; + return p->ptTrieNodeInfo; } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector >& res) const + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const { - TrieNodeMap::const_iterator citer; - TrieNode* p = _root; - for (Unicode::const_iterator itr = begin; itr != end; itr++) - { - citer = p->hmap.find(*itr); - if(p->hmap.end() == citer) - { - break; - } - p = citer->second; - if(p->isLeaf) - { - res.push_back(make_pair(itr-begin, &_nodeInfos[p->nodeInfoPos])); - } - } - return !res.empty(); - } - - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const - { - TrieNode* p = _root; + const TrieNode* p = _root; TrieNodeMap::const_iterator citer; for (Unicode::const_iterator itr = begin; itr != end; itr++) { @@ -156,9 +121,9 @@ namespace CppJieba break; } p = citer->second; - if(p->isLeaf) + if(p->ptTrieNodeInfo) { - res[itr - begin + offset] = &_nodeInfos[p->nodeInfoPos]; + res[itr - begin + offset] = p->ptTrieNodeInfo; } } return !res.empty(); @@ -168,43 +133,44 @@ namespace CppJieba double getMinLogFreq() const {return _minLogFreq;}; private: - void _insert(const TrieNodeInfo& nodeInfo, size_t nodeInfoPos) + void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const { const Unicode& unico = nodeInfo.word; - TrieNode* p = _root; + TrieNodeMap::const_iterator citer; for(size_t i = 0; i < unico.size(); i++) { uint16_t cu = unico[i]; - assert(p); - if(!isIn(p->hmap, cu)) + assert(ptNode); + citer = ptNode->hmap.find(cu); + if(ptNode->hmap.end() == citer) { TrieNode * next = new TrieNode; - assert(next); - p->hmap[cu] = next; - p = next; + ptNode->hmap[cu] = next; + ptNode = next; } else { - p = p->hmap[cu]; + ptNode = citer->second; } + } - p->isLeaf = true; - p->nodeInfoPos = nodeInfoPos; + ptNode->ptTrieNodeInfo = &nodeInfo; } private: - bool _trieInsert(const string& filePath) + void _loadDict(const string& filePath, vector& nodeInfos) const { ifstream ifs(filePath.c_str()); if(!ifs) { - LogError("open %s failed.", filePath.c_str()); - return false; + LogFatal("open %s failed.", filePath.c_str()); + exit(1); } string line; vector buf; + nodeInfos.clear(); TrieNodeInfo nodeInfo; for(size_t lineno = 0 ; getline(ifs, line); lineno++) { @@ -213,43 +179,46 @@ namespace CppJieba if(!TransCode::decode(buf[0], nodeInfo.word)) { LogError("line[%u:%s] illegal.", lineno, line.c_str()); - return false; + continue; } nodeInfo.freq = atoi(buf[1].c_str()); nodeInfo.tag = buf[2]; - _nodeInfos.push_back(nodeInfo); - + nodeInfos.push_back(nodeInfo); } + } + bool _createTrie(const vector& nodeInfos, TrieNode * ptNode) + { for(size_t i = 0; i < _nodeInfos.size(); i++) { - _insert(_nodeInfos[i], i); + _insertNode(_nodeInfos[i], ptNode); } return true; } - void _countWeight() + size_t _calculateFreqSum(const vector& nodeInfos) const { - //freq total freq - _freqSum = 0; - for(size_t i = 0; i < _nodeInfos.size(); i++) + size_t freqSum = 0; + for(size_t i = 0; i < nodeInfos.size(); i++) { - _freqSum += _nodeInfos[i].freq; + freqSum += nodeInfos[i].freq; } - - assert(_freqSum); - - //normalize - for(size_t i = 0; i < _nodeInfos.size(); i++) + return freqSum; + } + double _calculateLogFreqAndGetMinValue(vector& nodeInfos, size_t freqSum) const + { + assert(freqSum); + double minLogFreq = MAX_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) { - TrieNodeInfo& nodeInfo = _nodeInfos[i]; + TrieNodeInfo& nodeInfo = nodeInfos[i]; assert(nodeInfo.freq); - nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); - if(_minLogFreq > nodeInfo.logFreq) + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum)); + if(minLogFreq > nodeInfo.logFreq) { - _minLogFreq = nodeInfo.logFreq; + minLogFreq = nodeInfo.logFreq; } } - + return minLogFreq; } void _deleteNode(TrieNode* node) diff --git a/src/server.cpp b/src/server.cpp index 551a215..441b85c 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -26,6 +26,18 @@ class ReqHandler: public IRequestHandler httpReq.GET("key", tmp); URLDecode(tmp, sentence); _segment.cut(sentence, words); + if(httpReq.GET("format", tmp) && tmp == "simple") + { + join(words.begin(), words.end(), strSnd, " "); + return true; + } + strSnd << words; + return true; + } + virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const + { + vector words; + _segment.cut(httpReq.getBody(), words); strSnd << words; return true; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5a8aaed..4abe00d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,6 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}) ADD_EXECUTABLE(segment.demo segment.cpp) ADD_EXECUTABLE(keyword.demo keyword_demo.cpp) -ADD_EXECUTABLE(tagging_demo tagging_demo.cpp) +ADD_EXECUTABLE(tagging.demo tagging_demo.cpp) ADD_EXECUTABLE(load_test load_test.cpp) ADD_SUBDIRECTORY(unittest) diff --git a/test/load_test.cpp b/test/load_test.cpp index 1b3a618..241c68c 100644 --- a/test/load_test.cpp +++ b/test/load_test.cpp @@ -9,7 +9,7 @@ using namespace CppJieba; -void cut(const ISegment * seg, const char * const filePath, size_t times = 10) +void cut(const ISegment * seg, const char * const filePath, size_t times = 30) { ifstream ifile(filePath); if(!ifile) @@ -23,10 +23,12 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10) loadFile2Str(filePath, doc); for(uint i = 0; i < times; i ++) { - LogInfo("times[%u]", i); + printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); + fflush(stdout); res.clear(); seg->cut(doc, res); } + printf("\n"); } int main(int argc, char ** argv) @@ -40,6 +42,6 @@ int main(int argc, char ** argv) long beginTime = clock(); cut(&seg, "../test/testdata/weicheng.utf8"); long endTime = clock(); - printf("[%.3lf seconds]time consumeed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC); + printf("[%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC); return EXIT_SUCCESS; } diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index e5e5a81..bfe8ea7 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -80,7 +80,6 @@ TEST(MPSegmentTest, Test2) res += line; res += '\n'; - words.clear(); segment.cut(line, words); string s; s << words; diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 67dcf7a..1ccbb98 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -5,8 +5,18 @@ using namespace CppJieba; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; +TEST(TrieTest, NewAndDelete) +{ + Trie * trie; + trie = new Trie(DICT_FILE); + delete trie; + trie = new Trie(); + delete trie; +} + TEST(TrieTest, Test1) { + string s1, s2; Trie trie; ASSERT_TRUE(trie.init(DICT_FILE)); @@ -26,7 +36,7 @@ TEST(TrieTest, Test1) word = "清华大学"; vector > res; map resMap; - map map; + map mp; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { @@ -40,10 +50,8 @@ TEST(TrieTest, Test1) vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); //print(uni); - ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec)); - ASSERT_EQ(vec, res); - ASSERT_TRUE(trie.find(uni.begin(), uni.end(), 0, map)); - ASSERT_EQ(map, resMap); + ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); + ASSERT_EQ(mp, resMap); // print(vec); }