From 6c3028a2c9a994b4ed5dcc99d5947853779911f6 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Sat, 6 Jul 2013 12:37:48 +0800 Subject: [PATCH] modify find function --- Segment.cpp | 2 +- Trie.cpp | 77 ++++++++++++++++++++++++++++++++++------------------- Trie.h | 19 ++++++++----- 3 files changed, 63 insertions(+), 35 deletions(-) diff --git a/Segment.cpp b/Segment.cpp index f795da1..56938e9 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -98,7 +98,7 @@ namespace CppJieba for(int j = 0; j <= i; j++) { size_t uniLen = i - j + 1; - if(_trie.find(uniStr + j, uniLen)) + if(NULL != _trie.find(uniStr + j, uniLen)) { memset(utfBuf, 0 ,sizeof(utfBuf)); size_t ret = unicodeToUtf8(uniStr + j, uniLen, utfBuf); diff --git a/Trie.cpp b/Trie.cpp index 2989dac..17d0c6b 100644 --- a/Trie.cpp +++ b/Trie.cpp @@ -4,16 +4,17 @@ namespace CppJieba { Trie::iterator Trie::begin() { - return Trie::iterator(_root); + return _nodeInfoVec.begin(); } Trie::iterator Trie::end() { - return Trie::iterator(NULL); + return _nodeInfoVec.end(); } - Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0) + Trie::Trie():_root(NULL), _totalCount(0) { + _minWeight = numeric_limits::max(); } Trie::~Trie() @@ -103,6 +104,38 @@ namespace CppJieba _display(_root, 0); } + const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len) + { + TrieNode* p = _root; + for(size_t i = 0; i < len; i++) + { + ChUnicode chUni = chUniStr[i]; + if(p->hmap.find(chUni) == p-> hmap.end()) + { + return NULL; + } + else + { + p = p->hmap[chUni]; + } + } + if(p->isLeaf) + { + unsigned int pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + return &(_nodeInfoVec[pos]); + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return NULL; + } + } + return NULL; + } + + /* bool Trie::find(const ChUnicode* chUniStr, size_t len) { int res = -1; @@ -121,6 +154,7 @@ namespace CppJieba } return p->isLeaf; } + */ /* bool Trie::find(const vector& uniVec) @@ -168,6 +202,12 @@ namespace CppJieba return res; } + double getWeight(const ChUnicode* uniStr, size_t len) + { + + } + + /* bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res) { res.clear(); @@ -187,28 +227,8 @@ namespace CppJieba } return true; } + */ - bool Trie::cutUtf8(const string& str, vector< vector >& res) - { - ChUnicode buf[ChUniMaxLen]; - size_t len = utf8ToUnicode(str.c_str(), str.size(), buf); - if(0 == len) - { - return false; - } - return cut(buf, len, res); - /* - PRINT_MATRIX(res); - char buf[1024]; - FOR_VECTOR(res, i) - { - FOR_VECTOR(res[i], j) - { - unicodeToUtf8(chUniStr + i, res[i][j] - i + 1, buf); - cout< #include #include +#include #include "cppcommon/str_functs.h" #include "cppcommon/vec_functs.h" #include "cppcommon/logger.h" @@ -45,6 +46,7 @@ namespace CppJieba } }; + /* struct TrieNodeIterator { TrieNode* ptNode; @@ -88,16 +90,19 @@ namespace CppJieba return ptNode != x.ptNode; } }; + */ class Trie { private: TrieNode* _root; vector _nodeInfoVec; - int64_t _totalWeight; + + int64_t _totalCount; + double _minWeight; public: - typedef TrieNodeIterator iterator; + typedef vector::iterator iterator; public: iterator begin(); @@ -111,22 +116,24 @@ namespace CppJieba void display(); public: - bool find(const ChUnicode* chUniStr, size_t len); + const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len); + //bool find(const ChUnicode* chUniStr, size_t len); //bool find(const vector& uniVec); int findMaxMatch(const ChUnicode* chUniStr, size_t len); public: - bool cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res); + double getWeight(const ChUnicode* uniStr, size_t len); + //bool cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res); //bool cutUni(const vector& uniVec, ) - bool cutUtf8(const string& str, vector< vector >& res); + //bool cutUtf8(const string& str, vector< vector >& res); //bool cutMa private: bool _buildTree(const char* const filePath); + bool _countWeight(); bool _destroyNode(TrieNode* node); void _display(TrieNode* node, int level); bool _insert(const TrieNodeInfo& nodeInfo); - bool _countWeight(); private: enum {bufSize = 1024};