This commit is contained in:
xuangong 2015-07-20 23:54:20 +08:00
parent d1a112c0c4
commit 931db7d1e5
5 changed files with 656 additions and 807 deletions

View File

@ -16,191 +16,164 @@
namespace CppJieba
{
using namespace Limonp;
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "";
namespace CppJieba {
using namespace Limonp;
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "";
class DictTrie
{
public:
class DictTrie {
public:
DictTrie()
{
_trie = NULL;
_minWeight = MAX_DOUBLE;
}
DictTrie(const string& dictPath, const string& userDictPath = "")
{
new (this) DictTrie();
init(dictPath, userDictPath);
}
~DictTrie()
{
if(_trie)
{
delete _trie;
}
}
bool init(const string& dictPath, const string& userDictPath = "")
{
assert(!_trie);
_loadDict(dictPath);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size())
{
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
}
_shrink(_nodeInfos);
_trie = _createTrie(_nodeInfos);
assert(_trie);
return true;
}
DictTrie() {
_trie = NULL;
_minWeight = MAX_DOUBLE;
}
DictTrie(const string& dictPath, const string& userDictPath = "") {
new (this) DictTrie();
init(dictPath, userDictPath);
}
~DictTrie() {
    // delete on a null pointer is a no-op, so no explicit check is needed.
    delete _trie;
}
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
return _trie->find(begin, end);
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const
{
return _trie->find(begin, end, dag, offset);
}
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<SegmentChar>& res
) const
{
_trie->find(begin, end, res);
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
return isIn(_userDictSingleChineseWord, word);
}
double getMinWeight() const {return _minWeight;};
// Builds the dictionary trie. Must be called exactly once per instance
// (asserts no trie exists yet). Pipeline order matters: load raw entries,
// convert frequencies to log-probability weights, record the minimum
// weight, then (optionally) merge the user dictionary — whose entries get
// the maximum observed weight so they take priority — before the trie is
// constructed over the final, shrunk node list.
bool init(const string& dictPath, const string& userDictPath = "") {
assert(!_trie);
_loadDict(dictPath);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size()) {
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
}
// Release excess capacity before taking stable pointers into _nodeInfos:
// _createTrie stores addresses of these elements, so the vector must not
// reallocate afterwards.
_shrink(_nodeInfos);
_trie = _createTrie(_nodeInfos);
assert(_trie);
return true;
}
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
return _trie->find(begin, end);
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
return _trie->find(begin, end, dag, offset);
}
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<SegmentChar>& res
) const {
_trie->find(begin, end, res);
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
return isIn(_userDictSingleChineseWord, word);
}
// Smallest log-probability weight seen in the dictionary (set by init()).
double getMinWeight() const {
    return _minWeight;
}
private:
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits)
{
assert(dictUnits.size());
vector<Unicode> words;
vector<const DictUnit*> valuePointers;
for(size_t i = 0 ; i < dictUnits.size(); i ++)
{
words.push_back(dictUnits[i].word);
valuePointers.push_back(&dictUnits[i]);
}
private:
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) {
assert(dictUnits.size());
vector<Unicode> words;
vector<const DictUnit*> valuePointers;
for(size_t i = 0 ; i < dictUnits.size(); i ++) {
words.push_back(dictUnits[i].word);
valuePointers.push_back(&dictUnits[i]);
}
UglyTrie * trie = new UglyTrie(words, valuePointers);
return trie;
}
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
{
ifstream ifs(filePath.c_str());
assert(ifs.is_open());
string line;
DictUnit nodeInfo;
vector<string> buf;
size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++)
{
buf.clear();
split(line, buf, " ");
assert(buf.size() >= 1);
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
if(nodeInfo.word.size() == 1)
{
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
}
nodeInfo.weight = defaultWeight;
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
_nodeInfos.push_back(nodeInfo);
}
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
}
void _loadDict(const string& filePath)
{
ifstream ifs(filePath.c_str());
assert(ifs.is_open());
string line;
vector<string> buf;
UglyTrie * trie = new UglyTrie(words, valuePointers);
return trie;
}
// Loads a user-defined dictionary. Each line is "word [tag]"; every entry
// receives `defaultWeight` (callers pass the maximum weight of the main
// dictionary so user words always win) and `defaultTag` when the optional
// tag column is absent.
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
ifstream ifs(filePath.c_str());
assert(ifs.is_open());
string line;
DictUnit nodeInfo;
vector<string> buf;
size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++) {
buf.clear();
split(line, buf, " ");
assert(buf.size() >= 1);
if(!TransCode::decode(buf[0], nodeInfo.word)) {
// Cast to unsigned: passing a size_t through "..." for a %u conversion
// is undefined behavior on platforms where size_t is wider than unsigned.
LogError("line[%u:%s] illegal.", (unsigned)lineno, line.c_str());
continue;
}
if(nodeInfo.word.size() == 1) {
// Remember single-code-point user words; queried later via
// isUserDictSingleChineseWord().
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
}
nodeInfo.weight = defaultWeight;
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
_nodeInfos.push_back(nodeInfo);
}
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), (unsigned)lineno);
}
void _loadDict(const string& filePath) {
ifstream ifs(filePath.c_str());
assert(ifs.is_open());
string line;
vector<string> buf;
DictUnit nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{
split(line, buf, " ");
assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo);
}
}
double _findMinWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
DictUnit nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++) {
split(line, buf, " ");
assert(buf.size() == DICT_COLUMN_NUM);
void _calculateWeight(vector<DictUnit>& nodeInfos) const
{
double sum = 0.0;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
sum += nodeInfos[i].weight;
}
assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++)
{
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
}
}
if(!TransCode::decode(buf[0], nodeInfo.word)) {
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
void _shrink(vector<DictUnit>& units) const
{
vector<DictUnit>(units.begin(), units.end()).swap(units);
}
_nodeInfos.push_back(nodeInfo);
}
}
double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
private:
vector<DictUnit> _nodeInfos;
UglyTrie * _trie;
// Converts raw frequency counts into log-probabilities: w -> log(w / total).
void _calculateWeight(vector<DictUnit>& nodeInfos) const {
    // First pass: total frequency mass.
    double total = 0.0;
    for(size_t idx = 0; idx < nodeInfos.size(); idx++) {
        total += nodeInfos[idx].weight;
    }
    assert(total);
    // Second pass: normalize each entry in place. A zero weight would make
    // log() undefined, hence the per-entry assert.
    for(size_t idx = 0; idx < nodeInfos.size(); idx++) {
        assert(nodeInfos[idx].weight);
        nodeInfos[idx].weight = log(double(nodeInfos[idx].weight)/double(total));
    }
}
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
// Releases excess vector capacity via the swap-with-temporary idiom
// (the pre-C++11 equivalent of shrink_to_fit()).
void _shrink(vector<DictUnit>& units) const {
    vector<DictUnit> fitted(units.begin(), units.end());
    fitted.swap(units);
}
private:
vector<DictUnit> _nodeInfos;
UglyTrie * _trie;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
}
#endif

View File

@ -10,140 +10,116 @@
#include "SegmentBase.hpp"
#include "TransCode.hpp"
namespace CppJieba
{
class FullSegment: public SegmentBase
{
public:
FullSegment()
{
_dictTrie = NULL;
_isBorrowed = false;
}
explicit FullSegment(const string& dictPath)
{
_dictTrie = NULL;
init(dictPath);
}
explicit FullSegment(const DictTrie* dictTrie)
{
_dictTrie = NULL;
init(dictTrie);
}
virtual ~FullSegment()
{
if(_dictTrie && ! _isBorrowed)
{
delete _dictTrie;
}
namespace CppJieba {
class FullSegment: public SegmentBase {
public:
FullSegment() {
_dictTrie = NULL;
_isBorrowed = false;
}
explicit FullSegment(const string& dictPath) {
_dictTrie = NULL;
init(dictPath);
}
explicit FullSegment(const DictTrie* dictTrie) {
_dictTrie = NULL;
init(dictTrie);
}
virtual ~FullSegment() {
    // Only destroy the trie when this object owns it; deleting a null
    // pointer is harmless, so the ownership flag is the only check needed.
    if (!_isBorrowed) {
        delete _dictTrie;
    }
}
bool init(const string& dictPath)
{
assert(_dictTrie == NULL);
_dictTrie = new DictTrie(dictPath);
_isBorrowed = false;
return true;
}
bool init(const DictTrie* dictTrie)
{
assert(_dictTrie == NULL);
assert(dictTrie);
_dictTrie = dictTrie;
_isBorrowed = true;
return true;
}
};
bool init(const string& dictPath) {
assert(_dictTrie == NULL);
_dictTrie = new DictTrie(dictPath);
_isBorrowed = false;
return true;
}
bool init(const DictTrie* dictTrie) {
assert(_dictTrie == NULL);
assert(dictTrie);
_dictTrie = dictTrie;
_isBorrowed = true;
return true;
}
using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
assert(_dictTrie);
if (begin >= end)
{
LogError("begin >= end");
return false;
}
using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
assert(_dictTrie);
if (begin >= end) {
LogError("begin >= end");
return false;
}
//resut of searching in trie tree
DagType tRes;
//resut of searching in trie tree
DagType tRes;
//max index of res's words
int maxIdx = 0;
//max index of res's words
int maxIdx = 0;
// always equals to (uItr - begin)
int uIdx = 0;
// always equals to (uItr - begin)
int uIdx = 0;
//tmp variables
int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{
//find word start from uItr
if (_dictTrie->find(uItr, end, tRes, 0))
{
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{
wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
{
res.push_back(itr->second->word);
}
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
}
tRes.clear();
}
else // not found word start from uItr
{
if (maxIdx <= uIdx) // never exist in prev results
{
//put itr itself in res
res.push_back(Unicode(1, *uItr));
//tmp variables
int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr
if (_dictTrie->find(uItr, end, tRes, 0)) {
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{
wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
res.push_back(itr->second->word);
}
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
}
tRes.clear();
} else { // not found word start from uItr
if (maxIdx <= uIdx) { // never exist in prev results
//put itr itself in res
res.push_back(Unicode(1, *uItr));
//mark it exits
++maxIdx;
}
}
++uIdx;
}
//mark it exits
++maxIdx;
}
}
++uIdx;
}
return true;
}
return true;
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{
assert(_dictTrie);
if (begin >= end)
{
LogError("begin >= end");
return false;
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
assert(_dictTrie);
if (begin >= end) {
LogError("begin >= end");
return false;
}
vector<Unicode> uRes;
if (!cut(begin, end, uRes))
{
LogError("get unicode cut result error.");
return false;
}
vector<Unicode> uRes;
if (!cut(begin, end, uRes)) {
LogError("get unicode cut result error.");
return false;
}
string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
{
if (TransCode::encode(*uItr, tmp))
{
res.push_back(tmp);
}
else
{
LogError("encode failed.");
}
}
string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
if (TransCode::encode(*uItr, tmp)) {
res.push_back(tmp);
} else {
LogError("encode failed.");
}
}
return true;
}
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
};
return true;
}
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
};
}
#endif

View File

@ -12,387 +12,315 @@
#include "SegmentBase.hpp"
#include "DictTrie.hpp"
namespace CppJieba
{
using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;
class HMMSegment: public SegmentBase
{
public:
/*
* STATUS:
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
namespace CppJieba {
using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;
class HMMSegment: public SegmentBase {
public:
/*
* STATUS:
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
public:
HMMSegment(){}
explicit HMMSegment(const string& filePath)
{
LIMONP_CHECK(init(filePath));
}
virtual ~HMMSegment(){}
public:
bool init(const string& filePath)
{
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B';
_statMap[1] = 'E';
_statMap[2] = 'M';
_statMap[3] = 'S';
_emitProbVec.push_back(&_emitProbB);
_emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS);
LIMONP_CHECK(_loadModel(filePath.c_str()));
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
return true;
}
public:
using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
{
Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin;
while(right != end)
{
if(*right < 0x80)
{
if(left != right && !_cut(left, right, res))
{
return false;
}
left = right;
do {
right = _sequentialLetterRule(left, end);
if(right != left)
{
break;
}
right = _numbersRule(left, end);
if(right != left)
{
break;
}
right ++;
} while(false);
res.push_back(Unicode(left, right));
left = right;
}
else
{
right++;
}
}
if(left != right && !_cut(left, right, res))
{
return false;
}
return true;
}
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(begin == end)
{
return false;
}
vector<Unicode> words;
words.reserve(end - begin);
if(!cut(begin, end, words))
{
return false;
}
size_t offset = res.size();
res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++)
{
if(!TransCode::encode(words[i], res[offset + i]))
{
LogError("encode failed.");
}
}
return true;
}
private:
// sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
Unicode::value_type x = *begin;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
{
begin ++;
}
else
{
return begin;
}
while(begin != end)
{
x = *begin;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
{
begin ++;
}
else
{
break;
}
}
return begin;
}
//
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
Unicode::value_type x = *begin;
if('0' <= x && x <= '9')
{
begin ++;
}
else
{
return begin;
}
while(begin != end)
{
x = *begin;
if( ('0' <= x && x <= '9') || x == '.')
{
begin++;
}
else
{
break;
}
}
return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
vector<size_t> status;
if(!_viterbi(begin, end, status))
{
LogError("_viterbi failed.");
return false;
}
public:
HMMSegment() {}
explicit HMMSegment(const string& filePath) {
LIMONP_CHECK(init(filePath));
}
virtual ~HMMSegment() {}
public:
// Initializes the HMM: zeroes the start/transition probability tables,
// fills the status-index -> letter map, registers the four emission maps
// in B/E/M/S order (indexable by the status enum), then loads the trained
// model from `filePath`. LIMONP_CHECK aborts/raises on load failure.
bool init(const string& filePath) {
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B';
_statMap[1] = 'E';
_statMap[2] = 'M';
_statMap[3] = 'S';
_emitProbVec.push_back(&_emitProbB);
_emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS);
LIMONP_CHECK(_loadModel(filePath.c_str()));
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
return true;
}
public:
using SegmentBase::cut;
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin;
while(right != end) {
if(*right < 0x80) {
if(left != right && !_cut(left, right, res)) {
return false;
}
left = right;
do {
right = _sequentialLetterRule(left, end);
if(right != left) {
break;
}
right = _numbersRule(left, end);
if(right != left) {
break;
}
right ++;
} while(false);
res.push_back(Unicode(left, right));
left = right;
} else {
right++;
}
}
if(left != right && !_cut(left, right, res)) {
return false;
}
return true;
}
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
if(begin == end) {
return false;
}
vector<Unicode> words;
words.reserve(end - begin);
if(!cut(begin, end, words)) {
return false;
}
size_t offset = res.size();
res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++) {
if(!TransCode::encode(words[i], res[offset + i])) {
LogError("encode failed.");
}
}
return true;
}
private:
// sequential letters rule
// Greedy rule: if *begin is an ASCII letter, consume the maximal run of
// ASCII letters and digits and return the iterator one past it; otherwise
// return `begin` unchanged (rule does not apply).
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
    Unicode::value_type ch = *begin;
    bool startsWithLetter = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
    if (!startsWithLetter) {
        return begin;
    }
    ++begin;
    // Subsequent characters may also be digits (e.g. "mp3").
    for (; begin != end; ++begin) {
        ch = *begin;
        bool alnum = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('0' <= ch && ch <= '9');
        if (!alnum) {
            break;
        }
    }
    return begin;
}
//
// Greedy rule: if *begin is an ASCII digit, consume the maximal run of
// digits and '.' (e.g. "3.14") and return the iterator one past it;
// otherwise return `begin` unchanged.
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
    Unicode::value_type ch = *begin;
    if (ch < '0' || ch > '9') {
        return begin;
    }
    ++begin;
    for (; begin != end; ++begin) {
        ch = *begin;
        bool digitOrDot = ('0' <= ch && ch <= '9') || ch == '.';
        if (!digitOrDot) {
            break;
        }
    }
    return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
vector<size_t> status;
if(!_viterbi(begin, end, status)) {
LogError("_viterbi failed.");
return false;
}
Unicode::const_iterator left = begin;
Unicode::const_iterator right;
for(size_t i = 0; i < status.size(); i++)
{
if(status[i] % 2) //if(E == status[i] || S == status[i])
{
right = begin + i + 1;
res.push_back(Unicode(left, right));
left = right;
}
}
return true;
}
Unicode::const_iterator left = begin;
Unicode::const_iterator right;
for(size_t i = 0; i < status.size(); i++) {
if(status[i] % 2) { //if(E == status[i] || S == status[i])
right = begin + i + 1;
res.push_back(Unicode(left, right));
left = right;
}
}
return true;
}
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
{
if(begin == end)
{
return false;
}
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
if(begin == end) {
return false;
}
size_t Y = STATUS_SUM;
size_t X = end - begin;
size_t Y = STATUS_SUM;
size_t X = end - begin;
size_t XYSize = X * Y;
size_t now, old, stat;
double tmp, endE, endS;
size_t XYSize = X * Y;
size_t now, old, stat;
double tmp, endE, endS;
vector<int> path(XYSize);
vector<double> weight(XYSize);
vector<int> path(XYSize);
vector<double> weight(XYSize);
//start
for(size_t y = 0; y < Y; y++)
{
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
//start
for(size_t y = 0; y < Y; y++) {
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
double emitProb;
double emitProb;
for(size_t x = 1; x < X; x++)
{
for(size_t y = 0; y < Y; y++)
{
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++)
{
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now])
{
weight[now] = tmp;
path[now] = preY;
}
}
}
}
for(size_t x = 1; x < X; x++) {
for(size_t y = 0; y < Y; y++) {
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) {
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now]) {
weight[now] = tmp;
path[now] = preY;
}
}
}
}
endE = weight[X-1+E*X];
endS = weight[X-1+S*X];
stat = 0;
if(endE >= endS)
{
stat = E;
}
else
{
stat = S;
}
endE = weight[X-1+E*X];
endS = weight[X-1+S*X];
stat = 0;
if(endE >= endS) {
stat = E;
} else {
stat = S;
}
status.resize(X);
for(int x = X -1 ; x >= 0; x--)
{
status[x] = stat;
stat = path[x + stat*X];
}
status.resize(X);
for(int x = X -1 ; x >= 0; x--) {
status[x] = stat;
stat = path[x + stat*X];
}
return true;
}
bool _loadModel(const char* const filePath)
{
ifstream ifile(filePath);
string line;
vector<string> tmp;
vector<string> tmp2;
//load _startProb
if(!_getLine(ifile, line))
{
return false;
}
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("start_p illegal");
return false;
}
for(size_t j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
}
return true;
}
bool _loadModel(const char* const filePath) {
ifstream ifile(filePath);
string line;
vector<string> tmp;
vector<string> tmp2;
//load _startProb
if(!_getLine(ifile, line)) {
return false;
}
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) {
LogError("start_p illegal");
return false;
}
for(size_t j = 0; j< tmp.size(); j++) {
_startProb[j] = atof(tmp[j].c_str());
}
//load _transProb
for(size_t i = 0; i < STATUS_SUM; i++)
{
if(!_getLine(ifile, line))
{
return false;
}
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
return false;
}
for(size_t j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
}
}
//load _transProb
for(size_t i = 0; i < STATUS_SUM; i++) {
if(!_getLine(ifile, line)) {
return false;
}
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) {
LogError("trans_p illegal");
return false;
}
for(size_t j =0; j < STATUS_SUM; j++) {
_transProb[i][j] = atof(tmp[j].c_str());
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
{
return false;
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
{
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
{
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
{
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
return false;
}
return true;
}
bool _getLine(ifstream& ifile, string& line)
{
while(getline(ifile, line))
{
trim(line);
if(line.empty())
{
continue;
}
if(startsWith(line, "#"))
{
continue;
}
return true;
}
return false;
}
bool _loadEmitProb(const string& line, EmitProbMap& mp)
{
if(line.empty())
{
return false;
}
vector<string> tmp, tmp2;
Unicode unicode;
split(line, tmp, ",");
for(size_t i = 0; i < tmp.size(); i++)
{
split(tmp[i], tmp2, ":");
if(2 != tmp2.size())
{
LogError("_emitProb illegal.");
return false;
}
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1)
{
LogError("TransCode failed.");
return false;
}
mp[unicode[0]] = atof(tmp2[1].c_str());
}
return true;
}
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
{
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end())
{
return defVal;
}
return cit->second;
return true;
}
// Reads the next meaningful model line into `line`: trims whitespace and
// skips blanks and '#' comments. Returns false at end of file.
bool _getLine(ifstream& ifile, string& line) {
    while(getline(ifile, line)) {
        trim(line);
        if(line.empty() || startsWith(line, "#")) {
            continue;
        }
        return true;
    }
    return false;
}
// Parses one emission-probability line into `mp`. Format is a
// comma-separated list of "char:logprob" pairs, where "char" must decode
// to exactly one unicode code point. Returns false (after logging) on any
// malformed pair; entries parsed before the failure remain in `mp`.
bool _loadEmitProb(const string& line, EmitProbMap& mp) {
if(line.empty()) {
return false;
}
vector<string> tmp, tmp2;
Unicode unicode;
split(line, tmp, ",");
for(size_t i = 0; i < tmp.size(); i++) {
split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) {
LogError("_emitProb illegal.");
return false;
}
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
LogError("TransCode failed.");
return false;
}
mp[unicode[0]] = atof(tmp2[1].c_str());
}
return true;
}
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) {
return defVal;
}
return cit->second;
}
}
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
};
};
}
#endif

View File

@ -2,15 +2,13 @@
#define CPPJIEBA_SEGMENTINTERFACE_H
namespace CppJieba
{
class ISegment
{
public:
virtual ~ISegment(){};
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
namespace CppJieba {
// Abstract segmenter interface implemented by the concrete cutters
// (FullSegment, HMMSegment, ...).
class ISegment {
public:
// Virtual destructor so concrete segmenters can be deleted via ISegment*.
virtual ~ISegment() {};
// Segments the unicode range [begin, end); implementations append the
// resulting words to `res`. Returns false on failure.
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
// Segments a string; implementations append the resulting words to `res`.
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
}
#endif

View File

@ -5,162 +5,136 @@
#include <cmath>
#include <set>
namespace CppJieba
{
using namespace Limonp;
namespace CppJieba {
using namespace Limonp;
/*utf8*/
class KeywordExtractor
{
public:
KeywordExtractor(){};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
};
~KeywordExtractor(){};
/*utf8*/
class KeywordExtractor {
public:
KeywordExtractor() {};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
};
~KeywordExtractor() {};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{
_loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
_loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
};
bool extract(const string& str, vector<string>& keywords, size_t topN) const
{
vector<pair<string, double> > topWords;
if(!extract(str, topWords, topN))
{
return false;
}
for(size_t i = 0; i < topWords.size(); i++)
{
keywords.push_back(topWords[i].first);
}
return true;
}
bool extract(const string& str, vector<string>& keywords, size_t topN) const {
vector<pair<string, double> > topWords;
if(!extract(str, topWords, topN)) {
return false;
}
for(size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].first);
}
return true;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
{
vector<string> words;
if(!_segment.cut(str, words))
{
LogError("segment cut(%s) failed.", str.c_str());
return false;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
vector<string> words;
if(!_segment.cut(str, words)) {
LogError("segment cut(%s) failed.", str.c_str());
return false;
}
map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
{
if(_isSingleWord(*iter))
{
continue;
}
wordmap[*iter] += 1.0;
}
map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
if(_isSingleWord(*iter)) {
continue;
}
wordmap[*iter] += 1.0;
}
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
{
if(_stopWords.end() != _stopWords.find(itr->first))
{
wordmap.erase(itr);
continue;
}
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
if(_stopWords.end() != _stopWords.find(itr->first)) {
wordmap.erase(itr);
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end())
{
itr->second *= cit->second;
}
else
{
itr->second *= _idfAverage;
}
itr ++;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) {
itr->second *= cit->second;
} else {
itr->second *= _idfAverage;
}
itr ++;
}
keywords.clear();
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
keywords.resize(topN);
return true;
}
private:
void _loadIdfDict(const string& idfPath)
{
ifstream ifs(idfPath.c_str());
if(!ifs)
{
LogError("open %s failed.", idfPath.c_str());
assert(false);
}
string line ;
vector<string> buf;
double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(;getline(ifs, line); lineno++)
{
buf.clear();
if(line.empty())
{
LogError("line[%d] empty. skipped.", lineno);
continue;
}
if(!split(line, buf, " ") || buf.size() != 2)
{
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
keywords.clear();
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
keywords.resize(topN);
return true;
}
private:
void _loadIdfDict(const string& idfPath) {
ifstream ifs(idfPath.c_str());
if(!ifs) {
LogError("open %s failed.", idfPath.c_str());
assert(false);
}
string line ;
vector<string> buf;
double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(; getline(ifs, line); lineno++) {
buf.clear();
if(line.empty()) {
LogError("line[%d] empty. skipped.", lineno);
continue;
}
if(!split(line, buf, " ") || buf.size() != 2) {
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
}
}
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
}
void _loadStopWordDict(const string& filePath)
{
ifstream ifs(filePath.c_str());
if(!ifs)
{
LogError("open %s failed.", filePath.c_str());
assert(false);
}
string line ;
while(getline(ifs, line))
{
_stopWords.insert(line);
}
assert(_stopWords.size());
}
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
}
// Loads stop words, one per line, into _stopWords. An unreadable file or
// an empty result is treated as a fatal configuration error (assert).
void _loadStopWordDict(const string& filePath) {
    ifstream ifs(filePath.c_str());
    if(!ifs) {
        LogError("open %s failed.", filePath.c_str());
        assert(false);
    }
    for(string line; getline(ifs, line); ) {
        _stopWords.insert(line);
    }
    assert(_stopWords.size());
}
bool _isSingleWord(const string& str) const
{
Unicode unicode;
TransCode::decode(str, unicode);
if(unicode.size() == 1)
return true;
return false;
}
// True when `str` decodes to exactly one unicode code point; such words
// are excluded from keyword candidates.
bool _isSingleWord(const string& str) const {
    Unicode unicode;
    TransCode::decode(str, unicode);
    return unicode.size() == 1;
}
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
{
return lhs.second > rhs.second;
}
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
// Ordering predicate for partial_sort: higher scores first (descending).
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
    return rhs.second < lhs.second;
}
unordered_set<string> _stopWords;
};
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
};
}
#endif