This commit is contained in:
xuangong 2015-07-20 23:54:20 +08:00
parent d1a112c0c4
commit 931db7d1e5
5 changed files with 656 additions and 807 deletions

View File

@ -16,191 +16,164 @@
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp; const double MIN_DOUBLE = -3.14e+100;
const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100;
const double MAX_DOUBLE = 3.14e+100; const size_t DICT_COLUMN_NUM = 3;
const size_t DICT_COLUMN_NUM = 3; const char* const UNKNOWN_TAG = "";
const char* const UNKNOWN_TAG = "";
class DictTrie class DictTrie {
{ public:
public:
DictTrie() DictTrie() {
{ _trie = NULL;
_trie = NULL; _minWeight = MAX_DOUBLE;
_minWeight = MAX_DOUBLE; }
} DictTrie(const string& dictPath, const string& userDictPath = "") {
DictTrie(const string& dictPath, const string& userDictPath = "") new (this) DictTrie();
{ init(dictPath, userDictPath);
new (this) DictTrie(); }
init(dictPath, userDictPath); ~DictTrie() {
} if(_trie) {
~DictTrie() delete _trie;
{ }
if(_trie) }
{
delete _trie;
}
}
bool init(const string& dictPath, const string& userDictPath = "")
{
assert(!_trie);
_loadDict(dictPath);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size())
{
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
}
_shrink(_nodeInfos);
_trie = _createTrie(_nodeInfos);
assert(_trie);
return true;
}
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const bool init(const string& dictPath, const string& userDictPath = "") {
{ assert(!_trie);
return _trie->find(begin, end); _loadDict(dictPath);
} _calculateWeight(_nodeInfos);
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const _minWeight = _findMinWeight(_nodeInfos);
{
return _trie->find(begin, end, dag, offset); if(userDictPath.size()) {
} double maxWeight = _findMaxWeight(_nodeInfos);
void find( _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
Unicode::const_iterator begin, }
Unicode::const_iterator end, _shrink(_nodeInfos);
vector<SegmentChar>& res _trie = _createTrie(_nodeInfos);
) const assert(_trie);
{ return true;
_trie->find(begin, end, res); }
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{ return _trie->find(begin, end);
return isIn(_userDictSingleChineseWord, word); }
} bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
double getMinWeight() const {return _minWeight;}; return _trie->find(begin, end, dag, offset);
}
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<SegmentChar>& res
) const {
_trie->find(begin, end, res);
}
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
return isIn(_userDictSingleChineseWord, word);
}
double getMinWeight() const {
return _minWeight;
};
private: private:
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) {
{ assert(dictUnits.size());
assert(dictUnits.size()); vector<Unicode> words;
vector<Unicode> words; vector<const DictUnit*> valuePointers;
vector<const DictUnit*> valuePointers; for(size_t i = 0 ; i < dictUnits.size(); i ++) {
for(size_t i = 0 ; i < dictUnits.size(); i ++) words.push_back(dictUnits[i].word);
{ valuePointers.push_back(&dictUnits[i]);
words.push_back(dictUnits[i].word); }
valuePointers.push_back(&dictUnits[i]);
}
UglyTrie * trie = new UglyTrie(words, valuePointers); UglyTrie * trie = new UglyTrie(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
{ ifstream ifs(filePath.c_str());
ifstream ifs(filePath.c_str()); assert(ifs.is_open());
assert(ifs.is_open()); string line;
string line; DictUnit nodeInfo;
DictUnit nodeInfo; vector<string> buf;
vector<string> buf; size_t lineno;
size_t lineno; for(lineno = 0; getline(ifs, line); lineno++) {
for(lineno = 0; getline(ifs, line); lineno++) buf.clear();
{ split(line, buf, " ");
buf.clear(); assert(buf.size() >= 1);
split(line, buf, " "); if(!TransCode::decode(buf[0], nodeInfo.word)) {
assert(buf.size() >= 1); LogError("line[%u:%s] illegal.", lineno, line.c_str());
if(!TransCode::decode(buf[0], nodeInfo.word)) continue;
{ }
LogError("line[%u:%s] illegal.", lineno, line.c_str()); if(nodeInfo.word.size() == 1) {
continue; _userDictSingleChineseWord.insert(nodeInfo.word[0]);
} }
if(nodeInfo.word.size() == 1) nodeInfo.weight = defaultWeight;
{ nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
_userDictSingleChineseWord.insert(nodeInfo.word[0]); _nodeInfos.push_back(nodeInfo);
} }
nodeInfo.weight = defaultWeight; LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); }
_nodeInfos.push_back(nodeInfo); void _loadDict(const string& filePath) {
} ifstream ifs(filePath.c_str());
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); assert(ifs.is_open());
} string line;
void _loadDict(const string& filePath) vector<string> buf;
{
ifstream ifs(filePath.c_str());
assert(ifs.is_open());
string line;
vector<string> buf;
DictUnit nodeInfo; DictUnit nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++) for(size_t lineno = 0 ; getline(ifs, line); lineno++) {
{ split(line, buf, " ");
split(line, buf, " "); assert(buf.size() == DICT_COLUMN_NUM);
assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo);
}
}
double _findMinWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
void _calculateWeight(vector<DictUnit>& nodeInfos) const if(!TransCode::decode(buf[0], nodeInfo.word)) {
{ LogError("line[%u:%s] illegal.", lineno, line.c_str());
double sum = 0.0; continue;
for(size_t i = 0; i < nodeInfos.size(); i++) }
{ nodeInfo.weight = atof(buf[1].c_str());
sum += nodeInfos[i].weight; nodeInfo.tag = buf[2];
}
assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++)
{
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
}
}
void _shrink(vector<DictUnit>& units) const _nodeInfos.push_back(nodeInfo);
{ }
vector<DictUnit>(units.begin(), units.end()).swap(units); }
} double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
private: void _calculateWeight(vector<DictUnit>& nodeInfos) const {
vector<DictUnit> _nodeInfos; double sum = 0.0;
UglyTrie * _trie; for(size_t i = 0; i < nodeInfos.size(); i++) {
sum += nodeInfos[i].weight;
}
assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++) {
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
}
}
double _minWeight; void _shrink(vector<DictUnit>& units) const {
unordered_set<Unicode::value_type> _userDictSingleChineseWord; vector<DictUnit>(units.begin(), units.end()).swap(units);
}; }
private:
vector<DictUnit> _nodeInfos;
UglyTrie * _trie;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
} }
#endif #endif

View File

@ -10,140 +10,116 @@
#include "SegmentBase.hpp"
#include "TransCode.hpp"

namespace CppJieba {
// Full-mode segmenter: emits every dictionary word found anywhere in
// the input (overlapping matches allowed) and falls back to single
// characters where nothing matches.
class FullSegment: public SegmentBase {
 public:
  FullSegment() {
    _dictTrie = NULL;
    _isBorrowed = false;
  }
  explicit FullSegment(const string& dictPath) {
    _dictTrie = NULL;
    init(dictPath);
  }
  // Borrows an externally-owned trie; it will NOT be deleted here.
  explicit FullSegment(const DictTrie* dictTrie) {
    _dictTrie = NULL;
    init(dictTrie);
  }
  virtual ~FullSegment() {
    // Only delete the trie when this object created it.
    if(_dictTrie && ! _isBorrowed) {
      delete _dictTrie;
    }
  }

  // Builds and owns a new DictTrie from `dictPath`.
  bool init(const string& dictPath) {
    assert(_dictTrie == NULL);
    _dictTrie = new DictTrie(dictPath);
    _isBorrowed = false;
    return true;
  }
  // Adopts an externally-owned trie (caller keeps ownership).
  bool init(const DictTrie* dictTrie) {
    assert(_dictTrie == NULL);
    assert(dictTrie);
    _dictTrie = dictTrie;
    _isBorrowed = true;
    return true;
  }

  using SegmentBase::cut;

  // Cuts [begin, end) into every dictionary word plus single-char
  // fillers; returns false on an empty/invalid range.
  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
    assert(_dictTrie);
    if (begin >= end) {
      LogError("begin >= end");
      return false;
    }

    //result of searching in trie tree
    DagType tRes;

    //max index of res's words
    int maxIdx = 0;

    // always equals to (uItr - begin)
    int uIdx = 0;

    //tmp variables
    int wordLen = 0;
    for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
      //find word start from uItr
      if (_dictTrie->find(uItr, end, tRes, 0)) {
        for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
        {
          wordLen = itr->second->word.size();
          // keep multi-char words always; keep a single-char match only
          // when it is the sole candidate and not already covered
          if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
            res.push_back(itr->second->word);
          }
          maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
        }
        tRes.clear();
      } else { // not found word start from uItr
        if (maxIdx <= uIdx) { // never exist in prev results
          //put itr itself in res
          res.push_back(Unicode(1, *uItr));

          //mark it exists
          ++maxIdx;
        }
      }
      ++uIdx;
    }
    return true;
  }

  // UTF-8 wrapper: cuts to Unicode words then encodes each into `res`.
  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
    assert(_dictTrie);
    if (begin >= end) {
      LogError("begin >= end");
      return false;
    }

    vector<Unicode> uRes;
    if (!cut(begin, end, uRes)) {
      LogError("get unicode cut result error.");
      return false;
    }

    string tmp;
    for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
      if (TransCode::encode(*uItr, tmp)) {
        res.push_back(tmp);
      } else {
        LogError("encode failed.");
      }
    }

    return true;
  }
 private:
  const DictTrie* _dictTrie;
  // True when _dictTrie is owned by someone else (do not delete).
  bool _isBorrowed;
};
}

#endif

View File

@ -12,387 +12,315 @@
#include "SegmentBase.hpp"
#include "DictTrie.hpp"

namespace CppJieba {
using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;

// HMM-based segmenter: Viterbi-decodes a B/E/M/S label per character
// using a pre-trained model file, then cuts the text at E/S positions.
// ASCII letter and digit runs are taken whole by rule instead.
class HMMSegment: public SegmentBase {
 public:
  /*
   * STATUS:
   * 0:B, 1:E, 2:M, 3:S
   * */
  enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};

 public:
  HMMSegment() {}
  explicit HMMSegment(const string& filePath) {
    LIMONP_CHECK(init(filePath));
  }
  virtual ~HMMSegment() {}
 public:
  // Loads start/transition/emission probabilities from the model file.
  bool init(const string& filePath) {
    memset(_startProb, 0, sizeof(_startProb));
    memset(_transProb, 0, sizeof(_transProb));
    _statMap[0] = 'B';
    _statMap[1] = 'E';
    _statMap[2] = 'M';
    _statMap[3] = 'S';
    // index order must match the B/E/M/S enum values
    _emitProbVec.push_back(&_emitProbB);
    _emitProbVec.push_back(&_emitProbE);
    _emitProbVec.push_back(&_emitProbM);
    _emitProbVec.push_back(&_emitProbS);
    LIMONP_CHECK(_loadModel(filePath.c_str()));
    LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
    return true;
  }
 public:
  using SegmentBase::cut;
 public:
  // Cuts [begin, end): ASCII runs (letters then digits rules) are
  // emitted whole; everything in between goes through the HMM (_cut).
  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
    Unicode::const_iterator left = begin;
    Unicode::const_iterator right = begin;
    while(right != end) {
      if(*right < 0x80) {
        // flush the pending non-ASCII span through the HMM first
        if(left != right && !_cut(left, right, res)) {
          return false;
        }
        left = right;
        do {
          right = _sequentialLetterRule(left, end);
          if(right != left) {
            break;
          }
          right = _numbersRule(left, end);
          if(right != left) {
            break;
          }
          right ++;
        } while(false);
        res.push_back(Unicode(left, right));
        left = right;
      } else {
        right++;
      }
    }
    // flush the trailing non-ASCII span
    if(left != right && !_cut(left, right, res)) {
      return false;
    }
    return true;
  }
 public:
  // UTF-8 wrapper: cuts to Unicode words, then encodes each in place.
  virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
    if(begin == end) {
      return false;
    }
    vector<Unicode> words;
    words.reserve(end - begin);
    if(!cut(begin, end, words)) {
      return false;
    }
    size_t offset = res.size();
    res.resize(res.size() + words.size());
    for(size_t i = 0; i < words.size(); i++) {
      if(!TransCode::encode(words[i], res[offset + i])) {
        LogError("encode failed.");
      }
    }
    return true;
  }
 private:
  // sequential letters rule: consume [a-zA-Z] then [a-zA-Z0-9]*;
  // returns `begin` unchanged when the first char is not a letter.
  Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
    Unicode::value_type x = *begin;
    if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
      begin ++;
    } else {
      return begin;
    }
    while(begin != end) {
      x = *begin;
      if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
        begin ++;
      } else {
        break;
      }
    }
    return begin;
  }
  // numbers rule: consume [0-9] then [0-9.]*; returns `begin`
  // unchanged when the first char is not a digit.
  Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
    Unicode::value_type x = *begin;
    if('0' <= x && x <= '9') {
      begin ++;
    } else {
      return begin;
    }
    while(begin != end) {
      x = *begin;
      if( ('0' <= x && x <= '9') || x == '.') {
        begin++;
      } else {
        break;
      }
    }
    return begin;
  }
  // Runs Viterbi over [begin, end) and cuts after every E or S label.
  bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
    vector<size_t> status;
    if(!_viterbi(begin, end, status)) {
      LogError("_viterbi failed.");
      return false;
    }

    Unicode::const_iterator left = begin;
    Unicode::const_iterator right;
    for(size_t i = 0; i < status.size(); i++) {
      if(status[i] % 2) { //if(E == status[i] || S == status[i])
        right = begin + i + 1;
        res.push_back(Unicode(left, right));
        left = right;
      }
    }
    return true;
  }

  // Standard Viterbi: fills `status` with the most probable B/E/M/S
  // label sequence (log-space; weight/path are X-by-Y flattened arrays).
  bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
    if(begin == end) {
      return false;
    }

    size_t Y = STATUS_SUM;
    size_t X = end - begin;

    size_t XYSize = X * Y;
    size_t now, old, stat;
    double tmp, endE, endS;

    vector<int> path(XYSize);
    vector<double> weight(XYSize);

    //start
    for(size_t y = 0; y < Y; y++) {
      weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
      path[0 + y * X] = -1;
    }

    double emitProb;

    for(size_t x = 1; x < X; x++) {
      for(size_t y = 0; y < Y; y++) {
        now = x + y*X;
        weight[now] = MIN_DOUBLE;
        path[now] = E; // warning
        emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
        for(size_t preY = 0; preY < Y; preY++) {
          old = x - 1 + preY * X;
          tmp = weight[old] + _transProb[preY][y] + emitProb;
          if(tmp > weight[now]) {
            weight[now] = tmp;
            path[now] = preY;
          }
        }
      }
    }

    // a valid segmentation must end on E or S
    endE = weight[X-1+E*X];
    endS = weight[X-1+S*X];
    stat = 0;
    if(endE >= endS) {
      stat = E;
    } else {
      stat = S;
    }

    // backtrack the best path
    status.resize(X);
    for(int x = X -1 ; x >= 0; x--) {
      status[x] = stat;
      stat = path[x + stat*X];
    }

    return true;
  }
  // Reads the whole model: start_p, trans_p, then the four emit tables.
  bool _loadModel(const char* const filePath) {
    ifstream ifile(filePath);
    string line;
    vector<string> tmp;
    vector<string> tmp2;
    //load _startProb
    if(!_getLine(ifile, line)) {
      return false;
    }
    split(line, tmp, " ");
    if(tmp.size() != STATUS_SUM) {
      LogError("start_p illegal");
      return false;
    }
    for(size_t j = 0; j< tmp.size(); j++) {
      _startProb[j] = atof(tmp[j].c_str());
    }

    //load _transProb
    for(size_t i = 0; i < STATUS_SUM; i++) {
      if(!_getLine(ifile, line)) {
        return false;
      }
      split(line, tmp, " ");
      if(tmp.size() != STATUS_SUM) {
        LogError("trans_p illegal");
        return false;
      }
      for(size_t j =0; j < STATUS_SUM; j++) {
        _transProb[i][j] = atof(tmp[j].c_str());
      }
    }

    //load _emitProbB
    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
      return false;
    }

    //load _emitProbE
    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
      return false;
    }

    //load _emitProbM
    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
      return false;
    }

    //load _emitProbS
    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
      return false;
    }

    return true;
  }
  // Reads the next non-empty, non-comment ('#') line into `line`.
  bool _getLine(ifstream& ifile, string& line) {
    while(getline(ifile, line)) {
      trim(line);
      if(line.empty()) {
        continue;
      }
      if(startsWith(line, "#")) {
        continue;
      }
      return true;
    }
    return false;
  }
  // Parses one emit table line: comma-separated "char:prob" pairs.
  bool _loadEmitProb(const string& line, EmitProbMap& mp) {
    if(line.empty()) {
      return false;
    }
    vector<string> tmp, tmp2;
    Unicode unicode;
    split(line, tmp, ",");
    for(size_t i = 0; i < tmp.size(); i++) {
      split(tmp[i], tmp2, ":");
      if(2 != tmp2.size()) {
        LogError("_emitProb illegal.");
        return false;
      }
      if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
        LogError("TransCode failed.");
        return false;
      }
      mp[unicode[0]] = atof(tmp2[1].c_str());
    }
    return true;
  }
  // Emission probability lookup with a default for unseen characters.
  double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
    EmitProbMap::const_iterator cit = ptMp->find(key);
    if(cit == ptMp->end()) {
      return defVal;
    }
    return cit->second;
  }
 private:
  char _statMap[STATUS_SUM];
  double _startProb[STATUS_SUM];
  double _transProb[STATUS_SUM][STATUS_SUM];
  EmitProbMap _emitProbB;
  EmitProbMap _emitProbE;
  EmitProbMap _emitProbM;
  EmitProbMap _emitProbS;
  vector<EmitProbMap* > _emitProbVec;
};
}

#endif

View File

@ -2,15 +2,13 @@
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H
namespace CppJieba namespace CppJieba {
{ class ISegment {
class ISegment public:
{ virtual ~ISegment() {};
public: virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual ~ISegment(){}; virtual bool cut(const string& str, vector<string>& res) const = 0;
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; };
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
} }
#endif #endif

View File

@ -5,162 +5,136 @@
#include <cmath> #include <cmath>
#include <set> #include <set>
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp;
/*utf8*/ /*utf8*/
class KeywordExtractor class KeywordExtractor {
{ public:
public: KeywordExtractor() {};
KeywordExtractor(){}; KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
{ };
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); ~KeywordExtractor() {};
};
~KeywordExtractor(){};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{ _loadIdfDict(idfPath);
_loadIdfDict(idfPath); _loadStopWordDict(stopWordPath);
_loadStopWordDict(stopWordPath); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); };
};
bool extract(const string& str, vector<string>& keywords, size_t topN) const bool extract(const string& str, vector<string>& keywords, size_t topN) const {
{ vector<pair<string, double> > topWords;
vector<pair<string, double> > topWords; if(!extract(str, topWords, topN)) {
if(!extract(str, topWords, topN)) return false;
{ }
return false; for(size_t i = 0; i < topWords.size(); i++) {
} keywords.push_back(topWords[i].first);
for(size_t i = 0; i < topWords.size(); i++) }
{ return true;
keywords.push_back(topWords[i].first); }
}
return true;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
{ vector<string> words;
vector<string> words; if(!_segment.cut(str, words)) {
if(!_segment.cut(str, words)) LogError("segment cut(%s) failed.", str.c_str());
{ return false;
LogError("segment cut(%s) failed.", str.c_str()); }
return false;
}
map<string, double> wordmap; map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
{ if(_isSingleWord(*iter)) {
if(_isSingleWord(*iter)) continue;
{ }
continue; wordmap[*iter] += 1.0;
} }
wordmap[*iter] += 1.0;
}
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
{ if(_stopWords.end() != _stopWords.find(itr->first)) {
if(_stopWords.end() != _stopWords.find(itr->first)) wordmap.erase(itr);
{ continue;
wordmap.erase(itr); }
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) if(cit != _idfMap.end()) {
{ itr->second *= cit->second;
itr->second *= cit->second; } else {
} itr->second *= _idfAverage;
else }
{ itr ++;
itr->second *= _idfAverage; }
}
itr ++;
}
keywords.clear(); keywords.clear();
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
topN = min(topN, keywords.size()); topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
keywords.resize(topN); keywords.resize(topN);
return true; return true;
} }
private: private:
void _loadIdfDict(const string& idfPath) void _loadIdfDict(const string& idfPath) {
{ ifstream ifs(idfPath.c_str());
ifstream ifs(idfPath.c_str()); if(!ifs) {
if(!ifs) LogError("open %s failed.", idfPath.c_str());
{ assert(false);
LogError("open %s failed.", idfPath.c_str()); }
assert(false); string line ;
} vector<string> buf;
string line ; double idf = 0.0;
vector<string> buf; double idfSum = 0.0;
double idf = 0.0; size_t lineno = 0;
double idfSum = 0.0; for(; getline(ifs, line); lineno++) {
size_t lineno = 0; buf.clear();
for(;getline(ifs, line); lineno++) if(line.empty()) {
{ LogError("line[%d] empty. skipped.", lineno);
buf.clear(); continue;
if(line.empty()) }
{ if(!split(line, buf, " ") || buf.size() != 2) {
LogError("line[%d] empty. skipped.", lineno); LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue; continue;
} }
if(!split(line, buf, " ") || buf.size() != 2) idf = atof(buf[1].c_str());
{ _idfMap[buf[0]] = idf;
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); idfSum += idf;
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
} }
assert(lineno); assert(lineno);
_idfAverage = idfSum / lineno; _idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0); assert(_idfAverage > 0.0);
} }
void _loadStopWordDict(const string& filePath) void _loadStopWordDict(const string& filePath) {
{ ifstream ifs(filePath.c_str());
ifstream ifs(filePath.c_str()); if(!ifs) {
if(!ifs) LogError("open %s failed.", filePath.c_str());
{ assert(false);
LogError("open %s failed.", filePath.c_str()); }
assert(false); string line ;
} while(getline(ifs, line)) {
string line ; _stopWords.insert(line);
while(getline(ifs, line)) }
{ assert(_stopWords.size());
_stopWords.insert(line); }
}
assert(_stopWords.size());
}
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const {
{ Unicode unicode;
Unicode unicode; TransCode::decode(str, unicode);
TransCode::decode(str, unicode); if(unicode.size() == 1)
if(unicode.size() == 1) return true;
return true; return false;
return false; }
}
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
{ return lhs.second > rhs.second;
return lhs.second > rhs.second; }
}
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords; private:
}; MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _stopWords;
};
} }
#endif #endif