merge upstream

2025-07-18 00:00:12 +08:00 · 2014-01-12 18:12:22 +08:00 · 2014-01-12 18:12:22 +08:00 · 680399efdc
commit 680399efdc
parent 14480a079a fba34e1ace
54 changed files with 259357 additions and 278 deletions
--- a/.gitignore
+++ b/.gitignore
@ -14,3 +14,4 @@ prior.gbk
 tmp
 t.*
 *.pid
+build
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,16 +2,22 @@ PROJECT(CPPJIEBA)

 CMAKE_MINIMUM_REQUIRED (VERSION 2.8)

-SET(CMAKE_INSTALL_PREFIX  /usr)
-ADD_DEFINITIONS(-std=c++0x -O3)
+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    set (CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "default install path" FORCE )
+endif()
+ADD_DEFINITIONS(-std=c++0x -O3 -Wall)
 IF (DEFINED ENC)
    ADD_DEFINITIONS(-DCPPJIEBA_${ENC})
 ENDIF()
 #ADD_DEFINITIONS(-DNO_FILTER)
 ADD_SUBDIRECTORY(src)
-ADD_SUBDIRECTORY(dicts)
-ADD_SUBDIRECTORY(scripts)
+ADD_SUBDIRECTORY(dict)
+
+if (NOT APPLE)  # script/ and conf/ are only added on non-Apple hosts; "!${APPLE}" never evaluates true in CMake
+ADD_SUBDIRECTORY(script)
 ADD_SUBDIRECTORY(conf)
+endif()
+
 ADD_SUBDIRECTORY(test)

 ENABLE_TESTING()
--- a/README.md
+++ b/README.md
@ -28,7 +28,7 @@ sudo make install
 #### 测试

 ```sh
-cd build && ./test/segment.demo
+make test 
 ```

 ### 启动服务
@ -165,7 +165,7 @@ MPSegment.hpp

 HMMSegment.hpp
 是根据HMM模型来进行分词，主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
-HMM模型由dicts/下面的`hmm_model.utf8`提供。
+HMM模型由dict/下面的`hmm_model.utf8`提供。
 分词算法即viterbi算法。

 FullSegment.hpp
@ -173,7 +173,7 @@ FullSegment.hpp

 #### TransCode模块

-TransCode.cpp/TransCode.h 负责转换编码类型，将utf8和gbk转换成`uint16_t`类型，也负责逆转换。
+TransCode.hpp 负责转换编码类型，将utf8和gbk转换成`uint16_t`类型，也负责逆转换。

 ### src/Husky

--- a/conf/server.conf
+++ b/conf/server.conf
@ -10,10 +10,10 @@ thread_num=4
 daemonize=true

 #dict path
-dict_path=/usr/share/CppJieba/dicts/jieba.dict.utf8
+dict_path=/usr/share/CppJieba/dict/jieba.dict.utf8

 #model path
-model_path=/usr/share/CppJieba/dicts/hmm_model.utf8
+model_path=/usr/share/CppJieba/dict/hmm_model.utf8

 #pid file
 pid_file=/var/run/CppJieba/cjserver.pid
--- a/dicts/CMakeLists.txt
+++ b/dicts/CMakeLists.txt
@ -1 +1 @@
-INSTALL(FILES hmm_model.utf8 jieba.dict.utf8 DESTINATION share/CppJieba/dicts)
+INSTALL(FILES hmm_model.utf8 jieba.dict.utf8 DESTINATION share/CppJieba/dict)
--- a/dicts/hmm_model.gbk
+++ b/dicts/hmm_model.gbk
--- a/dicts/hmm_model.utf8
+++ b/dicts/hmm_model.utf8
--- a/dict/idf.utf8
+++ b/dict/idf.utf8
--- a/dicts/jieba.dict.gbk
+++ b/dicts/jieba.dict.gbk
--- a/dicts/jieba.dict.utf8
+++ b/dicts/jieba.dict.utf8
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
--- a/script/cjseg.sh
+++ b/script/cjseg.sh
@ -0,0 +1,5 @@
+if [ $# -lt 1 ]; then  # the input file argument is required
+    echo "usage: $0 <file>"
+    exit 1
+fi
+cjsegment --dictpath /usr/share/CppJieba/dict/jieba.dict.utf8 --modelpath /usr/share/CppJieba/dict/hmm_model.utf8 "$1"  # quoted so a path containing spaces stays one argument
--- a/scripts/cjserver
+++ b/scripts/cjserver
--- a/scripts/cjseg.sh
+++ b/scripts/cjseg.sh
@ -1,5 +0,0 @@
-if [ $# -lt 1 ]; then
-    echo "usage: $0 <file>"
-    exit 1
-fi
-cjsegment --dictpath /usr/share/CppJieba/dicts/jieba.dict.utf8 --modelpath /usr/share/CppJieba/dicts/hmm_model.utf8 $1
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,13 +1,15 @@
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
+
 ADD_EXECUTABLE(cjsegment segment.cpp)
 ADD_EXECUTABLE(cjserver server.cpp)
 TARGET_LINK_LIBRARIES(cjserver pthread)

 INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
 INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
-INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp Trie.hpp ISegment.hpp  MixSegment.hpp  SegmentBase.hpp  TransCode.hpp  DESTINATION include/CppJieba)
+INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp  MixSegment.hpp  SegmentBase.hpp  TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)

 ADD_SUBDIRECTORY(Husky)
 ADD_SUBDIRECTORY(Limonp)
--- a/src/ChineseFilter.hpp
+++ b/src/ChineseFilter.hpp
@ -1,49 +0,0 @@
-#ifndef CPPJIEBA_CHINESEFILTER_H
-#define CPPJIEBA_CHINESEFILTER_H
-
-#include "TransCode.hpp"
-
-namespace CppJieba
-{
-
-    /*
-     * if char is ascii, count the ascii string's length and return 0;
-     * else count the nonascii string's length and return 1;
-     * if errors, return -1;
-     * */
-    inline int filterAscii(const char* str, uint len, uint& resLen)
-    {
-        if(!str || !len)
-        {
-            return -1;
-        }
-        char x = 0x80;
-        int resFlag = (str[0] & x ? 1 : 0);
-        resLen = 0;
-        if(!resFlag)
-        {
-            while(resLen < len && !(str[resLen] & x))
-            {
-                resLen ++;
-            }
-        }
-        else
-        {
-            while(resLen < len && (str[resLen] & x))
-            {
-#ifdef CPPJIEBA_GBK
-                resLen += 2;
-#else
-                resLen ++;
-#endif
-            }
-        }
-        if(resLen > len)
-        {
-            return -1;
-        }
-        return resFlag;
-    }
-}
-
-#endif
--- a/src/FullSegment.hpp
+++ b/src/FullSegment.hpp
@ -71,7 +71,7 @@ namespace CppJieba
                        for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
                        {
                            wordLen = itr->second->word.size();
-                            if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
+                            if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
                            {
                                res.push_back(itr->second->word);
                            }
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@ -46,6 +46,7 @@ namespace CppJieba
            {
                if(_getInitFlag())
                {
+                    LogError("inited already.");
                    return false;
                }
                memset(_startProb, 0, sizeof(_startProb));
@ -58,7 +59,13 @@ namespace CppJieba
                _emitProbVec.push_back(&_emitProbE);
                _emitProbVec.push_back(&_emitProbM);
                _emitProbVec.push_back(&_emitProbS);
-                return _setInitFlag(_loadModel(filePath.c_str()));
+                if(!_setInitFlag(_loadModel(filePath.c_str())))
+                {
+                    LogError("_loadModel(%s) failed.", filePath.c_str());
+                    return false;
+                }
+                LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
+                return true;
            }
        public:
            using SegmentBase::cut;
@ -198,7 +205,7 @@ namespace CppJieba
            }
            bool _loadModel(const char* const filePath)
            {
-                LogInfo("loadModel [%s] start ...", filePath);
+                LogDebug("loadModel [%s] start ...", filePath);
                ifstream ifile(filePath);
                string line;
                vector<string> tmp;
@ -208,7 +215,7 @@ namespace CppJieba
                {
                    return false;
                }
-                splitStr(line, tmp, " ");
+                split(line, tmp, " ");
                if(tmp.size() != STATUS_SUM)
                {
                    LogError("start_p illegal");
@ -227,7 +234,7 @@ namespace CppJieba
                    {
                        return false;
                    }
-                    splitStr(line, tmp, " ");
+                    split(line, tmp, " ");
                    if(tmp.size() != STATUS_SUM)
                    {
                        LogError("trans_p illegal");
@ -264,7 +271,7 @@ namespace CppJieba
                    return false;
                }

-                LogInfo("loadModel [%s] end.", filePath);
+                LogDebug("loadModel [%s] end.", filePath);

                return true;
            }
@ -277,7 +284,7 @@ namespace CppJieba
                    {
                        continue;
                    }
-                    if(strStartsWith(line, "#"))
+                    if(startsWith(line, "#"))
                    {
                        continue;
                    }
@ -293,10 +300,10 @@ namespace CppJieba
                }
                vector<string> tmp, tmp2;
                uint16_t unico = 0;
-                splitStr(line, tmp, ",");
+                split(line, tmp, ",");
                for(uint i = 0; i < tmp.size(); i++)
                {
-                    splitStr(tmp[i], tmp2, ":");
+                    split(tmp[i], tmp2, ":");
                    if(2 != tmp2.size())
                    {
                        LogError("_emitProb illegal.");
--- a/src/Husky/CMakeLists.txt
+++ b/src/Husky/CMakeLists.txt
@ -1,4 +1,5 @@
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

-INSTALL(FILES HttpReqInfo.hpp ServerFrame.hpp ThreadManager.hpp DESTINATION include/CppJieba/Husky)
+FILE(GLOB SRCS *.hpp)
+INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky)
--- a/src/Husky/HttpReqInfo.hpp
+++ b/src/Husky/HttpReqInfo.hpp
@ -3,7 +3,7 @@

 #include <iostream>
 #include <string>
-#include "../Limonp/logger.hpp"
+#include "Limonp/logger.hpp"

 namespace Husky
 {
@ -88,7 +88,7 @@ namespace Husky
                }
                string firstline(headerStr, lpos, rpos - lpos);
                trim(firstline);
-                if(!splitStr(firstline, buf, " ") || 3 != buf.size())
+                if(!split(firstline, buf, " ") || 3 != buf.size())
                {
                    LogFatal("parse header first line failed.");
                    return false;
--- a/src/Husky/HuskyServer.hpp
+++ b/src/Husky/HuskyServer.hpp
@ -29,7 +29,7 @@ namespace Husky
    using namespace Limonp;
    typedef	int SOCKET;
    const struct timeval SOCKET_TIMEOUT = {2, 0};
-    const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: FrameServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
+    const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: HuskyServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
    const char* const RESPONSE_CHARSET_UTF8 = "UTF-8";
    const char* const RESPONSE_CHARSET_GB2312 = "GB2312";
    const char* const CLIENT_IP_K = "CLIENT_IP"; 
@ -53,13 +53,13 @@ namespace Husky
        bool * pShutdown;
    };

-    class ServerFrame//: public IWorkHandler
+    class HuskyServer
    {
        private:
            pthread_mutex_t m_pmAccept;
            bool m_bShutdown;
        public:
-            ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
+            explicit HuskyServer(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
            {
                m_bShutdown = false;
                m_nLsnPort = nPort;
@ -68,7 +68,7 @@ namespace Husky
                assert(pHandler);
                pthread_mutex_init(&m_pmAccept,NULL);
            };
-            virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);};
+            virtual ~HuskyServer(){pthread_mutex_destroy(&m_pmAccept);};
            virtual bool init()  
            {

@ -292,8 +292,6 @@ namespace Husky
            u_short  m_nThreadCount;
            SOCKET   m_lsnSock;
            IRequestHandler *m_pHandler;
-            //static bool m_bShutdown;
-            //static pthread_mutex_t m_pmAccept;

    }; 

--- a/src/Husky/ThreadManager.hpp
+++ b/src/Husky/ThreadManager.hpp
@ -5,8 +5,6 @@
 #include <vector>
 #include <map>

-#define INFINITE 0
-
 namespace Husky
 {
    using namespace std;
@ -14,48 +12,44 @@ namespace Husky
    class ThreadManager
    {
        private:
-            typedef int HANDLE;
-            typedef int DWORD;
+            typedef pthread_t HANDLE;
            typedef void *(* PThreadFunc)(void* param);
        public:
            ThreadManager(){;}
            ~ThreadManager(){}

-            unsigned int HandleCount(){return m_vecHandle.size();}
+            size_t HandleCount(){return _handles.size();}

            void clear()
            {
-                m_vecHandle.clear();
+                _handles.clear();
            }

-            HANDLE CreateThread( PThreadFunc pFunc,void *pPara)
+            int CreateThread( PThreadFunc pFunc,void *pPara)
            {	
                pthread_t pt;
-                int nErrorCode=pthread_create(&pt,NULL,pFunc,pPara);
-                if(nErrorCode!=0)
+                int nErrorCode = pthread_create(&pt,NULL,pFunc,pPara);
+                if(nErrorCode != 0)
                  return nErrorCode;
-                m_vecHandle.push_back(pt);	//加入线程列表 为WaitForMultipleObjects准备	
+                _handles.push_back(pt);
                return nErrorCode;

            }

-            //hThread (thread handler)        : 为0时为默认最后一个加入管理器的线程句柄 
-            //dwMilliseconds等待时间  : 单位毫秒，默认值无穷时间
-            //return value            : -1句柄无效，其他值 WaitForSingleObject函数的返回值
-            DWORD Wait(HANDLE hThread=0,DWORD dwMilliseconds=INFINITE )
+            int Wait(HANDLE hThread = 0)
            {
-                if( hThread==0)//最后一个加入的线程
+                if( hThread == 0)//the last handle
                {   
-                    if(!m_vecHandle.empty())
+                    if(!_handles.empty())
                    {
-                        return pthread_join(m_vecHandle.back(),NULL);
+                        return pthread_join(_handles.back(),NULL);
                    }
                    else
                      return -1;
                }
                else
                {
-                    if (find(m_vecHandle.begin(),m_vecHandle.end(),hThread)==m_vecHandle.end())//不存在此句柄
+                    if (find(_handles.begin(),_handles.end(),hThread) == _handles.end())
                    {
                        return -1;
                    }
@ -65,31 +59,26 @@ namespace Husky

            }

-
-            //等待所有线程执行完毕
-            //bWaitAll是否所有线程  : 默认值1等待所有线程,0有任何线程结束，此函数返回
-            //dwMilliseconds        : 单位毫秒，默认值无穷时间
-            //return value          : -1没有任何句柄，其他值 WaitForMultipleObjects函数的返回值
-            DWORD  WaitMultipleThread( bool bWaitAll=1,DWORD dwMilliseconds=INFINITE)
+            int WaitMultipleThread()
            {
-                if (m_vecHandle.empty())		
+                if (_handles.empty())		
                  return -1;	
                int nErrorcode;
-                for (uint i=0;i<m_vecHandle.size();++i)
+                for (uint i = 0; i < _handles.size(); i++)
                {
-                    nErrorcode=pthread_join(m_vecHandle[i], NULL); 
-                    if (nErrorcode!=0)
+                    nErrorcode = pthread_join(_handles[i], NULL); 
+                    if (nErrorcode != 0)
                      return nErrorcode;	
                }	
                return 0;
            }

        private:
-            vector<pthread_t> m_vecHandle; 
+            vector<pthread_t> _handles; 

        private:
            ThreadManager(const ThreadManager&){;}// copy forbidden
-            void operator=(const ThreadManager &){}// copy forbidden			
+            void operator = (const ThreadManager &){}// copy forbidden			
    };
 }

--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@ -0,0 +1,129 @@
+#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
+#define CPPJIEBA_KEYWORD_EXTRACTOR_H
+
+#include "MPSegment.hpp"
+#include <cmath>
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+
+namespace CppJieba
+{
+    using namespace Limonp;
+
+    //struct KeyWordInfo
+    //{
+    //    string word;
+    //    double tfidf;
+    //};
+
+    //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
+    //{
+    //    return os << keyword.word << "," << keyword.idf;
+    //}
+
+    /* Ranks the words of a text by tf-idf: per-text occurrence counts from MPSegment
+     * multiplied by per-word weights loaded from an idf file. */
+    class KeywordExtractor
+    {
+        private:
+            MPSegment _segment;
+            unordered_map<string, double> _idfMap; // word -> idf weight, filled by init()
+        protected:
+            bool _isInited;
+            bool _getInitFlag()const{return _isInited;};
+            bool _setInitFlag(bool flag){return _isInited = flag;};
+        public:
+            operator bool()const{return _getInitFlag();};
+            KeywordExtractor(){_setInitFlag(false);};
+            explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
+            ~KeywordExtractor(){};
+            /* Loads "word idf" lines from idfPath (empty/malformed lines are logged and skipped), then inits the
+             * segmenter from dictPath. Returns false if idfPath cannot be opened or the segmenter init fails. */
+            bool init(const string& dictPath, const string& idfPath)
+            {
+                ifstream ifs(idfPath.c_str());
+                if(!ifs)
+                {
+                    LogError("open %s failed.", idfPath.c_str());
+                    return false;
+                }
+                string line ;
+                vector<string> buf;
+                for(uint lineno = 0; getline(ifs, line); lineno++)
+                {
+                    buf.clear();
+                    if(line.empty())
+                    {
+                        LogError("line[%d] empty. skipped.", lineno);
+                        continue;
+                    }
+                    if(!split(line, buf, " ") || buf.size() != 2)
+                    {
+                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
+                        continue;
+                    }
+                    _idfMap[buf[0]] = atof(buf[1].c_str());
+                }
+                return _setInitFlag(_segment.init(dictPath));
+            };
+            /* Appends the words of the top topN keywords of str to keywords; false if extraction fails. */
+            bool extract(const string& str, vector<string>& keywords, uint topN) const
+            {
+                assert(_getInitFlag());
+                vector<pair<string, double> > topWords;
+                if(!extract(str, topWords, topN))
+                {
+                    return false;
+                }
+                for(uint i = 0; i < topWords.size(); i++)
+                {
+                    keywords.push_back(topWords[i].first);
+                }
+                return true;
+            }
+            /* Sets keywords to at most topN (word, tf-idf) pairs of str, best first; false if segmentation fails. */
+            bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
+            {
+                vector<string> words;
+                if(!_segment.cut(str, words))
+                {
+                    LogError("segment cut(%s) failed.", str.c_str());
+                    return false;
+                }
+                // term frequency: number of occurrences of each word
+                unordered_map<string, double> wordmap;
+                for(uint i = 0; i < words.size(); i ++)
+                {
+                    wordmap[ words[i] ] += 1.0;
+                }
+                // weight each count by the word's idf; words without an idf entry are dropped
+                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
+                {
+                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
+                    if(cit != _idfMap.end())
+                    {
+                        itr->second *= cit->second;
+                        itr ++;
+                    }
+                    else
+                    {
+                        itr = wordmap.erase(itr);
+                    }
+                }
+                // keep the min(topN, #words) highest scores, sorted in descending order
+                keywords.resize(MIN(topN, wordmap.size()));
+                partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
+                return true;
+            }
+        private:
+            /* Descending-score order. Parameters must be pair<string, double>: a pair<string, uint>
+             * signature would convert every element and truncate its tf-idf score before comparing. */
+            static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
+            {
+                return lhs.second > rhs.second;
+            }
+    };
+}
+
+#endif
+
+
--- a/src/Limonp/ArgvContext.hpp
+++ b/src/Limonp/ArgvContext.hpp
@ -22,9 +22,9 @@ namespace Limonp

                for(int i = 0; i < argc; i++)
                {
-                    if(strStartsWith(argv[i], "-"))
+                    if(startsWith(argv[i], "-"))
                    {
-                        if(i + 1 < argc && !strStartsWith(argv[i + 1], "-"))
+                        if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
                        {
                            _mpss[argv[i]] = argv[i+1];
                            i++;
--- a/src/Limonp/Config.hpp
+++ b/src/Limonp/Config.hpp
@ -18,7 +18,17 @@ namespace Limonp
    class Config
    {
        public:
-            bool loadFile(const char * const filePath)
+            Config(const char * const filePath)
+            {
+                _loadFile(filePath);
+            }
+        public:
+            operator bool ()
+            {
+                return !_map.empty();
+            }
+        private:
+            bool _loadFile(const char * const filePath)
            {
                ifstream ifs(filePath);
                if(!ifs)
@ -33,12 +43,12 @@ namespace Limonp
                {
                    lineno ++;
                    trim(line);
-                    if(line.empty() || strStartsWith(line, "#"))
+                    if(line.empty() || startsWith(line, "#"))
                    {
                        continue;
                    }
                    vecBuf.clear();
-                    if(!splitStr(line, vecBuf, "=") || 2 != vecBuf.size())
+                    if(!split(line, vecBuf, "=") || 2 != vecBuf.size())
                    {
                        LogFatal("line[%d:%s] is illegal.", lineno, line.c_str());
                        return false;
@ -57,6 +67,7 @@ namespace Limonp
                ifs.close();
                return true;
            }
+        public:
            bool get(const string& key, string& value) const
            {
                map<string, string>::const_iterator it = _map.find(key);
@ -73,7 +84,7 @@ namespace Limonp
            friend ostream& operator << (ostream& os, const Config& config);
    };
    
-    ostream& operator << (ostream& os, const Config& config)
+    inline ostream& operator << (ostream& os, const Config& config)
    {
        return os << config._map;
    }
--- a/src/Limonp/logger.hpp
+++ b/src/Limonp/logger.hpp
@ -11,6 +11,7 @@
 #include <string>
 #include <stdio.h>
 #include <stdarg.h>
+#include <cassert>
 #include "io_functs.hpp"
 #include "str_functs.hpp"

@ -23,6 +24,7 @@
 #define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)


+
 namespace Limonp
 {
    using namespace std;
@ -36,16 +38,11 @@ namespace Limonp
        public:
            static bool Logging(uint level, const string& msg, const char* fileName, int lineNo)
            {
-                if(level > LL_FATAL)
-                {
-                    cerr<<"level's value is out of range"<<endl;
-                    return false;
-                }
+                assert(level <= LL_FATAL);
                char buf[CSTR_BUFFER_SIZE];
                time_t timeNow;
                time(&timeNow);
-                size_t ret = strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow));
-                if(0 == ret)
+                if(!strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)))
                {
                    fprintf(stderr, "stftime failed.\n");
                    return false;
@ -55,6 +52,9 @@ namespace Limonp
            }
            static bool LoggingF(uint level, const char* fileName, int lineNo, const string& fmt, ...)
            {
+#ifdef LOGGER_LEVEL
+                if(level < LOGGER_LEVEL) return true;
+#endif
                int size = 256;
                string msg;
                va_list ap;
--- a/src/Limonp/md5.hpp
+++ b/src/Limonp/md5.hpp
@ -34,7 +34,7 @@
 namespace Limonp 
 {

-#pragma region MD5 defines
+//#pragma region MD5 defines
 // Constants for MD5Transform routine.
 #define S11 7
 #define S12 12
@ -85,7 +85,7 @@ namespace Limonp
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
-#pragma endregion
+//#pragma endregion


 typedef unsigned char BYTE ;
@ -115,7 +115,7 @@ private:
    unsigned char buffer[64];                         /* input buffer */
  } context ;

-  #pragma region static helper functions
+  //#pragma region static helper functions
  // The core of the MD5 algorithm is here.
  // MD5 basic transformation. Transforms state based on block.
  static void MD5Transform( UINT4 state[4], unsigned char block[64] )
@ -229,7 +229,7 @@ private:
      output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
      (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
  }
-  #pragma endregion
+  //#pragma endregion


 public:
@ -354,7 +354,7 @@ public:
    }
    else
    {
-      while( len = fread( buffer, 1, 1024, file ) )
+      while( (len = fread( buffer, 1, 1024, file )) )
        Update( buffer, len ) ;
      Final();

--- a/src/Limonp/str_functs.hpp
+++ b/src/Limonp/str_functs.hpp
@ -72,28 +72,6 @@ namespace Limonp
        }
    }

-    //inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
-    //{
-    //    if(src.empty())
-    //    {
-    //        return false;
-    //    }
-    //    for(uint i = 0; i < src.size() - 1; i++)
-    //    {
-    //        dest += src[i];
-    //        dest += connectorStr;
-    //    }
-    //    dest += src[src.size() - 1];
-    //    return true;
-    //}
-
-    //inline string joinStr(const vector<string>& source, const string& connector)
-    //{
-    //    string res;
-    //    joinStr(source, res, connector);
-    //    return res;
-    //}
-
    template<class T>
        void join(T begin, T end, string& res, const string& connector)
        {
@ -122,7 +100,7 @@ namespace Limonp



-    inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
+    inline bool split(const string& src, vector<string>& res, const string& pattern)
    {
        if(src.empty())
        {
@ -181,20 +159,9 @@ namespace Limonp
    }


-    inline uint16_t twocharToUint16(char high, char low)
-    {
-        return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
-    }

-    inline pair<char, char> uint16ToChar2(uint16_t in)
-    {
-        pair<char, char> res;
-        res.first = (in>>8) & 0x00ff; //high
-        res.second = (in) & 0x00ff; //low
-        return res;
-    }

-    inline bool strStartsWith(const string& str, const string& prefix)
+    inline bool startsWith(const string& str, const string& prefix)
    {
        //return str.substr(0, prefix.size()) == prefix;
        if(prefix.length() > str.length())
@ -204,7 +171,7 @@ namespace Limonp
        return 0 == str.compare(0, prefix.length(), prefix);
    }

-    inline bool strEndsWith(const string& str, const string& suffix)
+    inline bool endsWith(const string& str, const string& suffix)
    {
        if(suffix.length() > str.length())
        {
@ -218,13 +185,19 @@ namespace Limonp
        return str.find(ch) != string::npos;
    }

+    inline uint16_t twocharToUint16(char high, char low) // packs two bytes into one value: high -> bits 15..8, low -> bits 7..0
+    {
+        return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); // the 0x00ff masks discard sign-extension bits when char is signed and negative
+    }
+
    inline bool utf8ToUnicode(const char * const str, uint len, vector<uint16_t>& vec)
    {
-        char ch1, ch2;
        if(!str)
        {
            return false;
        }
+        char ch1, ch2;
+        uint16_t tmp;
        vec.clear();
        for(uint i = 0;i < len;)
        {
@ -237,14 +210,16 @@ namespace Limonp
            {
                ch1 = (str[i] >> 2) & 0x07;
                ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
-                vec.push_back(twocharToUint16(ch1, ch2));
+                tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+                vec.push_back(tmp);
                i += 2;
            }
            else if((unsigned char)str[i] <= 0xef && i + 2 < len)
            {
                ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
                ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); 
-                vec.push_back(twocharToUint16(ch1, ch2));
+                tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+                vec.push_back(tmp);
                i += 3;
            }
            else
@ -310,7 +285,8 @@ namespace Limonp
            {
                if(i + 1 < len) //&& (str[i+1] & 0x80))
                {
-                    vec.push_back(twocharToUint16(str[i], str[i + 1]));
+                    uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
+                    vec.push_back(tmp);
                    i += 2;
                }
                else
@ -321,11 +297,20 @@ namespace Limonp
        }
        return true;
    }
+
    inline bool gbkTrans(const string& str, vector<uint16_t>& vec)
    {
        return gbkTrans(str.c_str(), str.size(), vec);
    }

+    //inline pair<char, char> uint16ToChar2(uint16_t in)
+    //{
+    //    pair<char, char> res;
+    //    res.first = (in>>8) & 0x00ff; //high
+    //    res.second = (in) & 0x00ff; //low
+    //    return res;
+    //}
+
    inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
    {
        if(begin >= end)
@ -333,18 +318,21 @@ namespace Limonp
            return false;
        }
        res.clear();
-        pair<char, char> pa;
+        //pair<char, char> pa;
+        char first, second;
        while(begin != end)
        {
-            pa = uint16ToChar2(*begin);
-            if(pa.first & 0x80)
+            //pa = uint16ToChar2(*begin);
+            first = ((*begin)>>8) & 0x00ff;
+            second = (*begin) & 0x00ff;
+            if(first & 0x80)
            {
-                res += pa.first;
-                res += pa.second;
+                res += first;
+                res += second;
            }
            else
            {
-                res += pa.second;
+                res += second;
            }
            begin++;
        }
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@ -32,7 +32,7 @@ namespace CppJieba

    class MPSegment: public SegmentBase
    {
-        private:
+        protected:
            Trie* _trie;

        public:
@ -56,6 +56,7 @@ namespace CppJieba
                    LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
                    return false;
                }
+                LogInfo("MPSegment init(%s) ok", dictPath.c_str());
                return _setInitFlag(true);
            }
        public:
--- a/src/MixSegment.hpp
+++ b/src/MixSegment.hpp
@ -15,9 +15,9 @@ namespace CppJieba
            HMMSegment _hmmSeg;
        public:
            MixSegment(){_setInitFlag(false);};
-            explicit MixSegment(const string& mpSegDict, const string& hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
+            explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
            {
-                _setInitFlag(_mpSeg && _hmmSeg);
+                _setInitFlag(init(mpSegDict, hmmSegDict));
            }
            virtual ~MixSegment(){}
        public:
@ -38,6 +38,7 @@ namespace CppJieba
                    LogError("_hmmSeg init");
                    return false;
                }
+                LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
                return _setInitFlag(true);
            }
        public:
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@ -1,10 +1,11 @@
 #ifndef CPPJIEBA_SEGMENTBASE_H
 #define CPPJIEBA_SEGMENTBASE_H

-#include "ISegment.hpp"
-#include "ChineseFilter.hpp"
-#include "Limonp/str_functs.hpp"
+#include "TransCode.hpp"
 #include "Limonp/logger.hpp"
+#include "ISegment.hpp"
+#include <cassert>
+

 namespace CppJieba
 {
@ -18,9 +19,10 @@ namespace CppJieba
            bool _isInited;
            bool _getInitFlag()const{return _isInited;};
            bool _setInitFlag(bool flag){return _isInited = flag;};
-
        public:
            operator bool(){return _getInitFlag();};
+
+        public:
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
            virtual bool cut(const string& str, vector<string>& res)const
            {
@ -74,6 +76,46 @@ namespace CppJieba
                return true;
 #endif
            }
+        public:
+
+            /*
+             * Measure the leading run of same-class bytes in str[0, len).
+             * Returns 0 when the run is ASCII, 1 when it is non-ASCII, and -1 for a
+             * NULL/empty input or a two-byte (GBK build) character cut off by len.
+             * resLen receives the run length in bytes; it is always set and never exceeds len.
+             * */
+            static int filterAscii(const char* str, uint len, uint& resLen)
+            {
+                resLen = 0; // give the out-param a defined value on every path
+                if(!str || !len)
+                {
+                    return -1;
+                }
+                const unsigned char mask = 0x80; // unsigned: 0x80 does not fit a signed char
+                const int resFlag = (str[0] & mask) ? 1 : 0;
+                if(!resFlag)
+                {
+                    while(resLen < len && !(str[resLen] & mask))
+                    {
+                        resLen ++;
+                    }
+                    return resFlag;
+                }
+                while(resLen < len && (str[resLen] & mask))
+                {
+#ifdef CPPJIEBA_GBK
+                    resLen += 2; // step over a two-byte character
+#else
+                    resLen ++;
+#endif
+                }
+                if(resLen > len)
+                {
+                    resLen = len; // trailing half pair: never report bytes beyond the buffer
+                    return -1;
+                }
+                return resFlag;
+            }

    };
 }
--- a/src/TfIdfKeyWord.hpp
+++ b/src/TfIdfKeyWord.hpp
@ -1,32 +0,0 @@
-#ifndef CPPJIEBA_TFIDF_H
-#define CPPJIEBA_TFIDF_H
-
-#include "MPSegment.hpp"
-
-namespace CppJieba
-{
-    using namespace Limonp;
-
-    class TfIdfKeyWord
-    {
-        private:
-            MPSegment _segment;
-        public:
-            TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
-            ~TfIdfKeyWord(){};
-        public:
-            bool init(){return _segment.init();};
-            bool dispose(){return _segment.dispose();};
-        public:
-            bool extract(const string& str, vector<string>& words, uint topN)
-            {
-                return _segment.cut(words);
-                return true;
-            }
-            
-    };
-}
-
-#endif
-
-
--- a/src/Trie.hpp
+++ b/src/Trie.hpp
@ -130,26 +130,13 @@ namespace CppJieba
            }
            bool loadDict(const char * const filePath)
            {
-                if(!_getInitFlag())
-                {
-                    LogError("not initted.");
-                    return false;
-                }
-
-                if(!checkFileExist(filePath))
-                {
-                    LogError("cann't find fiel[%s].",filePath);
-                    return false;
-                }
-                bool res = false;
-                res = _trieInsert(filePath);
-                if(!res)
+                assert(_getInitFlag());
+                if(!_trieInsert(filePath))
                {
                    LogError("_trieInsert failed.");
                    return false;
                }
-                res = _countWeight();
-                if(!res)
+                if(!_countWeight())
                {
                    LogError("_countWeight failed.");
                    return false;
@ -339,22 +326,30 @@ namespace CppJieba
        private:
            bool _trieInsert(const char * const filePath)
            {
-                ifstream ifile(filePath);
+                ifstream ifs(filePath);
+                if(!ifs)
+                {
+                    LogError("open %s failed.", filePath);
+                    return false;
+                }
                string line;
                vector<string> vecBuf;

                TrieNodeInfo nodeInfo;
-                while(getline(ifile, line))
+                size_t lineno = 0;
+                while(getline(ifs, line))
                {
                    vecBuf.clear();
-                    splitStr(line, vecBuf, " ");
+                    lineno ++;
+                    split(line, vecBuf, " ");
                    if(3 < vecBuf.size())
                    {
-                        LogError("line[%s] illegal.", line.c_str());
+                        LogError("line[%u:%s] illegal.", lineno, line.c_str());
                        return false;
                    }
                    if(!TransCode::decode(vecBuf[0], nodeInfo.word))
                    {
+                        LogError("line[%u:%s] illegal.", lineno, line.c_str());
                        return false;
                    }
                    nodeInfo.freq = atoi(vecBuf[1].c_str());
--- a/src/TrieManager.hpp
+++ b/src/TrieManager.hpp
@ -23,15 +23,13 @@ namespace CppJieba
                    LogError("error when getting md5 for file '%s'", dictpath);
                    return NULL;
                }
-                LogInfo("md5 for file '%s': %s", dictpath, md5.c_str());

                if (_tries.find(md5) != _tries.end())
                {
-                    LogInfo("find a exits trie for md5: '%s'", md5.c_str());
                    return _tries[md5.c_str()];
                }

-                LogInfo("create a new trie for md5: '%s'", md5.c_str());
+                //LogDebug("create a new trie for md5: '%s'", md5.c_str());
                Trie* trie = NULL;
                try
                {
@ -54,15 +52,14 @@ namespace CppJieba
                    return NULL;
                }

-                LogInfo("trie->loadDict(%s) start...", dictpath);
                if (!trie->loadDict(dictpath))
                {
                    LogError("trie->loadDict(%s) failed...", dictpath);
                    return NULL;
                }
-                LogInfo("trie->loadDict end...");

                _tries[md5.c_str()] = trie;
+                LogDebug("trie->loadDict(%s)", dictpath);
                return trie;
            }

--- a/src/segment.cpp
+++ b/src/segment.cpp
@ -45,11 +45,11 @@ int main(int argc, char ** argv)
            <<"\t--modelpath\tsee example\n"
            <<"\t--maxlen\tspecify the granularity of cut used in cutQuery. \n\t\t\tIf not specified, the default is 3\n"
            <<"example:\n"
-            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
-            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutFull\n"
-            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
-            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
-            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
+            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --algorithm cutDAG\n"
+            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --algorithm cutFull\n"
+            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutHMM\n"
+            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutMix\n"
+            <<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
            <<endl;
        
        return EXIT_FAILURE;
--- a/src/server.cpp
+++ b/src/server.cpp
@ -4,7 +4,7 @@
 #include <ctype.h>
 #include <string.h>
 #include "Limonp/Config.hpp"
-#include "Husky/ServerFrame.hpp"
+#include "Husky/HuskyServer.hpp"
 #include "MPSegment.hpp"
 #include "HMMSegment.hpp"
 #include "MixSegment.hpp"
@ -38,8 +38,8 @@ bool run(int argc, char** argv)
    {
        return false;
    }
-    Config conf;
-    if(!conf.loadFile(argv[1]))
+    Config conf(argv[1]);
+    if(!conf)
    {
        return false;
    }
@ -90,7 +90,7 @@ bool run(int argc, char** argv)
    }

    ReqHandler reqHandler(dictPath, modelPath);
-    ServerFrame sf(port, threadNum, &reqHandler);
+    HuskyServer sf(port, threadNum, &reqHandler);
    return sf.init() && sf.run();
 }

--- a/test/load_test.cpp
+++ b/test/load_test.cpp
@ -22,28 +22,19 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
    for(uint i = 0; i < times; i ++)
    {
        LogInfo("times[%u]", i);
-        //ifile.seekg(0);
-        //while(getline(ifile, line))
-        //{
-        //    if(!line.empty())
-        //    {
-                res.clear();
-                seg->cut(doc, res);
-                //print(res);
-                //cout<<join(res.begin(), res.end(),"/")<<endl;
-        //    }
-        //}
+        res.clear();
+        seg->cut(doc, res);
    }
 }

 int main(int argc, char ** argv)
 {
    {
-        MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
+        MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
        if(!seg)
        {
            cout<<"seg init failed."<<endl;
-            return false;
+            return EXIT_FAILURE;
        }
        cut(&seg, "../test/testdata/weicheng.utf8");
    }
--- a/test/segment.cpp
+++ b/test/segment.cpp
@ -23,8 +23,8 @@ void cut(const ISegment * seg, const char * const filePath)
 }

 const char * const TEST_FILE = "../test/testdata/testlines.utf8";
-const char * const JIEBA_DICT_FILE = "../dicts/jieba.dict.utf8";
-const char * const HMM_DICT_FILE = "../dicts/hmm_model.utf8";
+const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
+const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";

 int main(int argc, char ** argv)
 {
--- a/test/server.cpp
+++ b/test/server.cpp
@ -8,8 +8,8 @@
 using namespace Husky;
 using namespace CppJieba;

-const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
-const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
+const char * const DEFAULT_DICTPATH = "../dict/jieba.dict.utf8";
+const char * const DEFAULT_MODELPATH = "../dict/hmm_model.utf8";

 class ServerDemo: public IRequestHandler
 {
--- a/test/servertest/load_test.py
+++ b/test/servertest/load_test.py
@ -0,0 +1,91 @@
+#!/usr/bin/python
+# coding:utf-8
+import time
+import urllib2
+import threading
+from Queue import Queue
+from time import sleep
+import sys
+
+# Performance test page
+#PERF_TEST_URL = "http://10.2.66.38/?yyid=-1&suv=1309231700203264&callback=xxxxx"
+URLS = [line for line in open("../testdata/load_test.urls", "r")]  # one entry per file line, newline included
+
+# Config: stress test
+THREAD_NUM = 10            # total number of concurrent threads
+ONE_WORKER_NUM = 500       # loop iterations per thread
+LOOP_SLEEP = 0.01      # interval between requests (seconds)
+
+# Config: simulated normal operation
+#THREAD_NUM = 10        # total number of concurrent threads
+#ONE_WORKER_NUM = 10      # loop iterations per thread
+#LOOP_SLEEP = 0        # interval between requests (seconds)
+
+
+# Error count
+ERROR_NUM = 0
+
+
+#具体的处理函数，负责处理单个任务
+def doWork(index, url):
+    t = threading.currentThread()
+    #print "["+t.name+" "+str(index)+"] "+PERF_TEST_URL
+
+    try:
+        html = urllib2.urlopen(url).read()
+    except urllib2.URLError, e:
+        print "["+t.name+" "+str(index)+"] "
+        print e
+        global ERROR_NUM
+        ERROR_NUM += 1
+
+
+# Worker thread body: makes ONE_WORKER_NUM requests, cycling through URLS and sleeping LOOP_SLEEP after each one.
+def working():
+    t = threading.currentThread()
+    print "["+t.name+"] Sub Thread Begin"
+
+    i = 0
+    while i < ONE_WORKER_NUM:
+        i += 1
+        doWork(i, URLS[i % len(URLS)])
+        sleep(LOOP_SLEEP)
+
+    print "["+t.name+"] Sub Thread End"
+
+
+def main():
+    # Start THREAD_NUM worker threads, wait for all of them, then print the task
+    # count, total time, time per request, requests per second and error count.
+    
+    t1 = time.time()
+
+    Threads = []
+
+    # Create the threads
+    for i in range(THREAD_NUM):
+        t = threading.Thread(target=working, name="T"+str(i))
+        t.setDaemon(True)
+        Threads.append(t)
+
+    for t in Threads:
+        t.start()
+
+    for t in Threads:
+        t.join()
+
+    print "main thread end"
+
+    t2 = time.time()
+    print "========================================"
+    #print "URL:", PERF_TEST_URL
+    print "任务数量:", THREAD_NUM, "*", ONE_WORKER_NUM, "=", THREAD_NUM*ONE_WORKER_NUM
+    print "总耗时(秒):", t2-t1
+    print "每次请求耗时(秒):", (t2-t1) / (THREAD_NUM*ONE_WORKER_NUM)
+    print "每秒承载请求数:", 1 / ((t2-t1) / (THREAD_NUM*ONE_WORKER_NUM))
+    print "错误数量:", ERROR_NUM
+
+
+if __name__ == "__main__": 
+    main()
+
--- a/test/servertest/run_curl.sh
+++ b/test/servertest/run_curl.sh
@ -0,0 +1,11 @@
+CURL_RES=../testdata/curl.res
+TMP=curl.res.tmp
+# Truncate rather than append: output left over from an aborted run must not reach the diff.
+curl -s "http://127.0.0.1:11200/?key=南京市长江大桥" > "$TMP"
+if diff "$TMP" "$CURL_RES" > /dev/null
+then
+    echo "ok";
+else
+    echo "failed."
+fi
+rm -f "$TMP"
--- a/test/testdata/curl.res
+++ b/test/testdata/curl.res
@ -0,0 +1 @@
+["南京市", "长江大桥"]
--- a/test/testdata/load_test.urls
+++ b/test/testdata/load_test.urls
@ -0,0 +1 @@
+http://127.0.0.1:11200/?key=南京市长江大桥
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@ -3,9 +3,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib)

 SET(GTEST_ROOT_DIR gtest-1.6.0)

+ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
 INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
 ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
-ADD_EXECUTABLE(test.run gtest_main.cc TChineseFilter.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp TMd5.cpp)
+FILE(GLOB SRCFILES *.cpp)
+ADD_EXECUTABLE(test.run ${SRCFILES})
 TARGET_LINK_LIBRARIES(gtest pthread)
 TARGET_LINK_LIBRARIES(test.run gtest pthread)

--- a/test/unittest/TFullSegment.cpp
+++ b/test/unittest/TFullSegment.cpp
@ -5,7 +5,7 @@ using namespace CppJieba;

 TEST(FullSegment, Test1)
 {
-    FullSegment segment("../dicts/jieba.dict.utf8");
+    FullSegment segment("../dict/jieba.dict.utf8");
    const char* str = "我来自北京邮电大学。。。  学号 123456";
    const char* res[] = {"我", "来自", "北京", "北京邮电", "北京邮电大学", "邮电", "邮电大学", "电大", "大学", "。", "。", "。", "  ", "学号", " 123456"};
    vector<string> words;
--- a/test/unittest/THMMSegment.cpp
+++ b/test/unittest/THMMSegment.cpp
@ -5,7 +5,7 @@ using namespace CppJieba;

 TEST(HMMSegmentTest, Test1)
 {
-    HMMSegment segment("../dicts/hmm_model.utf8");;
+    HMMSegment segment("../dict/hmm_model.utf8");;
    const char* str = "我来自北京邮电大学。。。  学号 123456";
    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "  ", "学号", " 123456"};
    //string s;
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@ -0,0 +1,56 @@
+#include "src/KeywordExtractor.hpp"
+#include "gtest/gtest.h"
+
+using namespace CppJieba;
+
+TEST(KeywordExtractorTest, Test1)
+{
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    const char* str = "我来自北京邮电大学。。。  学号 123456";
+    const char* res[] = {"北京邮电大学", "来自"}; // expected keywords, compared in this exact order
+    vector<string> words;
+    ASSERT_TRUE(extractor); // the extractor must report itself usable before extracting
+    ASSERT_TRUE(extractor.extract(str, words, 2)); // ask for the top 2 keywords
+    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+}
+
+TEST(KeywordExtractorTest, Test2)
+{
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    const char* str = "我来自北京邮电大学。。。  学号 123456";
+    const char* res[] = {"北京邮电大学", "来自"}; // still only two keywords expected
+    vector<string> words;
+    ASSERT_TRUE(extractor);
+    ASSERT_TRUE(extractor.extract(str, words, 9)); // requests 9, more than the 2 expected above
+    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+}
+
+
+TEST(KeywordExtractorTest, Test3)
+{
+    ifstream ifs("../test/testdata/weicheng.utf8");
+    ASSERT_TRUE(!!ifs);
+    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    ASSERT_TRUE(extractor); // same initialisation check as Test1/Test2
+    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+    vector<string> keywords;
+    ASSERT_TRUE(extractor.extract(str, keywords, 5)); // a failed extract should stop the test here
+    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+}
+
+TEST(KeywordExtractorTest, Test4)
+{
+    // Same input file as Test3, but through the overload that fills (string, double) pairs.
+    ifstream ifs("../test/testdata/weicheng.utf8");
+    ASSERT_TRUE(!!ifs);
+    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    ASSERT_TRUE(extractor); // same initialisation check as Test1/Test2
+    vector<pair<string,double> >  keywords;
+    // NOTE(review): result left unchecked; this overload's return type is not visible here.
+    extractor.extract(str, keywords, 5);
+    string res;
+    res << keywords; // render the pairs into one string for the comparison below
+    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
+}
--- a/test/unittest/TMPSegment.cpp
+++ b/test/unittest/TMPSegment.cpp
@ -5,7 +5,7 @@ using namespace CppJieba;

 TEST(MPSegmentTest, Test1)
 {
-    MPSegment segment("../dicts/jieba.dict.utf8");;
+    MPSegment segment("../dict/jieba.dict.utf8");;
    const char* str = "我来自北京邮电大学。。。  学号 123456";
    const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。","  ","学","号", " 123456"};
    vector<string> words;
--- a/test/unittest/TMixSegment.cpp
+++ b/test/unittest/TMixSegment.cpp
@ -5,7 +5,7 @@ using namespace CppJieba;

 TEST(MixSegmentTest, Test1)
 {
-    MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");;
+    MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
    const char* str = "我来自北京邮电大学。。。  学号 123456";
    const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。","  ","学号", " 123456"};
    vector<string> words;
--- a/test/unittest/TQuerySegment.cpp
+++ b/test/unittest/TQuerySegment.cpp
@ -5,7 +5,7 @@ using namespace CppJieba;

 TEST(QuerySegment, Test1)
 {
-    QuerySegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8", 3);
+    QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", 3);
    const char* str = "小明硕士毕业于中国科学院计算所，后在日本京都大学深造";
    const char* res[] = {"小明", "硕士", "毕业", "于", "中国", "中国科学院", "科学", "科学院", "学院", "计算所", "，", "后", "在", "日本", "日本京都大学", "京都", "京都大学", "大学", "深造"};
    vector<string> words;
--- a/test/unittest/TChineseFilter.cpp
+++ b/test/unittest/TChineseFilter.cpp
@ -1,9 +1,9 @@
-#include "src/ChineseFilter.hpp"
+#include "src/SegmentBase.hpp"
 #include "gtest/gtest.h"

 using namespace CppJieba;

-TEST(ChineseFilterTest, Test1)
+TEST(SegmentBaseTest, Test1)
 {
    const char* str = "heheh你好...hh";
    string s;
@ -16,9 +16,9 @@ TEST(ChineseFilterTest, Test1)
    uint offset = 0;
    while(offset < size)
    {
-        uint len;
+        uint len = 0;
        const char* t =  str + offset;
-        int ret = filterAscii(t, size - offset, len);
+        SegmentBase::filterAscii(t, size - offset, len);
        s.assign(t, len);
        res.push_back(s);
        //cout<<s<<","<<ret<<","<<len<<endl;
--- a/test/unittest/TTrie.cpp
+++ b/test/unittest/TTrie.cpp
@ -3,7 +3,7 @@

 using namespace CppJieba;

-static const char* const DICT_FILE = "../dicts/jieba.dict.utf8";
+static const char* const DICT_FILE = "../dict/jieba.dict.utf8";

 TEST(TrieTest, Test1)
 {
--- a/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h
+++ b/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h
@ -496,7 +496,20 @@
 #   undef _TR1_FUNCTIONAL  // Allows the user to #include
                        // <tr1/functional> if he chooses to.
 #  else
+#  if defined (__cplusplus) && __cplusplus > 199711L
+#   include <tuple>
+namespace std {
+    namespace tr1 {
+        using std::tuple;
+        using std::tuple_element;
+        using std::get;
+        using std::tuple_size;
+        using std::make_tuple;
+    }
+}
+#  else
 #   include <tr1/tuple>  // NOLINT
+#  endif
 #  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302

 # else
--- a/test/unittest/gtest_main.cpp
+++ b/test/unittest/gtest_main.cpp
				`@ -0,0 +1 @@`
				`http://127.0.0.1:11200/?key=南京市长江大桥`