diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 94bb87f..9cc83df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,13 +1,15 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) +INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src) + ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjserver server.cpp) TARGET_LINK_LIBRARIES(cjserver pthread) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) -INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) +INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba) ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Limonp) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index c2782af..4ad039b 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -215,7 +215,7 @@ namespace CppJieba { return false; } - splitStr(line, tmp, " "); + split(line, tmp, " "); if(tmp.size() != STATUS_SUM) { LogError("start_p illegal"); @@ -234,7 +234,7 @@ namespace CppJieba { return false; } - splitStr(line, tmp, " "); + split(line, tmp, " "); if(tmp.size() != STATUS_SUM) { LogError("trans_p illegal"); @@ -284,7 +284,7 @@ namespace CppJieba { continue; } - if(strStartsWith(line, "#")) + if(startsWith(line, "#")) { continue; } @@ -300,10 +300,10 @@ namespace CppJieba } vector tmp, tmp2; uint16_t unico = 0; - splitStr(line, tmp, ","); + split(line, tmp, ","); for(uint i = 0; i < tmp.size(); i++) { - splitStr(tmp[i], tmp2, ":"); + split(tmp[i], tmp2, ":"); if(2 != tmp2.size()) { LogError("_emitProb illegal."); diff --git a/src/Husky/HttpReqInfo.hpp b/src/Husky/HttpReqInfo.hpp index c0fa92a..2d52b73 100644 --- a/src/Husky/HttpReqInfo.hpp +++ b/src/Husky/HttpReqInfo.hpp @@ -3,7 +3,7 @@ #include #include -#include "../Limonp/logger.hpp" +#include "Limonp/logger.hpp" namespace Husky { @@ -88,7 +88,7 @@ namespace Husky } string firstline(headerStr, lpos, rpos - lpos); trim(firstline); - if(!splitStr(firstline, buf, " ") || 3 != buf.size()) + if(!split(firstline, buf, " ") || 3 != buf.size()) { LogFatal("parse header first line failed."); return false; diff --git a/src/Husky/ServerFrame.hpp b/src/Husky/HuskyServer.hpp similarity index 96% rename from src/Husky/ServerFrame.hpp rename to src/Husky/HuskyServer.hpp index 114c14c..9df6f81 100644 --- a/src/Husky/ServerFrame.hpp +++ b/src/Husky/HuskyServer.hpp @@ -29,7 +29,7 @@ namespace Husky using namespace Limonp; typedef int SOCKET; const struct timeval SOCKET_TIMEOUT = {2, 0}; - const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: FrameServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n"; + const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: HuskyServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n"; const char* const RESPONSE_CHARSET_UTF8 = "UTF-8"; const char* const RESPONSE_CHARSET_GB2312 = "GB2312"; const char* const CLIENT_IP_K = "CLIENT_IP"; @@ -53,13 +53,13 @@ namespace Husky bool * pShutdown; }; - class ServerFrame//: public IWorkHandler + class HuskyServer { private: pthread_mutex_t m_pmAccept; bool m_bShutdown; public: - ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler) + explicit HuskyServer(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler) { m_bShutdown = false; m_nLsnPort = nPort; @@ -68,7 +68,7 @@ namespace Husky assert(pHandler); pthread_mutex_init(&m_pmAccept,NULL); }; - virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);}; + virtual ~HuskyServer(){pthread_mutex_destroy(&m_pmAccept);}; virtual bool init() { @@ -292,8 +292,6 @@ namespace Husky u_short m_nThreadCount; SOCKET m_lsnSock; IRequestHandler *m_pHandler; - //static bool m_bShutdown; - //static pthread_mutex_t m_pmAccept; }; diff --git a/src/Husky/ThreadManager.hpp b/src/Husky/ThreadManager.hpp index b3fce5f..147b23c 100644 --- a/src/Husky/ThreadManager.hpp +++ b/src/Husky/ThreadManager.hpp @@ -5,8 +5,6 @@ #include #include -#define INFINITE 0 - namespace Husky { using namespace std; @@ -15,47 +13,43 @@ namespace Husky { private: typedef int HANDLE; - typedef int DWORD; typedef void *(* PThreadFunc)(void* param); public: ThreadManager(){;} ~ThreadManager(){} - unsigned int HandleCount(){return m_vecHandle.size();} + size_t HandleCount(){return _handles.size();} void clear() { - m_vecHandle.clear(); + _handles.clear(); } HANDLE CreateThread( PThreadFunc pFunc,void *pPara) { pthread_t pt; - int nErrorCode=pthread_create(&pt,NULL,pFunc,pPara); - if(nErrorCode!=0) + int nErrorCode = pthread_create(&pt,NULL,pFunc,pPara); + if(nErrorCode != 0) return nErrorCode; - m_vecHandle.push_back(pt); //加入线程列表 为WaitForMultipleObjects准备 + _handles.push_back(pt); return nErrorCode; } - //hThread (thread handler) : 为0时为默认最后一个加入管理器的线程句柄 - //dwMilliseconds等待时间 : 单位毫秒,默认值无穷时间 - //return value : -1句柄无效,其他值 WaitForSingleObject函数的返回值 - DWORD Wait(HANDLE hThread=0,DWORD dwMilliseconds=INFINITE ) + int Wait(HANDLE hThread = 0) { - if( hThread==0)//最后一个加入的线程 + if( hThread == 0)//the last handle { - if(!m_vecHandle.empty()) + if(!_handles.empty()) { - return pthread_join(m_vecHandle.back(),NULL); + return pthread_join(_handles.back(),NULL); } else return -1; } else { - if (find(m_vecHandle.begin(),m_vecHandle.end(),hThread)==m_vecHandle.end())//不存在此句柄 + if (find(_handles.begin(),_handles.end(),hThread) == _handles.end()) { return -1; } @@ -65,31 +59,26 @@ namespace Husky } - - //等待所有线程执行完毕 - //bWaitAll是否所有线程 : 默认值1等待所有线程,0有任何线程结束,此函数返回 - //dwMilliseconds : 单位毫秒,默认值无穷时间 - //return value : -1没有任何句柄,其他值 WaitForMultipleObjects函数的返回值 - DWORD WaitMultipleThread( bool bWaitAll=1,DWORD dwMilliseconds=INFINITE) + int WaitMultipleThread() { - if (m_vecHandle.empty()) + if (_handles.empty()) return -1; int nErrorcode; - for (uint i=0;i m_vecHandle; + vector _handles; private: ThreadManager(const ThreadManager&){;}// copy forbidden - void operator=(const ThreadManager &){}// copy forbidden + void operator = (const ThreadManager &){}// copy forbidden }; } diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp new file mode 100644 index 0000000..c4917bd --- /dev/null +++ b/src/KeywordExtractor.hpp @@ -0,0 +1,146 @@ +#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H +#define CPPJIEBA_KEYWORD_EXTRACTOR_H + +#include "MPSegment.hpp" +#include + +namespace CppJieba +{ + using namespace Limonp; + + struct KeyWordInfo + { + string word; + uint freq; + double idf; + }; + + inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) + { + return os << keyword.word << "," << keyword.freq << "," << keyword.idf; + } + + class KeywordExtractor + { + private: + MPSegment _segment; + private: + unordered_map _wordIndex; + vector _wordinfos; + size_t _totalFreq; + protected: + bool _isInited; + bool _getInitFlag()const{return _isInited;}; + bool _setInitFlag(bool flag){return _isInited = flag;}; + public: + operator bool(){return _getInitFlag();}; + public: + KeywordExtractor(){_setInitFlag(false);}; + explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));}; + ~KeywordExtractor(){}; + public: + bool init(const string& dictPath) + { + ifstream ifs(dictPath.c_str()); + if(!ifs) + { + LogError("open %s failed.", dictPath.c_str()); + return false; + } + _totalFreq = 0; + int tfreq; + string line ; + vector buf; + KeyWordInfo keywordInfo; + for(uint lineno = 0; getline(ifs, line); lineno++) + { + buf.clear(); + if(line.empty()) + { + LogError("line[%d] empty. skipped.", lineno); + continue; + } + if(!split(line, buf, " ") || buf.size() != 3) + { + LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); + continue; + } + keywordInfo.word = buf[0]; + tfreq= atoi(buf[1].c_str()); + if(tfreq <= 0) + { + LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); + continue; + } + keywordInfo.freq = tfreq; + _totalFreq += tfreq; + _wordinfos.push_back(keywordInfo); + } + + // calculate idf & make index. + for(uint i = 0; i < _wordinfos.size(); i++) + { + if(_wordinfos[i].freq <= 0) + { + LogFatal("freq value is not positive."); + return false; + } + _wordinfos[i].idf = -log(_wordinfos[i].freq); + _wordIndex[_wordinfos[i].word] = &(_wordinfos[i]); + } + return _setInitFlag(_segment.init(dictPath)); + }; + public: + bool extract(const string& str, vector& keywords, uint topN) const + { + assert(_getInitFlag()); + + vector words; + if(!_segment.cut(str, words)) + { + LogError("segment cut(%s) failed.", str.c_str()); + return false; + } + + unordered_map wordmap; + for(uint i = 0; i < words.size(); i ++) + { + wordmap[ words[i] ] += 1.0; + } + + for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end();) + { + unordered_map::const_iterator cit = _wordIndex.find(itr->first); + if(cit != _wordIndex.end()) + { + itr->second *= cit->second->idf; + itr ++; + } + else + { + itr = wordmap.erase(itr); + } + } + + vector > topWords(min(topN, wordmap.size())); + partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp); + + keywords.clear(); + for(uint i = 0; i < topWords.size(); i++) + { + keywords.push_back(topWords[i].first); + } + return true; + } + private: + static bool _cmp(const pair& lhs, const pair& rhs) + { + return lhs.second > rhs.second; + } + + }; +} + +#endif + + diff --git a/src/Limonp/ArgvContext.hpp b/src/Limonp/ArgvContext.hpp index 4838a81..dba3997 100644 --- a/src/Limonp/ArgvContext.hpp +++ b/src/Limonp/ArgvContext.hpp @@ -22,9 +22,9 @@ namespace Limonp for(int i = 0; i < argc; i++) { - if(strStartsWith(argv[i], "-")) + if(startsWith(argv[i], "-")) { - if(i + 1 < argc && !strStartsWith(argv[i + 1], "-")) + if(i + 1 < argc && !startsWith(argv[i + 1], "-")) { _mpss[argv[i]] = argv[i+1]; i++; diff --git a/src/Limonp/Config.hpp b/src/Limonp/Config.hpp index f107b9e..5f8de9d 100644 --- a/src/Limonp/Config.hpp +++ b/src/Limonp/Config.hpp @@ -18,7 +18,17 @@ namespace Limonp class Config { public: - bool loadFile(const char * const filePath) + Config(const char * const filePath) + { + _loadFile(filePath); + } + public: + operator bool () + { + return !_map.empty(); + } + private: + bool _loadFile(const char * const filePath) { ifstream ifs(filePath); if(!ifs) @@ -33,12 +43,12 @@ namespace Limonp { lineno ++; trim(line); - if(line.empty() || strStartsWith(line, "#")) + if(line.empty() || startsWith(line, "#")) { continue; } vecBuf.clear(); - if(!splitStr(line, vecBuf, "=") || 2 != vecBuf.size()) + if(!split(line, vecBuf, "=") || 2 != vecBuf.size()) { LogFatal("line[%d:%s] is illegal.", lineno, line.c_str()); return false; @@ -57,6 +67,7 @@ namespace Limonp ifs.close(); return true; } + public: bool get(const string& key, string& value) const { map::const_iterator it = _map.find(key); @@ -73,7 +84,7 @@ namespace Limonp friend ostream& operator << (ostream& os, const Config& config); }; - ostream& operator << (ostream& os, const Config& config) + inline ostream& operator << (ostream& os, const Config& config) { return os << config._map; } diff --git a/src/Limonp/logger.hpp b/src/Limonp/logger.hpp index 763f26d..a6c2760 100644 --- a/src/Limonp/logger.hpp +++ b/src/Limonp/logger.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "io_functs.hpp" #include "str_functs.hpp" @@ -23,6 +24,7 @@ #define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) + namespace Limonp { using namespace std; @@ -36,16 +38,11 @@ namespace Limonp public: static bool Logging(uint level, const string& msg, const char* fileName, int lineNo) { - if(level > LL_FATAL) - { - cerr<<"level's value is out of range"<& src, string& dest, const string& connectorStr) - //{ - // if(src.empty()) - // { - // return false; - // } - // for(uint i = 0; i < src.size() - 1; i++) - // { - // dest += src[i]; - // dest += connectorStr; - // } - // dest += src[src.size() - 1]; - // return true; - //} - - //inline string joinStr(const vector& source, const string& connector) - //{ - // string res; - // joinStr(source, res, connector); - // return res; - //} - template void join(T begin, T end, string& res, const string& connector) { @@ -122,7 +100,7 @@ namespace Limonp - inline bool splitStr(const string& src, vector& res, const string& pattern) + inline bool split(const string& src, vector& res, const string& pattern) { if(src.empty()) { @@ -181,20 +159,9 @@ namespace Limonp } - inline uint16_t twocharToUint16(char high, char low) - { - return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); - } - inline pair uint16ToChar2(uint16_t in) - { - pair res; - res.first = (in>>8) & 0x00ff; //high - res.second = (in) & 0x00ff; //low - return res; - } - inline bool strStartsWith(const string& str, const string& prefix) + inline bool startsWith(const string& str, const string& prefix) { //return str.substr(0, prefix.size()) == prefix; if(prefix.length() > str.length()) @@ -204,7 +171,7 @@ namespace Limonp return 0 == str.compare(0, prefix.length(), prefix); } - inline bool strEndsWith(const string& str, const string& suffix) + inline bool endsWith(const string& str, const string& suffix) { if(suffix.length() > str.length()) { @@ -218,13 +185,19 @@ namespace Limonp return str.find(ch) != string::npos; } + inline uint16_t twocharToUint16(char high, char low) + { + return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); + } + inline bool utf8ToUnicode(const char * const str, uint len, vector& vec) { - char ch1, ch2; if(!str) { return false; } + char ch1, ch2; + uint16_t tmp; vec.clear(); for(uint i = 0;i < len;) { @@ -237,14 +210,16 @@ namespace Limonp { ch1 = (str[i] >> 2) & 0x07; ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); - vec.push_back(twocharToUint16(ch1, ch2)); + tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff)); + vec.push_back(tmp); i += 2; } else if((unsigned char)str[i] <= 0xef && i + 2 < len) { ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); - vec.push_back(twocharToUint16(ch1, ch2)); + tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff)); + vec.push_back(tmp); i += 3; } else @@ -310,7 +285,8 @@ namespace Limonp { if(i + 1 < len) //&& (str[i+1] & 0x80)) { - vec.push_back(twocharToUint16(str[i], str[i + 1])); + uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff)); + vec.push_back(tmp); i += 2; } else @@ -321,11 +297,20 @@ namespace Limonp } return true; } + inline bool gbkTrans(const string& str, vector& vec) { return gbkTrans(str.c_str(), str.size(), vec); } + //inline pair uint16ToChar2(uint16_t in) + //{ + // pair res; + // res.first = (in>>8) & 0x00ff; //high + // res.second = (in) & 0x00ff; //low + // return res; + //} + inline bool gbkTrans(vector::const_iterator begin, vector::const_iterator end, string& res) { if(begin >= end) @@ -333,18 +318,21 @@ namespace Limonp return false; } res.clear(); - pair pa; + //pair pa; + char first, second; while(begin != end) { - pa = uint16ToChar2(*begin); - if(pa.first & 0x80) + //pa = uint16ToChar2(*begin); + first = ((*begin)>>8) & 0x00ff; + second = (*begin) & 0x00ff; + if(first & 0x80) { - res += pa.first; - res += pa.second; + res += first; + res += second; } else { - res += pa.second; + res += second; } begin++; } diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 9b0353a..5176c36 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -32,7 +32,7 @@ namespace CppJieba class MPSegment: public SegmentBase { - private: + protected: Trie* _trie; public: diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 740d0cb..12938cd 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -19,9 +19,10 @@ namespace CppJieba bool _isInited; bool _getInitFlag()const{return _isInited;}; bool _setInitFlag(bool flag){return _isInited = flag;}; - public: operator bool(){return _getInitFlag();}; + + public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; virtual bool cut(const string& str, vector& res)const { diff --git a/src/TfIdfKeyWord.hpp b/src/TfIdfKeyWord.hpp deleted file mode 100644 index c155af6..0000000 --- a/src/TfIdfKeyWord.hpp +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef CPPJIEBA_TFIDF_H -#define CPPJIEBA_TFIDF_H - -#include "MPSegment.hpp" - -namespace CppJieba -{ - using namespace Limonp; - - class TfIdfKeyWord - { - private: - MPSegment _segment; - public: - TfIdfKeyWord(const char* dictFile): _segment(dictFile){}; - ~TfIdfKeyWord(){}; - public: - bool init(){return _segment.init();}; - bool dispose(){return _segment.dispose();}; - public: - bool extract(const string& str, vector& words, uint topN) - { - return _segment.cut(words); - return true; - } - - }; -} - -#endif - - diff --git a/src/Trie.hpp b/src/Trie.hpp index 3b0fb45..78864e4 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -130,26 +130,13 @@ namespace CppJieba } bool loadDict(const char * const filePath) { - if(!_getInitFlag()) - { - LogError("not initted."); - return false; - } - - if(!checkFileExist(filePath)) - { - LogError("cann't find fiel[%s].",filePath); - return false; - } - bool res = false; - res = _trieInsert(filePath); - if(!res) + assert(_getInitFlag()); + if(!_trieInsert(filePath)) { LogError("_trieInsert failed."); return false; } - res = _countWeight(); - if(!res) + if(!_countWeight()) { LogError("_countWeight failed."); return false; @@ -339,15 +326,20 @@ namespace CppJieba private: bool _trieInsert(const char * const filePath) { - ifstream ifile(filePath); + ifstream ifs(filePath); + if(!ifs) + { + LogError("open %s failed.", filePath); + return false; + } string line; vector vecBuf; TrieNodeInfo nodeInfo; - while(getline(ifile, line)) + while(getline(ifs, line)) { vecBuf.clear(); - splitStr(line, vecBuf, " "); + split(line, vecBuf, " "); if(3 < vecBuf.size()) { LogError("line[%s] illegal.", line.c_str()); diff --git a/src/server.cpp b/src/server.cpp index ffda036..0d1e83b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4,7 +4,7 @@ #include #include #include "Limonp/Config.hpp" -#include "Husky/ServerFrame.hpp" +#include "Husky/HuskyServer.hpp" #include "MPSegment.hpp" #include "HMMSegment.hpp" #include "MixSegment.hpp" @@ -38,8 +38,8 @@ bool run(int argc, char** argv) { return false; } - Config conf; - if(!conf.loadFile(argv[1])) + Config conf(argv[1]); + if(!conf) { return false; } @@ -90,7 +90,7 @@ bool run(int argc, char** argv) } ReqHandler reqHandler(dictPath, modelPath); - ServerFrame sf(port, threadNum, &reqHandler); + HuskyServer sf(port, threadNum, &reqHandler); return sf.init() && sf.run(); } diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index a8040a2..56bf087 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -3,9 +3,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib) SET(GTEST_ROOT_DIR gtest-1.6.0) +ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) -ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp) +FILE(GLOB SRCFILES *.cpp) +ADD_EXECUTABLE(test.run ${SRCFILES}) TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread) diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp new file mode 100644 index 0000000..d6a3469 --- /dev/null +++ b/test/unittest/TKeywordExtractor.cpp @@ -0,0 +1,40 @@ +#include "src/KeywordExtractor.hpp" +#include "gtest/gtest.h" + +using namespace CppJieba; + +TEST(KeywordExtractorTest, Test1) +{ + KeywordExtractor extractor("../dicts/jieba.dict.utf8"); + const char* str = "我来自北京邮电大学。。。 学号 123456"; + const char* res[] = {"北京邮电大学", "来自"}; + vector words; + ASSERT_TRUE(extractor); + ASSERT_TRUE(extractor.extract(str, words, 2)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); +} + +TEST(KeywordExtractorTest, Test2) +{ + KeywordExtractor extractor("../dicts/jieba.dict.utf8"); + const char* str = "我来自北京邮电大学。。。 学号 123456"; + const char* res[] = {"北京邮电大学", "来自", "学", "号", "我"}; + vector words; + ASSERT_TRUE(extractor); + ASSERT_TRUE(extractor.extract(str, words, 9)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); +} + + +TEST(KeywordExtractorTest, Test3) +{ + ifstream ifs("../test/testdata/weicheng.utf8"); + ASSERT_TRUE(ifs); + string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); + KeywordExtractor extractor("../dicts/jieba.dict.utf8"); + vector keywords; + string res; + extractor.extract(str, keywords, 5); + res << keywords; + ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res); +} diff --git a/test/unittest/gtest_main.cc b/test/unittest/gtest_main.cpp similarity index 100% rename from test/unittest/gtest_main.cc rename to test/unittest/gtest_main.cpp