From 24a15cd1287f53d3b758524480b20d46e39d4aba Mon Sep 17 00:00:00 2001
From: wyy
Date: Mon, 23 Dec 2013 19:22:59 -0800
Subject: [PATCH] rename and finishing KeywordExtractor.hpp

---
 src/CMakeLists.txt                            |  2 +-
 src/KeywordExtractor.hpp                      | 76 +++++++++++++++++++
 src/SegmentBase.hpp                           |  3 +-
 src/TfIdfKeyWord.hpp                          | 32 --------
 test/unittest/CMakeLists.txt                  |  3 +-
 test/unittest/TKeywordExtractor.cpp           | 20 +++++
 .../{gtest_main.cc => gtest_main.cpp}         |  0
 7 files changed, 101 insertions(+), 35 deletions(-)
 create mode 100644 src/KeywordExtractor.hpp
 delete mode 100644 src/TfIdfKeyWord.hpp
 create mode 100644 test/unittest/TKeywordExtractor.cpp
 rename test/unittest/{gtest_main.cc => gtest_main.cpp} (100%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 94bb87f..1816f9c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,7 +7,7 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
 
 INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
 INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
-INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
+INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
 
 ADD_SUBDIRECTORY(Husky)
 ADD_SUBDIRECTORY(Limonp)
diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp
new file mode 100644
index 0000000..7627bb1
--- /dev/null
+++ b/src/KeywordExtractor.hpp
@@ -0,0 +1,76 @@
+#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
+#define CPPJIEBA_KEYWORD_EXTRACTOR_H
+
+#include <algorithm>
+#include <cassert>
+#include "MPSegment.hpp"
+
+namespace CppJieba
+{
+    using namespace Limonp;
+
+    // Ranks the words of a sentence by how often they occur after cutting it with MPSegment.
+    class KeywordExtractor
+    {
+        private:
+            MPSegment _segment;
+        protected:
+            bool _isInited;
+            bool _getInitFlag()const{return _isInited;};
+            bool _setInitFlag(bool flag){return _isInited = flag;};
+        public:
+            // Reflects the result of the last init() call; false before any init().
+            operator bool(){return _getInitFlag();};
+        public:
+            KeywordExtractor(){_setInitFlag(false);};
+            explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));};
+            ~KeywordExtractor(){};
+        public:
+            // Forwards dictPath to MPSegment::init and records its result as the init flag.
+            bool init(const string& dictPath){return _setInitFlag(_segment.init(dictPath));};
+        public:
+            // Cuts str into words, counts every distinct word and stores up to topN of
+            // them in keywords, most frequent first (order among equal counts is unspecified).
+            // Returns false, leaving keywords untouched, when the cut fails.
+            bool extract(const string& str, vector<string>& keywords, uint topN)
+            {
+                assert(_getInitFlag());
+
+                vector<string> words;
+                if(!_segment.cut(str, words))
+                {
+                    LogError("segment cut(%s) failed.", str.c_str());
+                    return false;
+                }
+
+                unordered_map<string, uint> wordcnt;
+                for(uint i = 0; i < words.size(); i ++)
+                {
+                    wordcnt[ words[i] ] ++;
+                }
+
+                // Reserve no more slots than there are distinct words: surplus slots
+                // would stay default-constructed and leak out as empty keywords.
+                vector<pair<string, uint> > topWords(std::min<size_t>(topN, wordcnt.size()));
+                partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);
+
+                keywords.clear();
+                for(uint i = 0; i < topWords.size(); i++)
+                {
+                    keywords.push_back(topWords[i].first);
+                }
+                return true;
+            }
+        private:
+            // Orders (word, count) pairs by descending count.
+            static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
+            {
+                return lhs.second > rhs.second;
+            }
+
+    };
+}
+
+#endif
+
+
diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp
index 740d0cb..12938cd 100644
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@@ -19,9 +19,10 @@ namespace CppJieba
             bool _isInited;
             bool _getInitFlag()const{return _isInited;};
             bool _setInitFlag(bool flag){return _isInited = flag;};
-        public:
             operator bool(){return _getInitFlag();};
 
+
+        public:
             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
             virtual bool cut(const string& str, vector<string>& res)const
             {
diff --git a/src/TfIdfKeyWord.hpp b/src/TfIdfKeyWord.hpp
deleted file mode 100644
index c155af6..0000000
--- a/src/TfIdfKeyWord.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef CPPJIEBA_TFIDF_H
-#define CPPJIEBA_TFIDF_H
-
-#include "MPSegment.hpp"
-
-namespace CppJieba
-{
-    using namespace Limonp;
-
-    class TfIdfKeyWord
-    {
-        private:
-            MPSegment _segment;
-        public:
-            TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
-            ~TfIdfKeyWord(){};
-        public:
-            bool init(){return _segment.init();};
-            bool dispose(){return _segment.dispose();};
-        public:
-            bool extract(const string& str, vector<string>& words, uint topN)
-            {
-                return _segment.cut(words);
-                return true;
-            }
-
-    };
-}
-
-#endif
-
-
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index a8040a2..f10a406 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -5,7 +5,8 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
 INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
 
 ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
-ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp)
+FILE(GLOB SRCFILES *.cpp)
+ADD_EXECUTABLE(test.run ${SRCFILES})
 TARGET_LINK_LIBRARIES(gtest pthread)
 TARGET_LINK_LIBRARIES(test.run gtest pthread)
 
diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp
new file mode 100644
index 0000000..fd331a6
--- /dev/null
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -0,0 +1,20 @@
+#include "src/KeywordExtractor.hpp"
+#include "gtest/gtest.h"
+
+using namespace CppJieba;
+
+TEST(KeywordExtractorTest, Test1)
+{
+    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
+    const char* str = "我来自北京邮电大学。。。 学号 123456";
+    vector<string> words;
+    ASSERT_TRUE(extractor);
+    ASSERT_TRUE(extractor.extract(str, words, 2));
+    // extract() returns at most topN keywords ranked by frequency, not the
+    // full segmentation of str, so check the size and the top-ranked word.
+    ASSERT_EQ(2u, words.size());
+    // Assumes the segmenter emits each of the three "。" as its own token,
+    // which makes it the only word occurring more than once -- TODO confirm.
+    ASSERT_EQ(string("。"), words[0]);
+}
+
diff --git a/test/unittest/gtest_main.cc b/test/unittest/gtest_main.cpp
similarity index 100%
rename from test/unittest/gtest_main.cc
rename to test/unittest/gtest_main.cpp