Rename and finish KeywordExtractor.hpp

This commit is contained in:
wyy 2013-12-23 19:22:59 -08:00
parent 657aee0fda
commit 24a15cd128
7 changed files with 89 additions and 35 deletions

View File

@ -7,7 +7,7 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp) ADD_SUBDIRECTORY(Limonp)

65
src/KeywordExtractor.hpp Normal file
View File

@ -0,0 +1,65 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
/**
 * Picks the most frequent words of a text as its keywords.
 *
 * The text is split into words by an MPSegment and the words are ranked by
 * their occurrence count within that text only (plain term frequency).
 *
 * An instance converts to bool; the value is the result of the last init().
 */
class KeywordExtractor
{
    private:
        MPSegment _segment;
    protected:
        bool _isInited;
        bool _getInitFlag()const{return _isInited;}
        bool _setInitFlag(bool flag){return _isInited = flag;}
    public:
        /** True once init() has succeeded. */
        operator bool()const{return _getInitFlag();}
    public:
        /** Creates an uninitialized extractor; call init() before extract(). */
        KeywordExtractor(): _isInited(false){}
        /** Creates an extractor and initializes it from the dictionary at `dictPath`. */
        explicit KeywordExtractor(const string& dictPath): _isInited(false){init(dictPath);}
        ~KeywordExtractor(){}
    public:
        /**
         * Initializes the underlying segmenter from `dictPath`.
         * @return whether the segmenter reported success; the same value is
         *         stored as the init flag.
         */
        bool init(const string& dictPath){return _setInitFlag(_segment.init(dictPath));}
    public:
        /**
         * Extracts up to `topN` keywords from `str`.
         *
         * @param str      text to analyse.
         * @param keywords output; replaced on success with the selected words,
         *                 most frequent first (ties ordered lexicographically).
         *                 Left untouched when segmentation fails.
         * @param topN     maximum number of keywords to return. Fewer are
         *                 returned when `str` has fewer distinct words.
         * @return false if segmentation failed, true otherwise.
         *
         * The extractor must have been initialized successfully (asserted).
         */
        bool extract(const string& str, vector<string>& keywords, uint topN)
        {
            assert(_getInitFlag());
            vector<string> words;
            if(!_segment.cut(str, words))
            {
                LogError("segment cut(%s) failed.", str.c_str());
                return false;
            }

            // Count how often each distinct word occurs.
            unordered_map<string, uint> wordcnt;
            for(size_t i = 0; i < words.size(); i++)
            {
                wordcnt[ words[i] ]++;
            }

            // Never allocate more result slots than there are distinct words:
            // partial_sort_copy leaves surplus slots default-constructed, which
            // would otherwise leak out as empty-string keywords.
            const size_t keep = wordcnt.size() < topN ? wordcnt.size() : topN;
            vector<pair<string, uint> > topWords(keep);
            partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);

            keywords.clear();
            keywords.reserve(topWords.size());
            for(size_t i = 0; i < topWords.size(); i++)
            {
                keywords.push_back(topWords[i].first);
            }
            return true;
        }
    private:
        /**
         * Orders (word, count) pairs by descending count. Equal counts fall
         * back to ascending word order so the result does not depend on the
         * hash map's iteration order.
         */
        static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
        {
            if(lhs.second != rhs.second)
            {
                return lhs.second > rhs.second;
            }
            return lhs.first < rhs.first;
        }
};
}
#endif

View File

@ -19,9 +19,10 @@ namespace CppJieba
bool _isInited; bool _isInited;
bool _getInitFlag()const{return _isInited;}; bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;}; bool _setInitFlag(bool flag){return _isInited = flag;};
public: public:
operator bool(){return _getInitFlag();}; operator bool(){return _getInitFlag();};
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const virtual bool cut(const string& str, vector<string>& res)const
{ {

View File

@ -1,32 +0,0 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
// Thin wrapper around an MPSegment that forwards construction, init(),
// dispose() and extract() to the segmenter it owns.
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
// Passes `dictFile` straight to the MPSegment constructor.
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
// Both calls simply return whatever the segmenter returns.
bool init(){return _segment.init();};
bool dispose(){return _segment.dispose();};
public:
// NOTE(review): `str` and `topN` are never used here, and only `words` is
// handed to the segmenter, so no keyword ranking happens in this method.
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
// Unreachable: the statement above always returns first.
return true;
}
};
}
#endif

View File

@ -5,7 +5,8 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp) FILE(GLOB SRCFILES *.cpp)
ADD_EXECUTABLE(test.run ${SRCFILES})
TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,19 @@
#include "src/KeywordExtractor.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Builds a KeywordExtractor from the jieba dictionary, checks that it reports
// itself as initialized, extracts keywords from a sample sentence and compares
// the result with the expected list `res`.
TEST(KeywordExtractorTest, Test1)
{
// Path is relative to the directory the test binary is run from.
KeywordExtractor extractor("../dicts/jieba.dict.utf8");
const char* str = "我来自北京邮电大学。。。 学号 123456";
// NOTE(review): `res` lists 10 entries (several of them empty) while extract()
// below is asked for only 2 keywords, so the final ASSERT_EQ looks like it
// cannot hold as written -- confirm the intended expected values.
const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","","", " 123456"};
vector<string> words;
// Relies on KeywordExtractor's conversion to bool.
ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(str, words, 2));
//print(words);
//exit(0);
//print(words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}