rename and finish KeywordExtractor.hpp

This commit is contained in:
wyy 2013-12-23 19:22:59 -08:00
parent 657aee0fda
commit 24a15cd128
7 changed files with 89 additions and 35 deletions

View File

@ -7,7 +7,7 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp)

65
src/KeywordExtractor.hpp Normal file
View File

@ -0,0 +1,65 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
/**
 * Frequency-based keyword extractor.
 *
 * Text is cut into words by an MPSegment member; the words that occur most
 * often are reported as keywords. An instance must be initialized with a
 * dictionary (via the one-argument constructor or init()) before extract()
 * is called; the object converts to bool to report whether that succeeded.
 */
class KeywordExtractor
{
    private:
        MPSegment _segment;
    protected:
        // True once _segment.init() has succeeded.
        bool _isInited;
        bool _getInitFlag()const{return _isInited;}
        bool _setInitFlag(bool flag){return _isInited = flag;}
    public:
        // Const-qualified so the state can also be queried on const objects.
        operator bool()const{return _getInitFlag();}
    public:
        // Creates an uninitialized extractor; call init() before extract().
        KeywordExtractor(): _isInited(false){}
        // Creates an extractor and initializes it from dictPath; test the
        // object as a bool to find out whether initialization succeeded.
        explicit KeywordExtractor(const string& dictPath): _isInited(false){init(dictPath);}
        ~KeywordExtractor(){}
    public:
        // Initializes the underlying segmenter with dictPath, records the
        // outcome in the init flag and returns it.
        bool init(const string& dictPath){return _setInitFlag(_segment.init(dictPath));}
    public:
        /**
         * Collects the most frequent words of str.
         *
         * @param str       text to analyse.
         * @param keywords  output; on success it is replaced by at most topN
         *                  words, most frequent first. Left untouched on failure.
         * @param topN      maximum number of keywords wanted.
         * @return false if the segmenter fails to cut str, true otherwise.
         */
        bool extract(const string& str, vector<string>& keywords, uint topN)
        {
            assert(_getInitFlag());
            vector<string> words;
            if(!_segment.cut(str, words))
            {
                LogError("segment cut(%s) failed.", str.c_str());
                return false;
            }
            unordered_map<string, uint> wordcnt;
            for(size_t i = 0; i < words.size(); i++)
            {
                wordcnt[ words[i] ]++;
            }
            // Never ask for more slots than there are distinct words: the
            // surplus slots would stay default-constructed and leak out as
            // empty-string keywords.
            const size_t resultSize = topN < wordcnt.size() ? topN : wordcnt.size();
            vector<pair<string, uint> > topWords(resultSize);
            partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);
            keywords.clear();
            keywords.reserve(topWords.size());
            for(size_t i = 0; i < topWords.size(); i++)
            {
                keywords.push_back(topWords[i].first);
            }
            return true;
        }
    private:
        // Orders by descending count. Equal counts fall back to ascending
        // word order so the result does not depend on hash-map iteration order.
        static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
        {
            if(lhs.second != rhs.second)
            {
                return lhs.second > rhs.second;
            }
            return lhs.first < rhs.first;
        }
};
}
#endif

View File

@ -19,9 +19,10 @@ namespace CppJieba
// Initialization flag; read and written only through the two helpers below.
bool _isInited;
// Returns the current value of the initialization flag.
bool _getInitFlag()const{return _isInited;};
// Stores flag as the initialization flag and returns the stored value.
bool _setInitFlag(bool flag){return _isInited = flag;};
public:
// Implicit conversion to bool: yields the initialization flag.
operator bool(){return _getInitFlag();};
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const
{

View File

@ -1,32 +0,0 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
// Thin wrapper around an MPSegment built from a dictionary file.
// Every method forwards to the wrapped segmenter.
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
// Constructs the wrapped segmenter from dictFile.
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
// Forwards to the segmenter's init() and returns its result.
bool init(){return _segment.init();};
// Forwards to the segmenter's dispose() and returns its result.
bool dispose(){return _segment.dispose();};
public:
// Returns the result of _segment.cut(words).
// NOTE(review): str and topN are never used in this body, and the
// trailing `return true;` is unreachable.
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
return true;
}
};
}
#endif

View File

@ -5,7 +5,8 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp)
FILE(GLOB SRCFILES *.cpp)
ADD_EXECUTABLE(test.run ${SRCFILES})
TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,19 @@
#include "src/KeywordExtractor.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Verifies that an extractor built from the bundled dictionary returns
// exactly the requested number of keywords and that none of them is empty.
// NOTE(review): assumes the segmenter yields at least two distinct tokens
// for this text - confirm against the dictionary in use.
TEST(KeywordExtractorTest, Test1)
{
    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    const char* str = "我来自北京邮电大学。。。 学号 123456";
    const size_t topN = 2;
    vector<string> words;
    ASSERT_TRUE(extractor);
    ASSERT_TRUE(extractor.extract(str, words, topN));
    // The previous expectation compared against a 10-element token list
    // although only topN keywords were requested, so it could never pass.
    ASSERT_EQ(topN, words.size());
    for(size_t i = 0; i < words.size(); i++)
    {
        ASSERT_FALSE(words[i].empty());
    }
}