mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
rename and finishing KeywordExtractor.hpp
This commit is contained in:
parent
657aee0fda
commit
24a15cd128
@ -7,7 +7,7 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
|
|||||||
|
|
||||||
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
|
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
|
||||||
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
|
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
|
||||||
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
|
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
|
||||||
|
|
||||||
ADD_SUBDIRECTORY(Husky)
|
ADD_SUBDIRECTORY(Husky)
|
||||||
ADD_SUBDIRECTORY(Limonp)
|
ADD_SUBDIRECTORY(Limonp)
|
||||||
|
65
src/KeywordExtractor.hpp
Normal file
65
src/KeywordExtractor.hpp
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
|
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
|
|
||||||
|
#include "MPSegment.hpp"
|
||||||
|
|
||||||
|
namespace CppJieba
|
||||||
|
{
|
||||||
|
using namespace Limonp;
|
||||||
|
|
||||||
|
class KeywordExtractor
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
MPSegment _segment;
|
||||||
|
protected:
|
||||||
|
bool _isInited;
|
||||||
|
bool _getInitFlag()const{return _isInited;};
|
||||||
|
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||||
|
public:
|
||||||
|
operator bool(){return _getInitFlag();};
|
||||||
|
public:
|
||||||
|
KeywordExtractor(){_setInitFlag(false);};
|
||||||
|
explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));};
|
||||||
|
~KeywordExtractor(){};
|
||||||
|
public:
|
||||||
|
bool init(const string& dictPath){return _setInitFlag(_segment.init(dictPath));};
|
||||||
|
public:
|
||||||
|
bool extract(const string& str, vector<string>& keywords, uint topN)
|
||||||
|
{
|
||||||
|
assert(_getInitFlag());
|
||||||
|
|
||||||
|
vector<string> words;
|
||||||
|
if(!_segment.cut(str, words))
|
||||||
|
{
|
||||||
|
LogError("segment cut(%s) failed.", str.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
unordered_map<string, uint> wordcnt;
|
||||||
|
for(uint i = 0; i < words.size(); i ++)
|
||||||
|
{
|
||||||
|
wordcnt[ words[i] ] ++;
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<pair<string, uint> > topWords(topN);
|
||||||
|
partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);
|
||||||
|
|
||||||
|
keywords.clear();
|
||||||
|
for(uint i = 0; i < topWords.size(); i++)
|
||||||
|
{
|
||||||
|
keywords.push_back(topWords[i].first);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||||
|
{
|
||||||
|
return lhs.second > rhs.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
@ -19,9 +19,10 @@ namespace CppJieba
|
|||||||
bool _isInited;
|
bool _isInited;
|
||||||
bool _getInitFlag()const{return _isInited;};
|
bool _getInitFlag()const{return _isInited;};
|
||||||
bool _setInitFlag(bool flag){return _isInited = flag;};
|
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
operator bool(){return _getInitFlag();};
|
operator bool(){return _getInitFlag();};
|
||||||
|
|
||||||
|
public:
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res)const
|
virtual bool cut(const string& str, vector<string>& res)const
|
||||||
{
|
{
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
#ifndef CPPJIEBA_TFIDF_H
|
|
||||||
#define CPPJIEBA_TFIDF_H
|
|
||||||
|
|
||||||
#include "MPSegment.hpp"
|
|
||||||
|
|
||||||
namespace CppJieba
|
|
||||||
{
|
|
||||||
using namespace Limonp;
|
|
||||||
|
|
||||||
class TfIdfKeyWord
|
|
||||||
{
|
|
||||||
private:
|
|
||||||
MPSegment _segment;
|
|
||||||
public:
|
|
||||||
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
|
|
||||||
~TfIdfKeyWord(){};
|
|
||||||
public:
|
|
||||||
bool init(){return _segment.init();};
|
|
||||||
bool dispose(){return _segment.dispose();};
|
|
||||||
public:
|
|
||||||
bool extract(const string& str, vector<string>& words, uint topN)
|
|
||||||
{
|
|
||||||
return _segment.cut(words);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
@ -5,7 +5,8 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
|
|||||||
|
|
||||||
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
||||||
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
||||||
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp)
|
FILE(GLOB SRCFILES *.cpp)
|
||||||
|
ADD_EXECUTABLE(test.run ${SRCFILES})
|
||||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||||
|
|
||||||
|
19
test/unittest/TKeywordExtractor.cpp
Normal file
19
test/unittest/TKeywordExtractor.cpp
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#include "src/KeywordExtractor.hpp"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
using namespace CppJieba;
|
||||||
|
|
||||||
|
// Smoke test for KeywordExtractor: the extractor must initialize from the
// dictionary and extract() must succeed and respect the requested keyword limit.
TEST(KeywordExtractorTest, Test1)
{
    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    const char* str = "我来自北京邮电大学。。。 学号 123456";
    vector<string> words;
    ASSERT_TRUE(extractor);
    ASSERT_TRUE(extractor.extract(str, words, 2));
    // extract() returns keywords, not the full segmentation, so comparing
    // against every segment of the sentence can never succeed with topN == 2.
    // The order among equally frequent words is unspecified, so only the
    // size bound is asserted here.
    ASSERT_LE(words.size(), 2u);
}
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user