mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
rename and finish KeywordExtractor.hpp
This commit is contained in:
parent
657aee0fda
commit
24a15cd128
@ -7,7 +7,7 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
|
||||
|
||||
# Install the two executables into <prefix>/bin.
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
|
||||
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
|
||||
# Install the public headers into <prefix>/include/CppJieba.
# NOTE(review): the two INSTALL(FILES ...) lines below differ only by the added
# KeywordExtractor.hpp; they look like the before/after lines of a diff, so
# confirm that only the second one exists in the real CMakeLists.txt.
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
|
||||
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
|
||||
|
||||
# Descend into the Husky and Limonp subdirectories.
ADD_SUBDIRECTORY(Husky)
|
||||
ADD_SUBDIRECTORY(Limonp)
|
||||
|
65
src/KeywordExtractor.hpp
Normal file
65
src/KeywordExtractor.hpp
Normal file
@ -0,0 +1,65 @@
|
||||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
|
||||
#include "MPSegment.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
class KeywordExtractor
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
protected:
|
||||
bool _isInited;
|
||||
bool _getInitFlag()const{return _isInited;};
|
||||
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||
public:
|
||||
operator bool(){return _getInitFlag();};
|
||||
public:
|
||||
KeywordExtractor(){_setInitFlag(false);};
|
||||
explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));};
|
||||
~KeywordExtractor(){};
|
||||
public:
|
||||
bool init(const string& dictPath){return _setInitFlag(_segment.init(dictPath));};
|
||||
public:
|
||||
bool extract(const string& str, vector<string>& keywords, uint topN)
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
|
||||
vector<string> words;
|
||||
if(!_segment.cut(str, words))
|
||||
{
|
||||
LogError("segment cut(%s) failed.", str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
unordered_map<string, uint> wordcnt;
|
||||
for(uint i = 0; i < words.size(); i ++)
|
||||
{
|
||||
wordcnt[ words[i] ] ++;
|
||||
}
|
||||
|
||||
vector<pair<string, uint> > topWords(topN);
|
||||
partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);
|
||||
|
||||
keywords.clear();
|
||||
for(uint i = 0; i < topWords.size(); i++)
|
||||
{
|
||||
keywords.push_back(topWords[i].first);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||
{
|
||||
return lhs.second > rhs.second;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -19,9 +19,10 @@ namespace CppJieba
|
||||
bool _isInited;
|
||||
bool _getInitFlag()const{return _isInited;};
|
||||
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||
|
||||
public:
|
||||
operator bool(){return _getInitFlag();};
|
||||
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
|
||||
virtual bool cut(const string& str, vector<string>& res)const
|
||||
{
|
||||
|
@ -1,32 +0,0 @@
|
||||
#ifndef CPPJIEBA_TFIDF_H
|
||||
#define CPPJIEBA_TFIDF_H
|
||||
|
||||
#include "MPSegment.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
class TfIdfKeyWord
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
public:
|
||||
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
|
||||
~TfIdfKeyWord(){};
|
||||
public:
|
||||
bool init(){return _segment.init();};
|
||||
bool dispose(){return _segment.dispose();};
|
||||
public:
|
||||
bool extract(const string& str, vector<string>& words, uint topN)
|
||||
{
|
||||
return _segment.cut(words);
|
||||
return true;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -5,7 +5,8 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
|
||||
|
||||
# Make the bundled gtest sources/headers and the project root visible to #include.
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
||||
# Build gtest as a static library from its single amalgamated source file.
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
||||
# NOTE(review): test.run is declared twice below (explicit source list, then
# the globbed list). This looks like the before/after lines of a diff; confirm
# that only the globbed form exists in the real CMakeLists.txt.
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp)
|
||||
# Collect every *.cpp file matched by the glob into SRCFILES.
FILE(GLOB SRCFILES *.cpp)
|
||||
ADD_EXECUTABLE(test.run ${SRCFILES})
|
||||
# Both gtest and the test binary link against pthread.
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||
|
||||
|
19
test/unittest/TKeywordExtractor.cpp
Normal file
19
test/unittest/TKeywordExtractor.cpp
Normal file
@ -0,0 +1,19 @@
|
||||
#include "src/KeywordExtractor.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
// Checks that an extractor built from the bundled dictionary initialises and
// that extract() respects the requested keyword limit.
TEST(KeywordExtractorTest, Test1)
{
    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    const char* str = "我来自北京邮电大学。。。 学号 123456";
    const size_t topN = 2;
    vector<string> words;
    ASSERT_TRUE(extractor);
    ASSERT_TRUE(extractor.extract(str, words, topN));
    // extract() yields at most topN keywords, so the result can never equal
    // the full token-by-token segmentation of the sentence. Check the size
    // bound instead of comparing against every token.
    ASSERT_FALSE(words.empty());
    ASSERT_LE(words.size(), topN);
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user