From 229fcd715fd3ee844f19bc952a1e9674ba2cdfc1 Mon Sep 17 00:00:00 2001 From: wyy Date: Tue, 24 Dec 2013 19:03:52 -0800 Subject: [PATCH] add another extract function in keywordextractor.hpp and ut ok --- src/KeywordExtractor.hpp | 43 +++++++++++++++++------------ test/unittest/TKeywordExtractor.cpp | 19 ++++++++++++- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index cd6850e..240ced8 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -8,16 +8,16 @@ namespace CppJieba { using namespace Limonp; - struct KeyWordInfo - { - string word; - double idf; - }; + //struct KeyWordInfo + //{ + // string word; + // double tfidf; + //}; - inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) - { - return os << keyword.word << "," << keyword.idf; - } + //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) + //{ + // return os << keyword.word << "," << keyword.idf; + //} class KeywordExtractor { @@ -64,10 +64,24 @@ namespace CppJieba return _setInitFlag(_segment.init(dictPath)); }; public: + bool extract(const string& str, vector& keywords, uint topN) const { assert(_getInitFlag()); + vector > topWords; + if(!extract(str, topWords, topN)) + { + return false; + } + for(uint i = 0; i < topWords.size(); i++) + { + keywords.push_back(topWords[i].first); + } + return true; + } + bool extract(const string& str, vector >& keywords, uint topN) const + { vector words; if(!_segment.cut(str, words)) { @@ -95,16 +109,11 @@ namespace CppJieba } } - vector > topWords(min(topN, wordmap.size())); - partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp); - - keywords.clear(); - for(uint i = 0; i < topWords.size(); i++) - { - keywords.push_back(topWords[i].first); - } + keywords.resize(min(topN, wordmap.size())); + partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); return true; } + private: static bool _cmp(const pair& lhs, const pair& rhs) { diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 1168c87..1ba3c31 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -32,8 +32,25 @@ TEST(KeywordExtractorTest, Test3) ASSERT_TRUE(ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - vector keywords; const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; + vector keywords; extractor.extract(str, keywords, 5); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); + +} + +TEST(KeywordExtractorTest, Test4) +{ + ifstream ifs("../test/testdata/weicheng.utf8"); + ASSERT_TRUE(ifs); + string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); + //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; + vector > keywords; + extractor.extract(str, keywords, 5); + //print(keywords); + string res; + res << keywords; + ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); + }