Mirror of https://github.com/yanyiwu/cppjieba.git (synced 2025-07-18 00:00:12 +08:00)
Add KeywordExtractor::Word and add more overloads of KeywordExtractor::Extract
parent e6a2b47b87
commit c19736995c
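
In short: keyword extraction now reports each keyword together with its byte offsets into the input and its tf-idf weight, via the new nested struct KeywordExtractor::Word, while the old string-only and pair<string, double> signatures are kept as thin wrappers around the new overload. A minimal sketch of calling all three overloads after this change — the include path and dictionary locations are assumptions based on the repository layout, not taken from this diff:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "cppjieba/KeywordExtractor.hpp"  // assumed include path

int main() {
  // Dictionary paths assumed to match the repository's dict/ directory.
  cppjieba::KeywordExtractor extractor("dict/jieba.dict.utf8",
                                       "dict/hmm_model.utf8",
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");
  std::string s = "你好世界世界而且而且";
  const size_t topN = 5;

  std::vector<std::string> words;  // keywords only
  extractor.Extract(s, words, topN);

  std::vector<std::pair<std::string, double> > weighted;  // keyword + weight
  extractor.Extract(s, weighted, topN);

  std::vector<cppjieba::KeywordExtractor::Word> full;  // keyword + offsets + weight
  extractor.Extract(s, full, topN);
  std::cout << full << std::endl;  // limonp's vector printer + the new operator<<
  return 0;
}
```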

ChangeLog.md:

@@ -4,6 +4,7 @@
 
 + Change Jieba::Locate to be static function.
 + Change the return value of KeywordExtractor::Extract from bool to void.
++ Add KeywordExtractor::Word and add more overloads of KeywordExtractor::Extract
 
 ## v4.5.3
 

KeywordExtractor.hpp:

@@ -11,6 +11,12 @@ using namespace limonp;
 /*utf8*/
 class KeywordExtractor {
  public:
+  struct Word {
+    string word;
+    vector<size_t> offsets;
+    double weight;
+  }; // struct Word
+
   KeywordExtractor(const string& dictPath,
                    const string& hmmFilePath,
                    const string& idfPath,
@@ -39,42 +45,53 @@ class KeywordExtractor {
   }
 
   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-    vector<pair<string, double> > topWords;
+    vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
-      keywords.push_back(topWords[i].first);
+      keywords.push_back(topWords[i].word);
     }
   }
 
   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+    }
+  }
+
+  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
     vector<string> words;
     segment_.Cut(sentence, words);
 
-    map<string, double> wordmap;
-    for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
-      if (IsSingleWord(*iter)) {
+    map<string, Word> wordmap;
+    size_t offset = 0;
+    for (size_t i = 0; i < words.size(); ++i) {
+      size_t t = offset;
+      offset += words[i].size();
+      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
         continue;
       }
-      wordmap[*iter] += 1.0;
+      wordmap[words[i]].offsets.push_back(t);
+      wordmap[words[i]].weight += 1.0;
     }
-
-    for (map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
-      if (stopWords_.end() != stopWords_.find(itr->first)) {
-        wordmap.erase(itr++);
-        continue;
-      }
-
-      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
-      if (cit != idfMap_.end()) {
-        itr->second *= cit->second;
-      } else {
-        itr->second *= idfAverage_;
-      }
-      itr ++;
+    if (offset != sentence.size()) {
+      XLOG(ERROR) << "words illegal";
+      return;
     }
 
     keywords.clear();
-    std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
+    keywords.reserve(wordmap.size());
+    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+      if (cit != idfMap_.end()) {
+        itr->second.weight *= cit->second;
+      } else {
+        itr->second.weight *= idfAverage_;
+      }
+      itr->second.word = itr->first;
+      keywords.push_back(itr->second);
+    }
     topN = min(topN, keywords.size());
     partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
     keywords.resize(topN);
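
Worth noting in the new vector<Word> overload: offset advances by words[i].size(), the UTF-8 byte length of the token, so Word::offsets holds byte positions into the original string, and the final offset != sentence.size() check verifies that the segmentation exactly tiles the input. A standalone sketch of that accounting — the token list is hand-picked here to stand in for MixSegment output, not produced by cppjieba:

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Assumed segmentation of the test sentence; each CJK character
  // occupies 3 bytes in UTF-8, so "你好" has size() == 6.
  const std::string sentence = "你好世界世界而且而且";
  std::vector<std::string> words;
  words.push_back("你好");
  words.push_back("世界");
  words.push_back("世界");
  words.push_back("而且");
  words.push_back("而且");

  size_t offset = 0;
  for (size_t i = 0; i < words.size(); ++i) {
    size_t t = offset;           // byte offset where words[i] begins
    offset += words[i].size();   // advance by the token's byte length
    std::cout << words[i] << " @ byte " << t << std::endl;
  }
  // "世界" begins at bytes 6 and 12, matching the unit test below.
  assert(offset == sentence.size());  // tokens must exactly tile the input
  return 0;
}
```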
@@ -127,8 +144,8 @@ class KeywordExtractor {
     return false;
   }
 
-  static bool Compare(const pair<string, double>& lhs, const pair<string, double>& rhs) {
-    return lhs.second > rhs.second;
+  static bool Compare(const Word& lhs, const Word& rhs) {
+    return lhs.weight > rhs.weight;
   }
 
   MixSegment segment_;
@@ -137,6 +154,11 @@ class KeywordExtractor {
   unordered_set<string> stopWords_;
 }; // class Jieba
 
+inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+  return os << word.word << '|' << word.offsets << '|' << word.weight;
+}
+
 } // namespace cppjieba
 
 #endif
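
The stream operator added above prints a Word as word|offsets|weight; the offsets field is rendered by limonp's generic vector printer, which quotes each element, hence the ["6", "12"]-style output expected by the tests below. A small sketch of the resulting format, with the same assumed include path as before:

```cpp
#include <iostream>
#include "cppjieba/KeywordExtractor.hpp"  // assumed include path

int main() {
  cppjieba::KeywordExtractor::Word w;
  w.word = "世界";
  w.offsets.push_back(6);
  w.offsets.push_back(12);
  w.weight = 8.73506;
  std::cout << w << std::endl;  // prints: 世界|["6", "12"]|8.73506
  return 0;
}
```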
|
@ -62,7 +62,7 @@ int main(int argc, char** argv) {
|
|||||||
STOP_WORD_PATH);
|
STOP_WORD_PATH);
|
||||||
cout << "[demo] KEYWORD" << endl;
|
cout << "[demo] KEYWORD" << endl;
|
||||||
const size_t topk = 5;
|
const size_t topk = 5;
|
||||||
vector<pair<string, double> > keywordres;
|
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
||||||
extractor.Extract(s, keywordres, topk);
|
extractor.Extract(s, keywordres, topk);
|
||||||
cout << s << endl;
|
cout << s << endl;
|
||||||
cout << keywordres << endl;
|
cout << keywordres << endl;
|
||||||
|

keyword_extractor_test.cpp:

@@ -6,24 +6,51 @@ using namespace cppjieba;
 TEST(KeywordExtractorTest, Test1) {
   KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
 
+  {
+    string s("你好世界世界而且而且");
+    string res;
+    size_t topN = 5;
+
+    {
+      vector<string> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界\", \"你好\"]");
+    }
+
+    {
+      vector<pair<string, double> > words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界:8.73506\", \"你好:7.95788\"]");
+    }
+
+    {
+      vector<KeywordExtractor::Word> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|8.73506\", \"你好|[\"0\"]|7.95788\"]");
+    }
+  }
+
   {
     string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
+    ASSERT_EQ(res, "[\"CEO|[\"93\"]|11.7392\", \"\xE5\x8D\x87\xE8\x81\x8C|[\"72\"]|10.8562\", \"\xE5\x8A\xA0\xE8\x96\xAA|[\"78\"]|10.6426\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|10.0089\", \"\xE5\xB7\x85\xE5\xB3\xB0|[\"111\"]|9.49396\"]");
   }
 
   {
     string s("一部iPhone6");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
+    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
   }
 }
@@ -33,20 +60,20 @@ TEST(KeywordExtractorTest, Test2) {
   {
     string s("蓝翔优秀毕业生");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
+    ASSERT_EQ(res, "[\"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|11.7392\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|8.13549\", \"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|6.78347\"]");
   }
 
   {
     string s("一部iPhone6");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
+    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
   }
 }
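
The rewritten expectations spell the Chinese keywords as hex escape sequences; \xE8\x93\x9D\xE7\xBF\x94, for example, is just the UTF-8 byte sequence for 蓝翔, so the expected strings are unchanged in content. Likewise the offsets ["0"], ["6"], ["12"] fall on 3-byte boundaries because each CJK character occupies three UTF-8 bytes. A quick equivalence check, assuming the source file itself is UTF-8 encoded:

```cpp
#include <cassert>
#include <string>

int main() {
  // Hex escapes versus the literal characters: identical bytes under UTF-8.
  std::string escaped("\xE8\x93\x9D\xE7\xBF\x94");
  std::string literal("蓝翔");
  assert(escaped == literal);
  return 0;
}
```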