Mirror of https://github.com/yanyiwu/cppjieba.git, last synced 2025-07-18 00:00:12 +08:00.
Merge pull request #196 from ahmadov/ahmadov/fix-ns-2
avoid implicit namespaces
This commit is contained in:
commit
338603b676
@ -1,37 +1,35 @@
|
|||||||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
|
|
||||||
#include <cmath>
|
#include <algorithm>
|
||||||
#include <set>
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
#include "MixSegment.hpp"
|
#include "MixSegment.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
|
|
||||||
using namespace limonp;
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
/*utf8*/
|
/*utf8*/
|
||||||
class KeywordExtractor {
|
class KeywordExtractor {
|
||||||
public:
|
public:
|
||||||
// One extracted keyword: the UTF-8 word itself, every byte offset at
// which it occurred in the input sentence, and its TF-IDF weight.
struct Word {
  std::string word;
  std::vector<size_t> offsets;
  double weight;
}; // struct Word
|
||||||
|
|
||||||
KeywordExtractor(const string& dictPath,
|
KeywordExtractor(const std::string& dictPath,
|
||||||
const string& hmmFilePath,
|
const std::string& hmmFilePath,
|
||||||
const string& idfPath,
|
const std::string& idfPath,
|
||||||
const string& stopWordPath,
|
const std::string& stopWordPath,
|
||||||
const string& userDict = "")
|
const std::string& userDict = "")
|
||||||
: segment_(dictPath, hmmFilePath, userDict) {
|
: segment_(dictPath, hmmFilePath, userDict) {
|
||||||
LoadIdfDict(idfPath);
|
LoadIdfDict(idfPath);
|
||||||
LoadStopWordDict(stopWordPath);
|
LoadStopWordDict(stopWordPath);
|
||||||
}
|
}
|
||||||
KeywordExtractor(const DictTrie* dictTrie,
|
KeywordExtractor(const DictTrie* dictTrie,
|
||||||
const HMMModel* model,
|
const HMMModel* model,
|
||||||
const string& idfPath,
|
const std::string& idfPath,
|
||||||
const string& stopWordPath)
|
const std::string& stopWordPath)
|
||||||
: segment_(dictTrie, model) {
|
: segment_(dictTrie, model) {
|
||||||
LoadIdfDict(idfPath);
|
LoadIdfDict(idfPath);
|
||||||
LoadStopWordDict(stopWordPath);
|
LoadStopWordDict(stopWordPath);
|
||||||
@ -39,27 +37,27 @@ class KeywordExtractor {
|
|||||||
// Nothing to release explicitly: segment_ and the lookup tables own
// their resources and clean up in their own destructors.
~KeywordExtractor() {
}
|
||||||
|
|
||||||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
void Extract(const std::string& sentence, std::vector<std::string>& keywords, size_t topN) const {
|
||||||
vector<Word> topWords;
|
std::vector<Word> topWords;
|
||||||
Extract(sentence, topWords, topN);
|
Extract(sentence, topWords, topN);
|
||||||
for (size_t i = 0; i < topWords.size(); i++) {
|
for (size_t i = 0; i < topWords.size(); i++) {
|
||||||
keywords.push_back(topWords[i].word);
|
keywords.push_back(topWords[i].word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
void Extract(const std::string& sentence, std::vector<pair<std::string, double> >& keywords, size_t topN) const {
|
||||||
vector<Word> topWords;
|
std::vector<Word> topWords;
|
||||||
Extract(sentence, topWords, topN);
|
Extract(sentence, topWords, topN);
|
||||||
for (size_t i = 0; i < topWords.size(); i++) {
|
for (size_t i = 0; i < topWords.size(); i++) {
|
||||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
keywords.push_back(pair<std::string, double>(topWords[i].word, topWords[i].weight));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
|
void Extract(const std::string& sentence, std::vector<Word>& keywords, size_t topN) const {
|
||||||
vector<string> words;
|
std::vector<std::string> words;
|
||||||
segment_.Cut(sentence, words);
|
segment_.Cut(sentence, words);
|
||||||
|
|
||||||
map<string, Word> wordmap;
|
std::map<std::string, Word> wordmap;
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
for (size_t i = 0; i < words.size(); ++i) {
|
for (size_t i = 0; i < words.size(); ++i) {
|
||||||
size_t t = offset;
|
size_t t = offset;
|
||||||
@ -77,8 +75,8 @@ class KeywordExtractor {
|
|||||||
|
|
||||||
keywords.clear();
|
keywords.clear();
|
||||||
keywords.reserve(wordmap.size());
|
keywords.reserve(wordmap.size());
|
||||||
for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
for (std::map<std::string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
std::unordered_map<std::string, double>::const_iterator cit = idfMap_.find(itr->first);
|
||||||
if (cit != idfMap_.end()) {
|
if (cit != idfMap_.end()) {
|
||||||
itr->second.weight *= cit->second;
|
itr->second.weight *= cit->second;
|
||||||
} else {
|
} else {
|
||||||
@ -88,15 +86,15 @@ class KeywordExtractor {
|
|||||||
keywords.push_back(itr->second);
|
keywords.push_back(itr->second);
|
||||||
}
|
}
|
||||||
topN = min(topN, keywords.size());
|
topN = min(topN, keywords.size());
|
||||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||||
keywords.resize(topN);
|
keywords.resize(topN);
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
void LoadIdfDict(const string& idfPath) {
|
void LoadIdfDict(const std::string& idfPath) {
|
||||||
ifstream ifs(idfPath.c_str());
|
std::ifstream ifs(idfPath.c_str());
|
||||||
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
||||||
string line ;
|
std::string line ;
|
||||||
vector<string> buf;
|
std::vector<std::string> buf;
|
||||||
double idf = 0.0;
|
double idf = 0.0;
|
||||||
double idfSum = 0.0;
|
double idfSum = 0.0;
|
||||||
size_t lineno = 0;
|
size_t lineno = 0;
|
||||||
@ -106,7 +104,7 @@ class KeywordExtractor {
|
|||||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Split(line, buf, " ");
|
limonp::Split(line, buf, " ");
|
||||||
if (buf.size() != 2) {
|
if (buf.size() != 2) {
|
||||||
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
||||||
continue;
|
continue;
|
||||||
@ -121,10 +119,10 @@ class KeywordExtractor {
|
|||||||
idfAverage_ = idfSum / lineno;
|
idfAverage_ = idfSum / lineno;
|
||||||
assert(idfAverage_ > 0.0);
|
assert(idfAverage_ > 0.0);
|
||||||
}
|
}
|
||||||
void LoadStopWordDict(const string& filePath) {
|
void LoadStopWordDict(const std::string& filePath) {
|
||||||
ifstream ifs(filePath.c_str());
|
std::ifstream ifs(filePath.c_str());
|
||||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||||
string line ;
|
std::string line ;
|
||||||
while (getline(ifs, line)) {
|
while (getline(ifs, line)) {
|
||||||
stopWords_.insert(line);
|
stopWords_.insert(line);
|
||||||
}
|
}
|
||||||
@ -136,18 +134,16 @@ class KeywordExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
MixSegment segment_;
|
MixSegment segment_;
|
||||||
unordered_map<string, double> idfMap_;
|
std::unordered_map<std::string, double> idfMap_;
|
||||||
double idfAverage_;
|
double idfAverage_;
|
||||||
|
|
||||||
unordered_set<string> stopWords_;
|
std::unordered_set<std::string> stopWords_;
|
||||||
}; // class KeywordExtractor
|
}; // class KeywordExtractor
|
||||||
|
|
||||||
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
|
inline std::ostream& operator << (std::ostream& os, const KeywordExtractor::Word& word) {
|
||||||
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
|
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user