mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
KeywordExtractor 支持自定义词典(可选参数)。
This commit is contained in:
parent
269bc0fd0d
commit
4e72d4a06f
@ -1,5 +1,10 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
## v2.4.5 (untagged)
|
||||
|
||||
1. 使得 QuerySegment 支持自定义词典(可选参数)。
|
||||
2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
|
||||
|
||||
## v2.4.4
|
||||
|
||||
1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。
|
||||
|
@ -115,7 +115,7 @@ namespace CppJieba
|
||||
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
assert(ifs.is_open());
|
||||
string line;
|
||||
DictUnit nodeInfo;
|
||||
vector<string> buf;
|
||||
@ -143,7 +143,7 @@ namespace CppJieba
|
||||
void _loadDict(const string& filePath)
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
assert(ifs.is_open());
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
|
@ -21,19 +21,18 @@ namespace CppJieba
|
||||
unordered_set<string> _stopWords;
|
||||
public:
|
||||
KeywordExtractor(){};
|
||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||
{
|
||||
LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
||||
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
|
||||
};
|
||||
~KeywordExtractor(){};
|
||||
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||
{
|
||||
_loadIdfDict(idfPath);
|
||||
_loadStopWordDict(stopWordPath);
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
||||
return true;
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||
};
|
||||
public:
|
||||
|
||||
|
@ -4,6 +4,7 @@ using namespace CppJieba;
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
//KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
|
||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
||||
vector<pair<string, double> > wordweights;
|
||||
vector<string> words;
|
||||
|
@ -8,6 +8,8 @@ using namespace CppJieba;
|
||||
TEST(KeywordExtractorTest, Test1)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
|
||||
{
|
||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
||||
string res;
|
||||
vector<pair<string, double> > wordweights;
|
||||
@ -15,4 +17,40 @@ TEST(KeywordExtractorTest, Test1)
|
||||
extractor.extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
|
||||
}
|
||||
|
||||
{
|
||||
string s("我毕业于蓝翔,日薪一部iPhone6");
|
||||
string res;
|
||||
vector<pair<string, double> > wordweights;
|
||||
size_t topN = 5;
|
||||
extractor.extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KeywordExtractorTest, Test2)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
|
||||
|
||||
{
|
||||
string s("蓝翔优秀毕业生");
|
||||
string res;
|
||||
vector<pair<string, double> > wordweights;
|
||||
size_t topN = 5;
|
||||
extractor.extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
|
||||
}
|
||||
|
||||
{
|
||||
string s("我毕业于蓝翔,日薪一部iPhone6");
|
||||
string res;
|
||||
vector<pair<string, double> > wordweights;
|
||||
size_t topN = 5;
|
||||
extractor.extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone6:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user