mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
KeywordExtractor 支持自定义词典(可选参数)。
This commit is contained in:
parent
269bc0fd0d
commit
4e72d4a06f
@ -1,5 +1,10 @@
|
|||||||
# CppJieba ChangeLog
|
# CppJieba ChangeLog
|
||||||
|
|
||||||
|
## v2.4.5 (untagged)
|
||||||
|
|
||||||
|
1. 使得 QuerySegment 支持自定义词典(可选参数)。
|
||||||
|
2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
|
||||||
|
|
||||||
## v2.4.4
|
## v2.4.4
|
||||||
|
|
||||||
1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。
|
1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。
|
||||||
|
@ -115,7 +115,7 @@ namespace CppJieba
|
|||||||
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
|
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
|
||||||
{
|
{
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
assert(ifs);
|
assert(ifs.is_open());
|
||||||
string line;
|
string line;
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
@ -143,7 +143,7 @@ namespace CppJieba
|
|||||||
void _loadDict(const string& filePath)
|
void _loadDict(const string& filePath)
|
||||||
{
|
{
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
assert(ifs);
|
assert(ifs.is_open());
|
||||||
string line;
|
string line;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
|
|
||||||
|
@ -21,19 +21,18 @@ namespace CppJieba
|
|||||||
unordered_set<string> _stopWords;
|
unordered_set<string> _stopWords;
|
||||||
public:
|
public:
|
||||||
KeywordExtractor(){};
|
KeywordExtractor(){};
|
||||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||||
{
|
{
|
||||||
LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
|
||||||
};
|
};
|
||||||
~KeywordExtractor(){};
|
~KeywordExtractor(){};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
||||||
{
|
{
|
||||||
_loadIdfDict(idfPath);
|
_loadIdfDict(idfPath);
|
||||||
_loadStopWordDict(stopWordPath);
|
_loadStopWordDict(stopWordPath);
|
||||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||||
return true;
|
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -4,6 +4,7 @@ using namespace CppJieba;
|
|||||||
int main(int argc, char ** argv)
|
int main(int argc, char ** argv)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||||
|
//KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
|
||||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
||||||
vector<pair<string, double> > wordweights;
|
vector<pair<string, double> > wordweights;
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
|
@ -8,11 +8,49 @@ using namespace CppJieba;
|
|||||||
TEST(KeywordExtractorTest, Test1)
|
TEST(KeywordExtractorTest, Test1)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
|
||||||
string res;
|
{
|
||||||
vector<pair<string, double> > wordweights;
|
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
||||||
size_t topN = 5;
|
string res;
|
||||||
extractor.extract(s, wordweights, topN);
|
vector<pair<string, double> > wordweights;
|
||||||
res << wordweights;
|
size_t topN = 5;
|
||||||
ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
|
extractor.extract(s, wordweights, topN);
|
||||||
|
res << wordweights;
|
||||||
|
ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
string s("我毕业于蓝翔,日薪一部iPhone6");
|
||||||
|
string res;
|
||||||
|
vector<pair<string, double> > wordweights;
|
||||||
|
size_t topN = 5;
|
||||||
|
extractor.extract(s, wordweights, topN);
|
||||||
|
res << wordweights;
|
||||||
|
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(KeywordExtractorTest, Test2)
|
||||||
|
{
|
||||||
|
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
|
||||||
|
|
||||||
|
{
|
||||||
|
string s("蓝翔优秀毕业生");
|
||||||
|
string res;
|
||||||
|
vector<pair<string, double> > wordweights;
|
||||||
|
size_t topN = 5;
|
||||||
|
extractor.extract(s, wordweights, topN);
|
||||||
|
res << wordweights;
|
||||||
|
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
string s("我毕业于蓝翔,日薪一部iPhone6");
|
||||||
|
string res;
|
||||||
|
vector<pair<string, double> > wordweights;
|
||||||
|
size_t topN = 5;
|
||||||
|
extractor.extract(s, wordweights, topN);
|
||||||
|
res << wordweights;
|
||||||
|
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone6:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user