KeywordExtractor 支持自定义词典(可选参数)。

This commit is contained in:
yanyiwu 2015-01-24 15:34:34 +08:00
parent 269bc0fd0d
commit 4e72d4a06f
5 changed files with 57 additions and 14 deletions

View File

@@ -1,5 +1,10 @@
# CppJieba ChangeLog
## v2.4.5 (untagged)
1. 使得 QuerySegment 支持自定义词典(可选参数)。
2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
## v2.4.4
1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。

View File

@@ -115,7 +115,7 @@ namespace CppJieba
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
{
ifstream ifs(filePath.c_str());
assert(ifs);
assert(ifs.is_open());
string line;
DictUnit nodeInfo;
vector<string> buf;
@@ -143,7 +143,7 @@ namespace CppJieba
void _loadDict(const string& filePath)
{
ifstream ifs(filePath.c_str());
assert(ifs);
assert(ifs.is_open());
string line;
vector<string> buf;

View File

@@ -21,19 +21,18 @@ namespace CppJieba
unordered_set<string> _stopWords;
public:
KeywordExtractor(){};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{
LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
};
~KeywordExtractor(){};
public:
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
{
_loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
return true;
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
};
public:

View File

@@ -4,6 +4,7 @@ using namespace CppJieba;
int main(int argc, char ** argv)
{
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
//KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");
vector<pair<string, double> > wordweights;
vector<string> words;

View File

@@ -8,6 +8,8 @@ using namespace CppJieba;
TEST(KeywordExtractorTest, Test1)
{
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
{
string s("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");
string res;
vector<pair<string, double> > wordweights;
@@ -15,4 +17,40 @@ TEST(KeywordExtractorTest, Test1)
extractor.extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
}
{
string s("我毕业于蓝翔日薪一部iPhone6");
string res;
vector<pair<string, double> > wordweights;
size_t topN = 5;
extractor.extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
}
}
// Exercises KeywordExtractor when the optional fifth constructor argument
// (a user dictionary path) is supplied. Each case extracts up to five
// keywords from a sentence and compares their serialized form against a
// fixed expected string.
TEST(KeywordExtractorTest, Test2)
{
    KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8",
                               "../dict/hmm_model.utf8",
                               "../dict/idf.utf8",
                               "../dict/stop_words.utf8",
                               "../test/testdata/userdict.utf8");

    // Case 1: short sentence that yields only three keywords.
    {
        string sentence("蓝翔优秀毕业生");
        vector<pair<string, double> > keywords;
        size_t limit = 5;
        extractor.extract(sentence, keywords, limit);

        // Serialize the (word, weight) pairs into one string for comparison.
        string rendered;
        rendered << keywords;
        ASSERT_EQ(rendered, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
    }

    // Case 2: same sentence as the second case of Test1, which builds its
    // extractor without a user dictionary and expects "iPhone"; here the
    // expected output keeps "iPhone6" as a single keyword.
    {
        string sentence("我毕业于蓝翔日薪一部iPhone6");
        vector<pair<string, double> > keywords;
        size_t limit = 5;
        extractor.extract(sentence, keywords, limit);

        string rendered;
        rendered << keywords;
        ASSERT_EQ(rendered, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone6:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
    }
}