cppjieba/test/unittest/keyword_extractor_test.cpp
Yanyi Wu 9cd64a1694 build: enhance test configuration and path management
- Added configuration for test paths in CMake to simplify file references.
- Updated load_test.cpp and various unit tests to use defined path macros for dictionary and test data files.
- Introduced test_paths.h.in to manage directory paths consistently across tests.
2025-05-02 23:33:18 +08:00

88 lines
3.3 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "cppjieba/KeywordExtractor.hpp"
#include "gtest/gtest.h"
#include "test_paths.h"
using namespace cppjieba;
// Checks KeywordExtractor::Extract against pinned textual renderings of its
// results. The extractor is constructed from four resource files (judging by
// their names: a small dictionary, an HMM model, an IDF table and a stop-word
// list) and no fifth argument.
TEST(KeywordExtractorTest, Test1) {
  KeywordExtractor extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
                             DICT_DIR "/hmm_model.utf8",
                             DICT_DIR "/idf.utf8",
                             DICT_DIR "/stop_words.utf8");

  // One sentence, extracted into each of the three output container types.
  {
    const size_t limit = 5;
    const string sentence("你好世界世界而且而且");
    string rendered;  // shared by the three sub-cases below

    // Output as plain words.
    {
      vector<string> keywords;
      extractor.Extract(sentence, keywords, limit);
      rendered << keywords;
      ASSERT_EQ(rendered, "[\"世界\", \"你好\"]");
    }

    // Output as (word, weight) pairs.
    {
      vector<pair<string, double> > weighted;
      extractor.Extract(sentence, weighted, limit);
      rendered << weighted;
      ASSERT_EQ(rendered, "[世界:8.73506, 你好:7.95788]");
    }

    // Output as full Word records (word, offsets, weight).
    {
      vector<KeywordExtractor::Word> records;
      extractor.Extract(sentence, records, limit);
      rendered << records;
      ASSERT_EQ(rendered, "[{\"word\": \"\xE4\xB8\x96\xE7\x95\x8C\", \"offset\": [6, 12], \"weight\": 8.73506}, {\"word\": \"\xE4\xBD\xA0\xE5\xA5\xBD\", \"offset\": [0], \"weight\": 7.95788}]");
    }
  }

  // Longer sentence mixing Chinese text with the ASCII token "CEO"; at most
  // five keywords are requested and five are expected.
  {
    const size_t limit = 5;
    const string sentence("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。");
    vector<KeywordExtractor::Word> records;
    string rendered;
    extractor.Extract(sentence, records, limit);
    rendered << records;
    ASSERT_EQ(rendered, "[{\"word\": \"CEO\", \"offset\": [93], \"weight\": 11.7392}, {\"word\": \"\xE5\x8D\x87\xE8\x81\x8C\", \"offset\": [72], \"weight\": 10.8562}, {\"word\": \"\xE5\x8A\xA0\xE8\x96\xAA\", \"offset\": [78], \"weight\": 10.6426}, {\"word\": \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA\", \"offset\": [21], \"weight\": 10.0089}, {\"word\": \"\xE5\xB7\x85\xE5\xB3\xB0\", \"offset\": [111], \"weight\": 9.49396}]");
  }

  // Short input with an alphanumeric token ("iPhone6") after Chinese text.
  {
    const size_t limit = 5;
    const string sentence("一部iPhone6");
    vector<KeywordExtractor::Word> records;
    string rendered;
    extractor.Extract(sentence, records, limit);
    rendered << records;
    ASSERT_EQ(rendered, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
  }
}
// Checks KeywordExtractor::Extract when the extractor is additionally given a
// fifth path, userdict.utf8 (presumably a user dictionary, going by the name).
TEST(KeywordExtractorTest, Test2) {
  KeywordExtractor extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
                             DICT_DIR "/hmm_model.utf8",
                             DICT_DIR "/idf.utf8",
                             DICT_DIR "/stop_words.utf8",
                             TEST_DATA_DIR "/userdict.utf8");

  // Three keywords are expected back even though up to five are requested.
  {
    const size_t limit = 5;
    const string sentence("蓝翔优秀毕业生");
    vector<KeywordExtractor::Word> records;
    string rendered;
    extractor.Extract(sentence, records, limit);
    rendered << records;
    ASSERT_EQ(rendered, "[{\"word\": \"\xE8\x93\x9D\xE7\xBF\x94\", \"offset\": [0], \"weight\": 11.7392}, {\"word\": \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F\", \"offset\": [12], \"weight\": 8.13549}, {\"word\": \"\xE4\xBC\x98\xE7\xA7\x80\", \"offset\": [6], \"weight\": 6.78347}]");
  }

  // Short input with an alphanumeric token ("iPhone6") after Chinese text.
  {
    const size_t limit = 5;
    const string sentence("一部iPhone6");
    vector<KeywordExtractor::Word> records;
    string rendered;
    extractor.Extract(sentence, records, limit);
    rendered << records;
    ASSERT_EQ(rendered, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
  }
}