cppjieba/test/unittest/segments_test.cpp
Yanyi Wu 9f7e37c50b build: enhance test configuration and paths in CMakeLists
- Removed default install path setting from CMakeLists.txt.
- Updated test CMakeLists to include Google Test and configure test paths.
- Refactored load_test and unittest files to use defined paths for dictionaries and test data.
- Added test paths header for better path management in tests.
- Ensured all tests are properly linked and configured for execution.
2025-05-02 22:43:47 +08:00
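
The tests below take their dictionary and test-data locations from test_paths.h through the DICT_DIR and TEST_DATA_DIR macros. As a minimal sketch only (the template file name, the CMake variable names, and the configure_file() wiring are assumptions, not shown in this commit), the header generated by CMake could look roughly like this:

// test_paths.h.in -- hypothetical template, filled in by CMake's configure_file()
#pragma once
#define DICT_DIR "@DICT_DIR@"            // e.g. <source tree>/dict
#define TEST_DATA_DIR "@TEST_DATA_DIR@"  // e.g. <source tree>/test/testdata

After configuration, an expression such as DICT_DIR "/jieba.dict.utf8" concatenates into a full path to the bundled dictionary, so the tests no longer depend on the working directory they are launched from.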

#include "cppjieba/SegmentBase.hpp"
#include "cppjieba/MixSegment.hpp"
#include "cppjieba/MPSegment.hpp"
#include "cppjieba/HMMSegment.hpp"
#include "cppjieba/FullSegment.hpp"
#include "cppjieba/QuerySegment.hpp"
#include "gtest/gtest.h"
#include "test_paths.h"
using namespace cppjieba;
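
// DICT_DIR and TEST_DATA_DIR are path macros provided by test_paths.h and point
// at the dictionary directory and the test data directory, respectively.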
TEST(MixSegmentTest, Test1) {
  MixSegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
  string sentence;
  vector<string> words;
  string actual;
  string expected;
  {
    sentence = "我来自北京邮电大学。。。学号123456，用AK47";
    expected = "我/来自/北京邮电大学/。/。/。/学号/123456/，/用/AK47";
    segment.Cut(sentence, words);
    actual = Join(words.begin(), words.end(), "/");
    ASSERT_EQ(actual, expected);
  }
  {
    sentence = "B超 T恤";
    expected = "B超/ /T恤";
    segment.Cut(sentence, words);
    actual = Join(words.begin(), words.end(), "/");
    ASSERT_EQ(actual, expected);
  }
  {
    sentence = "他来到了网易杭研大厦";
    expected = "他/来到/了/网易/杭/研/大厦";
    segment.Cut(sentence, words, false);
    actual = Join(words.begin(), words.end(), "/");
    ASSERT_EQ(actual, expected);
  }
  {
    sentence = "他来到了网易杭研大厦";
    expected = "他/来到/了/网易/杭研/大厦";
    segment.Cut(sentence, words);
    actual = Join(words.begin(), words.end(), "/");
    ASSERT_EQ(actual, expected);
  }
}
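
// Small dictionary only, no user dictionary: "云计算" comes out as "云" / "计算".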
TEST(MixSegmentTest, NoUserDict) {
  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8");
  const char* str = "令狐冲是云计算方面的专家";
  vector<string> words;
  segment.Cut(str, words);
  string res;
  ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
}
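
// Same sentence, but with the default user dictionary loaded: "云计算" now stays in one piece.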
TEST(MixSegmentTest, UserDict) {
  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
                     DICT_DIR "/hmm_model.utf8",
                     DICT_DIR "/user.dict.utf8");
  {
    const char* str = "令狐冲是云计算方面的专家";
    vector<string> words;
    segment.Cut(str, words);
    string res;
    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
  }
  {
    const char* str = "小明先就职于IBM,后在日本京都大学深造";
    vector<string> words;
    segment.Cut(str, words);
    string res;
    res << words;
    ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
  }
  {
    const char* str = "IBM,3.14";
    vector<string> words;
    segment.Cut(str, words);
    string res;
    res << words;
    ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
  }
}
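
// Variant using the user dictionary from the test data directory; note that this
// test expects "IBM" to be emitted letter by letter, and it also fuzzes Cut()
// with random input at the end.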
TEST(MixSegmentTest, TestUserDict) {
  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
                     TEST_DATA_DIR "/userdict.utf8");
  vector<string> words;
  string res;
  segment.Cut("令狐冲是云计算方面的专家", words);
  ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
  segment.Cut("小明先就职于IBM,后在日本京都大学深造", words);
  res << words;
  ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
  segment.Cut("IBM,3.14", words);
  res << words;
  ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
  segment.Cut("忽如一夜春风来，千树万树梨花开", words);
  res = limonp::Join(words.begin(), words.end(), "/");
  ASSERT_EQ("忽如一夜春风来/，/千树/万树/梨花/开", res);
  // rand input: feed random bytes into Cut() and only require that it does not crash
  {
    const size_t ITERATION = 16;
    const size_t MAX_LEN = 256;
    string s;
    srand(time(NULL));
    for (size_t i = 0; i < ITERATION; i++) {
      size_t len = rand() % MAX_LEN;
      s.resize(len);
      for (size_t j = 0; j < len; j++) {
        s[rand() % len] = rand();
      }
      segment.Cut(s, words);
    }
  }
}
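
// Several user dictionaries can be passed as one string separated by ';'.
// With userdict.2.utf8 added, "千树万树梨花开" is kept whole here instead of being
// split as in TestUserDict above.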
TEST(MixSegmentTest, TestMultiUserDict) {
  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
                     TEST_DATA_DIR "/userdict.utf8;" TEST_DATA_DIR "/userdict.2.utf8");
  vector<string> words;
  string res;
  segment.Cut("忽如一夜春风来，千树万树梨花开", words);
  res = limonp::Join(words.begin(), words.end(), "/");
  ASSERT_EQ("忽如一夜春风来/，/千树万树梨花开", res);
}
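
// MPSegment: dictionary-based max-probability cut. The optional third argument
// of Cut() is the maximum word length.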
TEST(MPSegmentTest, Test1) {
  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
  string s;
  vector<string> words;
  segment.Cut("我来自北京邮电大学。", words);
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
  segment.Cut("B超 T恤", words);
  ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
  segment.Cut("南京市长江大桥", words);
  ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
  // MaxWordLen
  segment.Cut("南京市长江大桥", words, 3);
  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
  segment.Cut("南京市长江大桥", words, 0);
  ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
  segment.Cut("湖南长沙市天心区", words);
  s = Join(words.begin(), words.end(), "/");
  ASSERT_EQ("湖南长沙市/天心区", s);
  segment.Cut("湖南长沙市天心区", words, 3);
  s = Join(words.begin(), words.end(), "/");
  ASSERT_EQ("湖南/长沙市/天心区", s);
}
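
// HMMSegment: HMM model only, no dictionary, hence cuts such as "我来/自北京".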
TEST(HMMSegmentTest, Test1) {
  HMMSegment segment(DICT_DIR "/hmm_model.utf8");
  {
    const char* str = "我来自北京邮电大学。。。学号123456";
    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
    vector<string> words;
    segment.Cut(str, words);
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
  }
  {
    const char* str = "IBM,1.2,123";
    const char* res[] = {"IBM", ",", "1.2", ",", "123"};
    vector<string> words;
    segment.Cut(str, words);
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
  }
}
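
// FullSegment: emits every dictionary word found in the sentence, including
// overlapping ones ("北京", "北京邮电大学", "邮电", "电大", ...).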
TEST(FullSegment, Test1) {
  FullSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8");
  vector<string> words;
  string s;
  segment.Cut("我来自北京邮电大学", words);
  s << words;
  ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
  segment.Cut("上市公司CEO", words);
  s << words;
  ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
}
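
// QuerySegment: a coarse cut whose long words are additionally broken into the
// shorter dictionary words they contain (e.g. "中国科学院" also yields
// "中国", "科学", "学院", "科学院").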
TEST(QuerySegment, Test1) {
  QuerySegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", "");
  vector<string> words;
  string s1, s2;
  segment.Cut("小明硕士毕业于中国科学院计算所，后在日本京都大学深造", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造";
  ASSERT_EQ(s1, s2);
  segment.Cut("亲口交代", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "亲口/交代";
  ASSERT_EQ(s1, s2);
  segment.Cut("他心理健康", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "他/心理/健康/心理健康";
  ASSERT_EQ(s1, s2);
}
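
// Same segmenter, now with the small dictionary plus two user dictionaries;
// '|' is accepted as a path separator here, just as ';' is above.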
TEST(QuerySegment, Test2) {
  QuerySegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
                       DICT_DIR "/hmm_model.utf8",
                       TEST_DATA_DIR "/userdict.utf8|" TEST_DATA_DIR "/userdict.english");
  vector<string> words;
  string s1, s2;
  {
    segment.Cut("小明硕士毕业于中国科学院计算所，后在日本京都大学深造", words);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/京都大学/深造";
    ASSERT_EQ(s1, s2);
  }
  {
    segment.Cut("小明硕士毕业于中国科学院计算所iPhone6", words);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/iPhone6";
    ASSERT_EQ(s1, s2);
  }
  {
    segment.Cut("中国科学院", words);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "中国/科学/学院/科学院/中国科学院";
    ASSERT_EQ(s1, s2);
  }
}
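
// Characters outside the Basic Multilingual Plane (🙋 is U+1F64B) must pass
// through Cut() intact.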
TEST(MPSegmentTest, Unicode32) {
  string s("天气很好，🙋 我们去郊游。");
  vector<string> words;
  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
  segment.Cut(s, words);
  ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/，/🙋/ /我们/去/郊游/。");
}