diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 74f69f9..3d8cd5b 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -47,6 +47,7 @@ namespace CppJieba LogError("_fullSeg init"); return false; } + _maxWordLen = maxWordLen; return _setInitFlag(true); } @@ -84,6 +85,9 @@ namespace CppJieba { res.push_back(*fullResItr); } + + //clear tmp res + fullRes.clear(); } } else // just use the mix result diff --git a/src/TrieManager.hpp b/src/TrieManager.hpp index 2590c89..d011ccb 100644 --- a/src/TrieManager.hpp +++ b/src/TrieManager.hpp @@ -23,48 +23,47 @@ namespace CppJieba LogError("error when getting md5 for file '%s'", dictpath); return NULL; } - else + LogInfo("md5 for file '%s': %s", dictpath, md5.c_str()); + + if (_tries.find(md5) != _tries.end()) { - LogInfo("md5 for file '%s': %s", dictpath, md5.c_str()); - if (_tries.find(md5) == _tries.end()) - { - LogInfo("create a new trie for md5: '%s'", md5.c_str()); - Trie* trie = NULL; - try - { - trie = new Trie(); - } - catch (const bad_alloc& e) - { - LogError("error when new a trie for file '%s'", dictpath); - return NULL; - } - if (NULL == trie) - return NULL; - - if (!trie->init()) - { - LogError("trie init error for file '%s'", dictpath); - return NULL; - } - - LogInfo("trie->loadDict(%s) start...", dictpath); - if (!trie->loadDict(dictpath)) - { - LogError("trie->loadDict(%s) failed...", dictpath); - return NULL; - } - LogInfo("trie->loadDict end..."); - - _tries[md5.c_str()] = trie; - return trie; - } - else - { - LogInfo("find a exits trie for md5: '%s'", md5.c_str()); - return _tries[md5.c_str()]; - } + LogInfo("find a exits trie for md5: '%s'", md5.c_str()); + return _tries[md5.c_str()]; } + + LogInfo("create a new trie for md5: '%s'", md5.c_str()); + Trie* trie = NULL; + try + { + trie = new Trie(); + } + catch (const bad_alloc& e) + { + LogError("error when new a trie for file '%s'", dictpath); + return NULL; + } + if (NULL == trie) + { + LogError("get NULL from new trie for file '%s'", dictpath); + return NULL; + } + + if (!trie->init()) + { + LogError("trie init error for file '%s'", dictpath); + return NULL; + } + + LogInfo("trie->loadDict(%s) start...", dictpath); + if (!trie->loadDict(dictpath)) + { + LogError("trie->loadDict(%s) failed...", dictpath); + return NULL; + } + LogInfo("trie->loadDict end..."); + + _tries[md5.c_str()] = trie; + return trie; } static TrieManager& getInstance() diff --git a/test/testdata/jieba.dict.0.1.utf8 b/test/testdata/jieba.dict.0.1.utf8 new file mode 100644 index 0000000..e925f69 --- /dev/null +++ b/test/testdata/jieba.dict.0.1.utf8 @@ -0,0 +1,93 @@ +龙鸣狮吼 3 nr +龙齐诺 2 nr +龙齿 3 n +龚 176 nr +龚世萍 2 nr +龚书铎 2 nr +龚二人 2 nr +龚云甫 3 nr +龚伟强 5 nr +龚先生 4 nr +龚光杰 44 nr +龚古尔 24 nr +龚子敬 2 nr +龚孝升 12 nr +龚学平 2 nr +龚完敬 5 nr +龚定庵 3 nr +龚定敬 2 nr +龚宝铨 5 nr +龚家村 3 nr +龚建国 29 nr +龚德俊 6 nr +龚心瀚 3 nr +龚志国 2 nr +龚意田 3 nr +龚慈恩 3 nr +龚施茜 3 nr +龚晓犁 2 nr +龚普洛 3 nr +龚智超 7 nr +龚松林 10 nr +龚永明 3 nr +龚永泉 5 nr +龚泽艺 256 nr +龚睿 8 nrfg +龚祖同 2 nr +龚秋婷 3 nr +龚老爷 2 nr +龚育之 19 nr +龚自珍 28 nr +龚蓓苾 3 nr +龚虹嘉 3 nr +龚诗嘉 3 nr +龛 223 ng +龜 2 zg +龟 903 ns +龟儿子 123 n +龟兆 3 nz +龟兹 215 ns +龟兹王 3 nrt +龟冷搘床 3 v +龟冷支床 3 n +龟卜 3 n +龟厌不告 3 l +龟壳 33 n +龟壳花 3 n +龟头 34 n +龟头炎 3 n +龟山 23 ns +龟山乡 3 ns +龟山岛 3 ns +龟年鹤寿 3 ns +龟年鹤算 3 l +龟文 3 nz +龟文写迹 3 n +龟文鸟迹 3 n +龟板 10 n +龟毛免角 3 n +龟毛兔角 3 n +龟溪 3 ns +龟玉 3 nz +龟王 3 nz +龟甲 92 ns +龟甲胶 3 nz +龟筮 3 n +龟纹 3 n +龟缩 29 v +龟肉 3 n +龟背 21 n +龟背竹 3 n +龟苓膏 3 n +龟苗 3 n +龟裂 34 v +龟足 5 v +龟鉴 2 n +龟镜 3 nz +龟鳖 3 n +龟鹤遐寿 3 l +龟龄鹤算 3 n +龟龙片甲 3 nz +龟龙麟凤 3 ns +龠 5 g +龢 732 zg diff --git a/test/testdata/jieba.dict.0.utf8 b/test/testdata/jieba.dict.0.utf8 new file mode 100644 index 0000000..e925f69 --- /dev/null +++ b/test/testdata/jieba.dict.0.utf8 @@ -0,0 +1,93 @@ +龙鸣狮吼 3 nr +龙齐诺 2 nr +龙齿 3 n +龚 176 nr +龚世萍 2 nr +龚书铎 2 nr +龚二人 2 nr +龚云甫 3 nr +龚伟强 5 nr +龚先生 4 nr +龚光杰 44 nr +龚古尔 24 nr +龚子敬 2 nr +龚孝升 12 nr +龚学平 2 nr +龚完敬 5 nr +龚定庵 3 nr +龚定敬 2 nr +龚宝铨 5 nr +龚家村 3 nr +龚建国 29 nr +龚德俊 6 nr +龚心瀚 3 nr +龚志国 2 nr +龚意田 3 nr +龚慈恩 3 nr +龚施茜 3 nr +龚晓犁 2 nr +龚普洛 3 nr +龚智超 7 nr +龚松林 10 nr +龚永明 3 nr +龚永泉 5 nr +龚泽艺 256 nr +龚睿 8 nrfg +龚祖同 2 nr +龚秋婷 3 nr +龚老爷 2 nr +龚育之 19 nr +龚自珍 28 nr +龚蓓苾 3 nr +龚虹嘉 3 nr +龚诗嘉 3 nr +龛 223 ng +龜 2 zg +龟 903 ns +龟儿子 123 n +龟兆 3 nz +龟兹 215 ns +龟兹王 3 nrt +龟冷搘床 3 v +龟冷支床 3 n +龟卜 3 n +龟厌不告 3 l +龟壳 33 n +龟壳花 3 n +龟头 34 n +龟头炎 3 n +龟山 23 ns +龟山乡 3 ns +龟山岛 3 ns +龟年鹤寿 3 ns +龟年鹤算 3 l +龟文 3 nz +龟文写迹 3 n +龟文鸟迹 3 n +龟板 10 n +龟毛免角 3 n +龟毛兔角 3 n +龟溪 3 ns +龟玉 3 nz +龟王 3 nz +龟甲 92 ns +龟甲胶 3 nz +龟筮 3 n +龟纹 3 n +龟缩 29 v +龟肉 3 n +龟背 21 n +龟背竹 3 n +龟苓膏 3 n +龟苗 3 n +龟裂 34 v +龟足 5 v +龟鉴 2 n +龟镜 3 nz +龟鳖 3 n +龟鹤遐寿 3 l +龟龄鹤算 3 n +龟龙片甲 3 nz +龟龙麟凤 3 ns +龠 5 g +龢 732 zg diff --git a/test/testdata/jieba.dict.1.utf8 b/test/testdata/jieba.dict.1.utf8 new file mode 100644 index 0000000..d222070 --- /dev/null +++ b/test/testdata/jieba.dict.1.utf8 @@ -0,0 +1,67 @@ +AT&T 3 nz +B超 3 n +c# 3 nz +C# 3 nz +c++ 3 nz +C++ 3 nz +T恤 4 n +一 217830 m +一一 1670 m +一一二 11 m +一一例 3 m +一一分 8 m +一一列举 34 i +一一对 9 m +一一对应 43 l +一一记 2 m +一一道来 4 l +一丁 18 d +一丁不识 3 i +一丁点 3 m +一丁点儿 24 m +一七 22 m +一七八不 3 l +一万 442 m +一万一千 4 m +一万一千五百二十颗 2 m +一万一千八百八十斤 2 m +一万一千多间 2 m +一万一千零九十五册 4 m +一万七千 5 m +一万七千余 2 m +一万七千多 2 m +一万七千多户 2 m +一万万 4 m +一万万两 4 m +一万三千 8 m +一万三千五百一十七 2 m +一万三千五百斤 4 m +一万三千余种 2 m +一万三千块 2 m +一万两 124 m +一万两万 4 m +一万两千 3 m +一万个 62 m +一万九千 2 m +一万九千余 2 m +一万二 10 m +一万二千 7 m +一万二千两 2 m +一万二千五百 4 m +一万二千五百一十二 2 m +一万二千五百余 2 m +一万二千五百余吨 2 m +一万二千亩 2 m +一万二千余 2 m +一万二千六百八十二箱 2 m +一万二千名 3 m +一万二千里 3 m +一万五 6 m +一万五千 45 m +一万五千一百四十四卷 2 m +一万五千两 4 m +一万五千个 2 m +一万五千二百余 2 m +一万五千余 9 m +一万五千元 3 m +一万五千名 4 m diff --git a/test/testdata/jieba.dict.2.utf8 b/test/testdata/jieba.dict.2.utf8 new file mode 100644 index 0000000..5662f9a --- /dev/null +++ b/test/testdata/jieba.dict.2.utf8 @@ -0,0 +1,64 @@ +一万七千 5 m +一万七千余 2 m +一万七千多 2 m +一万七千多户 2 m +一万万 4 m +一万万两 4 m +一万三千 8 m +一万三千五百一十七 2 m +一万三千五百斤 4 m +一万三千余种 2 m +一万三千块 2 m +一万两 124 m +一万两万 4 m +一万两千 3 m +一万个 62 m +一万九千 2 m +一万九千余 2 m +一万二 10 m +一万二千 7 m +一万二千两 2 m +一万二千五百 4 m +一万二千五百一十二 2 m +一万二千五百余 2 m +一万二千五百余吨 2 m +一万二千亩 2 m +一万二千余 2 m +一万二千六百八十二箱 2 m +一万二千名 3 m +一万二千里 3 m +一万五 6 m +一万五千 45 m +一万五千一百四十四卷 2 m +一万五千两 4 m +一万五千个 2 m +一万五千二百余 2 m +一万五千余 9 m +一万五千元 3 m +一万五千名 4 m +一万五千多 2 m +一万五千家 2 m +一万亿 3 m +一万亿美元 5 m +一万余 41 m +一万余吨 2 m +一万余顷 2 m +一万倍 14 m +一万元 61 m +一万八 5 m +一万八千 7 m +一万八千余 8 m +一万八千多元 2 m +一万公里 2 m +一万六千 5 m +一万六千三百户 2 m +一万六千余户 2 m +一万六千多 3 m +一万册 2 m +一万刀 7 m +一万匹 4 m +一万卷 2 m +一万双 6 m +一万发 2 m +一万句 11 m +一万只 9 m diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 34c3781..1f59f31 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -5,7 +5,7 @@ SET(GTEST_ROOT_DIR gtest-1.6.0) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) -ADD_EXECUTABLE(test.run gtest_main.cc TChineseFilter.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp) +ADD_EXECUTABLE(test.run gtest_main.cc TChineseFilter.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp) TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread) diff --git a/test/unittest/TFullSegment.cpp b/test/unittest/TFullSegment.cpp new file mode 100644 index 0000000..cc38026 --- /dev/null +++ b/test/unittest/TFullSegment.cpp @@ -0,0 +1,17 @@ +#include "src/FullSegment.hpp" +#include "gtest/gtest.h" + +using namespace CppJieba; + +TEST(FullSegment, Test1) +{ + FullSegment segment("../dicts/jieba.dict.utf8"); + const char* str = "我来自北京邮电大学。。。 学号 123456"; + const char* res[] = {"我", "来自", "北京", "北京邮电", "北京邮电大学", "邮电", "邮电大学", "电大", "大学", "。", "。", "。", " ", "学号", " 123456"}; + vector words; + + ASSERT_EQ(segment.cut(str, words), true); + + EXPECT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); +} + diff --git a/test/unittest/TQuerySegment.cpp b/test/unittest/TQuerySegment.cpp new file mode 100644 index 0000000..55704e6 --- /dev/null +++ b/test/unittest/TQuerySegment.cpp @@ -0,0 +1,17 @@ +#include "src/QuerySegment.hpp" +#include "gtest/gtest.h" + +using namespace CppJieba; + +TEST(QuerySegment, Test1) +{ + QuerySegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8", 3); + const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; + const char* res[] = {"小明", "硕士", "毕业", "于", "中国", "中国科学院", "科学", "科学院", "学院", "计算所", ",", "后", "在", "日本", "日本京都大学", "京都", "京都大学", "大学", "深造"}; + vector words; + + ASSERT_EQ(segment.cut(str, words), true); + + EXPECT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); +} + diff --git a/test/unittest/TTrieManager.cpp b/test/unittest/TTrieManager.cpp new file mode 100644 index 0000000..a51341a --- /dev/null +++ b/test/unittest/TTrieManager.cpp @@ -0,0 +1,52 @@ +#include "src/TrieManager.hpp" +#include "gtest/gtest.h" + +using namespace CppJieba; + +struct md5_ptr +{ + string md5; + Trie* ptr; +}; +typedef struct md5_ptr MD5_PTR; + +static const char* const DICT_FILE[] = { + "../test/testdata/jieba.dict.0.utf8", + "../test/testdata/jieba.dict.0.utf8", + "../test/testdata/jieba.dict.0.utf8", + "../test/testdata/jieba.dict.0.1.utf8", + "../test/testdata/jieba.dict.0.1.utf8", + "../test/testdata/jieba.dict.0.1.utf8", + "../test/testdata/jieba.dict.1.utf8", + "../test/testdata/jieba.dict.1.utf8", + "../test/testdata/jieba.dict.1.utf8", + "../test/testdata/jieba.dict.2.utf8", + "../test/testdata/jieba.dict.2.utf8", + "../test/testdata/jieba.dict.2.utf8", + "../test/testdata/jieba.dict.2.utf8"}; + +TEST(TrieManagerTest, Test1) +{ + vector tries(sizeof(DICT_FILE)/sizeof(DICT_FILE[0])); + for (uint i = 0; i < tries.size(); i++) + { + tries[i].ptr = TrieManager::getInstance().getTrie(DICT_FILE[i]); + ASSERT_TRUE(md5File(DICT_FILE[i], tries[i].md5)); + } + + for (uint i = 0; i < tries.size(); i++) + { + for (uint j = i + 1; j < tries.size(); j++) + { + if (tries[i].md5 == tries[j].md5) + { + ASSERT_EQ(tries[i].ptr, tries[j].ptr); + } + else + { + ASSERT_NE(tries[i].ptr, tries[j].ptr); + } + } + } +} +