From 3e0aaf73a5775a33fcceb6618ce82dda7419f1f0 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 25 Apr 2014 19:30:26 +0800 Subject: [PATCH] add user dict interface; tests pass --- src/DictTrie.hpp | 5 +++-- test/unittest/CMakeLists.txt | 12 +++++++++++- test/unittest/TSegments.cpp | 23 +++++++++++++++-------- test/unittest/TTrie.cpp | 11 ++++++----- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 8137084..53a6291 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -85,7 +85,6 @@ namespace CppJieba { double maxWeight = _findMaxWeight(_nodeInfos); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos); - LogDebug("load userdict[%s] ok.", userDictPath.c_str()); } _shrink(_nodeInfos); _trie = _creatTrie(_nodeInfos); @@ -125,7 +124,8 @@ namespace CppJieba assert(ifs); string line; DictUnit nodeInfo; - for(size_t lineno = 0; getline(ifs, line); lineno++) + size_t lineno; + for(lineno = 0; getline(ifs, line); lineno++) { if(!TransCode::decode(line, nodeInfo.word)) { @@ -136,6 +136,7 @@ namespace CppJieba nodeInfo.tag = defaultTag; nodeInfos.push_back(nodeInfo); } + LogInfo("load userdict[%s] ok. 
lines[%u]", filePath.c_str(), lineno); } void _loadDict(const string& filePath, vector& nodeInfos) const { diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index ebf0322..c252213 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -1,4 +1,4 @@ -SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}) +SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(GTEST_ROOT_DIR gtest-1.6.0) @@ -6,6 +6,16 @@ SET(GTEST_ROOT_DIR gtest-1.6.0) ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) + +#ADD_EXECUTABLE(segments.test gtest_main.cpp TSegments.cpp) +#TARGET_LINK_LIBRARIES(segments.test gtest pthread) +# +#ADD_EXECUTABLE(trie.test gtest_main.cpp TTrie.cpp) +#TARGET_LINK_LIBRARIES(trie.test gtest pthread) + +#ADD_EXECUTABLE(keyword.test gtest_main.cpp TKeywordExtractor.cpp) +#TARGET_LINK_LIBRARIES(keyword.test gtest pthread) + ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp ) TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread) diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index b930d2e..4f4c5c8 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -58,19 +58,27 @@ TEST(MixSegmentTest, Test1) ASSERT_EQ(words, vector(res2, res2 + sizeof(res2)/sizeof(res2[0]))); } -TEST(MixSegmentTest, UserDict) +TEST(MixSegmentTest, NoUserDict) { - MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); - //MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8"); + MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8"); ASSERT_TRUE(segment); const char* str = 
"令狐冲是云计算方面的专家"; vector words; ASSERT_TRUE(segment.cut(str, words)); - print(words); - exit(0); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words); + +} +TEST(MixSegmentTest, UserDict) +{ + MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); + ASSERT_TRUE(segment); + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); - //* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / - // 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / } TEST(MPSegmentTest, Test1) @@ -81,7 +89,6 @@ TEST(MPSegmentTest, Test1) vector words; ASSERT_TRUE(segment); ASSERT_TRUE(segment.cut(str, words)); - //print(words); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index d44f8d0..cb65e50 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -48,19 +48,20 @@ TEST(DictTrieTest, Test1) vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); - //print(uni); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); ASSERT_EQ(mp, resMap); - // print(vec); } TEST(DictTrieTest, UserDict) { - DictTrie trie(DICT_FILE); + DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); ASSERT_TRUE(trie); string word = "云计算"; Unicode unicode; ASSERT_TRUE(TransCode::decode(word, unicode)); - print((*trie.find(unicode.begin(), unicode.end()))); - exit(0); + const DictUnit * unit = trie.find(unicode.begin(), unicode.end()); + ASSERT_TRUE(unit); + string res ; + res << *unit; + ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res); }