Add user dictionary interface; tests pass

This commit is contained in:
wyy 2014-04-25 19:30:26 +08:00
parent 566187a49c
commit 3e0aaf73a5
4 changed files with 35 additions and 16 deletions

View File

@ -85,7 +85,6 @@ namespace CppJieba
{
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
}
_shrink(_nodeInfos);
_trie = _creatTrie(_nodeInfos);
@ -125,7 +124,8 @@ namespace CppJieba
assert(ifs);
string line;
DictUnit nodeInfo;
for(size_t lineno = 0; getline(ifs, line); lineno++)
size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++)
{
if(!TransCode::decode(line, nodeInfo.word))
{
@ -136,6 +136,7 @@ namespace CppJieba
nodeInfo.tag = defaultTag;
nodeInfos.push_back(nodeInfo);
}
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
}
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
{

View File

@ -1,4 +1,4 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(GTEST_ROOT_DIR gtest-1.6.0)
@ -6,6 +6,16 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
#ADD_EXECUTABLE(segments.test gtest_main.cpp TSegments.cpp)
#TARGET_LINK_LIBRARIES(segments.test gtest pthread)
#
#ADD_EXECUTABLE(trie.test gtest_main.cpp TTrie.cpp)
#TARGET_LINK_LIBRARIES(trie.test gtest pthread)
#ADD_EXECUTABLE(keyword.test gtest_main.cpp TKeywordExtractor.cpp)
#TARGET_LINK_LIBRARIES(keyword.test gtest pthread)
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp )
TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -58,19 +58,27 @@ TEST(MixSegmentTest, Test1)
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
}
TEST(MixSegmentTest, UserDict)
TEST(MixSegmentTest, NoUserDict)
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
//MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
ASSERT_TRUE(segment);
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
print(words);
exit(0);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words);
}
// Checks the segmentation of the sample sentence when MixSegment is built with
// a third path argument ("../test/testdata/userdict.utf8"). That argument is
// presumably the user dictionary -- confirm against the MixSegment constructor.
TEST(MixSegmentTest, UserDict)
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
ASSERT_TRUE(segment);
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
// `res << words` presumably renders the word vector into `res`; the expected
// text contains "云计算" as a single token.
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
// Illustrative segmentation of a different sentence than `str`:
//* Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
// After loading the custom dictionary: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
}
TEST(MPSegmentTest, Test1)
@ -81,7 +89,6 @@ TEST(MPSegmentTest, Test1)
vector<string> words;
ASSERT_TRUE(segment);
ASSERT_TRUE(segment.cut(str, words));
//print(words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}

View File

@ -48,19 +48,20 @@ TEST(DictTrieTest, Test1)
vector<pair<size_t, const DictUnit*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni));
//print(uni);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
ASSERT_EQ(mp, resMap);
// print(vec);
}
TEST(DictTrieTest, UserDict)
{
DictTrie trie(DICT_FILE);
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
ASSERT_TRUE(trie);
string word = "云计算";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
print((*trie.find(unicode.begin(), unicode.end())));
exit(0);
const DictUnit * unit = trie.find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit);
string res ;
res << *unit;
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
}