Add user dict interface; tests pass

This commit is contained in:
wyy 2014-04-25 19:30:26 +08:00
parent 566187a49c
commit 3e0aaf73a5
4 changed files with 35 additions and 16 deletions

View File

@ -85,7 +85,6 @@ namespace CppJieba
{ {
double maxWeight = _findMaxWeight(_nodeInfos); double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
} }
_shrink(_nodeInfos); _shrink(_nodeInfos);
_trie = _creatTrie(_nodeInfos); _trie = _creatTrie(_nodeInfos);
@ -125,7 +124,8 @@ namespace CppJieba
assert(ifs); assert(ifs);
string line; string line;
DictUnit nodeInfo; DictUnit nodeInfo;
for(size_t lineno = 0; getline(ifs, line); lineno++) size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++)
{ {
if(!TransCode::decode(line, nodeInfo.word)) if(!TransCode::decode(line, nodeInfo.word))
{ {
@ -136,6 +136,7 @@ namespace CppJieba
nodeInfo.tag = defaultTag; nodeInfo.tag = defaultTag;
nodeInfos.push_back(nodeInfo); nodeInfos.push_back(nodeInfo);
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
{ {

View File

@ -1,4 +1,4 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(GTEST_ROOT_DIR gtest-1.6.0) SET(GTEST_ROOT_DIR gtest-1.6.0)
@ -6,6 +6,16 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN) ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
#ADD_EXECUTABLE(segments.test gtest_main.cpp TSegments.cpp)
#TARGET_LINK_LIBRARIES(segments.test gtest pthread)
#
#ADD_EXECUTABLE(trie.test gtest_main.cpp TTrie.cpp)
#TARGET_LINK_LIBRARIES(trie.test gtest pthread)
#ADD_EXECUTABLE(keyword.test gtest_main.cpp TKeywordExtractor.cpp)
#TARGET_LINK_LIBRARIES(keyword.test gtest pthread)
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp ) ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp )
TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -58,19 +58,27 @@ TEST(MixSegmentTest, Test1)
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0]))); ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
} }
TEST(MixSegmentTest, UserDict) TEST(MixSegmentTest, NoUserDict)
{ {
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
//MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
ASSERT_TRUE(segment); ASSERT_TRUE(segment);
const char* str = "令狐冲是云计算方面的专家"; const char* str = "令狐冲是云计算方面的专家";
vector<string> words; vector<string> words;
ASSERT_TRUE(segment.cut(str, words)); ASSERT_TRUE(segment.cut(str, words));
print(words); string res;
exit(0); ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
}
TEST(MixSegmentTest, UserDict)
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
ASSERT_TRUE(segment);
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
//* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
// 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
} }
TEST(MPSegmentTest, Test1) TEST(MPSegmentTest, Test1)
@ -81,7 +89,6 @@ TEST(MPSegmentTest, Test1)
vector<string> words; vector<string> words;
ASSERT_TRUE(segment); ASSERT_TRUE(segment);
ASSERT_TRUE(segment.cut(str, words)); ASSERT_TRUE(segment.cut(str, words));
//print(words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }

View File

@ -48,19 +48,20 @@ TEST(DictTrieTest, Test1)
vector<pair<size_t, const DictUnit*> > vec; vector<pair<size_t, const DictUnit*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
//print(uni);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
ASSERT_EQ(mp, resMap); ASSERT_EQ(mp, resMap);
// print(vec);
} }
TEST(DictTrieTest, UserDict) TEST(DictTrieTest, UserDict)
{ {
DictTrie trie(DICT_FILE); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
ASSERT_TRUE(trie); ASSERT_TRUE(trie);
string word = "云计算"; string word = "云计算";
Unicode unicode; Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode)); ASSERT_TRUE(TransCode::decode(word, unicode));
print((*trie.find(unicode.begin(), unicode.end()))); const DictUnit * unit = trie.find(unicode.begin(), unicode.end());
exit(0); ASSERT_TRUE(unit);
string res ;
res << *unit;
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
} }