mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Add user dict interface; tests pass
This commit is contained in:
parent
566187a49c
commit
3e0aaf73a5
@ -85,7 +85,6 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
double maxWeight = _findMaxWeight(_nodeInfos);
|
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
|
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
|
||||||
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
|
|
||||||
}
|
}
|
||||||
_shrink(_nodeInfos);
|
_shrink(_nodeInfos);
|
||||||
_trie = _creatTrie(_nodeInfos);
|
_trie = _creatTrie(_nodeInfos);
|
||||||
@ -125,7 +124,8 @@ namespace CppJieba
|
|||||||
assert(ifs);
|
assert(ifs);
|
||||||
string line;
|
string line;
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
for(size_t lineno = 0; getline(ifs, line); lineno++)
|
size_t lineno;
|
||||||
|
for(lineno = 0; getline(ifs, line); lineno++)
|
||||||
{
|
{
|
||||||
if(!TransCode::decode(line, nodeInfo.word))
|
if(!TransCode::decode(line, nodeInfo.word))
|
||||||
{
|
{
|
||||||
@ -136,6 +136,7 @@ namespace CppJieba
|
|||||||
nodeInfo.tag = defaultTag;
|
nodeInfo.tag = defaultTag;
|
||||||
nodeInfos.push_back(nodeInfo);
|
nodeInfos.push_back(nodeInfo);
|
||||||
}
|
}
|
||||||
|
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||||
}
|
}
|
||||||
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
|
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
|
||||||
{
|
{
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
|
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
|
||||||
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
||||||
|
|
||||||
SET(GTEST_ROOT_DIR gtest-1.6.0)
|
SET(GTEST_ROOT_DIR gtest-1.6.0)
|
||||||
@ -6,6 +6,16 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
|
|||||||
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
|
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
|
||||||
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
||||||
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
||||||
|
|
||||||
|
#ADD_EXECUTABLE(segments.test gtest_main.cpp TSegments.cpp)
|
||||||
|
#TARGET_LINK_LIBRARIES(segments.test gtest pthread)
|
||||||
|
#
|
||||||
|
#ADD_EXECUTABLE(trie.test gtest_main.cpp TTrie.cpp)
|
||||||
|
#TARGET_LINK_LIBRARIES(trie.test gtest pthread)
|
||||||
|
|
||||||
|
#ADD_EXECUTABLE(keyword.test gtest_main.cpp TKeywordExtractor.cpp)
|
||||||
|
#TARGET_LINK_LIBRARIES(keyword.test gtest pthread)
|
||||||
|
|
||||||
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp )
|
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TTrie.cpp TSegments.cpp )
|
||||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||||
|
@ -58,19 +58,27 @@ TEST(MixSegmentTest, Test1)
|
|||||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(MixSegmentTest, UserDict)
|
TEST(MixSegmentTest, NoUserDict)
|
||||||
{
|
{
|
||||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
|
||||||
//MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
|
|
||||||
ASSERT_TRUE(segment);
|
ASSERT_TRUE(segment);
|
||||||
const char* str = "令狐冲是云计算方面的专家";
|
const char* str = "令狐冲是云计算方面的专家";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
ASSERT_TRUE(segment.cut(str, words));
|
||||||
print(words);
|
string res;
|
||||||
exit(0);
|
ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||||
|
|
||||||
|
}
|
||||||
|
TEST(MixSegmentTest, UserDict)
|
||||||
|
{
|
||||||
|
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||||
|
ASSERT_TRUE(segment);
|
||||||
|
const char* str = "令狐冲是云计算方面的专家";
|
||||||
|
vector<string> words;
|
||||||
|
ASSERT_TRUE(segment.cut(str, words));
|
||||||
|
string res;
|
||||||
|
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||||
|
|
||||||
//* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
|
||||||
// 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(MPSegmentTest, Test1)
|
TEST(MPSegmentTest, Test1)
|
||||||
@ -81,7 +89,6 @@ TEST(MPSegmentTest, Test1)
|
|||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment);
|
ASSERT_TRUE(segment);
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
ASSERT_TRUE(segment.cut(str, words));
|
||||||
//print(words);
|
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,19 +48,20 @@ TEST(DictTrieTest, Test1)
|
|||||||
|
|
||||||
vector<pair<size_t, const DictUnit*> > vec;
|
vector<pair<size_t, const DictUnit*> > vec;
|
||||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||||
//print(uni);
|
|
||||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
|
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
|
||||||
ASSERT_EQ(mp, resMap);
|
ASSERT_EQ(mp, resMap);
|
||||||
// print(vec);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DictTrieTest, UserDict)
|
TEST(DictTrieTest, UserDict)
|
||||||
{
|
{
|
||||||
DictTrie trie(DICT_FILE);
|
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
|
||||||
ASSERT_TRUE(trie);
|
ASSERT_TRUE(trie);
|
||||||
string word = "云计算";
|
string word = "云计算";
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
print((*trie.find(unicode.begin(), unicode.end())));
|
const DictUnit * unit = trie.find(unicode.begin(), unicode.end());
|
||||||
exit(0);
|
ASSERT_TRUE(unit);
|
||||||
|
string res ;
|
||||||
|
res << *unit;
|
||||||
|
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user