diff --git a/ChangeLog.md b/ChangeLog.md
index e56d0f6..7e4987a 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,11 +1,12 @@
 # CppJieba ChangeLog
 
-## v2.5.0 (untagged)
+## v3.0.0 (untagged)
 
 1. 使得 QuerySegment 支持自定义词典(可选参数)。
 2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
 3. 修改 Code Style ,参照 google code style 。
 4. 增加更详细的错误日志,在初始化过程中合理使用LogFatal。
+5. 增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。
 
 ## v2.4.4
 
diff --git a/src/Application.hpp b/src/Application.hpp
new file mode 100644
index 0000000..205b8ec
--- /dev/null
+++ b/src/Application.hpp
@@ -0,0 +1,102 @@
+#ifndef CPPJIEBA_APPLICATION_H
+#define CPPJIEBA_APPLICATION_H
+
+#include "QuerySegment.hpp"
+#include "PosTagger.hpp"
+#include "KeywordExtractor.hpp"
+
+namespace CppJieba {
+
+// Selects the segmenter that Application::cut() dispatches to.
+enum CutMethod {
+  METHOD_MP,     // handled by MPSegment
+  METHOD_HMM,    // handled by HMMSegment
+  METHOD_MIX,    // handled by MixSegment
+  METHOD_FULL,   // handled by FullSegment
+  METHOD_QUERY   // handled by QuerySegment
+};
+
+// Single entry point bundling the CppJieba features: word segmentation,
+// part-of-speech tagging and keyword extraction.
+//
+// One DictTrie and one HMMModel are loaded from the dictionary directory and
+// their addresses are handed to every segmenter, the tagger and the extractor.
+// NOTE(review): because those components receive pointers to this object's own
+// members, copying an Application is presumably unsafe -- confirm before
+// passing instances by value.
+class Application {
+ public:
+  // Loads jieba.dict.utf8, hmm_model.utf8, idf.utf8 and stop_words.utf8 from
+  // dictDir. Marked explicit so a plain string never converts implicitly.
+  explicit Application(const string& dictDir)
+    : dictTrie_(pathJoin(dictDir, "jieba.dict.utf8")),
+      model_(pathJoin(dictDir, "hmm_model.utf8")),
+      mpSeg_(&dictTrie_),
+      hmmSeg_(&model_),
+      mixSeg_(&dictTrie_, &model_),
+      fullSeg_(&dictTrie_),
+      querySeg_(&dictTrie_, &model_),
+      tagger_(&dictTrie_, &model_),
+      extractor_(&dictTrie_,
+                 &model_,
+                 pathJoin(dictDir, "idf.utf8"),
+                 pathJoin(dictDir, "stop_words.utf8")) {
+  }
+
+  // Splits sentence into words with the segmenter selected by method.
+  // An unrecognized method only logs an error and leaves words untouched.
+  void cut(const string& sentence, vector<string>& words,
+        CutMethod method) const {
+    switch(method) {
+      case METHOD_MP:
+        mpSeg_.cut(sentence, words);
+        break;
+      case METHOD_HMM:
+        hmmSeg_.cut(sentence, words);
+        break;
+      case METHOD_MIX:
+        mixSeg_.cut(sentence, words);
+        break;
+      case METHOD_FULL:
+        fullSeg_.cut(sentence, words);
+        break;
+      case METHOD_QUERY:
+        querySeg_.cut(sentence, words);
+        break;
+      default:
+        LogError("argument method is illegal.");
+    }
+  }
+
+  // Tags str with PosTagger; res receives the resulting word/tag pairs.
+  void tag(const string& str, vector<pair<string, string> >& res) const {
+    tagger_.tag(str, res);
+  }
+
+  // Extracts keywords from str; keyword receives keyword/score pairs and
+  // topN is forwarded unchanged to KeywordExtractor::extract.
+  void extract(const string& str, vector<pair<string, double> >& keyword,
+        size_t topN) const {
+    extractor_.extract(str, keyword, topN);
+  }
+
+  ~Application() {
+  }
+
+ private:
+  // dictTrie_ and model_ are declared first: members are constructed in
+  // declaration order, and every member below takes their addresses.
+  DictTrie dictTrie_;
+  HMMModel model_;
+  MPSegment mpSeg_;
+  HMMSegment hmmSeg_;
+  MixSegment mixSeg_;
+  FullSegment fullSeg_;
+  QuerySegment querySeg_;
+  PosTagger tagger_;
+  KeywordExtractor extractor_;
+}; // class Application
+
+} // namespace CppJieba
+
+#endif // CPPJIEBA_APPLICATION_H
diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp
index be2f6d6..9efd6d9 100644
--- a/src/QuerySegment.hpp
+++ b/src/QuerySegment.hpp
@@ -23,8 +23,10 @@ class QuerySegment: public SegmentBase {
       maxWordLen_(maxWordLen) {
     assert(maxWordLen_);
   }
-  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
-    : mixSeg_(dictTrie, model), fullSeg_(dictTrie) {
+  QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
+    : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
+    // Keep the same non-zero check as the constructor above.
+    assert(maxWordLen_);
   }
   virtual ~QuerySegment() {
   }
@@ -39,7 +41,6 @@ class QuerySegment: public SegmentBase {
 
     vector<Unicode> fullRes;
     for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
-
       // if it's too long, cut with fullSeg_, put fullRes in res
       if (mixResItr->size() > maxWordLen_) {
         if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
diff --git a/src/TransCode.hpp b/src/TransCode.hpp
index ba1bfdd..5c96978 100644
--- a/src/TransCode.hpp
+++ b/src/TransCode.hpp
@@ -43,6 +43,10 @@ inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end)
   return res;
 }
 
+inline string encode(const Unicode& unicode) {
+  return encode(unicode.begin(), unicode.end());
+}
+
 // compiler is expected to optimized this function to avoid return value copy
 inline Unicode decode(const string& str) {
   Unicode unicode;
@@ -50,7 +54,8 @@ inline Unicode decode(const string& str) {
   decode(str, unicode);
   return unicode;
 }
-}
-}
+
+} // namespace TransCode
+} // namespace CppJieba
 
 #endif
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index a6c8de4..35a2fa0 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -22,6 +22,7 @@ ADD_EXECUTABLE(test.run
     TTrie.cpp
     TSegments.cpp
     TPosTagger.cpp
+    TApplication.cpp
     )
 TARGET_LINK_LIBRARIES(gtest pthread)
 TARGET_LINK_LIBRARIES(test.run gtest pthread)
diff --git a/test/unittest/TApplication.cpp b/test/unittest/TApplication.cpp
new file mode 100644
index 0000000..ab1646a
--- /dev/null
+++ b/test/unittest/TApplication.cpp
@@ -0,0 +1,47 @@
+#include "src/Application.hpp"
+#include "gtest/gtest.h"
+
+using namespace CppJieba;
+
+// Exercises every Application entry point (each CutMethod, tag and extract)
+// against the dictionaries in ../dict/. Assertions are written as
+// ASSERT_EQ(expected, actual).
+TEST(ApplicationTest, Test1) {
+  Application app("../dict/");
+  vector<string> words;
+  string result;
+
+  app.cut("我来自北京邮电大学。", words, METHOD_MP);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+
+  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
+  result << words;
+  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
+
+  app.cut("我来自北京邮电大学。。。学号123456,用AK47", words, METHOD_MIX);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+
+  app.cut("他来到了网易杭研大厦", words, METHOD_MIX);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  app.cut("我来自北京邮电大学", words, METHOD_FULL);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]", result);
+
+  app.cut("他来到了网易杭研大厦", words, METHOD_QUERY);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  vector<pair<string, string> > tagres;
+  app.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
+  result << tagres;
+  ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);
+
+  vector<pair<string, double> > keywordres;
+  app.extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5);
+  result << keywordres;
+  ASSERT_EQ("[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]", result);
+}
diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp
index 25dfaa5..cd24956 100644
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -3,8 +3,6 @@
 
 using namespace CppJieba;
 
-
-
 TEST(KeywordExtractorTest, Test1) {
   KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
 