增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。

This commit is contained in:
yanyiwu 2015-06-05 16:00:32 +08:00
parent d56bf2cc68
commit 45588b75cc
7 changed files with 136 additions and 8 deletions

View File

@ -1,11 +1,12 @@
# CppJieba ChangeLog
## v2.5.0 (untagged)
## v3.0.0 (untagged)
1. 使得 QuerySegment 支持自定义词典(可选参数)。
2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
3. 修改 Code Style ,参照 google code style 。
4. 增加更详细的错误日志,在初始化过程中合理使用LogFatal。
5. 增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。
## v2.4.4

80
src/Application.hpp Normal file
View File

@ -0,0 +1,80 @@
#ifndef CPPJIEBA_APPLICATION_H
#define CPPJIEBA_APPLICATION_H
#include "QuerySegment.hpp"
#include "PosTagger.hpp"
#include "KeywordExtractor.hpp"
namespace CppJieba {
// Selects which segmenter Application::cut dispatches to.
// The numeric values are the ones the compiler assigned implicitly before;
// they are only written out here.
enum CutMethod {
  METHOD_MP    = 0,  // handled by the MPSegment member
  METHOD_HMM   = 1,  // handled by the HMMSegment member
  METHOD_MIX   = 2,  // handled by the MixSegment member
  METHOD_FULL  = 3,  // handled by the FullSegment member
  METHOD_QUERY = 4   // handled by the QuerySegment member
};
// Single entry point bundling the CppJieba components: the five segmenters,
// the part-of-speech tagger and the keyword extractor. All of them are wired
// to one DictTrie and one HMMModel owned by this object.
class Application {
 public:
  // Builds every component from the files found under `dictDir`:
  //   jieba.dict.utf8, hmm_model.utf8, idf.utf8 and stop_words.utf8.
  // `explicit` prevents a string from silently converting into a whole
  // Application.
  explicit Application(const string& dictDir)
    : dictTrie_(pathJoin(dictDir, "jieba.dict.utf8")),
      model_(pathJoin(dictDir, "hmm_model.utf8")),
      mpSeg_(&dictTrie_),
      hmmSeg_(&model_),
      mixSeg_(&dictTrie_, &model_),
      fullSeg_(&dictTrie_),
      querySeg_(&dictTrie_, &model_),
      tagger_(&dictTrie_, &model_),
      extractor_(&dictTrie_,
                 &model_,
                 pathJoin(dictDir, "idf.utf8"),
                 pathJoin(dictDir, "stop_words.utf8")) {
  }

  ~Application() {
  }

  // Segments `sentence` into `words` using the segmenter chosen by `method`.
  // An unrecognised `method` is logged as an error and `words` is left
  // untouched.
  void cut(const string& sentence, vector<string>& words,
           CutMethod method) const {
    switch (method) {
      case METHOD_MP:
        mpSeg_.cut(sentence, words);
        break;
      case METHOD_HMM:
        hmmSeg_.cut(sentence, words);
        break;
      case METHOD_MIX:
        mixSeg_.cut(sentence, words);
        break;
      case METHOD_FULL:
        fullSeg_.cut(sentence, words);
        break;
      case METHOD_QUERY:
        querySeg_.cut(sentence, words);
        break;
      default:
        LogError("argument method is illegal.");
        break;
    }
  }

  // Forwards to PosTagger::tag; `res` receives pairs of strings
  // (presumably word and tag -- confirm against PosTagger).
  void tag(const string& str, vector<pair<string, string> >& res) const {
    tagger_.tag(str, res);
  }

  // Forwards to KeywordExtractor::extract, passing `topN` through unchanged;
  // `keyword` receives (string, double) pairs.
  void extract(const string& str, vector<pair<string, double> >& keyword,
               size_t topN) const {
    extractor_.extract(str, keyword, topN);
  }

 private:
  // Not copyable (declared, never defined). Every component below is
  // constructed from the address of dictTrie_ / model_; assuming they keep
  // those pointers, a member-wise copy would leave the copy's components
  // referring to the source object's dictionary and model.
  Application(const Application&);
  Application& operator=(const Application&);

  // dictTrie_ and model_ must stay declared first: members are constructed in
  // declaration order and everything after them takes their addresses.
  DictTrie dictTrie_;
  HMMModel model_;
  MPSegment mpSeg_;
  HMMSegment hmmSeg_;
  MixSegment mixSeg_;
  FullSegment fullSeg_;
  QuerySegment querySeg_;
  PosTagger tagger_;
  KeywordExtractor extractor_;
}; // class Application
} // namespace CppJieba
#endif // CPPJIEBA_APPLICATION_H

View File

@ -23,8 +23,8 @@ class QuerySegment: public SegmentBase {
maxWordLen_(maxWordLen) {
assert(maxWordLen_);
}
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
: mixSeg_(dictTrie, model), fullSeg_(dictTrie) {
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
}
virtual ~QuerySegment() {
}
@ -39,7 +39,6 @@ class QuerySegment: public SegmentBase {
vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
// if it's too long, cut with fullSeg_, put fullRes in res
if (mixResItr->size() > maxWordLen_) {
if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {

View File

@ -43,6 +43,10 @@ inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end)
return res;
}
// Convenience overload: encodes the whole Unicode sequence by handing its
// [begin, end) range to the iterator-based encode overload.
inline string encode(const Unicode& unicode) {
  const Unicode::const_iterator first = unicode.begin();
  const Unicode::const_iterator last = unicode.end();
  return encode(first, last);
}
// compiler is expected to optimized this function to avoid return value copy
inline Unicode decode(const string& str) {
Unicode unicode;
@ -50,7 +54,8 @@ inline Unicode decode(const string& str) {
decode(str, unicode);
return unicode;
}
}
}
} // namespace TransCode
} // namespace CppJieba
#endif

View File

@ -22,6 +22,7 @@ ADD_EXECUTABLE(test.run
TTrie.cpp
TSegments.cpp
TPosTagger.cpp
TApplication.cpp
)
TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,44 @@
#include "src/Application.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// End-to-end check of the Application facade: every cut method, tagging and
// keyword extraction, using the dictionaries under ../dict/.
// `result << words` serialises the container into `result` for comparison.
TEST(ApplicationTest, Test1) {
  Application app("../dict/");
  vector<string> words;
  string result;

  // Each expected token list must concatenate back to the input sentence, so
  // single-character tokens (e.g. "我", "。") appear as their own entries.
  app.cut("我来自北京邮电大学。", words, METHOD_MP);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);

  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
  result << words;
  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);

  // Two tokens are expected between "123456" and "AK47": the comma and "用".
  app.cut("我来自北京邮电大学。。。学号123456,用AK47", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);

  app.cut("他来到了网易杭研大厦", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  app.cut("我来自北京邮电大学", words, METHOD_FULL);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]", result);

  app.cut("他来到了网易杭研大厦", words, METHOD_QUERY);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  vector<pair<string, string> > tagres;
  app.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
  result << tagres;
  ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);

  vector<pair<string, double> > keywordres;
  app.extract("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。", keywordres, 5);
  result << keywordres;
  ASSERT_EQ("[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]", result);
}

View File

@ -3,8 +3,6 @@
using namespace CppJieba;
TEST(KeywordExtractorTest, Test1) {
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");