mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。
This commit is contained in:
parent
d56bf2cc68
commit
45588b75cc
@ -1,11 +1,12 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
## v2.5.0 (untagged)
|
||||
## v3.0.0 (untagged)
|
||||
|
||||
1. 使得 QuerySegment 支持自定义词典(可选参数)。
|
||||
2. 使得 KeywordExtractor 支持自定义词典(可选参数)。
|
||||
3. 修改 Code Style ,参照 google code style 。
|
||||
4. 增加更详细的错误日志,在初始化过程中合理使用LogFatal。
|
||||
5. 增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。
|
||||
|
||||
## v2.4.4
|
||||
|
||||
|
80
src/Application.hpp
Normal file
80
src/Application.hpp
Normal file
@ -0,0 +1,80 @@
|
||||
#ifndef CPPJIEBA_APPLICATION_H
|
||||
#define CPPJIEBA_APPLICATION_H
|
||||
|
||||
#include "QuerySegment.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
#include "KeywordExtractor.hpp"
|
||||
|
||||
namespace CppJieba {
|
||||
|
||||
// Selects which segmenter Application::cut() forwards a sentence to.
// Any value outside this list makes cut() log an error instead of cutting.
enum CutMethod {
|
||||
METHOD_MP, // handled by the MPSegment member (mpSeg_)
|
||||
METHOD_HMM, // handled by the HMMSegment member (hmmSeg_)
|
||||
METHOD_MIX, // handled by the MixSegment member (mixSeg_)
|
||||
METHOD_FULL, // handled by the FullSegment member (fullSeg_)
|
||||
METHOD_QUERY // handled by the QuerySegment member (querySeg_)
|
||||
};
|
||||
|
||||
class Application {
|
||||
public:
|
||||
Application(const string& dictDir)
|
||||
: dictTrie_(pathJoin(dictDir, "jieba.dict.utf8")),
|
||||
model_(pathJoin(dictDir, "hmm_model.utf8")),
|
||||
mpSeg_(&dictTrie_),
|
||||
hmmSeg_(&model_),
|
||||
mixSeg_(&dictTrie_, &model_),
|
||||
fullSeg_(&dictTrie_),
|
||||
querySeg_(&dictTrie_, &model_),
|
||||
tagger_(&dictTrie_, &model_),
|
||||
extractor_(&dictTrie_,
|
||||
&model_,
|
||||
pathJoin(dictDir, "idf.utf8"),
|
||||
pathJoin(dictDir, "stop_words.utf8")) {
|
||||
}
|
||||
void cut(const string& sentence, vector<string>& words,
|
||||
CutMethod method) const {
|
||||
switch(method) {
|
||||
case METHOD_MP:
|
||||
mpSeg_.cut(sentence, words);
|
||||
break;
|
||||
case METHOD_HMM:
|
||||
hmmSeg_.cut(sentence, words);
|
||||
break;
|
||||
case METHOD_MIX:
|
||||
mixSeg_.cut(sentence, words);
|
||||
break;
|
||||
case METHOD_FULL:
|
||||
fullSeg_.cut(sentence, words);
|
||||
break;
|
||||
case METHOD_QUERY:
|
||||
querySeg_.cut(sentence, words);
|
||||
break;
|
||||
default:
|
||||
LogError("argument method is illegal.");
|
||||
}
|
||||
}
|
||||
void tag(const string& str, vector<pair<string, string> >& res) const {
|
||||
tagger_.tag(str, res);
|
||||
}
|
||||
void extract(const string& str, vector<pair<string, double> >& keyword,
|
||||
size_t topN) const {
|
||||
extractor_.extract(str, keyword, topN);
|
||||
}
|
||||
~Application() {
|
||||
}
|
||||
|
||||
private:
|
||||
DictTrie dictTrie_;
|
||||
HMMModel model_;
|
||||
MPSegment mpSeg_;
|
||||
HMMSegment hmmSeg_;
|
||||
MixSegment mixSeg_;
|
||||
FullSegment fullSeg_;
|
||||
QuerySegment querySeg_;
|
||||
PosTagger tagger_;
|
||||
KeywordExtractor extractor_;
|
||||
}; // class Application
|
||||
|
||||
} // namespace CppJieba
|
||||
|
||||
#endif // CPPJIEBA_APPLICATION_H
|
@ -23,8 +23,8 @@ class QuerySegment: public SegmentBase {
|
||||
maxWordLen_(maxWordLen) {
|
||||
assert(maxWordLen_);
|
||||
}
|
||||
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: mixSeg_(dictTrie, model), fullSeg_(dictTrie) {
|
||||
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
|
||||
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
|
||||
}
|
||||
virtual ~QuerySegment() {
|
||||
}
|
||||
@ -39,7 +39,6 @@ class QuerySegment: public SegmentBase {
|
||||
|
||||
vector<Unicode> fullRes;
|
||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
|
||||
// if it's too long, cut with fullSeg_, put fullRes in res
|
||||
if (mixResItr->size() > maxWordLen_) {
|
||||
if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
|
||||
|
@ -43,6 +43,10 @@ inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
return res;
|
||||
}
|
||||
|
||||
// Convenience overload: encodes the whole of `unicode` by forwarding its
// begin()/end() range to the iterator-pair overload of encode().
inline string encode(const Unicode& unicode) {
|
||||
return encode(unicode.begin(), unicode.end());
|
||||
}
|
||||
|
||||
// compiler is expected to optimize this function to avoid return value copy
|
||||
inline Unicode decode(const string& str) {
|
||||
Unicode unicode;
|
||||
@ -50,7 +54,8 @@ inline Unicode decode(const string& str) {
|
||||
decode(str, unicode);
|
||||
return unicode;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace TransCode
|
||||
} // namespace CppJieba
|
||||
|
||||
#endif
|
||||
|
@ -22,6 +22,7 @@ ADD_EXECUTABLE(test.run
|
||||
TTrie.cpp
|
||||
TSegments.cpp
|
||||
TPosTagger.cpp
|
||||
TApplication.cpp
|
||||
)
|
||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||
|
44
test/unittest/TApplication.cpp
Normal file
44
test/unittest/TApplication.cpp
Normal file
@ -0,0 +1,44 @@
|
||||
#include "src/Application.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
// End-to-end check of the Application facade: every CutMethod, then tagging,
// then keyword extraction, all against the dictionaries under "../dict/".
// Every ASSERT_EQ is written as (expected, actual) so that failure output
// labels the two values consistently.
TEST(ApplicationTest, Test1) {
  Application app("../dict/");
  vector<string> words;
  // NOTE(review): `result << container` relies on an operator<< supplied by
  // the project headers (not visible here); the assertions below only hold
  // if it overwrites `result` rather than appending - confirm.
  string result;

  app.cut("我来自北京邮电大学。", words, METHOD_MP);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);

  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
  result << words;
  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);

  app.cut("我来自北京邮电大学。。。学号123456,用AK47", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);

  app.cut("他来到了网易杭研大厦", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  app.cut("我来自北京邮电大学", words, METHOD_FULL);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]", result);

  app.cut("他来到了网易杭研大厦", words, METHOD_QUERY);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  vector<pair<string, string> > tagres;
  app.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
  result << tagres;
  ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);

  vector<pair<string, double> > keywordres;
  app.extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5);
  result << keywordres;
  ASSERT_EQ("[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]", result);
}
|
@ -3,8 +3,6 @@
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
|
||||
|
||||
TEST(KeywordExtractorTest, Test1) {
|
||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user