delete Application.hpp, use Jieba.hpp instead

This commit is contained in:
yanyiwu 2015-10-08 21:03:09 +08:00
parent 4d56be920b
commit 16b69e35c1
6 changed files with 139 additions and 246 deletions

View File

@ -5,15 +5,17 @@
#include <string.h>
#include "limonp/Config.hpp"
#include "husky/ThreadPoolServer.hpp"
#include "Application.hpp"
#include "Jieba.hpp"
using namespace husky;
using namespace CppJieba;
class ReqHandler: public IRequestHandler {
public:
ReqHandler(const CppJieba::Application& app): app_(app) {
ReqHandler(const CppJieba::Jieba& jieba)
: jieba_(jieba) {
}
virtual ~ReqHandler() {
}
@ -24,35 +26,35 @@ class ReqHandler: public IRequestHandler {
httpReq.GET("key", tmp);
URLDecode(tmp, sentence);
httpReq.GET("method", method);
app_.cut(sentence, words, CppJieba::METHOD_MIX);
jieba_.Cut(sentence, words, true);
httpReq.GET("format", format);
run(sentence, method, format, strSnd);
Run(sentence, method, format, strSnd);
return true;
}
virtual bool doPOST(const HttpReqInfo& httpReq, string& strSnd) {
vector<string> words;
run(httpReq.getBody(), "MIX", "simple", strSnd);
Run(httpReq.getBody(), "MIX", "simple", strSnd);
return true;
}
void run(const string& sentence,
void Run(const string& sentence,
const string& method,
const string& format,
string& strSnd) const {
vector<string> words;
if ("MP" == method) {
app_.cut(sentence, words, CppJieba::METHOD_MP);
jieba_.Cut(sentence, words, false);
} else if ("HMM" == method) {
app_.cut(sentence, words, CppJieba::METHOD_HMM);
jieba_.CutHMM(sentence, words);
} else if ("MIX" == method) {
app_.cut(sentence, words, CppJieba::METHOD_MIX);
jieba_.Cut(sentence, words, true);
} else if ("FULL" == method) {
app_.cut(sentence, words, CppJieba::METHOD_FULL);
jieba_.CutAll(sentence, words);
} else if ("QUERY" == method) {
app_.cut(sentence, words, CppJieba::METHOD_QUERY);
jieba_.CutForSearch(sentence, words);
} else { // default
app_.cut(sentence, words, CppJieba::METHOD_MIX);
jieba_.Cut(sentence, words, false);
}
if (format == "simple") {
join(words.begin(), words.end(), strSnd, " ");
@ -61,10 +63,10 @@ class ReqHandler: public IRequestHandler {
}
}
private:
const CppJieba::Application& app_;
const CppJieba::Jieba& jieba_;
};
bool run(int argc, char** argv) {
bool Run(int argc, char** argv) {
if (argc < 2) {
return false;
}
@ -78,24 +80,20 @@ bool run(int argc, char** argv) {
string dictPath = conf.get("dict_path", "");
string modelPath = conf.get("model_path", "");
string userDictPath = conf.get("user_dict_path", "");
string idfPath = conf.get("idf_path", "");
string stopWordsPath = conf.get("stop_words_path", "");
LogInfo("config info: %s", conf.getConfigInfo().c_str());
CppJieba::Application app(dictPath,
CppJieba::Jieba jieba(dictPath,
modelPath,
userDictPath,
idfPath,
stopWordsPath);
userDictPath);
ReqHandler reqHandler(app);
ReqHandler reqHandler(jieba);
ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler);
return sf.start();
}
int main(int argc, char* argv[]) {
if (!run(argc, argv)) {
if (!Run(argc, argv)) {
printf("usage: %s <config_file>\n", argv[0]);
return EXIT_FAILURE;
}

View File

@ -1,87 +0,0 @@
#ifndef CPPJIEBA_APPLICATION_H
#define CPPJIEBA_APPLICATION_H
#include "Jieba.hpp"
#include "PosTagger.hpp"
#include "KeywordExtractor.hpp"
namespace CppJieba {
// Selects which Jieba segmentation routine Application::cut dispatches to.
enum CutMethod {
// Application::cut calls Jieba::Cut.
METHOD_MP,
// Application::cut calls Jieba::CutHMM.
METHOD_HMM,
// Application::cut calls Jieba::Cut; this is the default method of
// Application::cut.
METHOD_MIX,
// Application::cut calls Jieba::CutAll.
METHOD_FULL,
// Application::cut calls Jieba::CutForSearch.
METHOD_QUERY,
// Application::cut calls Jieba::CutLevel.
METHOD_LEVEL
};
// Facade that bundles a Jieba segmenter with a POS tagger and a keyword
// extractor. The tagger and the extractor are constructed from the
// segmenter's dictionary trie and HMM model.
class Application {
 public:
  // Builds the segmenter from dict_path, model_path and user_dict_path.
  // idf_path and stopWords_path are forwarded to the keyword extractor only.
  Application(const string& dict_path,
              const string& model_path,
              const string& user_dict_path,
              const string& idf_path,
              const string& stopWords_path)
      : jieba_(dict_path, model_path, user_dict_path),
        tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
        extractor_(jieba_.GetDictTrie(),
                   jieba_.GetHMMModel(),
                   idf_path,
                   stopWords_path) {
  }

  // Segments `sentence` into `words` with the routine selected by `method`.
  // An unrecognised `method` is only logged; `words` is not touched then.
  void cut(const string& sentence, vector<string>& words,
           CutMethod method = METHOD_MIX) const {
    switch (method) {
      case METHOD_MP:
        // Pass the HMM flag explicitly: relying on Cut's default made
        // METHOD_MP behave exactly like METHOD_MIX.
        jieba_.Cut(sentence, words, false);
        break;
      case METHOD_HMM:
        jieba_.CutHMM(sentence, words);
        break;
      case METHOD_MIX:
        jieba_.Cut(sentence, words, true);
        break;
      case METHOD_FULL:
        jieba_.CutAll(sentence, words);
        break;
      case METHOD_QUERY:
        jieba_.CutForSearch(sentence, words);
        break;
      case METHOD_LEVEL:
        jieba_.CutLevel(sentence, words);
        break;
      default:
        LogError("argument method is illegal.");
    }
  }

  // Segments `sentence` via Jieba::CutLevel, pairing each word with a
  // size_t value produced by that routine.
  void cut(const string& sentence,
           vector<pair<string, size_t> >& words) const {
    jieba_.CutLevel(sentence, words);
  }

  // Segments `sentence` via Jieba::CutSmall, forwarding `max_word_len`.
  void cut(const string& sentence,
           vector<string>& words, size_t max_word_len) const {
    jieba_.CutSmall(sentence, words, max_word_len);
  }

  // Forwards to Jieba::InsertUserWord and returns its result.
  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
    return jieba_.InsertUserWord(word, tag);
  }

  // Tags `str` with the POS tagger, writing (word, tag) pairs into `res`.
  void tag(const string& str, vector<pair<string, string> >& res) const {
    tagger_.tag(str, res);
  }

  // Runs the keyword extractor on `str`, forwarding `topN`; results are
  // written into `keyword` as (word, double) pairs.
  void extract(const string& str, vector<pair<string, double> >& keyword,
               size_t topN) const {
    extractor_.extract(str, keyword, topN);
  }

  ~Application() {
  }

 private:
  // jieba_ must stay declared first: members are initialised in declaration
  // order and the other two are constructed from jieba_'s trie and model.
  Jieba jieba_;
  PosTagger tagger_;
  KeywordExtractor extractor_;
}; // class Application
} // namespace CppJieba
#endif // CPPJIEBA_APPLICATION_H

View File

@ -1,59 +1,49 @@
#include "../src/Application.hpp"
#include "../src/Jieba.hpp"
using namespace CppJieba;
using namespace std;
int main(int argc, char** argv) {
CppJieba::Application app("../dict/jieba.dict.utf8",
CppJieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
"../dict/user.dict.utf8");
vector<string> words;
string result;
string s = "我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。";
cout << "[demo] DEFAULT" << endl;
app.cut(s, words);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Cut With HMM" << endl;
jieba.Cut(s, words, true);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << "[demo] METHOD_MP" << endl;
app.cut(s, words, METHOD_MP);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Cut Without HMM " << endl;
jieba.Cut(s, words, false);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << "[demo] METHOD_HMM" << endl;
app.cut(s, words, METHOD_HMM);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] CutAll" << endl;
jieba.CutAll(s, words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << "[demo] METHOD_MIX" << endl;
app.cut(s, words, METHOD_MIX);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] METHOD_FULL" << endl;
app.cut(s, words, METHOD_FULL);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] METHOD_QUERY" << endl;
app.cut(s, words, METHOD_QUERY);
cout << join(words.begin(), words.end(), "/") << endl;
cout << "[demo] TAGGING" << endl;
vector<pair<string, string> > tagres;
app.tag(s, tagres);
cout << s << endl;
cout << tagres << endl;;
cout << "[demo] KEYWORD" << endl;
vector<pair<string, double> > keywordres;
app.extract(s, keywordres, 5);
cout << s << endl;
cout << keywordres << endl;
cout << "[demo] CutForSearch" << endl;
jieba.CutForSearch(s, words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Insert User Word" << endl;
app.cut("男默女泪", words);
cout << join(words.begin(), words.end(), "/") << endl;
app.InsertUserWord("男默女泪");
app.cut("男默女泪", words);
cout << join(words.begin(), words.end(), "/") << endl;
jieba.Cut("男默女泪", words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
jieba.InsertUserWord("男默女泪");
jieba.Cut("男默女泪", words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
//cout << "[demo] TAGGING" << endl;
//vector<pair<string, string> > tagres;
//jieba.tag(s, tagres);
//cout << s << endl;
//cout << tagres << endl;;
//cout << "[demo] KEYWORD" << endl;
//vector<pair<string, double> > keywordres;
//jieba.extract(s, keywordres, 5);
//cout << s << endl;
//cout << keywordres << endl;
return EXIT_SUCCESS;
}

View File

@ -13,7 +13,7 @@ ADD_EXECUTABLE(test.run
trie_test.cpp
segments_test.cpp
pos_tagger_test.cpp
application_test.cpp
jieba_test.cpp
pre_filter_test.cpp
)
TARGET_LINK_LIBRARIES(gtest pthread)

View File

@ -1,93 +0,0 @@
#include "src/Application.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Exercises every Application entry point (all cut methods, level cut,
// tagging and keyword extraction) against fixed expected renderings.
TEST(ApplicationTest, Test1) {
  CppJieba::Application app("../dict/jieba.dict.utf8",
                            "../dict/hmm_model.utf8",
                            "../dict/user.dict.utf8",
                            "../dict/idf.utf8",
                            "../dict/stop_words.utf8");
  vector<string> words;
  string result;

  // Default method (METHOD_MIX).
  app.cut("他来到了网易杭研大厦", words);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  app.cut("我来自北京邮电大学。", words, METHOD_MP);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);

  // The integer literal selects the size_t overload (max word length).
  app.cut("南京市长江大桥", words, 3);
  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);

  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
  result << words;
  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);

  // NOTE(review): the full-width comma in this input and in the expected
  // tokens was reconstructed (two single-character tokens precede "AK47");
  // confirm against the upstream test.
  app.cut("我来自北京邮电大学。。。学号123456，用AK47", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \"，\", \"用\", \"AK47\"]", result);

  app.cut("他来到了网易杭研大厦", words, METHOD_MIX);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  app.cut("我来自北京邮电大学", words, METHOD_FULL);
  result << words;
  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");

  app.cut("他来到了网易杭研大厦", words, METHOD_QUERY);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  app.cut("南京市长江大桥", words, METHOD_LEVEL);
  result << words;
  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result);

  vector<pair<string, size_t> > word_levels;
  app.cut("南京市长江大桥", word_levels);
  result << word_levels;
  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);

  vector<pair<string, string> > tagres;
  app.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
  result << tagres;
  ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);

  vector<pair<string, double> > keywordres;
  app.extract("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。", keywordres, 5);
  result << keywordres;
  ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
}
// Checks that a word registered through InsertUserWord is subsequently
// returned as a single token by the default cut.
TEST(ApplicationTest, InsertUserWord) {
  CppJieba::Application application("../dict/jieba.dict.utf8",
                                    "../dict/hmm_model.utf8",
                                    "../dict/user.dict.utf8",
                                    "../dict/idf.utf8",
                                    "../dict/stop_words.utf8");
  const string phrase = "男默女泪";
  vector<string> tokens;
  string rendered;

  // Before insertion the phrase is split in two.
  application.cut(phrase, tokens);
  rendered << tokens;
  ASSERT_EQ("[\"男默\", \"女泪\"]", rendered);

  // After insertion it comes back as one token.
  ASSERT_TRUE(application.InsertUserWord(phrase));
  application.cut(phrase, tokens);
  rendered << tokens;
  ASSERT_EQ("[\"男默女泪\"]", rendered);

  // Repeat with 100 words generated from rand().
  for (size_t round = 0; round != 100; ++round) {
    string generated;
    generated << rand();
    ASSERT_TRUE(application.InsertUserWord(generated));
    application.cut(generated, tokens);
    rendered << tokens;
    ASSERT_EQ(rendered, string_format("[\"%s\"]", generated.c_str()));
  }
}

View File

@ -0,0 +1,85 @@
#include "src/Jieba.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Exercises each Jieba segmentation routine against fixed expected
// renderings of the resulting token lists.
TEST(JiebaTest, Test1) {
  CppJieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
                        "../dict/user.dict.utf8");
  vector<string> words;
  string result;

  // Cut with its default third argument.
  jieba.Cut("他来到了网易杭研大厦", words);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  jieba.Cut("我来自北京邮电大学。", words, false);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);

  jieba.CutSmall("南京市长江大桥", words, 3);
  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);

  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
  result << words;
  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);

  // NOTE(review): the full-width comma in this input and in the expected
  // tokens was reconstructed (two single-character tokens precede "AK47");
  // confirm against the upstream test.
  jieba.Cut("我来自北京邮电大学。。。学号123456，用AK47", words);
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \"，\", \"用\", \"AK47\"]", result);

  jieba.CutAll("我来自北京邮电大学", words);
  result << words;
  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");

  jieba.CutForSearch("他来到了网易杭研大厦", words);
  result << words;
  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

  jieba.CutLevel("南京市长江大桥", words);
  result << words;
  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result);

  vector<pair<string, size_t> > word_levels;
  jieba.CutLevel("南京市长江大桥", word_levels);
  result << word_levels;
  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);

  //vector<pair<string, string> > tagres;
  //jieba.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
  //result << tagres;
  //ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);
  //vector<pair<string, double> > keywordres;
  //jieba.extract("我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。", keywordres, 5);
  //result << keywordres;
  //ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
}
// Checks that a word registered through Jieba::InsertUserWord is
// subsequently returned as a single token by Cut.
TEST(JiebaTest, InsertUserWord) {
  CppJieba::Jieba segmenter("../dict/jieba.dict.utf8",
                            "../dict/hmm_model.utf8",
                            "../dict/user.dict.utf8");
  const string phrase = "男默女泪";
  vector<string> tokens;
  string rendered;

  // Before insertion the phrase is split in two.
  segmenter.Cut(phrase, tokens);
  rendered << tokens;
  ASSERT_EQ("[\"男默\", \"女泪\"]", rendered);

  // After insertion it comes back as one token.
  ASSERT_TRUE(segmenter.InsertUserWord(phrase));
  segmenter.Cut(phrase, tokens);
  rendered << tokens;
  ASSERT_EQ("[\"男默女泪\"]", rendered);

  // Repeat with 100 words generated from rand().
  for (size_t round = 0; round != 100; ++round) {
    string generated;
    generated << rand();
    ASSERT_TRUE(segmenter.InsertUserWord(generated));
    segmenter.Cut(generated, tokens);
    rendered << tokens;
    ASSERT_EQ(rendered, string_format("[\"%s\"]", generated.c_str()));
  }
}