Replace the CppJieba::Application facade with direct use of CppJieba::Jieba.

What the hunks below do:
  * server/server.cpp: ReqHandler stores a `const CppJieba::Jieba&` instead of
    an Application reference, `run` becomes `Run`, and each METHOD_* dispatch
    is replaced by the matching Jieba call (Cut / CutHMM / CutAll /
    CutForSearch). The idf_path and stop_words_path config reads are dropped.
  * src/Application.hpp: deleted.
  * test/demo.cpp: uses Jieba directly; the tagging and keyword demos are
    commented out.
  * test/unittest: application_test.cpp is replaced by jieba_test.cpp, with
    the tagging and keyword assertions commented out.

Review amendment:
  In ReqHandler::Run the "// default" fallback used METHOD_MIX before this
  patch. As submitted, the replacement was Cut(sentence, words, false), the
  same call as the "MP" branch. It is changed to Cut(sentence, words, true) so
  that an unrecognised or missing `method` keeps behaving like "MIX", which is
  also what doPOST passes. The `index` hash on server/server.cpp predates this
  amendment.

NOTE(review): doGET still calls jieba_.Cut(sentence, words, true) and then
  ignores `words`, since Run() segments again into its own vector. This
  mirrors the pre-patch code and is left untouched here.
NOTE(review): text inside angle brackets (the header after the first
  `#include`, template arguments after `vector`, part of the usage string)
  appears to be missing from this copy of the patch. It is reproduced as
  received, and indentation inside hunk lines is reconstructed. Verify
  against the tree before applying.

diff --git a/server/server.cpp b/server/server.cpp
index 63c86aa..8c8274c 100644
--- a/server/server.cpp
+++ b/server/server.cpp
@@ -5,15 +5,17 @@
 #include
 #include "limonp/Config.hpp"
 #include "husky/ThreadPoolServer.hpp"
-#include "Application.hpp"
+#include "Jieba.hpp"
 
 using namespace husky;
 using namespace CppJieba;
 
 class ReqHandler: public IRequestHandler {
  public:
-  ReqHandler(const CppJieba::Application& app): app_(app) {
+  ReqHandler(const CppJieba::Jieba& jieba)
+    : jieba_(jieba) {
   }
+
   virtual ~ReqHandler() {
   }
 
@@ -24,35 +26,35 @@ class ReqHandler: public IRequestHandler {
     httpReq.GET("key", tmp);
     URLDecode(tmp, sentence);
     httpReq.GET("method", method);
-    app_.cut(sentence, words, CppJieba::METHOD_MIX);
+    jieba_.Cut(sentence, words, true);
     httpReq.GET("format", format);
-    run(sentence, method, format, strSnd);
+    Run(sentence, method, format, strSnd);
     return true;
   }
 
   virtual bool doPOST(const HttpReqInfo& httpReq, string& strSnd) {
     vector words;
-    run(httpReq.getBody(), "MIX", "simple", strSnd);
+    Run(httpReq.getBody(), "MIX", "simple", strSnd);
     return true;
   }
 
-  void run(const string& sentence,
+  void Run(const string& sentence,
            const string& method,
            const string& format,
            string& strSnd) const {
     vector words;
     if ("MP" == method) {
-      app_.cut(sentence, words, CppJieba::METHOD_MP);
+      jieba_.Cut(sentence, words, false);
     } else if ("HMM" == method) {
-      app_.cut(sentence, words, CppJieba::METHOD_HMM);
+      jieba_.CutHMM(sentence, words);
     } else if ("MIX" == method) {
-      app_.cut(sentence, words, CppJieba::METHOD_MIX);
+      jieba_.Cut(sentence, words, true);
     } else if ("FULL" == method) {
-      app_.cut(sentence, words, CppJieba::METHOD_FULL);
+      jieba_.CutAll(sentence, words);
     } else if ("QUERY" == method) {
-      app_.cut(sentence, words, CppJieba::METHOD_QUERY);
+      jieba_.CutForSearch(sentence, words);
     } else { // default
-      app_.cut(sentence, words, CppJieba::METHOD_MIX);
+      jieba_.Cut(sentence, words, true);
     }
     if (format == "simple") {
       join(words.begin(), words.end(), strSnd, " ");
@@ -61,10 +63,10 @@ class ReqHandler: public IRequestHandler {
     }
   }
  private:
-  const CppJieba::Application& app_;
+  const CppJieba::Jieba& jieba_;
 };
 
-bool run(int argc, char** argv) {
+bool Run(int argc, char** argv) {
   if (argc < 2) {
     return false;
   }
@@ -78,24 +80,20 @@ bool run(int argc, char** argv) {
   string dictPath = conf.get("dict_path", "");
   string modelPath = conf.get("model_path", "");
   string userDictPath = conf.get("user_dict_path", "");
-  string idfPath = conf.get("idf_path", "");
-  string stopWordsPath = conf.get("stop_words_path", "");
 
   LogInfo("config info: %s", conf.getConfigInfo().c_str());
 
-  CppJieba::Application app(dictPath,
+  CppJieba::Jieba jieba(dictPath,
         modelPath,
-        userDictPath,
-        idfPath,
-        stopWordsPath);
+        userDictPath);
 
-  ReqHandler reqHandler(app);
+  ReqHandler reqHandler(jieba);
   ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler);
   return sf.start();
 }
 
 int main(int argc, char* argv[]) {
-  if (!run(argc, argv)) {
+  if (!Run(argc, argv)) {
     printf("usage: %s \n", argv[0]);
     return EXIT_FAILURE;
   }
diff --git a/src/Application.hpp b/src/Application.hpp
deleted file mode 100644
index d7d8789..0000000
--- a/src/Application.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef CPPJIEBA_APPLICATION_H
-#define CPPJIEBA_APPLICATION_H
-
-#include "Jieba.hpp"
-#include "PosTagger.hpp"
-#include "KeywordExtractor.hpp"
-
-namespace CppJieba {
-
-enum CutMethod {
-  METHOD_MP,
-  METHOD_HMM,
-  METHOD_MIX,
-  METHOD_FULL,
-  METHOD_QUERY,
-  METHOD_LEVEL
-};
-
-class Application {
- public:
-  Application(const string& dict_path,
-              const string& model_path,
-              const string& user_dict_path,
-              const string& idf_path,
-              const string& stopWords_path)
-    : jieba_(dict_path, model_path, user_dict_path),
-      tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
-      extractor_(jieba_.GetDictTrie(),
-                 jieba_.GetHMMModel(),
-                 idf_path,
-                 stopWords_path) {
-  }
-  void cut(const string& sentence, vector& words,
-           CutMethod method = METHOD_MIX) const {
-    switch(method) {
-      case METHOD_MP:
-        jieba_.Cut(sentence, words);
-        break;
-      case METHOD_HMM:
-        jieba_.CutHMM(sentence, words);
-        break;
-      case METHOD_MIX:
-        jieba_.Cut(sentence, words);
-        break;
-      case METHOD_FULL:
-        jieba_.CutAll(sentence, words);
-        break;
-      case METHOD_QUERY:
-        jieba_.CutForSearch(sentence, words);
-        break;
-      case METHOD_LEVEL:
-        jieba_.CutLevel(sentence, words);
-        break;
-      default:
-        LogError("argument method is illegal.");
-    }
-  }
-  void cut(const string& sentence,
-           vector >& words) const {
-    jieba_.CutLevel(sentence, words);
-  }
-  void cut(const string& sentence,
-           vector& words, size_t max_word_len) const {
-    jieba_.CutSmall(sentence, words, max_word_len);
-  }
-  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
-    return jieba_.InsertUserWord(word, tag);
-  }
-  void tag(const string& str, vector >& res) const {
-    tagger_.tag(str, res);
-  }
-  void extract(const string& str, vector >& keyword,
-               size_t topN) const {
-    extractor_.extract(str, keyword, topN);
-  }
-  ~Application() {
-  }
-
- private:
-  Jieba jieba_;
-  PosTagger tagger_;
-  KeywordExtractor extractor_;
-}; // class Application
-
-} // namespace CppJieba
-
-#endif // CPPJIEBA_APPLICATION_H
diff --git a/test/demo.cpp b/test/demo.cpp
index 7464bae..ab1c21d 100644
--- a/test/demo.cpp
+++ b/test/demo.cpp
@@ -1,59 +1,49 @@
-#include "../src/Application.hpp"
+#include "../src/Jieba.hpp"
 
-using namespace CppJieba;
+using namespace std;
 
 int main(int argc, char** argv) {
-  CppJieba::Application app("../dict/jieba.dict.utf8",
+  CppJieba::Jieba jieba("../dict/jieba.dict.utf8",
                         "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+                        "../dict/user.dict.utf8");
   vector words;
   string result;
   string s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
 
-  cout << "[demo] DEFAULT" << endl;
-  app.cut(s, words);
-  cout << join(words.begin(), words.end(), "/") << endl;
+  cout << "[demo] Cut With HMM" << endl;
+  jieba.Cut(s, words, true);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
 
-  cout << "[demo] METHOD_MP" << endl;
-  app.cut(s, words, METHOD_MP);
-  cout << join(words.begin(), words.end(), "/") << endl;
+  cout << "[demo] Cut Without HMM " << endl;
+  jieba.Cut(s, words, false);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
 
-  cout << "[demo] METHOD_HMM" << endl;
-  app.cut(s, words, METHOD_HMM);
-  cout << join(words.begin(), words.end(), "/") << endl;
+  cout << "[demo] CutAll" << endl;
+  jieba.CutAll(s, words);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
 
-  cout << "[demo] METHOD_MIX" << endl;
-  app.cut(s, words, METHOD_MIX);
-  cout << join(words.begin(), words.end(), "/") << endl;
-
-  cout << "[demo] METHOD_FULL" << endl;
-  app.cut(s, words, METHOD_FULL);
-  cout << join(words.begin(), words.end(), "/") << endl;
-
-  cout << "[demo] METHOD_QUERY" << endl;
-  app.cut(s, words, METHOD_QUERY);
-  cout << join(words.begin(), words.end(), "/") << endl;
-
-  cout << "[demo] TAGGING" << endl;
-  vector > tagres;
-  app.tag(s, tagres);
-  cout << s << endl;
-  cout << tagres << endl;;
-
-  cout << "[demo] KEYWORD" << endl;
-  vector > keywordres;
-  app.extract(s, keywordres, 5);
-  cout << s << endl;
-  cout << keywordres << endl;
+  cout << "[demo] CutForSearch" << endl;
+  jieba.CutForSearch(s, words);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
 
   cout << "[demo] Insert User Word" << endl;
-  app.cut("男默女泪", words);
-  cout << join(words.begin(), words.end(), "/") << endl;
-  app.InsertUserWord("男默女泪");
-  app.cut("男默女泪", words);
-  cout << join(words.begin(), words.end(), "/") << endl;
+  jieba.Cut("男默女泪", words);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
+  jieba.InsertUserWord("男默女泪");
+  jieba.Cut("男默女泪", words);
+  cout << limonp::join(words.begin(), words.end(), "/") << endl;
+
+  //cout << "[demo] TAGGING" << endl;
+  //vector > tagres;
+  //jieba.tag(s, tagres);
+  //cout << s << endl;
+  //cout << tagres << endl;;
+
+  //cout << "[demo] KEYWORD" << endl;
+  //vector > keywordres;
+  //jieba.extract(s, keywordres, 5);
+  //cout << s << endl;
+  //cout << keywordres << endl;
 
   return EXIT_SUCCESS;
 }
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index 6d2f67d..38ddf1e 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -13,7 +13,7 @@ ADD_EXECUTABLE(test.run
     trie_test.cpp
     segments_test.cpp
     pos_tagger_test.cpp
-    application_test.cpp
+    jieba_test.cpp
     pre_filter_test.cpp
     )
 TARGET_LINK_LIBRARIES(gtest pthread)
diff --git a/test/unittest/application_test.cpp b/test/unittest/application_test.cpp
deleted file mode 100644
index bd34d8c..0000000
--- a/test/unittest/application_test.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "src/Application.hpp"
-#include "gtest/gtest.h"
-
-using namespace CppJieba;
-
-TEST(ApplicationTest, Test1) {
-  CppJieba::Application app("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
-  vector words;
-  string result;
-
-  app.cut("他来到了网易杭研大厦", words);
-  result << words;
-  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-
-  app.cut("我来自北京邮电大学。", words, METHOD_MP);
-  result << words;
-  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
-
-  app.cut("南京市长江大桥", words, 3);
-  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
-
-  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
-  result << words;
-  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
-
-  app.cut("我来自北京邮电大学。。。学号123456,用AK47", words, METHOD_MIX);
-  result << words;
-  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
-
-  app.cut("他来到了网易杭研大厦", words, METHOD_MIX);
-  result << words;
-  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-
-  app.cut("我来自北京邮电大学", words, METHOD_FULL);
-  result << words;
-  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
-
-  app.cut("他来到了网易杭研大厦", words, METHOD_QUERY);
-  result << words;
-  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-
-  app.cut("南京市长江大桥", words, METHOD_LEVEL);
-  result << words;
-  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result);
-
-  vector > word_levels;
-  app.cut("南京市长江大桥", word_levels);
-  result << word_levels;
-  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);
-
-  vector > tagres;
-  app.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
-  result << tagres;
-  ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);
-
-  vector > keywordres;
-  app.extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5);
-  result << keywordres;
-  ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
-}
-
-TEST(ApplicationTest, InsertUserWord) {
-  CppJieba::Application app("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
-  vector words;
-  string result;
-
-  app.cut("男默女泪", words);
-  result << words;
-  ASSERT_EQ("[\"男默\", \"女泪\"]", result);
-
-  ASSERT_TRUE(app.InsertUserWord("男默女泪"));
-
-  app.cut("男默女泪", words);
-  result << words;
-  ASSERT_EQ("[\"男默女泪\"]", result);
-
-  for (size_t i = 0; i < 100; i++) {
-    string newWord;
-    newWord << rand();
-    ASSERT_TRUE(app.InsertUserWord(newWord));
-    app.cut(newWord, words);
-    result << words;
-    ASSERT_EQ(result, string_format("[\"%s\"]", newWord.c_str()));
-  }
-}
diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp
new file mode 100644
index 0000000..e5ec8fb
--- /dev/null
+++ b/test/unittest/jieba_test.cpp
@@ -0,0 +1,85 @@
+#include "src/Jieba.hpp"
+#include "gtest/gtest.h"
+
+using namespace CppJieba;
+
+TEST(JiebaTest, Test1) {
+  CppJieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8");
+  vector words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
+  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+
+  jieba.CutAll("我来自北京邮电大学", words);
+  result << words;
+  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
+
+  jieba.CutForSearch("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  jieba.CutLevel("南京市长江大桥", words);
+  result << words;
+  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result);
+
+  vector > word_levels;
+  jieba.CutLevel("南京市长江大桥", word_levels);
+  result << word_levels;
+  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);
+
+  //vector > tagres;
+  //jieba.tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
+  //result << tagres;
+  //ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);
+
+  //vector > keywordres;
+  //jieba.extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5);
+  //result << keywordres;
+  //ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
+}
+
+TEST(JiebaTest, InsertUserWord) {
+  CppJieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8");
+  vector words;
+  string result;
+
+  jieba.Cut("男默女泪", words);
+  result << words;
+  ASSERT_EQ("[\"男默\", \"女泪\"]", result);
+
+  ASSERT_TRUE(jieba.InsertUserWord("男默女泪"));
+
+  jieba.Cut("男默女泪", words);
+  result << words;
+  ASSERT_EQ("[\"男默女泪\"]", result);
+
+  for (size_t i = 0; i < 100; i++) {
+    string newWord;
+    newWord << rand();
+    ASSERT_TRUE(jieba.InsertUserWord(newWord));
+    jieba.Cut(newWord, words);
+    result << words;
+    ASSERT_EQ(result, string_format("[\"%s\"]", newWord.c_str()));
+  }
+}