diff --git a/CMakeLists.txt b/CMakeLists.txt index bbc1d96..642a5e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,9 +2,10 @@ PROJECT(CPPJIEBA) CMAKE_MINIMUM_REQUIRED (VERSION 2.6) if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + # Only replace the built-in default; a prefix chosen by the user is kept. - set (CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "default install path" FORCE ) + set (CMAKE_INSTALL_PREFIX "/usr/local/cppjieba" CACHE PATH "default install path" FORCE ) endif() ADD_DEFINITIONS(-O3 -Wall -g) IF(APPLE) # mac os diff --git a/ChangeLog.md b/ChangeLog.md index 7e4987a..3a03e19 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -7,6 +7,8 @@ 3. 修改 Code Style ,参照 google code style 。 4. 增加更详细的错误日志,在初始化过程中合理使用LogFatal。 5. 增加 Application 这个类,整合了所有CppJieba的功能进去,以后用户只需要使用这个类即可。 +6. 修改 cjserver 服务,可以通过http参数使用不同切词算法进行切词。 +7. 修改 make install 的安装目录,统一安装到同一个目录 /usr/local/cppjieba 。 ## v2.4.4 diff --git a/README.md b/README.md index d1be056..7b5d8ff 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ CppJieba是"结巴"中文分词的C++版本 + 源代码都写进头文件`src/*.hpp`里,`include`即可使用。 + 支持`utf-8, gbk`编码,但是推荐使用`utf-8`编码, 因为`gbk`编码缺少严格测试,慎用。 -+ 内置分词服务`server/server.cpp`,在linux环境下可安装使用(可选)。 ++ 内置分词服务`server/server.cpp`,在linux环境下可安装使用(可选),可通过http参数选择不同分词算法进行分词。 + 项目自带较为完善的单元测试,核心功能中文分词(utf8)的稳定性接受过线上环境检验。 + 支持载自定义用户词典。 + 支持 `linux` , `mac osx` 操作系统。 @@ -59,7 +59,7 @@ make ### 启动服务 ``` -./bin/cjserver ../test/testdata/server.conf +./bin/cjserver ../conf/server_example.conf ``` ### 客户端请求示例 @@ -80,8 +80,18 @@ curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple" 南京市 长江大桥 ``` -用 chrome 浏览器打开也行 ( chrome 设置默认编码是`utf-8`): +默认切词算法是MixSegment切词算法,如果想要使用其他算法切词,可以使用参数method来设置。 +示例如下: +``` +curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple&method=MP" +curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple&method=HMM" +curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple&method=MIX" +curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple&method=FULL" +curl
"http://127.0.0.1:11200/?key=南京市长江大桥&format=simple&method=QUERY" +``` + +用 chrome 浏览器打开也行 ( chrome 设置默认编码是`utf-8`): 同时,也支持HTTP POST模式,使用如下调用: @@ -107,15 +117,15 @@ sudo make install ### 服务启动和停止(仅限 linux 系统) ``` -/etc/init.d/cjserver.start >> /dev/null 2>&1 -/etc/init.d/cjserver.stop +cd /usr/local/cppjieba +./script/cjserver.start +./script/cjserver.stop ``` ### 卸载服务(仅限 linux 系统) ```sh -cd build/ -cat install_manifest.txt | sudo xargs rm -rf +sudo rm -rf /usr/local/cppjieba ``` ## Docker 示例 diff --git a/conf/CMakeLists.txt b/conf/CMakeLists.txt index e344c76..be5588c 100644 --- a/conf/CMakeLists.txt +++ b/conf/CMakeLists.txt @@ -1 +1 @@ -INSTALL(FILES server.conf DESTINATION /etc/CppJieba) +INSTALL(FILES server.conf DESTINATION conf) diff --git a/conf/server.conf b/conf/server.conf index 70902f2..fdefdab 100644 --- a/conf/server.conf +++ b/conf/server.conf @@ -7,10 +7,14 @@ thread_number=4 queue_max_size=4096 #dict path -dict_path=/usr/share/CppJieba/dict/jieba.dict.utf8 +dict_path=/usr/local/cppjieba/dict/jieba.dict.utf8 #model path -model_path=/usr/share/CppJieba/dict/hmm_model.utf8 +model_path=/usr/local/cppjieba/dict/hmm_model.utf8 #user_dict_path -#user_dict_path=/usr/share/CppJieba/dict/user.dict.utf8 +user_dict_path=/usr/local/cppjieba/dict/user.dict.utf8 + +idf_path=/usr/local/cppjieba/dict/idf.utf8 + +stop_words_path=/usr/local/cppjieba/dict/stop_words.utf8 diff --git a/conf/server_example.conf b/conf/server_example.conf new file mode 100644 index 0000000..4b1c9f4 --- /dev/null +++ b/conf/server_example.conf @@ -0,0 +1,19 @@ +# config + +#socket listen port +port=11200 + +thread_number=4 +queue_max_size=4096 + +#dict path +dict_path=../dict/jieba.dict.utf8 + +#model path +model_path=../dict/hmm_model.utf8 + +user_dict_path=../dict/user.dict.utf8 + +idf_path=../dict/idf.utf8 + +stop_words_path=../dict/stop_words.utf8 diff --git a/dict/CMakeLists.txt b/dict/CMakeLists.txt index 88eb2fb..2310959 100644 --- a/dict/CMakeLists.txt +++ b/dict/CMakeLists.txt @@ -1 +1,9
@@ -INSTALL(FILES hmm_model.utf8 jieba.dict.utf8 user.dict.utf8 DESTINATION share/CppJieba/dict) +INSTALL(FILES + hmm_model.utf8 + jieba.dict.utf8 + user.dict.utf8 + idf.utf8 + stop_words.utf8 + DESTINATION + dict +) diff --git a/script/CMakeLists.txt b/script/CMakeLists.txt index cfbc28e..33d3525 100644 --- a/script/CMakeLists.txt +++ b/script/CMakeLists.txt @@ -1 +1,6 @@ -INSTALL(PROGRAMS cjserver.start cjserver.stop DESTINATION /etc/init.d/) +INSTALL(PROGRAMS + cjserver.start + cjserver.stop + DESTINATION + script +) diff --git a/script/cjserver.start b/script/cjserver.start index 6d3fa50..f480a72 100755 --- a/script/cjserver.start +++ b/script/cjserver.start @@ -7,6 +7,6 @@ if [ ! -z "${PID}" ] then echo "please stop cjserver first." else - cjserver /etc/CppJieba/server.conf & + /usr/local/cppjieba/bin/cjserver /usr/local/cppjieba/conf/server.conf >> /dev/null 2>&1 & echo "service started." fi diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 3069a93..35d213e 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -5,5 +5,4 @@ INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src) ADD_EXECUTABLE(cjserver server.cpp) TARGET_LINK_LIBRARIES(cjserver pthread) -INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) - +INSTALL(TARGETS cjserver DESTINATION bin) diff --git a/server/server.cpp b/server/server.cpp index 7a2f372..f56e7c5 100644 --- a/server/server.cpp +++ b/server/server.cpp @@ -5,40 +5,65 @@ #include #include "Limonp/Config.hpp" #include "Husky/ThreadPoolServer.hpp" -#include "MixSegment.hpp" -#include "QuerySegment.hpp" -#include "FullSegment.hpp" +#include "Application.hpp" using namespace Husky; using namespace CppJieba; class ReqHandler: public IRequestHandler { public: - ReqHandler(const ISegment& segment): _segment(segment) { + ReqHandler(const CppJieba::Application& app): app_(app) { + } + virtual ~ReqHandler() { } - virtual ~ReqHandler() {}; virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) const { - string sentence,
tmp; + string sentence, method, format; + string tmp; - vector words; httpReq.GET("key", tmp); URLDecode(tmp, sentence); - _segment.cut(sentence, words); - if(httpReq.GET("format", tmp) && tmp == "simple") { - join(words.begin(), words.end(), strSnd, " "); - return true; - } - strSnd << words; + httpReq.GET("method", method); + httpReq.GET("format", format); + // run() performs the one and only segmentation pass and fills strSnd. + run(sentence, method, format, strSnd); return true; } + virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const { - vector words; - _segment.cut(httpReq.getBody(), words); - strSnd << words; + // POST bodies are always cut with MIX and answered in "simple" format. + run(httpReq.getBody(), "MIX", "simple", strSnd); return true; } + + // Cuts sentence with the algorithm named by method ("MP", "HMM", "MIX", + // "FULL" or "QUERY"; anything else falls back to MIX) and writes the + // words to strSnd: space separated when format is "simple", else via <<. + void run(const string& sentence, + const string& method, + const string& format, + string& strSnd) const { + vector words; + if ("MP" == method) { + app_.cut(sentence, words, CppJieba::METHOD_MP); + } else if ("HMM" == method) { + app_.cut(sentence, words, CppJieba::METHOD_HMM); + } else if ("MIX" == method) { + app_.cut(sentence, words, CppJieba::METHOD_MIX); + } else if ("FULL" == method) { + app_.cut(sentence, words, CppJieba::METHOD_FULL); + } else if ("QUERY" == method) { + app_.cut(sentence, words, CppJieba::METHOD_QUERY); + } else { // default + app_.cut(sentence, words, CppJieba::METHOD_MIX); + } + if(format == "simple") { + join(words.begin(), words.end(), strSnd, " "); + } else { + strSnd << words; + } + } private: - const ISegment& _segment; + const CppJieba::Application& app_; }; bool run(int argc, char** argv) { @@ -49,35 +74,26 @@ bool run(int argc, char** argv) { if(!conf) { return false; } - int port = 0; - int threadNumber = 0; - int queueMaxSize = 0; - string dictPath; - string modelPath; - string userDictPath; - LIMONP_CHECK(conf.get("port", port)); - LIMONP_CHECK(conf.get("thread_number", threadNumber)); - LIMONP_CHECK(conf.get("queue_max_size", queueMaxSize)); - LIMONP_CHECK(conf.get("dict_path", dictPath)); - LIMONP_CHECK(conf.get("model_path", modelPath)); -
if(!conf.get("user_dict_path", userDictPath)) { //optional - userDictPath = ""; - } + int port = conf.get("port", 1339); + int threadNumber = conf.get("thread_number", 4); + int queueMaxSize = conf.get("queue_max_size", 1024); + string dictPath = conf.get("dict_path", ""); + string modelPath = conf.get("model_path", ""); + string userDictPath = conf.get("user_dict_path", ""); + string idfPath = conf.get("idf_path", ""); + string stopWordsPath = conf.get("stop_words_path", ""); LogInfo("config info: %s", conf.getConfigInfo().c_str()); - - /* - * segment can be one of (MPSegment, HMMSegment, MixSegment, QuerySegment ...) - */ - //MPSegment segment(dictPath, userDictPath); - //HMMSegment segment(modelPath); - MixSegment segment(dictPath, modelPath, userDictPath); - //QuerySegment segment(dictPath, modelPath); - ReqHandler reqHandler(segment); + CppJieba::Application app(dictPath, + modelPath, + userDictPath, + idfPath, + stopWordsPath); + + ReqHandler reqHandler(app); ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler); return sf.start(); - } int main(int argc, char* argv[]) { diff --git a/src/Application.hpp b/src/Application.hpp index 205b8ec..c51a62a 100644 --- a/src/Application.hpp +++ b/src/Application.hpp @@ -17,9 +17,13 @@ enum CutMethod { class Application { public: - Application(const string& dictDir) - : dictTrie_(pathJoin(dictDir, "jieba.dict.utf8")), - model_(pathJoin(dictDir, "hmm_model.utf8")), + Application(const string& dictPath, + const string& modelPath, + const string& userDictPath, + const string& idfPath, + const string& stopWordsPath) + : dictTrie_(dictPath, userDictPath), + model_(modelPath), mpSeg_(&dictTrie_), hmmSeg_(&model_), mixSeg_(&dictTrie_, &model_), @@ -28,8 +32,8 @@ class Application { tagger_(&dictTrie_, &model_), extractor_(&dictTrie_, &model_, - pathJoin(dictDir, "idf.utf8"), - pathJoin(dictDir, "stop_words.utf8")) { + idfPath, + stopWordsPath) { } void cut(const string& sentence, vector& words, CutMethod method) 
const { diff --git a/src/Limonp/Config.hpp b/src/Limonp/Config.hpp index e49bca6..c9e2088 100644 --- a/src/Limonp/Config.hpp +++ b/src/Limonp/Config.hpp @@ -55,21 +55,19 @@ class Config { ifs.close(); } public: - bool get(const string& key, string& value) const { + string get(const string& key, const string& defaultvalue) const { map::const_iterator it = map_.find(key); if(map_.end() != it) { - value = it->second; - return true; + return it->second; } - return false; + return defaultvalue; } - bool get(const string& key, int & value) const { - string str; - if(!get(key, str)) { - return false; + int get(const string& key, int defaultvalue) const { + string str = get(key, ""); + if("" == str) { + return defaultvalue; } - value = atoi(str.c_str()); - return true; + return atoi(str.c_str()); } const char* operator [] (const char* key) const { if(NULL == key) { diff --git a/test/demo.cpp b/test/demo.cpp index 3e2fe64..bbdea3f 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -16,7 +16,11 @@ void LoadSentences(const string& filepath, vector& sentences) { } int main(int argc, char** argv) { - CppJieba::Application app("../dict/"); + CppJieba::Application app("../dict/jieba.dict.utf8", + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8", + "../dict/idf.utf8", + "../dict/stop_words.utf8"); vector words; string result; string s; diff --git a/test/testdata/server.conf b/test/testdata/server.conf index 01321db..4b1c9f4 100644 --- a/test/testdata/server.conf +++ b/test/testdata/server.conf @@ -11,3 +11,9 @@ dict_path=../dict/jieba.dict.utf8 #model path model_path=../dict/hmm_model.utf8 + +user_dict_path=../dict/user.dict.utf8 + +idf_path=../dict/idf.utf8 + +stop_words_path=../dict/stop_words.utf8 diff --git a/test/unittest/TApplication.cpp b/test/unittest/TApplication.cpp index ab1646a..00f1bcf 100644 --- a/test/unittest/TApplication.cpp +++ b/test/unittest/TApplication.cpp @@ -4,7 +4,11 @@ using namespace CppJieba; TEST(ApplicationTest, Test1) { - Application 
app("../dict/"); + CppJieba::Application app("../dict/jieba.dict.utf8", + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8", + "../dict/idf.utf8", + "../dict/stop_words.utf8"); vector words; string result;