diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..8983dc2 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,2 @@ +PROJECT(CPPJIEBA) +ADD_SUBDIRECTORY(src) diff --git a/README.md b/README.md index 4a8bd9d..0cef295 100644 --- a/README.md +++ b/README.md @@ -7,41 +7,63 @@ - `master`分支支持`utf8`编码 - `gbk`分支支持`gbk`编码 +## 安装与使用 -## 模块详解 +### 下载和安装 -### Trie树 -Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。 +```sh +wget https://github.com/aszxqw/cppjieba/archive/master.zip -O cppjieba-master.zip +unzip cppjieba-master.zip +cd cppjieba-master +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=/usr .. +make +sudo make install +``` -### Segment模块 +### 卸载 +```sh +cd build/ +cat install_manifest.txt | sudo xargs rm -rf +``` -MPSegment.cpp/MPSegment.h -(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。 +### 验证 -HMMSegment.cpp/HMMSegment.h -是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。 -HMM模型由dicts/下面的`hmm_model.utf8`提供。 -分词算法即viterbi算法。 +```sh +cd test/ +g++ -o segment.demo segment.cpp -L/usr/lib/CppJieba/ -lcppjieba +./segment.demo # you will see the demo. +``` +运行一下 `./server` 或 `./segment` 都会有对应的帮助文档显示。 -### TransCode模块 +同时,如果想知道开发时如何使用`libcppjieba.a` 请看`test/segment.cpp`源代码即可。 -TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。 +如果想知道如何搭建一个`cppjieba`中文分词的http服务请见 `test/server.cpp`源代码即可。 +若还有其他问题,欢迎`send mail`或者`open issue`。 :) +### 搭建服务 -## Demo +``` +cd ./test +g++ -o server server.cpp -L/usr/lib/CppJieba/ -L/usr/lib/CppJieba/Husky -lcppjieba -lhusky -lpthread +./server -n 4 -p 11258 -k start >> run.log 2>&1 #启动服务,监听11258这个端口。 +./server -n 4 -p 11258 -k stop #停止服务 +``` + +#### 验证服务 + +然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学` +(用chrome的原因是chrome的默认编码就是utf-8) + +或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"` (ubuntu中的curl安装命令`sudo apt-get install curl`) + +## 分词效果 ### MPSegment's demo -__这部分的功能经过线上考验,一直稳定运行,暂时没有发现什么bug。__ - -``` -cd ./demo; -make; -./segment_demo testlines.utf8 -``` - Output: ``` 我来到北京清华大学 @@ -59,12 +81,6 @@ Output: ### HMMSegment's demo -``` -cd ./demo; -make; -./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM -``` - Output: ``` 我来到北京清华大学 @@ -78,11 +94,6 @@ Output: ``` ### MixSegment's demo -``` -cd ./demo; -make; -./segment_demo testlines.utf8 --algorithm cutMix -``` Output: ``` @@ -98,62 +109,51 @@ Output: 我/来自/北京邮电大学/。。。/学号/091111xx/。。。 ``` -### Server's demo - -引入了husky这个文件夹,husky是一个简单的http服务框架。 -``` -cd ./demo; -make; -./start.sh #启动一个服务,监听11258这个端口(在start.sh里面指定)。 -``` - -关闭和重启分别是`stop.sh`和`restart.sh` - -然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学` -(用chrome的原因是chrome的默认编码就是utf-8) - -或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"` - - ### 效果分析 以上依次是MP,HMM,Mix三种方法的效果。 + 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。即可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。 -## Help + + +## 模块详解 本项目主要是如下目录组成: -### Limonp +### src + +核心目录,包含主要源代码。 + +#### Trie树 +Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。 + +#### Segment模块 + +MPSegment.cpp/MPSegment.h +(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。 + +HMMSegment.cpp/HMMSegment.h +是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。 +HMM模型由dicts/下面的`hmm_model.utf8`提供。 +分词算法即viterbi算法。 + +#### TransCode模块 + +TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。 + +### src/Husky + +提供服务的框架代码, + +详见: https://github.com/aszxqw/husky + +### src/Limonp 主要是一些工具函数,例如字符串操作等。 直接include就可以使用。 -### cppjieba -核心目录,包含主要源代码。 -make 之后产生libcppjieb.a -使用方法参考如上cppcommon - - - -### run `./segment_demo` to get help. - -如下: -``` -usage: - ./segment_demo[options] -options: - --algorithm Supported methods are [cutDAG, cutHMM, cutMix] for now. - If not specified, the default is cutDAG - --dictpath If not specified, the default is ../dicts/jieba.dict.utf8 - --modelpath If not specified, the default is ../dicts/hmm_model.utf8 - If not specified, the default is utf8. -example: - ./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 - ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM - ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix - -``` +详见: https://github.com/aszxqw/limonp ## 分词速度 @@ -163,11 +163,11 @@ example: 测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz` -## Contact +## 联系客服 如果有运行问题或者任何疑问,欢迎联系 : wuyanyi09@gmail.com -## Thanks +## 鸣谢 "结巴中文"分词作者: SunJunyi https://github.com/fxsjy/jieba diff --git a/cppjieba/KeyWordExt.cpp b/cppjieba/KeyWordExt.cpp deleted file mode 100644 index c0ab565..0000000 --- a/cppjieba/KeyWordExt.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#include "KeyWordExt.h" - - -namespace CppJieba -{ - - KeyWordExt::KeyWordExt() - { - } - - KeyWordExt::~KeyWordExt() - { - } - - bool KeyWordExt::init(const char* const segDictFile) - { - LogInfo("KeyWordExt init start ..."); - if(!_segment.init(segDictFile)) - { - LogError("_segment.init failed."); - return false; - } - return true; - } - - bool KeyWordExt::loadStopWords(const char * const filePath) - { - - LogInfo("_loadStopWords(%s) start", filePath); - if(!_stopWords.empty()) - { - LogError("_stopWords has been loaded before! "); - return false; - } - if(!checkFileExist(filePath)) - { - LogError("cann't find file[%s].",filePath); - return false; - } - - ifstream ifile(filePath); - string line; - Unicode word; - while(getline(ifile, line)) - { - if(!TransCode::decode(line, word)) - { - LogError("decode failed ."); - return false; - } - _stopWords.insert(word); - } - LogInfo("load stopwords[%d] finished.", _stopWords.size()); - - return true; - } - - bool KeyWordExt::dispose() - { - _segment.dispose(); - return true; - } - - bool KeyWordExt::_wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b) - { - return a.weight > b.weight; - } - - bool KeyWordExt::_sortWLIDF(vector& wordInfos) - { - for(uint i = 0; i < wordInfos.size(); i++) - { - KeyWordInfo& wInfo = wordInfos[i]; - wInfo.idf = - wInfo.logFreq; - wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf; - } - sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); - return true; - } - - bool KeyWordExt::_extTopN(vector& wordInfos, uint topN) - { - int dis = wordInfos.size() - topN; - if(dis <= 0) - { - return true; - } - - if(uint(dis) <= topN) - { - for(int i = 0; i< dis; i++) - { - wordInfos.pop_back(); - } - } - else// in case that topN << size; - { - - vector tmp(wordInfos.begin(), wordInfos.begin() + topN); - wordInfos.swap(tmp); - } - return true; - } - - - bool KeyWordExt::extract(const vector& words, vector& keyWordInfos, uint topN) - { - if(words.empty()) - { - return false; - } - - keyWordInfos.clear(); - for(uint i = 0; i < words.size(); i++) - { - Unicode uniWord; - if(!TransCode::decode(words[i], uniWord)) - { - LogError("decode failed"); - return false; - } - keyWordInfos.push_back(uniWord); - } - - return _extract(keyWordInfos, topN); - } - - bool KeyWordExt::extract(const string& title, vector& keyWordInfos, uint topN) - { - if(title.empty()) - { - return false; - } - - vector trieNodeInfos; - Unicode unico; - if(!TransCode::decode(title, unico)) - { - return false; - } - _segment.cut(unico.begin(), unico.end(), trieNodeInfos); - - keyWordInfos.clear(); - for(uint i = 0; i < trieNodeInfos.size(); i++) - { - keyWordInfos.push_back(trieNodeInfos[i]); - } - return _extract(keyWordInfos, topN); - } - - bool KeyWordExt::_extract(vector& keyWordInfos, uint topN) - { - if(!_filter(keyWordInfos)) - { - LogError("_filter failed."); - return false; - } - - if(!_sortWLIDF(keyWordInfos)) - { - LogError("_sortWLIDF failed."); - return false; - } - - if(!_extTopN(keyWordInfos, topN)) - { - LogError("_extTopN failed."); - return false; - } - - return true; - } - - bool KeyWordExt::_filter(vector& wordInfos) - { - if(!_filterDuplicate(wordInfos)) - { - LogError("_filterDuplicate failed."); - return false; - } - - if(!_filterSingleWord(wordInfos)) - { - LogError("_filterSingleWord failed."); - return false; - } - - if(!_filterStopWords(wordInfos)) - { - LogError("_filterStopWords failed."); - return false; - } - - if(!_filterSubstr(wordInfos)) - { - LogError("_filterSubstr failed."); - return false; - } - - return true; - } - - bool KeyWordExt::_filterStopWords(vector& wordInfos) - { - if(_stopWords.empty()) - { - return true; - } - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) - { - if(_stopWords.find(it->word) != _stopWords.end()) - { - it = wordInfos.erase(it); - } - else - { - it ++; - } - } - return true; - } - - - bool KeyWordExt::_filterDuplicate(vector& wordInfos) - { - set st; - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - { - if(st.find(it->word) != st.end()) - { - it = wordInfos.erase(it); - } - else - { - st.insert(it->word); - it++; - } - } - return true; - } - - bool KeyWordExt::_filterSingleWord(vector& wordInfos) - { - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) - { - - // filter single word - if(1 == it->word.size()) - { - it = wordInfos.erase(it); - } - else - { - it++; - } - } - return true; - } - - bool KeyWordExt::_filterSubstr(vector& wordInfos) - { - vector tmp ; - for(uint i = 0; i < wordInfos.size(); i++) - { - tmp.push_back(wordInfos[i].word); - } - - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - { - if(_isSubIn(tmp, it->word)) - { - it = wordInfos.erase(it); - } - else - { - it++; - } - } - - return true; - } - - //bool KeyWordExt::_isContainSubWords(const string& word) - //{ - // for(uint i = 0; i < _priorSubWords.size(); i++) - // { - // if(string::npos != word.find(_priorSubWords[i])) - // { - // return true; - // } - // } - // return false; - //} - - //bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) - //{ - // if(2 > wordInfos.size()) - // { - // return true; - // } - - // KeyWordInfo prior; - // bool flag = false; - // for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - // { - // if(_isContainSubWords(it->word)) - // { - // prior = *it; - // it = wordInfos.erase(it); - // flag = true; - // break; - // } - // else - // { - // it ++; - // } - // } - // if(flag) - // { - // wordInfos.insert(wordInfos.begin(), prior); - // } - // return true; - //} -} - - -#ifdef KEYWORDEXT_UT - -using namespace CppJieba; - -int main() -{ - KeyWordExt ext; - ext.init(); - if(!ext.loadSegDict("../dicts/segdict.gbk.v2.1")) - { - return 1; - } - ext._loadStopWords("../dicts/stopwords.gbk.v1.0"); - - ifstream ifile("testtitle.gbk"); - vector res; - string line; - while(getline(ifile, line)) - { - cout< -#include "MPSegment.h" -#include "structs.h" - -namespace CppJieba -{ - - class KeyWordExt - { - private: - MPSegment _segment; - //vector _priorSubWords; - set _stopWords; - public: - KeyWordExt(); - ~KeyWordExt(); - bool init(const char* const segDictFile); - bool dispose(); - bool loadStopWords(const char * const filePath); - private: - //bool _loadPriorSubWords(const char * const filePath); - - - public: - bool extract(const string& title, vector& keyWordInfos, uint topN); - bool extract(const vector& words, vector& keyWordInfos, uint topN); - private: - static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); - private: - bool _extract(vector& keyWordInfos, uint topN); - bool _extTopN(vector& wordInfos, uint topN); - private: - //sort by word len - idf - bool _sortWLIDF(vector& wordInfos); - private: - bool _filter(vector& ); - bool _filterDuplicate(vector& ); - bool _filterSingleWord(vector& ); - bool _filterSubstr(vector& ); - bool _filterStopWords(vector& ); - private: - inline bool _isSubIn(const vector& words, const Unicode& word)const - { - - for(uint j = 0; j < words.size(); j++) - { - if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end())) - { - return true; - } - } - return false; - } - //bool _prioritizeSubWords(vector& wordInfos); - //bool _isContainSubWords(const string& word); - - }; - -} - -#endif diff --git a/cppjieba/Makefile b/cppjieba/Makefile deleted file mode 100644 index c4febcf..0000000 --- a/cppjieba/Makefile +++ /dev/null @@ -1,66 +0,0 @@ -CXX := g++ -LD := g++ -AR := ar rc - -DEBUG_CXXFLAGS := -g -Wall -DDEBUG -RELEASE_CXXFLAGS := -Wall -O3 - -ifeq (YES, ${RELEASE}) - CXXFLAGS := ${RELEASE_CXXFLAGS} - LDFLAGS := ${RELEASE_LDFLAGS} -else - CXXFLAGS := ${DEBUG_CXXFLAGS} - LDFLAGS := ${DEBUG_LDFLAGS} -endif - -SOURCES := $(wildcard *.cpp) -OBJS := $(patsubst %.cpp,%.o,$(SOURCES)) - -INC := -I../limonp - -LIBA := libcppjieba.a - -# remove the objs after compilation -.INTERMEDIATE: -#.PHONY: clean $(CMLIB) -.PHONY: clean - -all: $(LIBA) - -# This is a suffix rule -#.c.o: -%.o: %.cpp - $(CXX) -c $(CXXFLAGS) $< $(INC) - - -${LIBA}: $(OBJS) - $(AR) $@ $(OBJS) - -#unit test -Trie.ut: Trie.cpp Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) Trie.cpp TransCode.cpp -DTRIE_UT $(CMLIB) - -MPSegment.ut: MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) MPSegment.cpp Trie.cpp TransCode.cpp -DSEGMENT_UT $(CMLIB) - -KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h MPSegment.h Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) KeyWordExt.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DKEYWORDEXT_UT $(CMLIB) - -TransCode.ut: TransCode.cpp TransCode.hpp globals.h $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) TransCode.cpp -DCPPJIEBA_TRANSCODE_UT $(CMLIB) -HMMSegment.ut: HMMSegment.cpp TransCode.cpp TransCode.hpp HMMSegment.h $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) TransCode.cpp HMMSegment.cpp -DHMMSEGMENT_UT $(CMLIB) -MixSegment.ut: MixSegment.cpp MixSegment.h HMMSegment.cpp MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB) - $(CXX) -o $@ $(CXXFLAGS) MixSegment.cpp HMMSegment.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DMIXSEGMENT_UT $(CMLIB) -ChineseFilter.ut: ChineseFilter.cpp ChineseFilter.hpp - $(CXX) -o $@ $(CXXFLAGS) ChineseFilter.cpp -DUT - -clean: - rm -f *.o *.d *.d.* *.ut $(LIBA) - -sinclude $(SOURCES:.cpp=.d) -%.d:%.cpp - @set -e; rm -f $@; \ - $(CXX) -MM $< > $@.$$$$; \ - sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - rm -f $@.$$$$ diff --git a/demo/Makefile b/demo/Makefile deleted file mode 100644 index a305a01..0000000 --- a/demo/Makefile +++ /dev/null @@ -1,53 +0,0 @@ -CXX := g++ -LD := g++ -AR := ar rc - - -DEBUG_CXXFLAGS := -g -Wall -DDEBUG -RELEASE_CXXFLAGS := -Wall -O3 - -ifeq (YES, ${DEBUG}) - CXXFLAGS := ${DEBUG_CXXFLAGS} - LDFLAGS := ${DEBUG_LDFLAGS} -else - CXXFLAGS := ${RELEASE_CXXFLAGS} - LDFLAGS := ${RELEASE_LDFLAGS} -endif - -INCS := -I../limonp -LINK := -lpthread - -SOURCES := $(wildcard *.cpp) -OBJS := $(patsubst %.cpp,%.o,$(SOURCES)) -DEMOS := $(patsubst %.cpp,%.demo,$(SOURCES)) - -CPPJIEBADIR := ../cppjieba -LIBCPPJIEBA := $(CPPJIEBADIR)/libcppjieba.a - -HUSKYDIR := ../husky -LIBHUSKYA := $(HUSKYDIR)/libhusky.a - -.PHONY: clean $(LIBCPPJIEBA) - -all: $(DEMOS) - -%.demo: %.cpp $(LIBCPPJIEBA) $(LIBHUSKYA) - $(CXX) -o $@ $(CXXFLAGS) $^ $(INCS) $(LINK) - -$(LIBCPPJIEBA): - cd $(CPPJIEBADIR) && $(MAKE) - -$(LIBHUSKYA): - cd $(HUSKYDIR) && $(MAKE) - -clean: - rm -f *.o *.ut *.d *.d.* $(DEMOS) - cd $(CPPJIEBADIR) && make clean - cd $(HUSKYDIR) && make clean - -sinclude $(SOURCES:.cpp=.d) -%.d:%.cpp - @set -e; rm -f $@; \ - $(CXX) -MM $< > $@.$$$$; \ - sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - rm -f $@.$$$$ diff --git a/demo/example.sh b/demo/example.sh deleted file mode 100755 index 466d702..0000000 --- a/demo/example.sh +++ /dev/null @@ -1,3 +0,0 @@ -./segment.demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 -./segment.demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM -./segment.demo testlines.utf8 --algorithm cutMix diff --git a/demo/keywordext.cpp b/demo/keywordext.cpp deleted file mode 100644 index bbdd781..0000000 --- a/demo/keywordext.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include "../cppjieba/KeyWordExt.h" - -using namespace CppJieba; - - -void testKeyWordExt(const char * dictPath, const char * filePath) -{ - KeyWordExt ext; - if(!ext.init(dictPath)) - { - return; - } - - ifstream ifile(filePath); - vector res; - string line; - while(getline(ifile, line)) - { - res.clear(); - if(!line.empty()) - { - ext.extract(line, res, 20); - cout< argc) - { - cout<<"usage: \n\t"<\n" - <<"options:\n" - <<"\t--dictpath\tIf not specified, the default is "< -#include -#include -#include "../cppjieba/MPSegment.h" -#include "../cppjieba/HMMSegment.h" -#include "../cppjieba/MixSegment.h" - -using namespace CppJieba; - -MPSegment seg; -HMMSegment hmmseg; -MixSegment mixseg; -bool init(const char * const dictPath, const char * const modelPath) -{ - if(!seg.init(dictPath)) - { - cout<<"seg init failed."< res; - string line; - while(getline(ifile, line)) - { - if(!line.empty()) - { - res.clear(); - seg->cut(line, res); - cout<\n" - <<"options:\n" - <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" - <<"\t--dictpath\tIf not specified, the default is "< $@.$$$$; \ - sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - rm -f $@.$$$$ diff --git a/limonp/map_functs.hpp b/limonp/map_functs.hpp deleted file mode 100644 index 9479691..0000000 --- a/limonp/map_functs.hpp +++ /dev/null @@ -1,123 +0,0 @@ -/************************************ - * file enc : ascii - * author : wuyanyi09@gmail.com - ************************************/ - - -#ifndef LIMONP_MAP_FUNCTS_H -#define LIMONP_MAP_FUNCTS_H - -#include -#include -#include -#include -#include "typedefs.h" - -namespace Limonp -{ - using namespace std; - - - template - string setToString(const set& st) - { - if(st.empty()) - { - return "{}"; - } - stringstream ss; - ss<<'{'; - typename set::const_iterator it = st.begin(); - ss<<*it; - it++; - while(it != st.end()) - { - ss<<", "<<*it; - it++; - } - ss<<'}'; - return ss.str(); - } - - template - string mapToString(const map& mp) - { - if(mp.empty()) - { - return "{}"; - } - stringstream ss; - ss<<'{'; - typename map::const_iterator it = mp.begin(); - ss<first<<": "<second; - it++; - while(it != mp.end()) - { - ss<<", "<first<<": "<second; - it++; - } - ss<<'}'; - return ss.str(); - } - - template - string HashMapToString(const HashMap& mp) - { - if(mp.empty()) - { - return "{}"; - } - stringstream ss; - ss<<'{'; - typename HashMap::const_iterator it = mp.begin(); - ss<first<<": "<second; - it++; - while(it != mp.end()) - { - ss<<", "<first<<": "<second; - it++; - } - ss<<'}'; - return ss.str(); - } - template - string pairToString(const pair& p) - { - stringstream ss; - ss< - void printMap(const map& mp) - { - for(typename map::const_iterator it = mp.begin(); it != mp.end(); it++) - { - cout<first<<' '<second< - vT getMap(const map& mp, const kT & key, const vT & defaultVal) - { - typename map::const_iterator it; - it = mp.find(key); - if(mp.end() == it) - { - return defaultVal; - } - return it->second; - } - - template - void map2Vec(const map& mp, vector > & res) - { - typename map::const_iterator it = mp.begin(); - for(; it != mp.end(); it++) - { - res.push_back(*it); - } - } -} - -#endif diff --git a/limonp/typedefs.h b/limonp/typedefs.h deleted file mode 100644 index a8da002..0000000 --- a/limonp/typedefs.h +++ /dev/null @@ -1,21 +0,0 @@ -/************************************ - * file enc : utf8 - * author : wuyanyi09@gmail.com -************************************/ -#ifndef LIMONP_TYPEDEFS_H -#define LIMONP_TYPEDEFS_H - -#include -#include -#include -#include -#define HashMap std::tr1::unordered_map - -namespace Limonp -{ - typedef std::vector Unicode; - typedef std::vector::const_iterator UnicodeConstIterator; -} - - -#endif diff --git a/limonp/vec_functs.hpp b/limonp/vec_functs.hpp deleted file mode 100644 index ac18548..0000000 --- a/limonp/vec_functs.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/************************************ - * file enc : ascii - * author : wuyanyi09@gmail.com -************************************/ -#ifndef LIMONP_VEC_FUNCTS_H -#define LIMONP_VEC_FUNCTS_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define FOR_VECTOR(vec, i) for(size_t i = 0; i < vec.size(); i++) - -#define PRINT_VECTOR(vec) FOR_VECTOR(vec, i)\ -{\ - cout< - bool vecToString(const vector& vec, string& res) - { - if(vec.empty()) - { - res = "[]"; - return false; - } - stringstream ss; - ss<<"[\""< - string vecToString(const vector& vec) - { - string res; - vecToString(vec, res); - return res; - } - - template - bool isInVec(const vector& vec, const T& item) - { - typename vector::const_iterator it = find(vec.begin(), vec.end(), item); - return it != vec.end(); - } - template - void splitVec(const vector& vecSrc, vector< pair > >& outVec, const vector& patterns) - { - vector tmp; - T pattern; - size_t patternSize = patterns.size(); - for(size_t i = 0; i < vecSrc.size(); i++) - { - size_t patternPos = patternSize; - for(size_t j = 0; j < patternSize; j++) - { - if(patterns[j] == vecSrc[i]) - { - patternPos = j; - break; - } - } - if(patternPos != patternSize) - { - if(!tmp.empty()) - { - outVec.push_back(make_pair >(pattern, tmp)); - tmp.clear(); - } - pattern = patterns[patternPos]; - } - else - { - tmp.push_back(vecSrc[i]); - } - } - if(!tmp.empty()) - { - outVec.push_back(make_pair >(pattern, tmp)); - } - } - - template - void splitVec(const vector& vecSrc, vector< vector >& outVec, const vector& patternVec) - { - vector tmp; - for(size_t i = 0; i < vecSrc.size(); i++) - { - bool flag = false; - for(size_t j = 0; j < patternVec.size(); j++) - { - if(patternVec[j] == vecSrc[i]) - { - flag = true; - break; - } - } - if(flag) - { - if(!tmp.empty()) - { - outVec.push_back(tmp); - tmp.clear(); - } - } - else - { - tmp.push_back(vecSrc[i]); - } - } - if(!tmp.empty()) - { - outVec.push_back(tmp); - } - } -} - -#endif diff --git a/scripts/add_header.sh b/scripts/add_header.sh deleted file mode 100755 index e27b7a8..0000000 --- a/scripts/add_header.sh +++ /dev/null @@ -1 +0,0 @@ -sed -i '1i/************************************\n * file enc : utf8\n * author : wuyanyi09@gmail.com\n************************************/' ../src/*.h ../src/*.cpp ../src/*.tcc diff --git a/scripts/check_dict.py b/scripts/check_dict.py deleted file mode 100755 index ac528f7..0000000 --- a/scripts/check_dict.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/python - -import sys - -if len(sys.argv) == 1: - print "usage : %s dict_file1 dict_file2 ..." - exit(1) - -d = {} - -for fname in sys.argv[1:]: - with open(fname, "r") as fin: - for i, line in enumerate(fin): - try: - word, cnt, tag = line.strip().split(" ") - if word in d: - print "error file[%s] line[%s] : %s" %(fname, i, line) - exit(1) - else: - d[word] = True - - if 0 >= int(cnt) : - print "error file[%s] line[%s] : %s" %(fname, i, line) - exit(1) - except Exception as err: - print "error file[%s] line[%s] : %s" %(fname, i, line) - exit(1) - -print "OK" diff --git a/scripts/filter_dict.py b/scripts/filter_dict.py deleted file mode 100755 index 8cb247a..0000000 --- a/scripts/filter_dict.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python - -import sys - -if len(sys.argv) == 1: - print "usage : %s dict_file_path" - exit(1) - -d = {} -with open(sys.argv[1], "r") as fin: - for i, line in enumerate(fin): - word, cnt, tag = line.strip().split(" ") - if word in d: - #print "error file[%s] line[%s] : %s" %(fname, i, line) - #exit(1) - continue - else: - d[word] = True - if 0 >= int(cnt) : - continue - - print line.strip() - diff --git a/scripts/iconv_dict.py b/scripts/iconv_dict.py deleted file mode 100755 index 483ecec..0000000 --- a/scripts/iconv_dict.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/python - -import sys - -if len(sys.argv) != 4: - print "usage : %s from_enc to_enc dict_file_path \nexample: %s gbk utf-8 fname" %(__file__, __file__) - exit(1) - -with open(sys.argv[3], "r") as fin: - for i, line in enumerate(fin): - try: - print line.strip().decode(sys.argv[1]).encode(sys.argv[2]) - except Exception as err: - print >> sys.stderr, err - diff --git a/demo/restart.sh b/scripts/restart.sh similarity index 100% rename from demo/restart.sh rename to scripts/restart.sh diff --git a/demo/start.sh b/scripts/start.sh similarity index 100% rename from demo/start.sh rename to scripts/start.sh diff --git a/demo/stop.sh b/scripts/stop.sh similarity index 100% rename from demo/stop.sh rename to scripts/stop.sh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..dc867e2 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,23 @@ +SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) +SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) + +SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp) +ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC}) +ADD_EXECUTABLE(segment segment.cpp) +ADD_EXECUTABLE(server server.cpp) + +LINK_DIRECTORIES(husky) + +TARGET_LINK_LIBRARIES(segment cppjieba) +TARGET_LINK_LIBRARIES(server cppjieba husky pthread) + +SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1) + +INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba) +INSTALL(TARGETS segment RUNTIME DESTINATION bin/CppJieba) +INSTALL(TARGETS server RUNTIME DESTINATION bin/CppJieba) +INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) + + +ADD_SUBDIRECTORY(Husky) +ADD_SUBDIRECTORY(Limonp) diff --git a/cppjieba/ChineseFilter.hpp b/src/ChineseFilter.hpp similarity index 100% rename from cppjieba/ChineseFilter.hpp rename to src/ChineseFilter.hpp diff --git a/cppjieba/HMMSegment.cpp b/src/HMMSegment.cpp similarity index 100% rename from cppjieba/HMMSegment.cpp rename to src/HMMSegment.cpp diff --git a/cppjieba/HMMSegment.h b/src/HMMSegment.h similarity index 96% rename from cppjieba/HMMSegment.h rename to src/HMMSegment.h index 3691a37..a71081e 100644 --- a/cppjieba/HMMSegment.h +++ b/src/HMMSegment.h @@ -4,8 +4,8 @@ #include #include #include -#include -#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" #include "globals.h" #include "TransCode.hpp" #include "ISegment.hpp" diff --git a/src/Husky/CMakeLists.txt b/src/Husky/CMakeLists.txt new file mode 100644 index 0000000..281fd79 --- /dev/null +++ b/src/Husky/CMakeLists.txt @@ -0,0 +1,8 @@ +SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) +SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) + +SET(LIBHUSKY_SRC Daemon.cpp ServerFrame.cpp) +ADD_LIBRARY(husky STATIC ${LIBHUSKY_SRC}) + +INSTALL(TARGETS husky ARCHIVE DESTINATION lib/CppJieba/Husky) +INSTALL(FILES Daemon.h globals.h HttpReqInfo.hpp ServerFrame.h ThreadManager.hpp DESTINATION include/CppJieba/Husky) diff --git a/husky/Daemon.cpp b/src/Husky/Daemon.cpp similarity index 100% rename from husky/Daemon.cpp rename to src/Husky/Daemon.cpp diff --git a/husky/Daemon.h b/src/Husky/Daemon.h similarity index 96% rename from husky/Daemon.h rename to src/Husky/Daemon.h index 532765a..61506a5 100644 --- a/husky/Daemon.h +++ b/src/Husky/Daemon.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include "../Limonp/logger.hpp" #include "ServerFrame.h" namespace Husky diff --git a/husky/HttpReqInfo.hpp b/src/Husky/HttpReqInfo.hpp similarity index 91% rename from husky/HttpReqInfo.hpp rename to src/Husky/HttpReqInfo.hpp index 68bae8d..9b735a2 100644 --- a/husky/HttpReqInfo.hpp +++ b/src/Husky/HttpReqInfo.hpp @@ -3,14 +3,14 @@ #include #include +#include "../Limonp/logger.hpp" +#include "../Limonp/str_functs.hpp" #include "globals.h" -#include -#include -#include namespace Husky { using namespace Limonp; + using namespace std; static const char* const KEY_METHOD = "METHOD"; static const char* const KEY_PATH = "PATH"; @@ -130,7 +130,8 @@ namespace Husky LogFatal("headerStr illegal."); return false; } - _headerMap[upperStr(k)] = v; + upper(k); + _headerMap[k] = v; lpos = rpos + 1; } //message header end @@ -160,6 +161,8 @@ namespace Husky HashMap _headerMap; HashMap _methodGetMap; HashMap _methodPostMap; + //public: + friend ostream& operator<<(ostream& os, const HttpReqInfo& obj); private: bool _find(const HashMap& mp, const string& key, string& res)const { @@ -171,19 +174,6 @@ namespace Husky res = it->second; return true; } - public: - //string toString() const;// function for debug because of heavy time consuming - string toString() const - { - string res("{"); - res += HashMapToString(_headerMap); - res += ","; - res += HashMapToString(_methodGetMap); - res += ","; - res += HashMapToString(_methodPostMap); - res += "}"; - return res; - } private: bool _parseUrl(const string& url, HashMap& mp) { @@ -226,6 +216,11 @@ namespace Husky } }; + inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj) + { + return os << obj._headerMap << obj._methodGetMap << obj._methodPostMap; + } + } #endif diff --git a/husky/ServerFrame.cpp b/src/Husky/ServerFrame.cpp similarity index 96% rename from husky/ServerFrame.cpp rename to src/Husky/ServerFrame.cpp index 9903fb7..aa3eab9 100644 --- a/husky/ServerFrame.cpp +++ b/src/Husky/ServerFrame.cpp @@ -134,13 +134,13 @@ namespace Husky nRetCode = recv(hClientSock, chRecvBuf, RECV_BUFFER, 0); strRec = chRecvBuf; -#ifdef DEBUG - LogDebug("response[%s]", strRec.c_str()); +#ifdef HUKSY_DEBUG + LogDebug("request[%s]", strRec.c_str()); #endif if(SOCKET_ERROR==nRetCode) { - LogError("error [%s]", strerror(errno)); + LogDebug("error [%s]", strerror(errno)); closesocket(hClientSock); continue; } @@ -160,15 +160,15 @@ namespace Husky strHttpResp=chHttpHeader; strHttpResp+=strSnd; +#ifdef HUKSY_DEBUG + LogDebug("response'body [%s]", strSnd.c_str()); +#endif if (SOCKET_ERROR==send(hClientSock,strHttpResp.c_str(),strHttpResp.length(),0)) { LogError("error [%s]", strerror(errno)); } -#ifdef DEBUG - LogDebug("send response [%s] ", strHttpResp.c_str()); -#endif closesocket(hClientSock); } diff --git a/husky/ServerFrame.h b/src/Husky/ServerFrame.h similarity index 100% rename from husky/ServerFrame.h rename to src/Husky/ServerFrame.h diff --git a/husky/ThreadManager.hpp b/src/Husky/ThreadManager.hpp similarity index 100% rename from husky/ThreadManager.hpp rename to src/Husky/ThreadManager.hpp diff --git a/husky/globals.h b/src/Husky/globals.h similarity index 100% rename from husky/globals.h rename to src/Husky/globals.h diff --git a/cppjieba/ISegment.hpp b/src/ISegment.hpp similarity index 100% rename from cppjieba/ISegment.hpp rename to src/ISegment.hpp diff --git a/limonp/ArgvContext.hpp b/src/Limonp/ArgvContext.hpp similarity index 85% rename from limonp/ArgvContext.hpp rename to src/Limonp/ArgvContext.hpp index f4aeef2..8be15f1 100644 --- a/limonp/ArgvContext.hpp +++ b/src/Limonp/ArgvContext.hpp @@ -10,7 +10,6 @@ #include #include "str_functs.hpp" #include "map_functs.hpp" -#include "vec_functs.hpp" namespace Limonp { @@ -43,12 +42,7 @@ namespace Limonp } ~ArgvContext(){}; public: - string toString() - { - stringstream ss; - ss<(_args)<(_mpss)<(_sset); - return ss.str(); - } + friend ostream& operator << (ostream& os, const ArgvContext& args); string operator [](uint i) { if(i < _args.size()) @@ -81,6 +75,16 @@ namespace Limonp set _sset; }; + + inline ostream& operator << (ostream& os, const ArgvContext& args) + { + return os< #include #include "logger.hpp" -#include "vec_functs.hpp" namespace Limonp { @@ -21,8 +20,9 @@ namespace Limonp const char * const USER; const char * const PASSWD; const char * const DB; + const char * const CHARSET; public: - MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db){ _conn = NULL;}; + MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db, const char* charset = "utf8"): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db), CHARSET(charset){ _conn = NULL;}; ~MysqlClient(){dispose();}; public: bool init() @@ -42,10 +42,17 @@ namespace Limonp return false; } + if(mysql_set_character_set(_conn, CHARSET)) + { + LogError("mysql_set_character_set [%s] failed.", CHARSET); + return false; + } + //set reconenct char value = 1; mysql_options(_conn, MYSQL_OPT_RECONNECT, &value); + LogInfo("MysqlClient {host: %s, port:%d, database:%s, charset:%s}", HOST, PORT, DB, CHARSET); return true; } bool dispose() @@ -71,6 +78,18 @@ namespace Limonp } return true; } + uint insert(const char* tb_name, const char* keys, const vector& vals) + { + uint retn = 0; + string sql; + for(uint i = 0; i < vals.size(); i ++) + { + sql.clear(); + string_format(sql, "insert into %s (%s) values %s", tb_name, keys, vals[i].c_str()); + retn += executeSql(sql.c_str()); + } + return retn; + } bool select(const char* sql, RowsType& rows) { if(!executeSql(sql)) diff --git a/limonp/cast_functs.hpp b/src/Limonp/cast_functs.hpp similarity index 100% rename from limonp/cast_functs.hpp rename to src/Limonp/cast_functs.hpp diff --git a/limonp/config.hpp b/src/Limonp/config.hpp similarity index 100% rename from limonp/config.hpp rename to src/Limonp/config.hpp diff --git a/limonp/io_functs.hpp b/src/Limonp/io_functs.hpp similarity index 100% rename from limonp/io_functs.hpp rename to src/Limonp/io_functs.hpp diff --git a/limonp/logger.hpp b/src/Limonp/logger.hpp similarity index 80% rename from limonp/logger.hpp rename to src/Limonp/logger.hpp index 5ccfdec..763f26d 100644 --- a/limonp/logger.hpp +++ b/src/Limonp/logger.hpp @@ -13,13 +13,14 @@ #include #include "io_functs.hpp" #include "str_functs.hpp" -#include "typedefs.h" -#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, __FILE__, __LINE__, fmt, ## __VA_ARGS__) -#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, __FILE__, __LINE__, fmt, ## __VA_ARGS__) -#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, __FILE__, __LINE__, fmt, ## __VA_ARGS__) -#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, __FILE__, __LINE__, fmt, ## __VA_ARGS__) -#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, __FILE__, __LINE__, fmt, ## __VA_ARGS__) +#define FILE_BASENAME strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__ + +#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) +#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) +#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) +#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) +#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) namespace Limonp diff --git a/limonp/macro_def.hpp b/src/Limonp/macro_def.hpp similarity index 100% rename from limonp/macro_def.hpp rename to src/Limonp/macro_def.hpp diff --git a/src/Limonp/map_functs.hpp b/src/Limonp/map_functs.hpp new file mode 100644 index 0000000..44d472c --- /dev/null +++ b/src/Limonp/map_functs.hpp @@ -0,0 +1,116 @@ +/************************************ + * file enc : ascii + * author : wuyanyi09@gmail.com + ************************************/ + + +#ifndef LIMONP_MAP_FUNCTS_H +#define LIMONP_MAP_FUNCTS_H + +#include +#include +#include +#include + +#include +#define HashMap std::tr1::unordered_map + +namespace Limonp +{ + using namespace std; + + + //template + // string setToString(const set& st) + // { + // if(st.empty()) + // { + // return "{}"; + // } + // stringstream ss; + // ss<<'{'; + // typename set::const_iterator it = st.begin(); + // ss<<*it; + // it++; + // while(it != st.end()) + // { + // ss<<", "<<*it; + // it++; + // } + // ss<<'}'; + // return ss.str(); + // } + + //template + // string mapToString(const map& mp) + // { + // if(mp.empty()) + // { + // return "{}"; + // } + // stringstream ss; + // ss<<'{'; + // typename map::const_iterator it = mp.begin(); + // ss<first<<": "<second; + // it++; + // while(it != mp.end()) + // { + // ss<<", "<first<<": "<second; + // it++; + // } + // ss<<'}'; + // return ss.str(); + // } + + //template + // string HashMapToString(const HashMap& mp) + // { + // if(mp.empty()) + // { + // return "{}"; + // } + // stringstream ss; + // ss<<'{'; + // typename HashMap::const_iterator it = mp.begin(); + // ss<first<<": "<second; + // it++; + // while(it != mp.end()) + // { + // ss<<", "<first<<": "<second; + // it++; + // } + // ss<<'}'; + // return ss.str(); + // } + //template + // string pairToString(const pair& p) + // { + // stringstream ss; + // ss< + vT getMap(const map& mp, const kT & key, const vT & defaultVal) + { + typename map::const_iterator it; + it = mp.find(key); + if(mp.end() == it) + { + return defaultVal; + } + return it->second; + } + + template + void map2Vec(const map& mp, vector > & res) + { + typename map::const_iterator it = mp.begin(); + for(; it != mp.end(); it++) + { + res.push_back(*it); + } + } +} + +#endif diff --git a/src/Limonp/std_outbound.hpp b/src/Limonp/std_outbound.hpp new file mode 100644 index 0000000..ab3e5c3 --- /dev/null +++ b/src/Limonp/std_outbound.hpp @@ -0,0 +1,101 @@ +#ifndef LIMONP_STD_OUTBOUND_H +#define LIMONP_STD_OUTBOUND_H + +#include +#include +#include + +namespace std +{ + template + ostream& operator << (ostream& os, const vector& vec) + { + if(vec.empty()) + { + return os << "[]"; + } + os<<"[\""< + ostream& operator << (ostream& os, const pair& pr) + { + os << pr.first << ":" << pr.second ; + return os; + } + + + template + string& operator << (string& str, const T& obj) + { + stringstream ss; + ss << obj; // call ostream& operator << (ostream& os, + return str = ss.str(); + } + + template + ostream& operator << (ostream& os, const map& mp) + { + if(mp.empty()) + { + os<<"{}"; + return os; + } + os<<'{'; + typename map::const_iterator it = mp.begin(); + os<<*it; + it++; + while(it != mp.end()) + { + os<<", "<<*it; + it++; + } + os<<'}'; + return os; + } + template + ostream& operator << (ostream& os, const std::tr1::unordered_map& mp) + { + if(mp.empty()) + { + return os << "{}"; + } + os<<'{'; + typename std::tr1::unordered_map::const_iterator it = mp.begin(); + os<<*it; + it++; + while(it != mp.end()) + { + os<<", "<<*it++; + } + return os<<'}'; + } + + template + ostream& operator << (ostream& os, const set& st) + { + if(st.empty()) + { + os << "{}"; + return os; + } + os<<'{'; + typename set::const_iterator it = st.begin(); + os<<*it; + it++; + while(it != st.end()) + { + os<<", "<<*it; + it++; + } + os<<'}'; + return os; + } +} + +#endif diff --git a/limonp/str_functs.hpp b/src/Limonp/str_functs.hpp similarity index 61% rename from limonp/str_functs.hpp rename to src/Limonp/str_functs.hpp index 0116995..8ad62a3 100644 --- a/limonp/str_functs.hpp +++ b/src/Limonp/str_functs.hpp @@ -15,9 +15,17 @@ #include #include #include -#include "typedefs.h" #include #include +#include +#include +#include +#include +#include "std_outbound.hpp" +#include "map_functs.hpp" + +#define print(x) cout<<(x)<& src, string& dest, const string& connectorStr) - { - if(src.empty()) - { - return false; - } - for(uint i = 0; i < src.size() - 1; i++) - { - dest += src[i]; - dest += connectorStr; - } - dest += src[src.size() - 1]; - return true; - } + //inline bool joinStr(const vector& src, string& dest, const string& connectorStr) + //{ + // if(src.empty()) + // { + // return false; + // } + // for(uint i = 0; i < src.size() - 1; i++) + // { + // dest += src[i]; + // dest += connectorStr; + // } + // dest += src[src.size() - 1]; + // return true; + //} - inline string joinStr(const vector& source, const string& connector) - { - string res; - joinStr(source, res, connector); - return res; - } + //inline string joinStr(const vector& source, const string& connector) + //{ + // string res; + // joinStr(source, res, connector); + // return res; + //} + + template + void join(T begin, T end, string& res, const string& connector) + { + if(begin == end) + { + return; + } + stringstream ss; + ss<<*begin; + begin++; + while(begin != end) + { + ss << connector << *begin; + begin ++; + } + res = ss.str(); + } + + template + string join(T begin, T end, const string& connector) + { + string res; + join(begin ,end, res, connector); + return res; + } + + inline bool splitStr(const string& src, vector& res, const string& pattern) { @@ -104,26 +141,24 @@ namespace Limonp return true; } res.push_back(src.substr(start, end - start)); - if(end == src.size() - 1) - { - res.push_back(""); - break; - } + if(end == src.size() - 1) + { + res.push_back(""); + break; + } start = end + 1; } return true; } - inline string upperStr(const string& strIn) + inline string& upper(string& str) { - string str = strIn; transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper); return str; } - inline string lowerStr(const string& strIn) + inline string& lower(string& str) { - string str = strIn; transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower); return str; } @@ -183,40 +218,40 @@ namespace Limonp return str.find(ch) != string::npos; } - inline void extractWords(const string& sentence, vector& words) - { - bool flag = false; - uint lhs = 0, len = 0; - for(uint i = 0; i < sentence.size(); i++) - { - char x = sentence[i]; - if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a)) - { - if(flag) - { - len ++; - } - else - { - lhs = i; - len = 1; - } - flag = true; - } - else - { - if(flag) - { - words.push_back(string(sentence, lhs, len)); - } - flag = false; - } - } - if(flag) - { - words.push_back(string(sentence, lhs, len)); - } - } + //inline void extractWords(const string& sentence, vector& words) + //{ + // bool flag = false; + // uint lhs = 0, len = 0; + // for(uint i = 0; i < sentence.size(); i++) + // { + // char x = sentence[i]; + // if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a)) + // { + // if(flag) + // { + // len ++; + // } + // else + // { + // lhs = i; + // len = 1; + // } + // flag = true; + // } + // else + // { + // if(flag) + // { + // words.push_back(string(sentence, lhs, len)); + // } + // flag = false; + // } + // } + // if(flag) + // { + // words.push_back(string(sentence, lhs, len)); + // } + //} } diff --git a/cppjieba/MPSegment.cpp b/src/MPSegment.cpp similarity index 100% rename from cppjieba/MPSegment.cpp rename to src/MPSegment.cpp diff --git a/cppjieba/MPSegment.h b/src/MPSegment.h similarity index 98% rename from cppjieba/MPSegment.h rename to src/MPSegment.h index 769743d..a3eaae3 100644 --- a/cppjieba/MPSegment.h +++ b/src/MPSegment.h @@ -7,7 +7,7 @@ #include #include -#include +#include "Limonp/logger.hpp" #include "Trie.h" #include "globals.h" #include "ISegment.hpp" diff --git a/cppjieba/MixSegment.cpp b/src/MixSegment.cpp similarity index 100% rename from cppjieba/MixSegment.cpp rename to src/MixSegment.cpp diff --git a/cppjieba/MixSegment.h b/src/MixSegment.h similarity index 95% rename from cppjieba/MixSegment.h rename to src/MixSegment.h index e85d0e8..079db3f 100644 --- a/cppjieba/MixSegment.h +++ b/src/MixSegment.h @@ -3,7 +3,7 @@ #include "MPSegment.h" #include "HMMSegment.h" -#include +#include "Limonp/str_functs.hpp" namespace CppJieba { diff --git a/cppjieba/SegmentBase.hpp b/src/SegmentBase.hpp similarity index 96% rename from cppjieba/SegmentBase.hpp rename to src/SegmentBase.hpp index 17a7130..b082f56 100644 --- a/cppjieba/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -4,8 +4,8 @@ #include "globals.h" #include "ISegment.hpp" #include "ChineseFilter.hpp" -#include -#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" namespace CppJieba { diff --git a/cppjieba/TransCode.hpp b/src/TransCode.hpp similarity index 98% rename from cppjieba/TransCode.hpp rename to src/TransCode.hpp index 46ede56..febe3a4 100644 --- a/cppjieba/TransCode.hpp +++ b/src/TransCode.hpp @@ -7,8 +7,7 @@ #include "globals.h" -#include -#include +#include "Limonp/str_functs.hpp" namespace CppJieba { diff --git a/cppjieba/Trie.cpp b/src/Trie.cpp similarity index 100% rename from cppjieba/Trie.cpp rename to src/Trie.cpp diff --git a/cppjieba/Trie.h b/src/Trie.h similarity index 97% rename from cppjieba/Trie.h rename to src/Trie.h index 25689be..0fa54e1 100644 --- a/cppjieba/Trie.h +++ b/src/Trie.h @@ -12,8 +12,8 @@ #include #include #include -#include -#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" #include "TransCode.hpp" #include "globals.h" #include "structs.h" diff --git a/cppjieba/globals.h b/src/globals.h similarity index 100% rename from cppjieba/globals.h rename to src/globals.h diff --git a/src/segment.cpp b/src/segment.cpp new file mode 100644 index 0000000..fbd367d --- /dev/null +++ b/src/segment.cpp @@ -0,0 +1,82 @@ +#include +#include +#include "Limonp/ArgvContext.hpp" +#include "MPSegment.h" +#include "HMMSegment.h" +#include "MixSegment.h" + +using namespace CppJieba; + +void cut(const ISegment * seg, const char * const filePath) +{ + ifstream ifile(filePath); + vector res; + string line; + while(getline(ifile, line)) + { + if(!line.empty()) + { + res.clear(); + seg->cut(line, res); + cout<\n" + <<"options:\n" + <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" + <<"\t--dictpath\tsee example\n" + <<"\t--modelpath\tsee example\n" + <<"example:\n" + <<"\t"< #include #include -#include -#include "../husky/Daemon.h" -#include "../husky/ServerFrame.h" -#include "../cppjieba/MPSegment.h" -#include "../cppjieba/HMMSegment.h" -#include "../cppjieba/MixSegment.h" +#include "Limonp/ArgvContext.hpp" +#include "Husky/Daemon.h" +#include "Husky/ServerFrame.h" +#include "MPSegment.h" +#include "HMMSegment.h" +#include "MixSegment.h" using namespace Husky; using namespace CppJieba; @@ -31,7 +31,7 @@ class ServerDemo: public IRequestHandler httpReq.GET("key", tmp); URLDecode(tmp, sentence); _segment.cut(sentence, words); - vecToString(words, strSnd); + strSnd << words; return true; } private: diff --git a/cppjieba/structs.h b/src/structs.h similarity index 81% rename from cppjieba/structs.h rename to src/structs.h index 38fd5ec..88c5894 100644 --- a/cppjieba/structs.h +++ b/src/structs.h @@ -74,7 +74,7 @@ namespace CppJieba KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo) { } - inline string toString() const + string toString() const { string tmp; TransCode::encode(word, tmp); @@ -89,16 +89,23 @@ namespace CppJieba return *this; } }; - - inline string joinWordInfos(const vector& vec) + + inline ostream& operator << (ostream& os, const KeyWordInfo& info) { - vector tmp; - for(uint i = 0; i < vec.size(); i++) - { - tmp.push_back(vec[i].toString()); - } - return joinStr(tmp, ","); + string tmp; + TransCode::encode(info.word, tmp); + return os << "{words:" << tmp << ", weight:" << info.weight << ", idf:" << info.idf << "}"; } + + //inline string joinWordInfos(const vector& vec) + //{ + // vector tmp; + // for(uint i = 0; i < vec.size(); i++) + // { + // tmp.push_back(vec[i].toString()); + // } + // return joinStr(tmp, ","); + //} } #endif diff --git a/test/Makefile b/test/Makefile deleted file mode 100644 index a7d4434..0000000 --- a/test/Makefile +++ /dev/null @@ -1,54 +0,0 @@ -CXX := g++ -LD := g++ -AR := ar rc - -INCS := -I../cppjieba/ - -DEBUG_CXXFLAGS := -g -Wall -DDEBUG -DUT $(INCS) - -CXXFLAGS := ${DEBUG_CXXFLAGS} -LDFLAGS := ${DEBUG_LDFLAGS} - -DOLINK := $(LD) $(LDFLAGS) -DOPACK := $(AR) -SOURCES := $(wildcard *.cpp) -OBJS := $(patsubst %.cpp,%.o,$(SOURCES)) -UTS := $(patsubst %.cpp,%.ut,$(SOURCES)) - -CPPJIEBADIR = ../cppjieba -LIBCPPJIEBA = $(CPPJIEBADIR)/libcppjieba.a - -CPPCOMMONDIR = ../cppcommon -LIBCPPCM = $(CPPCOMMONDIR)/libcm.a - -LIBA := $(LIBCPPJIEBA) $(LIBCPPCM) -# remove the objs after compilation -.PHONY: clean $(LIBA) - -# Main Targets -all: $(UTS) - -# This is a suffix rule -#.c.o: -%.o: %.cpp - $(CXX) -c $(CXXFLAGS) $< -%.ut: %.o $(LIBA) - $(CXX) $(CXXFLAGS) -o $@ $^ - -$(LIBCPPJIEBA): - cd $(CPPJIEBADIR) && $(MAKE) - -$(LIBCPPCM): - cd $(CPPCOMMONDIR) && $(MAKE) - -clean: - rm -f *.o *.ut *.d *.d.* -# cd $(CPPJIEBADIR) && make clean -# cd $(CPPCOMMONDIR) && make clean - -sinclude $(SOURCES:.cpp=.d) -%.d:%.cpp - @set -e; rm -f $@; \ - $(CXX) -MM $< > $@.$$$$; \ - sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - rm -f $@.$$$$ diff --git a/test/segment.cpp b/test/segment.cpp new file mode 100644 index 0000000..e9c4a1b --- /dev/null +++ b/test/segment.cpp @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +using namespace CppJieba; + +void cut(const ISegment * seg, const char * const filePath) +{ + ifstream ifile(filePath); + vector res; + string line; + while(getline(ifile, line)) + { + if(!line.empty()) + { + res.clear(); + seg->cut(line, res); + cout< +#include +#include +#include +#include +#include + +using namespace Husky; +using namespace CppJieba; + +const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8"; +const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8"; + +class ServerDemo: public IRequestHandler +{ + public: + ServerDemo(){}; + virtual ~ServerDemo(){}; + virtual bool init(){return _segment.init(DEFAULT_DICTPATH, DEFAULT_MODELPATH);}; + virtual bool dispose(){return _segment.dispose();}; + public: + virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) + { + string sentence, tmp; + vector words; + httpReq.GET("key", tmp); + URLDecode(tmp, sentence); + _segment.cut(sentence, words); + strSnd << words; + return true; + } + private: + MixSegment _segment; +}; + +int main(int argc,char* argv[]) +{ + if(argc != 7) + { + printf("usage: %s -n THREAD_NUMBER -p LISTEN_PORT -k start|stop\n",argv[0]); + return -1; + } + ArgvContext arg(argc, argv); + unsigned int port = atoi(arg["-p"].c_str()); + unsigned int threadNum = atoi(arg["-n"].c_str()); + + ServerDemo s; + Daemon daemon(&s); + if(arg["-k"] == "start") + { + return !daemon.Start(port, threadNum); + } + else + { + return !daemon.Stop(); + } +} + diff --git a/demo/testlines.utf8 b/test/testlines.utf8 similarity index 100% rename from demo/testlines.utf8 rename to test/testlines.utf8