Merge branch 'dev'

This commit is contained in:
wyy 2013-11-04 07:04:38 -08:00
commit 1f65862c82
60 changed files with 720 additions and 1388 deletions

2
CMakeLists.txt Normal file
View File

@ -0,0 +1,2 @@
# Top-level build script: declares the project and delegates every target to src/.
# NOTE(review): no CMAKE_MINIMUM_REQUIRED() call is visible in this hunk - confirm whether one should precede PROJECT().
PROJECT(CPPJIEBA)
ADD_SUBDIRECTORY(src)

156
README.md
View File

@ -7,41 +7,63 @@
- `master`分支支持`utf8`编码
- `gbk`分支支持`gbk`编码
## 安装与使用
## 模块详解
### 下载和安装
### Trie树
Trie.cpp/Trie.h 负责载入词典的trie树主要供Segment模块使用。
```sh
wget https://github.com/aszxqw/cppjieba/archive/master.zip -O cppjieba-master.zip
unzip cppjieba-master.zip
cd cppjieba-master
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/usr ..
make
sudo make install
```
### Segment模块
### 卸载
```sh
cd build/
cat install_manifest.txt | sudo xargs rm -rf
```
MPSegment.cpp/MPSegment.h
(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法是分词算法的核心。
### 验证
HMMSegment.cpp/HMMSegment.h
是根据HMM模型来进行分词主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
HMM模型由dicts/下面的`hmm_model.utf8`提供。
分词算法即viterbi算法。
```sh
cd test/
g++ -o segment.demo segment.cpp -L/usr/lib/CppJieba/ -lcppjieba
./segment.demo # you will see the demo.
```
运行一下 `./server` 或 `./segment` 都会有对应的帮助文档显示。
### TransCode模块
同时,如果想知道开发时如何使用`libcppjieba.a` 请看`test/segment.cpp`源代码即可。
TransCode.cpp/TransCode.h 负责转换编码类型将utf8和gbk转换成`uint16_t`类型,也负责逆转换
如果想知道如何搭建一个`cppjieba`中文分词的http服务请见 `test/server.cpp`源代码即可
若还有其他问题,欢迎`send mail`或者`open issue`。 :)
### 搭建服务
## Demo
```
cd ./test
g++ -o server server.cpp -L/usr/lib/CppJieba/ -L/usr/lib/CppJieba/Husky -lcppjieba -lhusky -lpthread
./server -n 4 -p 11258 -k start >> run.log 2>&1 #启动服务监听11258这个端口。
./server -n 4 -p 11258 -k stop #停止服务
```
#### 验证服务
然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学`
(用chrome的原因是chrome的默认编码就是utf-8)
或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
## 分词效果
### MPSegment's demo
__这部分的功能经过线上考验一直稳定运行暂时没有发现什么bug。__
```
cd ./demo;
make;
./segment_demo testlines.utf8
```
Output:
```
我来到北京清华大学
@ -59,12 +81,6 @@ Output:
### HMMSegment's demo
```
cd ./demo;
make;
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
```
Output:
```
我来到北京清华大学
@ -78,11 +94,6 @@ Output:
```
### MixSegment's demo
```
cd ./demo;
make;
./segment_demo testlines.utf8 --algorithm cutMix
```
Output:
```
@ -98,62 +109,51 @@ Output:
我/来自/北京邮电大学/。。。/学号/091111xx/。。。
```
### Server's demo
引入了husky这个文件夹husky是一个简单的http服务框架。
```
cd ./demo;
make;
./start.sh #启动一个服务监听11258这个端口(在start.sh里面指定)。
```
关闭和重启分别是 `stop.sh` 和 `restart.sh`。
然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学`
(用chrome的原因是chrome的默认编码就是utf-8)
或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"`
### 效果分析
以上依次是MP,HMM,Mix三种方法的效果。
可以看出效果最好的是Mix也就是融合MP和HMM的切词算法。即可以准确切出词典已有的词又可以切出像"杭研"这样的未登录词。
## Help
## 模块详解
本项目主要是如下目录组成:
### Limonp
### src
核心目录,包含主要源代码。
#### Trie树
Trie.cpp/Trie.h 负责载入词典的trie树主要供Segment模块使用。
#### Segment模块
MPSegment.cpp/MPSegment.h
(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法是分词算法的核心。
HMMSegment.cpp/HMMSegment.h
是根据HMM模型来进行分词主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
HMM模型由dicts/下面的`hmm_model.utf8`提供。
分词算法即viterbi算法。
#### TransCode模块
TransCode.cpp/TransCode.h 负责转换编码类型将utf8和gbk转换成`uint16_t`类型,也负责逆转换。
### src/Husky
提供服务的框架代码,
详见: https://github.com/aszxqw/husky
### src/Limonp
主要是一些工具函数,例如字符串操作等。
直接include就可以使用。
### cppjieba
核心目录,包含主要源代码。
make 之后产生 `libcppjieba.a`
使用方法参考如上cppcommon
### run `./segment_demo` to get help.
如下:
```
usage:
./segment_demo [options] <filename>
options:
--algorithm Supported methods are [cutDAG, cutHMM, cutMix] for now.
If not specified, the default is cutDAG
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
--modelpath If not specified, the default is ../dicts/hmm_model.utf8
If not specified, the default is utf8.
example:
./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix
```
详见: https://github.com/aszxqw/limonp
## 分词速度
@ -163,11 +163,11 @@ example:
测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz`
## Contact
## 联系客服
如果有运行问题或者任何疑问,欢迎联系 : wuyanyi09@gmail.com
## Thanks
## 鸣谢
"结巴中文"分词作者: SunJunyi
https://github.com/fxsjy/jieba

View File

@ -1,360 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#include "KeyWordExt.h"
namespace CppJieba
{
// The constructor does no work; the segmenter is set up separately by init().
KeyWordExt::KeyWordExt() {}

// The destructor does not call dispose(); it releases nothing itself.
KeyWordExt::~KeyWordExt() {}
// Forwards segDictFile to the internal segmenter's init().
// Returns true on success; logs an error and returns false when the
// segmenter reports failure.
bool KeyWordExt::init(const char* const segDictFile)
{
    LogInfo("KeyWordExt init start ...");
    const bool segmentReady = _segment.init(segDictFile);
    if(segmentReady)
    {
        return true;
    }
    LogError("_segment.init failed.");
    return false;
}
bool KeyWordExt::loadStopWords(const char * const filePath)
{
LogInfo("_loadStopWords(%s) start", filePath);
if(!_stopWords.empty())
{
LogError("_stopWords has been loaded before! ");
return false;
}
if(!checkFileExist(filePath))
{
LogError("cann't find file[%s].",filePath);
return false;
}
ifstream ifile(filePath);
string line;
Unicode word;
while(getline(ifile, line))
{
if(!TransCode::decode(line, word))
{
LogError("decode failed .");
return false;
}
_stopWords.insert(word);
}
LogInfo("load stopwords[%d] finished.", _stopWords.size());
return true;
}
// Releases the internal segmenter.
// Returns the segmenter's own dispose() result instead of unconditionally
// reporting success, so a failed cleanup is visible to the caller.
bool KeyWordExt::dispose()
{
    return _segment.dispose();
}
// Ordering predicate for sort(): places entries with a larger weight first.
bool KeyWordExt::_wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b)
{
    return b.weight < a.weight;
}
// Computes idf and weight for every entry, then sorts by descending weight.
//   idf    = -logFreq
//   weight = log(word length + 1) * idf
// Always returns true.
bool KeyWordExt::_sortWLIDF(vector<KeyWordInfo>& wordInfos)
{
    for(vector<KeyWordInfo>::iterator cur = wordInfos.begin(); cur != wordInfos.end(); ++cur)
    {
        cur->idf = - cur->logFreq;
        cur->weight = log(double(cur->word.size() + 1)) * cur->idf;
    }
    sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
    return true;
}
// Truncates wordInfos to at most topN leading entries; shorter vectors are
// left unchanged. Always returns true.
//
// The size comparison is done in unsigned arithmetic; the previous version
// stored (size - topN) in an int, a narrowing conversion, and used two
// hand-written truncation strategies.
bool KeyWordExt::_extTopN(vector<KeyWordInfo>& wordInfos, uint topN)
{
    if(wordInfos.size() > topN)
    {
        wordInfos.erase(wordInfos.begin() + topN, wordInfos.end());
    }
    return true;
}
bool KeyWordExt::extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN)
{
if(words.empty())
{
return false;
}
keyWordInfos.clear();
for(uint i = 0; i < words.size(); i++)
{
Unicode uniWord;
if(!TransCode::decode(words[i], uniWord))
{
LogError("decode failed");
return false;
}
keyWordInfos.push_back(uniWord);
}
return _extract(keyWordInfos, topN);
}
bool KeyWordExt::extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN)
{
if(title.empty())
{
return false;
}
vector<TrieNodeInfo> trieNodeInfos;
Unicode unico;
if(!TransCode::decode(title, unico))
{
return false;
}
_segment.cut(unico.begin(), unico.end(), trieNodeInfos);
keyWordInfos.clear();
for(uint i = 0; i < trieNodeInfos.size(); i++)
{
keyWordInfos.push_back(trieNodeInfos[i]);
}
return _extract(keyWordInfos, topN);
}
// Runs the three extraction stages in order: filter, rank, truncate to topN.
// Stops at the first stage that fails, logs which one, and returns false.
bool KeyWordExt::_extract(vector<KeyWordInfo>& keyWordInfos, uint topN)
{
    const bool filtered = _filter(keyWordInfos);
    if(!filtered)
    {
        LogError("_filter failed.");
        return false;
    }

    const bool ranked = _sortWLIDF(keyWordInfos);
    if(!ranked)
    {
        LogError("_sortWLIDF failed.");
        return false;
    }

    const bool truncated = _extTopN(keyWordInfos, topN);
    if(!truncated)
    {
        LogError("_extTopN failed.");
        return false;
    }
    return true;
}
// Applies every filter in a fixed order: duplicates, single-character
// words, stop words, then words contained in another word.
// Stops at the first filter that fails, logs which one, and returns false.
bool KeyWordExt::_filter(vector<KeyWordInfo>& wordInfos)
{
    const bool deduplicated = _filterDuplicate(wordInfos);
    if(!deduplicated)
    {
        LogError("_filterDuplicate failed.");
        return false;
    }

    const bool singlesRemoved = _filterSingleWord(wordInfos);
    if(!singlesRemoved)
    {
        LogError("_filterSingleWord failed.");
        return false;
    }

    const bool stopWordsRemoved = _filterStopWords(wordInfos);
    if(!stopWordsRemoved)
    {
        LogError("_filterStopWords failed.");
        return false;
    }

    const bool substringsRemoved = _filterSubstr(wordInfos);
    if(!substringsRemoved)
    {
        LogError("_filterSubstr failed.");
        return false;
    }
    return true;
}
// Removes every entry whose word is in _stopWords.
// A no-op when no stop words have been loaded. Always returns true.
bool KeyWordExt::_filterStopWords(vector<KeyWordInfo>& wordInfos)
{
    if(_stopWords.empty())
    {
        return true;
    }

    vector<KeyWordInfo>::iterator cur = wordInfos.begin();
    while(cur != wordInfos.end())
    {
        const bool isStopWord = _stopWords.find(cur->word) != _stopWords.end();
        if(isStopWord)
        {
            cur = wordInfos.erase(cur); // erase() yields the next valid position
            continue;
        }
        ++cur;
    }
    return true;
}
// Keeps only the first occurrence of each word, preserving order.
// Always returns true.
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
{
    set<Unicode> seen;
    vector<KeyWordInfo>::iterator cur = wordInfos.begin();
    while(cur != wordInfos.end())
    {
        if(seen.find(cur->word) == seen.end())
        {
            seen.insert(cur->word);
            ++cur;
            continue;
        }
        cur = wordInfos.erase(cur); // repeated word: drop it
    }
    return true;
}
// Removes entries whose word consists of exactly one element.
// Always returns true.
bool KeyWordExt::_filterSingleWord(vector<KeyWordInfo>& wordInfos)
{
    vector<KeyWordInfo>::iterator cur = wordInfos.begin();
    while(cur != wordInfos.end())
    {
        if(cur->word.size() != 1)
        {
            ++cur;
            continue;
        }
        cur = wordInfos.erase(cur);
    }
    return true;
}
// Removes every word that occurs inside a different word of the list.
// The comparison set is a snapshot taken before anything is erased, so a
// word is still removed when the longer word containing it is itself
// removed later. Always returns true.
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
{
    vector<Unicode> snapshot;
    for(vector<KeyWordInfo>::const_iterator src = wordInfos.begin(); src != wordInfos.end(); ++src)
    {
        snapshot.push_back(src->word);
    }

    vector<KeyWordInfo>::iterator cur = wordInfos.begin();
    while(cur != wordInfos.end())
    {
        if(!_isSubIn(snapshot, cur->word))
        {
            ++cur;
            continue;
        }
        cur = wordInfos.erase(cur);
    }
    return true;
}
//bool KeyWordExt::_isContainSubWords(const string& word)
//{
// for(uint i = 0; i < _priorSubWords.size(); i++)
// {
// if(string::npos != word.find(_priorSubWords[i]))
// {
// return true;
// }
// }
// return false;
//}
//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
//{
// if(2 > wordInfos.size())
// {
// return true;
// }
// KeyWordInfo prior;
// bool flag = false;
// for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
// {
// if(_isContainSubWords(it->word))
// {
// prior = *it;
// it = wordInfos.erase(it);
// flag = true;
// break;
// }
// else
// {
// it ++;
// }
// }
// if(flag)
// {
// wordInfos.insert(wordInfos.begin(), prior);
// }
// return true;
//}
}
#ifdef KEYWORDEXT_UT
using namespace CppJieba;
int main()
{
KeyWordExt ext;
ext.init();
if(!ext.loadSegDict("../dicts/segdict.gbk.v2.1"))
{
return 1;
}
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
ifstream ifile("testtitle.gbk");
vector<string> res;
string line;
while(getline(ifile, line))
{
cout<<line<<endl;
res.clear();
ext.extract(line, res, 20);
PRINT_VECTOR(res);
}
ext.dispose();
return 0;
}
#endif

View File

@ -1,68 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_KEYWORDEXT_H
#define CPPJIEBA_KEYWORDEXT_H
#include <logger.hpp>
#include "MPSegment.h"
#include "structs.h"
namespace CppJieba
{
class KeyWordExt
{
private:
MPSegment _segment;
//vector<string> _priorSubWords;
set<Unicode> _stopWords;
public:
KeyWordExt();
~KeyWordExt();
bool init(const char* const segDictFile);
bool dispose();
bool loadStopWords(const char * const filePath);
private:
//bool _loadPriorSubWords(const char * const filePath);
public:
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
private:
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
private:
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
private:
//sort by word len - idf
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
private:
bool _filter(vector<KeyWordInfo>& );
bool _filterDuplicate(vector<KeyWordInfo>& );
bool _filterSingleWord(vector<KeyWordInfo>& );
bool _filterSubstr(vector<KeyWordInfo>& );
bool _filterStopWords(vector<KeyWordInfo>& );
private:
inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
{
for(uint j = 0; j < words.size(); j++)
{
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
{
return true;
}
}
return false;
}
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
//bool _isContainSubWords(const string& word);
};
}
#endif

View File

@ -1,66 +0,0 @@
# Builds the static library libcppjieba.a from every *.cpp in this directory,
# plus a set of per-module unit-test binaries (*.ut).
CXX := g++
LD := g++
AR := ar rc
DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3
# Debug flags are the default; pass RELEASE=YES to select the release flags.
# NOTE(review): RELEASE_LDFLAGS and DEBUG_LDFLAGS are not defined in this file, so LDFLAGS expands to nothing unless set elsewhere.
ifeq (YES, ${RELEASE})
CXXFLAGS := ${RELEASE_CXXFLAGS}
LDFLAGS := ${RELEASE_LDFLAGS}
else
CXXFLAGS := ${DEBUG_CXXFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
endif
SOURCES := $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
INC := -I../limonp
LIBA := libcppjieba.a
# remove the objs after compilation
.INTERMEDIATE:
#.PHONY: clean $(CMLIB)
.PHONY: clean
all: $(LIBA)
# This is a suffix rule
#.c.o:
%.o: %.cpp
$(CXX) -c $(CXXFLAGS) $< $(INC)
${LIBA}: $(OBJS)
$(AR) $@ $(OBJS)
#unit test
# Each *.ut target compiles the listed sources with a -D<MODULE>_UT macro.
# NOTE(review): $(CMLIB) is never assigned in this file, so it expands to nothing here - confirm whether it is still needed.
Trie.ut: Trie.cpp Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) Trie.cpp TransCode.cpp -DTRIE_UT $(CMLIB)
MPSegment.ut: MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) MPSegment.cpp Trie.cpp TransCode.cpp -DSEGMENT_UT $(CMLIB)
KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h MPSegment.h Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) KeyWordExt.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DKEYWORDEXT_UT $(CMLIB)
TransCode.ut: TransCode.cpp TransCode.hpp globals.h $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) TransCode.cpp -DCPPJIEBA_TRANSCODE_UT $(CMLIB)
HMMSegment.ut: HMMSegment.cpp TransCode.cpp TransCode.hpp HMMSegment.h $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) TransCode.cpp HMMSegment.cpp -DHMMSEGMENT_UT $(CMLIB)
MixSegment.ut: MixSegment.cpp MixSegment.h HMMSegment.cpp MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB)
$(CXX) -o $@ $(CXXFLAGS) MixSegment.cpp HMMSegment.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DMIXSEGMENT_UT $(CMLIB)
ChineseFilter.ut: ChineseFilter.cpp ChineseFilter.hpp
$(CXX) -o $@ $(CXXFLAGS) ChineseFilter.cpp -DUT
clean:
rm -f *.o *.d *.d.* *.ut $(LIBA)
# Pull in the generated dependency files; sinclude ignores ones that do not exist yet.
sinclude $(SOURCES:.cpp=.d)
# Generate foo.d from foo.cpp with "g++ -MM", rewriting the rule so that both
# foo.o and foo.d depend on the same headers.
%.d:%.cpp
@set -e; rm -f $@; \
$(CXX) -MM $< > $@.$$$$; \
sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
rm -f $@.$$$$

View File

@ -1,53 +0,0 @@
# Builds one <name>.demo executable per *.cpp in this directory, linking each
# against libcppjieba.a and libhusky.a (which are built on demand).
CXX := g++
LD := g++
AR := ar rc
DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3
# Release flags are the default here; pass DEBUG=YES to select the debug flags.
ifeq (YES, ${DEBUG})
CXXFLAGS := ${DEBUG_CXXFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
else
CXXFLAGS := ${RELEASE_CXXFLAGS}
LDFLAGS := ${RELEASE_LDFLAGS}
endif
INCS := -I../limonp
LINK := -lpthread
SOURCES := $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
DEMOS := $(patsubst %.cpp,%.demo,$(SOURCES))
CPPJIEBADIR := ../cppjieba
LIBCPPJIEBA := $(CPPJIEBADIR)/libcppjieba.a
HUSKYDIR := ../husky
LIBHUSKYA := $(HUSKYDIR)/libhusky.a
# Declaring the library phony makes its sub-make run on every build.
.PHONY: clean $(LIBCPPJIEBA)
all: $(DEMOS)
# $^ expands to the source file plus both static libraries.
%.demo: %.cpp $(LIBCPPJIEBA) $(LIBHUSKYA)
$(CXX) -o $@ $(CXXFLAGS) $^ $(INCS) $(LINK)
$(LIBCPPJIEBA):
cd $(CPPJIEBADIR) && $(MAKE)
$(LIBHUSKYA):
cd $(HUSKYDIR) && $(MAKE)
# NOTE(review): the clean recipe calls plain "make" rather than $(MAKE), unlike the library rules above - confirm whether that is intended.
clean:
rm -f *.o *.ut *.d *.d.* $(DEMOS)
cd $(CPPJIEBADIR) && make clean
cd $(HUSKYDIR) && make clean
# Pull in the generated dependency files; sinclude ignores ones that do not exist yet.
sinclude $(SOURCES:.cpp=.d)
%.d:%.cpp
@set -e; rm -f $@; \
$(CXX) -MM $< > $@.$$$$; \
sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
rm -f $@.$$$$

View File

@ -1,3 +0,0 @@
# Runs the segmentation demo on testlines.utf8 three times:
# once with an explicit dictionary path, once with --algorithm cutHMM,
# and once with --algorithm cutMix.
./segment.demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
./segment.demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
./segment.demo testlines.utf8 --algorithm cutMix

View File

@ -1,56 +0,0 @@
#include <iostream>
#include <fstream>
#include <ArgvContext.hpp>
#include "../cppjieba/KeyWordExt.h"
using namespace CppJieba;
void testKeyWordExt(const char * dictPath, const char * filePath)
{
KeyWordExt ext;
if(!ext.init(dictPath))
{
return;
}
ifstream ifile(filePath);
vector<KeyWordInfo> res;
string line;
while(getline(ifile, line))
{
res.clear();
if(!line.empty())
{
ext.extract(line, res, 20);
cout<<line<<'\n'<<joinWordInfos(res)<<endl;
}
}
ext.dispose();
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
// Entry point of the keyword-extraction demo.
// Usage: <program> <filename> [--dictpath <path>]
// Prints the usage text and returns -1 when no argument is given; otherwise
// extracts keywords from every line of the file and returns 0.
int main(int argc, char ** argv)
{
    if(2 > argc)
    {
        // The example no longer advertises "--encoding": this program only
        // reads the --dictpath option.
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
            <<"examples:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath ../dicts/jieba.dict.utf8\n"
            <<endl;
        return -1;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    if(dictPath.empty())
    {
        dictPath = DEFAULT_DICTPATH;
    }
    // arg[1] is the first positional argument, i.e. the input file.
    testKeyWordExt(dictPath.c_str(), arg[1].c_str());
    return 0;
}

View File

@ -1,124 +0,0 @@
#include <iostream>
#include <fstream>
#include <ArgvContext.hpp>
#include "../cppjieba/MPSegment.h"
#include "../cppjieba/HMMSegment.h"
#include "../cppjieba/MixSegment.h"
using namespace CppJieba;
MPSegment seg;
HMMSegment hmmseg;
MixSegment mixseg;
// Initialises the three global segmenters in order: seg (dictionary),
// hmmseg (model), mixseg (dictionary + model).
// Stops at the first failure, prints which segmenter failed, and returns
// false; returns true when all three succeed.
bool init(const char * const dictPath, const char * const modelPath)
{
    const bool mpReady = seg.init(dictPath);
    if(!mpReady)
    {
        cout<<"seg init failed."<<endl;
        return false;
    }

    const bool hmmReady = hmmseg.init(modelPath);
    if(!hmmReady)
    {
        cout<<"hmmseg init failed."<<endl;
        return false;
    }

    const bool mixReady = mixseg.init(dictPath, modelPath);
    if(!mixReady)
    {
        cout<<"mixseg init failed."<<endl;
        return false;
    }
    return true;
}
void cut(const ISegment * seg, const char * const filePath)
{
ifstream ifile(filePath);
vector<string> res;
string line;
while(getline(ifile, line))
{
if(!line.empty())
{
res.clear();
seg->cut(line, res);
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
}
}
}
// Disposes the three global segmenters in order: seg, hmmseg, mixseg.
// Stops at the first failure and returns false; returns true when all
// three succeed. Each message now names the segmenter that actually
// failed (all three used to print "seg dispose failed.").
bool dispose()
{
    if(!seg.dispose())
    {
        cout<<"seg dispose failed."<<endl;
        return false;
    }
    if(!hmmseg.dispose())
    {
        cout<<"hmmseg dispose failed."<<endl;
        return false;
    }
    if(!mixseg.dispose())
    {
        cout<<"mixseg dispose failed."<<endl;
        return false;
    }
    return true;
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
// Entry point of the segmentation demo.
// Usage: <program> <filename> [--algorithm cutDAG|cutHMM|cutMix]
//                             [--dictpath <path>] [--modelpath <path>]
// Prints the usage text and returns -1 when no argument is given or when
// the segmenters cannot be initialised; otherwise segments the file and
// returns 0. Any --algorithm value other than cutHMM/cutMix selects the
// dictionary-based segmenter.
int main(int argc, char ** argv)
{
    if(argc < 2)
    {
        // A space now separates the program name from "[options]".
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
            <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
            <<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
            <<"example:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath ../dicts/jieba.dict.utf8\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
            <<endl;
        return -1;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    string modelPath = arg["--modelpath"];
    string algorithm = arg["--algorithm"];
    if(dictPath.empty())
    {
        dictPath = DEFAULT_DICTPATH;
    }
    if(modelPath.empty())
    {
        modelPath = DEFAULT_MODELPATH;
    }
    if(!init(dictPath.c_str(), modelPath.c_str()))
    {
        LogError("init failed.");
        return -1;
    }
    // arg[1] is the first positional argument, i.e. the input file.
    if("cutHMM" == algorithm)
    {
        cut(&hmmseg, arg[1].c_str());
    }
    else if("cutMix" == algorithm)
    {
        cut(&mixseg, arg[1].c_str());
    }
    else
    {
        cut(&seg, arg[1].c_str());
    }
    dispose();
    return 0;
}

View File

@ -1,42 +0,0 @@
# Builds the static library libhusky.a from every *.cpp in this directory.
CXX := g++
LD := g++
AR := ar rc

DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3

# Debug flags are the default; pass RELEASE=YES to select the release flags.
ifeq (YES, ${RELEASE})
CXXFLAGS := ${RELEASE_CXXFLAGS}
LDFLAGS := ${RELEASE_LDFLAGS}
else
CXXFLAGS := ${DEBUG_CXXFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
endif

# Deferred (=) assignment: $@ and $^ only have values inside a recipe, so
# they must be expanded at use. With := they were expanded here, to nothing.
DOLINK = $(LD) $(LDFLAGS) -o $@ $^
DOPACK := $(AR)

SOURCES = $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
INC := -I../limonp
LIBA := libhusky.a

.PHONY: clean

all: $(LIBA)

%.o: %.cpp
	$(CXX) -c $(CXXFLAGS) $< $(INC)

${LIBA}: $(OBJS)
	$(DOPACK) $@ $(OBJS)

clean:
	rm -f *.o *.d *.d.* $(LIBA)

# Pull in the generated dependency files; sinclude ignores ones that do not exist yet.
sinclude $(SOURCES:.cpp=.d)

# Generate foo.d from foo.cpp with "g++ -MM", rewriting the rule so that both
# foo.o and foo.d depend on the same headers.
%.d:%.cpp
	@set -e; rm -f $@; \
	$(CXX) -MM $< > $@.$$$$; \
	sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
	rm -f $@.$$$$

View File

@ -1,123 +0,0 @@
/************************************
* file enc : ascii
* author : wuyanyi09@gmail.com
************************************/
#ifndef LIMONP_MAP_FUNCTS_H
#define LIMONP_MAP_FUNCTS_H
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include "typedefs.h"
namespace Limonp
{
using namespace std;
// Renders a set as "{a, b, c}" in iteration order; an empty set yields "{}".
// Elements are written with operator<<.
template <typename T>
string setToString(const set<T>& st)
{
    stringstream out;
    out<<'{';
    for(typename set<T>::const_iterator cur = st.begin(); cur != st.end(); ++cur)
    {
        if(cur != st.begin())
        {
            out<<", ";
        }
        out<<*cur;
    }
    out<<'}';
    return out.str();
}
// Renders a map as "{k1: v1, k2: v2}" in key order; an empty map yields "{}".
// Keys and values are written with operator<<.
template<typename T1, typename T2>
string mapToString(const map<T1, T2>& mp)
{
    stringstream out;
    out<<'{';
    for(typename map<T1, T2>::const_iterator cur = mp.begin(); cur != mp.end(); ++cur)
    {
        if(cur != mp.begin())
        {
            out<<", ";
        }
        out<<cur->first<<": "<<cur->second;
    }
    out<<'}';
    return out.str();
}
template<typename T1, typename T2>
string HashMapToString(const HashMap<T1, T2>& mp)
{
if(mp.empty())
{
return "{}";
}
stringstream ss;
ss<<'{';
typename HashMap<T1, T2>::const_iterator it = mp.begin();
ss<<it->first<<": "<<it->second;
it++;
while(it != mp.end())
{
ss<<", "<<it->first<<": "<<it->second;
it++;
}
ss<<'}';
return ss.str();
}
// Renders a pair as "first:second" (no space after the colon).
template<typename T1, typename T2>
string pairToString(const pair<T1, T2>& p)
{
    stringstream out;
    out<<p.first;
    out<<":";
    out<<p.second;
    return out.str();
}
// Writes each entry to cout as "key value", one entry per line, in key order.
template<class kT, class vT>
void printMap(const map<kT, vT>& mp)
{
    typename map<kT, vT>::const_iterator cur = mp.begin();
    while(cur != mp.end())
    {
        cout<<cur->first<<' '<<cur->second<<endl;
        ++cur;
    }
}
// Returns a copy of the value stored under key, or defaultVal when the key
// is absent. The map is never modified.
template<class kT, class vT>
vT getMap(const map<kT, vT>& mp, const kT & key, const vT & defaultVal)
{
    const typename map<kT, vT>::const_iterator hit = mp.find(key);
    if(hit != mp.end())
    {
        return hit->second;
    }
    return defaultVal;
}
// Appends every (key, value) entry of mp to res in key order.
// res is not cleared first.
template<class kT, class vT>
void map2Vec(const map<kT, vT>& mp, vector<pair<kT, vT> > & res)
{
    typename map<kT, vT>::const_iterator cur = mp.begin();
    while(cur != mp.end())
    {
        res.push_back(*cur);
        ++cur;
    }
}
}
#endif

View File

@ -1,21 +0,0 @@
/************************************
* file enc : utf8
* author : wuyanyi09@gmail.com
************************************/
#ifndef LIMONP_TYPEDEFS_H
#define LIMONP_TYPEDEFS_H
#include <stdint.h>
#include <vector>
#include <sys/types.h>
#include <tr1/unordered_map>
#define HashMap std::tr1::unordered_map
namespace Limonp
{
    // Text held as a sequence of 16-bit values.
    typedef std::vector<uint16_t> Unicode;
    // Read-only iterator over a Unicode sequence.
    typedef Unicode::const_iterator UnicodeConstIterator;
}
#endif

View File

@ -1,142 +0,0 @@
/************************************
* file enc : ascii
* author : wuyanyi09@gmail.com
************************************/
#ifndef LIMONP_VEC_FUNCTS_H
#define LIMONP_VEC_FUNCTS_H
#include <iostream>
#include <vector>
#include <algorithm>
#include <queue>
#include <string>
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <algorithm>
#include <sstream>
// FOR_VECTOR(vec, i): loop header that counts i from 0 up to vec.size() - 1.
// Neither argument is parenthesised in the expansion, so pass plain names.
#define FOR_VECTOR(vec, i) for(size_t i = 0; i < vec.size(); i++)
// PRINT_VECTOR(vec): writes every element of vec to cout, one per line.
// Uses the unqualified name cout and declares a loop variable named i.
#define PRINT_VECTOR(vec) FOR_VECTOR(vec, i)\
{\
cout<<vec[i]<<endl;\
}
// PRINT_MATRIX(mat): writes every element of a nested container as
// "[row,col]:value", one per line. Declares loop variables named i and j.
#define PRINT_MATRIX(mat) FOR_VECTOR(mat, i) \
{\
FOR_VECTOR(mat[i], j)\
{\
cout<<"["<<i<<","<<j<<"]:"<<mat[i][j]<<endl;\
}\
}
namespace Limonp
{
using namespace std;
// Renders vec into res as ["a", "b", "c"]: every element is wrapped in
// double quotes and written with operator<<.
// For an empty vector res becomes "[]" and the function returns false;
// otherwise it returns true.
//
// The index is a size_t: the previous `uint` is not a standard type and
// was compared against the size_t returned by vec.size().
template <typename T>
bool vecToString(const vector<T>& vec, string& res)
{
    if(vec.empty())
    {
        res = "[]";
        return false;
    }
    stringstream ss;
    ss<<"[\""<<vec[0];
    for(size_t i = 1; i < vec.size(); i++)
    {
        ss<<"\", \""<<vec[i];
    }
    ss<<"\"]";
    res = ss.str();
    return true;
}

// Convenience overload returning the rendered string directly
// ("[]" for an empty vector).
template <typename T>
string vecToString(const vector<T>& vec)
{
    string res;
    vecToString(vec, res);
    return res;
}
// True when item compares equal to at least one element of vec.
template<typename T>
bool isInVec(const vector<T>& vec, const T& item)
{
    return vec.end() != find(vec.begin(), vec.end(), item);
}
// Splits vecSrc at every element that equals one of `patterns`.
// Each non-empty run of non-pattern elements is appended to outVec as
// (delimiter, run), where delimiter is the most recent pattern element seen
// before the run ended. Runs that end before any pattern was seen carry a
// value-initialised T (0 for arithmetic types). Pattern elements themselves
// never appear in a run, and outVec is not cleared first.
//
// Changes from the previous version:
//  - `pattern` is value-initialised; it used to be read uninitialised for
//    built-in types when the input did not start with a pattern.
//  - pairs are built with the pair constructor; make_pair with explicit
//    template arguments does not accept lvalues under C++11 and later.
template<typename T>
void splitVec(const vector<T>& vecSrc, vector< pair<T, vector<T> > >& outVec, const vector<T>& patterns)
{
    vector<T> tmp;
    T pattern = T();
    for(size_t i = 0; i < vecSrc.size(); i++)
    {
        const typename vector<T>::const_iterator hit = find(patterns.begin(), patterns.end(), vecSrc[i]);
        if(hit == patterns.end())
        {
            tmp.push_back(vecSrc[i]);
            continue;
        }
        // A delimiter closes the current run (if any) under the delimiter
        // that was active while the run was collected.
        if(!tmp.empty())
        {
            outVec.push_back(pair<T, vector<T> >(pattern, tmp));
            tmp.clear();
        }
        pattern = *hit;
    }
    if(!tmp.empty())
    {
        outVec.push_back(pair<T, vector<T> >(pattern, tmp));
    }
}
// Splits vecSrc at every element that equals one of patternVec and appends
// each non-empty run of the remaining elements to outVec.
// Delimiters are dropped, empty runs are skipped, and outVec is not
// cleared first.
template<typename T>
void splitVec(const vector<T>& vecSrc, vector< vector<T> >& outVec, const vector<T>& patternVec)
{
    vector<T> run;
    for(typename vector<T>::const_iterator cur = vecSrc.begin(); cur != vecSrc.end(); ++cur)
    {
        const bool isDelimiter = find(patternVec.begin(), patternVec.end(), *cur) != patternVec.end();
        if(!isDelimiter)
        {
            run.push_back(*cur);
            continue;
        }
        if(!run.empty())
        {
            outVec.push_back(run);
            run.clear();
        }
    }
    if(!run.empty())
    {
        outVec.push_back(run);
    }
}
}
#endif

View File

@ -1 +0,0 @@
# Inserts the standard "file enc / author" banner comment as the first line(s)
# of every .h, .cpp and .tcc file under ../src, editing the files in place.
sed -i '1i/************************************\n * file enc : utf8\n * author : wuyanyi09@gmail.com\n************************************/' ../src/*.h ../src/*.cpp ../src/*.tcc

View File

@ -1,29 +0,0 @@
#!/usr/bin/python
import sys
# Validate one or more dictionary files.
# Every line must be "word count tag" separated by single spaces, each word
# must be unique across all files given, and each count must be a positive
# integer. Prints "OK" when every file passes; otherwise reports the first
# offending line and exits with status 1.
if len(sys.argv) == 1:
    # The program name is now substituted into the usage text; the "%s" used
    # to be printed literally.
    print("usage : %s dict_file1 dict_file2 ..." % sys.argv[0])
    sys.exit(1)

seen_words = {}
for fname in sys.argv[1:]:
    with open(fname, "r") as fin:
        for i, line in enumerate(fin):
            try:
                word, cnt, tag = line.strip().split(" ")
                duplicate = word in seen_words
                seen_words[word] = True
                invalid = duplicate or int(cnt) <= 0
            except Exception:
                # Wrong number of fields or a count that is not an integer.
                invalid = True
            if invalid:
                print("error file[%s] line[%s] : %s" % (fname, i, line))
                sys.exit(1)
print("OK")

View File

@ -1,23 +0,0 @@
#!/usr/bin/python
import sys
# Filter a dictionary file and print the surviving lines to stdout.
# A line is dropped when its word was already seen on an earlier line or
# when its count is not positive. Lines are expected to be "word count tag"
# separated by single spaces; a malformed line raises.
if len(sys.argv) == 1:
    # The program name is now substituted into the usage text; the "%s" used
    # to be printed literally.
    print("usage : %s dict_file_path" % sys.argv[0])
    sys.exit(1)

seen_words = {}
with open(sys.argv[1], "r") as fin:
    for i, line in enumerate(fin):
        word, cnt, tag = line.strip().split(" ")
        if word in seen_words:
            continue
        # The word is recorded even when its count is rejected just below.
        seen_words[word] = True
        if int(cnt) <= 0:
            continue
        print(line.strip())

View File

@ -1,15 +0,0 @@
#!/usr/bin/python
import sys
# Re-encode a text file line by line and print the result to stdout.
# Arguments: source encoding, target encoding, file path.
# A line that cannot be converted is skipped and the error is written to
# stderr.
if len(sys.argv) != 4:
    print("usage : %s from_enc to_enc dict_file_path \nexample: %s gbk utf-8 fname" % (__file__, __file__))
    exit(1)

src_enc = sys.argv[1]
dst_enc = sys.argv[2]
with open(sys.argv[3], "r") as fin:
    for raw in fin:
        try:
            print(raw.strip().decode(src_enc).encode(dst_enc))
        except Exception as err:
            sys.stderr.write("%s\n" % (err,))

23
src/CMakeLists.txt Normal file
View File

@ -0,0 +1,23 @@
# Builds the cppjieba static library plus the `segment` and `server`
# executables, and installs them together with the public headers.
# Executables go to <build>/bin and libraries to <build>/lib.
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp)
ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC})
ADD_EXECUTABLE(segment segment.cpp)
ADD_EXECUTABLE(server server.cpp)
# NOTE(review): LINK_DIRECTORIES is called after the executables are declared, and `husky` is itself a target from the Husky subdirectory - confirm this line is still needed.
LINK_DIRECTORIES(husky)
TARGET_LINK_LIBRARIES(segment cppjieba)
# server additionally links the husky library and pthread.
TARGET_LINK_LIBRARIES(server cppjieba husky pthread)
SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1)
# Install layout: lib/CppJieba, bin/CppJieba and include/CppJieba under the install prefix.
INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba)
INSTALL(TARGETS segment RUNTIME DESTINATION bin/CppJieba)
INSTALL(TARGETS server RUNTIME DESTINATION bin/CppJieba)
INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp)

View File

@ -4,8 +4,8 @@
#include <iostream>
#include <fstream>
#include <memory.h>
#include <str_functs.hpp>
#include <logger.hpp>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "globals.h"
#include "TransCode.hpp"
#include "ISegment.hpp"

8
src/Husky/CMakeLists.txt Normal file
View File

@ -0,0 +1,8 @@
# Builds the husky static library from Daemon.cpp and ServerFrame.cpp and
# installs the archive and its headers under the CppJieba/Husky directories.
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(LIBHUSKY_SRC Daemon.cpp ServerFrame.cpp)
ADD_LIBRARY(husky STATIC ${LIBHUSKY_SRC})
INSTALL(TARGETS husky ARCHIVE DESTINATION lib/CppJieba/Husky)
INSTALL(FILES Daemon.h globals.h HttpReqInfo.hpp ServerFrame.h ThreadManager.hpp DESTINATION include/CppJieba/Husky)

View File

@ -8,7 +8,7 @@
#include <sys/wait.h>
#include <sys/stat.h>
#include <signal.h>
#include <logger.hpp>
#include "../Limonp/logger.hpp"
#include "ServerFrame.h"
namespace Husky

View File

@ -3,14 +3,14 @@
#include <iostream>
#include <string>
#include "../Limonp/logger.hpp"
#include "../Limonp/str_functs.hpp"
#include "globals.h"
#include <str_functs.hpp>
#include <logger.hpp>
#include <map_functs.hpp>
namespace Husky
{
using namespace Limonp;
using namespace std;
static const char* const KEY_METHOD = "METHOD";
static const char* const KEY_PATH = "PATH";
@ -130,7 +130,8 @@ namespace Husky
LogFatal("headerStr illegal.");
return false;
}
_headerMap[upperStr(k)] = v;
upper(k);
_headerMap[k] = v;
lpos = rpos + 1;
}
//message header end
@ -160,6 +161,8 @@ namespace Husky
HashMap<string, string> _headerMap;
HashMap<string, string> _methodGetMap;
HashMap<string, string> _methodPostMap;
//public:
friend ostream& operator<<(ostream& os, const HttpReqInfo& obj);
private:
bool _find(const HashMap<string, string>& mp, const string& key, string& res)const
{
@ -171,19 +174,6 @@ namespace Husky
res = it->second;
return true;
}
public:
//string toString() const;// function for debug because of heavy time consuming
string toString() const
{
string res("{");
res += HashMapToString(_headerMap);
res += ",";
res += HashMapToString(_methodGetMap);
res += ",";
res += HashMapToString(_methodPostMap);
res += "}";
return res;
}
private:
bool _parseUrl(const string& url, HashMap<string, string>& mp)
{
@ -226,6 +216,11 @@ namespace Husky
}
};
// Streams the request's header map, GET-parameter map and POST-parameter
// map, in that order, with no separator between them.
inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj)
{
    return os
        << obj._headerMap
        << obj._methodGetMap
        << obj._methodPostMap;
}
}
#endif

View File

@ -134,13 +134,13 @@ namespace Husky
nRetCode = recv(hClientSock, chRecvBuf, RECV_BUFFER, 0);
strRec = chRecvBuf;
#ifdef DEBUG
LogDebug("response[%s]", strRec.c_str());
#ifdef HUKSY_DEBUG
LogDebug("request[%s]", strRec.c_str());
#endif
if(SOCKET_ERROR==nRetCode)
{
LogError("error [%s]", strerror(errno));
LogDebug("error [%s]", strerror(errno));
closesocket(hClientSock);
continue;
}
@ -160,15 +160,15 @@ namespace Husky
strHttpResp=chHttpHeader;
strHttpResp+=strSnd;
#ifdef HUKSY_DEBUG
LogDebug("response'body [%s]", strSnd.c_str());
#endif
if (SOCKET_ERROR==send(hClientSock,strHttpResp.c_str(),strHttpResp.length(),0))
{
LogError("error [%s]", strerror(errno));
}
#ifdef DEBUG
LogDebug("send response [%s] ", strHttpResp.c_str());
#endif
closesocket(hClientSock);
}

View File

@ -10,7 +10,6 @@
#include <sstream>
#include "str_functs.hpp"
#include "map_functs.hpp"
#include "vec_functs.hpp"
namespace Limonp
{
@ -43,12 +42,7 @@ namespace Limonp
}
~ArgvContext(){};
public:
string toString()
{
stringstream ss;
ss<<vecToString<string>(_args)<<mapToString<string, string>(_mpss)<<setToString<string>(_sset);
return ss.str();
}
friend ostream& operator << (ostream& os, const ArgvContext& args);
string operator [](uint i)
{
if(i < _args.size())
@ -81,6 +75,16 @@ namespace Limonp
set<string> _sset;
};
// Streams the three members _args, _mpss and _sset, in that order, with no
// separator between them.
inline ostream& operator << (ostream& os, const ArgvContext& args)
{
    return os
        << args._args
        << args._mpss
        << args._sset;
}
//string toString()
//{
// stringstream ss;
// return ss.str();
//}
}
#endif

View File

@ -0,0 +1,2 @@
# Header-only directory: installs every *.hpp found here into include/CppJieba/Limonp.
FILE(GLOB HEAD_HPP_LIST "*.hpp")
INSTALL(FILES ${HEAD_HPP_LIST} DESTINATION include/CppJieba/Limonp)

View File

@ -6,7 +6,6 @@
#include <vector>
#include <string>
#include "logger.hpp"
#include "vec_functs.hpp"
namespace Limonp
{
@ -21,8 +20,9 @@ namespace Limonp
const char * const USER;
const char * const PASSWD;
const char * const DB;
const char * const CHARSET;
public:
MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db){ _conn = NULL;};
MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db, const char* charset = "utf8"): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db), CHARSET(charset){ _conn = NULL;};
~MysqlClient(){dispose();};
public:
bool init()
@ -42,10 +42,17 @@ namespace Limonp
return false;
}
if(mysql_set_character_set(_conn, CHARSET))
{
LogError("mysql_set_character_set [%s] failed.", CHARSET);
return false;
}
//set reconenct
char value = 1;
mysql_options(_conn, MYSQL_OPT_RECONNECT, &value);
LogInfo("MysqlClient {host: %s, port:%d, database:%s, charset:%s}", HOST, PORT, DB, CHARSET);
return true;
}
bool dispose()
@ -71,6 +78,18 @@ namespace Limonp
}
return true;
}
uint insert(const char* tb_name, const char* keys, const vector<string>& vals)
{
uint retn = 0;
string sql;
for(uint i = 0; i < vals.size(); i ++)
{
sql.clear();
string_format(sql, "insert into %s (%s) values %s", tb_name, keys, vals[i].c_str());
retn += executeSql(sql.c_str());
}
return retn;
}
bool select(const char* sql, RowsType& rows)
{
if(!executeSql(sql))

View File

@ -13,13 +13,14 @@
#include <stdarg.h>
#include "io_functs.hpp"
#include "str_functs.hpp"
#include "typedefs.h"
#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
// Expands to the part of __FILE__ that follows the last '/', or to __FILE__
// itself when it contains no '/'. The whole conditional is parenthesised so
// that operators at the expansion site (indexing, comparison, ...) apply to
// the result instead of re-associating with one branch of the ?:.
#define FILE_BASENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
namespace Limonp

116
src/Limonp/map_functs.hpp Normal file
View File

@ -0,0 +1,116 @@
/************************************
* file enc : ascii
* author : wuyanyi09@gmail.com
************************************/
#ifndef LIMONP_MAP_FUNCTS_H
#define LIMONP_MAP_FUNCTS_H
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <tr1/unordered_map>
#define HashMap std::tr1::unordered_map
namespace Limonp
{
using namespace std;
//template <typename T>
// string setToString(const set<T>& st)
// {
// if(st.empty())
// {
// return "{}";
// }
// stringstream ss;
// ss<<'{';
// typename set<T>::const_iterator it = st.begin();
// ss<<*it;
// it++;
// while(it != st.end())
// {
// ss<<", "<<*it;
// it++;
// }
// ss<<'}';
// return ss.str();
// }
//template<typename T1, typename T2>
// string mapToString(const map<T1, T2>& mp)
// {
// if(mp.empty())
// {
// return "{}";
// }
// stringstream ss;
// ss<<'{';
// typename map<T1, T2>::const_iterator it = mp.begin();
// ss<<it->first<<": "<<it->second;
// it++;
// while(it != mp.end())
// {
// ss<<", "<<it->first<<": "<<it->second;
// it++;
// }
// ss<<'}';
// return ss.str();
// }
//template<typename T1, typename T2>
// string HashMapToString(const HashMap<T1, T2>& mp)
// {
// if(mp.empty())
// {
// return "{}";
// }
// stringstream ss;
// ss<<'{';
// typename HashMap<T1, T2>::const_iterator it = mp.begin();
// ss<<it->first<<": "<<it->second;
// it++;
// while(it != mp.end())
// {
// ss<<", "<<it->first<<": "<<it->second;
// it++;
// }
// ss<<'}';
// return ss.str();
// }
//template<typename T1, typename T2>
// string pairToString(const pair<T1, T2>& p)
// {
// stringstream ss;
// ss<<p.first<<":"<<p.second;
// return ss.str();
// }
// Looks `key` up in `mp`. Returns a copy of the mapped value when the key is
// present and a copy of `defaultVal` otherwise; `mp` itself is never changed.
template<class kT, class vT>
vT getMap(const map<kT, vT>& mp, const kT & key, const vT & defaultVal)
{
    const typename map<kT, vT>::const_iterator found = mp.find(key);
    return found != mp.end() ? found->second : defaultVal;
}
// Appends every (key, value) entry of `mp`, in the map's iteration order, to
// the end of `res`. Elements already stored in `res` are left in place.
template<class kT, class vT>
void map2Vec(const map<kT, vT>& mp, vector<pair<kT, vT> > & res)
{
    res.insert(res.end(), mp.begin(), mp.end());
}
}
#endif

101
src/Limonp/std_outbound.hpp Normal file
View File

@ -0,0 +1,101 @@
#ifndef LIMONP_STD_OUTBOUND_H
#define LIMONP_STD_OUTBOUND_H
#include <tr1/unordered_map>
#include <map>
#include <set>
// Stream-insertion helpers for standard containers.
// The overloads live in namespace std so that argument-dependent lookup
// finds them for std containers from any calling namespace.
// NOTE(review): the declarations of vector, string, stringstream and ostream
// are used here as-is; they must already be visible where this is compiled.
namespace std
{
    // Writes a vector as ["e0", "e1", ...]. Every element is wrapped in
    // double quotes (the element text is not escaped); an empty vector is
    // written as [].
    template<typename T>
    ostream& operator << (ostream& os, const vector<T>& vec)
    {
        if(vec.empty())
        {
            return os << "[]";
        }
        // Iterators instead of an index: no dependency on the non-standard
        // `uint` typedef and no narrowing of vec.size().
        typename vector<T>::const_iterator it = vec.begin();
        os << "[\"" << *it;
        for(++it; it != vec.end(); ++it)
        {
            os << "\", \"" << *it;
        }
        return os << "\"]";
    }

    // Writes a pair as first:second.
    template<class T1, class T2>
    ostream& operator << (ostream& os, const pair<T1, T2>& pr)
    {
        return os << pr.first << ":" << pr.second;
    }

    // Formats `obj` through its ostream operator << and REPLACES (does not
    // append to) the content of `str` with the result. Returns `str`.
    template<class T>
    string& operator << (string& str, const T& obj)
    {
        stringstream ss;
        ss << obj; // dispatches to ostream& operator << (ostream& os, const T& obj)
        return str = ss.str();
    }

    // Writes a map as {k1:v1, k2:v2, ...} in key order; an empty map is {}.
    template<class T1, class T2>
    ostream& operator << (ostream& os, const map<T1, T2>& mp)
    {
        if(mp.empty())
        {
            return os << "{}";
        }
        typename map<T1, T2>::const_iterator it = mp.begin();
        os << '{' << *it;
        for(++it; it != mp.end(); ++it)
        {
            os << ", " << *it;
        }
        return os << '}';
    }

    // Writes a tr1 unordered_map as {k1:v1, k2:v2, ...} in the container's
    // own iteration order; an empty container is {}.
    template<class T1, class T2>
    ostream& operator << (ostream& os, const std::tr1::unordered_map<T1, T2>& mp)
    {
        if(mp.empty())
        {
            return os << "{}";
        }
        typename std::tr1::unordered_map<T1, T2>::const_iterator it = mp.begin();
        os << '{' << *it;
        for(++it; it != mp.end(); ++it)
        {
            os << ", " << *it;
        }
        return os << '}';
    }

    // Writes a set as {e1, e2, ...} in sorted order; an empty set is {}.
    template<class T>
    ostream& operator << (ostream& os, const set<T>& st)
    {
        if(st.empty())
        {
            return os << "{}";
        }
        typename set<T>::const_iterator it = st.begin();
        os << '{' << *it;
        for(++it; it != st.end(); ++it)
        {
            os << ", " << *it;
        }
        return os << '}';
    }
}
#endif

View File

@ -15,9 +15,17 @@
#include <stdio.h>
#include <stdarg.h>
#include <memory.h>
#include "typedefs.h"
#include <functional>
#include <locale>
#include <sstream>
#include <sys/types.h>
#include <iterator>
#include <algorithm>
#include "std_outbound.hpp"
#include "map_functs.hpp"
#define print(x) cout<<(x)<<endl
namespace Limonp
{
using namespace std;
@ -42,11 +50,12 @@ namespace Limonp
}
return str;
}
inline void string_format(string& res, const char* fmt, ...)
{
int size = 256;
va_list ap;
res.clear();
while (1) {
res.resize(size);
va_start(ap, fmt);
@ -63,27 +72,55 @@ namespace Limonp
}
}
// Appends the elements of `src`, separated by `connectorStr`, to the end of
// `dest` (existing content of `dest` is kept). Returns false and leaves
// `dest` untouched when `src` is empty, true otherwise.
inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
{
    vector<string>::const_iterator piece = src.begin();
    if(piece == src.end())
    {
        return false;
    }
    dest += *piece;
    for(++piece; piece != src.end(); ++piece)
    {
        dest += connectorStr;
        dest += *piece;
    }
    return true;
}
//inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
//{
// if(src.empty())
// {
// return false;
// }
// for(uint i = 0; i < src.size() - 1; i++)
// {
// dest += src[i];
// dest += connectorStr;
// }
// dest += src[src.size() - 1];
// return true;
//}
// Value-returning convenience overload: joins `source` with `connector` via
// the three-argument joinStr and returns the result. An empty `source`
// yields an empty string.
inline string joinStr(const vector<string>& source, const string& connector)
{
    string joined;
    joinStr(source, joined, connector);
    return joined;
}
//inline string joinStr(const vector<string>& source, const string& connector)
//{
// string res;
// joinStr(source, res, connector);
// return res;
//}
// Streams every element of [begin, end) into a stringstream, separated by
// `connector`, and assigns the resulting text to `res`.
// When the range is empty `res` is left exactly as it was.
template<class T>
void join(T begin, T end, string& res, const string& connector)
{
    if(begin != end)
    {
        stringstream buffer;
        buffer << *begin;
        for(++begin; begin != end; ++begin)
        {
            buffer << connector << *begin;
        }
        res = buffer.str();
    }
}
// Value-returning convenience overload: joins [begin, end) with `connector`
// via the four-argument join and returns the text. An empty range yields an
// empty string.
template<class T>
string join(T begin, T end, const string& connector)
{
    string joined;
    join(begin, end, joined, connector);
    return joined;
}
inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
{
@ -104,26 +141,24 @@ namespace Limonp
return true;
}
res.push_back(src.substr(start, end - start));
if(end == src.size() - 1)
{
res.push_back("");
break;
}
if(end == src.size() - 1)
{
res.push_back("");
break;
}
start = end + 1;
}
return true;
}
inline string upperStr(const string& strIn)
inline string& upper(string& str)
{
string str = strIn;
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
return str;
}
inline string lowerStr(const string& strIn)
inline string& lower(string& str)
{
string str = strIn;
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
return str;
}
@ -183,40 +218,40 @@ namespace Limonp
return str.find(ch) != string::npos;
}
// Appends to `words` every maximal run of ASCII digits and ASCII letters
// ('0'-'9', 'A'-'Z', 'a'-'z') found in `sentence`, in order of appearance.
// All other bytes act as separators; `words` is not cleared first.
inline void extractWords(const string& sentence, vector<string>& words)
{
    const uint total = sentence.size();
    bool inRun = false;
    uint runStart = 0;
    // One extra iteration (pos == total) acts as a virtual separator so the
    // run that reaches the end of the sentence is flushed as well.
    for(uint pos = 0; pos <= total; pos++)
    {
        bool isWordChar = false;
        if(pos < total)
        {
            const char c = sentence[pos];
            isWordChar = ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
        }
        if(isWordChar && !inRun)
        {
            runStart = pos;
            inRun = true;
        }
        else if(!isWordChar && inRun)
        {
            words.push_back(string(sentence, runStart, pos - runStart));
            inRun = false;
        }
    }
}
//inline void extractWords(const string& sentence, vector<string>& words)
//{
// bool flag = false;
// uint lhs = 0, len = 0;
// for(uint i = 0; i < sentence.size(); i++)
// {
// char x = sentence[i];
// if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a))
// {
// if(flag)
// {
// len ++;
// }
// else
// {
// lhs = i;
// len = 1;
// }
// flag = true;
// }
// else
// {
// if(flag)
// {
// words.push_back(string(sentence, lhs, len));
// }
// flag = false;
// }
// }
// if(flag)
// {
// words.push_back(string(sentence, lhs, len));
// }
//}
}

View File

@ -7,7 +7,7 @@
#include <algorithm>
#include <set>
#include <logger.hpp>
#include "Limonp/logger.hpp"
#include "Trie.h"
#include "globals.h"
#include "ISegment.hpp"

View File

@ -3,7 +3,7 @@
#include "MPSegment.h"
#include "HMMSegment.h"
#include <str_functs.hpp>
#include "Limonp/str_functs.hpp"
namespace CppJieba
{

View File

@ -4,8 +4,8 @@
#include "globals.h"
#include "ISegment.hpp"
#include "ChineseFilter.hpp"
#include <str_functs.hpp>
#include <logger.hpp>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
namespace CppJieba
{

View File

@ -7,8 +7,7 @@
#include "globals.h"
#include <str_functs.hpp>
#include <vec_functs.hpp>
#include "Limonp/str_functs.hpp"
namespace CppJieba
{

View File

@ -12,8 +12,8 @@
#include <stdint.h>
#include <cmath>
#include <limits>
#include <str_functs.hpp>
#include <logger.hpp>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "TransCode.hpp"
#include "globals.h"
#include "structs.h"

82
src/segment.cpp Normal file
View File

@ -0,0 +1,82 @@
#include <iostream>
#include <fstream>
#include "Limonp/ArgvContext.hpp"
#include "MPSegment.h"
#include "HMMSegment.h"
#include "MixSegment.h"
using namespace CppJieba;
// Reads `filePath` line by line, segments every non-empty line with `seg`
// and prints the resulting words to stdout joined by "/", one input line
// per output line. Empty lines are skipped.
// A file that cannot be opened is reported on stderr and nothing is
// segmented (previously this case was silently treated as empty input).
void cut(const ISegment * seg, const char * const filePath)
{
    ifstream ifile(filePath);
    if(!ifile)
    {
        cerr<<"open file ["<<filePath<<"] failed."<<endl;
        return;
    }
    vector<string> res;
    string line;
    while(getline(ifile, line))
    {
        if(line.empty())
        {
            continue;
        }
        // res is reused between lines, so drop the previous line's words.
        res.clear();
        seg->cut(line, res);
        cout<<join(res.begin(), res.end(),"/")<<endl;
    }
}
// Command-line entry point of the segment tool.
// Prints the usage text and fails when no argument is given. Otherwise it
// builds the segmenter selected by --algorithm ("cutHMM" -> HMMSegment,
// "cutMix" -> MixSegment, anything else -> MPSegment), initialises it from
// --dictpath / --modelpath and segments the file named by arg[1].
// Returns EXIT_FAILURE when the arguments are missing or a segmenter fails
// to initialise, EXIT_SUCCESS otherwise.
int main(int argc, char ** argv)
{
    if(argc < 2)
    {
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
            <<"\t--dictpath\tsee example\n"
            <<"\t--modelpath\tsee example\n"
            <<"example:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
            <<endl;
        return EXIT_FAILURE;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    string modelPath = arg["--modelpath"];
    string algorithm = arg["--algorithm"];

    if("cutHMM" == algorithm)
    {
        HMMSegment seg;
        if(!seg.init(modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else if("cutMix" == algorithm)
    {
        MixSegment seg;
        if(!seg.init(dictPath.c_str(), modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else
    {
        MPSegment seg;
        if(!seg.init(dictPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false;`, which converts to 0 and made a failed
            // initialisation look like a successful run to the shell.
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    return EXIT_SUCCESS;
}

View File

@ -3,12 +3,12 @@
#include <string>
#include <ctype.h>
#include <string.h>
#include <ArgvContext.hpp>
#include "../husky/Daemon.h"
#include "../husky/ServerFrame.h"
#include "../cppjieba/MPSegment.h"
#include "../cppjieba/HMMSegment.h"
#include "../cppjieba/MixSegment.h"
#include "Limonp/ArgvContext.hpp"
#include "Husky/Daemon.h"
#include "Husky/ServerFrame.h"
#include "MPSegment.h"
#include "HMMSegment.h"
#include "MixSegment.h"
using namespace Husky;
using namespace CppJieba;
@ -31,7 +31,7 @@ class ServerDemo: public IRequestHandler
httpReq.GET("key", tmp);
URLDecode(tmp, sentence);
_segment.cut(sentence, words);
vecToString(words, strSnd);
strSnd << words;
return true;
}
private:

View File

@ -74,7 +74,7 @@ namespace CppJieba
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
{
}
inline string toString() const
string toString() const
{
string tmp;
TransCode::encode(word, tmp);
@ -89,16 +89,23 @@ namespace CppJieba
return *this;
}
};
inline string joinWordInfos(const vector<KeyWordInfo>& vec)
inline ostream& operator << (ostream& os, const KeyWordInfo& info)
{
vector<string> tmp;
for(uint i = 0; i < vec.size(); i++)
{
tmp.push_back(vec[i].toString());
}
return joinStr(tmp, ",");
string tmp;
TransCode::encode(info.word, tmp);
return os << "{words:" << tmp << ", weight:" << info.weight << ", idf:" << info.idf << "}";
}
//inline string joinWordInfos(const vector<KeyWordInfo>& vec)
//{
// vector<string> tmp;
// for(uint i = 0; i < vec.size(); i++)
// {
// tmp.push_back(vec[i].toString());
// }
// return joinStr(tmp, ",");
//}
}
#endif

View File

@ -1,54 +0,0 @@
CXX := g++
LD := g++
AR := ar rc
INCS := -I../cppjieba/
DEBUG_CXXFLAGS := -g -Wall -DDEBUG -DUT $(INCS)
CXXFLAGS := ${DEBUG_CXXFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
DOLINK := $(LD) $(LDFLAGS)
DOPACK := $(AR)
SOURCES := $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
UTS := $(patsubst %.cpp,%.ut,$(SOURCES))
CPPJIEBADIR = ../cppjieba
LIBCPPJIEBA = $(CPPJIEBADIR)/libcppjieba.a
CPPCOMMONDIR = ../cppcommon
LIBCPPCM = $(CPPCOMMONDIR)/libcm.a
LIBA := $(LIBCPPJIEBA) $(LIBCPPCM)
# remove the objs after compilation
.PHONY: clean $(LIBA)
# Main Targets
all: $(UTS)
# This is a suffix rule
#.c.o:
%.o: %.cpp
$(CXX) -c $(CXXFLAGS) $<
%.ut: %.o $(LIBA)
$(CXX) $(CXXFLAGS) -o $@ $^
$(LIBCPPJIEBA):
cd $(CPPJIEBADIR) && $(MAKE)
$(LIBCPPCM):
cd $(CPPCOMMONDIR) && $(MAKE)
clean:
rm -f *.o *.ut *.d *.d.*
# cd $(CPPJIEBADIR) && make clean
# cd $(CPPCOMMONDIR) && make clean
sinclude $(SOURCES:.cpp=.d)
%.d:%.cpp
@set -e; rm -f $@; \
$(CXX) -MM $< > $@.$$$$; \
sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
rm -f $@.$$$$

60
test/segment.cpp Normal file
View File

@ -0,0 +1,60 @@
#include <iostream>
#include <fstream>
#include <CppJieba/Limonp/ArgvContext.hpp>
#include <CppJieba/MPSegment.h>
#include <CppJieba/HMMSegment.h>
#include <CppJieba/MixSegment.h>
using namespace CppJieba;
// Reads `filePath` line by line, segments every non-empty line with `seg`
// and prints the resulting words to stdout joined by "/", one input line
// per output line. Empty lines are skipped.
// A file that cannot be opened is reported on stderr and nothing is
// segmented (previously this case was silently treated as empty input).
void cut(const ISegment * seg, const char * const filePath)
{
    ifstream ifile(filePath);
    if(!ifile)
    {
        cerr<<"open file ["<<filePath<<"] failed."<<endl;
        return;
    }
    vector<string> res;
    string line;
    while(getline(ifile, line))
    {
        if(line.empty())
        {
            continue;
        }
        // res is reused between lines, so drop the previous line's words.
        res.clear();
        seg->cut(line, res);
        cout<<join(res.begin(), res.end(),"/")<<endl;
    }
}
// Demo driver: runs HMMSegment, MixSegment and MPSegment in turn over
// "testlines.utf8", loading their data from ../dicts/. The command-line
// arguments are not used.
// Returns EXIT_FAILURE as soon as one segmenter fails to initialise and
// EXIT_SUCCESS when all three demos ran.
int main(int argc, char ** argv)
{
    //demo
    {
        HMMSegment seg;
        if(!seg.init("../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    {
        MixSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    {
        MPSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8"))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false;`, which converts to 0 and made a failed
            // initialisation look like a successful run to the shell.
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    return EXIT_SUCCESS;
}

58
test/server.cpp Normal file
View File

@ -0,0 +1,58 @@
#include <CppJieba/Husky/ServerFrame.h>
#include <CppJieba/Husky/Daemon.h>
#include <CppJieba/Limonp/ArgvContext.hpp>
#include <CppJieba/MPSegment.h>
#include <CppJieba/HMMSegment.h>
#include <CppJieba/MixSegment.h>
using namespace Husky;
using namespace CppJieba;
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
// Request handler of the demo HTTP server. It owns one MixSegment and, for
// each GET request, segments the URL-decoded value of the "key" parameter.
class ServerDemo: public IRequestHandler
{
    public:
        ServerDemo() {}
        virtual ~ServerDemo() {}

        // Initialises the segmenter from DEFAULT_DICTPATH and
        // DEFAULT_MODELPATH; returns what MixSegment::init returns.
        virtual bool init()
        {
            return _segment.init(DEFAULT_DICTPATH, DEFAULT_MODELPATH);
        }

        // Forwards to MixSegment::dispose and returns its result.
        virtual bool dispose()
        {
            return _segment.dispose();
        }

    public:
        // Reads the "key" parameter, URL-decodes it, cuts it into words and
        // formats the word list into strSnd. Always returns true; the
        // results of the individual steps are not checked.
        virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd)
        {
            string encoded;
            httpReq.GET("key", encoded);

            string sentence;
            URLDecode(encoded, sentence);

            vector<string> words;
            _segment.cut(sentence, words);

            strSnd << words;
            return true;
        }

    private:
        MixSegment _segment;
};
// Entry point of the demo server.
// Expects exactly six arguments (-n THREAD_NUMBER -p LISTEN_PORT -k
// start|stop); otherwise prints the usage line and returns -1.
// "-k start" starts the daemon on the given port with the given number of
// threads; every other -k value takes the stop path. The exit status is the
// logical negation of the Start()/Stop() result.
int main(int argc,char* argv[])
{
    const int expectedArgc = 7;
    if(argc != expectedArgc)
    {
        printf("usage: %s -n THREAD_NUMBER -p LISTEN_PORT -k start|stop\n",argv[0]);
        return -1;
    }

    ArgvContext arg(argc, argv);
    unsigned int port = atoi(arg["-p"].c_str());
    unsigned int threadNum = atoi(arg["-n"].c_str());

    ServerDemo handler;
    Daemon daemon(&handler);

    if(!(arg["-k"] == "start"))
    {
        return !daemon.Stop();
    }
    return !daemon.Start(port, threadNum);
}