mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Merge branch 'dev'
This commit is contained in:
commit
1f65862c82
2
CMakeLists.txt
Normal file
2
CMakeLists.txt
Normal file
@ -0,0 +1,2 @@
|
||||
PROJECT(CPPJIEBA)
|
||||
ADD_SUBDIRECTORY(src)
|
156
README.md
156
README.md
@ -7,41 +7,63 @@
|
||||
- `master`分支支持`utf8`编码
|
||||
- `gbk`分支支持`gbk`编码
|
||||
|
||||
## 安装与使用
|
||||
|
||||
## 模块详解
|
||||
### 下载和安装
|
||||
|
||||
### Trie树
|
||||
Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。
|
||||
```sh
|
||||
wget https://github.com/aszxqw/cppjieba/archive/master.zip -O cppjieba-master.zip
|
||||
unzip cppjieba-master.zip
|
||||
cd cppjieba-master
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/usr ..
|
||||
make
|
||||
sudo make install
|
||||
```
|
||||
|
||||
### Segment模块
|
||||
### 卸载
|
||||
```sh
|
||||
cd build/
|
||||
cat install_manifest.txt | sudo xargs rm -rf
|
||||
```
|
||||
|
||||
MPSegment.cpp/MPSegment.h
|
||||
(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。
|
||||
### 验证
|
||||
|
||||
HMMSegment.cpp/HMMSegment.h
|
||||
是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
|
||||
HMM模型由dicts/下面的`hmm_model.utf8`提供。
|
||||
分词算法即viterbi算法。
|
||||
```sh
|
||||
cd test/
|
||||
g++ -o segment.demo segment.cpp -L/usr/lib/CppJieba/ -lcppjieba
|
||||
./segment.demo # you will see the demo.
|
||||
```
|
||||
|
||||
运行一下 `./server` 或 `./segment` 都会有对应的帮助文档显示。
|
||||
|
||||
### TransCode模块
|
||||
同时,如果想知道开发时如何使用`libcppjieba.a` 请看`test/segment.cpp`源代码即可。
|
||||
|
||||
TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。
|
||||
如果想知道如何搭建一个`cppjieba`中文分词的http服务请见 `test/server.cpp`源代码即可。
|
||||
|
||||
若还有其他问题,欢迎`send mail`或者`open issue`。 :)
|
||||
|
||||
### 搭建服务
|
||||
|
||||
## Demo
|
||||
```
|
||||
cd ./test
|
||||
g++ -o server server.cpp -L/usr/lib/CppJieba/ -L/usr/lib/CppJieba/Husky -lcppjieba -lhusky -lpthread
|
||||
./server -n 4 -p 11258 -k start >> run.log 2>&1 #启动服务,监听11258这个端口。
|
||||
./server -n 4 -p 11258 -k stop #停止服务
|
||||
```
|
||||
|
||||
#### 验证服务
|
||||
|
||||
然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学`
|
||||
(用chrome的原因是chrome的默认编码就是utf-8)
|
||||
|
||||
或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
|
||||
|
||||
## 分词效果
|
||||
|
||||
### MPSegment's demo
|
||||
|
||||
__这部分的功能经过线上考验,一直稳定运行,暂时没有发现什么bug。__
|
||||
|
||||
```
|
||||
cd ./demo;
|
||||
make;
|
||||
./segment_demo testlines.utf8
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
我来到北京清华大学
|
||||
@ -59,12 +81,6 @@ Output:
|
||||
|
||||
### HMMSegment's demo
|
||||
|
||||
```
|
||||
cd ./demo;
|
||||
make;
|
||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
我来到北京清华大学
|
||||
@ -78,11 +94,6 @@ Output:
|
||||
```
|
||||
|
||||
### MixSegment's demo
|
||||
```
|
||||
cd ./demo;
|
||||
make;
|
||||
./segment_demo testlines.utf8 --algorithm cutMix
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
@ -98,62 +109,51 @@ Output:
|
||||
我/来自/北京邮电大学/。。。/学号/091111xx/。。。
|
||||
```
|
||||
|
||||
### Server's demo
|
||||
|
||||
引入了husky这个文件夹,husky是一个简单的http服务框架。
|
||||
```
|
||||
cd ./demo;
|
||||
make;
|
||||
./start.sh #启动一个服务,监听11258这个端口(在start.sh里面指定)。
|
||||
```
|
||||
|
||||
关闭和重启分别是`stop.sh`和`restart.sh`
|
||||
|
||||
然后用chrome浏览器打开`http://127.0.0.1:11258/?key=我来自北京邮电大学`
|
||||
(用chrome的原因是chrome的默认编码就是utf-8)
|
||||
|
||||
或者用命令 `curl "http://127.0.0.1:11258/?key=我来自北京邮电大学"`
|
||||
|
||||
|
||||
### 效果分析
|
||||
|
||||
以上依次是MP,HMM,Mix三种方法的效果。
|
||||
|
||||
可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。即可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。
|
||||
|
||||
## Help
|
||||
|
||||
|
||||
## 模块详解
|
||||
|
||||
本项目主要是如下目录组成:
|
||||
|
||||
### Limonp
|
||||
### src
|
||||
|
||||
核心目录,包含主要源代码。
|
||||
|
||||
#### Trie树
|
||||
Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。
|
||||
|
||||
#### Segment模块
|
||||
|
||||
MPSegment.cpp/MPSegment.h
|
||||
(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。
|
||||
|
||||
HMMSegment.cpp/HMMSegment.h
|
||||
是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
|
||||
HMM模型由dicts/下面的`hmm_model.utf8`提供。
|
||||
分词算法即viterbi算法。
|
||||
|
||||
#### TransCode模块
|
||||
|
||||
TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。
|
||||
|
||||
### src/Husky
|
||||
|
||||
提供服务的框架代码,
|
||||
|
||||
详见: https://github.com/aszxqw/husky
|
||||
|
||||
### src/Limonp
|
||||
|
||||
主要是一些工具函数,例如字符串操作等。
|
||||
直接include就可以使用。
|
||||
|
||||
### cppjieba
|
||||
核心目录,包含主要源代码。
|
||||
make 之后产生libcppjieba.a
|
||||
使用方法参考如上cppcommon
|
||||
|
||||
|
||||
|
||||
### run `./segment_demo` to get help.
|
||||
|
||||
如下:
|
||||
```
|
||||
usage:
|
||||
./segment_demo[options] <filename>
|
||||
options:
|
||||
--algorithm Supported methods are [cutDAG, cutHMM, cutMix] for now.
|
||||
If not specified, the default is cutDAG
|
||||
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
|
||||
--modelpath If not specified, the default is ../dicts/hmm_model.utf8
|
||||
If not specified, the default is utf8.
|
||||
example:
|
||||
./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
|
||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix
|
||||
|
||||
```
|
||||
详见: https://github.com/aszxqw/limonp
|
||||
|
||||
## 分词速度
|
||||
|
||||
@ -163,11 +163,11 @@ example:
|
||||
测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz`
|
||||
|
||||
|
||||
## Contact
|
||||
## 联系客服
|
||||
|
||||
如果有运行问题或者任何疑问,欢迎联系 : wuyanyi09@gmail.com
|
||||
|
||||
## Thanks
|
||||
## 鸣谢
|
||||
|
||||
"结巴中文"分词作者: SunJunyi
|
||||
https://github.com/fxsjy/jieba
|
||||
|
@ -1,360 +0,0 @@
|
||||
/************************************
|
||||
* file enc : ASCII
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#include "KeyWordExt.h"
|
||||
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
// Default constructor: performs no work; the segment dictionary is loaded by init().
KeyWordExt::KeyWordExt() {}
|
||||
|
||||
// Destructor: intentionally empty; it does not call dispose() itself.
KeyWordExt::~KeyWordExt() {}
|
||||
|
||||
bool KeyWordExt::init(const char* const segDictFile)
|
||||
{
|
||||
LogInfo("KeyWordExt init start ...");
|
||||
if(!_segment.init(segDictFile))
|
||||
{
|
||||
LogError("_segment.init failed.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||
{
|
||||
|
||||
LogInfo("_loadStopWords(%s) start", filePath);
|
||||
if(!_stopWords.empty())
|
||||
{
|
||||
LogError("_stopWords has been loaded before! ");
|
||||
return false;
|
||||
}
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError("cann't find file[%s].",filePath);
|
||||
return false;
|
||||
}
|
||||
|
||||
ifstream ifile(filePath);
|
||||
string line;
|
||||
Unicode word;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
if(!TransCode::decode(line, word))
|
||||
{
|
||||
LogError("decode failed .");
|
||||
return false;
|
||||
}
|
||||
_stopWords.insert(word);
|
||||
}
|
||||
LogInfo("load stopwords[%d] finished.", _stopWords.size());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::dispose()
|
||||
{
|
||||
_segment.dispose();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::_wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b)
|
||||
{
|
||||
return a.weight > b.weight;
|
||||
}
|
||||
|
||||
// Scores every entry and sorts the vector by descending weight.
// For each entry: idf = -logFreq, weight = log(word length + 1) * idf.
// Always returns true.
bool KeyWordExt::_sortWLIDF(vector<KeyWordInfo>& wordInfos)
{
    for(vector<KeyWordInfo>::iterator info = wordInfos.begin(); info != wordInfos.end(); ++info)
    {
        const double lengthFactor = log(double(info->word.size() + 1));
        info->idf = - info->logFreq;
        info->weight = lengthFactor * info->idf;
    }
    sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
    return true;
}
|
||||
|
||||
// Truncates `wordInfos` to at most its first `topN` elements.
// Leaves the vector untouched when it already holds `topN` or fewer entries.
// Always returns true.
//
// The size comparison is done in unsigned arithmetic directly; the previous
// version narrowed an unsigned difference into an `int`, which only worked
// through implementation-defined conversion, and used two different truncation
// strategies for the same result.
bool KeyWordExt::_extTopN(vector<KeyWordInfo>& wordInfos, uint topN)
{
    if(wordInfos.size() > topN)
    {
        wordInfos.erase(wordInfos.begin() + topN, wordInfos.end());
    }
    return true;
}
|
||||
|
||||
|
||||
bool KeyWordExt::extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN)
|
||||
{
|
||||
if(words.empty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
keyWordInfos.clear();
|
||||
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
Unicode uniWord;
|
||||
if(!TransCode::decode(words[i], uniWord))
|
||||
{
|
||||
LogError("decode failed");
|
||||
return false;
|
||||
}
|
||||
keyWordInfos.push_back(uniWord);
|
||||
}
|
||||
|
||||
return _extract(keyWordInfos, topN);
|
||||
}
|
||||
|
||||
bool KeyWordExt::extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN)
|
||||
{
|
||||
if(title.empty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<TrieNodeInfo> trieNodeInfos;
|
||||
Unicode unico;
|
||||
if(!TransCode::decode(title, unico))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
_segment.cut(unico.begin(), unico.end(), trieNodeInfos);
|
||||
|
||||
keyWordInfos.clear();
|
||||
for(uint i = 0; i < trieNodeInfos.size(); i++)
|
||||
{
|
||||
keyWordInfos.push_back(trieNodeInfos[i]);
|
||||
}
|
||||
return _extract(keyWordInfos, topN);
|
||||
}
|
||||
|
||||
// Shared pipeline behind both extract() overloads: filter, score/sort, then
// keep the first `topN` entries. Stops at the first failing stage, logs which
// one failed and returns false; returns true when all three stages succeed.
bool KeyWordExt::_extract(vector<KeyWordInfo>& keyWordInfos, uint topN)
{
    if(!_filter(keyWordInfos))
    {
        LogError("_filter failed.");
    }
    else if(!_sortWLIDF(keyWordInfos))
    {
        LogError("_sortWLIDF failed.");
    }
    else if(!_extTopN(keyWordInfos, topN))
    {
        LogError("_extTopN failed.");
    }
    else
    {
        return true;
    }
    return false;
}
|
||||
|
||||
// Runs the candidate filters in a fixed order: duplicates, single-character
// words, stop words, then words contained in another candidate.
// Stops at the first failing filter, logs it and returns false.
bool KeyWordExt::_filter(vector<KeyWordInfo>& wordInfos)
{
    if(!_filterDuplicate(wordInfos))
    {
        LogError("_filterDuplicate failed.");
    }
    else if(!_filterSingleWord(wordInfos))
    {
        LogError("_filterSingleWord failed.");
    }
    else if(!_filterStopWords(wordInfos))
    {
        LogError("_filterStopWords failed.");
    }
    else if(!_filterSubstr(wordInfos))
    {
        LogError("_filterSubstr failed.");
    }
    else
    {
        return true;
    }
    return false;
}
|
||||
|
||||
// Removes every entry whose word is present in _stopWords.
// Does nothing when no stop words are loaded. Always returns true.
//
// Survivors are compacted towards the front in one pass and the tail is erased
// once; erasing inside the loop shifted the remainder of the vector for every
// removed element (quadratic in the worst case). Relative order is preserved.
bool KeyWordExt::_filterStopWords(vector<KeyWordInfo>& wordInfos)
{
    if(_stopWords.empty())
    {
        return true;
    }
    vector<KeyWordInfo>::iterator keep = wordInfos.begin();
    for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ++it)
    {
        if(_stopWords.find(it->word) == _stopWords.end())
        {
            if(keep != it)
            {
                *keep = *it;
            }
            ++keep;
        }
    }
    wordInfos.erase(keep, wordInfos.end());
    return true;
}
|
||||
|
||||
|
||||
// Keeps only the first occurrence of each word, preserving order.
// Always returns true.
//
// set::insert() reports whether the word was new, so one lookup replaces the
// former find()+insert() pair, and survivors are compacted in a single pass
// instead of erasing from the middle of the vector for every duplicate.
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
{
    set<Unicode> seen;
    vector<KeyWordInfo>::iterator keep = wordInfos.begin();
    for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ++it)
    {
        if(seen.insert(it->word).second)
        {
            if(keep != it)
            {
                *keep = *it;
            }
            ++keep;
        }
    }
    wordInfos.erase(keep, wordInfos.end());
    return true;
}
|
||||
|
||||
// Removes every entry whose word consists of exactly one Unicode unit.
// Always returns true.
//
// Survivors are compacted in one pass and the tail erased once, replacing the
// erase-inside-loop form that shifted the vector for each removed element.
bool KeyWordExt::_filterSingleWord(vector<KeyWordInfo>& wordInfos)
{
    vector<KeyWordInfo>::iterator keep = wordInfos.begin();
    for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ++it)
    {
        if(1 != it->word.size())
        {
            if(keep != it)
            {
                *keep = *it;
            }
            ++keep;
        }
    }
    wordInfos.erase(keep, wordInfos.end());
    return true;
}
|
||||
|
||||
// Removes every entry whose word occurs inside a different candidate word.
// The comparison runs against a snapshot of all words taken before any removal,
// so the outcome does not depend on the order in which entries are erased.
// Always returns true.
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
{
    vector<Unicode> snapshot;
    for(vector<KeyWordInfo>::const_iterator src = wordInfos.begin(); src != wordInfos.end(); ++src)
    {
        snapshot.push_back(src->word);
    }

    vector<KeyWordInfo>::iterator cur = wordInfos.begin();
    while(cur != wordInfos.end())
    {
        if(_isSubIn(snapshot, cur->word))
        {
            cur = wordInfos.erase(cur);
        }
        else
        {
            ++cur;
        }
    }
    return true;
}
|
||||
|
||||
//bool KeyWordExt::_isContainSubWords(const string& word)
|
||||
//{
|
||||
// for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||
// {
|
||||
// if(string::npos != word.find(_priorSubWords[i]))
|
||||
// {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// return false;
|
||||
//}
|
||||
|
||||
//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
|
||||
//{
|
||||
// if(2 > wordInfos.size())
|
||||
// {
|
||||
// return true;
|
||||
// }
|
||||
|
||||
// KeyWordInfo prior;
|
||||
// bool flag = false;
|
||||
// for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
// {
|
||||
// if(_isContainSubWords(it->word))
|
||||
// {
|
||||
// prior = *it;
|
||||
// it = wordInfos.erase(it);
|
||||
// flag = true;
|
||||
// break;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// it ++;
|
||||
// }
|
||||
// }
|
||||
// if(flag)
|
||||
// {
|
||||
// wordInfos.insert(wordInfos.begin(), prior);
|
||||
// }
|
||||
// return true;
|
||||
//}
|
||||
}
|
||||
|
||||
|
||||
#ifdef KEYWORDEXT_UT
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
int main()
|
||||
{
|
||||
KeyWordExt ext;
|
||||
ext.init();
|
||||
if(!ext.loadSegDict("../dicts/segdict.gbk.v2.1"))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
|
||||
|
||||
ifstream ifile("testtitle.gbk");
|
||||
vector<string> res;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
cout<<line<<endl;
|
||||
res.clear();
|
||||
ext.extract(line, res, 20);
|
||||
PRINT_VECTOR(res);
|
||||
}
|
||||
|
||||
ext.dispose();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@ -1,68 +0,0 @@
|
||||
/************************************
|
||||
* file enc : ASCII
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef CPPJIEBA_KEYWORDEXT_H
|
||||
#define CPPJIEBA_KEYWORDEXT_H
|
||||
|
||||
#include <logger.hpp>
|
||||
#include "MPSegment.h"
|
||||
#include "structs.h"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
//vector<string> _priorSubWords;
|
||||
set<Unicode> _stopWords;
|
||||
public:
|
||||
KeyWordExt();
|
||||
~KeyWordExt();
|
||||
bool init(const char* const segDictFile);
|
||||
bool dispose();
|
||||
bool loadStopWords(const char * const filePath);
|
||||
private:
|
||||
//bool _loadPriorSubWords(const char * const filePath);
|
||||
|
||||
|
||||
public:
|
||||
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
private:
|
||||
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
||||
private:
|
||||
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
||||
private:
|
||||
//sort by word len - idf
|
||||
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
||||
private:
|
||||
bool _filter(vector<KeyWordInfo>& );
|
||||
bool _filterDuplicate(vector<KeyWordInfo>& );
|
||||
bool _filterSingleWord(vector<KeyWordInfo>& );
|
||||
bool _filterSubstr(vector<KeyWordInfo>& );
|
||||
bool _filterStopWords(vector<KeyWordInfo>& );
|
||||
private:
|
||||
inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
|
||||
{
|
||||
|
||||
for(uint j = 0; j < words.size(); j++)
|
||||
{
|
||||
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
||||
//bool _isContainSubWords(const string& word);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -1,66 +0,0 @@
|
||||
# Builds libcppjieba.a from every *.cpp in this directory, plus optional
# per-module unit-test binaries (*.ut).
# Pass RELEASE=YES for an optimized build; the default is a debug build.
CXX := g++
LD := g++
AR := ar rc

DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3

ifeq (YES, ${RELEASE})
	CXXFLAGS := ${RELEASE_CXXFLAGS}
	LDFLAGS := ${RELEASE_LDFLAGS}
else
	CXXFLAGS := ${DEBUG_CXXFLAGS}
	LDFLAGS := ${DEBUG_LDFLAGS}
endif

SOURCES := $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))

INC := -I../limonp

LIBA := libcppjieba.a

# remove the objs after compilation
.INTERMEDIATE:
#.PHONY: clean $(CMLIB)
# `all` is phony too, so a stray file named "all" cannot mask the default goal.
.PHONY: all clean

all: $(LIBA)

# This is a suffix rule
#.c.o:
%.o: %.cpp
	$(CXX) -c $(CXXFLAGS) $< $(INC)

${LIBA}: $(OBJS)
	$(AR) $@ $(OBJS)

#unit test
# NOTE(review): $(CMLIB) is not defined in this Makefile and expands to nothing.
# $(INC) is passed so the limonp headers resolve, as in the %.o rule.
Trie.ut: Trie.cpp Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) Trie.cpp TransCode.cpp -DTRIE_UT $(INC) $(CMLIB)

MPSegment.ut: MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) MPSegment.cpp Trie.cpp TransCode.cpp -DSEGMENT_UT $(INC) $(CMLIB)

KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h MPSegment.h Trie.h globals.h TransCode.cpp TransCode.hpp $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) KeyWordExt.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DKEYWORDEXT_UT $(INC) $(CMLIB)

TransCode.ut: TransCode.cpp TransCode.hpp globals.h $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) TransCode.cpp -DCPPJIEBA_TRANSCODE_UT $(INC) $(CMLIB)
HMMSegment.ut: HMMSegment.cpp TransCode.cpp TransCode.hpp HMMSegment.h $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) TransCode.cpp HMMSegment.cpp -DHMMSEGMENT_UT $(INC) $(CMLIB)
MixSegment.ut: MixSegment.cpp MixSegment.h HMMSegment.cpp MPSegment.cpp Trie.cpp MPSegment.h Trie.h globals.h $(CMLIB)
	$(CXX) -o $@ $(CXXFLAGS) MixSegment.cpp HMMSegment.cpp MPSegment.cpp Trie.cpp TransCode.cpp -DMIXSEGMENT_UT $(INC) $(CMLIB)
ChineseFilter.ut: ChineseFilter.cpp ChineseFilter.hpp
	$(CXX) -o $@ $(CXXFLAGS) ChineseFilter.cpp -DUT $(INC)

clean:
	rm -f *.o *.d *.d.* *.ut $(LIBA)

# Auto-generated header dependencies. -MM needs the same include path as the
# compile rule, otherwise headers under ../limonp cannot be found and the .d
# file is never produced (sinclude hides that failure).
sinclude $(SOURCES:.cpp=.d)
%.d:%.cpp
	@set -e; rm -f $@; \
	$(CXX) -MM $(INC) $< > $@.$$$$; \
	sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
	rm -f $@.$$$$
|
@ -1,53 +0,0 @@
|
||||
# Builds one <name>.demo executable per *.cpp in this directory, linking against
# libcppjieba.a and libhusky.a (built on demand in their own directories).
# Pass DEBUG=YES for a debug build; the default here is an optimized build.
CXX := g++
LD := g++
AR := ar rc


DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3

ifeq (YES, ${DEBUG})
	CXXFLAGS := ${DEBUG_CXXFLAGS}
	LDFLAGS := ${DEBUG_LDFLAGS}
else
	CXXFLAGS := ${RELEASE_CXXFLAGS}
	LDFLAGS := ${RELEASE_LDFLAGS}
endif

INCS := -I../limonp
LINK := -lpthread

SOURCES := $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
DEMOS := $(patsubst %.cpp,%.demo,$(SOURCES))

CPPJIEBADIR := ../cppjieba
LIBCPPJIEBA := $(CPPJIEBADIR)/libcppjieba.a

HUSKYDIR := ../husky
LIBHUSKYA := $(HUSKYDIR)/libhusky.a

# $(LIBCPPJIEBA) is phony so the sub-make always runs and decides what is stale.
.PHONY: all clean $(LIBCPPJIEBA)

all: $(DEMOS)

%.demo: %.cpp $(LIBCPPJIEBA) $(LIBHUSKYA)
	$(CXX) -o $@ $(CXXFLAGS) $^ $(INCS) $(LINK)

$(LIBCPPJIEBA):
	cd $(CPPJIEBADIR) && $(MAKE)

$(LIBHUSKYA):
	cd $(HUSKYDIR) && $(MAKE)

# Recurse with $(MAKE) rather than a literal `make` so flags such as -j/-n and
# the chosen make binary propagate to the sub-directories.
clean:
	rm -f *.o *.ut *.d *.d.* $(DEMOS)
	cd $(CPPJIEBADIR) && $(MAKE) clean
	cd $(HUSKYDIR) && $(MAKE) clean

# Auto-generated header dependencies; -MM needs the include path used for compiling.
sinclude $(SOURCES:.cpp=.d)
%.d:%.cpp
	@set -e; rm -f $@; \
	$(CXX) -MM $(INCS) $< > $@.$$$$; \
	sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
	rm -f $@.$$$$
|
@ -1,3 +0,0 @@
|
||||
./segment.demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
|
||||
./segment.demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
||||
./segment.demo testlines.utf8 --algorithm cutMix
|
@ -1,56 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <ArgvContext.hpp>
|
||||
#include "../cppjieba/KeyWordExt.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
|
||||
void testKeyWordExt(const char * dictPath, const char * filePath)
|
||||
{
|
||||
KeyWordExt ext;
|
||||
if(!ext.init(dictPath))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
ifstream ifile(filePath);
|
||||
vector<KeyWordInfo> res;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
res.clear();
|
||||
if(!line.empty())
|
||||
{
|
||||
ext.extract(line, res, 20);
|
||||
cout<<line<<'\n'<<joinWordInfos(res)<<endl;
|
||||
}
|
||||
|
||||
}
|
||||
ext.dispose();
|
||||
}
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if(2 > argc)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
||||
<<"examples:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||
<<endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
if("" == dictPath)
|
||||
{
|
||||
dictPath = DEFAULT_DICTPATH;
|
||||
}
|
||||
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
||||
return 0;
|
||||
}
|
124
demo/segment.cpp
124
demo/segment.cpp
@ -1,124 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <ArgvContext.hpp>
|
||||
#include "../cppjieba/MPSegment.h"
|
||||
#include "../cppjieba/HMMSegment.h"
|
||||
#include "../cppjieba/MixSegment.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
MPSegment seg;
|
||||
HMMSegment hmmseg;
|
||||
MixSegment mixseg;
|
||||
bool init(const char * const dictPath, const char * const modelPath)
|
||||
{
|
||||
if(!seg.init(dictPath))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!hmmseg.init(modelPath))
|
||||
{
|
||||
cout<<"hmmseg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!mixseg.init(dictPath, modelPath))
|
||||
{
|
||||
cout<<"mixseg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void cut(const ISegment * seg, const char * const filePath)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
vector<string> res;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
if(!line.empty())
|
||||
{
|
||||
res.clear();
|
||||
seg->cut(line, res);
|
||||
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool dispose()
|
||||
{
|
||||
if(!seg.dispose())
|
||||
{
|
||||
cout<<"seg dispose failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
if(!hmmseg.dispose())
|
||||
{
|
||||
cout<<"seg dispose failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
if(!mixseg.dispose())
|
||||
{
|
||||
cout<<"seg dispose failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if(argc < 2)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
||||
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
||||
<<"example:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||
<<endl;
|
||||
|
||||
return -1;
|
||||
}
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
string modelPath = arg["--modelpath"];
|
||||
string algorithm = arg["--algorithm"];
|
||||
if(dictPath.empty())
|
||||
{
|
||||
dictPath = DEFAULT_DICTPATH;
|
||||
}
|
||||
if(modelPath.empty())
|
||||
{
|
||||
modelPath = DEFAULT_MODELPATH;
|
||||
}
|
||||
|
||||
if(!init(dictPath.c_str(), modelPath.c_str()))
|
||||
{
|
||||
LogError("init failed.");
|
||||
return -1;
|
||||
}
|
||||
if("cutHMM" == algorithm)
|
||||
{
|
||||
cut(&hmmseg, arg[1].c_str());
|
||||
}
|
||||
else if("cutMix" == algorithm)
|
||||
{
|
||||
cut(&mixseg, arg[1].c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
cut(&seg, arg[1].c_str());
|
||||
}
|
||||
dispose();
|
||||
return 0;
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
# Builds libhusky.a from every *.cpp in this directory.
# Pass RELEASE=YES for an optimized build; the default is a debug build.
CXX := g++
LD := g++
AR := ar rc

DEBUG_CXXFLAGS := -g -Wall -DDEBUG
RELEASE_CXXFLAGS := -Wall -O3

ifeq (YES, ${RELEASE})
	CXXFLAGS := ${RELEASE_CXXFLAGS}
	LDFLAGS := ${RELEASE_LDFLAGS}
else
	CXXFLAGS := ${DEBUG_CXXFLAGS}
	LDFLAGS := ${DEBUG_LDFLAGS}
endif

# Recursively expanded on purpose: with `:=` the automatic variables $@ and $^
# were expanded here, outside any recipe, and therefore to empty strings.
DOLINK = $(LD) $(LDFLAGS) -o $@ $^
DOPACK := $(AR)
SOURCES = $(wildcard *.cpp)
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))

INC := -I../limonp
LIBA := libhusky.a

.PHONY: all clean

all: $(LIBA)

%.o: %.cpp
	$(CXX) -c $(CXXFLAGS) $< $(INC)

${LIBA}: $(OBJS)
	$(DOPACK) $@ $(OBJS)

clean:
	rm -f *.o *.d *.d.* $(LIBA)

# Auto-generated header dependencies; -MM needs the include path used for compiling.
sinclude $(SOURCES:.cpp=.d)
%.d:%.cpp
	@set -e; rm -f $@; \
	$(CXX) -MM $(INC) $< > $@.$$$$; \
	sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
	rm -f $@.$$$$
|
@ -1,123 +0,0 @@
|
||||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
|
||||
|
||||
#ifndef LIMONP_MAP_FUNCTS_H
|
||||
#define LIMONP_MAP_FUNCTS_H
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "typedefs.h"
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
|
||||
// Renders a set as "{a, b, c}" using operator<< on each element, in the set's
// iteration order. An empty set yields "{}".
template <typename T>
std::string setToString(const std::set<T>& st)
{
    std::stringstream out;
    out<<'{';
    for(typename std::set<T>::const_iterator cur = st.begin(); cur != st.end(); ++cur)
    {
        if(cur != st.begin())
        {
            out<<", ";
        }
        out<<*cur;
    }
    out<<'}';
    return out.str();
}
|
||||
|
||||
// Renders a map as "{k1: v1, k2: v2}" using operator<< on keys and values, in
// key order. An empty map yields "{}".
template<typename T1, typename T2>
std::string mapToString(const std::map<T1, T2>& mp)
{
    std::stringstream out;
    out<<'{';
    for(typename std::map<T1, T2>::const_iterator cur = mp.begin(); cur != mp.end(); ++cur)
    {
        if(cur != mp.begin())
        {
            out<<", ";
        }
        out<<cur->first<<": "<<cur->second;
    }
    out<<'}';
    return out.str();
}
|
||||
|
||||
template<typename T1, typename T2>
|
||||
string HashMapToString(const HashMap<T1, T2>& mp)
|
||||
{
|
||||
if(mp.empty())
|
||||
{
|
||||
return "{}";
|
||||
}
|
||||
stringstream ss;
|
||||
ss<<'{';
|
||||
typename HashMap<T1, T2>::const_iterator it = mp.begin();
|
||||
ss<<it->first<<": "<<it->second;
|
||||
it++;
|
||||
while(it != mp.end())
|
||||
{
|
||||
ss<<", "<<it->first<<": "<<it->second;
|
||||
it++;
|
||||
}
|
||||
ss<<'}';
|
||||
return ss.str();
|
||||
}
|
||||
// Renders a pair as "first:second" using operator<< on both members.
template<typename T1, typename T2>
std::string pairToString(const std::pair<T1, T2>& p)
{
    std::stringstream out;
    out<<p.first;
    out<<":";
    out<<p.second;
    return out.str();
}
|
||||
|
||||
// Prints every entry of the map to std::cout as "key value", one per line, in
// key order; the stream is flushed after each line.
template<class kT, class vT>
void printMap(const std::map<kT, vT>& mp)
{
    typename std::map<kT, vT>::const_iterator entry = mp.begin();
    while(entry != mp.end())
    {
        std::cout<<entry->first<<' '<<entry->second<<std::endl;
        ++entry;
    }
}
|
||||
|
||||
// Returns a copy of the value stored under `key`, or `defaultVal` when the key
// is absent. The map is never modified.
template<class kT, class vT>
vT getMap(const std::map<kT, vT>& mp, const kT & key, const vT & defaultVal)
{
    const typename std::map<kT, vT>::const_iterator hit = mp.find(key);
    if(hit != mp.end())
    {
        return hit->second;
    }
    return defaultVal;
}
|
||||
|
||||
// Appends every (key, value) entry of the map to `res`, in key order.
// Existing elements of `res` are kept.
template<class kT, class vT>
void map2Vec(const std::map<kT, vT>& mp, std::vector<std::pair<kT, vT> > & res)
{
    res.insert(res.end(), mp.begin(), mp.end());
}
|
||||
}
|
||||
|
||||
#endif
|
@ -1,21 +0,0 @@
|
||||
/************************************
|
||||
* file enc : utf8
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef LIMONP_TYPEDEFS_H
|
||||
#define LIMONP_TYPEDEFS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include <sys/types.h>
|
||||
#include <tr1/unordered_map>
|
||||
#define HashMap std::tr1::unordered_map
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
typedef std::vector<uint16_t> Unicode;
|
||||
typedef std::vector<uint16_t>::const_iterator UnicodeConstIterator;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
@ -1,142 +0,0 @@
|
||||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef LIMONP_VEC_FUNCTS_H
|
||||
#define LIMONP_VEC_FUNCTS_H
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
// Loop header iterating index variable `i` over [0, vec.size()).
// Macro arguments are parenthesized so that expressions such as `*ptr` or
// `a.b()` expand correctly (unparenthesized, `*ptr.size()` would not compile).
#define FOR_VECTOR(vec, i) for(size_t i = 0; i < (vec).size(); i++)

// Prints every element of `vec` on its own line.
// Uses unqualified cout/endl, so the std names must be visible at the call site.
#define PRINT_VECTOR(vec) FOR_VECTOR(vec, i)\
{\
    cout<<(vec)[i]<<endl;\
}

// Prints every element of a vector-of-vectors as "[row,col]:value", one per line.
#define PRINT_MATRIX(mat) FOR_VECTOR(mat, i) \
{\
    FOR_VECTOR((mat)[i], j)\
    {\
        cout<<"["<<i<<","<<j<<"]:"<<(mat)[i][j]<<endl;\
    }\
}
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
using namespace std;
|
||||
// Renders a vector as ["a", "b", "c"] (every element wrapped in double quotes)
// into `res`, using operator<< on each element.
// For an empty vector `res` becomes "[]" and the function returns false;
// otherwise it returns true.
//
// The index is a size_t: the former `uint` is a non-standard typedef that none
// of this header's own includes provide, and size_t is what the other helpers
// in this header already use.
template <typename T>
bool vecToString(const std::vector<T>& vec, std::string& res)
{
    if(vec.empty())
    {
        res = "[]";
        return false;
    }
    std::stringstream ss;
    ss<<"[\""<<vec[0];
    for(size_t i = 1; i < vec.size(); i++)
    {
        ss<<"\", \""<<vec[i];
    }
    ss<<"\"]";
    res = ss.str();
    return true;
}
|
||||
|
||||
// Convenience overload: returns the rendering produced by the two-argument
// vecToString() ("[]" for an empty vector), discarding its boolean result.
template <typename T>
std::string vecToString(const std::vector<T>& vec)
{
    std::string rendered;
    vecToString(vec, rendered);
    return rendered;
}
|
||||
|
||||
// Returns true when `item` compares equal to at least one element of `vec`.
template<typename T>
bool isInVec(const std::vector<T>& vec, const T& item)
{
    return std::find(vec.begin(), vec.end(), item) != vec.end();
}
|
||||
template<typename T>
|
||||
void splitVec(const vector<T>& vecSrc, vector< pair<T, vector<T> > >& outVec, const vector<T>& patterns)
|
||||
{
|
||||
vector<T> tmp;
|
||||
T pattern;
|
||||
size_t patternSize = patterns.size();
|
||||
for(size_t i = 0; i < vecSrc.size(); i++)
|
||||
{
|
||||
size_t patternPos = patternSize;
|
||||
for(size_t j = 0; j < patternSize; j++)
|
||||
{
|
||||
if(patterns[j] == vecSrc[i])
|
||||
{
|
||||
patternPos = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(patternPos != patternSize)
|
||||
{
|
||||
if(!tmp.empty())
|
||||
{
|
||||
outVec.push_back(make_pair<T, vector<T> >(pattern, tmp));
|
||||
tmp.clear();
|
||||
}
|
||||
pattern = patterns[patternPos];
|
||||
}
|
||||
else
|
||||
{
|
||||
tmp.push_back(vecSrc[i]);
|
||||
}
|
||||
}
|
||||
if(!tmp.empty())
|
||||
{
|
||||
outVec.push_back(make_pair<T, vector<T> >(pattern, tmp));
|
||||
}
|
||||
}
|
||||
|
||||
// Splits `vecSrc` at every element that equals one of `patternVec` and appends
// each non-empty run of ordinary elements to `outVec`. Separators are dropped
// and consecutive separators do not produce empty runs.
template<typename T>
void splitVec(const std::vector<T>& vecSrc, std::vector< std::vector<T> >& outVec, const std::vector<T>& patternVec)
{
    std::vector<T> piece;
    for(typename std::vector<T>::const_iterator cur = vecSrc.begin(); cur != vecSrc.end(); ++cur)
    {
        const bool isSeparator = std::find(patternVec.begin(), patternVec.end(), *cur) != patternVec.end();
        if(!isSeparator)
        {
            piece.push_back(*cur);
            continue;
        }
        // Separator reached: flush the run collected so far, if any.
        if(!piece.empty())
        {
            outVec.push_back(piece);
            piece.clear();
        }
    }
    // Flush the trailing run.
    if(!piece.empty())
    {
        outVec.push_back(piece);
    }
}
|
||||
}
|
||||
|
||||
#endif
|
@ -1 +0,0 @@
|
||||
# Insert a four-line banner comment (file encoding + author) before line 1 of
# every .h, .cpp and .tcc file under ../src, editing the files in place.
sed -i '1i/************************************\n * file enc : utf8\n * author   : wuyanyi09@gmail.com\n************************************/' ../src/*.h ../src/*.cpp ../src/*.tcc
|
@ -1,29 +0,0 @@
|
||||
#!/usr/bin/python
# Validates one or more dictionary files given on the command line.
# Every line must be "word count tag" separated by single spaces; a word may
# appear only once across all files and its count must be a positive integer.
# Prints the offending file/line and exits with status 1 on the first
# violation, otherwise prints "OK". Reported line numbers are zero-based.

import sys

if len(sys.argv) == 1:
    # Fill in the program name; previously the raw "%s" was printed.
    print("usage : %s dict_file1 dict_file2 ..." % sys.argv[0])
    sys.exit(1)

# Words seen so far, shared across all input files.
d = {}

for fname in sys.argv[1:]:
    with open(fname, "r") as fin:
        for i, line in enumerate(fin):
            try:
                word, cnt, tag = line.strip().split(" ")
                if word in d:
                    print("error file[%s] line[%s] : %s" % (fname, i, line))
                    sys.exit(1)
                else:
                    d[word] = True

                if 0 >= int(cnt):
                    print("error file[%s] line[%s] : %s" % (fname, i, line))
                    sys.exit(1)
            except Exception:
                # Wrong field count or non-numeric count.
                # SystemExit is not an Exception, so the exits above pass through.
                print("error file[%s] line[%s] : %s" % (fname, i, line))
                sys.exit(1)

print("OK")
|
@ -1,23 +0,0 @@
|
||||
#!/usr/bin/python
# Filters a dictionary file ("word count tag" per line, single-space separated)
# and prints the cleaned lines to stdout: repeated words are dropped after
# their first occurrence and entries whose count is not positive are dropped.

import sys

if len(sys.argv) == 1:
    # Fill in the program name; previously the raw "%s" was printed.
    print("usage : %s dict_file_path" % sys.argv[0])
    sys.exit(1)

# Words already encountered.
d = {}
with open(sys.argv[1], "r") as fin:
    for line in fin:
        word, cnt, tag = line.strip().split(" ")
        if word in d:
            # Duplicate word: skip silently.
            continue
        else:
            d[word] = True
        # The word stays recorded as seen even when this entry is rejected.
        if 0 >= int(cnt):
            continue

        print(line.strip())
|
||||
|
@ -1,15 +0,0 @@
|
||||
#!/usr/bin/python
# Re-encodes a text file line by line: each stripped line is decoded from
# from_enc, encoded to to_enc and written to stdout. Lines that fail to
# convert are reported on stderr and skipped.

import sys

if len(sys.argv) != 4:
    print("usage : %s from_enc to_enc dict_file_path \nexample: %s gbk utf-8 fname" % (__file__, __file__))
    exit(1)

src_enc, dst_enc, path = sys.argv[1], sys.argv[2], sys.argv[3]

with open(path, "r") as fin:
    for line in fin:
        try:
            print(line.strip().decode(src_enc).encode(dst_enc))
        except Exception as err:
            sys.stderr.write("%s\n" % (err,))
|
||||
|
23
src/CMakeLists.txt
Normal file
23
src/CMakeLists.txt
Normal file
@ -0,0 +1,23 @@
|
||||
# Executables go to <build>/bin and libraries to <build>/lib.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

# Static segmentation library plus the two command-line programs.
set(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp)
add_library(cppjieba STATIC ${LIBCPPJIEBA_SRC})
add_executable(segment segment.cpp)
add_executable(server server.cpp)

# NOTE(review): issued after the executables are declared -- confirm it is still needed.
link_directories(husky)

target_link_libraries(segment cppjieba)
target_link_libraries(server cppjieba husky pthread)

set_target_properties(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1)

# Install layout: everything lives under a CppJieba sub-directory.
install(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba)
install(TARGETS segment RUNTIME DESTINATION bin/CppJieba)
install(TARGETS server RUNTIME DESTINATION bin/CppJieba)
install(FILES
    ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h
    ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp
    DESTINATION include/CppJieba)

add_subdirectory(Husky)
add_subdirectory(Limonp)
|
@ -4,8 +4,8 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <memory.h>
|
||||
#include <str_functs.hpp>
|
||||
#include <logger.hpp>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "globals.h"
|
||||
#include "TransCode.hpp"
|
||||
#include "ISegment.hpp"
|
8
src/Husky/CMakeLists.txt
Normal file
8
src/Husky/CMakeLists.txt
Normal file
@ -0,0 +1,8 @@
|
||||
# Executables go to <build>/bin and libraries to <build>/lib.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

# Static husky library built from the two sources below.
set(LIBHUSKY_SRC Daemon.cpp ServerFrame.cpp)
add_library(husky STATIC ${LIBHUSKY_SRC})

# Installed beneath the CppJieba tree.
install(TARGETS husky ARCHIVE DESTINATION lib/CppJieba/Husky)
install(FILES
    Daemon.h globals.h HttpReqInfo.hpp ServerFrame.h ThreadManager.hpp
    DESTINATION include/CppJieba/Husky)
|
@ -8,7 +8,7 @@
|
||||
#include <sys/wait.h>
|
||||
#include <sys/stat.h>
|
||||
#include <signal.h>
|
||||
#include <logger.hpp>
|
||||
#include "../Limonp/logger.hpp"
|
||||
#include "ServerFrame.h"
|
||||
|
||||
namespace Husky
|
@ -3,14 +3,14 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "../Limonp/logger.hpp"
|
||||
#include "../Limonp/str_functs.hpp"
|
||||
#include "globals.h"
|
||||
#include <str_functs.hpp>
|
||||
#include <logger.hpp>
|
||||
#include <map_functs.hpp>
|
||||
|
||||
namespace Husky
|
||||
{
|
||||
using namespace Limonp;
|
||||
using namespace std;
|
||||
|
||||
static const char* const KEY_METHOD = "METHOD";
|
||||
static const char* const KEY_PATH = "PATH";
|
||||
@ -130,7 +130,8 @@ namespace Husky
|
||||
LogFatal("headerStr illegal.");
|
||||
return false;
|
||||
}
|
||||
_headerMap[upperStr(k)] = v;
|
||||
upper(k);
|
||||
_headerMap[k] = v;
|
||||
lpos = rpos + 1;
|
||||
}
|
||||
//message header end
|
||||
@ -160,6 +161,8 @@ namespace Husky
|
||||
HashMap<string, string> _headerMap;
|
||||
HashMap<string, string> _methodGetMap;
|
||||
HashMap<string, string> _methodPostMap;
|
||||
//public:
|
||||
friend ostream& operator<<(ostream& os, const HttpReqInfo& obj);
|
||||
private:
|
||||
bool _find(const HashMap<string, string>& mp, const string& key, string& res)const
|
||||
{
|
||||
@ -171,19 +174,6 @@ namespace Husky
|
||||
res = it->second;
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
//string toString() const;// function for debug because of heavy time consuming
|
||||
string toString() const
|
||||
{
|
||||
string res("{");
|
||||
res += HashMapToString(_headerMap);
|
||||
res += ",";
|
||||
res += HashMapToString(_methodGetMap);
|
||||
res += ",";
|
||||
res += HashMapToString(_methodPostMap);
|
||||
res += "}";
|
||||
return res;
|
||||
}
|
||||
private:
|
||||
bool _parseUrl(const string& url, HashMap<string, string>& mp)
|
||||
{
|
||||
@ -226,6 +216,11 @@ namespace Husky
|
||||
}
|
||||
};
|
||||
|
||||
// Streams the request's header map, GET-parameter map and POST-parameter map
// to `os`, in that order and with nothing written between them.
inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj)
{
    os << obj._headerMap;
    os << obj._methodGetMap;
    os << obj._methodPostMap;
    return os;
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -134,13 +134,13 @@ namespace Husky
|
||||
nRetCode = recv(hClientSock, chRecvBuf, RECV_BUFFER, 0);
|
||||
strRec = chRecvBuf;
|
||||
|
||||
#ifdef DEBUG
|
||||
LogDebug("response[%s]", strRec.c_str());
|
||||
#ifdef HUKSY_DEBUG
|
||||
LogDebug("request[%s]", strRec.c_str());
|
||||
#endif
|
||||
|
||||
if(SOCKET_ERROR==nRetCode)
|
||||
{
|
||||
LogError("error [%s]", strerror(errno));
|
||||
LogDebug("error [%s]", strerror(errno));
|
||||
closesocket(hClientSock);
|
||||
continue;
|
||||
}
|
||||
@ -160,15 +160,15 @@ namespace Husky
|
||||
|
||||
strHttpResp=chHttpHeader;
|
||||
strHttpResp+=strSnd;
|
||||
#ifdef HUKSY_DEBUG
|
||||
LogDebug("response'body [%s]", strSnd.c_str());
|
||||
#endif
|
||||
|
||||
if (SOCKET_ERROR==send(hClientSock,strHttpResp.c_str(),strHttpResp.length(),0))
|
||||
{
|
||||
LogError("error [%s]", strerror(errno));
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
LogDebug("send response [%s] ", strHttpResp.c_str());
|
||||
#endif
|
||||
|
||||
closesocket(hClientSock);
|
||||
}
|
@ -10,7 +10,6 @@
|
||||
#include <sstream>
|
||||
#include "str_functs.hpp"
|
||||
#include "map_functs.hpp"
|
||||
#include "vec_functs.hpp"
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
@ -43,12 +42,7 @@ namespace Limonp
|
||||
}
|
||||
~ArgvContext(){};
|
||||
public:
|
||||
string toString()
|
||||
{
|
||||
stringstream ss;
|
||||
ss<<vecToString<string>(_args)<<mapToString<string, string>(_mpss)<<setToString<string>(_sset);
|
||||
return ss.str();
|
||||
}
|
||||
friend ostream& operator << (ostream& os, const ArgvContext& args);
|
||||
string operator [](uint i)
|
||||
{
|
||||
if(i < _args.size())
|
||||
@ -81,6 +75,16 @@ namespace Limonp
|
||||
set<string> _sset;
|
||||
|
||||
};
|
||||
|
||||
// Streams the parsed command line: the _args, _mpss and _sset members,
// in that order and with nothing written between them.
inline ostream& operator << (ostream& os, const ArgvContext& args)
{
    os << args._args;
    os << args._mpss;
    os << args._sset;
    return os;
}
|
||||
//string toString()
|
||||
//{
|
||||
// stringstream ss;
|
||||
// return ss.str();
|
||||
//}
|
||||
}
|
||||
|
||||
#endif
|
2
src/Limonp/CMakeLists.txt
Normal file
2
src/Limonp/CMakeLists.txt
Normal file
@ -0,0 +1,2 @@
|
||||
# Header-only directory: install every .hpp found here.
file(GLOB HEAD_HPP_LIST "*.hpp")
install(FILES ${HEAD_HPP_LIST}
    DESTINATION include/CppJieba/Limonp)
|
@ -6,7 +6,6 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "logger.hpp"
|
||||
#include "vec_functs.hpp"
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
@ -21,8 +20,9 @@ namespace Limonp
|
||||
const char * const USER;
|
||||
const char * const PASSWD;
|
||||
const char * const DB;
|
||||
const char * const CHARSET;
|
||||
public:
|
||||
MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db){ _conn = NULL;};
|
||||
MysqlClient(const char* host, uint port, const char* user, const char* passwd, const char* db, const char* charset = "utf8"): HOST(host), PORT(port), USER(user), PASSWD(passwd), DB(db), CHARSET(charset){ _conn = NULL;};
|
||||
~MysqlClient(){dispose();};
|
||||
public:
|
||||
bool init()
|
||||
@ -42,10 +42,17 @@ namespace Limonp
|
||||
return false;
|
||||
}
|
||||
|
||||
if(mysql_set_character_set(_conn, CHARSET))
|
||||
{
|
||||
LogError("mysql_set_character_set [%s] failed.", CHARSET);
|
||||
return false;
|
||||
}
|
||||
|
||||
//set reconenct
|
||||
char value = 1;
|
||||
mysql_options(_conn, MYSQL_OPT_RECONNECT, &value);
|
||||
|
||||
LogInfo("MysqlClient {host: %s, port:%d, database:%s, charset:%s}", HOST, PORT, DB, CHARSET);
|
||||
return true;
|
||||
}
|
||||
bool dispose()
|
||||
@ -71,6 +78,18 @@ namespace Limonp
|
||||
}
|
||||
return true;
|
||||
}
|
||||
uint insert(const char* tb_name, const char* keys, const vector<string>& vals)
|
||||
{
|
||||
uint retn = 0;
|
||||
string sql;
|
||||
for(uint i = 0; i < vals.size(); i ++)
|
||||
{
|
||||
sql.clear();
|
||||
string_format(sql, "insert into %s (%s) values %s", tb_name, keys, vals[i].c_str());
|
||||
retn += executeSql(sql.c_str());
|
||||
}
|
||||
return retn;
|
||||
}
|
||||
bool select(const char* sql, RowsType& rows)
|
||||
{
|
||||
if(!executeSql(sql))
|
@ -13,13 +13,14 @@
|
||||
#include <stdarg.h>
|
||||
#include "io_functs.hpp"
|
||||
#include "str_functs.hpp"
|
||||
#include "typedefs.h"
|
||||
|
||||
#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, __FILE__, __LINE__, fmt, ## __VA_ARGS__)
|
||||
// Expands to the part of __FILE__ after its last '/', or to __FILE__ itself
// when it contains no '/'. The whole expression is parenthesized so the ?:
// cannot re-associate with operators around the macro use.
#define FILE_BASENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
|
||||
|
||||
#define LogDebug(fmt, ...) Logger::LoggingF(LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogInfo(fmt, ...) Logger::LoggingF(LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogWarn(fmt, ...) Logger::LoggingF(LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogError(fmt, ...) Logger::LoggingF(LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
|
||||
|
||||
namespace Limonp
|
116
src/Limonp/map_functs.hpp
Normal file
116
src/Limonp/map_functs.hpp
Normal file
@ -0,0 +1,116 @@
|
||||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
|
||||
|
||||
#ifndef LIMONP_MAP_FUNCTS_H
|
||||
#define LIMONP_MAP_FUNCTS_H
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include <tr1/unordered_map>
|
||||
#define HashMap std::tr1::unordered_map
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
|
||||
//template <typename T>
|
||||
// string setToString(const set<T>& st)
|
||||
// {
|
||||
// if(st.empty())
|
||||
// {
|
||||
// return "{}";
|
||||
// }
|
||||
// stringstream ss;
|
||||
// ss<<'{';
|
||||
// typename set<T>::const_iterator it = st.begin();
|
||||
// ss<<*it;
|
||||
// it++;
|
||||
// while(it != st.end())
|
||||
// {
|
||||
// ss<<", "<<*it;
|
||||
// it++;
|
||||
// }
|
||||
// ss<<'}';
|
||||
// return ss.str();
|
||||
// }
|
||||
|
||||
//template<typename T1, typename T2>
|
||||
// string mapToString(const map<T1, T2>& mp)
|
||||
// {
|
||||
// if(mp.empty())
|
||||
// {
|
||||
// return "{}";
|
||||
// }
|
||||
// stringstream ss;
|
||||
// ss<<'{';
|
||||
// typename map<T1, T2>::const_iterator it = mp.begin();
|
||||
// ss<<it->first<<": "<<it->second;
|
||||
// it++;
|
||||
// while(it != mp.end())
|
||||
// {
|
||||
// ss<<", "<<it->first<<": "<<it->second;
|
||||
// it++;
|
||||
// }
|
||||
// ss<<'}';
|
||||
// return ss.str();
|
||||
// }
|
||||
|
||||
//template<typename T1, typename T2>
|
||||
// string HashMapToString(const HashMap<T1, T2>& mp)
|
||||
// {
|
||||
// if(mp.empty())
|
||||
// {
|
||||
// return "{}";
|
||||
// }
|
||||
// stringstream ss;
|
||||
// ss<<'{';
|
||||
// typename HashMap<T1, T2>::const_iterator it = mp.begin();
|
||||
// ss<<it->first<<": "<<it->second;
|
||||
// it++;
|
||||
// while(it != mp.end())
|
||||
// {
|
||||
// ss<<", "<<it->first<<": "<<it->second;
|
||||
// it++;
|
||||
// }
|
||||
// ss<<'}';
|
||||
// return ss.str();
|
||||
// }
|
||||
//template<typename T1, typename T2>
|
||||
// string pairToString(const pair<T1, T2>& p)
|
||||
// {
|
||||
// stringstream ss;
|
||||
// ss<<p.first<<":"<<p.second;
|
||||
// return ss.str();
|
||||
// }
|
||||
|
||||
// Looks `key` up in `mp` and returns a copy of the mapped value, or a copy of
// `defaultVal` when the key is absent. The map is never modified.
template<class kT, class vT>
vT getMap(const std::map<kT, vT>& mp, const kT & key, const vT & defaultVal)
{
    const typename std::map<kT, vT>::const_iterator pos = mp.find(key);
    return pos != mp.end() ? pos->second : defaultVal;
}
|
||||
|
||||
// Appends every (key, value) entry of `mp`, in the map's iteration order, to
// the end of `res`. Existing contents of `res` are kept.
template<class kT, class vT>
void map2Vec(const std::map<kT, vT>& mp, std::vector<std::pair<kT, vT> > & res)
{
    res.insert(res.end(), mp.begin(), mp.end());
}
|
||||
}
|
||||
|
||||
#endif
|
101
src/Limonp/std_outbound.hpp
Normal file
101
src/Limonp/std_outbound.hpp
Normal file
@ -0,0 +1,101 @@
|
||||
#ifndef LIMONP_STD_OUTBOUND_H
|
||||
#define LIMONP_STD_OUTBOUND_H
|
||||
|
||||
#include <tr1/unordered_map>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
// Stream-output helpers for standard containers.
// NOTE(review): the C++ standard does not sanction adding overloads to
// namespace std; they are kept here because calls such as `str << vec` on
// std types locate them through argument-dependent lookup.
namespace std
{
    // vector -> ["a", "b"] (every element double-quoted); empty -> []
    template<typename T>
        ostream& operator << (ostream& os, const vector<T>& vec)
        {
            if(vec.empty())
            {
                return os << "[]";
            }
            os << "[\"" << vec[0];
            // size_t index: matches vec.size() and avoids the non-standard `uint`.
            for(size_t i = 1; i < vec.size(); i++)
            {
                os << "\", \"" << vec[i];
            }
            os << "\"]";
            return os;
        }

    // pair -> first:second
    template<class T1, class T2>
        ostream& operator << (ostream& os, const pair<T1, T2>& pr)
        {
            os << pr.first << ":" << pr.second;
            return os;
        }

    // Formats `obj` with its ostream operator << and ASSIGNS the text to
    // `str` (previous contents are replaced, not appended to).
    template<class T>
        string& operator << (string& str, const T& obj)
        {
            stringstream ss;
            ss << obj;
            return str = ss.str();
        }

    // map -> {k1:v1, k2:v2}; empty -> {}
    template<class T1, class T2>
        ostream& operator << (ostream& os, const map<T1, T2>& mp)
        {
            if(mp.empty())
            {
                return os << "{}";
            }
            typename map<T1, T2>::const_iterator it = mp.begin();
            os << '{' << *it;
            for(++it; it != mp.end(); ++it)
            {
                os << ", " << *it;
            }
            return os << '}';
        }

    // unordered_map -> {k1:v1, k2:v2} in the container's iteration order; empty -> {}
    template<class T1, class T2>
        ostream& operator << (ostream& os, const std::tr1::unordered_map<T1, T2>& mp)
        {
            if(mp.empty())
            {
                return os << "{}";
            }
            typename std::tr1::unordered_map<T1, T2>::const_iterator it = mp.begin();
            os << '{' << *it;
            for(++it; it != mp.end(); ++it)
            {
                os << ", " << *it;
            }
            return os << '}';
        }

    // set -> {a, b}; empty -> {}
    template<class T>
        ostream& operator << (ostream& os, const set<T>& st)
        {
            if(st.empty())
            {
                return os << "{}";
            }
            typename set<T>::const_iterator it = st.begin();
            os << '{' << *it;
            for(++it; it != st.end(); ++it)
            {
                os << ", " << *it;
            }
            return os << '}';
        }
}
|
||||
|
||||
#endif
|
@ -15,9 +15,17 @@
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <memory.h>
|
||||
#include "typedefs.h"
|
||||
#include <functional>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <sys/types.h>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include "std_outbound.hpp"
|
||||
#include "map_functs.hpp"
|
||||
|
||||
#define print(x) cout<<(x)<<endl
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
using namespace std;
|
||||
@ -42,11 +50,12 @@ namespace Limonp
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
inline void string_format(string& res, const char* fmt, ...)
|
||||
{
|
||||
int size = 256;
|
||||
va_list ap;
|
||||
res.clear();
|
||||
while (1) {
|
||||
res.resize(size);
|
||||
va_start(ap, fmt);
|
||||
@ -63,27 +72,55 @@ namespace Limonp
|
||||
}
|
||||
}
|
||||
|
||||
// Appends the elements of `src`, separated by `connectorStr`, to the END of
// `dest` (existing contents of dest are kept). Returns false and leaves dest
// untouched when src is empty, true otherwise.
inline bool joinStr(const std::vector<std::string>& src, std::string& dest, const std::string& connectorStr)
{
    if(src.empty())
    {
        return false;
    }
    std::vector<std::string>::const_iterator piece = src.begin();
    dest += *piece;
    for(++piece; piece != src.end(); ++piece)
    {
        dest += connectorStr;
        dest += *piece;
    }
    return true;
}
|
||||
//inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
|
||||
//{
|
||||
// if(src.empty())
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
// for(uint i = 0; i < src.size() - 1; i++)
|
||||
// {
|
||||
// dest += src[i];
|
||||
// dest += connectorStr;
|
||||
// }
|
||||
// dest += src[src.size() - 1];
|
||||
// return true;
|
||||
//}
|
||||
|
||||
inline string joinStr(const vector<string>& source, const string& connector)
|
||||
{
|
||||
string res;
|
||||
joinStr(source, res, connector);
|
||||
return res;
|
||||
}
|
||||
//inline string joinStr(const vector<string>& source, const string& connector)
|
||||
//{
|
||||
// string res;
|
||||
// joinStr(source, res, connector);
|
||||
// return res;
|
||||
//}
|
||||
|
||||
// Formats every element of [begin, end) with operator << and stores the
// results, separated by `connector`, in `res`. For an empty range `res` is
// left exactly as it was; otherwise its previous contents are replaced.
template<class T>
void join(T begin, T end, std::string& res, const std::string& connector)
{
    if(begin == end)
    {
        return;
    }
    std::stringstream buffer;
    buffer << *begin;
    for(++begin; begin != end; ++begin)
    {
        buffer << connector << *begin;
    }
    res = buffer.str();
}
|
||||
|
||||
// Returns the elements of [begin, end) joined with `connector`; an empty
// range yields an empty string. Delegates to the four-argument join.
template<class T>
std::string join(T begin, T end, const std::string& connector)
{
    std::string joined;
    join(begin, end, joined, connector);
    return joined;
}
|
||||
|
||||
|
||||
|
||||
inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
|
||||
{
|
||||
@ -104,26 +141,24 @@ namespace Limonp
|
||||
return true;
|
||||
}
|
||||
res.push_back(src.substr(start, end - start));
|
||||
if(end == src.size() - 1)
|
||||
{
|
||||
res.push_back("");
|
||||
break;
|
||||
}
|
||||
if(end == src.size() - 1)
|
||||
{
|
||||
res.push_back("");
|
||||
break;
|
||||
}
|
||||
start = end + 1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline string upperStr(const string& strIn)
|
||||
inline string& upper(string& str)
|
||||
{
|
||||
string str = strIn;
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
|
||||
return str;
|
||||
}
|
||||
|
||||
inline string lowerStr(const string& strIn)
|
||||
inline string& lower(string& str)
|
||||
{
|
||||
string str = strIn;
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
|
||||
return str;
|
||||
}
|
||||
@ -183,40 +218,40 @@ namespace Limonp
|
||||
return str.find(ch) != string::npos;
|
||||
}
|
||||
|
||||
// Appends to `words` every maximal run of ASCII letters and digits found in
// `sentence`, in order of appearance. Any other byte acts as a separator.
inline void extractWords(const std::string& sentence, std::vector<std::string>& words)
{
    const std::string::size_type none = std::string::npos;
    std::string::size_type runStart = none;   // start of the run in progress, if any
    for(std::string::size_type pos = 0; pos < sentence.size(); ++pos)
    {
        const char ch = sentence[pos];
        const bool isDigit = 0x0030 <= ch && ch <= 0x0039;
        const bool isUpper = 0x0041 <= ch && ch <= 0x005a;
        const bool isLower = 0x0061 <= ch && ch <= 0x007a;
        if(isDigit || isUpper || isLower)
        {
            if(runStart == none)
            {
                runStart = pos;
            }
        }
        else if(runStart != none)
        {
            words.push_back(sentence.substr(runStart, pos - runStart));
            runStart = none;
        }
    }
    // A run that reaches the end of the sentence is still pending.
    if(runStart != none)
    {
        words.push_back(sentence.substr(runStart));
    }
}
|
||||
//inline void extractWords(const string& sentence, vector<string>& words)
|
||||
//{
|
||||
// bool flag = false;
|
||||
// uint lhs = 0, len = 0;
|
||||
// for(uint i = 0; i < sentence.size(); i++)
|
||||
// {
|
||||
// char x = sentence[i];
|
||||
// if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a))
|
||||
// {
|
||||
// if(flag)
|
||||
// {
|
||||
// len ++;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// lhs = i;
|
||||
// len = 1;
|
||||
// }
|
||||
// flag = true;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if(flag)
|
||||
// {
|
||||
// words.push_back(string(sentence, lhs, len));
|
||||
// }
|
||||
// flag = false;
|
||||
// }
|
||||
// }
|
||||
// if(flag)
|
||||
// {
|
||||
// words.push_back(string(sentence, lhs, len));
|
||||
// }
|
||||
//}
|
||||
|
||||
|
||||
}
|
@ -7,7 +7,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <logger.hpp>
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Trie.h"
|
||||
#include "globals.h"
|
||||
#include "ISegment.hpp"
|
@ -3,7 +3,7 @@
|
||||
|
||||
#include "MPSegment.h"
|
||||
#include "HMMSegment.h"
|
||||
#include <str_functs.hpp>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
@ -4,8 +4,8 @@
|
||||
#include "globals.h"
|
||||
#include "ISegment.hpp"
|
||||
#include "ChineseFilter.hpp"
|
||||
#include <str_functs.hpp>
|
||||
#include <logger.hpp>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
@ -7,8 +7,7 @@
|
||||
|
||||
|
||||
#include "globals.h"
|
||||
#include <str_functs.hpp>
|
||||
#include <vec_functs.hpp>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
@ -12,8 +12,8 @@
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <str_functs.hpp>
|
||||
#include <logger.hpp>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "globals.h"
|
||||
#include "structs.h"
|
82
src/segment.cpp
Normal file
82
src/segment.cpp
Normal file
@ -0,0 +1,82 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include "Limonp/ArgvContext.hpp"
|
||||
#include "MPSegment.h"
|
||||
#include "HMMSegment.h"
|
||||
#include "MixSegment.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
void cut(const ISegment * seg, const char * const filePath)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
vector<string> res;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
if(!line.empty())
|
||||
{
|
||||
res.clear();
|
||||
seg->cut(line, res);
|
||||
cout<<join(res.begin(), res.end(),"/")<<endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if(argc < 2)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||
<<"\t--dictpath\tsee example\n"
|
||||
<<"\t--modelpath\tsee example\n"
|
||||
<<"example:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||
<<endl;
|
||||
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
string modelPath = arg["--modelpath"];
|
||||
string algorithm = arg["--algorithm"];
|
||||
|
||||
if("cutHMM" == algorithm)
|
||||
{
|
||||
HMMSegment seg;
|
||||
if(!seg.init(modelPath.c_str()))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
cut(&seg, arg[1].c_str());
|
||||
seg.dispose();
|
||||
}
|
||||
else if("cutMix" == algorithm)
|
||||
{
|
||||
MixSegment seg;
|
||||
if(!seg.init(dictPath.c_str(), modelPath.c_str()))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
cut(&seg, arg[1].c_str());
|
||||
seg.dispose();
|
||||
}
|
||||
else
|
||||
{
|
||||
MPSegment seg;
|
||||
if(!seg.init(dictPath.c_str()))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
cut(&seg, arg[1].c_str());
|
||||
seg.dispose();
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -3,12 +3,12 @@
|
||||
#include <string>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <ArgvContext.hpp>
|
||||
#include "../husky/Daemon.h"
|
||||
#include "../husky/ServerFrame.h"
|
||||
#include "../cppjieba/MPSegment.h"
|
||||
#include "../cppjieba/HMMSegment.h"
|
||||
#include "../cppjieba/MixSegment.h"
|
||||
#include "Limonp/ArgvContext.hpp"
|
||||
#include "Husky/Daemon.h"
|
||||
#include "Husky/ServerFrame.h"
|
||||
#include "MPSegment.h"
|
||||
#include "HMMSegment.h"
|
||||
#include "MixSegment.h"
|
||||
|
||||
using namespace Husky;
|
||||
using namespace CppJieba;
|
||||
@ -31,7 +31,7 @@ class ServerDemo: public IRequestHandler
|
||||
httpReq.GET("key", tmp);
|
||||
URLDecode(tmp, sentence);
|
||||
_segment.cut(sentence, words);
|
||||
vecToString(words, strSnd);
|
||||
strSnd << words;
|
||||
return true;
|
||||
}
|
||||
private:
|
@ -74,7 +74,7 @@ namespace CppJieba
|
||||
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
||||
{
|
||||
}
|
||||
inline string toString() const
|
||||
string toString() const
|
||||
{
|
||||
string tmp;
|
||||
TransCode::encode(word, tmp);
|
||||
@ -89,16 +89,23 @@ namespace CppJieba
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
inline string joinWordInfos(const vector<KeyWordInfo>& vec)
|
||||
|
||||
inline ostream& operator << (ostream& os, const KeyWordInfo& info)
|
||||
{
|
||||
vector<string> tmp;
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
{
|
||||
tmp.push_back(vec[i].toString());
|
||||
}
|
||||
return joinStr(tmp, ",");
|
||||
string tmp;
|
||||
TransCode::encode(info.word, tmp);
|
||||
return os << "{words:" << tmp << ", weight:" << info.weight << ", idf:" << info.idf << "}";
|
||||
}
|
||||
|
||||
//inline string joinWordInfos(const vector<KeyWordInfo>& vec)
|
||||
//{
|
||||
// vector<string> tmp;
|
||||
// for(uint i = 0; i < vec.size(); i++)
|
||||
// {
|
||||
// tmp.push_back(vec[i].toString());
|
||||
// }
|
||||
// return joinStr(tmp, ",");
|
||||
//}
|
||||
}
|
||||
|
||||
#endif
|
@ -1,54 +0,0 @@
|
||||
CXX := g++
|
||||
LD := g++
|
||||
AR := ar rc
|
||||
|
||||
INCS := -I../cppjieba/
|
||||
|
||||
DEBUG_CXXFLAGS := -g -Wall -DDEBUG -DUT $(INCS)
|
||||
|
||||
CXXFLAGS := ${DEBUG_CXXFLAGS}
|
||||
LDFLAGS := ${DEBUG_LDFLAGS}
|
||||
|
||||
DOLINK := $(LD) $(LDFLAGS)
|
||||
DOPACK := $(AR)
|
||||
SOURCES := $(wildcard *.cpp)
|
||||
OBJS := $(patsubst %.cpp,%.o,$(SOURCES))
|
||||
UTS := $(patsubst %.cpp,%.ut,$(SOURCES))
|
||||
|
||||
CPPJIEBADIR = ../cppjieba
|
||||
LIBCPPJIEBA = $(CPPJIEBADIR)/libcppjieba.a
|
||||
|
||||
CPPCOMMONDIR = ../cppcommon
|
||||
LIBCPPCM = $(CPPCOMMONDIR)/libcm.a
|
||||
|
||||
LIBA := $(LIBCPPJIEBA) $(LIBCPPCM)
|
||||
# remove the objs after compilation
|
||||
.PHONY: clean $(LIBA)
|
||||
|
||||
# Main Targets
|
||||
all: $(UTS)
|
||||
|
||||
# This is a suffix rule
|
||||
#.c.o:
|
||||
%.o: %.cpp
|
||||
$(CXX) -c $(CXXFLAGS) $<
|
||||
%.ut: %.o $(LIBA)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^
|
||||
|
||||
$(LIBCPPJIEBA):
|
||||
cd $(CPPJIEBADIR) && $(MAKE)
|
||||
|
||||
$(LIBCPPCM):
|
||||
cd $(CPPCOMMONDIR) && $(MAKE)
|
||||
|
||||
clean:
|
||||
rm -f *.o *.ut *.d *.d.*
|
||||
# cd $(CPPJIEBADIR) && make clean
|
||||
# cd $(CPPCOMMONDIR) && make clean
|
||||
|
||||
sinclude $(SOURCES:.cpp=.d)
|
||||
%.d:%.cpp
|
||||
@set -e; rm -f $@; \
|
||||
$(CXX) -MM $< > $@.$$$$; \
|
||||
sed 's,\($*\).o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
|
||||
rm -f $@.$$$$
|
60
test/segment.cpp
Normal file
60
test/segment.cpp
Normal file
@ -0,0 +1,60 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <CppJieba/Limonp/ArgvContext.hpp>
|
||||
#include <CppJieba/MPSegment.h>
|
||||
#include <CppJieba/HMMSegment.h>
|
||||
#include <CppJieba/MixSegment.h>
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
void cut(const ISegment * seg, const char * const filePath)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
vector<string> res;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
if(!line.empty())
|
||||
{
|
||||
res.clear();
|
||||
seg->cut(line, res);
|
||||
cout<<join(res.begin(), res.end(),"/")<<endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
//demo
|
||||
{
|
||||
HMMSegment seg;
|
||||
if(!seg.init("../dicts/hmm_model.utf8"))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
cut(&seg, "testlines.utf8");
|
||||
seg.dispose();
|
||||
}
|
||||
{
|
||||
MixSegment seg;
|
||||
if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
cut(&seg, "testlines.utf8");
|
||||
seg.dispose();
|
||||
}
|
||||
{
|
||||
MPSegment seg;
|
||||
if(!seg.init("../dicts/jieba.dict.utf8"))
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
cut(&seg, "testlines.utf8");
|
||||
seg.dispose();
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
58
test/server.cpp
Normal file
58
test/server.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
#include <CppJieba/Husky/ServerFrame.h>
|
||||
#include <CppJieba/Husky/Daemon.h>
|
||||
#include <CppJieba/Limonp/ArgvContext.hpp>
|
||||
#include <CppJieba/MPSegment.h>
|
||||
#include <CppJieba/HMMSegment.h>
|
||||
#include <CppJieba/MixSegment.h>
|
||||
|
||||
using namespace Husky;
|
||||
using namespace CppJieba;
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
|
||||
|
||||
class ServerDemo: public IRequestHandler
|
||||
{
|
||||
public:
|
||||
ServerDemo(){};
|
||||
virtual ~ServerDemo(){};
|
||||
virtual bool init(){return _segment.init(DEFAULT_DICTPATH, DEFAULT_MODELPATH);};
|
||||
virtual bool dispose(){return _segment.dispose();};
|
||||
public:
|
||||
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd)
|
||||
{
|
||||
string sentence, tmp;
|
||||
vector<string> words;
|
||||
httpReq.GET("key", tmp);
|
||||
URLDecode(tmp, sentence);
|
||||
_segment.cut(sentence, words);
|
||||
strSnd << words;
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
MixSegment _segment;
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
{
|
||||
if(argc != 7)
|
||||
{
|
||||
printf("usage: %s -n THREAD_NUMBER -p LISTEN_PORT -k start|stop\n",argv[0]);
|
||||
return -1;
|
||||
}
|
||||
ArgvContext arg(argc, argv);
|
||||
unsigned int port = atoi(arg["-p"].c_str());
|
||||
unsigned int threadNum = atoi(arg["-n"].c_str());
|
||||
|
||||
ServerDemo s;
|
||||
Daemon daemon(&s);
|
||||
if(arg["-k"] == "start")
|
||||
{
|
||||
return !daemon.Start(port, threadNum);
|
||||
}
|
||||
else
|
||||
{
|
||||
return !daemon.Stop();
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user