merge upstream

This commit is contained in:
aholic 2013-11-27 16:32:54 +08:00
commit ef8954f1fe
15 changed files with 226 additions and 232 deletions

View File

@ -1,6 +1,9 @@
# Top-level build script for the CPPJIEBA project.
PROJECT(CPPJIEBA)

# Install under /usr by default, but only while the prefix is still CMake's
# built-in default. An unconditional SET() would silently override a prefix
# the user passed with -DCMAKE_INSTALL_PREFIX=... on the command line.
IF (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
    SET(CMAKE_INSTALL_PREFIX /usr CACHE PATH "Install path prefix" FORCE)
ENDIF()

# Compiler flags passed to every target: C++0x language mode and -O3.
ADD_DEFINITIONS(-std=c++0x -O3)

# Optional encoding selector: when ENC is given (e.g. `cmake .. -DENC=GBK`),
# the macro CPPJIEBA_<ENC> is defined for all sources.
IF (DEFINED ENC)
    ADD_DEFINITIONS(-DCPPJIEBA_${ENC})
ENDIF()

ADD_SUBDIRECTORY(src)
ADD_SUBDIRECTORY(dicts)
ADD_SUBDIRECTORY(scripts)

View File

@ -8,9 +8,6 @@
现在支持utf8,gbk编码的分词。
- `master`分支支持`utf8`编码
- `gbk`分支支持`gbk`编码
## 安装与使用
### 下载和安装
@ -22,6 +19,8 @@ cd cppjieba-master
mkdir build
cd build
cmake ..
# 默认使用utf8编码；如果要使用gbk编码，请改用下面这条cmake命令：
# cmake .. -DENC=GBK
make
sudo make install
```
@ -122,14 +121,14 @@ Output:
核心目录,包含主要源代码。
#### Trie树
Trie.cpp/Trie.h 负责载入词典的trie树主要供Segment模块使用。
Trie.hpp 负责载入词典的trie树，主要供Segment模块使用。
#### Segment模块
MPSegment.cpp/MPSegment.h
MPSegment.hpp
(Maximum Probability)最大概率法：负责根据Trie树构建有向无环图并进行动态规划算法，是分词算法的核心。
HMMSegment.cpp/HMMSegment.h
HMMSegment.hpp
是根据HMM模型来进行分词，主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
HMM模型由dicts/下面的`hmm_model.utf8`提供。
分词算法即viterbi算法。

View File

@ -6,7 +6,7 @@
namespace CppJieba
{
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2};
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
typedef Unicode::const_iterator UniConIter;
class ChineseFilter;
class ChFilterIterator
@ -55,11 +55,7 @@ namespace CppJieba
{
return DIGIT_OR_LETTER;
}
if(x >= 0x4e00 && x <= 0x9fff)
{
return CHWORD;
}
return OTHERS;
return CHWORD;
}
ChFilterIterator _get(UniConIter iter)
{
@ -67,7 +63,7 @@ namespace CppJieba
const UniConIter& _end = ptUnico->end();
if(iter == _end)
{
return ChFilterIterator(ptUnico, end, end, OTHERS);
return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
}
CHAR_TYPE charType = _charType(*iter);
iter ++;
@ -99,7 +95,7 @@ namespace CppJieba
}
iterator end()
{
return iterator(&_unico, _unico.end(), _unico.end(), OTHERS);
return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
}
};
}

View File

@ -15,12 +15,13 @@ namespace CppJieba
{
private:
Trie _trie;
const string _dictPath;
public:
FullSegment(){};
FullSegment(const char* dictPath): _dictPath(dictPath){};
virtual ~FullSegment(){dispose();};
public:
bool init(const char* const filePath)
bool init()
{
if(_getInitFlag())
{
@ -32,8 +33,8 @@ namespace CppJieba
LogError("_trie.init failed.");
return false;
}
LogInfo("_trie.loadDict(%s) start...", filePath);
if(!_trie.loadDict(filePath))
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
if(!_trie.loadDict(_dictPath.c_str()))
{
LogError("_trie.loadDict faield.");
return false;

View File

@ -31,9 +31,11 @@ namespace CppJieba
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
private:
const string _hmmModelPath;
public:
HMMSegment()
HMMSegment(const char * const filePath): _hmmModelPath(filePath)
{
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
@ -51,11 +53,11 @@ namespace CppJieba
dispose();
}
public:
bool init(const char* const modelPath)
virtual bool init()
{
return _setInitFlag(_loadModel(modelPath));
return _setInitFlag(_loadModel(_hmmModelPath.c_str()));
}
bool dispose()
virtual bool dispose()
{
_setInitFlag(false);
return true;
@ -88,11 +90,8 @@ namespace CppJieba
}
return true;
}
bool cut(const string& str, vector<string>& res)const
{
return SegmentBase::cut(str, res);
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{

View File

@ -6,8 +6,11 @@ namespace CppJieba
{
class ISegment
{
//public:
// virtual ~ISegment(){};
public:
virtual ~ISegment(){};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public:
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0;

View File

@ -120,7 +120,7 @@ namespace Limonp
return res;
}
inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
{
@ -218,41 +218,129 @@ namespace Limonp
return str.find(ch) != string::npos;
}
//inline void extractWords(const string& sentence, vector<string>& words)
//{
// bool flag = false;
// uint lhs = 0, len = 0;
// for(uint i = 0; i < sentence.size(); i++)
// {
// char x = sentence[i];
// if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a))
// {
// if(flag)
// {
// len ++;
// }
// else
// {
// lhs = i;
// len = 1;
// }
// flag = true;
// }
// else
// {
// if(flag)
// {
// words.push_back(string(sentence, lhs, len));
// }
// flag = false;
// }
// }
// if(flag)
// {
// words.push_back(string(sentence, lhs, len));
// }
//}
/**
 * Decode a UTF-8 byte string into a sequence of 16-bit code points.
 *
 * Only 1-, 2- and 3-byte sequences are supported, i.e. code points that fit
 * in a uint16_t. Lead bytes of 0xF0 and above are rejected.
 *
 * @param str  UTF-8 encoded input.
 * @param vec  Output; always cleared first, then filled with one element per
 *             decoded character. On failure it holds whatever was decoded
 *             before the offending byte and must not be relied upon.
 * @return true on success; false if str is empty, a sequence is truncated,
 *         a lead byte is not a valid 1/2/3-byte lead, or a trail byte is not
 *         of the form 10xxxxxx.
 */
inline bool utf8ToUnicode(const std::string& str, std::vector<uint16_t>& vec)
{
    // Clear before any early return so the caller never sees stale data.
    vec.clear();
    if(str.empty())
    {
        return false;
    }
    const size_t siz = str.size();
    vec.reserve(siz); // upper bound: one element per input byte
    size_t i = 0;
    while(i < siz)
    {
        // Work on unsigned bytes to avoid sign extension of plain char.
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        if(lead < 0x80) // 0xxxxxxx
        {
            vec.push_back(lead);
            i += 1;
        }
        else if(lead >= 0xc0 && lead <= 0xdf && i + 1 < siz) // 110xxxxx 10xxxxxx
        {
            const unsigned char t1 = static_cast<unsigned char>(str[i + 1]);
            if((t1 & 0xc0) != 0x80)
            {
                return false;
            }
            vec.push_back(static_cast<uint16_t>(((lead & 0x1f) << 6) | (t1 & 0x3f)));
            i += 2;
        }
        else if(lead >= 0xe0 && lead <= 0xef && i + 2 < siz) // 1110xxxx 10xxxxxx 10xxxxxx
        {
            const unsigned char t1 = static_cast<unsigned char>(str[i + 1]);
            const unsigned char t2 = static_cast<unsigned char>(str[i + 2]);
            if((t1 & 0xc0) != 0x80 || (t2 & 0xc0) != 0x80)
            {
                return false;
            }
            vec.push_back(static_cast<uint16_t>(((lead & 0x0f) << 12) | ((t1 & 0x3f) << 6) | (t2 & 0x3f)));
            i += 3;
        }
        else
        {
            // Stray continuation byte (0x80-0xBF), 4-byte lead (>= 0xF0),
            // or a multi-byte sequence cut off by the end of the string.
            return false;
        }
    }
    return true;
}
/**
 * Encode the 16-bit code points in [begin, end) as UTF-8 into res.
 *
 * Values up to 0x7f produce one byte, values up to 0x7ff two bytes, and
 * everything else three bytes.
 *
 * @return false (leaving res untouched) when the range is empty or inverted;
 *         true otherwise, with res replaced by the encoded text.
 */
inline bool unicodeToUtf8(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
    if(!(begin < end))
    {
        return false;
    }
    res.clear();
    for(vector<uint16_t>::const_iterator it = begin; it != end; ++it)
    {
        const uint16_t code = *it;
        if(code > 0x7ff)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            res.push_back(char(0xe0 | ((code >> 12) & 0x0f)));
            res.push_back(char(0x80 | ((code >> 6) & 0x3f)));
            res.push_back(char(0x80 | (code & 0x3f)));
        }
        else if(code > 0x7f)
        {
            // 110xxxxx 10xxxxxx
            res.push_back(char(0xc0 | ((code >> 6) & 0x1f)));
            res.push_back(char(0x80 | (code & 0x3f)));
        }
        else
        {
            // 0xxxxxxx
            res.push_back(char(code));
        }
    }
    return true;
}
/**
 * Split a byte string into 16-bit units for double-byte text.
 *
 * A byte with the high bit clear becomes one unit on its own. A byte with
 * the high bit set is combined with the following byte through
 * twocharToUint16(lead, trail). The trail byte is not validated.
 *
 * @param str  Input bytes.
 * @param vec  Output; always cleared first.
 * @return false if str is empty or ends in the middle of a two-byte pair;
 *         true otherwise.
 */
inline bool gbkTrans(const string& str, vector<uint16_t>& vec)
{
    vec.clear();
    if(str.empty())
    {
        return false;
    }
    for(uint pos = 0; pos < str.size(); )
    {
        if((str[pos] & 0x80) != 0)
        {
            // Lead byte of a pair: it needs a following byte.
            if(!(pos + 1 < str.size()))
            {
                return false;
            }
            vec.push_back(twocharToUint16(str[pos], str[pos + 1]));
            pos += 2;
        }
        else
        {
            // Single-byte character.
            vec.push_back(uint16_t(str[pos]));
            pos++;
        }
    }
    return true;
}
/**
 * Turn the 16-bit units in [begin, end) back into a byte string.
 *
 * Each unit is split with uint16ToChar2. When the first byte of the pair
 * has its high bit set, both bytes are appended; otherwise only the second
 * byte is appended.
 *
 * @return false (leaving res untouched) when the range is empty or inverted;
 *         true otherwise, with res replaced by the produced bytes.
 */
inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
    if(!(begin < end))
    {
        return false;
    }
    res.clear();
    for(vector<uint16_t>::const_iterator it = begin; it != end; ++it)
    {
        const pair<char, char> bytes = uint16ToChar2(*it);
        // A first byte with the high bit set marks a two-byte character.
        if(bytes.first & 0x80)
        {
            res += bytes.first;
        }
        res += bytes.second;
    }
    return true;
}
}
#endif

View File

@ -32,12 +32,14 @@ namespace CppJieba
{
private:
Trie _trie;
private:
const string _dictPath;
public:
MPSegment(){};
MPSegment(const char * const dictPath): _dictPath(dictPath){};
virtual ~MPSegment(){dispose();};
public:
bool init(const char* const filePath)
virtual bool init()
{
if(_getInitFlag())
{
@ -49,8 +51,8 @@ namespace CppJieba
LogError("_trie.init failed.");
return false;
}
LogInfo("_trie.loadDict(%s) start...", filePath);
if(!_trie.loadDict(filePath))
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
if(!_trie.loadDict(_dictPath.c_str()))
{
LogError("_trie.loadDict faield.");
return false;
@ -58,7 +60,7 @@ namespace CppJieba
LogInfo("_trie.loadDict end.");
return _setInitFlag(true);
}
bool dispose()
virtual bool dispose()
{
if(!_getInitFlag())
{
@ -69,12 +71,7 @@ namespace CppJieba
return true;
}
public:
//bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const;
bool cut(const string& str, vector<string>& res)const
{
return SegmentBase::cut(str, res);
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{
@ -155,7 +152,6 @@ namespace CppJieba
return true;
}
//virtual bool cut(const string& str, vector<string>& res)const;
private:
bool _calcDAG(SegmentContext& segContext)const

View File

@ -13,7 +13,7 @@ namespace CppJieba
MPSegment _mpSeg;
HMMSegment _hmmSeg;
public:
MixSegment()
MixSegment(const char * const mpSegDict, const char * const hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
{
}
virtual ~MixSegment()
@ -21,26 +21,26 @@ namespace CppJieba
dispose();
}
public:
bool init(const char* const mpSegDict, const char* const hmmSegDict)
virtual bool init()
{
if(_getInitFlag())
{
LogError("inited.");
return false;
}
if(!_mpSeg.init(mpSegDict))
if(!_mpSeg.init())
{
LogError("_mpSeg init");
return false;
}
if(!_hmmSeg.init(hmmSegDict))
if(!_hmmSeg.init())
{
LogError("_hmmSeg init");
return false;
}
return _setInitFlag(true);
}
bool dispose()
virtual bool dispose()
{
if(!_getInitFlag())
{
@ -52,12 +52,9 @@ namespace CppJieba
return true;
}
public:
//virtual bool cut(const string& str, vector<string>& res) const;
bool cut(const string& str, vector<string>& res)const
{
return SegmentBase::cut(str, res);
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
using SegmentBase::cut;
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{

View File

@ -14,12 +14,17 @@ namespace CppJieba
public:
SegmentBase(){_setInitFlag(false);};
virtual ~SegmentBase(){};
private:
bool _isInited;
protected:
bool _isInited;
bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;};
bool cut(const string& str, vector<string>& res)const
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const
{
if(!_getInitFlag())
{
@ -45,7 +50,6 @@ namespace CppJieba
}
return true;
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
};
}

View File

@ -17,76 +17,25 @@ namespace CppJieba
{
inline bool decode(const string& str, vector<uint16_t>& vec)
{
char ch1, ch2;
if(str.empty())
{
return false;
}
vec.clear();
size_t siz = str.size();
for(uint i = 0;i < siz;)
{
if(!(str[i] & 0x80)) // 0xxxxxxx
{
vec.push_back(str[i]);
i++;
}
else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
{
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
vec.push_back(twocharToUint16(ch1, ch2));
i += 2;
}
else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
{
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
vec.push_back(twocharToUint16(ch1, ch2));
i += 3;
}
else
{
return false;
}
}
return true;
#ifdef CPPJIEBA_GBK
return gbkTrans(str, vec);
#else
return utf8ToUnicode(str, vec);
#endif
}
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
if(begin >= end)
{
return false;
}
res.clear();
uint16_t ui;
while(begin != end)
{
ui = *begin;
if(ui <= 0x7f)
{
res += char(ui);
}
else if(ui <= 0x7ff)
{
res += char(((ui>>6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
}
else
{
res += char(((ui >> 12) & 0x0f )| 0xe0);
res += char(((ui>>6) & 0x3f )| 0x80 );
res += char((ui & 0x3f) | 0x80);
}
begin ++;
}
return true;
#ifdef CPPJIEBA_GBK
return gbkTrans(begin, end, res);
#else
return unicodeToUtf8(begin, end, res);
#endif
}
inline bool encode(const vector<uint16_t>& sentence, string& res)
inline bool encode(const vector<uint16_t>& uni, string& res)
{
return encode(sentence.begin(), sentence.end(), res);
return encode(uni.begin(), uni.end(), res);
}
}
}

View File

@ -164,14 +164,6 @@ namespace CppJieba
{
return NULL;
}
return find(uintVec);
}
const TrieNodeInfo* find(const Unicode& uintVec)const
{
if(uintVec.empty())
{
return NULL;
}
return find(uintVec.begin(), uintVec.end());
}
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
@ -261,56 +253,11 @@ namespace CppJieba
return false;
}
const TrieNodeInfo* findPrefix(const string& str)const
{
if(!_getInitFlag())
{
LogFatal("trie not initted!");
return NULL;
}
Unicode uintVec;
if(!TransCode::decode(str, uintVec))
{
LogError("TransCode::decode failed.");
return NULL;
}
//find
TrieNode* p = _root;
uint pos = 0;
uint16_t chUni = 0;
const TrieNodeInfo * res = NULL;
for(uint i = 0; i < uintVec.size(); i++)
{
chUni = uintVec[i];
if(p->isLeaf)
{
pos = p->nodeInfoVecPos;
if(pos >= _nodeInfoVec.size())
{
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
return NULL;
}
res = &(_nodeInfoVec[pos]);
}
if(p->hmap.find(chUni) == p->hmap.end())
{
break;
}
else
{
p = p->hmap[chUni];
}
}
return res;
}
public:
double getMinLogFreq()const{return _minLogFreq;};
bool insert(const TrieNodeInfo& nodeInfo)
private:
bool _insert(const TrieNodeInfo& nodeInfo)
{
if(!_getInitFlag())
{
@ -353,7 +300,7 @@ namespace CppJieba
}
if(p->isLeaf)
{
LogError("this node already inserted");
LogError("this node already _inserted");
return false;
}
@ -391,10 +338,10 @@ namespace CppJieba
nodeInfo.tag = vecBuf[2];
}
//insert node
if(!insert(nodeInfo))
//_insert node
if(!_insert(nodeInfo))
{
LogError("insert node failed!");
LogError("_insert node failed!");
}
}
return true;

View File

@ -19,8 +19,14 @@ void cut(const ISegment * seg, const char * const filePath)
{
cout << line << endl;
res.clear();
seg->cut(line, res);
cout<<join(res.begin(), res.end(),"/")<<endl;
if(!seg->cut(line, res))
{
LogError("seg cut failed.");
}
else
{
print(join(res.begin(), res.end(), "/"));
}
}
}
}
@ -35,9 +41,9 @@ int main(int argc, char ** argv)
<<"\t--dictpath\tsee example\n"
<<"\t--modelpath\tsee example\n"
<<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
<<endl;
return EXIT_FAILURE;
@ -49,8 +55,8 @@ int main(int argc, char ** argv)
if("cutHMM" == algorithm)
{
HMMSegment seg;
if(!seg.init(modelPath.c_str()))
HMMSegment seg(modelPath.c_str());
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
@ -60,8 +66,8 @@ int main(int argc, char ** argv)
}
else if("cutDAG" == algorithm)
{
MPSegment seg;
if(!seg.init(dictPath.c_str()))
MPSegment seg(dictPath.c_str());
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return false;
@ -71,8 +77,8 @@ int main(int argc, char ** argv)
}
else if ("cutFull" == algorithm)
{
FullSegment seg;
if (!seg.init(dictPath.c_str()))
FullSegment seg(dictPath.c_str());
if (!seg.init())
{
cout << "seg init failed" << endl;
return false;
@ -82,8 +88,8 @@ int main(int argc, char ** argv)
}
else
{
MixSegment seg;
if(!seg.init(dictPath.c_str(), modelPath.c_str()))
MixSegment seg(dictPath.c_str(), modelPath.c_str());
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;

View File

@ -14,13 +14,10 @@ using namespace CppJieba;
class ReqHandler: public IRequestHandler
{
private:
string _dictPath;
string _modelPath;
public:
ReqHandler(const string& dictPath, const string& modelPath): _dictPath(dictPath), _modelPath(modelPath){};
ReqHandler(const string& dictPath, const string& modelPath): _segment(dictPath.c_str(), modelPath.c_str()){};
virtual ~ReqHandler(){};
virtual bool init(){return _segment.init(_dictPath.c_str(), _modelPath.c_str());};
virtual bool init(){return _segment.init();};
virtual bool dispose(){return _segment.dispose();};
public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd)

9
test/testlines.gbk Normal file
View File

@ -0,0 +1,9 @@
我来到北京清华大学
他来到了网易杭研大厦
杭研
小明硕士毕业于中国科学院计算所,后在日本京都大学深造
我来自北京邮电大学。。。学号091111xx。。。
来这里看看别人正在搜索什么吧
我来到南京市长江大桥
请在一米线外等候
人事处女干事