Merge branch 'dev'

This commit is contained in:
wyy 2013-12-12 23:26:13 -08:00
commit 1b1ed6e3aa
10 changed files with 96 additions and 187 deletions

View File

@ -25,7 +25,7 @@ make
sudo make install
```
#### 验证
#### 测试
```sh
/usr/bin/cjseg.sh ../test/testlines.utf8
@ -41,7 +41,7 @@ sudo /etc/init.d/cjserver start
sudo /etc/init.d/cjserver stop
```
#### 验证服务
#### 测试服务
然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥`
(用chrome的原因是chrome的默认编码就是utf-8)

View File

@ -6,99 +6,6 @@
namespace CppJieba
{
//enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
//typedef Unicode::const_iterator UniConIter;
//class ChineseFilter;
//class ChFilterIterator
//{
// public:
// const Unicode * ptUnico;
// UniConIter begin;
// UniConIter end;
// CHAR_TYPE charType;
// ChFilterIterator& operator++()
// {
// return *this = _get(end);
// }
// ChFilterIterator operator++(int)
// {
// ChFilterIterator res = *this;
// *this = _get(end);
// return res;
// }
// bool operator==(const ChFilterIterator& iter)
// {
// return begin == iter.begin && end == iter.end;
// }
// bool operator!=(const ChFilterIterator& iter)
// {
// return !(*this == iter);
// }
// ChFilterIterator& operator=(const ChFilterIterator& iter)
// {
// ptUnico = iter.ptUnico;
// begin = iter.begin;
// end = iter.end;
// charType = iter.charType;
// return *this;
// }
//
// public:
// ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
// ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
// private:
// ChFilterIterator(){}
// private:
// CHAR_TYPE _charType(uint16_t x)const
// {
// if(x < 0x0080)
// {
// return DIGIT_OR_LETTER;
// }
// return CHWORD;
// }
// ChFilterIterator _get(UniConIter iter)
// {
// UniConIter _begin = iter;
// const UniConIter& _end = ptUnico->end();
// if(iter == _end)
// {
// return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
// }
// CHAR_TYPE charType = _charType(*iter);
// iter ++;
// while(iter != _end &&charType == _charType(*iter))
// {
// iter++;
// }
// return ChFilterIterator(ptUnico, _begin, iter, charType);
// }
//};
//class ChineseFilter
//{
// private:
// Unicode _unico;
// public:
// typedef ChFilterIterator iterator;
// public:
// ChineseFilter(){};
// ~ChineseFilter(){};
// public:
// bool feed(const string& str)
// {
// return TransCode::decode(str, _unico);
// }
// iterator begin()
// {
// return iterator(&_unico);
// }
// iterator end()
// {
// return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
// }
//};
/*
* if char is ascii, count the ascii string's length and return 0;
* else count the nonascii string's length and return 1;

View File

@ -3,6 +3,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "Limonp/logger.hpp"
#include "Trie.hpp"
#include "ISegment.hpp"
@ -63,15 +64,16 @@ namespace CppJieba
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
assert(_getInitFlag());
#ifndef NO_CODING_LOG
if (!_getInitFlag())
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end)
{
LogError("not inited.");
return false;
}
if (begin > end)
{
LogError("begin > end");
LogError("begin >= end");
return false;
}
#endif

View File

@ -4,6 +4,7 @@
#include <iostream>
#include <fstream>
#include <memory.h>
#include <cassert>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "TransCode.hpp"
@ -93,11 +94,12 @@ namespace CppJieba
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag());
if(begin == end)
{
return false;

View File

@ -7,6 +7,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "Limonp/logger.hpp"
#include "Trie.hpp"
#include "ISegment.hpp"
@ -73,11 +74,13 @@ namespace CppJieba
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag());
vector<TrieNodeInfo> segWordInfos;
if(!cut(begin, end, segWordInfos))
{
@ -97,27 +100,7 @@ namespace CppJieba
}
return true;
}
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
if(str.empty())
{
return false;
}
Unicode sentence;
if(!TransCode::decode(str, sentence))
{
LogError("TransCode::decode failed.");
return false;
}
return cut(sentence.begin(), sentence.end(), segWordInfos);
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
{
if(!_getInitFlag())
@ -158,25 +141,17 @@ namespace CppJieba
return false;
}
vector<pair<uint, const TrieNodeInfo*> > vp;
for(Unicode::const_iterator it = begin; it != end; it++)
{
segContext.push_back(SegmentChar(*it));
SegmentChar& back = segContext.back();
int i = it - begin;
vp.clear();
if(_trie.find(it, end, vp))
SegmentChar schar(*it);
uint i = it - begin;
_trie.find(it, end, i, schar.dag);
//DagType::iterator dagIter;
if(schar.dag.end() == schar.dag.find(i))
{
for(uint j = 0; j < vp.size(); j++)
{
uint nextp = vp[j].first + i;
back.dag[nextp] = vp[j].second;
}
}
if(back.dag.end() == back.dag.find(i))
{
back.dag[i] = NULL;
schar.dag[i] = NULL;
}
segContext.push_back(schar);
}
return true;
}

View File

@ -1,6 +1,7 @@
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#include <cassert>
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "Limonp/str_functs.hpp"
@ -56,11 +57,12 @@ namespace CppJieba
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag());
if(begin == end)
{
return false;

View File

@ -3,6 +3,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "Limonp/logger.hpp"
#include "Trie.hpp"
#include "ISegment.hpp"
@ -64,15 +65,16 @@ namespace CppJieba
public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
assert(_getInitFlag());
#ifndef NO_CODING_LOG
if (!_getInitFlag())
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end)
{
LogError("not inited.");
return false;
}
if (begin > end)
{
LogError("begin > end");
LogError("begin >= end");
return false;
}
#endif

View File

@ -157,15 +157,6 @@ namespace CppJieba
bool _getInitFlag()const{return _initFlag;};
public:
/**
 * Looks up an encoded string in the trie.
 *
 * Decodes `str` with TransCode::decode and forwards the decoded range to the
 * iterator-based find() overload.
 *
 * @param str  encoded text to look up.
 * @return NULL when decoding fails; otherwise whatever the iterator overload returns.
 */
const TrieNodeInfo* find(const string& str)const
{
    Unicode decoded;
    if(TransCode::decode(str, decoded))
    {
        return find(decoded.begin(), decoded.end());
    }
    return NULL;
}
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
@ -244,14 +235,43 @@ namespace CppJieba
return !res.empty();
}
//bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
//{
// if (!unico.empty())
// {
// return find(unico.begin(), unico.end(), res);
// }
// return false;
//}
/**
 * Walks the trie from the root along [begin, end) and collects every node on
 * that path that is flagged as a leaf.
 *
 * For each such node, a pointer to its entry in _nodeInfoVec is stored in
 * `res` under the key (index of the matched element relative to `begin`)
 * + `offset`. The walk stops at the first element that has no child in the
 * current node.
 *
 * @param begin   start of the range to match.
 * @param end     end of the range to match; must be strictly after `begin`.
 * @param offset  value added to every key written into `res`.
 * @param res     output map; existing entries are kept, matching keys are overwritten.
 * @return false when the trie is not initialised, when the range is empty or
 *         reversed, or when a leaf carries an out-of-range nodeInfoVecPos
 *         (entries written before that point stay in `res`). Otherwise
 *         `!res.empty()` -- note this is also true when `res` already held
 *         entries before the call, even if nothing was matched here.
 */
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, uint offset, unordered_map<uint, const TrieNodeInfo* > & res) const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return false;
    }
    if (begin >= end)
    {
        LogFatal("begin >= end");
        return false;
    }
    TrieNode* p = _root;
    for (Unicode::const_iterator itr = begin; itr != end; ++itr)
    {
        // One lookup per step: reuse the iterator from find() instead of
        // hashing the key a second time through the inserting operator[].
        auto child = p->hmap.find(*itr);
        if(child == p->hmap.end())
        {
            break;
        }
        p = child->second;
        if(p->isLeaf)
        {
            uint pos = p->nodeInfoVecPos;
            if(pos < _nodeInfoVec.size())
            {
                res[itr-begin + offset] = &_nodeInfoVec[pos];
            }
            else
            {
                LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
                return false;
            }
        }
    }
    return !res.empty();
}
public:
double getMinLogFreq()const{return _minLogFreq;};

2
test/Makefile Executable file → Normal file
View File

@ -2,4 +2,4 @@ all: testp demo
testp:
g++ -o test_performance test_performance.cpp -std=c++0x -O3
demo:
g++ -o segment.demo segment.cpp -std=c++0x -L/usr/lib/CppJieba -lcppjieba
g++ -o segment.demo segment.cpp -std=c++0x

View File

@ -1,9 +1,8 @@
#include <iostream>
#include <fstream>
#include <CppJieba/Limonp/ArgvContext.hpp>
#include <CppJieba/MPSegment.h>
#include <CppJieba/HMMSegment.h>
#include <CppJieba/MixSegment.h>
#include "../src/MPSegment.hpp"
#include "../src/HMMSegment.hpp"
#include "../src/MixSegment.hpp"
using namespace CppJieba;
@ -27,8 +26,8 @@ int main(int argc, char ** argv)
{
//demo
{
HMMSegment seg;
if(!seg.init("../dicts/hmm_model.utf8"))
HMMSegment seg("../dicts/hmm_model.utf8");
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
@ -37,8 +36,8 @@ int main(int argc, char ** argv)
seg.dispose();
}
{
MixSegment seg;
if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
@ -47,8 +46,8 @@ int main(int argc, char ** argv)
seg.dispose();
}
{
MPSegment seg;
if(!seg.init("../dicts/jieba.dict.utf8"))
MPSegment seg("../dicts/jieba.dict.utf8");
if(!seg.init())
{
cout<<"seg init failed."<<endl;
return false;