Merge branch 'dev'

This commit is contained in:
wyy 2013-12-12 23:26:13 -08:00
commit 1b1ed6e3aa
10 changed files with 96 additions and 187 deletions

View File

@ -25,7 +25,7 @@ make
sudo make install sudo make install
``` ```
#### 验证 #### 测试
```sh ```sh
/usr/bin/cjseg.sh ../test/testlines.utf8 /usr/bin/cjseg.sh ../test/testlines.utf8
@ -41,7 +41,7 @@ sudo /etc/init.d/cjserver start
sudo /etc/init.d/cjserver stop sudo /etc/init.d/cjserver stop
``` ```
#### 验证服务 #### 测试服务
然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥` 然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥`
(用chrome的原因是chrome的默认编码就是utf-8) (用chrome的原因是chrome的默认编码就是utf-8)

View File

@ -6,99 +6,6 @@
namespace CppJieba namespace CppJieba
{ {
//enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
//typedef Unicode::const_iterator UniConIter;
//class ChineseFilter;
//class ChFilterIterator
//{
// public:
// const Unicode * ptUnico;
// UniConIter begin;
// UniConIter end;
// CHAR_TYPE charType;
// ChFilterIterator& operator++()
// {
// return *this = _get(end);
// }
// ChFilterIterator operator++(int)
// {
// ChFilterIterator res = *this;
// *this = _get(end);
// return res;
// }
// bool operator==(const ChFilterIterator& iter)
// {
// return begin == iter.begin && end == iter.end;
// }
// bool operator!=(const ChFilterIterator& iter)
// {
// return !(*this == iter);
// }
// ChFilterIterator& operator=(const ChFilterIterator& iter)
// {
// ptUnico = iter.ptUnico;
// begin = iter.begin;
// end = iter.end;
// charType = iter.charType;
// return *this;
// }
//
// public:
// ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
// ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
// private:
// ChFilterIterator(){}
// private:
// CHAR_TYPE _charType(uint16_t x)const
// {
// if(x < 0x0080)
// {
// return DIGIT_OR_LETTER;
// }
// return CHWORD;
// }
// ChFilterIterator _get(UniConIter iter)
// {
// UniConIter _begin = iter;
// const UniConIter& _end = ptUnico->end();
// if(iter == _end)
// {
// return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
// }
// CHAR_TYPE charType = _charType(*iter);
// iter ++;
// while(iter != _end &&charType == _charType(*iter))
// {
// iter++;
// }
// return ChFilterIterator(ptUnico, _begin, iter, charType);
// }
//};
//class ChineseFilter
//{
// private:
// Unicode _unico;
// public:
// typedef ChFilterIterator iterator;
// public:
// ChineseFilter(){};
// ~ChineseFilter(){};
// public:
// bool feed(const string& str)
// {
// return TransCode::decode(str, _unico);
// }
// iterator begin()
// {
// return iterator(&_unico);
// }
// iterator end()
// {
// return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
// }
//};
/* /*
* if char is ascii, count the ascii string's length and return 0; * if char is ascii, count the ascii string's length and return 0;
* else count the nonascii string's length and return 1; * else count the nonascii string's length and return 1;

View File

@ -3,6 +3,7 @@
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <cassert>
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
@ -63,15 +64,16 @@ namespace CppJieba
public: public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag());
#ifndef NO_CODING_LOG #ifndef NO_CODING_LOG
if (!_getInitFlag()) //if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false;
}
if (begin > end)
{
LogError("begin > end");
return false; return false;
} }
#endif #endif

View File

@ -4,6 +4,7 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <memory.h> #include <memory.h>
#include <cassert>
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
@ -93,11 +94,12 @@ namespace CppJieba
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
if(!_getInitFlag()) //if(!_getInitFlag())
{ //{
LogError("not inited."); // LogError("not inited.");
return false; // return false;
} //}
assert(_getInitFlag());
if(begin == end) if(begin == end)
{ {
return false; return false;

View File

@ -7,6 +7,7 @@
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <cassert>
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
@ -73,11 +74,13 @@ namespace CppJieba
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
if(!_getInitFlag()) //if(!_getInitFlag())
{ //{
LogError("not inited."); // LogError("not inited.");
return false; // return false;
} //}
assert(_getInitFlag());
vector<TrieNodeInfo> segWordInfos; vector<TrieNodeInfo> segWordInfos;
if(!cut(begin, end, segWordInfos)) if(!cut(begin, end, segWordInfos))
{ {
@ -97,27 +100,7 @@ namespace CppJieba
} }
return true; return true;
} }
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
if(str.empty())
{
return false;
}
Unicode sentence;
if(!TransCode::decode(str, sentence))
{
LogError("TransCode::decode failed.");
return false;
}
return cut(sentence.begin(), sentence.end(), segWordInfos);
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
{ {
if(!_getInitFlag()) if(!_getInitFlag())
@ -158,25 +141,17 @@ namespace CppJieba
return false; return false;
} }
vector<pair<uint, const TrieNodeInfo*> > vp;
for(Unicode::const_iterator it = begin; it != end; it++) for(Unicode::const_iterator it = begin; it != end; it++)
{ {
segContext.push_back(SegmentChar(*it)); SegmentChar schar(*it);
SegmentChar& back = segContext.back(); uint i = it - begin;
int i = it - begin; _trie.find(it, end, i, schar.dag);
vp.clear(); //DagType::iterator dagIter;
if(_trie.find(it, end, vp)) if(schar.dag.end() == schar.dag.find(i))
{ {
for(uint j = 0; j < vp.size(); j++) schar.dag[i] = NULL;
{
uint nextp = vp[j].first + i;
back.dag[nextp] = vp[j].second;
}
}
if(back.dag.end() == back.dag.find(i))
{
back.dag[i] = NULL;
} }
segContext.push_back(schar);
} }
return true; return true;
} }

View File

@ -1,6 +1,7 @@
#ifndef CPPJIEBA_MIXSEGMENT_H #ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H #define CPPJIEBA_MIXSEGMENT_H
#include <cassert>
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"
@ -56,11 +57,12 @@ namespace CppJieba
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
if(!_getInitFlag()) //if(!_getInitFlag())
{ //{
LogError("not inited."); // LogError("not inited.");
return false; // return false;
} //}
assert(_getInitFlag());
if(begin == end) if(begin == end)
{ {
return false; return false;

View File

@ -3,6 +3,7 @@
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <cassert>
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
@ -64,15 +65,16 @@ namespace CppJieba
public: public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag());
#ifndef NO_CODING_LOG #ifndef NO_CODING_LOG
if (!_getInitFlag()) //if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false;
}
if (begin > end)
{
LogError("begin > end");
return false; return false;
} }
#endif #endif

View File

@ -157,15 +157,6 @@ namespace CppJieba
bool _getInitFlag()const{return _initFlag;}; bool _getInitFlag()const{return _initFlag;};
public: public:
const TrieNodeInfo* find(const string& str)const
{
Unicode uintVec;
if(!TransCode::decode(str, uintVec))
{
return NULL;
}
return find(uintVec.begin(), uintVec.end());
}
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{ {
@ -244,14 +235,43 @@ namespace CppJieba
return !res.empty(); return !res.empty();
} }
//bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const bool find(Unicode::const_iterator begin, Unicode::const_iterator end, uint offset, unordered_map<uint, const TrieNodeInfo* > & res) const
//{ {
// if (!unico.empty()) if(!_getInitFlag())
// { {
// return find(unico.begin(), unico.end(), res); LogFatal("trie not initted!");
// } return false;
// return false; }
//} if (begin >= end)
{
LogFatal("begin >= end");
return false;
}
TrieNode* p = _root;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{
if(p->hmap.find(*itr) == p-> hmap.end())
{
break;
}
p = p->hmap[*itr];
if(p->isLeaf)
{
uint pos = p->nodeInfoVecPos;
if(pos < _nodeInfoVec.size())
{
//res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
res[itr-begin + offset] = &_nodeInfoVec[pos];
}
else
{
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
return false;
}
}
}
return !res.empty();
}
public: public:
double getMinLogFreq()const{return _minLogFreq;}; double getMinLogFreq()const{return _minLogFreq;};

2
test/Makefile Executable file → Normal file
View File

@ -2,4 +2,4 @@ all: testp demo
testp: testp:
g++ -o test_performance test_performance.cpp -std=c++0x -O3 g++ -o test_performance test_performance.cpp -std=c++0x -O3
demo: demo:
g++ -o segment.demo segment.cpp -std=c++0x -L/usr/lib/CppJieba -lcppjieba g++ -o segment.demo segment.cpp -std=c++0x

View File

@ -1,9 +1,8 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <CppJieba/Limonp/ArgvContext.hpp> #include "../src/MPSegment.hpp"
#include <CppJieba/MPSegment.h> #include "../src/HMMSegment.hpp"
#include <CppJieba/HMMSegment.h> #include "../src/MixSegment.hpp"
#include <CppJieba/MixSegment.h>
using namespace CppJieba; using namespace CppJieba;
@ -27,8 +26,8 @@ int main(int argc, char ** argv)
{ {
//demo //demo
{ {
HMMSegment seg; HMMSegment seg("../dicts/hmm_model.utf8");
if(!seg.init("../dicts/hmm_model.utf8")) if(!seg.init())
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
@ -37,8 +36,8 @@ int main(int argc, char ** argv)
seg.dispose(); seg.dispose();
} }
{ {
MixSegment seg; MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8")) if(!seg.init())
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
@ -47,8 +46,8 @@ int main(int argc, char ** argv)
seg.dispose(); seg.dispose();
} }
{ {
MPSegment seg; MPSegment seg("../dicts/jieba.dict.utf8");
if(!seg.init("../dicts/jieba.dict.utf8")) if(!seg.init())
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return false; return false;