merge dev

This commit is contained in:
wyy 2014-04-11 12:37:55 +08:00
commit 9a78765ecc
20 changed files with 232 additions and 210 deletions

View File

@ -22,4 +22,3 @@ ADD_SUBDIRECTORY(test)
ENABLE_TESTING() ENABLE_TESTING()
ADD_TEST(NAME test.run COMMAND test.run) ADD_TEST(NAME test.run COMMAND test.run)
ADD_TEST(NAME load_test COMMAND load_test)

View File

@ -2,12 +2,15 @@
1. 适配更低级版本的`g++``cmake`,已在`g++ 4.1.2``cmake 2.6`上测试通过。 1. 适配更低级版本的`g++``cmake`,已在`g++ 4.1.2``cmake 2.6`上测试通过。
2. 修改一些测试用例的文件,减少测试时编译的时间。 2. 修改一些测试用例的文件,减少测试时编译的时间。
3. 修复`make install`相关的问题。
4. 增加HTTP服务的POST请求接口。
5. 修复`Trie`这个类潜在的bug，并完善单元测试。
## v2.3.4 ## v2.3.4
1. 修改了设计上的问题,删除了`TrieManager`这个类,以避免造成一些可能的隐患。 1. 修改了设计上的问题,删除了`TrieManager`这个类,以避免造成一些可能的隐患。
2. 增加`stop_words.utf8`词典,并修改`KeywordExtractor`的初始化函数用以使用此词典。 2. 增加`stop_words.utf8`词典,并修改`KeywordExtractor`的初始化函数用以使用此词典。
3. 优化了Trie树相关部分代码结构。 3. 优化了`Trie`树相关部分代码结构。
## v2.3.3 ## v2.3.3

View File

@ -1,8 +1,8 @@
#CppJieba是"结巴"中文分词的C++版本 #CppJieba是"结巴"中文分词的C++版本
功能性的代码全写成hpp文件，此处的hpp文件是将cpp和h两个文件全都写在hpp文件里面，当然需要遵守相关约束。 功能性的代码全写成hpp文件。文件依赖一直是很让人讨厌的东西，全做成hpp头文件形式的目的就是为了省去链接的依赖。
之所以全写成hpp文件是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。 **没有依赖,就没有伤害。**
实践证明写成hpp使用起来真的很爽，在后面提到的在iOS应用中的使用和包装成`Node.js`的扩展[NodeJieba]都特别顺利。 实践证明写成hpp使用起来真的很爽，在后面提到的在iOS应用中的使用和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
@ -57,8 +57,37 @@ sudo /etc/init.d/cjserver stop
然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥` 然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥`
(用chrome的原因是chrome的默认编码就是utf-8) (用chrome的原因是chrome的默认编码就是utf-8)
或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`) 或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
你可以看到返回的结果如下:(返回结果是json格式)
```
["南京市", "长江大桥"]
```
如果你使用如下调用方式:
```
curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple"
```
则返回结果如下:(返回结果按空格隔开)
```
南京市 长江大桥
```
同时也支持HTTP POST模式使用如下调用:
```
curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
```
返回结果如下:
```
["南京市", "长江大桥"]
```
### 卸载 ### 卸载
```sh ```sh

View File

@ -9,7 +9,4 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp)

View File

@ -48,7 +48,7 @@ namespace CppJieba
} }
//result of searching in trie tree //result of searching in trie tree
vector<pair<size_t, const TrieNodeInfo*> > tRes; DagType tRes;
//max index of res's words //max index of res's words
int maxIdx = 0; int maxIdx = 0;
@ -61,9 +61,10 @@ namespace CppJieba
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{ {
//find word start from uItr //find word start from uItr
if (_trie.find(uItr, end, tRes)) if (_trie.find(uItr, end, tRes, 0))
{ {
for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {
wordLen = itr->second->word.size(); wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))

View File

@ -131,15 +131,11 @@ namespace CppJieba
size_t Y = STATUS_SUM; size_t Y = STATUS_SUM;
size_t X = end - begin; size_t X = end - begin;
size_t XYSize = X * Y; size_t XYSize = X * Y;
int * path;
double * weight;
size_t now, old, stat; size_t now, old, stat;
double tmp, endE, endS; double tmp, endE, endS;
path = new int [XYSize]; vector<int> path(XYSize);
assert(path); vector<double> weight(XYSize);
weight = new double [XYSize];
assert(weight);
//start //start
for(size_t y = 0; y < Y; y++) for(size_t y = 0; y < Y; y++)
@ -147,8 +143,10 @@ namespace CppJieba
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1; path[0 + y * X] = -1;
} }
//process
//for(; begin != end; begin++)
double emitProb;
for(size_t x = 1; x < X; x++) for(size_t x = 1; x < X; x++)
{ {
for(size_t y = 0; y < Y; y++) for(size_t y = 0; y < Y; y++)
@ -156,10 +154,11 @@ namespace CppJieba
now = x + y*X; now = x + y*X;
weight[now] = MIN_DOUBLE; weight[now] = MIN_DOUBLE;
path[now] = E; // warning path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) for(size_t preY = 0; preY < Y; preY++)
{ {
old = x - 1 + preY * X; old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now]) if(tmp > weight[now])
{ {
weight[now] = tmp; weight[now] = tmp;
@ -188,8 +187,6 @@ namespace CppJieba
stat = path[x + stat*X]; stat = path[x + stat*X];
} }
delete [] path;
delete [] weight;
return true; return true;
} }
bool _loadModel(const char* const filePath) bool _loadModel(const char* const filePath)

View File

@ -1,5 +0,0 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
FILE(GLOB SRCS *.hpp)
INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky)

View File

@ -37,13 +37,14 @@ namespace Husky
virtual ~IRequestHandler(){}; virtual ~IRequestHandler(){};
public: public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& res) const = 0; virtual bool do_GET(const HttpReqInfo& httpReq, string& res) const = 0;
virtual bool do_POST(const HttpReqInfo& httpReq, string& res) const = 0;
}; };
class EpollServer class EpollServer
{ {
private: private:
static const size_t LISTEN_QUEUE_LEN = 1024; static const size_t LISTEN_QUEUE_LEN = 1024;
static const size_t RECV_BUFFER_SIZE = 1024 * 8; static const size_t RECV_BUFFER_SIZE = 1024*4;
static const int MAXEPOLLSIZE = 512; static const int MAXEPOLLSIZE = 512;
private: private:
@ -59,10 +60,10 @@ namespace Husky
bool _setInitFlag(bool flag) {return _isInited = flag;} bool _setInitFlag(bool flag) {return _isInited = flag;}
public: public:
explicit EpollServer(uint port, const IRequestHandler* pHandler): _reqHandler(pHandler), _host_socket(-1), _isShutDown(false), _epollSize(0) explicit EpollServer(uint port, const IRequestHandler* pHandler): _reqHandler(pHandler), _host_socket(-1), _isShutDown(false), _epollSize(0)
{ {
assert(_reqHandler); assert(_reqHandler);
_setInitFlag(_init_epoll(port)); _setInitFlag(_init_epoll(port));
}; };
~EpollServer(){};// unfinished; ~EpollServer(){};// unfinished;
public: public:
operator bool() const operator bool() const
@ -75,10 +76,9 @@ namespace Husky
//int clientSock; //int clientSock;
sockaddr_in clientaddr; sockaddr_in clientaddr;
socklen_t nSize = sizeof(clientaddr); socklen_t nSize = sizeof(clientaddr);
//char recvBuf[RECV_BUFFER_SIZE];
struct epoll_event events[MAXEPOLLSIZE]; struct epoll_event events[MAXEPOLLSIZE];
int nfds, clientSock; int nfds, clientSock;
while(!_isShutDown) while(!_isShutDown)
{ {
if(-1 == (nfds = epoll_wait(_epoll_fd, events, _epollSize, -1))) if(-1 == (nfds = epoll_wait(_epoll_fd, events, _epollSize, -1)))
@ -88,7 +88,7 @@ namespace Husky
} }
//LogDebug("epoll_wait return event sum[%d]", nfds); //LogDebug("epoll_wait return event sum[%d]", nfds);
for(int i = 0; i < nfds; i++) for(int i = 0; i < nfds; i++)
{ {
if(events[i].data.fd == _host_socket) /*new connect coming.*/ if(events[i].data.fd == _host_socket) /*new connect coming.*/
@ -106,7 +106,7 @@ namespace Husky
} }
//LogInfo("connecting from: %d:%d client socket: %d\n", inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port), clientSock); //LogInfo("connecting from: %d:%d client socket: %d\n", inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port), clientSock);
/* inet_ntoa is not thread-safe in some versions */ /* inet_ntoa is not thread-safe in some versions */
//_sockIpMap[clientSock] = inet_ntoa(clientaddr.sin_addr); //_sockIpMap[clientSock] = inet_ntoa(clientaddr.sin_addr);
@ -119,7 +119,7 @@ namespace Husky
_closesocket(events[i].data.fd); _closesocket(events[i].data.fd);
} }
} }
} }
return true; return true;
} }
@ -190,26 +190,42 @@ namespace Husky
} }
string strRec, strSnd, strRetByHandler; string strRec, strSnd, strRetByHandler;
strRec.resize(RECV_BUFFER_SIZE); char recvBuf[RECV_BUFFER_SIZE];
int nRetCode = recv(sockfd, (char*)strRec.c_str(), strRec.size(), 0); int nRetCode = -1;
if(-1 == nRetCode) while(true)
{ {
LogDebug(strerror(errno)); memset(recvBuf, 0, sizeof(recvBuf));
return false; nRetCode = recv(sockfd, recvBuf, sizeof(recvBuf) - 1, 0);
} if(-1 == nRetCode)
if(0 == nRetCode) {
{ LogDebug(strerror(errno));
LogDebug("client socket closed gracefully."); return false;
return false; }
if(0 == nRetCode)
{
LogDebug("client socket orderly shut down");
return false;
}
strRec += recvBuf;
if(nRetCode != sizeof(recvBuf) - 1)
{
break;
}
} }
HttpReqInfo httpReq(strRec); HttpReqInfo httpReq(strRec);
if(!_reqHandler->do_GET(httpReq, strRetByHandler)) if("GET" == httpReq.getMethod() && !_reqHandler->do_GET(httpReq, strRetByHandler))
{ {
LogError("do_GET failed."); LogError("do_GET failed.");
return false; return false;
} }
if("POST" == httpReq.getMethod() && !_reqHandler->do_POST(httpReq, strRetByHandler))
{
LogError("do_POST failed.");
return false;
}
string_format(strSnd, HTTP_FORMAT, CHARSET_UTF8, strRetByHandler.length(), strRetByHandler.c_str()); string_format(strSnd, HTTP_FORMAT, CHARSET_UTF8, strRetByHandler.length(), strRetByHandler.c_str());
if(-1 == send(sockfd, strSnd.c_str(), strSnd.length(), 0)) if(-1 == send(sockfd, strSnd.c_str(), strSnd.length(), 0))
{ {
LogError(strerror(errno)); LogError(strerror(errno));

View File

@ -135,7 +135,8 @@ namespace Husky
//message header end //message header end
//body begin //body begin
_body.assign(headerStr.substr(rpos));
trim(_body);
} }
public: public:
string& operator[] (const string& key) string& operator[] (const string& key)
@ -150,14 +151,23 @@ namespace Husky
{ {
return _find(_methodGetMap, argKey, res); return _find(_methodGetMap, argKey, res);
} }
bool POST(const string& argKey, string& res)const //bool POST(const string& argKey, string& res)const
//{
// return _find(_methodPostMap, argKey, res);
//}
const string& getMethod() const
{ {
return _find(_methodPostMap, argKey, res); return _headerMap.find(KEY_METHOD)->second;
}
const string& getBody() const
{
return _body;
} }
private: private:
std::unordered_map<string, string> _headerMap; std::unordered_map<string, string> _headerMap;
std::unordered_map<string, string> _methodGetMap; std::unordered_map<string, string> _methodGetMap;
std::unordered_map<string, string> _methodPostMap; //std::unordered_map<string, string> _methodPostMap;
string _body;
//public: //public:
friend ostream& operator<<(ostream& os, const HttpReqInfo& obj); friend ostream& operator<<(ostream& os, const HttpReqInfo& obj);
private: private:
@ -215,7 +225,7 @@ namespace Husky
inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj) inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj)
{ {
return os << obj._headerMap << obj._methodGetMap << obj._methodPostMap; return os << obj._headerMap << obj._methodGetMap/* << obj._methodPostMap*/ << obj._body;
} }
} }

View File

@ -1,4 +0,0 @@
INSTALL(FILES ArgvContext.hpp InitOnOff.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp
cast_functs.hpp io_functs.hpp map_functs.hpp NonCopyable.hpp
Config.hpp logger.hpp md5.hpp std_outbound.hpp
DESTINATION include/CppJieba/Limonp)

View File

@ -24,7 +24,7 @@ namespace CppJieba
const TrieNodeInfo * pInfo; const TrieNodeInfo * pInfo;
double weight; double weight;
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) SegmentChar():uniCh(0), pInfo(NULL), weight(0.0)
{} {}
}; };
typedef vector<SegmentChar> SegmentContext; typedef vector<SegmentChar> SegmentContext;
@ -59,18 +59,22 @@ namespace CppJieba
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
if(begin == end)
vector<TrieNodeInfo> segWordInfos;
if(!cut(begin, end, segWordInfos))
{ {
return false; return false;
} }
string tmp;
for(size_t i = 0; i < segWordInfos.size(); i++) vector<Unicode> words;
if(!cut(begin, end, words))
{ {
if(TransCode::encode(segWordInfos[i].word, tmp)) return false;
}
string word;
for(size_t i = 0; i < words.size(); i++)
{
if(TransCode::encode(words[i], word))
{ {
res.push_back(tmp); res.push_back(word);
} }
else else
{ {
@ -80,7 +84,7 @@ namespace CppJieba
return true; return true;
} }
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
@ -88,7 +92,6 @@ namespace CppJieba
return false; return false;
} }
SegmentContext segContext; SegmentContext segContext;
//calc DAG //calc DAG
if(!_calcDAG(begin, end, segContext)) if(!_calcDAG(begin, end, segContext))
{ {
@ -102,7 +105,7 @@ namespace CppJieba
return false; return false;
} }
if(!_cut(segContext, segWordInfos)) if(!_cut(segContext, res))
{ {
LogError("_cut failed."); LogError("_cut failed.");
return false; return false;
@ -114,21 +117,17 @@ namespace CppJieba
private: private:
bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, SegmentContext& segContext) const bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, SegmentContext& segContext) const
{ {
if(begin >= end) SegmentChar schar;
{ size_t offset;
LogError("begin >= end.");
return false;
}
for(Unicode::const_iterator it = begin; it != end; it++) for(Unicode::const_iterator it = begin; it != end; it++)
{ {
SegmentChar schar(*it); schar.uniCh = *it;
size_t i = it - begin; offset = it - begin;
_trie.find(it, end, i, schar.dag); schar.dag.clear();
//DagType::iterator dagIter; _trie.find(it, end, schar.dag, offset);
if(schar.dag.end() == schar.dag.find(i)) if(!isIn(schar.dag, offset))
{ {
schar.dag[i] = NULL; schar.dag[offset] = NULL;
} }
segContext.push_back(schar); segContext.push_back(schar);
} }
@ -142,15 +141,19 @@ namespace CppJieba
return false; return false;
} }
size_t nextPos;
const TrieNodeInfo* p;
double val;
for(int i = segContext.size() - 1; i >= 0; i--) for(int i = segContext.size() - 1; i >= 0; i--)
{ {
segContext[i].pInfo = NULL; segContext[i].pInfo = NULL;
segContext[i].weight = MIN_DOUBLE; segContext[i].weight = MIN_DOUBLE;
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
{ {
size_t nextPos = it->first; nextPos = it->first;
const TrieNodeInfo* p = it->second; p = it->second;
double val = 0.0; val = 0.0;
if(nextPos + 1 < segContext.size()) if(nextPos + 1 < segContext.size())
{ {
val += segContext[nextPos + 1].weight; val += segContext[nextPos + 1].weight;
@ -174,7 +177,7 @@ namespace CppJieba
return true; return true;
} }
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const bool _cut(SegmentContext& segContext, vector<Unicode>& res)const
{ {
size_t i = 0; size_t i = 0;
while(i < segContext.size()) while(i < segContext.size())
@ -182,16 +185,12 @@ namespace CppJieba
const TrieNodeInfo* p = segContext[i].pInfo; const TrieNodeInfo* p = segContext[i].pInfo;
if(p) if(p)
{ {
res.push_back(*p); res.push_back(p->word);
i += p->word.size(); i += p->word.size();
} }
else//single chinese word else//single chinese word
{ {
TrieNodeInfo nodeInfo; res.push_back(Unicode(1, segContext[i].uniCh));
nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.freq = 0;
nodeInfo.logFreq = _trie.getMinLogFreq();
res.push_back(nodeInfo);
i++; i++;
} }
} }

View File

@ -44,14 +44,8 @@ namespace CppJieba
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
if(begin >= end) vector<Unicode> words;
{ if(!_mpSeg.cut(begin, end, words))
LogError("begin >= end");
return false;
}
vector<TrieNodeInfo> infos;
if(!_mpSeg.cut(begin, end, infos))
{ {
LogError("mpSeg cutDAG failed."); LogError("mpSeg cutDAG failed.");
return false; return false;
@ -59,20 +53,20 @@ namespace CppJieba
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
Unicode piece; Unicode piece;
for (size_t i = 0, j = 0; i < infos.size(); i++) for (size_t i = 0, j = 0; i < words.size(); i++)
{ {
//if mp get a word, it's ok, put it into result //if mp get a word, it's ok, put it into result
if (1 != infos[i].word.size()) if (1 != words[i].size())
{ {
res.push_back(infos[i].word); res.push_back(words[i]);
continue; continue;
} }
// if mp get a single one, collect it in sequence // if mp get a single one, collect it in sequence
j = i; j = i;
while (j < infos.size() && infos[j].word.size() == 1) while (j < words.size() && words[j].size() == 1)
{ {
piece.push_back(infos[j].word[0]); piece.push_back(words[j][0]);
j++; j++;
} }

View File

@ -22,12 +22,8 @@ namespace CppJieba
virtual bool cut(const string& str, vector<string>& res)const virtual bool cut(const string& str, vector<string>& res)const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
Unicode unico; Unicode unico;
res.clear();
#ifdef NO_FILTER #ifdef NO_FILTER
if(!TransCode::decode(str, unico)) if(!TransCode::decode(str, unico))
{ {

View File

@ -15,7 +15,7 @@ namespace CppJieba
typedef std::vector<uint16_t> Unicode; typedef std::vector<uint16_t> Unicode;
namespace TransCode namespace TransCode
{ {
inline bool decode(const string& str, vector<uint16_t>& vec) inline bool decode(const string& str, Unicode& vec)
{ {
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(str, vec); return gbkTrans(str, vec);
@ -24,7 +24,7 @@ namespace CppJieba
#endif #endif
} }
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res) inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
{ {
#ifdef CPPJIEBA_GBK #ifdef CPPJIEBA_GBK
return gbkTrans(begin, end, res); return gbkTrans(begin, end, res);
@ -33,7 +33,7 @@ namespace CppJieba
#endif #endif
} }
inline bool encode(const vector<uint16_t>& uni, string& res) inline bool encode(const Unicode& uni, string& res)
{ {
return encode(uni.begin(), uni.end(), res); return encode(uni.begin(), uni.end(), res);
} }

View File

@ -26,16 +26,13 @@ namespace CppJieba
const double MAX_DOUBLE = 3.14e+100; const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3; const size_t DICT_COLUMN_NUM = 3;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap; typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNodeInfo;
struct TrieNode struct TrieNode
{ {
TrieNodeMap hmap; TrieNodeMap hmap;
bool isLeaf; const TrieNodeInfo * ptTrieNodeInfo;
size_t nodeInfoPos; TrieNode(): ptTrieNodeInfo(NULL)
TrieNode() {}
{
isLeaf = false;
nodeInfoPos = 0;
}
}; };
struct TrieNodeInfo struct TrieNodeInfo
@ -44,12 +41,6 @@ namespace CppJieba
size_t freq; size_t freq;
string tag; string tag;
double logFreq; //logFreq = log(freq/sum(freq)); double logFreq; //logFreq = log(freq/sum(freq));
TrieNodeInfo():freq(0),logFreq(0.0)
{}
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
{}
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
{}
}; };
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo) inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
@ -72,33 +63,32 @@ namespace CppJieba
public: public:
Trie() Trie()
{ {
_root = NULL; _root = new TrieNode;
_freqSum = 0; _freqSum = 0;
_minLogFreq = MAX_DOUBLE; _minLogFreq = MAX_DOUBLE;
_setInitFlag(false); _setInitFlag(false);
} }
Trie(const string& filePath) Trie(const string& filePath)
{ {
Trie(); new (this) Trie();
_setInitFlag(init(filePath)); _setInitFlag(init(filePath));
} }
~Trie() ~Trie()
{ {
_deleteNode(_root); _deleteNode(_root);
} }
private:
public: public:
bool init(const string& filePath) bool init(const string& filePath)
{ {
assert(!_getInitFlag()); assert(!_getInitFlag());
_loadDict(filePath, _nodeInfos);
_root = new TrieNode; _createTrie(_nodeInfos, _root);
assert(_root); _freqSum = _calculateFreqSum(_nodeInfos);
if(!_trieInsert(filePath)) assert(_freqSum);
{ _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
LogError("_trieInsert failed.");
return false;
}
_countWeight();
return _setInitFlag(true); return _setInitFlag(true);
} }
@ -106,47 +96,22 @@ namespace CppJieba
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{ {
TrieNodeMap::const_iterator citer; TrieNodeMap::const_iterator citer;
TrieNode* p = _root; const TrieNode* p = _root;
for(Unicode::const_iterator it = begin; it != end; it++) for(Unicode::const_iterator it = begin; it != end; it++)
{ {
uint16_t chUni = *it; citer = p->hmap.find(*it);
citer = p->hmap.find(chUni); if(p->hmap.end() == citer)
if(p-> hmap.end() == citer)
{ {
return NULL; return NULL;
} }
p = citer->second; p = citer->second;
} }
if(p->isLeaf) return p->ptTrieNodeInfo;
{
return &(_nodeInfos[p->nodeInfoPos]);
}
return NULL;
} }
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<size_t, const TrieNodeInfo*> >& res) const bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const
{ {
TrieNodeMap::const_iterator citer; const TrieNode* p = _root;
TrieNode* p = _root;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{
citer = p->hmap.find(*itr);
if(p->hmap.end() == citer)
{
break;
}
p = citer->second;
if(p->isLeaf)
{
res.push_back(make_pair(itr-begin, &_nodeInfos[p->nodeInfoPos]));
}
}
return !res.empty();
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const
{
TrieNode* p = _root;
TrieNodeMap::const_iterator citer; TrieNodeMap::const_iterator citer;
for (Unicode::const_iterator itr = begin; itr != end; itr++) for (Unicode::const_iterator itr = begin; itr != end; itr++)
{ {
@ -156,9 +121,9 @@ namespace CppJieba
break; break;
} }
p = citer->second; p = citer->second;
if(p->isLeaf) if(p->ptTrieNodeInfo)
{ {
res[itr - begin + offset] = &_nodeInfos[p->nodeInfoPos]; res[itr - begin + offset] = p->ptTrieNodeInfo;
} }
} }
return !res.empty(); return !res.empty();
@ -168,43 +133,44 @@ namespace CppJieba
double getMinLogFreq() const {return _minLogFreq;}; double getMinLogFreq() const {return _minLogFreq;};
private: private:
void _insert(const TrieNodeInfo& nodeInfo, size_t nodeInfoPos) void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const
{ {
const Unicode& unico = nodeInfo.word; const Unicode& unico = nodeInfo.word;
TrieNode* p = _root; TrieNodeMap::const_iterator citer;
for(size_t i = 0; i < unico.size(); i++) for(size_t i = 0; i < unico.size(); i++)
{ {
uint16_t cu = unico[i]; uint16_t cu = unico[i];
assert(p); assert(ptNode);
if(!isIn(p->hmap, cu)) citer = ptNode->hmap.find(cu);
if(ptNode->hmap.end() == citer)
{ {
TrieNode * next = new TrieNode; TrieNode * next = new TrieNode;
assert(next); ptNode->hmap[cu] = next;
p->hmap[cu] = next; ptNode = next;
p = next;
} }
else else
{ {
p = p->hmap[cu]; ptNode = citer->second;
} }
} }
p->isLeaf = true; ptNode->ptTrieNodeInfo = &nodeInfo;
p->nodeInfoPos = nodeInfoPos;
} }
private: private:
bool _trieInsert(const string& filePath) void _loadDict(const string& filePath, vector<TrieNodeInfo>& nodeInfos) const
{ {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs) if(!ifs)
{ {
LogError("open %s failed.", filePath.c_str()); LogFatal("open %s failed.", filePath.c_str());
return false; exit(1);
} }
string line; string line;
vector<string> buf; vector<string> buf;
nodeInfos.clear();
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++) for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{ {
@ -213,43 +179,46 @@ namespace CppJieba
if(!TransCode::decode(buf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word))
{ {
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
return false; continue;
} }
nodeInfo.freq = atoi(buf[1].c_str()); nodeInfo.freq = atoi(buf[1].c_str());
nodeInfo.tag = buf[2]; nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo); nodeInfos.push_back(nodeInfo);
} }
}
bool _createTrie(const vector<TrieNodeInfo>& nodeInfos, TrieNode * ptNode)
{
for(size_t i = 0; i < _nodeInfos.size(); i++) for(size_t i = 0; i < _nodeInfos.size(); i++)
{ {
_insert(_nodeInfos[i], i); _insertNode(_nodeInfos[i], ptNode);
} }
return true; return true;
} }
void _countWeight() size_t _calculateFreqSum(const vector<TrieNodeInfo>& nodeInfos) const
{ {
//freq total freq size_t freqSum = 0;
_freqSum = 0; for(size_t i = 0; i < nodeInfos.size(); i++)
for(size_t i = 0; i < _nodeInfos.size(); i++)
{ {
_freqSum += _nodeInfos[i].freq; freqSum += nodeInfos[i].freq;
} }
return freqSum;
assert(_freqSum); }
double _calculateLogFreqAndGetMinValue(vector<TrieNodeInfo>& nodeInfos, size_t freqSum) const
//normalize {
for(size_t i = 0; i < _nodeInfos.size(); i++) assert(freqSum);
double minLogFreq = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{ {
TrieNodeInfo& nodeInfo = _nodeInfos[i]; TrieNodeInfo& nodeInfo = nodeInfos[i];
assert(nodeInfo.freq); assert(nodeInfo.freq);
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
if(_minLogFreq > nodeInfo.logFreq) if(minLogFreq > nodeInfo.logFreq)
{ {
_minLogFreq = nodeInfo.logFreq; minLogFreq = nodeInfo.logFreq;
} }
} }
return minLogFreq;
} }
void _deleteNode(TrieNode* node) void _deleteNode(TrieNode* node)

View File

@ -26,6 +26,18 @@ class ReqHandler: public IRequestHandler
httpReq.GET("key", tmp); httpReq.GET("key", tmp);
URLDecode(tmp, sentence); URLDecode(tmp, sentence);
_segment.cut(sentence, words); _segment.cut(sentence, words);
if(httpReq.GET("format", tmp) && tmp == "simple")
{
join(words.begin(), words.end(), strSnd, " ");
return true;
}
strSnd << words;
return true;
}
virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const
{
vector<string> words;
_segment.cut(httpReq.getBody(), words);
strSnd << words; strSnd << words;
return true; return true;
} }

View File

@ -2,6 +2,6 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
ADD_EXECUTABLE(segment.demo segment.cpp) ADD_EXECUTABLE(segment.demo segment.cpp)
ADD_EXECUTABLE(keyword.demo keyword_demo.cpp) ADD_EXECUTABLE(keyword.demo keyword_demo.cpp)
ADD_EXECUTABLE(tagging_demo tagging_demo.cpp) ADD_EXECUTABLE(tagging.demo tagging_demo.cpp)
ADD_EXECUTABLE(load_test load_test.cpp) ADD_EXECUTABLE(load_test load_test.cpp)
ADD_SUBDIRECTORY(unittest) ADD_SUBDIRECTORY(unittest)

View File

@ -9,7 +9,7 @@
using namespace CppJieba; using namespace CppJieba;
void cut(const ISegment * seg, const char * const filePath, size_t times = 10) void cut(const ISegment * seg, const char * const filePath, size_t times = 30)
{ {
ifstream ifile(filePath); ifstream ifile(filePath);
if(!ifile) if(!ifile)
@ -23,10 +23,12 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
loadFile2Str(filePath, doc); loadFile2Str(filePath, doc);
for(uint i = 0; i < times; i ++) for(uint i = 0; i < times; i ++)
{ {
LogInfo("times[%u]", i); printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
fflush(stdout);
res.clear(); res.clear();
seg->cut(doc, res); seg->cut(doc, res);
} }
printf("\n");
} }
int main(int argc, char ** argv) int main(int argc, char ** argv)
@ -40,6 +42,6 @@ int main(int argc, char ** argv)
long beginTime = clock(); long beginTime = clock();
cut(&seg, "../test/testdata/weicheng.utf8"); cut(&seg, "../test/testdata/weicheng.utf8");
long endTime = clock(); long endTime = clock();
printf("[%.3lf seconds]time consumeed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC); printf("[%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -80,7 +80,6 @@ TEST(MPSegmentTest, Test2)
res += line; res += line;
res += '\n'; res += '\n';
words.clear();
segment.cut(line, words); segment.cut(line, words);
string s; string s;
s << words; s << words;

View File

@ -5,8 +5,18 @@ using namespace CppJieba;
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
TEST(TrieTest, NewAndDelete)
{
Trie * trie;
trie = new Trie(DICT_FILE);
delete trie;
trie = new Trie();
delete trie;
}
TEST(TrieTest, Test1) TEST(TrieTest, Test1)
{ {
string s1, s2; string s1, s2;
Trie trie; Trie trie;
ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_TRUE(trie.init(DICT_FILE));
@ -26,7 +36,7 @@ TEST(TrieTest, Test1)
word = "清华大学"; word = "清华大学";
vector<pair<size_t, const TrieNodeInfo*> > res; vector<pair<size_t, const TrieNodeInfo*> > res;
map<size_t, const TrieNodeInfo* > resMap; map<size_t, const TrieNodeInfo* > resMap;
map<size_t, const TrieNodeInfo* > map; map<size_t, const TrieNodeInfo* > mp;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
{ {
@ -40,10 +50,8 @@ TEST(TrieTest, Test1)
vector<pair<size_t, const TrieNodeInfo*> > vec; vector<pair<size_t, const TrieNodeInfo*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
//print(uni); //print(uni);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec)); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
ASSERT_EQ(vec, res); ASSERT_EQ(mp, resMap);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), 0, map));
ASSERT_EQ(map, resMap);
// print(vec); // print(vec);
} }