merge dev

This commit is contained in:
wyy 2014-04-11 12:37:55 +08:00
commit 9a78765ecc
20 changed files with 232 additions and 210 deletions

View File

@ -22,4 +22,3 @@ ADD_SUBDIRECTORY(test)
ENABLE_TESTING()
ADD_TEST(NAME test.run COMMAND test.run)
ADD_TEST(NAME load_test COMMAND load_test)

View File

@ -2,12 +2,15 @@
1. 适配更低级版本的`g++``cmake`,已在`g++ 4.1.2``cmake 2.6`上测试通过。
2. 修改一些测试用例的文件,减少测试时编译的时间。
3. 修复`make install`相关的问题。
4. 增加HTTP服务的POST请求接口。
5. 修复`Trie`这个类潜在的bug,并完善单元测试。
## v2.3.4
1. 修改了设计上的问题,删除了`TrieManager`这个类,以避免造成一些可能的隐患。
2. 增加`stop_words.utf8`词典,并修改`KeywordExtractor`的初始化函数用以使用此词典。
3. 优化了Trie树相关部分代码结构。
3. 优化了`Trie`树相关部分代码结构。
## v2.3.3

View File

@ -1,8 +1,8 @@
#CppJieba是"结巴"中文分词的C++版本
功能性的代码全写成hpp文件,此处的hpp文件是将cpp和h两个文件全都写在hpp文件里面,当然需要遵守相关约束。
功能性的代码全写成hpp文件,文件依赖一直是很让人讨厌的东西,全做成hpp头文件形式的目的就是为了省去链接的依赖。
之所以全写成hpp文件是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。
**没有依赖,就没有伤害。**
实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
@ -57,8 +57,37 @@ sudo /etc/init.d/cjserver stop
然后用chrome浏览器打开`http://127.0.0.1:11200/?key=南京市长江大桥`
(用chrome的原因是chrome的默认编码就是utf-8)
或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
你可以看到返回的结果如下:(返回结果是json格式)
```
["南京市", "长江大桥"]
```
如果你使用如下调用方式:
```
curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple"
```
则返回结果如下:(返回结果按空格隔开)
```
南京市 长江大桥
```
同时也支持HTTP POST模式,使用如下调用:
```
curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
```
返回结果如下:
```
["南京市", "长江大桥"]
```
### 卸载
```sh

View File

@ -9,7 +9,4 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp)

View File

@ -48,7 +48,7 @@ namespace CppJieba
}
//result of searching in trie tree
vector<pair<size_t, const TrieNodeInfo*> > tRes;
DagType tRes;
//max index of res's words
int maxIdx = 0;
@ -61,9 +61,10 @@ namespace CppJieba
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{
//find word start from uItr
if (_trie.find(uItr, end, tRes))
if (_trie.find(uItr, end, tRes, 0))
{
for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{
wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))

View File

@ -131,15 +131,11 @@ namespace CppJieba
size_t Y = STATUS_SUM;
size_t X = end - begin;
size_t XYSize = X * Y;
int * path;
double * weight;
size_t now, old, stat;
double tmp, endE, endS;
path = new int [XYSize];
assert(path);
weight = new double [XYSize];
assert(weight);
vector<int> path(XYSize);
vector<double> weight(XYSize);
//start
for(size_t y = 0; y < Y; y++)
@ -147,8 +143,10 @@ namespace CppJieba
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
//process
//for(; begin != end; begin++)
double emitProb;
for(size_t x = 1; x < X; x++)
{
for(size_t y = 0; y < Y; y++)
@ -156,10 +154,11 @@ namespace CppJieba
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++)
{
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now])
{
weight[now] = tmp;
@ -188,8 +187,6 @@ namespace CppJieba
stat = path[x + stat*X];
}
delete [] path;
delete [] weight;
return true;
}
bool _loadModel(const char* const filePath)

View File

@ -1,5 +0,0 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
FILE(GLOB SRCS *.hpp)
INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky)

View File

@ -37,13 +37,14 @@ namespace Husky
virtual ~IRequestHandler(){};
public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& res) const = 0;
virtual bool do_POST(const HttpReqInfo& httpReq, string& res) const = 0;
};
class EpollServer
{
private:
static const size_t LISTEN_QUEUE_LEN = 1024;
static const size_t RECV_BUFFER_SIZE = 1024 * 8;
static const size_t RECV_BUFFER_SIZE = 1024*4;
static const int MAXEPOLLSIZE = 512;
private:
@ -59,10 +60,10 @@ namespace Husky
bool _setInitFlag(bool flag) {return _isInited = flag;}
public:
explicit EpollServer(uint port, const IRequestHandler* pHandler): _reqHandler(pHandler), _host_socket(-1), _isShutDown(false), _epollSize(0)
{
assert(_reqHandler);
_setInitFlag(_init_epoll(port));
};
{
assert(_reqHandler);
_setInitFlag(_init_epoll(port));
};
~EpollServer(){};// unfinished;
public:
operator bool() const
@ -75,10 +76,9 @@ namespace Husky
//int clientSock;
sockaddr_in clientaddr;
socklen_t nSize = sizeof(clientaddr);
//char recvBuf[RECV_BUFFER_SIZE];
struct epoll_event events[MAXEPOLLSIZE];
int nfds, clientSock;
while(!_isShutDown)
{
if(-1 == (nfds = epoll_wait(_epoll_fd, events, _epollSize, -1)))
@ -88,7 +88,7 @@ namespace Husky
}
//LogDebug("epoll_wait return event sum[%d]", nfds);
for(int i = 0; i < nfds; i++)
{
if(events[i].data.fd == _host_socket) /*new connect coming.*/
@ -106,7 +106,7 @@ namespace Husky
}
//LogInfo("connecting from: %d:%d client socket: %d\n", inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port), clientSock);
/* inet_ntoa is not thread-safe in some versions */
//_sockIpMap[clientSock] = inet_ntoa(clientaddr.sin_addr);
@ -119,7 +119,7 @@ namespace Husky
_closesocket(events[i].data.fd);
}
}
}
return true;
}
@ -190,26 +190,42 @@ namespace Husky
}
string strRec, strSnd, strRetByHandler;
strRec.resize(RECV_BUFFER_SIZE);
int nRetCode = recv(sockfd, (char*)strRec.c_str(), strRec.size(), 0);
if(-1 == nRetCode)
char recvBuf[RECV_BUFFER_SIZE];
int nRetCode = -1;
while(true)
{
LogDebug(strerror(errno));
return false;
}
if(0 == nRetCode)
{
LogDebug("client socket closed gracefully.");
return false;
memset(recvBuf, 0, sizeof(recvBuf));
nRetCode = recv(sockfd, recvBuf, sizeof(recvBuf) - 1, 0);
if(-1 == nRetCode)
{
LogDebug(strerror(errno));
return false;
}
if(0 == nRetCode)
{
LogDebug("client socket orderly shut down");
return false;
}
strRec += recvBuf;
if(nRetCode != sizeof(recvBuf) - 1)
{
break;
}
}
HttpReqInfo httpReq(strRec);
if(!_reqHandler->do_GET(httpReq, strRetByHandler))
if("GET" == httpReq.getMethod() && !_reqHandler->do_GET(httpReq, strRetByHandler))
{
LogError("do_GET failed.");
return false;
}
if("POST" == httpReq.getMethod() && !_reqHandler->do_POST(httpReq, strRetByHandler))
{
LogError("do_POST failed.");
return false;
}
string_format(strSnd, HTTP_FORMAT, CHARSET_UTF8, strRetByHandler.length(), strRetByHandler.c_str());
if(-1 == send(sockfd, strSnd.c_str(), strSnd.length(), 0))
{
LogError(strerror(errno));

View File

@ -135,7 +135,8 @@ namespace Husky
//message header end
//body begin
_body.assign(headerStr.substr(rpos));
trim(_body);
}
public:
string& operator[] (const string& key)
@ -150,14 +151,23 @@ namespace Husky
{
return _find(_methodGetMap, argKey, res);
}
bool POST(const string& argKey, string& res)const
//bool POST(const string& argKey, string& res)const
//{
// return _find(_methodPostMap, argKey, res);
//}
const string& getMethod() const
{
return _find(_methodPostMap, argKey, res);
return _headerMap.find(KEY_METHOD)->second;
}
// Returns a const reference to the stored request body (_body).
// The reference is to a member, so it is only valid while this object lives.
const string& getBody() const
{
return _body;
}
private:
std::unordered_map<string, string> _headerMap;
std::unordered_map<string, string> _methodGetMap;
std::unordered_map<string, string> _methodPostMap;
//std::unordered_map<string, string> _methodPostMap;
string _body;
//public:
friend ostream& operator<<(ostream& os, const HttpReqInfo& obj);
private:
@ -215,7 +225,7 @@ namespace Husky
inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj)
{
return os << obj._headerMap << obj._methodGetMap << obj._methodPostMap;
return os << obj._headerMap << obj._methodGetMap/* << obj._methodPostMap*/ << obj._body;
}
}

View File

@ -1,4 +0,0 @@
INSTALL(FILES ArgvContext.hpp InitOnOff.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp
cast_functs.hpp io_functs.hpp map_functs.hpp NonCopyable.hpp
Config.hpp logger.hpp md5.hpp std_outbound.hpp
DESTINATION include/CppJieba/Limonp)

View File

@ -24,7 +24,7 @@ namespace CppJieba
const TrieNodeInfo * pInfo;
double weight;
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0)
{}
};
typedef vector<SegmentChar> SegmentContext;
@ -59,18 +59,22 @@ namespace CppJieba
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
assert(_getInitFlag());
vector<TrieNodeInfo> segWordInfos;
if(!cut(begin, end, segWordInfos))
if(begin == end)
{
return false;
}
string tmp;
for(size_t i = 0; i < segWordInfos.size(); i++)
vector<Unicode> words;
if(!cut(begin, end, words))
{
if(TransCode::encode(segWordInfos[i].word, tmp))
return false;
}
string word;
for(size_t i = 0; i < words.size(); i++)
{
if(TransCode::encode(words[i], word))
{
res.push_back(tmp);
res.push_back(word);
}
else
{
@ -80,7 +84,7 @@ namespace CppJieba
return true;
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
{
if(!_getInitFlag())
{
@ -88,7 +92,6 @@ namespace CppJieba
return false;
}
SegmentContext segContext;
//calc DAG
if(!_calcDAG(begin, end, segContext))
{
@ -102,7 +105,7 @@ namespace CppJieba
return false;
}
if(!_cut(segContext, segWordInfos))
if(!_cut(segContext, res))
{
LogError("_cut failed.");
return false;
@ -114,21 +117,17 @@ namespace CppJieba
private:
bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, SegmentContext& segContext) const
{
if(begin >= end)
{
LogError("begin >= end.");
return false;
}
SegmentChar schar;
size_t offset;
for(Unicode::const_iterator it = begin; it != end; it++)
{
SegmentChar schar(*it);
size_t i = it - begin;
_trie.find(it, end, i, schar.dag);
//DagType::iterator dagIter;
if(schar.dag.end() == schar.dag.find(i))
schar.uniCh = *it;
offset = it - begin;
schar.dag.clear();
_trie.find(it, end, schar.dag, offset);
if(!isIn(schar.dag, offset))
{
schar.dag[i] = NULL;
schar.dag[offset] = NULL;
}
segContext.push_back(schar);
}
@ -142,15 +141,19 @@ namespace CppJieba
return false;
}
size_t nextPos;
const TrieNodeInfo* p;
double val;
for(int i = segContext.size() - 1; i >= 0; i--)
{
segContext[i].pInfo = NULL;
segContext[i].weight = MIN_DOUBLE;
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
{
size_t nextPos = it->first;
const TrieNodeInfo* p = it->second;
double val = 0.0;
nextPos = it->first;
p = it->second;
val = 0.0;
if(nextPos + 1 < segContext.size())
{
val += segContext[nextPos + 1].weight;
@ -174,7 +177,7 @@ namespace CppJieba
return true;
}
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
bool _cut(SegmentContext& segContext, vector<Unicode>& res)const
{
size_t i = 0;
while(i < segContext.size())
@ -182,16 +185,12 @@ namespace CppJieba
const TrieNodeInfo* p = segContext[i].pInfo;
if(p)
{
res.push_back(*p);
res.push_back(p->word);
i += p->word.size();
}
else//single chinese word
{
TrieNodeInfo nodeInfo;
nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.freq = 0;
nodeInfo.logFreq = _trie.getMinLogFreq();
res.push_back(nodeInfo);
res.push_back(Unicode(1, segContext[i].uniCh));
i++;
}
}

View File

@ -44,14 +44,8 @@ namespace CppJieba
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
assert(_getInitFlag());
if(begin >= end)
{
LogError("begin >= end");
return false;
}
vector<TrieNodeInfo> infos;
if(!_mpSeg.cut(begin, end, infos))
vector<Unicode> words;
if(!_mpSeg.cut(begin, end, words))
{
LogError("mpSeg cutDAG failed.");
return false;
@ -59,20 +53,20 @@ namespace CppJieba
vector<Unicode> hmmRes;
Unicode piece;
for (size_t i = 0, j = 0; i < infos.size(); i++)
for (size_t i = 0, j = 0; i < words.size(); i++)
{
//if mp get a word, it's ok, put it into result
if (1 != infos[i].word.size())
if (1 != words[i].size())
{
res.push_back(infos[i].word);
res.push_back(words[i]);
continue;
}
// if mp get a single one, collect it in sequence
j = i;
while (j < infos.size() && infos[j].word.size() == 1)
while (j < words.size() && words[j].size() == 1)
{
piece.push_back(infos[j].word[0]);
piece.push_back(words[j][0]);
j++;
}

View File

@ -22,12 +22,8 @@ namespace CppJieba
virtual bool cut(const string& str, vector<string>& res)const
{
assert(_getInitFlag());
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
Unicode unico;
res.clear();
#ifdef NO_FILTER
if(!TransCode::decode(str, unico))
{

View File

@ -15,7 +15,7 @@ namespace CppJieba
typedef std::vector<uint16_t> Unicode;
namespace TransCode
{
inline bool decode(const string& str, vector<uint16_t>& vec)
inline bool decode(const string& str, Unicode& vec)
{
#ifdef CPPJIEBA_GBK
return gbkTrans(str, vec);
@ -24,7 +24,7 @@ namespace CppJieba
#endif
}
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
{
#ifdef CPPJIEBA_GBK
return gbkTrans(begin, end, res);
@ -33,7 +33,7 @@ namespace CppJieba
#endif
}
inline bool encode(const vector<uint16_t>& uni, string& res)
inline bool encode(const Unicode& uni, string& res)
{
return encode(uni.begin(), uni.end(), res);
}

View File

@ -26,16 +26,13 @@ namespace CppJieba
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNodeInfo;
struct TrieNode
{
TrieNodeMap hmap;
bool isLeaf;
size_t nodeInfoPos;
TrieNode()
{
isLeaf = false;
nodeInfoPos = 0;
}
const TrieNodeInfo * ptTrieNodeInfo;
TrieNode(): ptTrieNodeInfo(NULL)
{}
};
struct TrieNodeInfo
@ -44,12 +41,6 @@ namespace CppJieba
size_t freq;
string tag;
double logFreq; //logFreq = log(freq/sum(freq));
TrieNodeInfo():freq(0),logFreq(0.0)
{}
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
{}
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
{}
};
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
@ -72,33 +63,32 @@ namespace CppJieba
public:
Trie()
{
_root = NULL;
_root = new TrieNode;
_freqSum = 0;
_minLogFreq = MAX_DOUBLE;
_setInitFlag(false);
}
Trie(const string& filePath)
{
Trie();
new (this) Trie();
_setInitFlag(init(filePath));
}
~Trie()
{
_deleteNode(_root);
}
private:
public:
bool init(const string& filePath)
{
assert(!_getInitFlag());
_root = new TrieNode;
assert(_root);
if(!_trieInsert(filePath))
{
LogError("_trieInsert failed.");
return false;
}
_countWeight();
_loadDict(filePath, _nodeInfos);
_createTrie(_nodeInfos, _root);
_freqSum = _calculateFreqSum(_nodeInfos);
assert(_freqSum);
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
return _setInitFlag(true);
}
@ -106,47 +96,22 @@ namespace CppJieba
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
TrieNodeMap::const_iterator citer;
TrieNode* p = _root;
const TrieNode* p = _root;
for(Unicode::const_iterator it = begin; it != end; it++)
{
uint16_t chUni = *it;
citer = p->hmap.find(chUni);
if(p-> hmap.end() == citer)
citer = p->hmap.find(*it);
if(p->hmap.end() == citer)
{
return NULL;
}
p = citer->second;
}
if(p->isLeaf)
{
return &(_nodeInfos[p->nodeInfoPos]);
}
return NULL;
return p->ptTrieNodeInfo;
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<size_t, const TrieNodeInfo*> >& res) const
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const
{
TrieNodeMap::const_iterator citer;
TrieNode* p = _root;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{
citer = p->hmap.find(*itr);
if(p->hmap.end() == citer)
{
break;
}
p = citer->second;
if(p->isLeaf)
{
res.push_back(make_pair(itr-begin, &_nodeInfos[p->nodeInfoPos]));
}
}
return !res.empty();
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const
{
TrieNode* p = _root;
const TrieNode* p = _root;
TrieNodeMap::const_iterator citer;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{
@ -156,9 +121,9 @@ namespace CppJieba
break;
}
p = citer->second;
if(p->isLeaf)
if(p->ptTrieNodeInfo)
{
res[itr - begin + offset] = &_nodeInfos[p->nodeInfoPos];
res[itr - begin + offset] = p->ptTrieNodeInfo;
}
}
return !res.empty();
@ -168,43 +133,44 @@ namespace CppJieba
double getMinLogFreq() const {return _minLogFreq;};
private:
void _insert(const TrieNodeInfo& nodeInfo, size_t nodeInfoPos)
void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const
{
const Unicode& unico = nodeInfo.word;
TrieNode* p = _root;
TrieNodeMap::const_iterator citer;
for(size_t i = 0; i < unico.size(); i++)
{
uint16_t cu = unico[i];
assert(p);
if(!isIn(p->hmap, cu))
assert(ptNode);
citer = ptNode->hmap.find(cu);
if(ptNode->hmap.end() == citer)
{
TrieNode * next = new TrieNode;
assert(next);
p->hmap[cu] = next;
p = next;
ptNode->hmap[cu] = next;
ptNode = next;
}
else
{
p = p->hmap[cu];
ptNode = citer->second;
}
}
p->isLeaf = true;
p->nodeInfoPos = nodeInfoPos;
ptNode->ptTrieNodeInfo = &nodeInfo;
}
private:
bool _trieInsert(const string& filePath)
void _loadDict(const string& filePath, vector<TrieNodeInfo>& nodeInfos) const
{
ifstream ifs(filePath.c_str());
if(!ifs)
{
LogError("open %s failed.", filePath.c_str());
return false;
LogFatal("open %s failed.", filePath.c_str());
exit(1);
}
string line;
vector<string> buf;
nodeInfos.clear();
TrieNodeInfo nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{
@ -213,43 +179,46 @@ namespace CppJieba
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
return false;
continue;
}
nodeInfo.freq = atoi(buf[1].c_str());
nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo);
nodeInfos.push_back(nodeInfo);
}
}
bool _createTrie(const vector<TrieNodeInfo>& nodeInfos, TrieNode * ptNode)
{
for(size_t i = 0; i < _nodeInfos.size(); i++)
{
_insert(_nodeInfos[i], i);
_insertNode(_nodeInfos[i], ptNode);
}
return true;
}
void _countWeight()
size_t _calculateFreqSum(const vector<TrieNodeInfo>& nodeInfos) const
{
//freq total freq
_freqSum = 0;
for(size_t i = 0; i < _nodeInfos.size(); i++)
size_t freqSum = 0;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
_freqSum += _nodeInfos[i].freq;
freqSum += nodeInfos[i].freq;
}
assert(_freqSum);
//normalize
for(size_t i = 0; i < _nodeInfos.size(); i++)
return freqSum;
}
double _calculateLogFreqAndGetMinValue(vector<TrieNodeInfo>& nodeInfos, size_t freqSum) const
{
assert(freqSum);
double minLogFreq = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
TrieNodeInfo& nodeInfo = _nodeInfos[i];
TrieNodeInfo& nodeInfo = nodeInfos[i];
assert(nodeInfo.freq);
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minLogFreq > nodeInfo.logFreq)
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
if(minLogFreq > nodeInfo.logFreq)
{
_minLogFreq = nodeInfo.logFreq;
minLogFreq = nodeInfo.logFreq;
}
}
return minLogFreq;
}
void _deleteNode(TrieNode* node)

View File

@ -26,6 +26,18 @@ class ReqHandler: public IRequestHandler
httpReq.GET("key", tmp);
URLDecode(tmp, sentence);
_segment.cut(sentence, words);
if(httpReq.GET("format", tmp) && tmp == "simple")
{
join(words.begin(), words.end(), strSnd, " ");
return true;
}
strSnd << words;
return true;
}
// Handles an HTTP POST request: passes the request body to _segment.cut()
// and writes the resulting word list into strSnd.
// Always returns true; the return value of cut() is not checked here.
virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const
{
vector<string> words;
// The whole body is treated as the sentence to cut (no URL decoding is applied here).
_segment.cut(httpReq.getBody(), words);
// Write the word list into the response string via operator<<.
strSnd << words;
return true;
}

View File

@ -2,6 +2,6 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
ADD_EXECUTABLE(segment.demo segment.cpp)
ADD_EXECUTABLE(keyword.demo keyword_demo.cpp)
ADD_EXECUTABLE(tagging_demo tagging_demo.cpp)
ADD_EXECUTABLE(tagging.demo tagging_demo.cpp)
ADD_EXECUTABLE(load_test load_test.cpp)
ADD_SUBDIRECTORY(unittest)

View File

@ -9,7 +9,7 @@
using namespace CppJieba;
void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
void cut(const ISegment * seg, const char * const filePath, size_t times = 30)
{
ifstream ifile(filePath);
if(!ifile)
@ -23,10 +23,12 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
loadFile2Str(filePath, doc);
for(uint i = 0; i < times; i ++)
{
LogInfo("times[%u]", i);
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
fflush(stdout);
res.clear();
seg->cut(doc, res);
}
printf("\n");
}
int main(int argc, char ** argv)
@ -40,6 +42,6 @@ int main(int argc, char ** argv)
long beginTime = clock();
cut(&seg, "../test/testdata/weicheng.utf8");
long endTime = clock();
printf("[%.3lf seconds]time consumeed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
printf("[%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}

View File

@ -80,7 +80,6 @@ TEST(MPSegmentTest, Test2)
res += line;
res += '\n';
words.clear();
segment.cut(line, words);
string s;
s << words;

View File

@ -5,8 +5,18 @@ using namespace CppJieba;
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
// Heap-allocates and deletes a Trie through both constructors
// (dictionary-path constructor and default constructor) to exercise
// construction and destruction; there are no assertions in this test.
TEST(TrieTest, NewAndDelete)
{
Trie * trie;
// Construct from a dictionary file, then destroy.
trie = new Trie(DICT_FILE);
delete trie;
// Default-construct (no dictionary loaded), then destroy.
trie = new Trie();
delete trie;
}
TEST(TrieTest, Test1)
{
string s1, s2;
Trie trie;
ASSERT_TRUE(trie.init(DICT_FILE));
@ -26,7 +36,7 @@ TEST(TrieTest, Test1)
word = "清华大学";
vector<pair<size_t, const TrieNodeInfo*> > res;
map<size_t, const TrieNodeInfo* > resMap;
map<size_t, const TrieNodeInfo* > map;
map<size_t, const TrieNodeInfo* > mp;
const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
{
@ -40,10 +50,8 @@ TEST(TrieTest, Test1)
vector<pair<size_t, const TrieNodeInfo*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni));
//print(uni);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec));
ASSERT_EQ(vec, res);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), 0, map));
ASSERT_EQ(map, resMap);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
ASSERT_EQ(mp, resMap);
// print(vec);
}