mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
merge dev
This commit is contained in:
commit
9a78765ecc
@ -22,4 +22,3 @@ ADD_SUBDIRECTORY(test)
|
||||
|
||||
ENABLE_TESTING()
|
||||
ADD_TEST(NAME test.run COMMAND test.run)
|
||||
ADD_TEST(NAME load_test COMMAND load_test)
|
||||
|
@ -2,12 +2,15 @@
|
||||
|
||||
1. 适配更低级版本的`g++`和`cmake`,已在`g++ 4.1.2`和`cmake 2.6`上测试通过。
|
||||
2. 修改一些测试用例的文件,减少测试时编译的时间。
|
||||
3. 修复`make install`相关的问题。
|
||||
4. 增加HTTP服务的POST请求接口。
|
||||
5. 修复`Trie`这个类潜在的bug,并完善单元测试。
|
||||
|
||||
## v2.3.4
|
||||
|
||||
1. 修改了设计上的问题,删除了`TrieManager`这个类,以避免造成一些可能的隐患。
|
||||
2. 增加`stop_words.utf8`词典,并修改`KeywordExtractor`的初始化函数用以使用此词典。
|
||||
3. 优化了Trie树相关部分代码结构。
|
||||
3. 优化了`Trie`树相关部分代码结构。
|
||||
|
||||
## v2.3.3
|
||||
|
||||
|
33
README.md
33
README.md
@ -1,8 +1,8 @@
|
||||
# CppJieba是"结巴"中文分词的C++版本
|
||||
|
||||
功能性的代码全写成hpp文件,此处的hpp文件是将cpp和h两个文件全都写在hpp文件里面(当然需要遵守相关约束)
|
||||
功能性的代码全写成hpp文件,文件依赖一直是很让人讨厌的东西,全做成hpp头文件形式的目的就是为了省去链接的依赖。
|
||||
|
||||
之所以全写成hpp文件,是因为这样在别的项目需要使用到中文分词功能的时候直接`#include "xx.hpp"`进来就可以使用,无需麻烦的链接。
|
||||
**没有依赖,就没有伤害。**
|
||||
|
||||
实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用,和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
|
||||
|
||||
@ -59,6 +59,35 @@ sudo /etc/init.d/cjserver stop
|
||||
|
||||
或者用命令 `curl "http://127.0.0.1:11200/?key=南京市长江大桥"` (ubuntu中的curl安装命令`sudo apt-get install curl`)
|
||||
|
||||
你可以看到返回的结果如下:(返回结果是json格式)
|
||||
|
||||
```
|
||||
["南京市", "长江大桥"]
|
||||
```
|
||||
|
||||
如果你使用如下调用方式:
|
||||
|
||||
```
|
||||
curl "http://127.0.0.1:11200/?key=南京市长江大桥&format=simple"
|
||||
```
|
||||
|
||||
则返回结果如下:(返回结果按空格隔开)
|
||||
|
||||
```
|
||||
南京市 长江大桥
|
||||
```
|
||||
|
||||
同时,也支持HTTP POST模式,使用如下调用:
|
||||
|
||||
```
|
||||
curl -d "南京市长江大桥" "http://127.0.0.1:11200/"
|
||||
```
|
||||
|
||||
返回结果如下:
|
||||
|
||||
```
|
||||
["南京市", "长江大桥"]
|
||||
```
|
||||
|
||||
### 卸载
|
||||
```sh
|
||||
|
@ -9,7 +9,4 @@ TARGET_LINK_LIBRARIES(cjserver pthread)
|
||||
|
||||
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
|
||||
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
|
||||
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
|
||||
|
||||
ADD_SUBDIRECTORY(Husky)
|
||||
ADD_SUBDIRECTORY(Limonp)
|
||||
|
@ -48,7 +48,7 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
//result of searching in trie tree
|
||||
vector<pair<size_t, const TrieNodeInfo*> > tRes;
|
||||
DagType tRes;
|
||||
|
||||
//max index of res's words
|
||||
int maxIdx = 0;
|
||||
@ -61,9 +61,10 @@ namespace CppJieba
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
||||
{
|
||||
//find word start from uItr
|
||||
if (_trie.find(uItr, end, tRes))
|
||||
if (_trie.find(uItr, end, tRes, 0))
|
||||
{
|
||||
for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
//for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
wordLen = itr->second->word.size();
|
||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
||||
|
@ -131,15 +131,11 @@ namespace CppJieba
|
||||
size_t Y = STATUS_SUM;
|
||||
size_t X = end - begin;
|
||||
size_t XYSize = X * Y;
|
||||
int * path;
|
||||
double * weight;
|
||||
size_t now, old, stat;
|
||||
double tmp, endE, endS;
|
||||
|
||||
path = new int [XYSize];
|
||||
assert(path);
|
||||
weight = new double [XYSize];
|
||||
assert(weight);
|
||||
vector<int> path(XYSize);
|
||||
vector<double> weight(XYSize);
|
||||
|
||||
//start
|
||||
for(size_t y = 0; y < Y; y++)
|
||||
@ -147,8 +143,10 @@ namespace CppJieba
|
||||
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
||||
path[0 + y * X] = -1;
|
||||
}
|
||||
//process
|
||||
//for(; begin != end; begin++)
|
||||
|
||||
|
||||
double emitProb;
|
||||
|
||||
for(size_t x = 1; x < X; x++)
|
||||
{
|
||||
for(size_t y = 0; y < Y; y++)
|
||||
@ -156,10 +154,11 @@ namespace CppJieba
|
||||
now = x + y*X;
|
||||
weight[now] = MIN_DOUBLE;
|
||||
path[now] = E; // warning
|
||||
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||
for(size_t preY = 0; preY < Y; preY++)
|
||||
{
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||
tmp = weight[old] + _transProb[preY][y] + emitProb;
|
||||
if(tmp > weight[now])
|
||||
{
|
||||
weight[now] = tmp;
|
||||
@ -188,8 +187,6 @@ namespace CppJieba
|
||||
stat = path[x + stat*X];
|
||||
}
|
||||
|
||||
delete [] path;
|
||||
delete [] weight;
|
||||
return true;
|
||||
}
|
||||
bool _loadModel(const char* const filePath)
|
||||
|
@ -1,5 +0,0 @@
|
||||
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
|
||||
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
||||
|
||||
FILE(GLOB SRCS *.hpp)
|
||||
INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky)
|
@ -37,13 +37,14 @@ namespace Husky
|
||||
virtual ~IRequestHandler(){};
|
||||
public:
|
||||
virtual bool do_GET(const HttpReqInfo& httpReq, string& res) const = 0;
|
||||
virtual bool do_POST(const HttpReqInfo& httpReq, string& res) const = 0;
|
||||
};
|
||||
|
||||
class EpollServer
|
||||
{
|
||||
private:
|
||||
static const size_t LISTEN_QUEUE_LEN = 1024;
|
||||
static const size_t RECV_BUFFER_SIZE = 1024 * 8;
|
||||
static const size_t RECV_BUFFER_SIZE = 1024*4;
|
||||
static const int MAXEPOLLSIZE = 512;
|
||||
|
||||
private:
|
||||
@ -59,10 +60,10 @@ namespace Husky
|
||||
bool _setInitFlag(bool flag) {return _isInited = flag;}
|
||||
public:
|
||||
explicit EpollServer(uint port, const IRequestHandler* pHandler): _reqHandler(pHandler), _host_socket(-1), _isShutDown(false), _epollSize(0)
|
||||
{
|
||||
assert(_reqHandler);
|
||||
_setInitFlag(_init_epoll(port));
|
||||
};
|
||||
{
|
||||
assert(_reqHandler);
|
||||
_setInitFlag(_init_epoll(port));
|
||||
};
|
||||
~EpollServer(){};// unfinished;
|
||||
public:
|
||||
operator bool() const
|
||||
@ -75,7 +76,6 @@ namespace Husky
|
||||
//int clientSock;
|
||||
sockaddr_in clientaddr;
|
||||
socklen_t nSize = sizeof(clientaddr);
|
||||
//char recvBuf[RECV_BUFFER_SIZE];
|
||||
struct epoll_event events[MAXEPOLLSIZE];
|
||||
int nfds, clientSock;
|
||||
|
||||
@ -190,26 +190,42 @@ namespace Husky
|
||||
}
|
||||
|
||||
string strRec, strSnd, strRetByHandler;
|
||||
strRec.resize(RECV_BUFFER_SIZE);
|
||||
int nRetCode = recv(sockfd, (char*)strRec.c_str(), strRec.size(), 0);
|
||||
if(-1 == nRetCode)
|
||||
char recvBuf[RECV_BUFFER_SIZE];
|
||||
int nRetCode = -1;
|
||||
while(true)
|
||||
{
|
||||
LogDebug(strerror(errno));
|
||||
return false;
|
||||
}
|
||||
if(0 == nRetCode)
|
||||
{
|
||||
LogDebug("client socket closed gracefully.");
|
||||
return false;
|
||||
memset(recvBuf, 0, sizeof(recvBuf));
|
||||
nRetCode = recv(sockfd, recvBuf, sizeof(recvBuf) - 1, 0);
|
||||
if(-1 == nRetCode)
|
||||
{
|
||||
LogDebug(strerror(errno));
|
||||
return false;
|
||||
}
|
||||
if(0 == nRetCode)
|
||||
{
|
||||
LogDebug("client socket orderly shut down");
|
||||
return false;
|
||||
}
|
||||
strRec += recvBuf;
|
||||
if(nRetCode != sizeof(recvBuf) - 1)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
HttpReqInfo httpReq(strRec);
|
||||
if(!_reqHandler->do_GET(httpReq, strRetByHandler))
|
||||
if("GET" == httpReq.getMethod() && !_reqHandler->do_GET(httpReq, strRetByHandler))
|
||||
{
|
||||
LogError("do_GET failed.");
|
||||
return false;
|
||||
}
|
||||
if("POST" == httpReq.getMethod() && !_reqHandler->do_POST(httpReq, strRetByHandler))
|
||||
{
|
||||
LogError("do_POST failed.");
|
||||
return false;
|
||||
}
|
||||
string_format(strSnd, HTTP_FORMAT, CHARSET_UTF8, strRetByHandler.length(), strRetByHandler.c_str());
|
||||
|
||||
if(-1 == send(sockfd, strSnd.c_str(), strSnd.length(), 0))
|
||||
{
|
||||
LogError(strerror(errno));
|
||||
|
@ -135,7 +135,8 @@ namespace Husky
|
||||
//message header end
|
||||
|
||||
//body begin
|
||||
|
||||
_body.assign(headerStr.substr(rpos));
|
||||
trim(_body);
|
||||
}
|
||||
public:
|
||||
string& operator[] (const string& key)
|
||||
@ -150,14 +151,23 @@ namespace Husky
|
||||
{
|
||||
return _find(_methodGetMap, argKey, res);
|
||||
}
|
||||
bool POST(const string& argKey, string& res)const
|
||||
//bool POST(const string& argKey, string& res)const
|
||||
//{
|
||||
// return _find(_methodPostMap, argKey, res);
|
||||
//}
|
||||
const string& getMethod() const
|
||||
{
|
||||
return _find(_methodPostMap, argKey, res);
|
||||
return _headerMap.find(KEY_METHOD)->second;
|
||||
}
|
||||
const string& getBody() const
|
||||
{
|
||||
return _body;
|
||||
}
|
||||
private:
|
||||
std::unordered_map<string, string> _headerMap;
|
||||
std::unordered_map<string, string> _methodGetMap;
|
||||
std::unordered_map<string, string> _methodPostMap;
|
||||
//std::unordered_map<string, string> _methodPostMap;
|
||||
string _body;
|
||||
//public:
|
||||
friend ostream& operator<<(ostream& os, const HttpReqInfo& obj);
|
||||
private:
|
||||
@ -215,7 +225,7 @@ namespace Husky
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const Husky::HttpReqInfo& obj)
|
||||
{
|
||||
return os << obj._headerMap << obj._methodGetMap << obj._methodPostMap;
|
||||
return os << obj._headerMap << obj._methodGetMap/* << obj._methodPostMap*/ << obj._body;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,4 +0,0 @@
|
||||
INSTALL(FILES ArgvContext.hpp InitOnOff.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp
|
||||
cast_functs.hpp io_functs.hpp map_functs.hpp NonCopyable.hpp
|
||||
Config.hpp logger.hpp md5.hpp std_outbound.hpp
|
||||
DESTINATION include/CppJieba/Limonp)
|
@ -24,7 +24,7 @@ namespace CppJieba
|
||||
const TrieNodeInfo * pInfo;
|
||||
double weight;
|
||||
|
||||
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
|
||||
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0)
|
||||
{}
|
||||
};
|
||||
typedef vector<SegmentChar> SegmentContext;
|
||||
@ -59,18 +59,22 @@ namespace CppJieba
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
|
||||
vector<TrieNodeInfo> segWordInfos;
|
||||
if(!cut(begin, end, segWordInfos))
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
string tmp;
|
||||
for(size_t i = 0; i < segWordInfos.size(); i++)
|
||||
|
||||
vector<Unicode> words;
|
||||
if(!cut(begin, end, words))
|
||||
{
|
||||
if(TransCode::encode(segWordInfos[i].word, tmp))
|
||||
return false;
|
||||
}
|
||||
string word;
|
||||
for(size_t i = 0; i < words.size(); i++)
|
||||
{
|
||||
if(TransCode::encode(words[i], word))
|
||||
{
|
||||
res.push_back(tmp);
|
||||
res.push_back(word);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -80,7 +84,7 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
|
||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
@ -88,7 +92,6 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
SegmentContext segContext;
|
||||
|
||||
//calc DAG
|
||||
if(!_calcDAG(begin, end, segContext))
|
||||
{
|
||||
@ -102,7 +105,7 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!_cut(segContext, segWordInfos))
|
||||
if(!_cut(segContext, res))
|
||||
{
|
||||
LogError("_cut failed.");
|
||||
return false;
|
||||
@ -114,21 +117,17 @@ namespace CppJieba
|
||||
private:
|
||||
bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, SegmentContext& segContext) const
|
||||
{
|
||||
if(begin >= end)
|
||||
{
|
||||
LogError("begin >= end.");
|
||||
return false;
|
||||
}
|
||||
|
||||
SegmentChar schar;
|
||||
size_t offset;
|
||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||
{
|
||||
SegmentChar schar(*it);
|
||||
size_t i = it - begin;
|
||||
_trie.find(it, end, i, schar.dag);
|
||||
//DagType::iterator dagIter;
|
||||
if(schar.dag.end() == schar.dag.find(i))
|
||||
schar.uniCh = *it;
|
||||
offset = it - begin;
|
||||
schar.dag.clear();
|
||||
_trie.find(it, end, schar.dag, offset);
|
||||
if(!isIn(schar.dag, offset))
|
||||
{
|
||||
schar.dag[i] = NULL;
|
||||
schar.dag[offset] = NULL;
|
||||
}
|
||||
segContext.push_back(schar);
|
||||
}
|
||||
@ -142,15 +141,19 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t nextPos;
|
||||
const TrieNodeInfo* p;
|
||||
double val;
|
||||
|
||||
for(int i = segContext.size() - 1; i >= 0; i--)
|
||||
{
|
||||
segContext[i].pInfo = NULL;
|
||||
segContext[i].weight = MIN_DOUBLE;
|
||||
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
|
||||
{
|
||||
size_t nextPos = it->first;
|
||||
const TrieNodeInfo* p = it->second;
|
||||
double val = 0.0;
|
||||
nextPos = it->first;
|
||||
p = it->second;
|
||||
val = 0.0;
|
||||
if(nextPos + 1 < segContext.size())
|
||||
{
|
||||
val += segContext[nextPos + 1].weight;
|
||||
@ -174,7 +177,7 @@ namespace CppJieba
|
||||
return true;
|
||||
|
||||
}
|
||||
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
|
||||
bool _cut(SegmentContext& segContext, vector<Unicode>& res)const
|
||||
{
|
||||
size_t i = 0;
|
||||
while(i < segContext.size())
|
||||
@ -182,16 +185,12 @@ namespace CppJieba
|
||||
const TrieNodeInfo* p = segContext[i].pInfo;
|
||||
if(p)
|
||||
{
|
||||
res.push_back(*p);
|
||||
res.push_back(p->word);
|
||||
i += p->word.size();
|
||||
}
|
||||
else//single chinese word
|
||||
{
|
||||
TrieNodeInfo nodeInfo;
|
||||
nodeInfo.word.push_back(segContext[i].uniCh);
|
||||
nodeInfo.freq = 0;
|
||||
nodeInfo.logFreq = _trie.getMinLogFreq();
|
||||
res.push_back(nodeInfo);
|
||||
res.push_back(Unicode(1, segContext[i].uniCh));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
@ -44,14 +44,8 @@ namespace CppJieba
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if(begin >= end)
|
||||
{
|
||||
LogError("begin >= end");
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<TrieNodeInfo> infos;
|
||||
if(!_mpSeg.cut(begin, end, infos))
|
||||
vector<Unicode> words;
|
||||
if(!_mpSeg.cut(begin, end, words))
|
||||
{
|
||||
LogError("mpSeg cutDAG failed.");
|
||||
return false;
|
||||
@ -59,20 +53,20 @@ namespace CppJieba
|
||||
|
||||
vector<Unicode> hmmRes;
|
||||
Unicode piece;
|
||||
for (size_t i = 0, j = 0; i < infos.size(); i++)
|
||||
for (size_t i = 0, j = 0; i < words.size(); i++)
|
||||
{
|
||||
//if mp gets a word (not a single character), it's ok; put it into the result
|
||||
if (1 != infos[i].word.size())
|
||||
if (1 != words[i].size())
|
||||
{
|
||||
res.push_back(infos[i].word);
|
||||
res.push_back(words[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp gets a single character, collect the run of consecutive single characters into one piece
|
||||
j = i;
|
||||
while (j < infos.size() && infos[j].word.size() == 1)
|
||||
while (j < words.size() && words[j].size() == 1)
|
||||
{
|
||||
piece.push_back(infos[j].word[0]);
|
||||
piece.push_back(words[j][0]);
|
||||
j++;
|
||||
}
|
||||
|
||||
|
@ -22,12 +22,8 @@ namespace CppJieba
|
||||
virtual bool cut(const string& str, vector<string>& res)const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
//if(!_getInitFlag())
|
||||
//{
|
||||
// LogError("not inited.");
|
||||
// return false;
|
||||
//}
|
||||
Unicode unico;
|
||||
res.clear();
|
||||
#ifdef NO_FILTER
|
||||
if(!TransCode::decode(str, unico))
|
||||
{
|
||||
|
@ -15,7 +15,7 @@ namespace CppJieba
|
||||
typedef std::vector<uint16_t> Unicode;
|
||||
namespace TransCode
|
||||
{
|
||||
inline bool decode(const string& str, vector<uint16_t>& vec)
|
||||
inline bool decode(const string& str, Unicode& vec)
|
||||
{
|
||||
#ifdef CPPJIEBA_GBK
|
||||
return gbkTrans(str, vec);
|
||||
@ -24,7 +24,7 @@ namespace CppJieba
|
||||
#endif
|
||||
}
|
||||
|
||||
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
|
||||
inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
||||
{
|
||||
#ifdef CPPJIEBA_GBK
|
||||
return gbkTrans(begin, end, res);
|
||||
@ -33,7 +33,7 @@ namespace CppJieba
|
||||
#endif
|
||||
}
|
||||
|
||||
inline bool encode(const vector<uint16_t>& uni, string& res)
|
||||
inline bool encode(const Unicode& uni, string& res)
|
||||
{
|
||||
return encode(uni.begin(), uni.end(), res);
|
||||
}
|
||||
|
147
src/Trie.hpp
147
src/Trie.hpp
@ -26,16 +26,13 @@ namespace CppJieba
|
||||
const double MAX_DOUBLE = 3.14e+100;
|
||||
const size_t DICT_COLUMN_NUM = 3;
|
||||
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
struct TrieNodeInfo;
|
||||
struct TrieNode
|
||||
{
|
||||
TrieNodeMap hmap;
|
||||
bool isLeaf;
|
||||
size_t nodeInfoPos;
|
||||
TrieNode()
|
||||
{
|
||||
isLeaf = false;
|
||||
nodeInfoPos = 0;
|
||||
}
|
||||
const TrieNodeInfo * ptTrieNodeInfo;
|
||||
TrieNode(): ptTrieNodeInfo(NULL)
|
||||
{}
|
||||
};
|
||||
|
||||
struct TrieNodeInfo
|
||||
@ -44,12 +41,6 @@ namespace CppJieba
|
||||
size_t freq;
|
||||
string tag;
|
||||
double logFreq; //logFreq = log(freq/sum(freq));
|
||||
TrieNodeInfo():freq(0),logFreq(0.0)
|
||||
{}
|
||||
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
||||
{}
|
||||
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
||||
{}
|
||||
};
|
||||
|
||||
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
|
||||
@ -72,33 +63,32 @@ namespace CppJieba
|
||||
public:
|
||||
Trie()
|
||||
{
|
||||
_root = NULL;
|
||||
_root = new TrieNode;
|
||||
_freqSum = 0;
|
||||
_minLogFreq = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
Trie(const string& filePath)
|
||||
{
|
||||
Trie();
|
||||
new (this) Trie();
|
||||
_setInitFlag(init(filePath));
|
||||
}
|
||||
~Trie()
|
||||
{
|
||||
_deleteNode(_root);
|
||||
}
|
||||
private:
|
||||
|
||||
|
||||
public:
|
||||
bool init(const string& filePath)
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
|
||||
_root = new TrieNode;
|
||||
assert(_root);
|
||||
if(!_trieInsert(filePath))
|
||||
{
|
||||
LogError("_trieInsert failed.");
|
||||
return false;
|
||||
}
|
||||
_countWeight();
|
||||
_loadDict(filePath, _nodeInfos);
|
||||
_createTrie(_nodeInfos, _root);
|
||||
_freqSum = _calculateFreqSum(_nodeInfos);
|
||||
assert(_freqSum);
|
||||
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
|
||||
@ -106,47 +96,22 @@ namespace CppJieba
|
||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
|
||||
{
|
||||
TrieNodeMap::const_iterator citer;
|
||||
TrieNode* p = _root;
|
||||
const TrieNode* p = _root;
|
||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||
{
|
||||
uint16_t chUni = *it;
|
||||
citer = p->hmap.find(chUni);
|
||||
if(p-> hmap.end() == citer)
|
||||
citer = p->hmap.find(*it);
|
||||
if(p->hmap.end() == citer)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
p = citer->second;
|
||||
}
|
||||
if(p->isLeaf)
|
||||
{
|
||||
return &(_nodeInfos[p->nodeInfoPos]);
|
||||
}
|
||||
return NULL;
|
||||
return p->ptTrieNodeInfo;
|
||||
}
|
||||
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<size_t, const TrieNodeInfo*> >& res) const
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const
|
||||
{
|
||||
TrieNodeMap::const_iterator citer;
|
||||
TrieNode* p = _root;
|
||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||
{
|
||||
citer = p->hmap.find(*itr);
|
||||
if(p->hmap.end() == citer)
|
||||
{
|
||||
break;
|
||||
}
|
||||
p = citer->second;
|
||||
if(p->isLeaf)
|
||||
{
|
||||
res.push_back(make_pair(itr-begin, &_nodeInfos[p->nodeInfoPos]));
|
||||
}
|
||||
}
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const
|
||||
{
|
||||
TrieNode* p = _root;
|
||||
const TrieNode* p = _root;
|
||||
TrieNodeMap::const_iterator citer;
|
||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||
{
|
||||
@ -156,9 +121,9 @@ namespace CppJieba
|
||||
break;
|
||||
}
|
||||
p = citer->second;
|
||||
if(p->isLeaf)
|
||||
if(p->ptTrieNodeInfo)
|
||||
{
|
||||
res[itr - begin + offset] = &_nodeInfos[p->nodeInfoPos];
|
||||
res[itr - begin + offset] = p->ptTrieNodeInfo;
|
||||
}
|
||||
}
|
||||
return !res.empty();
|
||||
@ -168,43 +133,44 @@ namespace CppJieba
|
||||
double getMinLogFreq() const {return _minLogFreq;};
|
||||
|
||||
private:
|
||||
void _insert(const TrieNodeInfo& nodeInfo, size_t nodeInfoPos)
|
||||
void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const
|
||||
{
|
||||
const Unicode& unico = nodeInfo.word;
|
||||
TrieNode* p = _root;
|
||||
TrieNodeMap::const_iterator citer;
|
||||
for(size_t i = 0; i < unico.size(); i++)
|
||||
{
|
||||
uint16_t cu = unico[i];
|
||||
assert(p);
|
||||
if(!isIn(p->hmap, cu))
|
||||
assert(ptNode);
|
||||
citer = ptNode->hmap.find(cu);
|
||||
if(ptNode->hmap.end() == citer)
|
||||
{
|
||||
TrieNode * next = new TrieNode;
|
||||
assert(next);
|
||||
p->hmap[cu] = next;
|
||||
p = next;
|
||||
ptNode->hmap[cu] = next;
|
||||
ptNode = next;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = p->hmap[cu];
|
||||
ptNode = citer->second;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p->isLeaf = true;
|
||||
p->nodeInfoPos = nodeInfoPos;
|
||||
ptNode->ptTrieNodeInfo = &nodeInfo;
|
||||
}
|
||||
|
||||
private:
|
||||
bool _trieInsert(const string& filePath)
|
||||
void _loadDict(const string& filePath, vector<TrieNodeInfo>& nodeInfos) const
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", filePath.c_str());
|
||||
return false;
|
||||
LogFatal("open %s failed.", filePath.c_str());
|
||||
exit(1);
|
||||
}
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
nodeInfos.clear();
|
||||
TrieNodeInfo nodeInfo;
|
||||
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
|
||||
{
|
||||
@ -213,43 +179,46 @@ namespace CppJieba
|
||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
||||
{
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
nodeInfo.freq = atoi(buf[1].c_str());
|
||||
nodeInfo.tag = buf[2];
|
||||
|
||||
_nodeInfos.push_back(nodeInfo);
|
||||
|
||||
nodeInfos.push_back(nodeInfo);
|
||||
}
|
||||
}
|
||||
bool _createTrie(const vector<TrieNodeInfo>& nodeInfos, TrieNode * ptNode)
|
||||
{
|
||||
for(size_t i = 0; i < _nodeInfos.size(); i++)
|
||||
{
|
||||
_insert(_nodeInfos[i], i);
|
||||
_insertNode(_nodeInfos[i], ptNode);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void _countWeight()
|
||||
size_t _calculateFreqSum(const vector<TrieNodeInfo>& nodeInfos) const
|
||||
{
|
||||
//freq total freq
|
||||
_freqSum = 0;
|
||||
for(size_t i = 0; i < _nodeInfos.size(); i++)
|
||||
size_t freqSum = 0;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
_freqSum += _nodeInfos[i].freq;
|
||||
freqSum += nodeInfos[i].freq;
|
||||
}
|
||||
|
||||
assert(_freqSum);
|
||||
|
||||
//normalize
|
||||
for(size_t i = 0; i < _nodeInfos.size(); i++)
|
||||
return freqSum;
|
||||
}
|
||||
double _calculateLogFreqAndGetMinValue(vector<TrieNodeInfo>& nodeInfos, size_t freqSum) const
|
||||
{
|
||||
assert(freqSum);
|
||||
double minLogFreq = MAX_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
TrieNodeInfo& nodeInfo = _nodeInfos[i];
|
||||
TrieNodeInfo& nodeInfo = nodeInfos[i];
|
||||
assert(nodeInfo.freq);
|
||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
|
||||
if(_minLogFreq > nodeInfo.logFreq)
|
||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
|
||||
if(minLogFreq > nodeInfo.logFreq)
|
||||
{
|
||||
_minLogFreq = nodeInfo.logFreq;
|
||||
minLogFreq = nodeInfo.logFreq;
|
||||
}
|
||||
}
|
||||
|
||||
return minLogFreq;
|
||||
}
|
||||
|
||||
void _deleteNode(TrieNode* node)
|
||||
|
@ -26,6 +26,18 @@ class ReqHandler: public IRequestHandler
|
||||
httpReq.GET("key", tmp);
|
||||
URLDecode(tmp, sentence);
|
||||
_segment.cut(sentence, words);
|
||||
if(httpReq.GET("format", tmp) && tmp == "simple")
|
||||
{
|
||||
join(words.begin(), words.end(), strSnd, " ");
|
||||
return true;
|
||||
}
|
||||
strSnd << words;
|
||||
return true;
|
||||
}
|
||||
virtual bool do_POST(const HttpReqInfo& httpReq, string& strSnd) const
|
||||
{
|
||||
vector<string> words;
|
||||
_segment.cut(httpReq.getBody(), words);
|
||||
strSnd << words;
|
||||
return true;
|
||||
}
|
||||
|
@ -2,6 +2,6 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
|
||||
|
||||
ADD_EXECUTABLE(segment.demo segment.cpp)
|
||||
ADD_EXECUTABLE(keyword.demo keyword_demo.cpp)
|
||||
ADD_EXECUTABLE(tagging_demo tagging_demo.cpp)
|
||||
ADD_EXECUTABLE(tagging.demo tagging_demo.cpp)
|
||||
ADD_EXECUTABLE(load_test load_test.cpp)
|
||||
ADD_SUBDIRECTORY(unittest)
|
||||
|
@ -9,7 +9,7 @@
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
|
||||
void cut(const ISegment * seg, const char * const filePath, size_t times = 30)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
if(!ifile)
|
||||
@ -23,10 +23,12 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
|
||||
loadFile2Str(filePath, doc);
|
||||
for(uint i = 0; i < times; i ++)
|
||||
{
|
||||
LogInfo("times[%u]", i);
|
||||
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
||||
fflush(stdout);
|
||||
res.clear();
|
||||
seg->cut(doc, res);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
@ -40,6 +42,6 @@ int main(int argc, char ** argv)
|
||||
long beginTime = clock();
|
||||
cut(&seg, "../test/testdata/weicheng.utf8");
|
||||
long endTime = clock();
|
||||
printf("[%.3lf seconds]time consumeed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
||||
printf("[%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@ -80,7 +80,6 @@ TEST(MPSegmentTest, Test2)
|
||||
res += line;
|
||||
res += '\n';
|
||||
|
||||
words.clear();
|
||||
segment.cut(line, words);
|
||||
string s;
|
||||
s << words;
|
||||
|
@ -5,8 +5,18 @@ using namespace CppJieba;
|
||||
|
||||
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
|
||||
|
||||
TEST(TrieTest, NewAndDelete)
|
||||
{
|
||||
Trie * trie;
|
||||
trie = new Trie(DICT_FILE);
|
||||
delete trie;
|
||||
trie = new Trie();
|
||||
delete trie;
|
||||
}
|
||||
|
||||
TEST(TrieTest, Test1)
|
||||
{
|
||||
|
||||
string s1, s2;
|
||||
Trie trie;
|
||||
ASSERT_TRUE(trie.init(DICT_FILE));
|
||||
@ -26,7 +36,7 @@ TEST(TrieTest, Test1)
|
||||
word = "清华大学";
|
||||
vector<pair<size_t, const TrieNodeInfo*> > res;
|
||||
map<size_t, const TrieNodeInfo* > resMap;
|
||||
map<size_t, const TrieNodeInfo* > map;
|
||||
map<size_t, const TrieNodeInfo* > mp;
|
||||
const char * words[] = {"清", "清华", "清华大学"};
|
||||
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
|
||||
{
|
||||
@ -40,10 +50,8 @@ TEST(TrieTest, Test1)
|
||||
vector<pair<size_t, const TrieNodeInfo*> > vec;
|
||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||
//print(uni);
|
||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec));
|
||||
ASSERT_EQ(vec, res);
|
||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), 0, map));
|
||||
ASSERT_EQ(map, resMap);
|
||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
|
||||
ASSERT_EQ(mp, resMap);
|
||||
// print(vec);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user