mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
merge upstream
This commit is contained in:
commit
680399efdc
1
.gitignore
vendored
1
.gitignore
vendored
@ -14,3 +14,4 @@ prior.gbk
|
||||
tmp
|
||||
t.*
|
||||
*.pid
|
||||
build
|
||||
|
@ -2,16 +2,22 @@ PROJECT(CPPJIEBA)
|
||||
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
|
||||
|
||||
SET(CMAKE_INSTALL_PREFIX /usr)
|
||||
ADD_DEFINITIONS(-std=c++0x -O3)
|
||||
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
|
||||
set (CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "default install path" FORCE )
|
||||
endif()
|
||||
ADD_DEFINITIONS(-std=c++0x -O3 -Wall)
|
||||
IF (DEFINED ENC)
|
||||
ADD_DEFINITIONS(-DCPPJIEBA_${ENC})
|
||||
ENDIF()
|
||||
#ADD_DEFINITIONS(-DNO_FILTER)
|
||||
ADD_SUBDIRECTORY(src)
|
||||
ADD_SUBDIRECTORY(dicts)
|
||||
ADD_SUBDIRECTORY(scripts)
|
||||
ADD_SUBDIRECTORY(dict)
|
||||
|
||||
# Add the script/ and conf/ subdirectories on every platform except Apple ones.
# CMake's if() has no `!` operator: `!${APPLE}` is evaluated as an (undefined)
# variable name and is therefore always false. `NOT APPLE` is the supported
# negation and also works when APPLE is not defined at all.
if (NOT APPLE)
ADD_SUBDIRECTORY(script)
ADD_SUBDIRECTORY(conf)
endif()
|
||||
|
||||
ADD_SUBDIRECTORY(test)
|
||||
|
||||
ENABLE_TESTING()
|
||||
|
@ -28,7 +28,7 @@ sudo make install
|
||||
#### 测试
|
||||
|
||||
```sh
|
||||
cd build && ./test/segment.demo
|
||||
make test
|
||||
```
|
||||
|
||||
### 启动服务
|
||||
@ -165,7 +165,7 @@ MPSegment.hpp
|
||||
|
||||
HMMSegment.hpp
|
||||
是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。
|
||||
HMM模型由dicts/下面的`hmm_model.utf8`提供。
|
||||
HMM模型由dict/下面的`hmm_model.utf8`提供。
|
||||
分词算法即viterbi算法。
|
||||
|
||||
FullSegment.hpp
|
||||
@ -173,7 +173,7 @@ FullSegment.hpp
|
||||
|
||||
#### TransCode模块
|
||||
|
||||
TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。
|
||||
TransCode.hpp 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。
|
||||
|
||||
### src/Husky
|
||||
|
||||
|
@ -10,10 +10,10 @@ thread_num=4
|
||||
daemonize=true
|
||||
|
||||
#dict path
|
||||
dict_path=/usr/share/CppJieba/dicts/jieba.dict.utf8
|
||||
dict_path=/usr/share/CppJieba/dict/jieba.dict.utf8
|
||||
|
||||
#model path
|
||||
model_path=/usr/share/CppJieba/dicts/hmm_model.utf8
|
||||
model_path=/usr/share/CppJieba/dict/hmm_model.utf8
|
||||
|
||||
#pid file
|
||||
pid_file=/var/run/CppJieba/cjserver.pid
|
||||
|
@ -1 +1 @@
|
||||
INSTALL(FILES hmm_model.utf8 jieba.dict.utf8 DESTINATION share/CppJieba/dicts)
|
||||
INSTALL(FILES hmm_model.utf8 jieba.dict.utf8 DESTINATION share/CppJieba/dict)
|
258826
dict/idf.utf8
Normal file
258826
dict/idf.utf8
Normal file
File diff suppressed because it is too large
Load Diff
5
script/cjseg.sh
Executable file
5
script/cjseg.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/bin/sh
# Segment the file given as the first argument with cjsegment, using the
# dictionary and HMM model installed under /usr/share/CppJieba/dict.
# usage: cjseg.sh <file>
if [ $# -lt 1 ]; then
echo "usage: $0 <file>"
exit 1
fi
# Quote the path so names containing spaces or glob characters reach
# cjsegment as a single, unmodified argument.
cjsegment --dictpath /usr/share/CppJieba/dict/jieba.dict.utf8 --modelpath /usr/share/CppJieba/dict/hmm_model.utf8 "$1"
|
@ -1,5 +0,0 @@
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "usage: $0 <file>"
|
||||
exit 1
|
||||
fi
|
||||
cjsegment --dictpath /usr/share/CppJieba/dicts/jieba.dict.utf8 --modelpath /usr/share/CppJieba/dicts/hmm_model.utf8 $1
|
@ -1,13 +1,15 @@
|
||||
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
|
||||
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
||||
|
||||
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
|
||||
|
||||
ADD_EXECUTABLE(cjsegment segment.cpp)
|
||||
ADD_EXECUTABLE(cjserver server.cpp)
|
||||
TARGET_LINK_LIBRARIES(cjserver pthread)
|
||||
|
||||
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
|
||||
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
|
||||
INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp Trie.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
|
||||
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
|
||||
|
||||
ADD_SUBDIRECTORY(Husky)
|
||||
ADD_SUBDIRECTORY(Limonp)
|
||||
|
@ -1,49 +0,0 @@
|
||||
#ifndef CPPJIEBA_CHINESEFILTER_H
|
||||
#define CPPJIEBA_CHINESEFILTER_H
|
||||
|
||||
#include "TransCode.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
/*
|
||||
* if char is ascii, count the ascii string's length and return 0;
|
||||
* else count the nonascii string's length and return 1;
|
||||
* if errors, return -1;
|
||||
* */
|
||||
inline int filterAscii(const char* str, uint len, uint& resLen)
|
||||
{
|
||||
if(!str || !len)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
char x = 0x80;
|
||||
int resFlag = (str[0] & x ? 1 : 0);
|
||||
resLen = 0;
|
||||
if(!resFlag)
|
||||
{
|
||||
while(resLen < len && !(str[resLen] & x))
|
||||
{
|
||||
resLen ++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while(resLen < len && (str[resLen] & x))
|
||||
{
|
||||
#ifdef CPPJIEBA_GBK
|
||||
resLen += 2;
|
||||
#else
|
||||
resLen ++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if(resLen > len)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
return resFlag;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@ -71,7 +71,7 @@ namespace CppJieba
|
||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
wordLen = itr->second->word.size();
|
||||
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
|
||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
||||
{
|
||||
res.push_back(itr->second->word);
|
||||
}
|
||||
|
@ -46,6 +46,7 @@ namespace CppJieba
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("inited already.");
|
||||
return false;
|
||||
}
|
||||
memset(_startProb, 0, sizeof(_startProb));
|
||||
@ -58,7 +59,13 @@ namespace CppJieba
|
||||
_emitProbVec.push_back(&_emitProbE);
|
||||
_emitProbVec.push_back(&_emitProbM);
|
||||
_emitProbVec.push_back(&_emitProbS);
|
||||
return _setInitFlag(_loadModel(filePath.c_str()));
|
||||
if(!_setInitFlag(_loadModel(filePath.c_str())))
|
||||
{
|
||||
LogError("_loadModel(%s) failed.", filePath.c_str());
|
||||
return false;
|
||||
}
|
||||
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
using SegmentBase::cut;
|
||||
@ -198,7 +205,7 @@ namespace CppJieba
|
||||
}
|
||||
bool _loadModel(const char* const filePath)
|
||||
{
|
||||
LogInfo("loadModel [%s] start ...", filePath);
|
||||
LogDebug("loadModel [%s] start ...", filePath);
|
||||
ifstream ifile(filePath);
|
||||
string line;
|
||||
vector<string> tmp;
|
||||
@ -208,7 +215,7 @@ namespace CppJieba
|
||||
{
|
||||
return false;
|
||||
}
|
||||
splitStr(line, tmp, " ");
|
||||
split(line, tmp, " ");
|
||||
if(tmp.size() != STATUS_SUM)
|
||||
{
|
||||
LogError("start_p illegal");
|
||||
@ -227,7 +234,7 @@ namespace CppJieba
|
||||
{
|
||||
return false;
|
||||
}
|
||||
splitStr(line, tmp, " ");
|
||||
split(line, tmp, " ");
|
||||
if(tmp.size() != STATUS_SUM)
|
||||
{
|
||||
LogError("trans_p illegal");
|
||||
@ -264,7 +271,7 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
LogInfo("loadModel [%s] end.", filePath);
|
||||
LogDebug("loadModel [%s] end.", filePath);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -277,7 +284,7 @@ namespace CppJieba
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if(strStartsWith(line, "#"))
|
||||
if(startsWith(line, "#"))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@ -293,10 +300,10 @@ namespace CppJieba
|
||||
}
|
||||
vector<string> tmp, tmp2;
|
||||
uint16_t unico = 0;
|
||||
splitStr(line, tmp, ",");
|
||||
split(line, tmp, ",");
|
||||
for(uint i = 0; i < tmp.size(); i++)
|
||||
{
|
||||
splitStr(tmp[i], tmp2, ":");
|
||||
split(tmp[i], tmp2, ":");
|
||||
if(2 != tmp2.size())
|
||||
{
|
||||
LogError("_emitProb illegal.");
|
||||
|
@ -1,4 +1,5 @@
|
||||
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
|
||||
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
||||
|
||||
INSTALL(FILES HttpReqInfo.hpp ServerFrame.hpp ThreadManager.hpp DESTINATION include/CppJieba/Husky)
|
||||
FILE(GLOB SRCS *.hpp)
|
||||
INSTALL(FILES ${SRCS} DESTINATION include/CppJieba/Husky)
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "../Limonp/logger.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
|
||||
namespace Husky
|
||||
{
|
||||
@ -88,7 +88,7 @@ namespace Husky
|
||||
}
|
||||
string firstline(headerStr, lpos, rpos - lpos);
|
||||
trim(firstline);
|
||||
if(!splitStr(firstline, buf, " ") || 3 != buf.size())
|
||||
if(!split(firstline, buf, " ") || 3 != buf.size())
|
||||
{
|
||||
LogFatal("parse header first line failed.");
|
||||
return false;
|
||||
|
@ -29,7 +29,7 @@ namespace Husky
|
||||
using namespace Limonp;
|
||||
typedef int SOCKET;
|
||||
const struct timeval SOCKET_TIMEOUT = {2, 0};
|
||||
const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: FrameServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
|
||||
const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: HuskyServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
|
||||
const char* const RESPONSE_CHARSET_UTF8 = "UTF-8";
|
||||
const char* const RESPONSE_CHARSET_GB2312 = "GB2312";
|
||||
const char* const CLIENT_IP_K = "CLIENT_IP";
|
||||
@ -53,13 +53,13 @@ namespace Husky
|
||||
bool * pShutdown;
|
||||
};
|
||||
|
||||
class ServerFrame//: public IWorkHandler
|
||||
class HuskyServer
|
||||
{
|
||||
private:
|
||||
pthread_mutex_t m_pmAccept;
|
||||
bool m_bShutdown;
|
||||
public:
|
||||
ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
|
||||
explicit HuskyServer(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
|
||||
{
|
||||
m_bShutdown = false;
|
||||
m_nLsnPort = nPort;
|
||||
@ -68,7 +68,7 @@ namespace Husky
|
||||
assert(pHandler);
|
||||
pthread_mutex_init(&m_pmAccept,NULL);
|
||||
};
|
||||
virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);};
|
||||
virtual ~HuskyServer(){pthread_mutex_destroy(&m_pmAccept);};
|
||||
virtual bool init()
|
||||
{
|
||||
|
||||
@ -292,8 +292,6 @@ namespace Husky
|
||||
u_short m_nThreadCount;
|
||||
SOCKET m_lsnSock;
|
||||
IRequestHandler *m_pHandler;
|
||||
//static bool m_bShutdown;
|
||||
//static pthread_mutex_t m_pmAccept;
|
||||
|
||||
};
|
||||
|
@ -5,8 +5,6 @@
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
#define INFINITE 0
|
||||
|
||||
namespace Husky
|
||||
{
|
||||
using namespace std;
|
||||
@ -14,48 +12,44 @@ namespace Husky
|
||||
class ThreadManager
|
||||
{
|
||||
private:
|
||||
typedef int HANDLE;
|
||||
typedef int DWORD;
|
||||
typedef pthread_t HANDLE;
|
||||
typedef void *(* PThreadFunc)(void* param);
|
||||
public:
|
||||
ThreadManager(){;}
|
||||
~ThreadManager(){}
|
||||
|
||||
unsigned int HandleCount(){return m_vecHandle.size();}
|
||||
size_t HandleCount(){return _handles.size();}
|
||||
|
||||
void clear()
|
||||
{
|
||||
m_vecHandle.clear();
|
||||
_handles.clear();
|
||||
}
|
||||
|
||||
HANDLE CreateThread( PThreadFunc pFunc,void *pPara)
|
||||
int CreateThread( PThreadFunc pFunc,void *pPara)
|
||||
{
|
||||
pthread_t pt;
|
||||
int nErrorCode=pthread_create(&pt,NULL,pFunc,pPara);
|
||||
if(nErrorCode!=0)
|
||||
int nErrorCode = pthread_create(&pt,NULL,pFunc,pPara);
|
||||
if(nErrorCode != 0)
|
||||
return nErrorCode;
|
||||
m_vecHandle.push_back(pt); //加入线程列表 为WaitForMultipleObjects准备
|
||||
_handles.push_back(pt);
|
||||
return nErrorCode;
|
||||
|
||||
}
|
||||
|
||||
//hThread (thread handler) : 为0时为默认最后一个加入管理器的线程句柄
|
||||
//dwMilliseconds等待时间 : 单位毫秒,默认值无穷时间
|
||||
//return value : -1句柄无效,其他值 WaitForSingleObject函数的返回值
|
||||
DWORD Wait(HANDLE hThread=0,DWORD dwMilliseconds=INFINITE )
|
||||
int Wait(HANDLE hThread = 0)
|
||||
{
|
||||
if( hThread==0)//最后一个加入的线程
|
||||
if( hThread == 0)//the last handle
|
||||
{
|
||||
if(!m_vecHandle.empty())
|
||||
if(!_handles.empty())
|
||||
{
|
||||
return pthread_join(m_vecHandle.back(),NULL);
|
||||
return pthread_join(_handles.back(),NULL);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (find(m_vecHandle.begin(),m_vecHandle.end(),hThread)==m_vecHandle.end())//不存在此句柄
|
||||
if (find(_handles.begin(),_handles.end(),hThread) == _handles.end())
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
@ -65,31 +59,26 @@ namespace Husky
|
||||
|
||||
}
|
||||
|
||||
|
||||
//等待所有线程执行完毕
|
||||
//bWaitAll是否所有线程 : 默认值1等待所有线程,0有任何线程结束,此函数返回
|
||||
//dwMilliseconds : 单位毫秒,默认值无穷时间
|
||||
//return value : -1没有任何句柄,其他值 WaitForMultipleObjects函数的返回值
|
||||
DWORD WaitMultipleThread( bool bWaitAll=1,DWORD dwMilliseconds=INFINITE)
|
||||
int WaitMultipleThread()
|
||||
{
|
||||
if (m_vecHandle.empty())
|
||||
if (_handles.empty())
|
||||
return -1;
|
||||
int nErrorcode;
|
||||
for (uint i=0;i<m_vecHandle.size();++i)
|
||||
for (uint i = 0; i < _handles.size(); i++)
|
||||
{
|
||||
nErrorcode=pthread_join(m_vecHandle[i], NULL);
|
||||
if (nErrorcode!=0)
|
||||
nErrorcode = pthread_join(_handles[i], NULL);
|
||||
if (nErrorcode != 0)
|
||||
return nErrorcode;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<pthread_t> m_vecHandle;
|
||||
vector<pthread_t> _handles;
|
||||
|
||||
private:
|
||||
ThreadManager(const ThreadManager&){;}// copy forbidden
|
||||
void operator=(const ThreadManager &){}// copy forbidden
|
||||
void operator = (const ThreadManager &){}// copy forbidden
|
||||
};
|
||||
}
|
||||
|
||||
|
129
src/KeywordExtractor.hpp
Normal file
129
src/KeywordExtractor.hpp
Normal file
@ -0,0 +1,129 @@
|
||||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
|
||||
#include "MPSegment.hpp"
|
||||
#include <cmath>
|
||||
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
//struct KeyWordInfo
|
||||
//{
|
||||
// string word;
|
||||
// double tfidf;
|
||||
//};
|
||||
|
||||
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
||||
//{
|
||||
// return os << keyword.word << "," << keyword.idf;
|
||||
//}
|
||||
|
||||
class KeywordExtractor
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
private:
|
||||
unordered_map<string, double> _idfMap;
|
||||
protected:
|
||||
bool _isInited;
|
||||
bool _getInitFlag()const{return _isInited;};
|
||||
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||
public:
|
||||
operator bool(){return _getInitFlag();};
|
||||
public:
|
||||
KeywordExtractor(){_setInitFlag(false);};
|
||||
explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
|
||||
~KeywordExtractor(){};
|
||||
public:
|
||||
bool init(const string& dictPath, const string& idfPath)
|
||||
{
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", idfPath.c_str());
|
||||
return false;
|
||||
}
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
for(uint lineno = 0; getline(ifs, line); lineno++)
|
||||
{
|
||||
buf.clear();
|
||||
if(line.empty())
|
||||
{
|
||||
LogError("line[%d] empty. skipped.", lineno);
|
||||
continue;
|
||||
}
|
||||
if(!split(line, buf, " ") || buf.size() != 2)
|
||||
{
|
||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
_idfMap[buf[0]] = atof(buf[1].c_str());
|
||||
}
|
||||
return _setInitFlag(_segment.init(dictPath));
|
||||
};
|
||||
public:
|
||||
|
||||
bool extract(const string& str, vector<string>& keywords, uint topN) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<pair<string, double> > topWords;
|
||||
if(!extract(str, topWords, topN))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
for(uint i = 0; i < topWords.size(); i++)
|
||||
{
|
||||
keywords.push_back(topWords[i].first);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
|
||||
{
|
||||
vector<string> words;
|
||||
if(!_segment.cut(str, words))
|
||||
{
|
||||
LogError("segment cut(%s) failed.", str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
unordered_map<string, double> wordmap;
|
||||
for(uint i = 0; i < words.size(); i ++)
|
||||
{
|
||||
wordmap[ words[i] ] += 1.0;
|
||||
}
|
||||
|
||||
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
|
||||
{
|
||||
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
||||
if(cit != _idfMap.end())
|
||||
{
|
||||
itr->second *= cit->second;
|
||||
itr ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
itr = wordmap.erase(itr);
|
||||
}
|
||||
}
|
||||
|
||||
keywords.resize(MIN(topN, wordmap.size()));
|
||||
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||
{
|
||||
return lhs.second > rhs.second;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -22,9 +22,9 @@ namespace Limonp
|
||||
|
||||
for(int i = 0; i < argc; i++)
|
||||
{
|
||||
if(strStartsWith(argv[i], "-"))
|
||||
if(startsWith(argv[i], "-"))
|
||||
{
|
||||
if(i + 1 < argc && !strStartsWith(argv[i + 1], "-"))
|
||||
if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
|
||||
{
|
||||
_mpss[argv[i]] = argv[i+1];
|
||||
i++;
|
||||
|
@ -18,7 +18,17 @@ namespace Limonp
|
||||
class Config
|
||||
{
|
||||
public:
|
||||
bool loadFile(const char * const filePath)
|
||||
Config(const char * const filePath)
|
||||
{
|
||||
_loadFile(filePath);
|
||||
}
|
||||
public:
|
||||
operator bool ()
|
||||
{
|
||||
return !_map.empty();
|
||||
}
|
||||
private:
|
||||
bool _loadFile(const char * const filePath)
|
||||
{
|
||||
ifstream ifs(filePath);
|
||||
if(!ifs)
|
||||
@ -33,12 +43,12 @@ namespace Limonp
|
||||
{
|
||||
lineno ++;
|
||||
trim(line);
|
||||
if(line.empty() || strStartsWith(line, "#"))
|
||||
if(line.empty() || startsWith(line, "#"))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
vecBuf.clear();
|
||||
if(!splitStr(line, vecBuf, "=") || 2 != vecBuf.size())
|
||||
if(!split(line, vecBuf, "=") || 2 != vecBuf.size())
|
||||
{
|
||||
LogFatal("line[%d:%s] is illegal.", lineno, line.c_str());
|
||||
return false;
|
||||
@ -57,6 +67,7 @@ namespace Limonp
|
||||
ifs.close();
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
bool get(const string& key, string& value) const
|
||||
{
|
||||
map<string, string>::const_iterator it = _map.find(key);
|
||||
@ -73,7 +84,7 @@ namespace Limonp
|
||||
friend ostream& operator << (ostream& os, const Config& config);
|
||||
};
|
||||
|
||||
ostream& operator << (ostream& os, const Config& config)
|
||||
inline ostream& operator << (ostream& os, const Config& config)
|
||||
{
|
||||
return os << config._map;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <cassert>
|
||||
#include "io_functs.hpp"
|
||||
#include "str_functs.hpp"
|
||||
|
||||
@ -23,6 +24,7 @@
|
||||
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
|
||||
|
||||
|
||||
|
||||
namespace Limonp
|
||||
{
|
||||
using namespace std;
|
||||
@ -36,16 +38,11 @@ namespace Limonp
|
||||
public:
|
||||
static bool Logging(uint level, const string& msg, const char* fileName, int lineNo)
|
||||
{
|
||||
if(level > LL_FATAL)
|
||||
{
|
||||
cerr<<"level's value is out of range"<<endl;
|
||||
return false;
|
||||
}
|
||||
assert(level <= LL_FATAL);
|
||||
char buf[CSTR_BUFFER_SIZE];
|
||||
time_t timeNow;
|
||||
time(&timeNow);
|
||||
size_t ret = strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow));
|
||||
if(0 == ret)
|
||||
if(!strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)))
|
||||
{
|
||||
fprintf(stderr, "stftime failed.\n");
|
||||
return false;
|
||||
@ -55,6 +52,9 @@ namespace Limonp
|
||||
}
|
||||
static bool LoggingF(uint level, const char* fileName, int lineNo, const string& fmt, ...)
|
||||
{
|
||||
#ifdef LOGGER_LEVEL
|
||||
if(level < LOGGER_LEVEL) return true;
|
||||
#endif
|
||||
int size = 256;
|
||||
string msg;
|
||||
va_list ap;
|
||||
|
@ -34,7 +34,7 @@
|
||||
namespace Limonp
|
||||
{
|
||||
|
||||
#pragma region MD5 defines
|
||||
//#pragma region MD5 defines
|
||||
// Constants for MD5Transform routine.
|
||||
#define S11 7
|
||||
#define S12 12
|
||||
@ -85,7 +85,7 @@ namespace Limonp
|
||||
(a) = ROTATE_LEFT ((a), (s)); \
|
||||
(a) += (b); \
|
||||
}
|
||||
#pragma endregion
|
||||
//#pragma endregion
|
||||
|
||||
|
||||
typedef unsigned char BYTE ;
|
||||
@ -115,7 +115,7 @@ private:
|
||||
unsigned char buffer[64]; /* input buffer */
|
||||
} context ;
|
||||
|
||||
#pragma region static helper functions
|
||||
//#pragma region static helper functions
|
||||
// The core of the MD5 algorithm is here.
|
||||
// MD5 basic transformation. Transforms state based on block.
|
||||
static void MD5Transform( UINT4 state[4], unsigned char block[64] )
|
||||
@ -229,7 +229,7 @@ private:
|
||||
output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
|
||||
(((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
|
||||
}
|
||||
#pragma endregion
|
||||
//#pragma endregion
|
||||
|
||||
|
||||
public:
|
||||
@ -354,7 +354,7 @@ public:
|
||||
}
|
||||
else
|
||||
{
|
||||
while( len = fread( buffer, 1, 1024, file ) )
|
||||
while( (len = fread( buffer, 1, 1024, file )) )
|
||||
Update( buffer, len ) ;
|
||||
Final();
|
||||
|
||||
|
@ -72,28 +72,6 @@ namespace Limonp
|
||||
}
|
||||
}
|
||||
|
||||
//inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
|
||||
//{
|
||||
// if(src.empty())
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
// for(uint i = 0; i < src.size() - 1; i++)
|
||||
// {
|
||||
// dest += src[i];
|
||||
// dest += connectorStr;
|
||||
// }
|
||||
// dest += src[src.size() - 1];
|
||||
// return true;
|
||||
//}
|
||||
|
||||
//inline string joinStr(const vector<string>& source, const string& connector)
|
||||
//{
|
||||
// string res;
|
||||
// joinStr(source, res, connector);
|
||||
// return res;
|
||||
//}
|
||||
|
||||
template<class T>
|
||||
void join(T begin, T end, string& res, const string& connector)
|
||||
{
|
||||
@ -122,7 +100,7 @@ namespace Limonp
|
||||
|
||||
|
||||
|
||||
inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
|
||||
inline bool split(const string& src, vector<string>& res, const string& pattern)
|
||||
{
|
||||
if(src.empty())
|
||||
{
|
||||
@ -181,20 +159,9 @@ namespace Limonp
|
||||
}
|
||||
|
||||
|
||||
inline uint16_t twocharToUint16(char high, char low)
|
||||
{
|
||||
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
|
||||
}
|
||||
|
||||
inline pair<char, char> uint16ToChar2(uint16_t in)
|
||||
{
|
||||
pair<char, char> res;
|
||||
res.first = (in>>8) & 0x00ff; //high
|
||||
res.second = (in) & 0x00ff; //low
|
||||
return res;
|
||||
}
|
||||
|
||||
inline bool strStartsWith(const string& str, const string& prefix)
|
||||
inline bool startsWith(const string& str, const string& prefix)
|
||||
{
|
||||
//return str.substr(0, prefix.size()) == prefix;
|
||||
if(prefix.length() > str.length())
|
||||
@ -204,7 +171,7 @@ namespace Limonp
|
||||
return 0 == str.compare(0, prefix.length(), prefix);
|
||||
}
|
||||
|
||||
inline bool strEndsWith(const string& str, const string& suffix)
|
||||
inline bool endsWith(const string& str, const string& suffix)
|
||||
{
|
||||
if(suffix.length() > str.length())
|
||||
{
|
||||
@ -218,13 +185,19 @@ namespace Limonp
|
||||
return str.find(ch) != string::npos;
|
||||
}
|
||||
|
||||
// Pack two bytes into one 16-bit value: `high` fills bits 15..8 and `low`
// fills bits 7..0. Each byte is masked to 8 bits so that a negative char
// cannot leak sign-extension bits into the result.
inline uint16_t twocharToUint16(char high, char low)
{
    const uint16_t upperByte = uint16_t(high) & 0x00ff;
    const uint16_t lowerByte = uint16_t(low) & 0x00ff;
    return (upperByte << 8) | lowerByte;
}
|
||||
|
||||
inline bool utf8ToUnicode(const char * const str, uint len, vector<uint16_t>& vec)
|
||||
{
|
||||
char ch1, ch2;
|
||||
if(!str)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
char ch1, ch2;
|
||||
uint16_t tmp;
|
||||
vec.clear();
|
||||
for(uint i = 0;i < len;)
|
||||
{
|
||||
@ -237,14 +210,16 @@ namespace Limonp
|
||||
{
|
||||
ch1 = (str[i] >> 2) & 0x07;
|
||||
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
|
||||
vec.push_back(twocharToUint16(ch1, ch2));
|
||||
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
}
|
||||
else if((unsigned char)str[i] <= 0xef && i + 2 < len)
|
||||
{
|
||||
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
|
||||
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
|
||||
vec.push_back(twocharToUint16(ch1, ch2));
|
||||
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 3;
|
||||
}
|
||||
else
|
||||
@ -310,7 +285,8 @@ namespace Limonp
|
||||
{
|
||||
if(i + 1 < len) //&& (str[i+1] & 0x80))
|
||||
{
|
||||
vec.push_back(twocharToUint16(str[i], str[i + 1]));
|
||||
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
}
|
||||
else
|
||||
@ -321,11 +297,20 @@ namespace Limonp
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool gbkTrans(const string& str, vector<uint16_t>& vec)
|
||||
{
|
||||
return gbkTrans(str.c_str(), str.size(), vec);
|
||||
}
|
||||
|
||||
//inline pair<char, char> uint16ToChar2(uint16_t in)
|
||||
//{
|
||||
// pair<char, char> res;
|
||||
// res.first = (in>>8) & 0x00ff; //high
|
||||
// res.second = (in) & 0x00ff; //low
|
||||
// return res;
|
||||
//}
|
||||
|
||||
inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
|
||||
{
|
||||
if(begin >= end)
|
||||
@ -333,18 +318,21 @@ namespace Limonp
|
||||
return false;
|
||||
}
|
||||
res.clear();
|
||||
pair<char, char> pa;
|
||||
//pair<char, char> pa;
|
||||
char first, second;
|
||||
while(begin != end)
|
||||
{
|
||||
pa = uint16ToChar2(*begin);
|
||||
if(pa.first & 0x80)
|
||||
//pa = uint16ToChar2(*begin);
|
||||
first = ((*begin)>>8) & 0x00ff;
|
||||
second = (*begin) & 0x00ff;
|
||||
if(first & 0x80)
|
||||
{
|
||||
res += pa.first;
|
||||
res += pa.second;
|
||||
res += first;
|
||||
res += second;
|
||||
}
|
||||
else
|
||||
{
|
||||
res += pa.second;
|
||||
res += second;
|
||||
}
|
||||
begin++;
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ namespace CppJieba
|
||||
|
||||
class MPSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
protected:
|
||||
Trie* _trie;
|
||||
|
||||
public:
|
||||
@ -56,6 +56,7 @@ namespace CppJieba
|
||||
LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
|
||||
return false;
|
||||
}
|
||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
public:
|
||||
|
@ -15,9 +15,9 @@ namespace CppJieba
|
||||
HMMSegment _hmmSeg;
|
||||
public:
|
||||
MixSegment(){_setInitFlag(false);};
|
||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
|
||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
|
||||
{
|
||||
_setInitFlag(_mpSeg && _hmmSeg);
|
||||
_setInitFlag(init(mpSegDict, hmmSegDict));
|
||||
}
|
||||
virtual ~MixSegment(){}
|
||||
public:
|
||||
@ -38,6 +38,7 @@ namespace CppJieba
|
||||
LogError("_hmmSeg init");
|
||||
return false;
|
||||
}
|
||||
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
public:
|
||||
|
@ -1,10 +1,11 @@
|
||||
#ifndef CPPJIEBA_SEGMENTBASE_H
|
||||
#define CPPJIEBA_SEGMENTBASE_H
|
||||
|
||||
#include "ISegment.hpp"
|
||||
#include "ChineseFilter.hpp"
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "ISegment.hpp"
|
||||
#include <cassert>
|
||||
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
@ -18,9 +19,10 @@ namespace CppJieba
|
||||
bool _isInited;
|
||||
bool _getInitFlag()const{return _isInited;};
|
||||
bool _setInitFlag(bool flag){return _isInited = flag;};
|
||||
|
||||
public:
|
||||
operator bool(){return _getInitFlag();};
|
||||
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
|
||||
virtual bool cut(const string& str, vector<string>& res)const
|
||||
{
|
||||
@ -74,6 +76,46 @@ namespace CppJieba
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
public:
|
||||
|
||||
/*
 * Measure the leading run of str[0, len) whose bytes all share the
 * high-bit state of str[0], and store the run length in resLen.
 * returns 0 when the run is ascii (high bit clear);
 * returns 1 when the run is non-ascii (high bit set);
 * returns -1 when str is null, when len is 0 (resLen is left untouched in
 * both cases), or when the run overshoots len, which can only happen with
 * the two-byte stepping used under CPPJIEBA_GBK.
 * */
static int filterAscii(const char* str, uint len, uint& resLen)
{
    if(str == 0 || len == 0)
    {
        return -1;
    }
#ifdef CPPJIEBA_GBK
    const uint nonAsciiStep = 2;
#else
    const uint nonAsciiStep = 1;
#endif
    const char highBit = 0x80;
    const bool startsNonAscii = (str[0] & highBit) != 0;
    const uint step = startsNonAscii ? nonAsciiStep : 1;
    uint pos = 0;
    // advance while the current byte is of the same kind as the first one.
    while(pos < len && ((str[pos] & highBit) != 0) == startsNonAscii)
    {
        pos += step;
    }
    resLen = pos;
    if(pos > len)
    {
        return -1;
    }
    return startsNonAscii ? 1 : 0;
}
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -1,32 +0,0 @@
|
||||
#ifndef CPPJIEBA_TFIDF_H
|
||||
#define CPPJIEBA_TFIDF_H
|
||||
|
||||
#include "MPSegment.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
class TfIdfKeyWord
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
public:
|
||||
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
|
||||
~TfIdfKeyWord(){};
|
||||
public:
|
||||
bool init(){return _segment.init();};
|
||||
bool dispose(){return _segment.dispose();};
|
||||
public:
|
||||
bool extract(const string& str, vector<string>& words, uint topN)
|
||||
{
|
||||
return _segment.cut(words);
|
||||
return true;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
35
src/Trie.hpp
35
src/Trie.hpp
@ -130,26 +130,13 @@ namespace CppJieba
|
||||
}
|
||||
bool loadDict(const char * const filePath)
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
LogError("not initted.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError("cann't find fiel[%s].",filePath);
|
||||
return false;
|
||||
}
|
||||
bool res = false;
|
||||
res = _trieInsert(filePath);
|
||||
if(!res)
|
||||
assert(_getInitFlag());
|
||||
if(!_trieInsert(filePath))
|
||||
{
|
||||
LogError("_trieInsert failed.");
|
||||
return false;
|
||||
}
|
||||
res = _countWeight();
|
||||
if(!res)
|
||||
if(!_countWeight())
|
||||
{
|
||||
LogError("_countWeight failed.");
|
||||
return false;
|
||||
@ -339,22 +326,30 @@ namespace CppJieba
|
||||
private:
|
||||
bool _trieInsert(const char * const filePath)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
ifstream ifs(filePath);
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", filePath);
|
||||
return false;
|
||||
}
|
||||
string line;
|
||||
vector<string> vecBuf;
|
||||
|
||||
TrieNodeInfo nodeInfo;
|
||||
while(getline(ifile, line))
|
||||
size_t lineno = 0;
|
||||
while(getline(ifs, line))
|
||||
{
|
||||
vecBuf.clear();
|
||||
splitStr(line, vecBuf, " ");
|
||||
lineno ++;
|
||||
split(line, vecBuf, " ");
|
||||
if(3 < vecBuf.size())
|
||||
{
|
||||
LogError("line[%s] illegal.", line.c_str());
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
return false;
|
||||
}
|
||||
if(!TransCode::decode(vecBuf[0], nodeInfo.word))
|
||||
{
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
return false;
|
||||
}
|
||||
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||
|
@ -23,15 +23,13 @@ namespace CppJieba
|
||||
LogError("error when getting md5 for file '%s'", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
LogInfo("md5 for file '%s': %s", dictpath, md5.c_str());
|
||||
|
||||
if (_tries.find(md5) != _tries.end())
|
||||
{
|
||||
LogInfo("find a exits trie for md5: '%s'", md5.c_str());
|
||||
return _tries[md5.c_str()];
|
||||
}
|
||||
|
||||
LogInfo("create a new trie for md5: '%s'", md5.c_str());
|
||||
//LogDebug("create a new trie for md5: '%s'", md5.c_str());
|
||||
Trie* trie = NULL;
|
||||
try
|
||||
{
|
||||
@ -54,15 +52,14 @@ namespace CppJieba
|
||||
return NULL;
|
||||
}
|
||||
|
||||
LogInfo("trie->loadDict(%s) start...", dictpath);
|
||||
if (!trie->loadDict(dictpath))
|
||||
{
|
||||
LogError("trie->loadDict(%s) failed...", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
LogInfo("trie->loadDict end...");
|
||||
|
||||
_tries[md5.c_str()] = trie;
|
||||
LogDebug("trie->loadDict(%s)", dictpath);
|
||||
return trie;
|
||||
}
|
||||
|
||||
|
@ -45,11 +45,11 @@ int main(int argc, char ** argv)
|
||||
<<"\t--modelpath\tsee example\n"
|
||||
<<"\t--maxlen\tspecify the granularity of cut used in cutQuery. \n\t\t\tIf not specified, the default is 3\n"
|
||||
<<"example:\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutFull\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --algorithm cutDAG\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --algorithm cutFull\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutHMM\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutMix\n"
|
||||
<<"\t"<<argv[0]<<" ../test/testdata/testlines.utf8 --dictpath ../dict/jieba.dict.utf8 --modelpath ../dict/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
|
||||
<<endl;
|
||||
|
||||
return EXIT_FAILURE;
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include "Limonp/Config.hpp"
|
||||
#include "Husky/ServerFrame.hpp"
|
||||
#include "Husky/HuskyServer.hpp"
|
||||
#include "MPSegment.hpp"
|
||||
#include "HMMSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
@ -38,8 +38,8 @@ bool run(int argc, char** argv)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
Config conf;
|
||||
if(!conf.loadFile(argv[1]))
|
||||
Config conf(argv[1]);
|
||||
if(!conf)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@ -90,7 +90,7 @@ bool run(int argc, char** argv)
|
||||
}
|
||||
|
||||
ReqHandler reqHandler(dictPath, modelPath);
|
||||
ServerFrame sf(port, threadNum, &reqHandler);
|
||||
HuskyServer sf(port, threadNum, &reqHandler);
|
||||
return sf.init() && sf.run();
|
||||
}
|
||||
|
||||
|
@ -22,28 +22,19 @@ void cut(const ISegment * seg, const char * const filePath, size_t times = 10)
|
||||
for(uint i = 0; i < times; i ++)
|
||||
{
|
||||
LogInfo("times[%u]", i);
|
||||
//ifile.seekg(0);
|
||||
//while(getline(ifile, line))
|
||||
//{
|
||||
// if(!line.empty())
|
||||
// {
|
||||
res.clear();
|
||||
seg->cut(doc, res);
|
||||
//print(res);
|
||||
//cout<<join(res.begin(), res.end(),"/")<<endl;
|
||||
// }
|
||||
//}
|
||||
res.clear();
|
||||
seg->cut(doc, res);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
{
|
||||
MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
|
||||
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
||||
if(!seg)
|
||||
{
|
||||
cout<<"seg init failed."<<endl;
|
||||
return false;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
cut(&seg, "../test/testdata/weicheng.utf8");
|
||||
}
|
||||
|
@ -23,8 +23,8 @@ void cut(const ISegment * seg, const char * const filePath)
|
||||
}
|
||||
|
||||
const char * const TEST_FILE = "../test/testdata/testlines.utf8";
|
||||
const char * const JIEBA_DICT_FILE = "../dicts/jieba.dict.utf8";
|
||||
const char * const HMM_DICT_FILE = "../dicts/hmm_model.utf8";
|
||||
const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
|
||||
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
|
@ -8,8 +8,8 @@
|
||||
using namespace Husky;
|
||||
using namespace CppJieba;
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
|
||||
const char * const DEFAULT_DICTPATH = "../dict/jieba.dict.utf8";
|
||||
const char * const DEFAULT_MODELPATH = "../dict/hmm_model.utf8";
|
||||
|
||||
class ServerDemo: public IRequestHandler
|
||||
{
|
||||
|
91
test/servertest/load_test.py
Executable file
91
test/servertest/load_test.py
Executable file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf-8
|
||||
import time
|
||||
import urllib2
|
||||
import threading
|
||||
from Queue import Queue
|
||||
from time import sleep
|
||||
import sys
|
||||
|
||||
# 性能测试页面
|
||||
#PERF_TEST_URL = "http://10.2.66.38/?yyid=-1&suv=1309231700203264&callback=xxxxx"
|
||||
# Target URLs for the load test, one per line of the data file.
# Surrounding whitespace is stripped and blank lines are skipped so that a
# trailing empty line in the file never becomes a request target, and the
# file handle is closed as soon as the list is built.
with open("../testdata/load_test.urls", "r") as _urls_file:
    URLS = [_line.strip() for _line in _urls_file if _line.strip()]
|
||||
|
||||
# 配置:压力测试
|
||||
THREAD_NUM = 10 # 并发线程总数
|
||||
ONE_WORKER_NUM = 500 # 每个线程的循环次数
|
||||
LOOP_SLEEP = 0.01 # 每次请求时间间隔(秒)
|
||||
|
||||
# 配置:模拟运行状态
|
||||
#THREAD_NUM = 10 # 并发线程总数
|
||||
#ONE_WORKER_NUM = 10 # 每个线程的循环次数
|
||||
#LOOP_SLEEP = 0 # 每次请求时间间隔(秒)
|
||||
|
||||
|
||||
# 出错数
|
||||
ERROR_NUM = 0
|
||||
|
||||
|
||||
# Handles a single task: one request against one URL.

# Serialises updates to ERROR_NUM. "+=" on a global is a separate read and
# write, so concurrent worker threads could otherwise lose increments.
_ERROR_NUM_LOCK = threading.Lock()


def doWork(index, url):
    """Request `url` once and count a failure in the global ERROR_NUM.

    index -- sequence number of the request within the calling worker;
             only used to label the error output.
    url   -- address to request.

    A failed request never propagates out of this function: the error is
    printed and counted, so the calling worker goes on with its remaining
    requests and the final error total stays meaningful.
    """
    global ERROR_NUM
    t = threading.currentThread()

    try:
        # The body is read in full so the transfer is part of the request;
        # its content is not inspected.
        urllib2.urlopen(url).read()
    except Exception as e:
        # Deliberately broader than urllib2.URLError: a malformed URL or a
        # socket failure while reading the body raises other exception
        # types, which used to terminate the worker thread uncounted.
        print("[" + t.name + " " + str(index) + "] ")
        print(e)
        with _ERROR_NUM_LOCK:
            ERROR_NUM += 1
|
||||
|
||||
|
||||
# Worker thread body: issues ONE_WORKER_NUM requests one after another.
def working():
    """Run one worker's share of the load test.

    Calls doWork() ONE_WORKER_NUM times with request numbers starting at 1,
    picking the URL for each request by cycling through URLS, and sleeps
    LOOP_SLEEP seconds after every request. Prints a begin and an end
    marker tagged with the current thread's name.
    """
    worker = threading.currentThread()
    print("[" + worker.name + "] Sub Thread Begin")

    # Request numbers run from 1 to ONE_WORKER_NUM inclusive.
    for seq in range(1, ONE_WORKER_NUM + 1):
        doWork(seq, URLS[seq % len(URLS)])
        sleep(LOOP_SLEEP)

    print("[" + worker.name + "] Sub Thread End")
|
||||
|
||||
|
||||
def main():
    """Start THREAD_NUM worker threads, wait for them, and print a summary.

    The summary reports the total number of requests, the elapsed wall-clock
    time, the average time per request, the requests handled per second and
    the number of failed requests (ERROR_NUM).
    """
    def report(*fields):
        # Fields are rendered with str() and separated by single spaces.
        print(" ".join(str(field) for field in fields))

    started = time.time()

    # Create every worker first, then start them together.
    workers = []
    for n in range(THREAD_NUM):
        worker = threading.Thread(target=working, name="T" + str(n))
        worker.setDaemon(True)
        workers.append(worker)

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

    print("main thread end")

    finished = time.time()
    elapsed = finished - started
    total = THREAD_NUM * ONE_WORKER_NUM

    print("========================================")
    report("任务数量:", THREAD_NUM, "*", ONE_WORKER_NUM, "=", total)
    report("总耗时(秒):", elapsed)
    report("每次请求耗时(秒):", elapsed / total)
    report("每秒承载请求数:", 1 / (elapsed / total))
    report("错误数量:", ERROR_NUM)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
11
test/servertest/run_curl.sh
Executable file
11
test/servertest/run_curl.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/bin/sh
# Smoke test for a running server: requests the segmentation of one fixed
# sentence from 127.0.0.1:11200 and compares the response with the expected
# output stored in ../testdata/curl.res.
# Prints "ok" and exits 0 on a match, prints "failed." and exits 1 otherwise.

CURL_RES=../testdata/curl.res
TMP=curl.res.tmp

# Truncate rather than append, so a temp file left over from an interrupted
# run cannot leak into the comparison.
curl -s "http://127.0.0.1:11200/?key=南京市长江大桥" > "$TMP"

if diff "$TMP" "$CURL_RES" > /dev/null
then
    echo "ok"
    status=0
else
    echo "failed."
    status=1
fi

# Clean up before reporting, and hand the comparison result (not the status
# of rm) back to the caller.
rm -f "$TMP"
exit $status
|
1
test/testdata/curl.res
vendored
Normal file
1
test/testdata/curl.res
vendored
Normal file
@ -0,0 +1 @@
|
||||
["南京市", "长江大桥"]
|
1
test/testdata/load_test.urls
vendored
Normal file
1
test/testdata/load_test.urls
vendored
Normal file
@ -0,0 +1 @@
|
||||
http://127.0.0.1:11200/?key=南京市长江大桥
|
@ -3,9 +3,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib)
|
||||
|
||||
SET(GTEST_ROOT_DIR gtest-1.6.0)
|
||||
|
||||
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
|
||||
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
||||
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
||||
ADD_EXECUTABLE(test.run gtest_main.cc TChineseFilter.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp TMd5.cpp)
|
||||
FILE(GLOB SRCFILES *.cpp)
|
||||
ADD_EXECUTABLE(test.run ${SRCFILES})
|
||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||
|
||||
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(FullSegment, Test1)
|
||||
{
|
||||
FullSegment segment("../dicts/jieba.dict.utf8");
|
||||
FullSegment segment("../dict/jieba.dict.utf8");
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"我", "来自", "北京", "北京邮电", "北京邮电大学", "邮电", "邮电大学", "电大", "大学", "。", "。", "。", " ", "学号", " 123456"};
|
||||
vector<string> words;
|
||||
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(HMMSegmentTest, Test1)
|
||||
{
|
||||
HMMSegment segment("../dicts/hmm_model.utf8");;
|
||||
HMMSegment segment("../dict/hmm_model.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", " ", "学号", " 123456"};
|
||||
//string s;
|
||||
|
56
test/unittest/TKeywordExtractor.cpp
Normal file
56
test/unittest/TKeywordExtractor.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
#include "src/KeywordExtractor.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
// Requests the top 2 keywords of a short sentence and expects exactly
// "北京邮电大学" followed by "来自".
TEST(KeywordExtractorTest, Test1)
{
    const char* const sentence = "我来自北京邮电大学。。。 学号 123456";
    const char* const wanted[] = {"北京邮电大学", "来自"};
    const vector<string> expected(wanted, wanted + sizeof(wanted) / sizeof(wanted[0]));

    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    // The extractor must have loaded both dictionaries before it is used.
    ASSERT_TRUE(extractor);

    vector<string> keywords;
    ASSERT_TRUE(extractor.extract(sentence, keywords, 2));
    ASSERT_EQ(keywords, expected);
}
|
||||
|
||||
// Requests up to 9 keywords of the same short sentence; the expected result
// is still only the two keywords "北京邮电大学" and "来自".
TEST(KeywordExtractorTest, Test2)
{
    const char* const sentence = "我来自北京邮电大学。。。 学号 123456";
    const char* const wanted[] = {"北京邮电大学", "来自"};
    const vector<string> expected(wanted, wanted + sizeof(wanted) / sizeof(wanted[0]));

    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    // The extractor must have loaded both dictionaries before it is used.
    ASSERT_TRUE(extractor);

    vector<string> keywords;
    ASSERT_TRUE(extractor.extract(sentence, keywords, 9));
    ASSERT_EQ(keywords, expected);
}
|
||||
|
||||
|
||||
// Extracts the top 5 keywords from the whole weicheng.utf8 test document and
// compares them, in order, with the expected list.
TEST(KeywordExtractorTest, Test3)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(!!ifs);
    // Read the complete file into one string.
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));

    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    // Fail here, with a clear cause, if the dictionaries could not be loaded,
    // instead of later with a confusing keyword mismatch.
    ASSERT_TRUE(extractor);

    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
    vector<string> keywords;
    // Same overload as in Test1/Test2, whose result is asserted there too.
    ASSERT_TRUE(extractor.extract(str, keywords, 5));
    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
|
||||
|
||||
// Extracts the top 5 keywords together with their weights from the whole
// weicheng.utf8 test document and compares the serialized result with the
// expected text.
TEST(KeywordExtractorTest, Test4)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(!!ifs);
    // Read the complete file into one string.
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));

    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    // Fail here, with a clear cause, if the dictionaries could not be loaded,
    // instead of later with a confusing string mismatch.
    ASSERT_TRUE(extractor);

    vector<pair<string,double> > keywords;
    // NOTE(review): the result of this overload is not checked; if it returns
    // bool like the vector<string> overload, assert it as well - confirm.
    extractor.extract(str, keywords, 5);

    // Serialize the (keyword, weight) pairs into a string for comparison.
    string res;
    res << keywords;
    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
}
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(MPSegmentTest, Test1)
|
||||
{
|
||||
MPSegment segment("../dicts/jieba.dict.utf8");;
|
||||
MPSegment segment("../dict/jieba.dict.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。"," ","学","号", " 123456"};
|
||||
vector<string> words;
|
||||
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(MixSegmentTest, Test1)
|
||||
{
|
||||
MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");;
|
||||
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。"," ","学号", " 123456"};
|
||||
vector<string> words;
|
||||
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(QuerySegment, Test1)
|
||||
{
|
||||
QuerySegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8", 3);
|
||||
QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", 3);
|
||||
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
||||
const char* res[] = {"小明", "硕士", "毕业", "于", "中国", "中国科学院", "科学", "科学院", "学院", "计算所", ",", "后", "在", "日本", "日本京都大学", "京都", "京都大学", "大学", "深造"};
|
||||
vector<string> words;
|
||||
|
@ -1,9 +1,9 @@
|
||||
#include "src/ChineseFilter.hpp"
|
||||
#include "src/SegmentBase.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
TEST(ChineseFilterTest, Test1)
|
||||
TEST(SegmentBaseTest, Test1)
|
||||
{
|
||||
const char* str = "heheh你好...hh";
|
||||
string s;
|
||||
@ -16,9 +16,9 @@ TEST(ChineseFilterTest, Test1)
|
||||
uint offset = 0;
|
||||
while(offset < size)
|
||||
{
|
||||
uint len;
|
||||
uint len = 0;
|
||||
const char* t = str + offset;
|
||||
int ret = filterAscii(t, size - offset, len);
|
||||
SegmentBase::filterAscii(t, size - offset, len);
|
||||
s.assign(t, len);
|
||||
res.push_back(s);
|
||||
//cout<<s<<","<<ret<<","<<len<<endl;
|
@ -3,7 +3,7 @@
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
static const char* const DICT_FILE = "../dicts/jieba.dict.utf8";
|
||||
static const char* const DICT_FILE = "../dict/jieba.dict.utf8";
|
||||
|
||||
TEST(TrieTest, Test1)
|
||||
{
|
||||
|
@ -496,7 +496,20 @@
|
||||
# undef _TR1_FUNCTIONAL // Allows the user to #include
|
||||
// <tr1/functional> if he chooses to.
|
||||
# else
|
||||
# if defined (__cplusplus) && __cplusplus > 199711L
|
||||
# include <tuple>
|
||||
namespace std {
|
||||
namespace tr1 {
|
||||
using std::tuple;
|
||||
using std::tuple_element;
|
||||
using std::get;
|
||||
using std::tuple_size;
|
||||
using std::make_tuple;
|
||||
}
|
||||
}
|
||||
# else
|
||||
# include <tr1/tuple> // NOLINT
|
||||
# endif
|
||||
# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
|
||||
|
||||
# else
|
||||
|
Loading…
x
Reference in New Issue
Block a user