Merge remote-tracking branch 'origin/dev'

This commit is contained in:
wyy 2013-12-24 02:59:19 -08:00
commit 5236c634b2
18 changed files with 296 additions and 159 deletions

View File

@ -1,13 +1,15 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjsegment segment.cpp)
ADD_EXECUTABLE(cjserver server.cpp) ADD_EXECUTABLE(cjserver server.cpp)
TARGET_LINK_LIBRARIES(cjserver pthread) TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp) ADD_SUBDIRECTORY(Limonp)

View File

@ -215,7 +215,7 @@ namespace CppJieba
{ {
return false; return false;
} }
splitStr(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM)
{ {
LogError("start_p illegal"); LogError("start_p illegal");
@ -234,7 +234,7 @@ namespace CppJieba
{ {
return false; return false;
} }
splitStr(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM)
{ {
LogError("trans_p illegal"); LogError("trans_p illegal");
@ -284,7 +284,7 @@ namespace CppJieba
{ {
continue; continue;
} }
if(strStartsWith(line, "#")) if(startsWith(line, "#"))
{ {
continue; continue;
} }
@ -300,10 +300,10 @@ namespace CppJieba
} }
vector<string> tmp, tmp2; vector<string> tmp, tmp2;
uint16_t unico = 0; uint16_t unico = 0;
splitStr(line, tmp, ","); split(line, tmp, ",");
for(uint i = 0; i < tmp.size(); i++) for(uint i = 0; i < tmp.size(); i++)
{ {
splitStr(tmp[i], tmp2, ":"); split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) if(2 != tmp2.size())
{ {
LogError("_emitProb illegal."); LogError("_emitProb illegal.");

View File

@ -3,7 +3,7 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
#include "../Limonp/logger.hpp" #include "Limonp/logger.hpp"
namespace Husky namespace Husky
{ {
@ -88,7 +88,7 @@ namespace Husky
} }
string firstline(headerStr, lpos, rpos - lpos); string firstline(headerStr, lpos, rpos - lpos);
trim(firstline); trim(firstline);
if(!splitStr(firstline, buf, " ") || 3 != buf.size()) if(!split(firstline, buf, " ") || 3 != buf.size())
{ {
LogFatal("parse header first line failed."); LogFatal("parse header first line failed.");
return false; return false;

View File

@ -29,7 +29,7 @@ namespace Husky
using namespace Limonp; using namespace Limonp;
typedef int SOCKET; typedef int SOCKET;
const struct timeval SOCKET_TIMEOUT = {2, 0}; const struct timeval SOCKET_TIMEOUT = {2, 0};
const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: FrameServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n"; const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: HuskyServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
const char* const RESPONSE_CHARSET_UTF8 = "UTF-8"; const char* const RESPONSE_CHARSET_UTF8 = "UTF-8";
const char* const RESPONSE_CHARSET_GB2312 = "GB2312"; const char* const RESPONSE_CHARSET_GB2312 = "GB2312";
const char* const CLIENT_IP_K = "CLIENT_IP"; const char* const CLIENT_IP_K = "CLIENT_IP";
@ -53,13 +53,13 @@ namespace Husky
bool * pShutdown; bool * pShutdown;
}; };
class ServerFrame//: public IWorkHandler class HuskyServer
{ {
private: private:
pthread_mutex_t m_pmAccept; pthread_mutex_t m_pmAccept;
bool m_bShutdown; bool m_bShutdown;
public: public:
ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler) explicit HuskyServer(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
{ {
m_bShutdown = false; m_bShutdown = false;
m_nLsnPort = nPort; m_nLsnPort = nPort;
@ -68,7 +68,7 @@ namespace Husky
assert(pHandler); assert(pHandler);
pthread_mutex_init(&m_pmAccept,NULL); pthread_mutex_init(&m_pmAccept,NULL);
}; };
virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);}; virtual ~HuskyServer(){pthread_mutex_destroy(&m_pmAccept);};
virtual bool init() virtual bool init()
{ {
@ -292,8 +292,6 @@ namespace Husky
u_short m_nThreadCount; u_short m_nThreadCount;
SOCKET m_lsnSock; SOCKET m_lsnSock;
IRequestHandler *m_pHandler; IRequestHandler *m_pHandler;
//static bool m_bShutdown;
//static pthread_mutex_t m_pmAccept;
}; };

View File

@ -5,8 +5,6 @@
#include <vector> #include <vector>
#include <map> #include <map>
#define INFINITE 0
namespace Husky namespace Husky
{ {
using namespace std; using namespace std;
@ -15,47 +13,43 @@ namespace Husky
{ {
private: private:
typedef int HANDLE; typedef int HANDLE;
typedef int DWORD;
typedef void *(* PThreadFunc)(void* param); typedef void *(* PThreadFunc)(void* param);
public: public:
ThreadManager(){;} ThreadManager(){;}
~ThreadManager(){} ~ThreadManager(){}
unsigned int HandleCount(){return m_vecHandle.size();} size_t HandleCount(){return _handles.size();}
void clear() void clear()
{ {
m_vecHandle.clear(); _handles.clear();
} }
HANDLE CreateThread( PThreadFunc pFunc,void *pPara) HANDLE CreateThread( PThreadFunc pFunc,void *pPara)
{ {
pthread_t pt; pthread_t pt;
int nErrorCode=pthread_create(&pt,NULL,pFunc,pPara); int nErrorCode = pthread_create(&pt,NULL,pFunc,pPara);
if(nErrorCode!=0) if(nErrorCode != 0)
return nErrorCode; return nErrorCode;
m_vecHandle.push_back(pt); //加入线程列表 为WaitForMultipleObjects准备 _handles.push_back(pt);
return nErrorCode; return nErrorCode;
} }
//hThread (thread handler) : 为0时为默认最后一个加入管理器的线程句柄 int Wait(HANDLE hThread = 0)
//dwMilliseconds等待时间 : 单位毫秒,默认值无穷时间
//return value : -1句柄无效其他值 WaitForSingleObject函数的返回值
DWORD Wait(HANDLE hThread=0,DWORD dwMilliseconds=INFINITE )
{ {
if( hThread==0)//最后一个加入的线程 if( hThread == 0)//the last handle
{ {
if(!m_vecHandle.empty()) if(!_handles.empty())
{ {
return pthread_join(m_vecHandle.back(),NULL); return pthread_join(_handles.back(),NULL);
} }
else else
return -1; return -1;
} }
else else
{ {
if (find(m_vecHandle.begin(),m_vecHandle.end(),hThread)==m_vecHandle.end())//不存在此句柄 if (find(_handles.begin(),_handles.end(),hThread) == _handles.end())
{ {
return -1; return -1;
} }
@ -65,31 +59,26 @@ namespace Husky
} }
int WaitMultipleThread()
//等待所有线程执行完毕
//bWaitAll是否所有线程 : 默认值1等待所有线程,0有任何线程结束此函数返回
//dwMilliseconds : 单位毫秒,默认值无穷时间
//return value : -1没有任何句柄其他值 WaitForMultipleObjects函数的返回值
DWORD WaitMultipleThread( bool bWaitAll=1,DWORD dwMilliseconds=INFINITE)
{ {
if (m_vecHandle.empty()) if (_handles.empty())
return -1; return -1;
int nErrorcode; int nErrorcode;
for (uint i=0;i<m_vecHandle.size();++i) for (uint i = 0; i < _handles.size(); i++)
{ {
nErrorcode=pthread_join(m_vecHandle[i], NULL); nErrorcode = pthread_join(_handles[i], NULL);
if (nErrorcode!=0) if (nErrorcode != 0)
return nErrorcode; return nErrorcode;
} }
return 0; return 0;
} }
private: private:
vector<pthread_t> m_vecHandle; vector<pthread_t> _handles;
private: private:
ThreadManager(const ThreadManager&){;}// copy forbidden ThreadManager(const ThreadManager&){;}// copy forbidden
void operator=(const ThreadManager &){}// copy forbidden void operator = (const ThreadManager &){}// copy forbidden
}; };
} }

146
src/KeywordExtractor.hpp Normal file
View File

@ -0,0 +1,146 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
#include "MPSegment.hpp"
#include <cmath>
namespace CppJieba
{
using namespace Limonp;
// One dictionary entry as used by KeywordExtractor: the word itself, the
// frequency read for it, and the idf weight assigned to it.
struct KeyWordInfo
{
    string word;
    uint freq;
    double idf;
};

// Streams a KeyWordInfo as "word,freq,idf" and returns the stream so that
// further insertions can be chained.
inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
{
    os << keyword.word;
    os << ",";
    os << keyword.freq;
    os << ",";
    os << keyword.idf;
    return os;
}
// Picks the highest-weighted words of a text.
//
// init() reads a dictionary with three space-separated fields per line (the
// word, a positive integer frequency, and a third field that is not used
// here) and gives every word the weight
//     idf = log(total frequency of all accepted lines / frequency of the word).
// extract() segments the input with an MPSegment, counts how often each
// segment occurs, multiplies that count by the word's idf and returns the
// topN best-scoring words. Segments that are not in the dictionary are dropped.
//
// NOTE(review): _wordIndex stores pointers into _wordinfos, so a copied
// extractor would still point into the original object's vector. Avoid
// copying instances until that is addressed.
class KeywordExtractor
{
    private:
        MPSegment _segment;
    private:
        // word -> its entry in _wordinfos. Only (re)built once _wordinfos has
        // stopped growing, so the stored addresses are not invalidated.
        unordered_map<string, const KeyWordInfo* > _wordIndex;
        vector<KeyWordInfo> _wordinfos;
        // Sum of the frequencies of all accepted dictionary lines.
        size_t _totalFreq;
    protected:
        bool _isInited;
        bool _getInitFlag()const{return _isInited;}
        bool _setInitFlag(bool flag){return _isInited = flag;}
    public:
        // True once init() has succeeded.
        operator bool() const {return _getInitFlag();}
    public:
        // Creates an unusable extractor; call init() before extract().
        KeywordExtractor(): _totalFreq(0), _isInited(false) {}
        // Creates an extractor and loads dictPath right away; test the object
        // in a boolean context to find out whether loading succeeded.
        explicit KeywordExtractor(const string& dictPath): _totalFreq(0), _isInited(false)
        {
            _setInitFlag(init(dictPath));
        }
        ~KeywordExtractor(){}
    public:
        // Loads the dictionary at dictPath and initialises the segmenter with
        // the same file. Empty or malformed lines are logged and skipped.
        // Returns false when the file cannot be opened or the segmenter fails
        // to initialise. Any state from a previous call is discarded first.
        bool init(const string& dictPath)
        {
            // Start from a clean slate so that calling init() again neither
            // duplicates entries nor leaves a stale "initialised" flag behind.
            _setInitFlag(false);
            _wordIndex.clear();
            _wordinfos.clear();
            _totalFreq = 0;

            ifstream ifs(dictPath.c_str());
            if(!ifs)
            {
                LogError("open %s failed.", dictPath.c_str());
                return false;
            }

            string line;
            vector<string> buf;
            KeyWordInfo keywordInfo;
            keywordInfo.idf = 0.0; // real value is filled in once the total is known
            for(uint lineno = 0; getline(ifs, line); lineno++)
            {
                buf.clear();
                if(line.empty())
                {
                    LogError("line[%u] empty. skipped.", lineno);
                    continue;
                }
                if(!split(line, buf, " ") || buf.size() != 3)
                {
                    LogError("line %u [%s] illegal. skipped.", lineno, line.c_str());
                    continue;
                }
                const int tfreq = atoi(buf[1].c_str());
                if(tfreq <= 0)
                {
                    LogError("line %u [%s] illegal. skipped.", lineno, line.c_str());
                    continue;
                }
                keywordInfo.word = buf[0];
                keywordInfo.freq = tfreq;
                _totalFreq += tfreq;
                _wordinfos.push_back(keywordInfo);
            }

            // Calculate idf and build the index. Every stored freq is >= 1
            // (checked above), so the quotient is >= 1 and idf is never negative.
            for(size_t i = 0; i < _wordinfos.size(); i++)
            {
                _wordinfos[i].idf = log(static_cast<double>(_totalFreq) / _wordinfos[i].freq);
                _wordIndex[_wordinfos[i].word] = &(_wordinfos[i]);
            }

            return _setInitFlag(_segment.init(dictPath));
        }
    public:
        // Stores in `keywords` at most topN words of `str`, best score first.
        // Returns false (leaving `keywords` untouched) when segmentation fails.
        // The relative order of words with equal scores is unspecified.
        bool extract(const string& str, vector<string>& keywords, uint topN) const
        {
            assert(_getInitFlag());

            vector<string> words;
            if(!_segment.cut(str, words))
            {
                LogError("segment cut(%s) failed.", str.c_str());
                return false;
            }

            // Number of occurrences of every distinct segment.
            unordered_map<string, double> wordmap;
            for(size_t i = 0; i < words.size(); i++)
            {
                wordmap[ words[i] ] += 1.0;
            }

            // Keep only dictionary words, weighted by occurrences * idf.
            vector<pair<string, double> > scored;
            scored.reserve(wordmap.size());
            for(unordered_map<string, double>::const_iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr)
            {
                unordered_map<string, const KeyWordInfo*>::const_iterator cit = _wordIndex.find(itr->first);
                if(cit != _wordIndex.end())
                {
                    scored.push_back(pair<string, double>(itr->first, itr->second * cit->second->idf));
                }
            }

            // topN is uint and size() is size_t; name the type explicitly so
            // that min() compiles where the two differ.
            const size_t count = min<size_t>(topN, scored.size());
            partial_sort(scored.begin(), scored.begin() + count, scored.end(), _cmp);

            keywords.clear();
            for(size_t i = 0; i < count; i++)
            {
                keywords.push_back(scored[i].first);
            }
            return true;
        }
    private:
        // Orders by descending score. The parameter type matches the sorted
        // elements exactly so the double scores are compared as they are.
        static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
        {
            return lhs.second > rhs.second;
        }
};
}
#endif

View File

@ -22,9 +22,9 @@ namespace Limonp
for(int i = 0; i < argc; i++) for(int i = 0; i < argc; i++)
{ {
if(strStartsWith(argv[i], "-")) if(startsWith(argv[i], "-"))
{ {
if(i + 1 < argc && !strStartsWith(argv[i + 1], "-")) if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
{ {
_mpss[argv[i]] = argv[i+1]; _mpss[argv[i]] = argv[i+1];
i++; i++;

View File

@ -18,7 +18,17 @@ namespace Limonp
class Config class Config
{ {
public: public:
bool loadFile(const char * const filePath) Config(const char * const filePath)
{
_loadFile(filePath);
}
public:
operator bool ()
{
return !_map.empty();
}
private:
bool _loadFile(const char * const filePath)
{ {
ifstream ifs(filePath); ifstream ifs(filePath);
if(!ifs) if(!ifs)
@ -33,12 +43,12 @@ namespace Limonp
{ {
lineno ++; lineno ++;
trim(line); trim(line);
if(line.empty() || strStartsWith(line, "#")) if(line.empty() || startsWith(line, "#"))
{ {
continue; continue;
} }
vecBuf.clear(); vecBuf.clear();
if(!splitStr(line, vecBuf, "=") || 2 != vecBuf.size()) if(!split(line, vecBuf, "=") || 2 != vecBuf.size())
{ {
LogFatal("line[%d:%s] is illegal.", lineno, line.c_str()); LogFatal("line[%d:%s] is illegal.", lineno, line.c_str());
return false; return false;
@ -57,6 +67,7 @@ namespace Limonp
ifs.close(); ifs.close();
return true; return true;
} }
public:
bool get(const string& key, string& value) const bool get(const string& key, string& value) const
{ {
map<string, string>::const_iterator it = _map.find(key); map<string, string>::const_iterator it = _map.find(key);
@ -73,7 +84,7 @@ namespace Limonp
friend ostream& operator << (ostream& os, const Config& config); friend ostream& operator << (ostream& os, const Config& config);
}; };
ostream& operator << (ostream& os, const Config& config) inline ostream& operator << (ostream& os, const Config& config)
{ {
return os << config._map; return os << config._map;
} }

View File

@ -11,6 +11,7 @@
#include <string> #include <string>
#include <stdio.h> #include <stdio.h>
#include <stdarg.h> #include <stdarg.h>
#include <cassert>
#include "io_functs.hpp" #include "io_functs.hpp"
#include "str_functs.hpp" #include "str_functs.hpp"
@ -23,6 +24,7 @@
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) #define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
namespace Limonp namespace Limonp
{ {
using namespace std; using namespace std;
@ -36,16 +38,11 @@ namespace Limonp
public: public:
static bool Logging(uint level, const string& msg, const char* fileName, int lineNo) static bool Logging(uint level, const string& msg, const char* fileName, int lineNo)
{ {
if(level > LL_FATAL) assert(level <= LL_FATAL);
{
cerr<<"level's value is out of range"<<endl;
return false;
}
char buf[CSTR_BUFFER_SIZE]; char buf[CSTR_BUFFER_SIZE];
time_t timeNow; time_t timeNow;
time(&timeNow); time(&timeNow);
size_t ret = strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)); if(!strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)))
if(0 == ret)
{ {
fprintf(stderr, "stftime failed.\n"); fprintf(stderr, "stftime failed.\n");
return false; return false;
@ -55,6 +52,9 @@ namespace Limonp
} }
static bool LoggingF(uint level, const char* fileName, int lineNo, const string& fmt, ...) static bool LoggingF(uint level, const char* fileName, int lineNo, const string& fmt, ...)
{ {
#ifdef LOGGER_LEVEL
if(level < LOGGER_LEVEL) return true;
#endif
int size = 256; int size = 256;
string msg; string msg;
va_list ap; va_list ap;

View File

@ -72,28 +72,6 @@ namespace Limonp
} }
} }
//inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
//{
// if(src.empty())
// {
// return false;
// }
// for(uint i = 0; i < src.size() - 1; i++)
// {
// dest += src[i];
// dest += connectorStr;
// }
// dest += src[src.size() - 1];
// return true;
//}
//inline string joinStr(const vector<string>& source, const string& connector)
//{
// string res;
// joinStr(source, res, connector);
// return res;
//}
template<class T> template<class T>
void join(T begin, T end, string& res, const string& connector) void join(T begin, T end, string& res, const string& connector)
{ {
@ -122,7 +100,7 @@ namespace Limonp
inline bool splitStr(const string& src, vector<string>& res, const string& pattern) inline bool split(const string& src, vector<string>& res, const string& pattern)
{ {
if(src.empty()) if(src.empty())
{ {
@ -181,20 +159,9 @@ namespace Limonp
} }
inline uint16_t twocharToUint16(char high, char low)
{
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
}
inline pair<char, char> uint16ToChar2(uint16_t in)
{
pair<char, char> res;
res.first = (in>>8) & 0x00ff; //high
res.second = (in) & 0x00ff; //low
return res;
}
inline bool strStartsWith(const string& str, const string& prefix) inline bool startsWith(const string& str, const string& prefix)
{ {
//return str.substr(0, prefix.size()) == prefix; //return str.substr(0, prefix.size()) == prefix;
if(prefix.length() > str.length()) if(prefix.length() > str.length())
@ -204,7 +171,7 @@ namespace Limonp
return 0 == str.compare(0, prefix.length(), prefix); return 0 == str.compare(0, prefix.length(), prefix);
} }
inline bool strEndsWith(const string& str, const string& suffix) inline bool endsWith(const string& str, const string& suffix)
{ {
if(suffix.length() > str.length()) if(suffix.length() > str.length())
{ {
@ -218,13 +185,19 @@ namespace Limonp
return str.find(ch) != string::npos; return str.find(ch) != string::npos;
} }
inline uint16_t twocharToUint16(char high, char low)
{
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
}
inline bool utf8ToUnicode(const char * const str, uint len, vector<uint16_t>& vec) inline bool utf8ToUnicode(const char * const str, uint len, vector<uint16_t>& vec)
{ {
char ch1, ch2;
if(!str) if(!str)
{ {
return false; return false;
} }
char ch1, ch2;
uint16_t tmp;
vec.clear(); vec.clear();
for(uint i = 0;i < len;) for(uint i = 0;i < len;)
{ {
@ -237,14 +210,16 @@ namespace Limonp
{ {
ch1 = (str[i] >> 2) & 0x07; ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
vec.push_back(twocharToUint16(ch1, ch2)); tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 2; i += 2;
} }
else if((unsigned char)str[i] <= 0xef && i + 2 < len) else if((unsigned char)str[i] <= 0xef && i + 2 < len)
{ {
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
vec.push_back(twocharToUint16(ch1, ch2)); tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 3; i += 3;
} }
else else
@ -310,7 +285,8 @@ namespace Limonp
{ {
if(i + 1 < len) //&& (str[i+1] & 0x80)) if(i + 1 < len) //&& (str[i+1] & 0x80))
{ {
vec.push_back(twocharToUint16(str[i], str[i + 1])); uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
vec.push_back(tmp);
i += 2; i += 2;
} }
else else
@ -321,11 +297,20 @@ namespace Limonp
} }
return true; return true;
} }
inline bool gbkTrans(const string& str, vector<uint16_t>& vec) inline bool gbkTrans(const string& str, vector<uint16_t>& vec)
{ {
return gbkTrans(str.c_str(), str.size(), vec); return gbkTrans(str.c_str(), str.size(), vec);
} }
//inline pair<char, char> uint16ToChar2(uint16_t in)
//{
// pair<char, char> res;
// res.first = (in>>8) & 0x00ff; //high
// res.second = (in) & 0x00ff; //low
// return res;
//}
inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res) inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{ {
if(begin >= end) if(begin >= end)
@ -333,18 +318,21 @@ namespace Limonp
return false; return false;
} }
res.clear(); res.clear();
pair<char, char> pa; //pair<char, char> pa;
char first, second;
while(begin != end) while(begin != end)
{ {
pa = uint16ToChar2(*begin); //pa = uint16ToChar2(*begin);
if(pa.first & 0x80) first = ((*begin)>>8) & 0x00ff;
second = (*begin) & 0x00ff;
if(first & 0x80)
{ {
res += pa.first; res += first;
res += pa.second; res += second;
} }
else else
{ {
res += pa.second; res += second;
} }
begin++; begin++;
} }

View File

@ -32,7 +32,7 @@ namespace CppJieba
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
private: protected:
Trie* _trie; Trie* _trie;
public: public:

View File

@ -19,9 +19,10 @@ namespace CppJieba
bool _isInited; bool _isInited;
bool _getInitFlag()const{return _isInited;}; bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;}; bool _setInitFlag(bool flag){return _isInited = flag;};
public: public:
operator bool(){return _getInitFlag();}; operator bool(){return _getInitFlag();};
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const virtual bool cut(const string& str, vector<string>& res)const
{ {

View File

@ -1,32 +0,0 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
// Wrapper around an MPSegment that is constructed from a dictionary file path.
// init() and dispose() forward to the segment and return its result.
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
// dictFile is handed straight to the MPSegment constructor.
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
bool init(){return _segment.init();};
bool dispose(){return _segment.dispose();};
public:
// NOTE(review): `str` and `topN` are never read; only `words` is passed on to
// the segment, so no ranking or truncation happens in this function.
// NOTE(review): the second return statement can never be reached.
// TODO confirm that MPSegment provides a cut() overload taking just the
// output vector; none is visible from this file.
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
return true;
}
};
}
#endif

View File

@ -130,26 +130,13 @@ namespace CppJieba
} }
bool loadDict(const char * const filePath) bool loadDict(const char * const filePath)
{ {
if(!_getInitFlag()) assert(_getInitFlag());
{ if(!_trieInsert(filePath))
LogError("not initted.");
return false;
}
if(!checkFileExist(filePath))
{
LogError("cann't find fiel[%s].",filePath);
return false;
}
bool res = false;
res = _trieInsert(filePath);
if(!res)
{ {
LogError("_trieInsert failed."); LogError("_trieInsert failed.");
return false; return false;
} }
res = _countWeight(); if(!_countWeight())
if(!res)
{ {
LogError("_countWeight failed."); LogError("_countWeight failed.");
return false; return false;
@ -339,15 +326,20 @@ namespace CppJieba
private: private:
bool _trieInsert(const char * const filePath) bool _trieInsert(const char * const filePath)
{ {
ifstream ifile(filePath); ifstream ifs(filePath);
if(!ifs)
{
LogError("open %s failed.", filePath);
return false;
}
string line; string line;
vector<string> vecBuf; vector<string> vecBuf;
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
while(getline(ifile, line)) while(getline(ifs, line))
{ {
vecBuf.clear(); vecBuf.clear();
splitStr(line, vecBuf, " "); split(line, vecBuf, " ");
if(3 < vecBuf.size()) if(3 < vecBuf.size())
{ {
LogError("line[%s] illegal.", line.c_str()); LogError("line[%s] illegal.", line.c_str());

View File

@ -4,7 +4,7 @@
#include <ctype.h> #include <ctype.h>
#include <string.h> #include <string.h>
#include "Limonp/Config.hpp" #include "Limonp/Config.hpp"
#include "Husky/ServerFrame.hpp" #include "Husky/HuskyServer.hpp"
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "MixSegment.hpp" #include "MixSegment.hpp"
@ -38,8 +38,8 @@ bool run(int argc, char** argv)
{ {
return false; return false;
} }
Config conf; Config conf(argv[1]);
if(!conf.loadFile(argv[1])) if(!conf)
{ {
return false; return false;
} }
@ -90,7 +90,7 @@ bool run(int argc, char** argv)
} }
ReqHandler reqHandler(dictPath, modelPath); ReqHandler reqHandler(dictPath, modelPath);
ServerFrame sf(port, threadNum, &reqHandler); HuskyServer sf(port, threadNum, &reqHandler);
return sf.init() && sf.run(); return sf.init() && sf.run();
} }

View File

@ -3,9 +3,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib)
SET(GTEST_ROOT_DIR gtest-1.6.0) SET(GTEST_ROOT_DIR gtest-1.6.0)
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp) FILE(GLOB SRCFILES *.cpp)
ADD_EXECUTABLE(test.run ${SRCFILES})
TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,40 @@
#include "src/KeywordExtractor.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Asks for the two best keywords of a short sentence and compares them,
// order included, with the expected pair.
TEST(KeywordExtractorTest, Test1)
{
    const string text = "我来自北京邮电大学。。。 学号 123456";
    const char* expectedWords[] = {"北京邮电大学", "来自"};
    const size_t expectedCount = sizeof(expectedWords) / sizeof(expectedWords[0]);
    const vector<string> expected(expectedWords, expectedWords + expectedCount);

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    ASSERT_TRUE(extractor);

    vector<string> words;
    ASSERT_TRUE(extractor.extract(text, words, 2));
    ASSERT_EQ(words, expected);
}
// Asks for more keywords (9) than the sentence is expected to provide and
// compares the complete result with the expected list.
// NOTE(review): the expected list ends with three empty strings; confirm that
// extract() is really meant to return them.
TEST(KeywordExtractorTest, Test2)
{
    const string text = "我来自北京邮电大学。。。 学号 123456";
    const char* expectedWords[] = {"北京邮电大学", "来自", "", "", ""};
    const size_t expectedCount = sizeof(expectedWords) / sizeof(expectedWords[0]);
    const vector<string> expected(expectedWords, expectedWords + expectedCount);

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    ASSERT_TRUE(extractor);

    vector<string> words;
    ASSERT_TRUE(extractor.extract(text, words, 9));
    ASSERT_EQ(words, expected);
}
// Extracts the five best keywords of a whole text file and compares their
// printed form with the expected string.
TEST(KeywordExtractorTest, Test3)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    // is_open() states the intent directly and does not rely on an implicit
    // stream-to-bool conversion inside the assertion macro.
    ASSERT_TRUE(ifs.is_open());
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    // Stop right here when the dictionary could not be loaded or extraction
    // failed, instead of reporting a confusing string mismatch further down.
    ASSERT_TRUE(extractor);

    vector<string> keywords;
    ASSERT_TRUE(extractor.extract(str, keywords, 5));

    string res;
    res << keywords;
    ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res);
}