Merge remote-tracking branch 'origin/dev'

This commit is contained in:
wyy 2013-12-24 02:59:19 -08:00
commit 5236c634b2
18 changed files with 296 additions and 159 deletions

View File

@ -1,13 +1,15 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
ADD_EXECUTABLE(cjsegment segment.cpp)
ADD_EXECUTABLE(cjserver server.cpp)
TARGET_LINK_LIBRARIES(cjserver pthread)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
INSTALL(FILES HMMSegment.hpp MPSegment.hpp Trie.hpp TrieManager.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp KeywordExtractor.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp)

View File

@ -215,7 +215,7 @@ namespace CppJieba
{
return false;
}
splitStr(line, tmp, " ");
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("start_p illegal");
@ -234,7 +234,7 @@ namespace CppJieba
{
return false;
}
splitStr(line, tmp, " ");
split(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
@ -284,7 +284,7 @@ namespace CppJieba
{
continue;
}
if(strStartsWith(line, "#"))
if(startsWith(line, "#"))
{
continue;
}
@ -300,10 +300,10 @@ namespace CppJieba
}
vector<string> tmp, tmp2;
uint16_t unico = 0;
splitStr(line, tmp, ",");
split(line, tmp, ",");
for(uint i = 0; i < tmp.size(); i++)
{
splitStr(tmp[i], tmp2, ":");
split(tmp[i], tmp2, ":");
if(2 != tmp2.size())
{
LogError("_emitProb illegal.");

View File

@ -3,7 +3,7 @@
#include <iostream>
#include <string>
#include "../Limonp/logger.hpp"
#include "Limonp/logger.hpp"
namespace Husky
{
@ -88,7 +88,7 @@ namespace Husky
}
string firstline(headerStr, lpos, rpos - lpos);
trim(firstline);
if(!splitStr(firstline, buf, " ") || 3 != buf.size())
if(!split(firstline, buf, " ") || 3 != buf.size())
{
LogFatal("parse header first line failed.");
return false;

View File

@ -29,7 +29,7 @@ namespace Husky
using namespace Limonp;
typedef int SOCKET;
const struct timeval SOCKET_TIMEOUT = {2, 0};
const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: FrameServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
const char* const RESPONSE_FORMAT = "HTTP/1.1 200 OK\r\nConnection: close\r\nServer: HuskyServer/1.0.0\r\nContent-Type: text/json; charset=%s\r\nContent-Length: %d\r\n\r\n";
const char* const RESPONSE_CHARSET_UTF8 = "UTF-8";
const char* const RESPONSE_CHARSET_GB2312 = "GB2312";
const char* const CLIENT_IP_K = "CLIENT_IP";
@ -53,13 +53,13 @@ namespace Husky
bool * pShutdown;
};
class ServerFrame//: public IWorkHandler
class HuskyServer
{
private:
pthread_mutex_t m_pmAccept;
bool m_bShutdown;
public:
ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
explicit HuskyServer(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
{
m_bShutdown = false;
m_nLsnPort = nPort;
@ -68,7 +68,7 @@ namespace Husky
assert(pHandler);
pthread_mutex_init(&m_pmAccept,NULL);
};
virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);};
virtual ~HuskyServer(){pthread_mutex_destroy(&m_pmAccept);};
virtual bool init()
{
@ -292,8 +292,6 @@ namespace Husky
u_short m_nThreadCount;
SOCKET m_lsnSock;
IRequestHandler *m_pHandler;
//static bool m_bShutdown;
//static pthread_mutex_t m_pmAccept;
};

View File

@ -5,8 +5,6 @@
#include <vector>
#include <map>
#define INFINITE 0
namespace Husky
{
using namespace std;
@ -15,47 +13,43 @@ namespace Husky
{
private:
typedef int HANDLE;
typedef int DWORD;
typedef void *(* PThreadFunc)(void* param);
public:
ThreadManager(){;}
~ThreadManager(){}
unsigned int HandleCount(){return m_vecHandle.size();}
size_t HandleCount(){return _handles.size();}
void clear()
{
m_vecHandle.clear();
_handles.clear();
}
HANDLE CreateThread( PThreadFunc pFunc,void *pPara)
{
pthread_t pt;
int nErrorCode=pthread_create(&pt,NULL,pFunc,pPara);
if(nErrorCode!=0)
int nErrorCode = pthread_create(&pt,NULL,pFunc,pPara);
if(nErrorCode != 0)
return nErrorCode;
m_vecHandle.push_back(pt); //加入线程列表 为WaitForMultipleObjects准备
_handles.push_back(pt);
return nErrorCode;
}
//hThread (thread handler) : 为0时为默认最后一个加入管理器的线程句柄
//dwMilliseconds等待时间 : 单位毫秒,默认值无穷时间
//return value : -1句柄无效其他值 WaitForSingleObject函数的返回值
DWORD Wait(HANDLE hThread=0,DWORD dwMilliseconds=INFINITE )
int Wait(HANDLE hThread = 0)
{
if( hThread==0)//最后一个加入的线程
if( hThread == 0)//the last handle
{
if(!m_vecHandle.empty())
if(!_handles.empty())
{
return pthread_join(m_vecHandle.back(),NULL);
return pthread_join(_handles.back(),NULL);
}
else
return -1;
}
else
{
if (find(m_vecHandle.begin(),m_vecHandle.end(),hThread)==m_vecHandle.end())//不存在此句柄
if (find(_handles.begin(),_handles.end(),hThread) == _handles.end())
{
return -1;
}
@ -65,31 +59,26 @@ namespace Husky
}
//等待所有线程执行完毕
//bWaitAll是否所有线程 : 默认值1等待所有线程,0有任何线程结束此函数返回
//dwMilliseconds : 单位毫秒,默认值无穷时间
//return value : -1没有任何句柄其他值 WaitForMultipleObjects函数的返回值
DWORD WaitMultipleThread( bool bWaitAll=1,DWORD dwMilliseconds=INFINITE)
int WaitMultipleThread()
{
if (m_vecHandle.empty())
if (_handles.empty())
return -1;
int nErrorcode;
for (uint i=0;i<m_vecHandle.size();++i)
for (uint i = 0; i < _handles.size(); i++)
{
nErrorcode=pthread_join(m_vecHandle[i], NULL);
if (nErrorcode!=0)
nErrorcode = pthread_join(_handles[i], NULL);
if (nErrorcode != 0)
return nErrorcode;
}
return 0;
}
private:
vector<pthread_t> m_vecHandle;
vector<pthread_t> _handles;
private:
ThreadManager(const ThreadManager&){;}// copy forbidden
void operator=(const ThreadManager &){}// copy forbidden
void operator = (const ThreadManager &){}// copy forbidden
};
}

146
src/KeywordExtractor.hpp Normal file
View File

@ -0,0 +1,146 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H

#include "MPSegment.hpp"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

namespace CppJieba
{
    using namespace Limonp;

    // One dictionary entry: the word, its raw frequency as read from the
    // dictionary file, and the inverse-frequency weight derived from it.
    struct KeyWordInfo
    {
        string word;
        uint freq;
        double idf;
    };

    // Prints an entry as "word,freq,idf".
    inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
    {
        return os << keyword.word << "," << keyword.freq << "," << keyword.idf;
    }

    // Extracts the top-N keywords of a text.
    //
    // The text is cut into words with an MPSegment; every word that is present
    // in the dictionary is scored as
    //     (occurrences in the text) * log(total dictionary frequency / word frequency)
    // and the highest scoring words are returned.
    class KeywordExtractor
    {
        private:
            MPSegment _segment;
        private:
            // word -> position in _wordinfos. Positions (rather than pointers
            // into the vector) stay valid when the extractor is copied.
            unordered_map<string, size_t> _wordIndex;
            vector<KeyWordInfo> _wordinfos;
            // Sum of the frequencies of all accepted dictionary lines.
            size_t _totalFreq;
        protected:
            bool _isInited;
            bool _getInitFlag()const{return _isInited;}
            bool _setInitFlag(bool flag){return _isInited = flag;}
        public:
            // True once init() has succeeded.
            operator bool() const {return _getInitFlag();}
        public:
            KeywordExtractor(): _totalFreq(0), _isInited(false) {}
            explicit KeywordExtractor(const string& dictPath): _totalFreq(0), _isInited(false)
            {
                _setInitFlag(init(dictPath));
            }
            ~KeywordExtractor(){}
        public:
            // Loads the dictionary at dictPath and initialises the segmenter
            // with the same file.
            //
            // Every usable line has exactly three space separated fields; the
            // first is the word and the second its (positive) frequency.
            // Malformed lines are logged and skipped.
            //
            // Returns false when the file cannot be opened or when the
            // segmenter fails to initialise, true otherwise.
            bool init(const string& dictPath)
            {
                ifstream ifs(dictPath.c_str());
                if(!ifs)
                {
                    LogError("open %s failed.", dictPath.c_str());
                    return false;
                }

                // Drop whatever an earlier init() loaded so that entries are
                // not duplicated and the index never refers to stale data.
                _wordIndex.clear();
                _wordinfos.clear();
                _totalFreq = 0;

                int tfreq;
                string line;
                vector<string> buf;
                KeyWordInfo keywordInfo;
                keywordInfo.idf = 0.0; // real value is filled in below
                for(uint lineno = 0; getline(ifs, line); lineno++)
                {
                    buf.clear();
                    if(line.empty())
                    {
                        LogError("line[%d] empty. skipped.", lineno);
                        continue;
                    }
                    if(!split(line, buf, " ") || buf.size() != 3)
                    {
                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                        continue;
                    }
                    keywordInfo.word = buf[0];
                    tfreq = atoi(buf[1].c_str());
                    if(tfreq <= 0)
                    {
                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                        continue;
                    }
                    keywordInfo.freq = tfreq;
                    _totalFreq += tfreq;
                    _wordinfos.push_back(keywordInfo);
                }

                // calculate idf & make index.
                for(size_t i = 0; i < _wordinfos.size(); i++)
                {
                    KeyWordInfo& info = _wordinfos[i];
                    if(info.freq == 0)
                    {
                        LogFatal("freq value is not positive.");
                        return false;
                    }
                    // freq <= _totalFreq, so the weight is never negative and
                    // rarer words weigh more. (The former -log(freq) produced
                    // non-positive weights, which made a word rank *lower*
                    // the more often it occurred in the text.)
                    info.idf = log(double(_totalFreq) / double(info.freq));
                    _wordIndex[info.word] = i;
                }
                return _setInitFlag(_segment.init(dictPath));
            }
        public:
            // Writes the (at most) topN best scoring dictionary words of str
            // into keywords, best first. keywords is cleared first. Words that
            // are not in the dictionary are ignored.
            //
            // Returns false when the segmenter fails to cut str.
            // Precondition: init() succeeded (checked with assert).
            bool extract(const string& str, vector<string>& keywords, uint topN) const
            {
                assert(_getInitFlag());

                vector<string> words;
                if(!_segment.cut(str, words))
                {
                    LogError("segment cut(%s) failed.", str.c_str());
                    return false;
                }

                // term frequency of every distinct word of the text
                unordered_map<string, double> wordmap;
                for(size_t i = 0; i < words.size(); i++)
                {
                    wordmap[ words[i] ] += 1.0;
                }

                // score = tf * idf, only for words known to the dictionary
                vector<pair<string, double> > scored;
                scored.reserve(wordmap.size());
                for(unordered_map<string, double>::const_iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr)
                {
                    unordered_map<string, size_t>::const_iterator cit = _wordIndex.find(itr->first);
                    if(cit != _wordIndex.end())
                    {
                        scored.push_back(make_pair(itr->first, itr->second * _wordinfos[cit->second].idf));
                    }
                }

                // topN is uint and size() is size_t: name the type explicitly,
                // otherwise template deduction fails where the two differ.
                const size_t keep = min<size_t>(topN, scored.size());
                partial_sort(scored.begin(), scored.begin() + keep, scored.end(), _cmp);

                keywords.clear();
                for(size_t i = 0; i < keep; i++)
                {
                    keywords.push_back(scored[i].first);
                }
                return true;
            }
        private:
            // Orders by descending score. Equal scores are ordered by word so
            // that the result does not depend on hash-map iteration order.
            // The scores are compared as double; comparing them as unsigned
            // integers would truncate them.
            static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
            {
                if(lhs.second != rhs.second)
                {
                    return lhs.second > rhs.second;
                }
                return lhs.first < rhs.first;
            }
    };
}

#endif

View File

@ -22,9 +22,9 @@ namespace Limonp
for(int i = 0; i < argc; i++)
{
if(strStartsWith(argv[i], "-"))
if(startsWith(argv[i], "-"))
{
if(i + 1 < argc && !strStartsWith(argv[i + 1], "-"))
if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
{
_mpss[argv[i]] = argv[i+1];
i++;

View File

@ -18,7 +18,17 @@ namespace Limonp
class Config
{
public:
bool loadFile(const char * const filePath)
// Builds the configuration by loading filePath immediately.
// The bool returned by _loadFile is discarded here, so the constructor itself
// gives no indication of whether loading succeeded.
Config(const char * const filePath)
{
_loadFile(filePath);
}
public:
// Converts to true when _map holds at least one entry.
// NOTE(review): presumably _map is filled by _loadFile, which would make this a
// "something was loaded" check; a file that parses cleanly but contains no
// key=value line would then also convert to false - confirm that is intended.
operator bool ()
{
return !_map.empty();
}
private:
bool _loadFile(const char * const filePath)
{
ifstream ifs(filePath);
if(!ifs)
@ -33,12 +43,12 @@ namespace Limonp
{
lineno ++;
trim(line);
if(line.empty() || strStartsWith(line, "#"))
if(line.empty() || startsWith(line, "#"))
{
continue;
}
vecBuf.clear();
if(!splitStr(line, vecBuf, "=") || 2 != vecBuf.size())
if(!split(line, vecBuf, "=") || 2 != vecBuf.size())
{
LogFatal("line[%d:%s] is illegal.", lineno, line.c_str());
return false;
@ -57,6 +67,7 @@ namespace Limonp
ifs.close();
return true;
}
public:
bool get(const string& key, string& value) const
{
map<string, string>::const_iterator it = _map.find(key);
@ -73,7 +84,7 @@ namespace Limonp
friend ostream& operator << (ostream& os, const Config& config);
};
ostream& operator << (ostream& os, const Config& config)
inline ostream& operator << (ostream& os, const Config& config)
{
return os << config._map;
}

View File

@ -11,6 +11,7 @@
#include <string>
#include <stdio.h>
#include <stdarg.h>
#include <cassert>
#include "io_functs.hpp"
#include "str_functs.hpp"
@ -23,6 +24,7 @@
#define LogFatal(fmt, ...) Logger::LoggingF(LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
namespace Limonp
{
using namespace std;
@ -36,16 +38,11 @@ namespace Limonp
public:
static bool Logging(uint level, const string& msg, const char* fileName, int lineNo)
{
if(level > LL_FATAL)
{
cerr<<"level's value is out of range"<<endl;
return false;
}
assert(level <= LL_FATAL);
char buf[CSTR_BUFFER_SIZE];
time_t timeNow;
time(&timeNow);
size_t ret = strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow));
if(0 == ret)
if(!strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)))
{
fprintf(stderr, "stftime failed.\n");
return false;
@ -55,6 +52,9 @@ namespace Limonp
}
static bool LoggingF(uint level, const char* fileName, int lineNo, const string& fmt, ...)
{
#ifdef LOGGER_LEVEL
if(level < LOGGER_LEVEL) return true;
#endif
int size = 256;
string msg;
va_list ap;

View File

@ -72,28 +72,6 @@ namespace Limonp
}
}
//inline bool joinStr(const vector<string>& src, string& dest, const string& connectorStr)
//{
// if(src.empty())
// {
// return false;
// }
// for(uint i = 0; i < src.size() - 1; i++)
// {
// dest += src[i];
// dest += connectorStr;
// }
// dest += src[src.size() - 1];
// return true;
//}
//inline string joinStr(const vector<string>& source, const string& connector)
//{
// string res;
// joinStr(source, res, connector);
// return res;
//}
template<class T>
void join(T begin, T end, string& res, const string& connector)
{
@ -122,7 +100,7 @@ namespace Limonp
inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
inline bool split(const string& src, vector<string>& res, const string& pattern)
{
if(src.empty())
{
@ -181,20 +159,9 @@ namespace Limonp
}
inline uint16_t twocharToUint16(char high, char low)
{
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
}
inline pair<char, char> uint16ToChar2(uint16_t in)
{
pair<char, char> res;
res.first = (in>>8) & 0x00ff; //high
res.second = (in) & 0x00ff; //low
return res;
}
inline bool strStartsWith(const string& str, const string& prefix)
inline bool startsWith(const string& str, const string& prefix)
{
//return str.substr(0, prefix.size()) == prefix;
if(prefix.length() > str.length())
@ -204,7 +171,7 @@ namespace Limonp
return 0 == str.compare(0, prefix.length(), prefix);
}
inline bool strEndsWith(const string& str, const string& suffix)
inline bool endsWith(const string& str, const string& suffix)
{
if(suffix.length() > str.length())
{
@ -218,13 +185,19 @@ namespace Limonp
return str.find(ch) != string::npos;
}
// Packs two bytes into one 16-bit value: `high` becomes the upper 8 bits and
// `low` the lower 8 bits. Each char is masked to 8 bits first, so a negative
// (sign-extended) char cannot spill into the other half.
inline uint16_t twocharToUint16(char high, char low)
{
    const uint16_t upperByte = uint16_t(high) & 0x00ff;
    const uint16_t lowerByte = uint16_t(low) & 0x00ff;
    return (upperByte << 8) | lowerByte;
}
inline bool utf8ToUnicode(const char * const str, uint len, vector<uint16_t>& vec)
{
char ch1, ch2;
if(!str)
{
return false;
}
char ch1, ch2;
uint16_t tmp;
vec.clear();
for(uint i = 0;i < len;)
{
@ -237,14 +210,16 @@ namespace Limonp
{
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
vec.push_back(twocharToUint16(ch1, ch2));
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 2;
}
else if((unsigned char)str[i] <= 0xef && i + 2 < len)
{
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
vec.push_back(twocharToUint16(ch1, ch2));
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 3;
}
else
@ -310,7 +285,8 @@ namespace Limonp
{
if(i + 1 < len) //&& (str[i+1] & 0x80))
{
vec.push_back(twocharToUint16(str[i], str[i + 1]));
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
vec.push_back(tmp);
i += 2;
}
else
@ -321,11 +297,20 @@ namespace Limonp
}
return true;
}
// Convenience overload for std::string input: forwards the string's character
// buffer and its size to the (pointer, length, vector) gbkTrans overload and
// returns that call's result unchanged.
inline bool gbkTrans(const string& str, vector<uint16_t>& vec)
{
return gbkTrans(str.c_str(), str.size(), vec);
}
//inline pair<char, char> uint16ToChar2(uint16_t in)
//{
// pair<char, char> res;
// res.first = (in>>8) & 0x00ff; //high
// res.second = (in) & 0x00ff; //low
// return res;
//}
inline bool gbkTrans(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
if(begin >= end)
@ -333,18 +318,21 @@ namespace Limonp
return false;
}
res.clear();
pair<char, char> pa;
//pair<char, char> pa;
char first, second;
while(begin != end)
{
pa = uint16ToChar2(*begin);
if(pa.first & 0x80)
//pa = uint16ToChar2(*begin);
first = ((*begin)>>8) & 0x00ff;
second = (*begin) & 0x00ff;
if(first & 0x80)
{
res += pa.first;
res += pa.second;
res += first;
res += second;
}
else
{
res += pa.second;
res += second;
}
begin++;
}

View File

@ -32,7 +32,7 @@ namespace CppJieba
class MPSegment: public SegmentBase
{
private:
protected:
Trie* _trie;
public:

View File

@ -19,9 +19,10 @@ namespace CppJieba
bool _isInited;
bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;};
public:
operator bool(){return _getInitFlag();};
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const
{

View File

@ -1,32 +0,0 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
bool init(){return _segment.init();};
bool dispose(){return _segment.dispose();};
public:
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
return true;
}
};
}
#endif

View File

@ -130,26 +130,13 @@ namespace CppJieba
}
bool loadDict(const char * const filePath)
{
if(!_getInitFlag())
{
LogError("not initted.");
return false;
}
if(!checkFileExist(filePath))
{
LogError("cann't find fiel[%s].",filePath);
return false;
}
bool res = false;
res = _trieInsert(filePath);
if(!res)
assert(_getInitFlag());
if(!_trieInsert(filePath))
{
LogError("_trieInsert failed.");
return false;
}
res = _countWeight();
if(!res)
if(!_countWeight())
{
LogError("_countWeight failed.");
return false;
@ -339,15 +326,20 @@ namespace CppJieba
private:
bool _trieInsert(const char * const filePath)
{
ifstream ifile(filePath);
ifstream ifs(filePath);
if(!ifs)
{
LogError("open %s failed.", filePath);
return false;
}
string line;
vector<string> vecBuf;
TrieNodeInfo nodeInfo;
while(getline(ifile, line))
while(getline(ifs, line))
{
vecBuf.clear();
splitStr(line, vecBuf, " ");
split(line, vecBuf, " ");
if(3 < vecBuf.size())
{
LogError("line[%s] illegal.", line.c_str());

View File

@ -4,7 +4,7 @@
#include <ctype.h>
#include <string.h>
#include "Limonp/Config.hpp"
#include "Husky/ServerFrame.hpp"
#include "Husky/HuskyServer.hpp"
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "MixSegment.hpp"
@ -38,8 +38,8 @@ bool run(int argc, char** argv)
{
return false;
}
Config conf;
if(!conf.loadFile(argv[1]))
Config conf(argv[1]);
if(!conf)
{
return false;
}
@ -90,7 +90,7 @@ bool run(int argc, char** argv)
}
ReqHandler reqHandler(dictPath, modelPath);
ServerFrame sf(port, threadNum, &reqHandler);
HuskyServer sf(port, threadNum, &reqHandler);
return sf.init() && sf.run();
}

View File

@ -3,9 +3,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib)
SET(GTEST_ROOT_DIR gtest-1.6.0)
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
ADD_EXECUTABLE(test.run gtest_main.cc TSegmentBase.cpp TMixSegment.cpp TMPSegment.cpp THMMSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp TTrieManager.cpp)
FILE(GLOB SRCFILES *.cpp)
ADD_EXECUTABLE(test.run ${SRCFILES})
TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,40 @@
#include "src/KeywordExtractor.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Asks for the top 2 keywords of a short sentence and compares them, in order,
// with the expected list.
TEST(KeywordExtractorTest, Test1)
{
    const char* const expectedWords[] = {"北京邮电大学", "来自"};
    const size_t expectedCount = sizeof(expectedWords) / sizeof(expectedWords[0]);
    const vector<string> expected(expectedWords, expectedWords + expectedCount);
    const char* const sentence = "我来自北京邮电大学。。。 学号 123456";

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    ASSERT_TRUE(extractor);

    vector<string> words;
    ASSERT_TRUE(extractor.extract(sentence, words, 2));
    ASSERT_EQ(words, expected);
}
// Asks for more keywords (9) than the sentence can provide and compares the
// result, in order, with the expected list.
// NOTE(review): the expected list ends with three empty strings. Nothing in
// this test shows why extract() would return empty keywords - confirm that
// these values are intended and not an artifact that happened to be recorded.
TEST(KeywordExtractorTest, Test2)
{
    const char* const expectedWords[] = {"北京邮电大学", "来自", "", "", ""};
    const size_t expectedCount = sizeof(expectedWords) / sizeof(expectedWords[0]);
    const vector<string> expected(expectedWords, expectedWords + expectedCount);
    const char* const sentence = "我来自北京邮电大学。。。 学号 123456";

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    ASSERT_TRUE(extractor);

    vector<string> words;
    ASSERT_TRUE(extractor.extract(sentence, words, 9));
    ASSERT_EQ(words, expected);
}
// Extracts the top 5 keywords of a whole text file and compares their
// serialised form with the expected string.
TEST(KeywordExtractorTest, Test3)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));

    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
    // Report a dictionary that failed to load as a test failure; without this
    // check the first sign of trouble is the assert inside extract().
    ASSERT_TRUE(extractor);

    vector<string> keywords;
    // A failed extraction must fail the test instead of silently comparing an
    // empty result below.
    ASSERT_TRUE(extractor.extract(str, keywords, 5));

    string res;
    res << keywords;
    ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res);
}