This commit is contained in:
aholic 2014-01-27 01:54:01 +08:00
commit 8e2c726a8c
11 changed files with 196 additions and 87 deletions

View File

@ -1,13 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2013 Yanyi Wu <wuyanyi09@gmail.com>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

20
LICENSE Normal file
View File

@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2013 Yanyi Wu
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -10,6 +10,11 @@
## 安装与使用
### 依赖
* g++ (version >= 4.6);
* cmake (version >= 2.8);
### 下载和安装
```sh

29
dict/README.md Normal file
View File

@ -0,0 +1,29 @@
# CppJieba字典
文件后缀名代表的是词典的编码方式。
比如 filename.utf8 是 utf8 编码,filename.gbk 是 gbk 编码方式。
## 分词
### jieba.dict.utf8/gbk
作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
### hmm_model.utf8/gbk
作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
## 关键词抽取
### idf.utf8
IDF(Inverse Document Frequency)
在 KeywordExtractor 中使用的是经典的 TF-IDF 算法,所以需要这么一个词典提供 IDF 信息。

View File

@ -1,43 +1,36 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
#include "MPSegment.hpp"
#include "MixSegment.hpp"
#include <cmath>
#include <unordered_set>
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
namespace CppJieba
{
using namespace Limonp;
//struct KeyWordInfo
//{
// string word;
// double tfidf;
//};
/*utf8*/
const char * BLACK_LIST[] = {"我们", "他们"};
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
//{
// return os << keyword.word << "," << keyword.idf;
//}
class KeywordExtractor
class KeywordExtractor: public InitOnOff
{
private:
MPSegment _segment;
MixSegment _segment;
private:
unordered_map<string, double> _idfMap;
protected:
bool _isInited;
bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;};
public:
operator bool(){return _getInitFlag();};
double _idfAverage;
unordered_set<string> _blackSet;
public:
KeywordExtractor(){_setInitFlag(false);};
explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
{
_setInitFlag(init(dictPath, hmmFilePath, idfPath));
};
~KeywordExtractor(){};
public:
bool init(const string& dictPath, const string& idfPath)
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
{
ifstream ifs(idfPath.c_str());
if(!ifs)
@ -47,7 +40,10 @@ namespace CppJieba
}
string line ;
vector<string> buf;
for(uint lineno = 0; getline(ifs, line); lineno++)
double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(;getline(ifs, line); lineno++)
{
buf.clear();
if(line.empty())
@ -60,9 +56,22 @@ namespace CppJieba
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue;
}
_idfMap[buf[0]] = atof(buf[1].c_str());
}
return _setInitFlag(_segment.init(dictPath));
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
}
std::copy(
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
std::inserter(_blackSet, _blackSet.begin()));
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
};
public:
@ -90,30 +99,58 @@ namespace CppJieba
return false;
}
// filtering single word.
for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
{
if(_isSingleWord(*iter))
{
iter = words.erase(iter);
}
else
{
iter++;
}
}
unordered_map<string, double> wordmap;
for(uint i = 0; i < words.size(); i ++)
{
wordmap[ words[i] ] += 1.0;
}
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
{
if(_blackSet.end() != _blackSet.find(itr->first))
{
itr = wordmap.erase(itr);
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end())
{
itr->second *= cit->second;
itr ++;
}
else
{
itr = wordmap.erase(itr);
itr->second *= _idfAverage;
}
itr ++;
}
keywords.resize(MIN(topN, wordmap.size()));
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
return true;
}
private:
bool _isSingleWord(const string& str) const
{
Unicode unicode;
TransCode::decode(str, unicode);
if(unicode.size() == 1)
return true;
return false;
}
private:
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)

View File

@ -1 +1,3 @@
INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp std_outbound.hpp DESTINATION include/CppJieba/Limonp)
INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp
str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp
std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp)

21
src/Limonp/InitOnOff.hpp Normal file
View File

@ -0,0 +1,21 @@
#ifndef LIMONP_INITONOFF_H
#define LIMONP_INITONOFF_H
namespace Limonp
{
/**
 * Small mixin that tracks whether an object has been initialized.
 *
 * Derived classes record the outcome of their own initialization with
 * _setInitFlag(), and client code checks it by converting the object to
 * bool (e.g. `if (!obj) { ... }`).
 */
class InitOnOff
{
    public:
        /// A freshly constructed object is always "not initialized".
        InitOnOff() : _isInited(false) {}
        ~InitOnOff() {}
    protected:
        /// Current initialization state; kept protected so that derived
        /// classes that access it directly continue to compile.
        bool _isInited;
        /// Returns the stored initialization flag.
        bool _getInitFlag() const { return _isInited; }
        /// Stores `flag` and returns the value that was stored, so callers
        /// can write `return _setInitFlag(doInit());`.
        bool _setInitFlag(bool flag) { return _isInited = flag; }
    public:
        /// True when the object has been marked as initialized.
        /// Const-qualified so that const objects and const references can
        /// be checked as well. Deliberately left non-explicit: instances are
        /// passed directly where a bool is expected (e.g. test assertions).
        operator bool() const { return _getInitFlag(); }
};
}
#endif

View File

@ -100,7 +100,7 @@ namespace Limonp
inline bool split(const string& src, vector<string>& res, const string& pattern)
inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
{
if(src.empty())
{
@ -110,20 +110,28 @@ namespace Limonp
size_t start = 0;
size_t end = 0;
while(start < src.size())
size_t cnt = 0;
while(start < src.size() && res.size() < len)
{
end = src.find_first_of(pattern, start);
if(string::npos == end)
{
res.push_back(src.substr(start));
if(cnt >= offset)
{
res.push_back(src.substr(start));
}
return true;
}
res.push_back(src.substr(start, end - start));
if(end == src.size() - 1)
//if(end == src.size() - 1)
//{
// res.push_back("");
// return true;
//}
if(cnt >= offset)
{
res.push_back("");
break;
res.push_back(src.substr(start, end - start));
}
cnt ++;
start = end + 1;
}
return true;
@ -158,12 +166,8 @@ namespace Limonp
return ltrim(rtrim(s));
}
inline bool startsWith(const string& str, const string& prefix)
{
//return str.substr(0, prefix.size()) == prefix;
if(prefix.length() > str.length())
{
return false;

View File

@ -3,6 +3,7 @@
#include "TransCode.hpp"
#include "Limonp/logger.hpp"
#include "Limonp/InitOnOff.hpp"
#include "ISegment.hpp"
#include <cassert>
@ -10,17 +11,11 @@
namespace CppJieba
{
using namespace Limonp;
class SegmentBase: public ISegment
class SegmentBase: public ISegment, public InitOnOff
{
public:
SegmentBase(){_setInitFlag(false);};
SegmentBase(){};
virtual ~SegmentBase(){};
protected:
bool _isInited;
bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;};
public:
operator bool(){return _getInitFlag();};
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;

View File

@ -3,25 +3,25 @@
using namespace CppJieba;
const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
TEST(KeywordExtractorTest, Test1)
{
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"北京邮电大学", "来自"};
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"学号", "北京邮电大学"};
vector<string> words;
ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(str, words, 2));
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
TEST(KeywordExtractorTest, Test2)
{
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"北京邮电大学", "来自"};
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
vector<string> words;
ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(str, words, 9));
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
@ -31,26 +31,35 @@ TEST(KeywordExtractorTest, Test3)
ifstream ifs("../test/testdata/weicheng.utf8");
ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
vector<string> keywords;
string resStr;
vector<pair<string,double> > keywords2;
extractor.extract(str, keywords, 5);
extractor.extract(str, keywords2, 5);
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
resStr << keywords2;
ASSERT_EQ(res2, resStr);
}
TEST(KeywordExtractorTest, Test4)
{
ifstream ifs("../test/testdata/weicheng.utf8");
ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
vector<pair<string,double> > keywords;
extractor.extract(str, keywords, 5);
//print(keywords);
string res;
res << keywords;
ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
}
//TEST(KeywordExtractorTest, Test4)
//{
// ifstream ifs("../test/testdata/weicheng.utf8");
// ASSERT_TRUE(!!ifs);
// string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
// KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
// //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
// vector<pair<string,double> > keywords;
// extractor.extract(str, keywords, 5);
// //print(keywords);
// string res;
// res << keywords;
// print(keywords);
// print(__LINE__);
// exit(1);
// ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
//
//}

View File

@ -19,7 +19,7 @@ TEST(Md5Test, Test1)
{
ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
string tmp;
for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
{
md5File(DICT_FILE[i], tmp);
ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));