mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
abandon ISegment
This commit is contained in:
parent
6d69363145
commit
14974d51b4
@ -6,7 +6,6 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include "limonp/Logger.hpp"
|
#include "limonp/Logger.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
#include "ISegment.hpp"
|
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "TransCode.hpp"
|
#include "TransCode.hpp"
|
||||||
|
|
||||||
@ -22,13 +21,24 @@ class FullSegment: public SegmentBase {
|
|||||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||||
assert(dictTrie_);
|
assert(dictTrie_);
|
||||||
}
|
}
|
||||||
virtual ~FullSegment() {
|
~FullSegment() {
|
||||||
if(isNeedDestroy_) {
|
if(isNeedDestroy_) {
|
||||||
delete dictTrie_;
|
delete dictTrie_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
using SegmentBase::cut;
|
void cut(const string& sentence,
|
||||||
virtual void cut(Unicode::const_iterator begin,
|
vector<string>& words) const {
|
||||||
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
|
PreFilter::Range range;
|
||||||
|
vector<Unicode> uwords;
|
||||||
|
uwords.reserve(sentence.size());
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
|
range = pre_filter.Next();
|
||||||
|
cut(range.begin, range.end, uwords);
|
||||||
|
}
|
||||||
|
TransCode::encode(uwords, words);
|
||||||
|
}
|
||||||
|
void cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Unicode>& res) const {
|
vector<Unicode>& res) const {
|
||||||
//result of searching in trie tree
|
//result of searching in trie tree
|
||||||
|
@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase {
|
|||||||
HMMSegment(const HMMModel* model)
|
HMMSegment(const HMMModel* model)
|
||||||
: model_(model), isNeedDestroy_(false) {
|
: model_(model), isNeedDestroy_(false) {
|
||||||
}
|
}
|
||||||
virtual ~HMMSegment() {
|
~HMMSegment() {
|
||||||
if(isNeedDestroy_) {
|
if(isNeedDestroy_) {
|
||||||
delete model_;
|
delete model_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using SegmentBase::cut;
|
void cut(const string& sentence,
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
|
vector<string>& words) const {
|
||||||
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
|
PreFilter::Range range;
|
||||||
|
vector<Unicode> uwords;
|
||||||
|
uwords.reserve(sentence.size());
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
|
range = pre_filter.Next();
|
||||||
|
cut(range.begin, range.end, uwords);
|
||||||
|
}
|
||||||
|
TransCode::encode(uwords, words);
|
||||||
|
}
|
||||||
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right = begin;
|
Unicode::const_iterator right = begin;
|
||||||
while(right != end) {
|
while(right != end) {
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
#ifndef CPPJIEBA_ISEGMENT_H
|
|
||||||
#define CPPJIEBA_ISEGMENT_H
|
|
||||||
|
|
||||||
namespace CppJieba {
|
|
||||||
|
|
||||||
class ISegment {
|
|
||||||
public:
|
|
||||||
virtual ~ISegment() {
|
|
||||||
}
|
|
||||||
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace CppJieba
|
|
||||||
|
|
||||||
#endif // CPPJIEBA_ISEGMENT_H
|
|
@ -44,10 +44,7 @@ class KeywordExtractor {
|
|||||||
|
|
||||||
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
|
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if(!segment_.cut(str, words)) {
|
segment_.cut(str, words);
|
||||||
LogError("segment cut(%s) failed.", str.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
|
|
||||||
class LevelSegment: public ISegment {
|
class LevelSegment: public SegmentBase{
|
||||||
public:
|
public:
|
||||||
LevelSegment(const string& dictPath,
|
LevelSegment(const string& dictPath,
|
||||||
const string& userDictPath = "")
|
const string& userDictPath = "")
|
||||||
@ -15,7 +15,7 @@ class LevelSegment: public ISegment {
|
|||||||
LevelSegment(const DictTrie* dictTrie)
|
LevelSegment(const DictTrie* dictTrie)
|
||||||
: mpSeg_(dictTrie) {
|
: mpSeg_(dictTrie) {
|
||||||
}
|
}
|
||||||
virtual ~LevelSegment() {
|
~LevelSegment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(Unicode::const_iterator begin,
|
void cut(Unicode::const_iterator begin,
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include "limonp/Logger.hpp"
|
#include "limonp/Logger.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
#include "ISegment.hpp"
|
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
@ -22,50 +21,38 @@ class MPSegment: public SegmentBase {
|
|||||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||||
assert(dictTrie_);
|
assert(dictTrie_);
|
||||||
}
|
}
|
||||||
virtual ~MPSegment() {
|
~MPSegment() {
|
||||||
if(isNeedDestroy_) {
|
if(isNeedDestroy_) {
|
||||||
delete dictTrie_;
|
delete dictTrie_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using SegmentBase::cut;
|
void cut(const string& sentence,
|
||||||
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
|
|
||||||
vector<Dag> dags;
|
|
||||||
|
|
||||||
dictTrie_->find(begin, end, dags);
|
|
||||||
|
|
||||||
CalcDP(dags);
|
|
||||||
|
|
||||||
Cut(dags, words);
|
|
||||||
}
|
|
||||||
bool cut(const string& sentence,
|
|
||||||
vector<string>& words,
|
vector<string>& words,
|
||||||
size_t max_word_len) const {
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||||
Unicode unicode;
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
if (!TransCode::decode(sentence, unicode)) {
|
PreFilter::Range range;
|
||||||
return false;
|
vector<Unicode> uwords;
|
||||||
|
uwords.reserve(sentence.size());
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
|
range = pre_filter.Next();
|
||||||
|
cut(range.begin, range.end, uwords, max_word_len);
|
||||||
}
|
}
|
||||||
vector<Unicode> unicodeWords;
|
TransCode::encode(uwords, words);
|
||||||
cut(unicode.begin(), unicode.end(),
|
|
||||||
unicodeWords, max_word_len);
|
|
||||||
words.resize(unicodeWords.size());
|
|
||||||
for (size_t i = 0; i < words.size(); i++) {
|
|
||||||
TransCode::encode(unicodeWords[i], words[i]);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin,
|
void cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Unicode>& words,
|
vector<Unicode>& words,
|
||||||
size_t max_word_len) const {
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||||
vector<Dag> dags;
|
vector<Dag> dags;
|
||||||
dictTrie_->find(begin,
|
dictTrie_->find(begin,
|
||||||
end,
|
end,
|
||||||
dags,
|
dags,
|
||||||
max_word_len);
|
max_word_len);
|
||||||
CalcDP(dags);
|
CalcDP(dags);
|
||||||
Cut(dags, words);
|
CutByDag(dags, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictTrie* getDictTrie() const {
|
const DictTrie* getDictTrie() const {
|
||||||
return dictTrie_;
|
return dictTrie_;
|
||||||
}
|
}
|
||||||
@ -103,7 +90,7 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void Cut(const vector<Dag>& dags,
|
void CutByDag(const vector<Dag>& dags,
|
||||||
vector<Unicode>& words) const {
|
vector<Unicode>& words) const {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while(i < dags.size()) {
|
while(i < dags.size()) {
|
||||||
|
@ -18,10 +18,23 @@ class MixSegment: public SegmentBase {
|
|||||||
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||||
: mpSeg_(dictTrie), hmmSeg_(model) {
|
: mpSeg_(dictTrie), hmmSeg_(model) {
|
||||||
}
|
}
|
||||||
virtual ~MixSegment() {
|
~MixSegment() {
|
||||||
}
|
}
|
||||||
using SegmentBase::cut;
|
|
||||||
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(const string& sentence,
|
||||||
|
vector<string>& words) const {
|
||||||
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
|
PreFilter::Range range;
|
||||||
|
vector<Unicode> uwords;
|
||||||
|
uwords.reserve(sentence.size());
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
|
range = pre_filter.Next();
|
||||||
|
cut(range.begin, range.end, uwords);
|
||||||
|
}
|
||||||
|
TransCode::encode(uwords, words);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
words.reserve(end - begin);
|
words.reserve(end - begin);
|
||||||
mpSeg_.cut(begin, end, words);
|
mpSeg_.cut(begin, end, words);
|
||||||
|
@ -27,10 +27,7 @@ class PosTagger {
|
|||||||
|
|
||||||
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
||||||
vector<string> cutRes;
|
vector<string> cutRes;
|
||||||
if (!segment_.cut(src, cutRes)) {
|
segment_.cut(src, cutRes);
|
||||||
LogError("mixSegment_ cut failed");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const DictUnit *tmp = NULL;
|
const DictUnit *tmp = NULL;
|
||||||
Unicode unico;
|
Unicode unico;
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include "limonp/Logger.hpp"
|
#include "limonp/Logger.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
#include "ISegment.hpp"
|
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "FullSegment.hpp"
|
#include "FullSegment.hpp"
|
||||||
#include "MixSegment.hpp"
|
#include "MixSegment.hpp"
|
||||||
@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase {
|
|||||||
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
|
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
|
||||||
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
|
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
|
||||||
}
|
}
|
||||||
virtual ~QuerySegment() {
|
~QuerySegment() {
|
||||||
|
}
|
||||||
|
void cut(const string& sentence,
|
||||||
|
vector<string>& words) const {
|
||||||
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
|
PreFilter::Range range;
|
||||||
|
vector<Unicode> uwords;
|
||||||
|
uwords.reserve(sentence.size());
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
|
range = pre_filter.Next();
|
||||||
|
cut(range.begin, range.end, uwords);
|
||||||
|
}
|
||||||
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
using SegmentBase::cut;
|
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
//use mix cut first
|
//use mix cut first
|
||||||
vector<Unicode> mixRes;
|
vector<Unicode> mixRes;
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
|
|
||||||
#include "limonp/Logger.hpp"
|
#include "limonp/Logger.hpp"
|
||||||
#include "PreFilter.hpp"
|
#include "PreFilter.hpp"
|
||||||
#include "ISegment.hpp"
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
|
|
||||||
@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
|
|||||||
|
|
||||||
using namespace limonp;
|
using namespace limonp;
|
||||||
|
|
||||||
class SegmentBase: public ISegment {
|
class SegmentBase {
|
||||||
public:
|
public:
|
||||||
SegmentBase() {
|
SegmentBase() {
|
||||||
LoadSpecialSymbols();
|
LoadSpecialSymbols();
|
||||||
}
|
}
|
||||||
virtual ~SegmentBase() {
|
~SegmentBase() {
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
public:
|
public:
|
||||||
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
||||||
virtual bool cut(const string& sentence, vector<string>& words) const {
|
bool cut(const string& sentence, vector<string>& words) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
vector<Unicode> uwords;
|
vector<Unicode> uwords;
|
||||||
@ -32,14 +32,12 @@ class SegmentBase: public ISegment {
|
|||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords);
|
cut(range.begin, range.end, uwords);
|
||||||
}
|
}
|
||||||
words.resize(uwords.size());
|
TransCode::encode(uwords, words);
|
||||||
for (size_t i = 0; i < uwords.size(); i++) {
|
|
||||||
TransCode::encode(uwords[i], words[i]);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
private:
|
protected:
|
||||||
void LoadSpecialSymbols() {
|
void LoadSpecialSymbols() {
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
for(size_t i = 0; i < size; i ++) {
|
for(size_t i = 0; i < size; i ++) {
|
||||||
@ -47,7 +45,6 @@ class SegmentBase: public ISegment {
|
|||||||
}
|
}
|
||||||
assert(symbols_.size());
|
assert(symbols_.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
unordered_set<Rune> symbols_;
|
unordered_set<Rune> symbols_;
|
||||||
}; // class SegmentBase
|
}; // class SegmentBase
|
||||||
|
|
||||||
|
@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) {
|
|||||||
const char* str2 = "B超 T恤";
|
const char* str2 = "B超 T恤";
|
||||||
const char* res2[] = {"B超"," ", "T恤"};
|
const char* res2[] = {"B超"," ", "T恤"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
ASSERT_TRUE(segment.cut(str2, words));
|
segment.cut(str2, words);
|
||||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) {
|
|||||||
MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
|
MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
|
||||||
const char* str = "令狐冲是云计算方面的专家";
|
const char* str = "令狐冲是云计算方面的专家";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||||
|
|
||||||
@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) {
|
|||||||
{
|
{
|
||||||
const char* str = "令狐冲是云计算方面的专家";
|
const char* str = "令狐冲是云计算方面的专家";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
res << words;
|
res << words;
|
||||||
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
||||||
@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) {
|
|||||||
{
|
{
|
||||||
const char* str = "IBM,3.14";
|
const char* str = "IBM,3.14";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
res << words;
|
res << words;
|
||||||
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
|
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
|
||||||
@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) {
|
|||||||
{
|
{
|
||||||
const char* str = "令狐冲是云计算方面的专家";
|
const char* str = "令狐冲是云计算方面的专家";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
res << words;
|
res << words;
|
||||||
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
||||||
@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) {
|
|||||||
{
|
{
|
||||||
const char* str = "IBM,3.14";
|
const char* str = "IBM,3.14";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
string res;
|
string res;
|
||||||
res << words;
|
res << words;
|
||||||
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
|
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
|
||||||
@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) {
|
|||||||
MPSegment segment("../dict/jieba.dict.utf8");;
|
MPSegment segment("../dict/jieba.dict.utf8");;
|
||||||
string s;
|
string s;
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
|
segment.cut("我来自北京邮电大学。", words);
|
||||||
ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
|
ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut("B超 T恤", words));
|
segment.cut("B超 T恤", words);
|
||||||
ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
|
ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut("南京市长江大桥", words));
|
segment.cut("南京市长江大桥", words);
|
||||||
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
|
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
|
||||||
|
|
||||||
// MaxWordLen
|
// MaxWordLen
|
||||||
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
|
segment.cut("南京市长江大桥", words, 3);
|
||||||
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
|
segment.cut("南京市长江大桥", words, 0);
|
||||||
ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
|
ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) {
|
|||||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) {
|
|||||||
const char* str = "IBM,1.2,123";
|
const char* str = "IBM,1.2,123";
|
||||||
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
|
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -160,12 +160,12 @@ TEST(FullSegment, Test1) {
|
|||||||
vector<string> words;
|
vector<string> words;
|
||||||
string s;
|
string s;
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
|
segment.cut("我来自北京邮电大学", words);
|
||||||
s << words;
|
s << words;
|
||||||
ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
|
ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
|
||||||
|
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut("上市公司CEO", words));
|
segment.cut("上市公司CEO", words);
|
||||||
s << words;
|
s << words;
|
||||||
ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
|
ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
|
||||||
}
|
}
|
||||||
@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) {
|
|||||||
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
|
|
||||||
string s1, s2;
|
string s1, s2;
|
||||||
s1 << words;
|
s1 << words;
|
||||||
@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) {
|
|||||||
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
|
|
||||||
string s1, s2;
|
string s1, s2;
|
||||||
s1 << words;
|
s1 << words;
|
||||||
@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) {
|
|||||||
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
|
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
|
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
segment.cut(str, words);
|
||||||
|
|
||||||
string s1, s2;
|
string s1, s2;
|
||||||
s1 << words;
|
s1 << words;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user