diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp
index 231aa2a..35ce7c7 100644
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@@ -66,21 +66,18 @@ class DictTrie {
   const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
     return trie_->find(begin, end);
   }
-  void find(Unicode::const_iterator begin,
-        Unicode::const_iterator end,
-        vector<struct Dag>& res) const {
-    trie_->find(begin, end, res);
-  }
-  void findByLimit(Unicode::const_iterator begin,
+
+  void find(Unicode::const_iterator begin,
         Unicode::const_iterator end,
-        size_t min_word_len,
-        size_t max_word_len,
-        vector<struct Dag>&res) const {
-    trie_->findByLimit(begin, end, min_word_len, max_word_len, res);
+        vector<struct Dag>&res,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    trie_->find(begin, end, res, max_word_len);
   }
+
   bool isUserDictSingleChineseWord(const Rune& word) const {
     return isIn(userDictSingleChineseWord_, word);
   }
+
   double getMinWeight() const {
     return minWeight_;
   }
diff --git a/src/ISegment.hpp b/src/ISegment.hpp
index f0704b6..38ef0a2 100644
--- a/src/ISegment.hpp
+++ b/src/ISegment.hpp
@@ -6,7 +6,7 @@ namespace CppJieba {
 class ISegment {
  public:
   virtual ~ISegment() {
-  };
+  }
   virtual bool cut(const string& str, vector<string>& res) const = 0;
 };
 
diff --git a/src/LevelSegment.hpp b/src/LevelSegment.hpp
new file mode 100644
index 0000000..912da35
--- /dev/null
+++ b/src/LevelSegment.hpp
@@ -0,0 +1,79 @@
+#ifndef CPPJIEBA_LEVELSEGMENT_H
+#define CPPJIEBA_LEVELSEGMENT_H
+
+#include "MPSegment.hpp"
+
+namespace CppJieba {
+
+class LevelSegment: public ISegment {
+ public:
+  LevelSegment(const string& dictPath,
+        const string& userDictPath = "")
+    : mpSeg_(dictPath, userDictPath) {
+    LogInfo("LevelSegment init");
+  }
+  LevelSegment(const DictTrie* dictTrie)
+    : mpSeg_(dictTrie) {
+  }
+  virtual ~LevelSegment() {
+  }
+
+  void cut(Unicode::const_iterator begin,
+        Unicode::const_iterator end,
+        vector<pair<Unicode, size_t> >& res) const {
+    vector<Unicode> words;
+    vector<Unicode> smallerWords;
+    words.reserve(end - begin);
+    mpSeg_.cut(begin, end, words);
+    smallerWords.reserve(words.size());
+    res.reserve(words.size());
+
+    size_t level = 0;
+    while (!words.empty()) {
+      smallerWords.clear();
+      for (size_t i = 0; i < words.size(); i++) {
+        if (words[i].size() >= 3) {
+          size_t len = words[i].size() - 1;
+          mpSeg_.cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
+        }
+        if (words[i].size() > 1) {
+          res.push_back(pair<Unicode, size_t>(words[i], level));
+        }
+      }
+
+      words.swap(smallerWords);
+      level++;
+    }
+  }
+
+  void cut(const string& sentence,
+        vector<pair<string, size_t> >& words) const {
+    Unicode unicode;
+    TransCode::decode(sentence, unicode);
+    vector<pair<Unicode, size_t> > unicodeWords;
+    cut(unicode.begin(), unicode.end(), unicodeWords);
+    words.resize(unicodeWords.size());
+    for (size_t i = 0; i < words.size(); i++) {
+      TransCode::encode(unicodeWords[i].first, words[i].first);
+      words[i].second = unicodeWords[i].second;
+    }
+  }
+
+  bool cut(const string& sentence,
+        vector<string>& res) const {
+    vector<pair<string, size_t> > words;
+    cut(sentence, words);
+    res.reserve(words.size());
+    for (size_t i = 0; i < words.size(); i++) {
+      res.push_back(words[i].first);
+    }
+    return true;
+  }
+
+ private:
+  MPSegment mpSeg_;
+}; // class LevelSegment
+
+} // namespace CppJieba
+
+#endif // CPPJIEBA_LEVELSEGMENT_H
diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp
index c0fd036..48a8ea0 100644
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@@ -44,14 +44,13 @@ class MPSegment: public SegmentBase {
   }
   void cut(Unicode::const_iterator begin,
            Unicode::const_iterator end,
-           size_t min_word_len,
-           size_t max_word_len,
-           vector<Unicode>&res) const {
+           vector<Unicode>& res,
+           size_t max_word_len) const {
     vector<Dag> dags;
-    dictTrie_->findByLimit(begin, end,
-          min_word_len,
-          max_word_len,
-          dags);
+    dictTrie_->find(begin,
+          end,
+          dags,
+          max_word_len);
     calcDP_(dags);
     cut_(dags, res);
   }
diff --git a/src/Trie.hpp b/src/Trie.hpp
index 80f330a..7311817 100644
--- a/src/Trie.hpp
+++ b/src/Trie.hpp
@@ -8,7 +8,6 @@ namespace CppJieba {
 
 using namespace std;
 
-const size_t MIN_WORD_LENGTH = 1;
 const size_t MAX_WORD_LENGTH = 512;
 
 struct DictUnit {
@@ -86,18 +85,12 @@ class Trie {
     return ptNode->ptValue;
   }
 
-  void findByLimit(Unicode::const_iterator begin,
+  void find(Unicode::const_iterator begin,
         Unicode::const_iterator end,
-        size_t min_word_len,
-        size_t max_word_len,
-        vector<struct Dag>&res) const {
+        vector<struct Dag>&res,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
     res.resize(end - begin);
 
-    // min_word_len start from 1;
-    if (min_word_len < 1) {
-      min_word_len = 1;
-    }
-
     const TrieNode *ptNode = NULL;
     TrieNode::NextMap::const_iterator citer;
     for (size_t i = 0; i < size_t(end - begin); i++) {
@@ -106,11 +99,8 @@
       res[i].rune = rune;
       assert(res[i].nexts.empty());
 
-      if (min_word_len <= 1) {
-        res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
-      }
+      res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
 
-      // min_word_len start from 1;
       for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) {
         if (ptNode->next == NULL) {
           break;
@@ -120,18 +110,13 @@
           break;
         }
         ptNode = citer->second;
-        if (NULL != ptNode->ptValue && (j - i + 1) >= min_word_len) {
+        if (NULL != ptNode->ptValue) {
           res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
         }
       }
     }
   }
 
-  void find(Unicode::const_iterator begin,
-        Unicode::const_iterator end,
-        vector<struct Dag>& res) const {
-    findByLimit(begin, end, MIN_WORD_LENGTH, MAX_WORD_LENGTH, res);
-  }
   void insertNode(const Unicode& key, const DictUnit* ptValue) {
     if (key.begin() == key.end()) {
       return;
diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index 05519c3..3fff55d 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -4,6 +4,7 @@
 #include "src/HMMSegment.hpp"
 #include "src/FullSegment.hpp"
 #include "src/QuerySegment.hpp"
+#include "src/LevelSegment.hpp"
 #include "gtest/gtest.h"
 
 using namespace CppJieba;
@@ -86,50 +87,52 @@ TEST(MixSegmentTest, UserDict2) {
 
 TEST(MPSegmentTest, Test1) {
   MPSegment segment("../dict/jieba.dict.utf8");;
-  const char* str = "我来自北京邮电大学。";
-  const char* res[] = {"我", "来自", "北京邮电大学", "。"};
+  string s;
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
-  ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+  ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
 
-  {
-    const char* str = "B超 T恤";
-    const char * res[] = {"B超", " ", "T恤"};
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
-  }
+  ASSERT_TRUE(segment.cut("B超 T恤", words));
+  ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
+
+  ASSERT_TRUE(segment.cut("南京市长江大桥", words));
+  ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
+
+  // MaxWordLen
+  //ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
+  //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
 }
 
-TEST(MPSegmentTest, Test2) {
-  MPSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
-  string line;
-  ifstream ifs("../test/testdata/review.100");
-  vector<string> words;
+//TEST(MPSegmentTest, Test2) {
+//  MPSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
+//  string line;
+//  ifstream ifs("../test/testdata/review.100");
+//  vector<string> words;
+//
+//  string eRes;
+//  {
+//    ifstream ifs("../test/testdata/review.100.res");
+//    ASSERT_TRUE(!!ifs);
+//    eRes << ifs;
+//  }
+//  string res;
+//
+//  while(getline(ifs, line)) {
+//    res += line;
+//    res += '\n';
+//
+//    segment.cut(line, words);
+//    string s;
+//    s << words;
+//    res += s;
+//    res += '\n';
+//  }
+//  ofstream ofs("../test/testdata/review.100.res");
+//  ASSERT_TRUE(!!ofs);
+//  ofs << res;
+//
+//}
 
-  string eRes;
-  {
-    ifstream ifs("../test/testdata/review.100.res");
-    ASSERT_TRUE(!!ifs);
-    eRes << ifs;
-  }
-  string res;
-
-  while(getline(ifs, line)) {
-    res += line;
-    res += '\n';
-
-    segment.cut(line, words);
-    string s;
-    s << words;
-    res += s;
-    res += '\n';
-  }
-  ofstream ofs("../test/testdata/review.100.res");
-  ASSERT_TRUE(!!ofs);
-  ofs << res;
-
-}
 TEST(HMMSegmentTest, Test1) {
   HMMSegment segment("../dict/hmm_model.utf8");;
   {
@@ -203,3 +206,15 @@ TEST(QuerySegment, Test2) {
   }
 }
 
+
+TEST(LevelSegmentTest, Test0) {
+  string s;
+  LevelSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
+  vector<pair<string, size_t> > words;
+  segment.cut("南京市长江大桥", words);
+  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", s << words);
+
+  vector<string> res;
+  segment.cut("南京市长江大桥", res);
+  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", s << res);
+}
diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp
index 8bd7c4d..ac964f7 100644
--- a/test/unittest/TTrie.cpp
+++ b/test/unittest/TTrie.cpp
@@ -122,28 +122,26 @@ TEST(DictTrieTest, Dag) {
     }
   }
 
-  //findByLimit [2, 3]
   {
     string word = "长江大桥";
     Unicode unicode;
     ASSERT_TRUE(TransCode::decode(word, unicode));
     vector<struct Dag> res;
-    trie.findByLimit(unicode.begin(), unicode.end(), 2, 3, res);
+    trie.find(unicode.begin(), unicode.end(), res, 3);
 
-    size_t nexts_sizes[] = {1, 0, 1, 0};
+    size_t nexts_sizes[] = {2, 1, 2, 1};
     ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
     for (size_t i = 0; i < res.size(); i++) {
       ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
     }
   }
 
-  //findByLimit [0, 4]
   {
     string word = "长江大桥";
     Unicode unicode;
     ASSERT_TRUE(TransCode::decode(word, unicode));
     vector<struct Dag> res;
-    trie.findByLimit(unicode.begin(), unicode.end(), 0, 4, res);
+    trie.find(unicode.begin(), unicode.end(), res, 4);
 
     size_t nexts_sizes[] = {3, 1, 2, 1};
     ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
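
Usage note (not part of the patch): a minimal sketch of how the LevelSegment API added above can be called, mirroring LevelSegmentTest.Test0. The dictionary path below is a placeholder; point it at a dictionary from your checkout.

// Sketch only: exercises the new LevelSegment(dictPath) constructor and the
// cut(sentence, vector<pair<string, size_t> >&) overload introduced in this patch.
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "src/LevelSegment.hpp"

int main() {
  // Placeholder path; use the dictionary shipped with the project.
  CppJieba::LevelSegment segment("../dict/jieba.dict.utf8");

  // Level 0 carries the multi-rune words of the plain MP segmentation; every
  // word of 3+ runes is then re-cut with a smaller max word length, and its
  // multi-rune pieces appear at level 1, 2, ... Single-rune pieces are dropped.
  std::vector<std::pair<std::string, size_t> > words;
  segment.cut("南京市长江大桥", words);

  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i].first << ":" << words[i].second << std::endl;
  }
  return 0;
}

For this input, LevelSegmentTest.Test0 above expects 南京市:0, 长江大桥:0, 南京:1, 长江:1, 大桥:1 (with the small test dictionary).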