diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index bed15cf..15de328 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -66,9 +66,6 @@ class DictTrie { const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { return trie_->find(begin, end); } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, LocalVector >& nexts, size_t offset = 0) const { - return trie_->find(begin, end, nexts, offset); - } void find(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 6c8153f..1a70315 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -42,6 +42,23 @@ class FullSegment: public SegmentBase { //tmp variables int wordLen = 0; assert(dictTrie_); + vector dags; + dictTrie_->find(begin, end, dags); + for (size_t i = 0; i < dags.size(); i++) { + for (size_t j = 0; j < dags[i].nexts.size(); j++) { + const DictUnit* du = dags[i].nexts[j].second; + if (du == NULL) { + continue; + } + wordLen = du->word.size(); + if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { + res.push_back(du->word); + } + maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx; + } + uIdx++; + } + /* for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { //find word start from uItr if (dictTrie_->find(uItr, end, tRes, 0)) { @@ -64,6 +81,7 @@ class FullSegment: public SegmentBase { } ++uIdx; } + */ return true; } diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 1d9cd16..f35158c 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -48,13 +48,13 @@ class MPSegment: public SegmentBase { } bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const { - vector segmentChars; + vector dags; - dictTrie_->find(begin, end, segmentChars); + dictTrie_->find(begin, end, dags); - calcDP_(segmentChars); + calcDP_(dags); - cut_(segmentChars, res); + cut_(dags, res); return true; } @@ -63,12 +63,12 @@ class MPSegment: public SegmentBase { } private: - void calcDP_(vector& segmentChars) const { + void calcDP_(vector& dags) const { size_t nextPos; const DictUnit* p; double val; - for(vector::reverse_iterator rit = segmentChars.rbegin(); rit != segmentChars.rend(); rit++) { + for(vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { rit->pInfo = NULL; rit->weight = MIN_DOUBLE; assert(!rit->nexts.empty()); @@ -76,8 +76,8 @@ class MPSegment: public SegmentBase { nextPos = it->first; p = it->second; val = 0.0; - if(nextPos + 1 < segmentChars.size()) { - val += segmentChars[nextPos + 1].weight; + if(nextPos + 1 < dags.size()) { + val += dags[nextPos + 1].weight; } if(p) { @@ -92,16 +92,16 @@ class MPSegment: public SegmentBase { } } } - void cut_(const vector& segmentChars, + void cut_(const vector& dags, vector& res) const { size_t i = 0; - while(i < segmentChars.size()) { - const DictUnit* p = segmentChars[i].pInfo; + while(i < dags.size()) { + const DictUnit* p = dags[i].pInfo; if(p) { res.push_back(p->word); i += p->word.size(); } else { //single chinese word - res.push_back(Unicode(1, segmentChars[i].uniCh)); + res.push_back(Unicode(1, dags[i].uniCh)); i++; } } diff --git a/src/Trie.hpp b/src/Trie.hpp index 18e1b82..289787b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -93,9 +93,9 @@ class Trie { Unicode::value_type ch = *(begin + i); ptNode = _base + ch; res[i].uniCh = ch; - assert(res[i].dag.empty()); + assert(res[i].nexts.empty()); - res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); + res[i].nexts.push_back(pair(i, ptNode->ptValue)); for (size_t j = i + 1; j < size_t(end - begin); j++) { if (ptNode->next == NULL) { @@ -107,43 +107,11 @@ class Trie { } ptNode = citer->second; if (NULL != ptNode->ptValue) { - res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); + res[i].nexts.push_back(pair(j, ptNode->ptValue)); } } } } - bool find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - LocalVector > & res, - size_t offset = 0) const { - if (begin == end) { - return !res.empty(); - } - - const TrieNode* ptNode = _base + (*(begin++)); - if (ptNode->ptValue != NULL && res.size() == 1) { - res[0].second = ptNode->ptValue; - } else if (ptNode->ptValue != NULL) { - res.push_back(DagType::value_type(offset, ptNode->ptValue)); - } - - TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator itr = begin; itr != end; itr++) { - if (NULL == ptNode->next) { - break; - } - citer = ptNode->next->find(*itr); - if (citer == ptNode->next->end()) { - break; - } - ptNode = citer->second; - if (NULL != ptNode->ptValue) { - res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); - } - } - return !res.empty(); - } void insertNode(const Unicode& key, const DictUnit* ptValue) { if (key.begin() == key.end()) { return; diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 628a931..d40d8e6 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -15,7 +15,6 @@ TEST(DictTrieTest, NewAndDelete) { } TEST(DictTrieTest, Test1) { - string s1, s2; DictTrie trie; trie.init(DICT_FILE); @@ -32,22 +31,23 @@ TEST(DictTrieTest, Test1) { EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); word = "清华大学"; - LocalVector > res, res2; + LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { ASSERT_TRUE(TransCode::decode(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); } - //DictUnit - //res.push_back(make_pair(0, )) - vector > vec; + vector dags; ASSERT_TRUE(TransCode::decode(word, uni)); - ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); + trie.find(uni.begin(), uni.end(), dags); + ASSERT_EQ(dags.size(), uni.size()); + ASSERT_NE(dags.size(), 0u); s1 << res; - s2 << res; + s2 << dags[0].nexts; ASSERT_EQ(s1, s2); + } TEST(DictTrieTest, UserDict) { @@ -62,12 +62,48 @@ TEST(DictTrieTest, UserDict) { ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res); } -TEST(DictTrieTest, automation) { +TEST(DictTrieTest, Dag) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); - //string word = "yasherhs"; - string word = "abcderf"; - Unicode unicode; - ASSERT_TRUE(TransCode::decode(word, unicode)); - vector res; - trie.find(unicode.begin(), unicode.end(), res); + + { + string word = "清华大学"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.find(unicode.begin(), unicode.end(), res); + + size_t nexts_sizes[] = {3, 2, 2, 1}; + ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); + for (size_t i = 0; i < res.size(); i++) { + ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); + } + } + + { + string word = "北京邮电大学"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.find(unicode.begin(), unicode.end(), res); + + size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1}; + ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); + for (size_t i = 0; i < res.size(); i++) { + ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); + } + } + + { + string word = "长江大桥"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.find(unicode.begin(), unicode.end(), res); + + size_t nexts_sizes[] = {3, 1, 2, 1}; + ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); + for (size_t i = 0; i < res.size(); i++) { + ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); + } + } }