From 4d86abb0012923158b5f271cea0884c5af233336 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Thu, 23 Jul 2015 21:10:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9EfindByLimit=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/DictTrie.hpp | 7 +++++++ src/Trie.hpp | 40 +++++++++++++++++++++++++++--------- test/unittest/TTrie.cpp | 45 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 10 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index f5347b5..9a5a5ad 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -71,6 +71,13 @@ class DictTrie { vector& res) const { trie_->find(begin, end, res); } + void findByLimit(Unicode::const_iterator begin, + Unicode::const_iterator end, + vector&res, + size_t min_word_len, + size_t max_word_len) const { + trie_->findByLimit(begin, end, res, min_word_len, max_word_len); + } bool isUserDictSingleChineseWord(const Rune& word) const { return isIn(userDictSingleChineseWord_, word); } diff --git a/src/Trie.hpp b/src/Trie.hpp index c42c383..7768dfc 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -8,6 +8,9 @@ namespace CppJieba { using namespace std; +const size_t MIN_WORD_LENGTH = 1; +const size_t MAX_WORD_LENGTH = 512; + struct DictUnit { Unicode word; double weight; @@ -35,7 +38,8 @@ typedef Rune TrieKey; class TrieNode { public : - TrieNode(): next(NULL), ptValue(NULL) {} + TrieNode(): next(NULL), ptValue(NULL) { + } public: typedef unordered_map NextMap; NextMap *next; @@ -82,22 +86,32 @@ class Trie { return ptNode->ptValue; } - void find(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res) const { + void findByLimit(Unicode::const_iterator begin, + Unicode::const_iterator end, + vector&res, + size_t min_word_len, + size_t max_word_len) const { res.resize(end - begin); + // min_word_len start from 1; + if (min_word_len < 1) { + min_word_len = 1; + } + const TrieNode *ptNode = NULL; TrieNode::NextMap::const_iterator citer; for (size_t i = 0; i < size_t(end - begin); i++) { - Rune ch = *(begin + i); - ptNode = _base + ch; - res[i].rune = ch; + Rune rune = *(begin + i); + ptNode = _base + rune; + res[i].rune = rune; assert(res[i].nexts.empty()); - res[i].nexts.push_back(pair(i, ptNode->ptValue)); + if (min_word_len <= 1) { + res[i].nexts.push_back(pair(i, ptNode->ptValue)); + } - for (size_t j = i + 1; j < size_t(end - begin); j++) { + // min_word_len start from 1; + for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) { if (ptNode->next == NULL) { break; } @@ -106,12 +120,18 @@ class Trie { break; } ptNode = citer->second; - if (NULL != ptNode->ptValue) { + if (NULL != ptNode->ptValue && (j - i + 1) >= min_word_len) { res[i].nexts.push_back(pair(j, ptNode->ptValue)); } } } } + + void find(Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res) const { + findByLimit(begin, end, res, MIN_WORD_LENGTH, MAX_WORD_LENGTH); + } void insertNode(const Unicode& key, const DictUnit* ptValue) { if (key.begin() == key.end()) { return; diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index d40d8e6..8f030da 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -6,6 +6,20 @@ using namespace CppJieba; static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8"; +TEST(TrieTest, Empty) { + vector keys; + vector values; + Trie trie(keys, values); +} + +TEST(TrieTest, Construct) { + vector keys; + vector values; + keys.push_back(TransCode::decode("你")); + values.push_back((const DictUnit*)(NULL)); + Trie trie(keys, values); +} + TEST(DictTrieTest, NewAndDelete) { DictTrie * trie; trie = new DictTrie(DICT_FILE); @@ -14,6 +28,7 @@ TEST(DictTrieTest, NewAndDelete) { delete trie; } + TEST(DictTrieTest, Test1) { string s1, s2; DictTrie trie; @@ -106,4 +121,34 @@ TEST(DictTrieTest, Dag) { ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); } } + + //findByLimit [2, 3] + { + string word = "长江大桥"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.findByLimit(unicode.begin(), unicode.end(), res, 2, 3); + + size_t nexts_sizes[] = {1, 0, 1, 0}; + ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); + for (size_t i = 0; i < res.size(); i++) { + ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); + } + } + + //findByLimit [0, 4] + { + string word = "长江大桥"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.findByLimit(unicode.begin(), unicode.end(), res, 0, 4); + + size_t nexts_sizes[] = {3, 1, 2, 1}; + ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); + for (size_t i = 0; i < res.size(); i++) { + ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]); + } + } }