新增findByLimit函数

This commit is contained in:
yanyiwu 2015-07-23 21:10:56 +08:00
parent 78e41e5fd0
commit 4d86abb001
3 changed files with 82 additions and 10 deletions

View File

@ -71,6 +71,13 @@ class DictTrie {
vector<Dag>& res) const {
trie_->find(begin, end, res);
}
void findByLimit(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<struct Dag>&res,
size_t min_word_len,
size_t max_word_len) const {
trie_->findByLimit(begin, end, res, min_word_len, max_word_len);
}
// Reports the result of isIn() for `word` against userDictSingleChineseWord_.
bool isUserDictSingleChineseWord(const Rune& word) const {
  const bool found = isIn(userDictSingleChineseWord_, word);
  return found;
}

View File

@ -8,6 +8,9 @@
namespace CppJieba {
using namespace std;
// Word-length bounds (counted in runes) that Trie::find passes to
// Trie::findByLimit as its min_word_len / max_word_len arguments.
const size_t MIN_WORD_LENGTH = 1;
const size_t MAX_WORD_LENGTH = 512;
struct DictUnit {
Unicode word;
double weight;
@ -35,7 +38,8 @@ typedef Rune TrieKey;
class TrieNode {
public :
TrieNode(): next(NULL), ptValue(NULL) {}
TrieNode(): next(NULL), ptValue(NULL) {
}
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
NextMap *next;
@ -82,22 +86,32 @@ class Trie {
return ptNode->ptValue;
}
void find(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<struct Dag>& res) const {
// Fills `res` with one Dag entry per rune of [begin, end).
// For position i, res[i].rune is the rune at i, and res[i].nexts receives
// (j, value) pairs for nodes reached from i that carry a non-NULL ptValue and
// whose word length (j - i + 1) lies in [min_word_len, max_word_len].
//   min_word_len: values below 1 are raised to 1. When the effective minimum
//                 is 1, the single-rune entry (i, ptValue) is always added,
//                 even if that ptValue is NULL.
//   max_word_len: bounds only the multi-rune extension loop; the single-rune
//                 entry is not checked against it.
// NOTE(review): each res[i].nexts is asserted empty after the resize, so
// callers presumably have to pass an empty `res` -- confirm.
void findByLimit(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<struct Dag>&res,
size_t min_word_len,
size_t max_word_len) const {
// One output slot per input rune.
res.resize(end - begin);
// Word lengths are counted from 1, so a smaller minimum is clamped up to 1.
if (min_word_len < 1) {
min_word_len = 1;
}
const TrieNode *ptNode = NULL;
TrieNode::NextMap::const_iterator citer;
for (size_t i = 0; i < size_t(end - begin); i++) {
Rune ch = *(begin + i);
ptNode = _base + ch;
res[i].rune = ch;
// NOTE(review): _base is indexed directly by the rune value; presumably it
// holds one first-level node per possible Rune -- confirm.
Rune rune = *(begin + i);
ptNode = _base + rune;
res[i].rune = rune;
assert(res[i].nexts.empty());
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
// The single-rune candidate (length 1) is recorded only when the minimum allows it.
if (min_word_len <= 1) {
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
}
for (size_t j = i + 1; j < size_t(end - begin); j++) {
// Candidate word length is (j - i + 1); stop extending once it would exceed max_word_len.
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) {
// No children under the current node: nothing longer can be reached from i.
if (ptNode->next == NULL) {
break;
}
@ -106,12 +120,18 @@ class Trie {
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue) {
// Record the node only if it carries a value and the word is long enough.
if (NULL != ptNode->ptValue && (j - i + 1) >= min_word_len) {
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
}
}
}
}
// Default lookup: delegates to findByLimit using the file-level bounds
// MIN_WORD_LENGTH and MAX_WORD_LENGTH.
void find(Unicode::const_iterator begin,
          Unicode::const_iterator end,
          vector<struct Dag>& res) const {
  this->findByLimit(begin, end, res, MIN_WORD_LENGTH, MAX_WORD_LENGTH);
}
void insertNode(const Unicode& key, const DictUnit* ptValue) {
if (key.begin() == key.end()) {
return;

View File

@ -6,6 +6,20 @@ using namespace CppJieba;
// Relative path of the small dictionary fixture passed to the DictTrie
// constructor by the tests in this file.
static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
// Builds a Trie from empty key and value lists; the test has no assertions
// and only checks that construction and destruction complete.
TEST(TrieTest, Empty) {
  vector<Unicode> noKeys;
  vector<const DictUnit*> noValues;
  Trie emptyTrie(noKeys, noValues);
}
// Builds a Trie from a single empty-string key paired with a NULL value; the
// test has no assertions and only checks that construction completes.
TEST(TrieTest, Construct) {
  vector<Unicode> keyList;
  vector<const DictUnit*> valueList;

  keyList.push_back(TransCode::decode(""));
  valueList.push_back(static_cast<const DictUnit*>(NULL));

  Trie trie(keyList, valueList);
}
TEST(DictTrieTest, NewAndDelete) {
DictTrie * trie;
trie = new DictTrie(DICT_FILE);
@ -14,6 +28,7 @@ TEST(DictTrieTest, NewAndDelete) {
delete trie;
}
TEST(DictTrieTest, Test1) {
string s1, s2;
DictTrie trie;
@ -106,4 +121,34 @@ TEST(DictTrieTest, Dag) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
// findByLimit with word lengths restricted to [2, 3]: checks how many
// candidates are reported for each rune position.
{
  string text = "长江大桥";
  Unicode runes;
  ASSERT_TRUE(TransCode::decode(text, runes));

  vector<struct Dag> dags;
  trie.findByLimit(runes.begin(), runes.end(), dags, 2, 3);

  size_t expected[] = {1, 0, 1, 0};
  ASSERT_EQ(dags.size(), sizeof(expected)/sizeof(expected[0]));
  for (size_t pos = 0; pos < dags.size(); ++pos) {
    ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
  }
}
// findByLimit called with bounds [0, 4]: checks how many candidates are
// reported for each rune position.
{
  string text = "长江大桥";
  Unicode runes;
  ASSERT_TRUE(TransCode::decode(text, runes));

  vector<struct Dag> dags;
  trie.findByLimit(runes.begin(), runes.end(), dags, 0, 4);

  size_t expected[] = {3, 1, 2, 1};
  ASSERT_EQ(dags.size(), sizeof(expected)/sizeof(expected[0]));
  for (size_t pos = 0; pos < dags.size(); ++pos) {
    ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
  }
}
}