mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
新增findByLimit函数
This commit is contained in:
parent
78e41e5fd0
commit
4d86abb001
@ -71,6 +71,13 @@ class DictTrie {
|
||||
vector<Dag>& res) const {
|
||||
trie_->find(begin, end, res);
|
||||
}
|
||||
// Forwards a length-limited lookup to the underlying trie.
//   begin, end    - rune range to scan
//   res           - output vector, filled in by trie_->findByLimit()
//   min_word_len  - lower word-length bound, passed through unchanged
//   max_word_len  - upper word-length bound, passed through unchanged
void findByLimit(Unicode::const_iterator begin, Unicode::const_iterator end,
                 vector<struct Dag>& res, size_t min_word_len,
                 size_t max_word_len) const {
  trie_->findByLimit(begin, end, res, min_word_len, max_word_len);
}
|
||||
bool isUserDictSingleChineseWord(const Rune& word) const {
|
||||
return isIn(userDictSingleChineseWord_, word);
|
||||
}
|
||||
|
40
src/Trie.hpp
40
src/Trie.hpp
@ -8,6 +8,9 @@
|
||||
namespace CppJieba {
|
||||
using namespace std;
|
||||
|
||||
// Minimum word length, in runes, that Trie::find() passes to findByLimit().
const size_t MIN_WORD_LENGTH = 1;
|
||||
// Maximum word length, in runes, that Trie::find() passes to findByLimit().
const size_t MAX_WORD_LENGTH = 512;
|
||||
|
||||
struct DictUnit {
|
||||
Unicode word;
|
||||
double weight;
|
||||
@ -35,7 +38,8 @@ typedef Rune TrieKey;
|
||||
|
||||
class TrieNode {
|
||||
public :
|
||||
TrieNode(): next(NULL), ptValue(NULL) {}
|
||||
TrieNode(): next(NULL), ptValue(NULL) {
|
||||
}
|
||||
public:
|
||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||
NextMap *next;
|
||||
@ -82,22 +86,32 @@ class Trie {
|
||||
return ptNode->ptValue;
|
||||
}
|
||||
|
||||
void find(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<struct Dag>& res) const {
|
||||
void findByLimit(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<struct Dag>&res,
|
||||
size_t min_word_len,
|
||||
size_t max_word_len) const {
|
||||
res.resize(end - begin);
|
||||
|
||||
// Word lengths are counted from 1, so treat any smaller minimum as 1.
|
||||
if (min_word_len < 1) {
|
||||
min_word_len = 1;
|
||||
}
|
||||
|
||||
const TrieNode *ptNode = NULL;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||
Rune ch = *(begin + i);
|
||||
ptNode = _base + ch;
|
||||
res[i].rune = ch;
|
||||
Rune rune = *(begin + i);
|
||||
ptNode = _base + rune;
|
||||
res[i].rune = rune;
|
||||
assert(res[i].nexts.empty());
|
||||
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
if (min_word_len <= 1) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
}
|
||||
|
||||
for (size_t j = i + 1; j < size_t(end - begin); j++) {
|
||||
// The candidate word spans runes i..j, so its length is j - i + 1; stop extending once that exceeds max_word_len.
|
||||
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) {
|
||||
if (ptNode->next == NULL) {
|
||||
break;
|
||||
}
|
||||
@ -106,12 +120,18 @@ class Trie {
|
||||
break;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if (NULL != ptNode->ptValue) {
|
||||
if (NULL != ptNode->ptValue && (j - i + 1) >= min_word_len) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience lookup: delegates to findByLimit() using the default bounds
// MIN_WORD_LENGTH and MAX_WORD_LENGTH.
//   begin, end - rune range to scan
//   res        - output vector, filled in by findByLimit()
void find(Unicode::const_iterator begin, Unicode::const_iterator end,
          vector<struct Dag>& res) const {
  findByLimit(begin, end, res, MIN_WORD_LENGTH, MAX_WORD_LENGTH);
}
|
||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
if (key.begin() == key.end()) {
|
||||
return;
|
||||
|
@ -6,6 +6,20 @@ using namespace CppJieba;
|
||||
|
||||
static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
|
||||
|
||||
// Builds a Trie from empty key and value lists; the test passes as long as
// construction and destruction complete.
TEST(TrieTest, Empty) {
  vector<const DictUnit*> values;
  vector<Unicode> keys;
  Trie trie(keys, values);
}
|
||||
|
||||
// Builds a Trie holding a single key whose associated value pointer is NULL;
// the test passes as long as construction and destruction complete.
TEST(TrieTest, Construct) {
  const Unicode key = TransCode::decode("你");
  const DictUnit* const no_value = NULL;

  vector<Unicode> keys;
  keys.push_back(key);

  vector<const DictUnit*> values;
  values.push_back(no_value);

  Trie trie(keys, values);
}
|
||||
|
||||
TEST(DictTrieTest, NewAndDelete) {
|
||||
DictTrie * trie;
|
||||
trie = new DictTrie(DICT_FILE);
|
||||
@ -14,6 +28,7 @@ TEST(DictTrieTest, NewAndDelete) {
|
||||
delete trie;
|
||||
}
|
||||
|
||||
|
||||
TEST(DictTrieTest, Test1) {
|
||||
string s1, s2;
|
||||
DictTrie trie;
|
||||
@ -106,4 +121,34 @@ TEST(DictTrieTest, Dag) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
//findByLimit [2, 3]
|
||||
{
|
||||
string word = "长江大桥";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
vector<struct Dag> res;
|
||||
trie.findByLimit(unicode.begin(), unicode.end(), res, 2, 3);
|
||||
|
||||
size_t nexts_sizes[] = {1, 0, 1, 0};
|
||||
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
//findByLimit [0, 4]
|
||||
{
|
||||
string word = "长江大桥";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
vector<struct Dag> res;
|
||||
trie.findByLimit(unicode.begin(), unicode.end(), res, 0, 4);
|
||||
|
||||
size_t nexts_sizes[] = {3, 1, 2, 1};
|
||||
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user