mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
新增findByLimit函数
This commit is contained in:
parent
78e41e5fd0
commit
4d86abb001
@ -71,6 +71,13 @@ class DictTrie {
|
|||||||
vector<Dag>& res) const {
|
vector<Dag>& res) const {
|
||||||
trie_->find(begin, end, res);
|
trie_->find(begin, end, res);
|
||||||
}
|
}
|
||||||
|
// Length-bounded lookup: forwards every argument unchanged to
// trie_->findByLimit(). This const wrapper adds no logic of its own.
//   begin, end    - iterators delimiting the Unicode sequence to scan.
//   res           - output vector handed straight to trie_->findByLimit().
//   min_word_len  - lower length bound, passed through as-is.
//   max_word_len  - upper length bound, passed through as-is.
// NOTE(review): presumably trie_ points at the Trie declared in src/Trie.hpp,
// whose findByLimit() resizes res to (end - begin) -- confirm trie_'s type.
void findByLimit(Unicode::const_iterator begin,
|
||||||
|
Unicode::const_iterator end,
|
||||||
|
vector<struct Dag>&res,
|
||||||
|
size_t min_word_len,
|
||||||
|
size_t max_word_len) const {
|
||||||
|
// Pure delegation; the bounds are not validated or adjusted here.
trie_->findByLimit(begin, end, res, min_word_len, max_word_len);
|
||||||
|
}
|
||||||
bool isUserDictSingleChineseWord(const Rune& word) const {
|
bool isUserDictSingleChineseWord(const Rune& word) const {
|
||||||
return isIn(userDictSingleChineseWord_, word);
|
return isIn(userDictSingleChineseWord_, word);
|
||||||
}
|
}
|
||||||
|
40
src/Trie.hpp
40
src/Trie.hpp
@ -8,6 +8,9 @@
|
|||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
// Default word-length bounds: find() passes these two values to findByLimit().
// Lengths are counted in elements of the scanned Unicode sequence
// (findByLimit() measures a candidate as j - i + 1).
// Shortest match length searched by default.
const size_t MIN_WORD_LENGTH = 1;
|
||||||
|
// Longest match length searched by default.
const size_t MAX_WORD_LENGTH = 512;
|
||||||
|
|
||||||
struct DictUnit {
|
struct DictUnit {
|
||||||
Unicode word;
|
Unicode word;
|
||||||
double weight;
|
double weight;
|
||||||
@ -35,7 +38,8 @@ typedef Rune TrieKey;
|
|||||||
|
|
||||||
class TrieNode {
|
class TrieNode {
|
||||||
public :
|
public :
|
||||||
TrieNode(): next(NULL), ptValue(NULL) {}
|
TrieNode(): next(NULL), ptValue(NULL) {
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||||
NextMap *next;
|
NextMap *next;
|
||||||
@ -82,22 +86,32 @@ class Trie {
|
|||||||
return ptNode->ptValue;
|
return ptNode->ptValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
void find(Unicode::const_iterator begin,
|
void findByLimit(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<struct Dag>& res) const {
|
vector<struct Dag>&res,
|
||||||
|
size_t min_word_len,
|
||||||
|
size_t max_word_len) const {
|
||||||
res.resize(end - begin);
|
res.resize(end - begin);
|
||||||
|
|
||||||
|
// min_word_len starts from 1; any smaller value is raised to 1.
|
||||||
|
if (min_word_len < 1) {
|
||||||
|
min_word_len = 1;
|
||||||
|
}
|
||||||
|
|
||||||
const TrieNode *ptNode = NULL;
|
const TrieNode *ptNode = NULL;
|
||||||
TrieNode::NextMap::const_iterator citer;
|
TrieNode::NextMap::const_iterator citer;
|
||||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||||
Rune ch = *(begin + i);
|
Rune rune = *(begin + i);
|
||||||
ptNode = _base + ch;
|
ptNode = _base + rune;
|
||||||
res[i].rune = ch;
|
res[i].rune = rune;
|
||||||
assert(res[i].nexts.empty());
|
assert(res[i].nexts.empty());
|
||||||
|
|
||||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
if (min_word_len <= 1) {
|
||||||
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t j = i + 1; j < size_t(end - begin); j++) {
|
// A candidate spanning [i, j] has length j - i + 1; stop extending once that exceeds max_word_len.
|
||||||
|
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) {
|
||||||
if (ptNode->next == NULL) {
|
if (ptNode->next == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -106,12 +120,18 @@ class Trie {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ptNode = citer->second;
|
ptNode = citer->second;
|
||||||
if (NULL != ptNode->ptValue) {
|
if (NULL != ptNode->ptValue && (j - i + 1) >= min_word_len) {
|
||||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up matches in [begin, end) using the default length bounds: simply
// calls findByLimit() with MIN_WORD_LENGTH and MAX_WORD_LENGTH.
//   begin, end - iterators delimiting the Unicode sequence to scan.
//   res        - output vector, filled by findByLimit().
// NOTE(review): findByLimit() stops extending a match once its length would
// exceed max_word_len, so matches longer than MAX_WORD_LENGTH elements are
// not reported through this entry point.
void find(Unicode::const_iterator begin,
|
||||||
|
Unicode::const_iterator end,
|
||||||
|
vector<struct Dag>& res) const {
|
||||||
|
findByLimit(begin, end, res, MIN_WORD_LENGTH, MAX_WORD_LENGTH);
|
||||||
|
}
|
||||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||||
if (key.begin() == key.end()) {
|
if (key.begin() == key.end()) {
|
||||||
return;
|
return;
|
||||||
|
@ -6,6 +6,20 @@ using namespace CppJieba;
|
|||||||
|
|
||||||
static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
|
static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
|
||||||
|
|
||||||
|
// Builds a Trie from empty key and value vectors.
// The test makes no assertions; it only exercises construction of the local
// `trie` object and its destruction at scope exit.
TEST(TrieTest, Empty) {
|
||||||
|
vector<Unicode> keys;
|
||||||
|
vector<const DictUnit*> values;
|
||||||
|
// Both vectors are still empty here.
Trie trie(keys, values);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds a Trie from exactly one key/value pair: the key decoded from "你"
// and a NULL DictUnit pointer as its value.
// The test makes no assertions; it only exercises construction of the local
// `trie` object and its destruction at scope exit.
TEST(TrieTest, Construct) {
|
||||||
|
vector<Unicode> keys;
|
||||||
|
vector<const DictUnit*> values;
|
||||||
|
keys.push_back(TransCode::decode("你"));
|
||||||
|
// The single key is paired with a null value pointer.
values.push_back((const DictUnit*)(NULL));
|
||||||
|
Trie trie(keys, values);
|
||||||
|
}
|
||||||
|
|
||||||
TEST(DictTrieTest, NewAndDelete) {
|
TEST(DictTrieTest, NewAndDelete) {
|
||||||
DictTrie * trie;
|
DictTrie * trie;
|
||||||
trie = new DictTrie(DICT_FILE);
|
trie = new DictTrie(DICT_FILE);
|
||||||
@ -14,6 +28,7 @@ TEST(DictTrieTest, NewAndDelete) {
|
|||||||
delete trie;
|
delete trie;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(DictTrieTest, Test1) {
|
TEST(DictTrieTest, Test1) {
|
||||||
string s1, s2;
|
string s1, s2;
|
||||||
DictTrie trie;
|
DictTrie trie;
|
||||||
@ -106,4 +121,34 @@ TEST(DictTrieTest, Dag) {
|
|||||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// findByLimit with min_word_len = 2 and max_word_len = 3
|
||||||
|
{
|
||||||
|
string word = "长江大桥";
|
||||||
|
Unicode unicode;
|
||||||
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
|
vector<struct Dag> res;
|
||||||
|
trie.findByLimit(unicode.begin(), unicode.end(), res, 2, 3);
|
||||||
|
|
||||||
|
size_t nexts_sizes[] = {1, 0, 1, 0};
|
||||||
|
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||||
|
for (size_t i = 0; i < res.size(); i++) {
|
||||||
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// findByLimit with min_word_len = 0 and max_word_len = 4
|
||||||
|
{
|
||||||
|
string word = "长江大桥";
|
||||||
|
Unicode unicode;
|
||||||
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
|
vector<struct Dag> res;
|
||||||
|
trie.findByLimit(unicode.begin(), unicode.end(), res, 0, 4);
|
||||||
|
|
||||||
|
size_t nexts_sizes[] = {3, 1, 2, 1};
|
||||||
|
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||||
|
for (size_t i = 0; i < res.size(); i++) {
|
||||||
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user