mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
解决一些历史遗留问题
This commit is contained in:
parent
620d276887
commit
0e16e000ea
@ -66,9 +66,6 @@ class DictTrie {
|
||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
return trie_->find(begin, end);
|
||||
}
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, LocalVector<pair<size_t, const DictUnit*> >& nexts, size_t offset = 0) const {
|
||||
return trie_->find(begin, end, nexts, offset);
|
||||
}
|
||||
void find(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<Dag>& res) const {
|
||||
|
@ -42,6 +42,23 @@ class FullSegment: public SegmentBase {
|
||||
//tmp variables
|
||||
int wordLen = 0;
|
||||
assert(dictTrie_);
|
||||
vector<struct Dag> dags;
|
||||
dictTrie_->find(begin, end, dags);
|
||||
for (size_t i = 0; i < dags.size(); i++) {
|
||||
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
|
||||
const DictUnit* du = dags[i].nexts[j].second;
|
||||
if (du == NULL) {
|
||||
continue;
|
||||
}
|
||||
wordLen = du->word.size();
|
||||
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
||||
res.push_back(du->word);
|
||||
}
|
||||
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
||||
}
|
||||
uIdx++;
|
||||
}
|
||||
/*
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
||||
//find word start from uItr
|
||||
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
||||
@ -64,6 +81,7 @@ class FullSegment: public SegmentBase {
|
||||
}
|
||||
++uIdx;
|
||||
}
|
||||
*/
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -48,13 +48,13 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
vector<Dag> segmentChars;
|
||||
vector<Dag> dags;
|
||||
|
||||
dictTrie_->find(begin, end, segmentChars);
|
||||
dictTrie_->find(begin, end, dags);
|
||||
|
||||
calcDP_(segmentChars);
|
||||
calcDP_(dags);
|
||||
|
||||
cut_(segmentChars, res);
|
||||
cut_(dags, res);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -63,12 +63,12 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
private:
|
||||
void calcDP_(vector<Dag>& segmentChars) const {
|
||||
void calcDP_(vector<Dag>& dags) const {
|
||||
size_t nextPos;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
|
||||
for(vector<Dag>::reverse_iterator rit = segmentChars.rbegin(); rit != segmentChars.rend(); rit++) {
|
||||
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||
rit->pInfo = NULL;
|
||||
rit->weight = MIN_DOUBLE;
|
||||
assert(!rit->nexts.empty());
|
||||
@ -76,8 +76,8 @@ class MPSegment: public SegmentBase {
|
||||
nextPos = it->first;
|
||||
p = it->second;
|
||||
val = 0.0;
|
||||
if(nextPos + 1 < segmentChars.size()) {
|
||||
val += segmentChars[nextPos + 1].weight;
|
||||
if(nextPos + 1 < dags.size()) {
|
||||
val += dags[nextPos + 1].weight;
|
||||
}
|
||||
|
||||
if(p) {
|
||||
@ -92,16 +92,16 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
void cut_(const vector<Dag>& segmentChars,
|
||||
void cut_(const vector<Dag>& dags,
|
||||
vector<Unicode>& res) const {
|
||||
size_t i = 0;
|
||||
while(i < segmentChars.size()) {
|
||||
const DictUnit* p = segmentChars[i].pInfo;
|
||||
while(i < dags.size()) {
|
||||
const DictUnit* p = dags[i].pInfo;
|
||||
if(p) {
|
||||
res.push_back(p->word);
|
||||
i += p->word.size();
|
||||
} else { //single chinese word
|
||||
res.push_back(Unicode(1, segmentChars[i].uniCh));
|
||||
res.push_back(Unicode(1, dags[i].uniCh));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
38
src/Trie.hpp
38
src/Trie.hpp
@ -93,9 +93,9 @@ class Trie {
|
||||
Unicode::value_type ch = *(begin + i);
|
||||
ptNode = _base + ch;
|
||||
res[i].uniCh = ch;
|
||||
assert(res[i].dag.empty());
|
||||
assert(res[i].nexts.empty());
|
||||
|
||||
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue));
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
|
||||
for (size_t j = i + 1; j < size_t(end - begin); j++) {
|
||||
if (ptNode->next == NULL) {
|
||||
@ -107,43 +107,11 @@ class Trie {
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if (NULL != ptNode->ptValue) {
|
||||
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bool find(
|
||||
Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
LocalVector<pair<size_t, const DictUnit*> > & res,
|
||||
size_t offset = 0) const {
|
||||
if (begin == end) {
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
const TrieNode* ptNode = _base + (*(begin++));
|
||||
if (ptNode->ptValue != NULL && res.size() == 1) {
|
||||
res[0].second = ptNode->ptValue;
|
||||
} else if (ptNode->ptValue != NULL) {
|
||||
res.push_back(DagType::value_type(offset, ptNode->ptValue));
|
||||
}
|
||||
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for (Unicode::const_iterator itr = begin; itr != end; itr++) {
|
||||
if (NULL == ptNode->next) {
|
||||
break;
|
||||
}
|
||||
citer = ptNode->next->find(*itr);
|
||||
if (citer == ptNode->next->end()) {
|
||||
break;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if (NULL != ptNode->ptValue) {
|
||||
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
|
||||
}
|
||||
}
|
||||
return !res.empty();
|
||||
}
|
||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
if (key.begin() == key.end()) {
|
||||
return;
|
||||
|
@ -15,7 +15,6 @@ TEST(DictTrieTest, NewAndDelete) {
|
||||
}
|
||||
|
||||
TEST(DictTrieTest, Test1) {
|
||||
|
||||
string s1, s2;
|
||||
DictTrie trie;
|
||||
trie.init(DICT_FILE);
|
||||
@ -32,22 +31,23 @@ TEST(DictTrieTest, Test1) {
|
||||
|
||||
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
|
||||
word = "清华大学";
|
||||
LocalVector<pair<size_t, const DictUnit*> > res, res2;
|
||||
LocalVector<pair<size_t, const DictUnit*> > res;
|
||||
const char * words[] = {"清", "清华", "清华大学"};
|
||||
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
|
||||
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
||||
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
||||
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
||||
}
|
||||
//DictUnit
|
||||
//res.push_back(make_pair(0, ))
|
||||
|
||||
vector<pair<size_t, const DictUnit*> > vec;
|
||||
vector<struct Dag> dags;
|
||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
|
||||
trie.find(uni.begin(), uni.end(), dags);
|
||||
ASSERT_EQ(dags.size(), uni.size());
|
||||
ASSERT_NE(dags.size(), 0u);
|
||||
s1 << res;
|
||||
s2 << res;
|
||||
s2 << dags[0].nexts;
|
||||
ASSERT_EQ(s1, s2);
|
||||
|
||||
}
|
||||
|
||||
TEST(DictTrieTest, UserDict) {
|
||||
@ -62,12 +62,48 @@ TEST(DictTrieTest, UserDict) {
|
||||
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
|
||||
}
|
||||
|
||||
TEST(DictTrieTest, automation) {
|
||||
TEST(DictTrieTest, Dag) {
|
||||
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
|
||||
//string word = "yasherhs";
|
||||
string word = "abcderf";
|
||||
|
||||
{
|
||||
string word = "清华大学";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
vector<struct Dag> res;
|
||||
trie.find(unicode.begin(), unicode.end(), res);
|
||||
|
||||
size_t nexts_sizes[] = {3, 2, 2, 1};
|
||||
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
string word = "北京邮电大学";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
vector<struct Dag> res;
|
||||
trie.find(unicode.begin(), unicode.end(), res);
|
||||
|
||||
size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
|
||||
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
string word = "长江大桥";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
vector<struct Dag> res;
|
||||
trie.find(unicode.begin(), unicode.end(), res);
|
||||
|
||||
size_t nexts_sizes[] = {3, 1, 2, 1};
|
||||
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user