解决一些历史遗留问题

This commit is contained in:
yanyiwu 2015-07-21 14:32:05 +08:00
parent 620d276887
commit 0e16e000ea
5 changed files with 83 additions and 64 deletions

View File

@ -66,9 +66,6 @@ class DictTrie {
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
return trie_->find(begin, end); return trie_->find(begin, end);
} }
// Convenience overload: forwards the range [begin, end), the output vector
// `nexts` and `offset` (default 0) to trie_->find() and returns that call's
// result unchanged. No validation is performed here.
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, LocalVector<pair<size_t, const DictUnit*> >& nexts, size_t offset = 0) const {
return trie_->find(begin, end, nexts, offset);
}
void find(Unicode::const_iterator begin, void find(Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<Dag>& res) const { vector<Dag>& res) const {

View File

@ -42,6 +42,23 @@ class FullSegment: public SegmentBase {
//tmp variables //tmp variables
int wordLen = 0; int wordLen = 0;
assert(dictTrie_); assert(dictTrie_);
vector<struct Dag> dags;
dictTrie_->find(begin, end, dags);
for (size_t i = 0; i < dags.size(); i++) {
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
const DictUnit* du = dags[i].nexts[j].second;
if (du == NULL) {
continue;
}
wordLen = du->word.size();
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
res.push_back(du->word);
}
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
}
uIdx++;
}
/*
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr //find word start from uItr
if (dictTrie_->find(uItr, end, tRes, 0)) { if (dictTrie_->find(uItr, end, tRes, 0)) {
@ -64,6 +81,7 @@ class FullSegment: public SegmentBase {
} }
++uIdx; ++uIdx;
} }
*/
return true; return true;
} }

View File

@ -48,13 +48,13 @@ class MPSegment: public SegmentBase {
} }
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const { bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Dag> segmentChars; vector<Dag> dags;
dictTrie_->find(begin, end, segmentChars); dictTrie_->find(begin, end, dags);
calcDP_(segmentChars); calcDP_(dags);
cut_(segmentChars, res); cut_(dags, res);
return true; return true;
} }
@ -63,12 +63,12 @@ class MPSegment: public SegmentBase {
} }
private: private:
void calcDP_(vector<Dag>& segmentChars) const { void calcDP_(vector<Dag>& dags) const {
size_t nextPos; size_t nextPos;
const DictUnit* p; const DictUnit* p;
double val; double val;
for(vector<Dag>::reverse_iterator rit = segmentChars.rbegin(); rit != segmentChars.rend(); rit++) { for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
rit->pInfo = NULL; rit->pInfo = NULL;
rit->weight = MIN_DOUBLE; rit->weight = MIN_DOUBLE;
assert(!rit->nexts.empty()); assert(!rit->nexts.empty());
@ -76,8 +76,8 @@ class MPSegment: public SegmentBase {
nextPos = it->first; nextPos = it->first;
p = it->second; p = it->second;
val = 0.0; val = 0.0;
if(nextPos + 1 < segmentChars.size()) { if(nextPos + 1 < dags.size()) {
val += segmentChars[nextPos + 1].weight; val += dags[nextPos + 1].weight;
} }
if(p) { if(p) {
@ -92,16 +92,16 @@ class MPSegment: public SegmentBase {
} }
} }
} }
void cut_(const vector<Dag>& segmentChars, void cut_(const vector<Dag>& dags,
vector<Unicode>& res) const { vector<Unicode>& res) const {
size_t i = 0; size_t i = 0;
while(i < segmentChars.size()) { while(i < dags.size()) {
const DictUnit* p = segmentChars[i].pInfo; const DictUnit* p = dags[i].pInfo;
if(p) { if(p) {
res.push_back(p->word); res.push_back(p->word);
i += p->word.size(); i += p->word.size();
} else { //single chinese word } else { //single chinese word
res.push_back(Unicode(1, segmentChars[i].uniCh)); res.push_back(Unicode(1, dags[i].uniCh));
i++; i++;
} }
} }

View File

@ -93,9 +93,9 @@ class Trie {
Unicode::value_type ch = *(begin + i); Unicode::value_type ch = *(begin + i);
ptNode = _base + ch; ptNode = _base + ch;
res[i].uniCh = ch; res[i].uniCh = ch;
assert(res[i].dag.empty()); assert(res[i].nexts.empty());
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
for (size_t j = i + 1; j < size_t(end - begin); j++) { for (size_t j = i + 1; j < size_t(end - begin); j++) {
if (ptNode->next == NULL) { if (ptNode->next == NULL) {
@ -107,43 +107,11 @@ class Trie {
} }
ptNode = citer->second; ptNode = citer->second;
if (NULL != ptNode->ptValue) { if (NULL != ptNode->ptValue) {
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
} }
} }
} }
} }
// Walks the trie along the sequence [begin, end) and records in `res` a
// (index, value) pair for each visited node whose ptValue is non-NULL.
//
// begin, end : range of code units to follow through the trie.
// res        : output vector; it is NOT cleared here, so entries already
//              present are kept (and influence the return value).
// offset     : base added to every index stored in `res` (default 0).
//
// Returns true iff `res` is non-empty when the function exits.
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
LocalVector<pair<size_t, const DictUnit*> > & res,
size_t offset = 0) const {
// Empty input: nothing to walk, report whatever `res` already contains.
if (begin == end) {
return !res.empty();
}
// The first element is resolved by direct indexing into _base rather than
// through a NextMap lookup. Note that `begin` is advanced past it here.
// (assumes _base has a slot for every possible value — TODO confirm)
const TrieNode* ptNode = _base + (*(begin++));
// If the caller passed in exactly one pre-existing entry, its value is
// overwritten in place (its index is left untouched); otherwise a new
// (offset, value) entry is appended.
if (ptNode->ptValue != NULL && res.size() == 1) {
res[0].second = ptNode->ptValue;
} else if (ptNode->ptValue != NULL) {
res.push_back(DagType::value_type(offset, ptNode->ptValue));
}
TrieNode::NextMap::const_iterator citer;
// Follow child links for the remaining elements until the path ends.
for (Unicode::const_iterator itr = begin; itr != end; itr++) {
// Current node has no children: no longer match is possible.
if (NULL == ptNode->next) {
break;
}
citer = ptNode->next->find(*itr);
// No child for this element: stop walking.
if (citer == ptNode->next->end()) {
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue) {
// NOTE(review): `begin` was already incremented above, so for the second
// input element `itr - begin` is 0 and the stored index equals `offset`,
// the same index used for the first element. Looks like an off-by-one in
// the recorded position — confirm against callers before relying on it.
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
}
}
return !res.empty();
}
void insertNode(const Unicode& key, const DictUnit* ptValue) { void insertNode(const Unicode& key, const DictUnit* ptValue) {
if (key.begin() == key.end()) { if (key.begin() == key.end()) {
return; return;

View File

@ -15,7 +15,6 @@ TEST(DictTrieTest, NewAndDelete) {
} }
TEST(DictTrieTest, Test1) { TEST(DictTrieTest, Test1) {
string s1, s2; string s1, s2;
DictTrie trie; DictTrie trie;
trie.init(DICT_FILE); trie.init(DICT_FILE);
@ -32,22 +31,23 @@ TEST(DictTrieTest, Test1) {
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
word = "清华大学"; word = "清华大学";
LocalVector<pair<size_t, const DictUnit*> > res, res2; LocalVector<pair<size_t, const DictUnit*> > res;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(TransCode::decode(words[i], uni)); ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
} }
//DictUnit
//res.push_back(make_pair(0, ))
vector<pair<size_t, const DictUnit*> > vec; vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); trie.find(uni.begin(), uni.end(), dags);
ASSERT_EQ(dags.size(), uni.size());
ASSERT_NE(dags.size(), 0u);
s1 << res; s1 << res;
s2 << res; s2 << dags[0].nexts;
ASSERT_EQ(s1, s2); ASSERT_EQ(s1, s2);
} }
TEST(DictTrieTest, UserDict) { TEST(DictTrieTest, UserDict) {
@ -62,12 +62,48 @@ TEST(DictTrieTest, UserDict) {
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res); ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
} }
TEST(DictTrieTest, automation) { TEST(DictTrieTest, Dag) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
//string word = "yasherhs";
string word = "abcderf"; {
Unicode unicode; string word = "清华大学";
ASSERT_TRUE(TransCode::decode(word, unicode)); Unicode unicode;
vector<struct Dag> res; ASSERT_TRUE(TransCode::decode(word, unicode));
trie.find(unicode.begin(), unicode.end(), res); vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 2, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
{
string word = "北京邮电大学";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
{
string word = "长江大桥";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 1, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
} }