解决一些历史遗留问题

This commit is contained in:
yanyiwu 2015-07-21 14:32:05 +08:00
parent 620d276887
commit 0e16e000ea
5 changed files with 83 additions and 64 deletions

View File

@ -66,9 +66,6 @@ class DictTrie {
// Looks up the sequence [begin, end) by forwarding to the underlying trie.
// Returns whatever trie_->find(begin, end) returns; presumably NULL when the
// sequence is not a dictionary entry -- confirm against Trie::find.
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
return trie_->find(begin, end);
}
// Forwards to the trie_->find overload that collects (index, DictUnit*) pairs
// into nexts. offset is passed through unchanged (default 0), and the bool
// result is the one returned by trie_->find.
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, LocalVector<pair<size_t, const DictUnit*> >& nexts, size_t offset = 0) const {
return trie_->find(begin, end, nexts, offset);
}
void find(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Dag>& res) const {

View File

@ -42,6 +42,23 @@ class FullSegment: public SegmentBase {
//tmp variables
int wordLen = 0;
assert(dictTrie_);
vector<struct Dag> dags;
dictTrie_->find(begin, end, dags);
for (size_t i = 0; i < dags.size(); i++) {
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
const DictUnit* du = dags[i].nexts[j].second;
if (du == NULL) {
continue;
}
wordLen = du->word.size();
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
res.push_back(du->word);
}
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
}
uIdx++;
}
/*
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr
if (dictTrie_->find(uItr, end, tRes, 0)) {
@ -64,6 +81,7 @@ class FullSegment: public SegmentBase {
}
++uIdx;
}
*/
return true;
}

View File

@ -48,13 +48,13 @@ class MPSegment: public SegmentBase {
}
// Segments [begin, end) into words and appends them to res.
// Pipeline: dictTrie_->find fills a vector<Dag> for the range, calcDP_ scores
// it, and cut_ emits the chosen words into res. Always returns true.
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Dag> segmentChars;
vector<Dag> dags;
dictTrie_->find(begin, end, segmentChars);
dictTrie_->find(begin, end, dags);
calcDP_(segmentChars);
calcDP_(dags);
cut_(segmentChars, res);
cut_(dags, res);
return true;
}
@ -63,12 +63,12 @@ class MPSegment: public SegmentBase {
}
private:
void calcDP_(vector<Dag>& segmentChars) const {
void calcDP_(vector<Dag>& dags) const {
size_t nextPos;
const DictUnit* p;
double val;
for(vector<Dag>::reverse_iterator rit = segmentChars.rbegin(); rit != segmentChars.rend(); rit++) {
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
rit->pInfo = NULL;
rit->weight = MIN_DOUBLE;
assert(!rit->nexts.empty());
@ -76,8 +76,8 @@ class MPSegment: public SegmentBase {
nextPos = it->first;
p = it->second;
val = 0.0;
if(nextPos + 1 < segmentChars.size()) {
val += segmentChars[nextPos + 1].weight;
if(nextPos + 1 < dags.size()) {
val += dags[nextPos + 1].weight;
}
if(p) {
@ -92,16 +92,16 @@ class MPSegment: public SegmentBase {
}
}
}
void cut_(const vector<Dag>& segmentChars,
void cut_(const vector<Dag>& dags,
vector<Unicode>& res) const {
size_t i = 0;
while(i < segmentChars.size()) {
const DictUnit* p = segmentChars[i].pInfo;
while(i < dags.size()) {
const DictUnit* p = dags[i].pInfo;
if(p) {
res.push_back(p->word);
i += p->word.size();
} else { //single chinese word
res.push_back(Unicode(1, segmentChars[i].uniCh));
res.push_back(Unicode(1, dags[i].uniCh));
i++;
}
}

View File

@ -93,9 +93,9 @@ class Trie {
Unicode::value_type ch = *(begin + i);
ptNode = _base + ch;
res[i].uniCh = ch;
assert(res[i].dag.empty());
assert(res[i].nexts.empty());
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue));
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
for (size_t j = i + 1; j < size_t(end - begin); j++) {
if (ptNode->next == NULL) {
@ -107,43 +107,11 @@ class Trie {
}
ptNode = citer->second;
if (NULL != ptNode->ptValue) {
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
}
}
}
}
// Walks the trie along [begin, end) and records every node on that path that
// carries a value, as (index, DictUnit*) pairs in res.
//   begin, end - input range to match
//   res        - output container; it is not cleared here, entries are appended
//                (or res[0] is overwritten, see below)
//   offset     - added to the indices stored in res (default 0)
// Returns true iff res is non-empty when the function exits.
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
LocalVector<pair<size_t, const DictUnit*> > & res,
size_t offset = 0) const {
// Empty input: nothing to match, so the result reflects what res already held.
if (begin == end) {
return !res.empty();
}
// The first character is resolved by offsetting from _base rather than via a
// next-map lookup; begin is advanced past that character here.
const TrieNode* ptNode = _base + (*(begin++));
// When res already holds exactly one entry, its DictUnit slot is overwritten
// instead of appending; otherwise a new (offset, value) pair is appended.
if (ptNode->ptValue != NULL && res.size() == 1) {
res[0].second = ptNode->ptValue;
} else if (ptNode->ptValue != NULL) {
res.push_back(DagType::value_type(offset, ptNode->ptValue));
}
TrieNode::NextMap::const_iterator citer;
// Follow the remaining characters through the child maps, stopping at the
// first node without children or the first character without a matching child.
for (Unicode::const_iterator itr = begin; itr != end; itr++) {
if (NULL == ptNode->next) {
break;
}
citer = ptNode->next->find(*itr);
if (citer == ptNode->next->end()) {
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue) {
// NOTE(review): begin was already advanced past the first character above,
// so (itr - begin) is 0 for the second input character; the stored index is
// relative to that position, not to the original begin -- confirm intended.
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
}
}
return !res.empty();
}
void insertNode(const Unicode& key, const DictUnit* ptValue) {
if (key.begin() == key.end()) {
return;

View File

@ -15,7 +15,6 @@ TEST(DictTrieTest, NewAndDelete) {
}
TEST(DictTrieTest, Test1) {
string s1, s2;
DictTrie trie;
trie.init(DICT_FILE);
@ -32,22 +31,23 @@ TEST(DictTrieTest, Test1) {
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
word = "清华大学";
LocalVector<pair<size_t, const DictUnit*> > res, res2;
LocalVector<pair<size_t, const DictUnit*> > res;
const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
}
//DictUnit
//res.push_back(make_pair(0, ))
vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags;
ASSERT_TRUE(TransCode::decode(word, uni));
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
trie.find(uni.begin(), uni.end(), dags);
ASSERT_EQ(dags.size(), uni.size());
ASSERT_NE(dags.size(), 0u);
s1 << res;
s2 << res;
s2 << dags[0].nexts;
ASSERT_EQ(s1, s2);
}
TEST(DictTrieTest, UserDict) {
@ -62,12 +62,48 @@ TEST(DictTrieTest, UserDict) {
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
}
TEST(DictTrieTest, automation) {
TEST(DictTrieTest, Dag) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
//string word = "yasherhs";
string word = "abcderf";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
{
string word = "清华大学";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 2, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
{
string word = "北京邮电大学";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
{
string word = "长江大桥";
Unicode unicode;
ASSERT_TRUE(TransCode::decode(word, unicode));
vector<struct Dag> res;
trie.find(unicode.begin(), unicode.end(), res);
size_t nexts_sizes[] = {3, 1, 2, 1};
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
for (size_t i = 0; i < res.size(); i++) {
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
}
}
}