mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
解决一些历史遗留问题
This commit is contained in:
parent
620d276887
commit
0e16e000ea
@ -66,9 +66,6 @@ class DictTrie {
|
|||||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
return trie_->find(begin, end);
|
return trie_->find(begin, end);
|
||||||
}
|
}
|
||||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, LocalVector<pair<size_t, const DictUnit*> >& nexts, size_t offset = 0) const {
|
|
||||||
return trie_->find(begin, end, nexts, offset);
|
|
||||||
}
|
|
||||||
void find(Unicode::const_iterator begin,
|
void find(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Dag>& res) const {
|
vector<Dag>& res) const {
|
||||||
|
@ -42,6 +42,23 @@ class FullSegment: public SegmentBase {
|
|||||||
//tmp variables
|
//tmp variables
|
||||||
int wordLen = 0;
|
int wordLen = 0;
|
||||||
assert(dictTrie_);
|
assert(dictTrie_);
|
||||||
|
vector<struct Dag> dags;
|
||||||
|
dictTrie_->find(begin, end, dags);
|
||||||
|
for (size_t i = 0; i < dags.size(); i++) {
|
||||||
|
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
|
||||||
|
const DictUnit* du = dags[i].nexts[j].second;
|
||||||
|
if (du == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
wordLen = du->word.size();
|
||||||
|
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
||||||
|
res.push_back(du->word);
|
||||||
|
}
|
||||||
|
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
||||||
|
}
|
||||||
|
uIdx++;
|
||||||
|
}
|
||||||
|
/*
|
||||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
||||||
//find word start from uItr
|
//find word start from uItr
|
||||||
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
||||||
@ -64,6 +81,7 @@ class FullSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
++uIdx;
|
++uIdx;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -48,13 +48,13 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
vector<Dag> segmentChars;
|
vector<Dag> dags;
|
||||||
|
|
||||||
dictTrie_->find(begin, end, segmentChars);
|
dictTrie_->find(begin, end, dags);
|
||||||
|
|
||||||
calcDP_(segmentChars);
|
calcDP_(dags);
|
||||||
|
|
||||||
cut_(segmentChars, res);
|
cut_(dags, res);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -63,12 +63,12 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void calcDP_(vector<Dag>& segmentChars) const {
|
void calcDP_(vector<Dag>& dags) const {
|
||||||
size_t nextPos;
|
size_t nextPos;
|
||||||
const DictUnit* p;
|
const DictUnit* p;
|
||||||
double val;
|
double val;
|
||||||
|
|
||||||
for(vector<Dag>::reverse_iterator rit = segmentChars.rbegin(); rit != segmentChars.rend(); rit++) {
|
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||||
rit->pInfo = NULL;
|
rit->pInfo = NULL;
|
||||||
rit->weight = MIN_DOUBLE;
|
rit->weight = MIN_DOUBLE;
|
||||||
assert(!rit->nexts.empty());
|
assert(!rit->nexts.empty());
|
||||||
@ -76,8 +76,8 @@ class MPSegment: public SegmentBase {
|
|||||||
nextPos = it->first;
|
nextPos = it->first;
|
||||||
p = it->second;
|
p = it->second;
|
||||||
val = 0.0;
|
val = 0.0;
|
||||||
if(nextPos + 1 < segmentChars.size()) {
|
if(nextPos + 1 < dags.size()) {
|
||||||
val += segmentChars[nextPos + 1].weight;
|
val += dags[nextPos + 1].weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(p) {
|
if(p) {
|
||||||
@ -92,16 +92,16 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void cut_(const vector<Dag>& segmentChars,
|
void cut_(const vector<Dag>& dags,
|
||||||
vector<Unicode>& res) const {
|
vector<Unicode>& res) const {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while(i < segmentChars.size()) {
|
while(i < dags.size()) {
|
||||||
const DictUnit* p = segmentChars[i].pInfo;
|
const DictUnit* p = dags[i].pInfo;
|
||||||
if(p) {
|
if(p) {
|
||||||
res.push_back(p->word);
|
res.push_back(p->word);
|
||||||
i += p->word.size();
|
i += p->word.size();
|
||||||
} else { //single chinese word
|
} else { //single chinese word
|
||||||
res.push_back(Unicode(1, segmentChars[i].uniCh));
|
res.push_back(Unicode(1, dags[i].uniCh));
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
38
src/Trie.hpp
38
src/Trie.hpp
@ -93,9 +93,9 @@ class Trie {
|
|||||||
Unicode::value_type ch = *(begin + i);
|
Unicode::value_type ch = *(begin + i);
|
||||||
ptNode = _base + ch;
|
ptNode = _base + ch;
|
||||||
res[i].uniCh = ch;
|
res[i].uniCh = ch;
|
||||||
assert(res[i].dag.empty());
|
assert(res[i].nexts.empty());
|
||||||
|
|
||||||
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue));
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||||
|
|
||||||
for (size_t j = i + 1; j < size_t(end - begin); j++) {
|
for (size_t j = i + 1; j < size_t(end - begin); j++) {
|
||||||
if (ptNode->next == NULL) {
|
if (ptNode->next == NULL) {
|
||||||
@ -107,43 +107,11 @@ class Trie {
|
|||||||
}
|
}
|
||||||
ptNode = citer->second;
|
ptNode = citer->second;
|
||||||
if (NULL != ptNode->ptValue) {
|
if (NULL != ptNode->ptValue) {
|
||||||
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bool find(
|
|
||||||
Unicode::const_iterator begin,
|
|
||||||
Unicode::const_iterator end,
|
|
||||||
LocalVector<pair<size_t, const DictUnit*> > & res,
|
|
||||||
size_t offset = 0) const {
|
|
||||||
if (begin == end) {
|
|
||||||
return !res.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
const TrieNode* ptNode = _base + (*(begin++));
|
|
||||||
if (ptNode->ptValue != NULL && res.size() == 1) {
|
|
||||||
res[0].second = ptNode->ptValue;
|
|
||||||
} else if (ptNode->ptValue != NULL) {
|
|
||||||
res.push_back(DagType::value_type(offset, ptNode->ptValue));
|
|
||||||
}
|
|
||||||
|
|
||||||
TrieNode::NextMap::const_iterator citer;
|
|
||||||
for (Unicode::const_iterator itr = begin; itr != end; itr++) {
|
|
||||||
if (NULL == ptNode->next) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
citer = ptNode->next->find(*itr);
|
|
||||||
if (citer == ptNode->next->end()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
ptNode = citer->second;
|
|
||||||
if (NULL != ptNode->ptValue) {
|
|
||||||
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return !res.empty();
|
|
||||||
}
|
|
||||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||||
if (key.begin() == key.end()) {
|
if (key.begin() == key.end()) {
|
||||||
return;
|
return;
|
||||||
|
@ -15,7 +15,6 @@ TEST(DictTrieTest, NewAndDelete) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(DictTrieTest, Test1) {
|
TEST(DictTrieTest, Test1) {
|
||||||
|
|
||||||
string s1, s2;
|
string s1, s2;
|
||||||
DictTrie trie;
|
DictTrie trie;
|
||||||
trie.init(DICT_FILE);
|
trie.init(DICT_FILE);
|
||||||
@ -32,22 +31,23 @@ TEST(DictTrieTest, Test1) {
|
|||||||
|
|
||||||
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
|
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
|
||||||
word = "清华大学";
|
word = "清华大学";
|
||||||
LocalVector<pair<size_t, const DictUnit*> > res, res2;
|
LocalVector<pair<size_t, const DictUnit*> > res;
|
||||||
const char * words[] = {"清", "清华", "清华大学"};
|
const char * words[] = {"清", "清华", "清华大学"};
|
||||||
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
|
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
|
||||||
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
||||||
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
||||||
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
||||||
}
|
}
|
||||||
//DictUnit
|
|
||||||
//res.push_back(make_pair(0, ))
|
|
||||||
|
|
||||||
vector<pair<size_t, const DictUnit*> > vec;
|
vector<pair<size_t, const DictUnit*> > vec;
|
||||||
|
vector<struct Dag> dags;
|
||||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
|
trie.find(uni.begin(), uni.end(), dags);
|
||||||
|
ASSERT_EQ(dags.size(), uni.size());
|
||||||
|
ASSERT_NE(dags.size(), 0u);
|
||||||
s1 << res;
|
s1 << res;
|
||||||
s2 << res;
|
s2 << dags[0].nexts;
|
||||||
ASSERT_EQ(s1, s2);
|
ASSERT_EQ(s1, s2);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DictTrieTest, UserDict) {
|
TEST(DictTrieTest, UserDict) {
|
||||||
@ -62,12 +62,48 @@ TEST(DictTrieTest, UserDict) {
|
|||||||
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
|
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DictTrieTest, automation) {
|
TEST(DictTrieTest, Dag) {
|
||||||
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
|
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
|
||||||
//string word = "yasherhs";
|
|
||||||
string word = "abcderf";
|
{
|
||||||
Unicode unicode;
|
string word = "清华大学";
|
||||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
Unicode unicode;
|
||||||
vector<struct Dag> res;
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
trie.find(unicode.begin(), unicode.end(), res);
|
vector<struct Dag> res;
|
||||||
|
trie.find(unicode.begin(), unicode.end(), res);
|
||||||
|
|
||||||
|
size_t nexts_sizes[] = {3, 2, 2, 1};
|
||||||
|
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||||
|
for (size_t i = 0; i < res.size(); i++) {
|
||||||
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
string word = "北京邮电大学";
|
||||||
|
Unicode unicode;
|
||||||
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
|
vector<struct Dag> res;
|
||||||
|
trie.find(unicode.begin(), unicode.end(), res);
|
||||||
|
|
||||||
|
size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
|
||||||
|
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||||
|
for (size_t i = 0; i < res.size(); i++) {
|
||||||
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
string word = "长江大桥";
|
||||||
|
Unicode unicode;
|
||||||
|
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||||
|
vector<struct Dag> res;
|
||||||
|
trie.find(unicode.begin(), unicode.end(), res);
|
||||||
|
|
||||||
|
size_t nexts_sizes[] = {3, 1, 2, 1};
|
||||||
|
ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
|
||||||
|
for (size_t i = 0; i < res.size(); i++) {
|
||||||
|
ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user