diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index b526f7f..82add4b 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -48,12 +48,12 @@ class DictTrie { return true; } - const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } - void Find(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { trie_->Find(begin, end, res, max_word_len); @@ -124,7 +124,7 @@ class DictTrie { const string& word, double weight, const string& tag) { - if (!unicode::DecodeRunesInString(word, node_info.word)) { + if (!DecodeRunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 99cb8fa..2f101ee 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -29,7 +29,7 @@ class FullSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -37,11 +37,11 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, - vector& res) const { + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& res) const { //resut of searching in trie tree LocalVector > tRes; @@ -63,13 +63,13 @@ class FullSegment: public SegmentBase { const DictUnit* du = dags[i].nexts[j].second; if (du == NULL) { if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { - unicode::WordRange wr(begin + i, begin + nextoffset); + WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } else { wordLen = du->word.size(); if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { - unicode::WordRange wr(begin + i, begin + nextoffset); + WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index e22ffca..27e6b66 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!unicode::DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { + if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index c5dd7f8..84e69aa 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -27,7 +27,7 @@ class HMMSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -35,11 +35,11 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) const { - unicode::RuneStrArray::const_iterator left = begin; - unicode::RuneStrArray::const_iterator right = begin; + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right = begin; while (right != end) { if (right->rune < 0x80) { if (left != right) { @@ -57,7 +57,7 @@ class HMMSegment: public SegmentBase { } right ++; } while (false); - unicode::WordRange wr(left, right - 1); + WordRange wr(left, right - 1); res.push_back(wr); left = right; } else { @@ -70,7 +70,7 @@ class HMMSegment: public SegmentBase { } private: // sequential letters rule - unicode::RuneStrArray::const_iterator SequentialLetterRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { Rune x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { begin ++; @@ -88,7 +88,7 @@ class HMMSegment: public SegmentBase { return begin; } // - unicode::RuneStrArray::const_iterator NumbersRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { Rune x = begin->rune; if ('0' <= x && x <= '9') { begin ++; @@ -105,24 +105,24 @@ class HMMSegment: public SegmentBase { } return begin; } - void InternalCut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) const { + void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { vector status; Viterbi(begin, end, status); - unicode::RuneStrArray::const_iterator left = begin; - unicode::RuneStrArray::const_iterator right; + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right; for (size_t i = 0; i < status.size(); i++) { if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) right = begin + i + 1; - unicode::WordRange wr(left, right - 1); + WordRange wr(left, right - 1); res.push_back(wr); left = right; } } } - void Viterbi(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Viterbi(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector& status) const { size_t Y = HMMModel::STATUS_SUM; size_t X = end - begin; diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp index cb8e573..9b15634 100644 --- a/include/cppjieba/KeywordExtractor.hpp +++ b/include/cppjieba/KeywordExtractor.hpp @@ -69,7 +69,7 @@ class KeywordExtractor { for (size_t i = 0; i < words.size(); ++i) { size_t t = offset; offset += words[i].size(); - if (unicode::IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } wordmap[words[i]].offsets.push_back(t); diff --git a/include/cppjieba/LevelSegment.hpp b/include/cppjieba/LevelSegment.hpp index 9974098..9fa5909 100644 --- a/include/cppjieba/LevelSegment.hpp +++ b/include/cppjieba/LevelSegment.hpp @@ -17,8 +17,8 @@ class LevelSegment: public SegmentBase{ ~LevelSegment() { } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector >& res) const { res.clear(); vector words; @@ -50,7 +50,7 @@ class LevelSegment: public SegmentBase{ vector >& words) const { words.clear(); RuneStrArray unicode; - unicode::DecodeRunesInString(sentence, unicode); + DecodeRunesInString(sentence, unicode); vector > unicodeWords; Cut(unicode.begin(), unicode.end(), unicodeWords); words.resize(unicodeWords.size()); diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index a5b908b..b386ae9 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -30,7 +30,7 @@ class MPSegment: public SegmentBase { size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -38,11 +38,11 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, - vector& words, + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->Find(begin, @@ -90,20 +90,20 @@ class MPSegment: public SegmentBase { } } } - void CutByDag(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void CutByDag(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, const vector& dags, - vector& words) const { + vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { assert(p->word.size() >= 1); - unicode::WordRange wr(begin + i, begin + i + p->word.size() - 1); + WordRange wr(begin + i, begin + i + p->word.size() - 1); words.push_back(wr); i += p->word.size(); } else { //single chinese word - unicode::WordRange wr(begin + i, begin + i); + WordRange wr(begin + i, begin + i); words.push_back(wr); i++; } diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index 10cf0fa..e096815 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -23,7 +23,7 @@ class MixSegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size() / 2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -31,20 +31,20 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { if (!hmm) { mpSeg_.Cut(begin, end, res); return; } - vector words; + vector words; assert(end >= begin); words.reserve(end - begin); mpSeg_.Cut(begin, end, words); - vector hmmRes; + vector hmmRes; hmmRes.reserve(end - begin); for (size_t i = 0; i < words.size(); i++) { //if mp Get a word, it's ok, put it into result diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index dcda7c5..863c07b 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -30,11 +30,11 @@ class PosTagger { segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; - unicode::RuneStrArray runes; + RuneStrArray runes; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { - if (!unicode::DecodeRunesInString(*itr, runes)) { + if (!DecodeRunesInString(*itr, runes)) { XLOG(ERROR) << "Decode failed."; return false; } @@ -48,7 +48,7 @@ class PosTagger { return !res.empty(); } private: - const char* SpecialRule(const unicode::RuneStrArray& unicode) const { + const char* SpecialRule(const RuneStrArray& unicode) const { size_t m = 0; size_t eng = 0; for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index 2b27879..0d5b877 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -9,14 +9,14 @@ class PreFilter { public: //TODO use WordRange instead of Range struct Range { - unicode::RuneStrArray::const_iterator begin; - unicode::RuneStrArray::const_iterator end; + RuneStrArray::const_iterator begin; + RuneStrArray::const_iterator end; }; // struct Range PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - unicode::DecodeRunesInString(sentence, sentence_); + DecodeRunesInString(sentence, sentence_); cursor_ = sentence_.begin(); } ~PreFilter() { @@ -41,8 +41,8 @@ class PreFilter { return range; } private: - unicode::RuneStrArray::const_iterator cursor_; - unicode::RuneStrArray sentence_; + RuneStrArray::const_iterator cursor_; + RuneStrArray sentence_; const unordered_set& symbols_; }; // class PreFilter diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 0c9b73b..15a684e 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -29,7 +29,7 @@ class QuerySegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -37,19 +37,19 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first - vector mixRes; + vector mixRes; mixSeg_.Cut(begin, end, mixRes, hmm); - vector fullRes; - for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { + vector fullRes; + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, Cut with fullSeg_, put fullRes in res if (mixResItr->Length() > maxWordLen_ && !mixResItr->IsAllAscii()) { fullSeg_.Cut(mixResItr->left, mixResItr->right + 1, fullRes); - for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { + for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } diff --git a/include/cppjieba/Trie.hpp b/include/cppjieba/Trie.hpp index 41d67ef..fcd5e32 100644 --- a/include/cppjieba/Trie.hpp +++ b/include/cppjieba/Trie.hpp @@ -9,15 +9,11 @@ namespace cppjieba { using namespace std; -using unicode::Rune; -using unicode::RuneStr; -using unicode::Unicode; -using unicode::WordRange; const size_t MAX_WORD_LENGTH = 512; struct DictUnit { - unicode::Unicode word; + Unicode word; double weight; string tag; }; // struct DictUnit @@ -62,14 +58,14 @@ class Trie { DeleteNode(root_); } - const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { if (begin == end) { return NULL; } const TrieNode* ptNode = root_; TrieNode::NextMap::const_iterator citer; - for (unicode::RuneStrArray::const_iterator it = begin; it != end; it++) { + for (RuneStrArray::const_iterator it = begin; it != end; it++) { if (NULL == ptNode->next) { return NULL; } @@ -82,8 +78,8 @@ class Trie { return ptNode->ptValue; } - void Find(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { assert(root_ != NULL); diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 4fe527d..113cb22 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -8,7 +8,6 @@ #include "limonp/LocalVector.hpp" namespace cppjieba { -namespace unicode { typedef uint32_t Rune; @@ -156,7 +155,7 @@ inline Unicode DecodeRunesInString(const std::string& s) { //[left, right] -inline std::string GetStringFromRunes(unicode::RuneStrArray::const_iterator left, unicode::RuneStrArray::const_iterator right) { +inline std::string GetStringFromRunes(RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->str >= left->str); return std::string(left->str, right->str - left->str + right->len); } @@ -173,7 +172,6 @@ inline std::vector GetStringsFromWordRanges(const std::vector keys; vector values; - keys.push_back(unicode::DecodeRunesInString("你")); + keys.push_back(DecodeRunesInString("你")); values.push_back((const DictUnit*)(NULL)); Trie trie(keys, values); } @@ -31,8 +31,8 @@ TEST(DictTrieTest, Test1) { DictTrie trie(DICT_FILE); ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); string word("来到"); - cppjieba::unicode::RuneStrArray uni; - ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); + cppjieba::RuneStrArray uni; + ASSERT_TRUE(DecodeRunesInString(word, uni)); //DictUnit nodeInfo; //nodeInfo.word = uni; //nodeInfo.tag = "v"; @@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) { LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { - ASSERT_TRUE(unicode::DecodeRunesInString(words[i], uni)); + ASSERT_TRUE(DecodeRunesInString(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); } vector > vec; vector dags; - ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeRunesInString(word, uni)); trie.Find(uni.begin(), uni.end(), dags); ASSERT_EQ(dags.size(), uni.size()); ASSERT_NE(dags.size(), 0u); @@ -71,8 +71,8 @@ TEST(DictTrieTest, Test1) { TEST(DictTrieTest, UserDict) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -14.100, 0.001); @@ -81,8 +81,8 @@ TEST(DictTrieTest, UserDict) { TEST(DictTrieTest, UserDictWithMaxWeight) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); string word = "云计算"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -2.975, 0.001); @@ -93,8 +93,8 @@ TEST(DictTrieTest, Dag) { { string word = "清华大学"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -107,8 +107,8 @@ TEST(DictTrieTest, Dag) { { string word = "北京邮电大学"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -121,8 +121,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -135,8 +135,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 3); @@ -149,8 +149,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 4);