From 5775a40beedfddd5ae3d52b236c8698ecd6c38fb Mon Sep 17 00:00:00 2001
From: t-k-
Date: Wed, 6 Jul 2016 02:44:56 -0600
Subject: [PATCH] Add LookupTag function for single token tag lookup.

---
 include/cppjieba/Jieba.hpp      |  3 +++
 include/cppjieba/MixSegment.hpp |  4 ++++
 include/cppjieba/PosTagger.hpp  | 19 ++++++++++++-------
 test/demo.cpp                   |  7 +++++++
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp
index 76f12b6..9c87ce9 100644
--- a/include/cppjieba/Jieba.hpp
+++ b/include/cppjieba/Jieba.hpp
@@ -62,6 +62,9 @@ class Jieba {
   void Tag(const string& sentence, vector<pair<string, string> >& words) const {
     mix_seg_.Tag(sentence, words);
   }
+  string LookupTag(const string &str) const {
+    return mix_seg_.LookupTag(str);
+  }
   bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
     return dict_trie_.InsertUserWord(word, tag);
   }
diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp
index 3e18b73..8fd24e9 100644
--- a/include/cppjieba/MixSegment.hpp
+++ b/include/cppjieba/MixSegment.hpp
@@ -93,6 +93,10 @@ class MixSegment: public SegmentTagged {
     return tagger_.Tag(src, res, *this);
   }
 
+  string LookupTag(const string &str) const {
+    return tagger_.LookupTag(str, *this);
+  }
+
  private:
   MPSegment mpSeg_;
   HMMSegment hmmSeg_;
diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp
index 7113297..78853d5 100644
--- a/include/cppjieba/PosTagger.hpp
+++ b/include/cppjieba/PosTagger.hpp
@@ -23,24 +23,29 @@ class PosTagger {
     vector<string> CutRes;
     segment.Cut(src, CutRes);
 
+    for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
+      res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
+    }
+    return !res.empty();
+  }
+
+  string LookupTag(const string &str, const SegmentTagged& segment) const {
     const DictUnit *tmp = NULL;
     RuneStrArray runes;
     const DictTrie * dict = segment.GetDictTrie();
     assert(dict != NULL);
-    for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
-      if (!DecodeRunesInString(*itr, runes)) {
+      if (!DecodeRunesInString(str, runes)) {
         XLOG(ERROR) << "Decode failed.";
-        return false;
+        return POS_X;
       }
       tmp = dict->Find(runes.begin(), runes.end());
       if (tmp == NULL || tmp->tag.empty()) {
-        res.push_back(make_pair(*itr, SpecialRule(runes)));
+        return SpecialRule(runes);
       } else {
-        res.push_back(make_pair(*itr, tmp->tag));
+        return tmp->tag;
       }
-    }
-    return !res.empty();
   }
+
  private:
   const char* SpecialRule(const RuneStrArray& unicode) const {
     size_t m = 0;
diff --git a/test/demo.cpp b/test/demo.cpp
index dddd7fc..bae5ef3 100644
--- a/test/demo.cpp
+++ b/test/demo.cpp
@@ -51,6 +51,13 @@ int main(int argc, char** argv) {
   jieba.CutForSearch(s, jiebawords, true);
   cout << jiebawords << endl;
 
+  cout << "[demo] Lookup Tag for Single Token" << endl;
+  vector<pair<string, string> > LookupTagres = {{"拖拉机", ""}, {"CEO", ""}, {".",""}};
+  LookupTagres[0].second = jieba.LookupTag(LookupTagres[0].first);
+  LookupTagres[1].second = jieba.LookupTag(LookupTagres[1].first);
+  LookupTagres[2].second = jieba.LookupTag(LookupTagres[2].first);
+  cout << LookupTagres << endl;;
+
   cout << "[demo] Tagging" << endl;
   vector<pair<string, string> > tagres;
   s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";