Merge pull request #72 from t-k-/master

增加 LookupTag 函数来对单个的 token 进行 tag 查询
This commit is contained in:
Yanyi Wu 2016-07-07 11:15:59 +08:00 committed by GitHub
commit 8b75bf14a3
4 changed files with 30 additions and 7 deletions

View File

@ -62,6 +62,9 @@ class Jieba {
// Hands `sentence` and the caller-owned `words` vector straight to
// mix_seg_.Tag. Anything that call returns is intentionally dropped, since
// this wrapper is declared void; results are delivered through `words`.
void Tag(const string& sentence,
         vector<pair<string, string> >& words) const {
  static_cast<void>(mix_seg_.Tag(sentence, words));
}
// Returns the tag that mix_seg_.LookupTag reports for the single token `str`.
// No segmentation is performed at this level; the token is forwarded as-is.
string LookupTag(const string &str) const {
  string tag = mix_seg_.LookupTag(str);
  return tag;
}
// Registers `word` with `tag` (UNKNOWN_TAG when the caller omits it) by
// delegating to dict_trie_.InsertUserWord, and relays that call's result.
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
  const bool inserted = dict_trie_.InsertUserWord(word, tag);
  return inserted;
}

View File

@ -93,6 +93,10 @@ class MixSegment: public SegmentTagged {
return tagger_.Tag(src, res, *this);
}
// Asks tagger_ for the tag of the single token `str`, passing this object
// as the segment argument of the lookup.
string LookupTag(const string &str) const {
  string tag = tagger_.LookupTag(str, *this);
  return tag;
}
private:
MPSegment mpSeg_;
HMMSegment hmmSeg_;

View File

@ -23,24 +23,29 @@ class PosTagger {
vector<string> CutRes;
segment.Cut(src, CutRes);
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
}
return !res.empty();
}
// Looks up the tag of one already-isolated token.
//
// `str` is decoded into runes and searched for in the DictTrie supplied by
// `segment.GetDictTrie()` (asserted to be non-NULL).
//
// Returns:
//   - POS_X when `str` cannot be decoded (an error is logged as well);
//   - the dictionary entry's tag when an entry is found and its tag is
//     non-empty;
//   - the result of SpecialRule(runes) otherwise.
//
// Unlike the sentence-level Tag(), nothing is cut here and no output vector
// is filled: exactly one string is returned per call.
string LookupTag(const string &str, const SegmentTagged& segment) const {
  const DictTrie* dict = segment.GetDictTrie();
  assert(dict != NULL);

  RuneStrArray runes;
  if (!DecodeRunesInString(str, runes)) {
    XLOG(ERROR) << "Decode failed.";
    return POS_X;
  }

  const DictUnit* unit = dict->Find(runes.begin(), runes.end());
  if (unit == NULL || unit->tag.empty()) {
    // No usable dictionary tag: fall back to the rune-based rule.
    return SpecialRule(runes);
  }
  return unit->tag;
}
private:
const char* SpecialRule(const RuneStrArray& unicode) const {
size_t m = 0;

View File

@ -51,6 +51,17 @@ int main(int argc, char** argv) {
jieba.CutForSearch(s, jiebawords, true);
cout << jiebawords << endl;
cout << "[demo] Lookup Tag for Single Token" << endl;
const int DemoTokenMaxLen = 32;
char DemoTokens[][DemoTokenMaxLen] = {"拖拉机", "CEO", "123", ""};
vector<pair<string, string> > LookupTagres(sizeof(DemoTokens) / DemoTokenMaxLen);
vector<pair<string, string> >::iterator it;
for (it = LookupTagres.begin(); it != LookupTagres.end(); it++) {
it->first = DemoTokens[it - LookupTagres.begin()];
it->second = jieba.LookupTag(it->first);
}
cout << LookupTagres << endl;
cout << "[demo] Tagging" << endl;
vector<pair<string, string> > tagres;
s = "我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。";