Add Jieba::Locate function to get the word locations of a cut sentence

This commit is contained in:
yanyiwu 2015-12-02 01:19:23 +08:00
parent fb63e78ed2
commit 8dc01ae614
5 changed files with 56 additions and 17 deletions

View File

@ -1,5 +1,9 @@
# CppJieba ChangeLog
## next version
1. 新增 Jieba::Locate 函数接口,用于计算分词结果中各个词语的位置信息,在某些场景下有用,比如搜索结果高亮之类的。
## v4.1.1
1. 在 class Jieba 中新增词性标注的接口函数 Jieba::Tag

View File

@ -50,27 +50,22 @@ make test
结果示例:
```
[demo] METHOD_MP
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久//我/就/会/升职/加薪//当/上/C/E/O//走上/人生/巅峰/。
[demo] METHOD_HMM
我/是/拖拉机/学院/手/扶/拖拉机/专业/的/。/不用/多久//我/就/会升/职加薪//当上/CEO//走上/人生/巅峰/。
[demo] METHOD_MIX
[demo] Cut With HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久//我/就/会/升职/加薪//当上/CEO//走上/人生/巅峰/。
[demo] METHOD_FULL
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久//我/就/会升/升职/加薪//当上/C/E/O//走上/人生/巅峰/。
[demo] METHOD_QUERY
[demo] Cut Without HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久//我/就/会/升职/加薪//当/上/C/E/O//走上/人生/巅峰/。
[demo] CutAll
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久//我/就/会升/升职/加薪//当上/C/E/O//走上/人生/巅峰/。[demo] CutForSearch
我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久//我/就/会/升职/加薪//当上/CEO//走上/人生/巅峰/。
[demo] Insert User Word
男默/女泪
男默女泪
[demo] Locate Words
南京市, 0, 3
长江大桥, 3, 7
[demo] TAGGING
我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。
["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ":x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ":x", "当上:t", "CEO:eng", ":x", "走上:v", "人生:n", "巅峰:n", "。:x"]
[demo] KEYWORD
我是拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上CEO走上人生巅峰。
["CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089", "巅峰:9.49396"]
```
详细请看 `test/demo.cpp`.

View File

@ -23,6 +23,12 @@ class Jieba {
// Destructor with an empty body: no explicit cleanup is performed here.
~Jieba() {
}
// A word together with its position, as filled in by Jieba::Locate.
// Positions are counted in the units returned by TransCode::Decode
// (Locate derives each word's length from the size of its decoded form).
struct LocWord {
string word; // the word text, copied from the input word list
size_t begin; // position at which this word starts
size_t end; // position just past this word (begin + decoded length)
}; // struct LocWord
// Cuts `sentence` by delegating to mix_seg_.Cut, passing `words` and `hmm`
// through unchanged. `hmm` defaults to true.
// NOTE(review): `words` is presumably the output list of segmented words and
// `hmm` presumably toggles HMM-based segmentation -- confirm in mix_seg_.
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
@ -44,6 +50,18 @@ class Jieba {
// Cuts `sentence` by delegating to mp_seg_.Cut, forwarding `words` and
// `max_word_len` unchanged.
// NOTE(review): `max_word_len` presumably caps the length of each produced
// word -- confirm in the mp_seg_ implementation.
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
}
void Locate(const vector<string>& words, vector<LocWord>& loc_words) const {
loc_words.resize(words.size());
size_t begin = 0;
for (size_t i = 0; i < words.size(); i++) {
size_t len = TransCode::Decode(words[i]).size();
loc_words[i].word = words[i];
loc_words[i].begin = begin;
loc_words[i].end = loc_words[i].begin + len;
begin = loc_words[i].end;
}
}
// Part-of-speech tagging entry point: delegates to pos_tagger_.Tag, passing
// `sentence` and `words` through unchanged.
// NOTE(review): each pair in `words` is presumably (word, tag) -- confirm
// against the pos_tagger_ implementation.
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
pos_tagger_.Tag(sentence, words);
}

View File

@ -33,6 +33,17 @@ int main(int argc, char** argv) {
jieba.Cut("男默女泪", words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Locate Words" << endl;
vector<cppjieba::Jieba::LocWord> loc_words;
jieba.Cut("南京市长江大桥", words, true);
jieba.Locate(words, loc_words);
for (size_t i = 0; i < loc_words.size(); i++) {
cout << loc_words[i].word
<< ", " << loc_words[i].begin
<< ", " << loc_words[i].end
<< endl;
}
cout << "[demo] TAGGING" << endl;
vector<pair<string, string> > tagres;
jieba.Tag(s, tagres);

View File

@ -13,7 +13,7 @@ TEST(JiebaTest, Test1) {
jieba.Cut("他来到了网易杭研大厦", words);
result << words;
ASSERT_EQ("[\"\", \"来到\", \"\", \"网易\", \"杭研\", \"大厦\"]", result);
jieba.Cut("我来自北京邮电大学。", words, false);
result << words;
ASSERT_EQ("[\"\", \"来自\", \"北京邮电大学\", \"\"]", result);
@ -46,6 +46,17 @@ TEST(JiebaTest, Test1) {
result << word_levels;
ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);
vector<Jieba::LocWord> loc_words;
jieba.Cut("南京市长江大桥", words);
jieba.Locate(words, loc_words);
ASSERT_EQ(loc_words.size(), 2u);
ASSERT_EQ(loc_words[0].word, "南京市");
ASSERT_EQ(loc_words[0].begin, 0u);
ASSERT_EQ(loc_words[0].end, 3u);
ASSERT_EQ(loc_words[1].word, "长江大桥");
ASSERT_EQ(loc_words[1].begin, 3u);
ASSERT_EQ(loc_words[1].end, 7u);
//vector<pair<string, string> > tagres;
//jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
//result << tagres;