mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add Jieba::Locate function to get the word locations of a cut sentence
This commit is contained in:
parent
fb63e78ed2
commit
8dc01ae614
@ -1,5 +1,9 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
## next version
|
||||
|
||||
1. 新增 Jieba::Locate 函数接口,用于计算分词结果中各词语的位置信息,在某些场景下有用,比如搜索结果高亮之类的。
|
||||
|
||||
## v4.1.1
|
||||
|
||||
1. 在 class Jieba 中新增词性标注的接口函数 Jieba::Tag
|
||||
|
27
README.md
27
README.md
@ -50,27 +50,22 @@ make test
|
||||
结果示例:
|
||||
|
||||
```
|
||||
[demo] METHOD_MP
|
||||
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。
|
||||
|
||||
[demo] METHOD_HMM
|
||||
我/是/拖拉机/学院/手/扶/拖拉机/专业/的/。/不用/多久/,/我/就/会升/职加薪/,/当上/CEO/,/走上/人生/巅峰/。
|
||||
|
||||
[demo] METHOD_MIX
|
||||
[demo] Cut With HMM
|
||||
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
|
||||
|
||||
[demo] METHOD_FULL
|
||||
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。
|
||||
[demo] METHOD_QUERY
|
||||
[demo] Cut Without HMM
|
||||
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。
|
||||
[demo] CutAll
|
||||
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。
[demo] CutForSearch
|
||||
我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
|
||||
|
||||
[demo] Insert User Word
|
||||
男默/女泪
|
||||
男默女泪
|
||||
[demo] Locate Words
|
||||
南京市, 0, 3
|
||||
长江大桥, 3, 7
|
||||
[demo] TAGGING
|
||||
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
|
||||
["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
|
||||
|
||||
[demo] KEYWORD
|
||||
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
|
||||
["CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089", "巅峰:9.49396"]
|
||||
```
|
||||
|
||||
详细请看 `test/demo.cpp`.
|
||||
|
@ -23,6 +23,12 @@ class Jieba {
|
||||
// Destructor with an empty body: no explicit cleanup is performed here.
~Jieba() {
|
||||
}
|
||||
|
||||
// A word together with its position, as filled in by Jieba::Locate.
// Locate sets end = begin + (number of elements TransCode::Decode returns
// for the word), so [begin, end) is a half-open range counted in decoded
// units rather than bytes.
struct LocWord {
|
||||
// The word text, copied verbatim from the input word list.
string word;
|
||||
// Offset of the word's first decoded unit.
size_t begin;
|
||||
// Offset one past the word's last decoded unit.
size_t end;
|
||||
}; // struct LocWord
|
||||
|
||||
// Segments `sentence` into `words` by forwarding to mix_seg_.Cut.
// `hmm` is passed through unchanged and defaults to true.
// NOTE(review): presumably `hmm` toggles HMM-based segmentation (the README
// demo labels are "Cut With HMM" / "Cut Without HMM") - confirm in mix_seg_.
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
@ -44,6 +50,18 @@ class Jieba {
|
||||
// Segments `sentence` into `words` by forwarding to mp_seg_.Cut with
// `max_word_len` passed through unchanged.
// NOTE(review): presumably `max_word_len` caps the length of each produced
// word - confirm against the mp_seg_ implementation.
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
}
|
||||
void Locate(const vector<string>& words, vector<LocWord>& loc_words) const {
|
||||
loc_words.resize(words.size());
|
||||
size_t begin = 0;
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
size_t len = TransCode::Decode(words[i]).size();
|
||||
loc_words[i].word = words[i];
|
||||
loc_words[i].begin = begin;
|
||||
loc_words[i].end = loc_words[i].begin + len;
|
||||
begin = loc_words[i].end;
|
||||
}
|
||||
}
|
||||
|
||||
// Tags `sentence` by forwarding to pos_tagger_.Tag; results are written to
// `words` as pairs of strings.
// NOTE(review): presumably each pair is (word, part-of-speech tag), matching
// the "word:tag" entries in the README TAGGING demo - confirm in pos_tagger_.
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
||||
pos_tagger_.Tag(sentence, words);
|
||||
}
|
||||
|
@ -33,6 +33,17 @@ int main(int argc, char** argv) {
|
||||
jieba.Cut("男默女泪", words);
|
||||
cout << limonp::join(words.begin(), words.end(), "/") << endl;
|
||||
|
||||
cout << "[demo] Locate Words" << endl;
|
||||
vector<cppjieba::Jieba::LocWord> loc_words;
|
||||
jieba.Cut("南京市长江大桥", words, true);
|
||||
jieba.Locate(words, loc_words);
|
||||
for (size_t i = 0; i < loc_words.size(); i++) {
|
||||
cout << loc_words[i].word
|
||||
<< ", " << loc_words[i].begin
|
||||
<< ", " << loc_words[i].end
|
||||
<< endl;
|
||||
}
|
||||
|
||||
cout << "[demo] TAGGING" << endl;
|
||||
vector<pair<string, string> > tagres;
|
||||
jieba.Tag(s, tagres);
|
||||
|
@ -13,7 +13,7 @@ TEST(JiebaTest, Test1) {
|
||||
jieba.Cut("他来到了网易杭研大厦", words);
|
||||
result << words;
|
||||
ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
|
||||
|
||||
|
||||
jieba.Cut("我来自北京邮电大学。", words, false);
|
||||
result << words;
|
||||
ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
|
||||
@ -46,6 +46,17 @@ TEST(JiebaTest, Test1) {
|
||||
result << word_levels;
|
||||
ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);
|
||||
|
||||
vector<Jieba::LocWord> loc_words;
|
||||
jieba.Cut("南京市长江大桥", words);
|
||||
jieba.Locate(words, loc_words);
|
||||
ASSERT_EQ(loc_words.size(), 2u);
|
||||
ASSERT_EQ(loc_words[0].word, "南京市");
|
||||
ASSERT_EQ(loc_words[0].begin, 0u);
|
||||
ASSERT_EQ(loc_words[0].end, 3u);
|
||||
ASSERT_EQ(loc_words[1].word, "长江大桥");
|
||||
ASSERT_EQ(loc_words[1].begin, 3u);
|
||||
ASSERT_EQ(loc_words[1].end, 7u);
|
||||
|
||||
//vector<pair<string, string> > tagres;
|
||||
//jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
|
||||
//result << tagres;
|
||||
|
Loading…
x
Reference in New Issue
Block a user