mirror of https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00

commit 5c739484ae: merge the latest codes in master branch, and update unittest cases to pass ci
ChangeLog.md | 18
@@ -1,5 +1,22 @@
# CppJieba ChangeLog

## v4.8.0

+ rewrite QuerySegment, make `Jieba::CutForSearch` behave the same as the [jieba] `cut_for_search` API
+ remove Jieba::SetQuerySegmentThreshold

## v4.7.0

api changes:

+ override Cut functions, add location information into Word results;
+ remove LevelSegment;
+ remove Jieba::Locate;

upgrade:

+ limonp -> v0.6.1

## v4.6.0

+ Change Jieba::Locate(deprecated) to be static function.

@@ -204,3 +221,4 @@
[husky]:http://github.com/yanyiwu/husky.git
[issue50]:https://github.com/yanyiwu/cppjieba/issues/50
[qinwf]:https://github.com/yanyiwu/cppjieba/pull/53#issuecomment-176264929
[jieba]:https://github.com/fxsjy/jieba
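For reference, the rewritten `Jieba::CutForSearch` can be exercised as below — a minimal sketch; the dictionary paths are assumptions about your local checkout, following the convention in `test/demo.cpp`:

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  // Paths are assumptions; adjust to your checkout, as in test/demo.cpp.
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<cppjieba::Word> words;
  // Like jieba's cut_for_search, enclosed dictionary words are emitted too:
  // 清华/清华大学/华大/大学 all appear for 清华大学.
  jieba.CutForSearch("我来到北京清华大学", words);
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i].word << " @ byte " << words[i].offset << "\n";
  }
  return 0;
}
```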
README.md | 33
@@ -5,6 +5,7 @@
[](https://github.com/yanyiwu/cppjieba)
[](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
[](http://yanyiwu.mit-license.org)
[](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)

[](https://github.com/yanyiwu/cppjieba)

@@ -54,28 +55,26 @@ make test

```
[demo] Cut With HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
他/来到/了/网易/杭研/大厦
[demo] Cut Without HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。
他/来到/了/网易/杭/研/大厦
我来到北京清华大学
[demo] CutAll
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。
[demo] CutForSearch
我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
我/来到/北京/清华/清华大学/华大/大学
小明硕士毕业于中国科学院计算所,后在日本京都大学深造
[demo] CutForSearch
小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
[demo] Insert User Word
男默/女泪
男默女泪
[demo] Locate Words
南京市, 0, 3
长江大桥, 3, 7
[demo] TAGGING
[demo] CutForSearch Word With Offset
[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
[demo] Tagging
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
[demo] KEYWORD
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
CEO|11.7392
升职|10.8562
加薪|10.6426
手扶拖拉机|10.0089
巅峰|9.49396
[我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
[demo] Keyword Extraction
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
[{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
```

See `test/demo.cpp` for details.

@@ -228,6 +227,7 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
+ [ngx_http_cppjieba_module] Nginx word-segmentation plugin.
+ [cppjiebapy] A project by [jannson] that lets Python modules call cppjieba; related discussion at [cppjiebapy_discussion].
+ [KeywordServer] A Chinese keyword-extraction service built in 50 lines.
+ [cppjieba-server] CppJieba HTTP server.

## Online Demo

@@ -278,6 +278,7 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
[Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
[pg_jieba]:https://github.com/jaiminpan/pg_jieba
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server

[](https://bitdeli.com/free "Bitdeli Badge")
README_EN.md | 26
@@ -5,6 +5,7 @@
[](https://github.com/yanyiwu/cppjieba)
[](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
[](http://yanyiwu.mit-license.org)
[](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)

[](https://github.com/yanyiwu/cppjieba)

@@ -46,21 +47,26 @@ Output:

```
[demo] Cut With HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
他/来到/了/网易/杭研/大厦
[demo] Cut Without HMM
我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。
他/来到/了/网易/杭/研/大厦
我来到北京清华大学
[demo] CutAll
我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。
[demo] CutForSearch
我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。
我/来到/北京/清华/清华大学/华大/大学
小明硕士毕业于中国科学院计算所,后在日本京都大学深造
[demo] CutForSearch
小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
[demo] Insert User Word
男默/女泪
男默女泪
[demo] Locate Words
南京市, 0, 3
长江大桥, 3, 7
[demo] TAGGING
[demo] CutForSearch Word With Offset
[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
[demo] Tagging
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
[我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
[demo] Keyword Extraction
我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
[{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
```

Please see details in `test/demo.cpp`.

@@ -79,6 +85,7 @@ Please see details in `test/demo.cpp`.
+ [pg_jieba]
+ [ngx_http_cppjieba_module]
+ [gitbook-plugin-search-pro]
+ [cppjieba-server]

## Contact

@@ -101,3 +108,4 @@ Please see details in `test/demo.cpp`.
[SqlJieba]:https://github.com/yanyiwu/sqljieba
[pg_jieba]:https://github.com/jaiminpan/pg_jieba
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
appveyor.yml | 32 (new file)
@@ -0,0 +1,32 @@
os: Visual Studio 2015

platform: x64

# clone directory
clone_folder: c:\projects\cppjieba

# scripts to run before build
before_build:
  - echo Running cmake...
  - cd c:\projects\cppjieba
  - cmake .

build:
  project: ALL_BUILD.vcxproj # path to Visual Studio solution or project

# scripts to run after build
after_build:
  - cd Debug
  - demo.exe
  - load_test.exe
  - cd ..
  - COPY .\test\Debug\test.run.exe .\test\test.run.exe
  - cd test
  - test.run.exe
  - cd ..
  - 7z a c:\projects\all.zip * -tzip
  - cd c:\projects

artifacts:
  - path: all.zip
    name: all.zip
deps/gtest/CMakeLists.txt | 4 (vendored)
@@ -1,3 +1,5 @@
 INCLUDE_DIRECTORIES(./ include)
 ADD_LIBRARY(gtest STATIC src/gtest-all.cc)
-TARGET_LINK_LIBRARIES(gtest pthread)
+if(NOT MSVC)
+  TARGET_LINK_LIBRARIES(gtest pthread)
+endif()
deps/limonp/StdExtension.hpp | 13 (vendored)
@@ -35,6 +35,19 @@ namespace std {

+template<typename T>
+ostream& operator << (ostream& os, const vector<T>& v) {
+  if(v.empty()) {
+    return os << "[]";
+  }
+  os<<"["<<v[0];
+  for(size_t i = 1; i < v.size(); i++) {
+    os<<", "<<v[i];
+  }
+  os<<"]";
+  return os;
+}
+
 template<>
 inline ostream& operator << (ostream& os, const vector<string>& v) {
   if(v.empty()) {
     return os << "[]";
   }
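These vendored overloads are what make `result << words` in the unit tests work: any `vector` streams as a bracketed, comma-separated list (the `vector<string>` specialization, truncated above, additionally quotes each element, judging by the tests' expected strings). A minimal sketch:

```cpp
#include <iostream>
#include <string>
#include <vector>
#include "limonp/StdExtension.hpp"

int main() {
  std::vector<int> nums;
  nums.push_back(1);
  nums.push_back(2);
  std::cout << nums << "\n";   // [1, 2] via the generic overload

  std::vector<std::string> strs;
  strs.push_back("hello");
  strs.push_back("world");
  std::cout << strs << "\n";   // ["hello", "world"] via the specialization
  return 0;
}
```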
include/cppjieba/DictTrie.hpp

@@ -10,7 +10,7 @@
 #include <limits>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
-#include "TransCode.hpp"
+#include "Unicode.hpp"
 #include "Trie.hpp"

 namespace cppjieba {

@@ -48,12 +48,12 @@ class DictTrie {
     return true;
   }

-  const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
     return trie_->Find(begin, end);
   }

-  void Find(Unicode::const_iterator begin,
-            Unicode::const_iterator end,
+  void Find(RuneStrArray::const_iterator begin,
+            RuneStrArray::const_iterator end,
             vector<struct Dag>&res,
             size_t max_word_len = MAX_WORD_LENGTH) const {
     trie_->Find(begin, end, res, max_word_len);

@@ -124,7 +124,7 @@ class DictTrie {
                const string& word,
                double weight,
                const string& tag) {
-    if (!TransCode::Decode(word, node_info.word)) {
+    if (!DecodeRunesInString(word, node_info.word)) {
       XLOG(ERROR) << "Decode " << word << " failed.";
       return false;
     }
include/cppjieba/FullSegment.hpp

@@ -7,7 +7,7 @@
 #include "limonp/Logging.hpp"
 #include "DictTrie.hpp"
 #include "SegmentBase.hpp"
-#include "TransCode.hpp"
+#include "Unicode.hpp"

 namespace cppjieba {
 class FullSegment: public SegmentBase {
@@ -27,19 +27,27 @@ class FullSegment: public SegmentBase {
   }
   void Cut(const string& sentence,
            vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence,
+           vector<Word>& words) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
-    vector<Unicode> uwords;
-    uwords.reserve(sentence.size());
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
     while (pre_filter.HasNext()) {
       range = pre_filter.Next();
-      Cut(range.begin, range.end, uwords);
+      Cut(range.begin, range.end, wrs);
     }
-    TransCode::Encode(uwords, words);
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
   }
-  void Cut(Unicode::const_iterator begin,
-           Unicode::const_iterator end,
-           vector<Unicode>& res) const {
+  void Cut(RuneStrArray::const_iterator begin,
+           RuneStrArray::const_iterator end,
+           vector<WordRange>& res) const {
     //result of searching in trie tree
     LocalVector<pair<size_t, const DictUnit*> > tRes;

@@ -56,15 +64,19 @@ class FullSegment: public SegmentBase {
     dictTrie_->Find(begin, end, dags);
     for (size_t i = 0; i < dags.size(); i++) {
       for (size_t j = 0; j < dags[i].nexts.size(); j++) {
+        size_t nextoffset = dags[i].nexts[j].first;
+        assert(nextoffset < dags.size());
         const DictUnit* du = dags[i].nexts[j].second;
         if (du == NULL) {
           if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
-            res.push_back(Unicode(1, dags[i].rune));
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
           }
         } else {
           wordLen = du->word.size();
           if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
-            res.push_back(du->word);
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
           }
         }
         maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
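Full mode is reachable through `Jieba::CutAll`; every dictionary word found along the DAG is emitted, so the output overlaps. A sketch (dictionary paths assumed, as above):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<std::string> words;
  jieba.CutAll("我来到北京清华大学", words);
  // Overlapping hits such as 清华/清华大学/华大/大学 are all kept.
  std::cout << limonp::Join(words.begin(), words.end(), "/") << "\n";
  return 0;
}
```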
include/cppjieba/HMMModel.hpp

@@ -105,7 +105,7 @@ struct HMMModel {
       XLOG(ERROR) << "emitProb illegal.";
       return false;
     }
-    if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) {
+    if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
       XLOG(ERROR) << "TransCode failed.";
       return false;
     }
include/cppjieba/HMMSegment.hpp

@@ -25,21 +25,29 @@ class HMMSegment: public SegmentBase {

   void Cut(const string& sentence,
            vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence,
+           vector<Word>& words) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
-    vector<Unicode> uwords;
-    uwords.reserve(sentence.size());
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
     while (pre_filter.HasNext()) {
       range = pre_filter.Next();
-      Cut(range.begin, range.end, uwords);
+      Cut(range.begin, range.end, wrs);
     }
-    TransCode::Encode(uwords, words);
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
   }
-  void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
-    Unicode::const_iterator left = begin;
-    Unicode::const_iterator right = begin;
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right = begin;
     while (right != end) {
-      if (*right < 0x80) {
+      if (right->rune < 0x80) {
         if (left != right) {
           InternalCut(left, right, res);
         }
@@ -55,7 +63,8 @@ class HMMSegment: public SegmentBase {
           }
           right ++;
         } while (false);
-        res.push_back(Unicode(left, right));
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
         left = right;
       } else {
         right++;
@@ -67,15 +76,15 @@ class HMMSegment: public SegmentBase {
   }
  private:
   // sequential letters rule
-  Unicode::const_iterator SequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
-    Rune x = *begin;
+  RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
     if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
       begin ++;
     } else {
       return begin;
     }
     while (begin != end) {
-      x = *begin;
+      x = begin->rune;
       if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
         begin ++;
       } else {
@@ -85,15 +94,15 @@ class HMMSegment: public SegmentBase {
     return begin;
   }
   //
-  Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
-    Rune x = *begin;
+  RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
     if ('0' <= x && x <= '9') {
       begin ++;
     } else {
       return begin;
     }
     while (begin != end) {
-      x = *begin;
+      x = begin->rune;
       if ( ('0' <= x && x <= '9') || x == '.') {
         begin++;
       } else {
@@ -102,23 +111,24 @@ class HMMSegment: public SegmentBase {
     }
     return begin;
   }
-  void InternalCut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+  void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
     vector<size_t> status;
     Viterbi(begin, end, status);

-    Unicode::const_iterator left = begin;
-    Unicode::const_iterator right;
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right;
     for (size_t i = 0; i < status.size(); i++) {
       if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
         right = begin + i + 1;
-        res.push_back(Unicode(left, right));
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
         left = right;
       }
     }
   }

-  void Viterbi(Unicode::const_iterator begin,
-               Unicode::const_iterator end,
+  void Viterbi(RuneStrArray::const_iterator begin,
+               RuneStrArray::const_iterator end,
               vector<size_t>& status) const {
     size_t Y = HMMModel::STATUS_SUM;
     size_t X = end - begin;
@@ -132,7 +142,7 @@ class HMMSegment: public SegmentBase {

     //start
     for (size_t y = 0; y < Y; y++) {
-      weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
+      weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
       path[0 + y * X] = -1;
     }

@@ -143,7 +153,7 @@ class HMMSegment: public SegmentBase {
         now = x + y*X;
         weight[now] = MIN_DOUBLE;
         path[now] = HMMModel::E; // warning
-        emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
+        emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
         for (size_t preY = 0; preY < Y; preY++) {
           old = x - 1 + preY * X;
           tmp = weight[old] + model_->transProb[preY][y] + emitProb;
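The HMM path is exposed directly as `Jieba::CutHMM`. Viterbi assigns each rune a B/E/M/S state and a word is closed whenever the state is E or S (the `status[i] % 2` test); ASCII letter and digit runs are grouped by the two rules above before decoding. A sketch (paths assumed):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<cppjieba::Word> words;
  // 123456 stays one token thanks to NumbersRule.
  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "\n";  // {"word": ..., "offset": ...}
  }
  return 0;
}
```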
include/cppjieba/Jieba.hpp

@@ -3,7 +3,7 @@

 #include "QuerySegment.hpp"
 #include "PosTagger.hpp"
-#include "LevelSegment.hpp"
+//#include "LevelSegment.hpp"

 namespace cppjieba {

@@ -17,7 +17,7 @@ class Jieba {
       mix_seg_(&dict_trie_, &model_),
       full_seg_(&dict_trie_),
       query_seg_(&dict_trie_, &model_),
-      level_seg_(&dict_trie_),
+      //level_seg_(&dict_trie_),
       pos_tagger_(&dict_trie_, &model_) {
   }
   ~Jieba() {
@@ -32,34 +32,32 @@ class Jieba {
   void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
     mix_seg_.Cut(sentence, words, hmm);
   }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    mix_seg_.Cut(sentence, words, hmm);
+  }
   void CutAll(const string& sentence, vector<string>& words) const {
     full_seg_.Cut(sentence, words);
   }
+  void CutAll(const string& sentence, vector<Word>& words) const {
+    full_seg_.Cut(sentence, words);
+  }
   void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
     query_seg_.Cut(sentence, words, hmm);
   }
+  void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    query_seg_.Cut(sentence, words, hmm);
+  }
   void CutHMM(const string& sentence, vector<string>& words) const {
     hmm_seg_.Cut(sentence, words);
   }
-  void CutLevel(const string& sentence, vector<string>& words) const {
-    level_seg_.Cut(sentence, words);
-  }
-  void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
-    level_seg_.Cut(sentence, words);
+  void CutHMM(const string& sentence, vector<Word>& words) const {
+    hmm_seg_.Cut(sentence, words);
   }
   void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
     mp_seg_.Cut(sentence, words, max_word_len);
   }
-  static void Locate(const vector<string>& words, vector<LocWord>& loc_words) {
-    loc_words.resize(words.size());
-    size_t begin = 0;
-    for (size_t i = 0; i < words.size(); i++) {
-      size_t len = TransCode::Decode(words[i]).size();
-      loc_words[i].word = words[i];
-      loc_words[i].begin = begin;
-      loc_words[i].end = loc_words[i].begin + len;
-      begin = loc_words[i].end;
-    }
+  void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
+    mp_seg_.Cut(sentence, words, max_word_len);
   }

   void Tag(const string& sentence, vector<pair<string, string> >& words) const {
@@ -76,9 +74,6 @@ class Jieba {
     return &model_;
   }

-  void SetQuerySegmentThreshold(size_t len) {
-    query_seg_.SetMaxWordLen(len);
-  }
  private:
   DictTrie dict_trie_;
   HMMModel model_;
@@ -89,7 +84,7 @@ class Jieba {
   MixSegment mix_seg_;
   FullSegment full_seg_;
   QuerySegment query_seg_;
-  LevelSegment level_seg_;
+  //LevelSegment level_seg_;

   PosTagger pos_tagger_;
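Every Cut* method now has a `vector<Word>` overload, so byte offsets come straight out of the segmenter and the removed `Locate` post-pass is unnecessary. A sketch of both call styles (paths assumed):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");

  std::vector<std::string> plain;       // string-only overload, as before
  jieba.Cut("南京市长江大桥", plain, true);

  std::vector<cppjieba::Word> located;  // new overload: words plus byte offsets
  jieba.Cut("南京市长江大桥", located, true);
  for (size_t i = 0; i < located.size(); i++) {
    std::cout << located[i].word << " starts at byte "
              << located[i].offset << "\n";
  }
  return 0;
}
```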
include/cppjieba/KeywordExtractor.hpp

@@ -136,14 +136,6 @@ class KeywordExtractor {
     assert(stopWords_.size());
   }

-  bool IsSingleWord(const string& str) const {
-    Unicode unicode;
-    TransCode::Decode(str, unicode);
-    if (unicode.size() == 1)
-      return true;
-    return false;
-  }
-
   static bool Compare(const Word& lhs, const Word& rhs) {
     return lhs.weight > rhs.weight;
   }
@@ -153,10 +145,10 @@ class KeywordExtractor {
   double idfAverage_;

   unordered_set<string> stopWords_;
-}; // class Jieba
+}; // class KeywordExtractor

 inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
-  return os << word.word << '|' << word.offsets << '|' << word.weight;
+  return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
 }

 } // namespace cppjieba
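With the new `operator<<`, a whole keyword vector prints as JSON-like objects in one shot. A sketch (the IDF and stop-word paths are assumptions, mirroring `test/demo.cpp`):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"
#include "cppjieba/KeywordExtractor.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  cppjieba::KeywordExtractor extractor(jieba,
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");
  std::vector<cppjieba::KeywordExtractor::Word> keywords;
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的。", keywords, 5);
  // Each entry renders as {"word": ..., "offset": [...], "weight": ...}.
  std::cout << keywords << "\n";
  return 0;
}
```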
include/cppjieba/LevelSegment.hpp (deleted)

@@ -1,81 +0,0 @@
#ifndef CPPJIEBA_LEVELSEGMENT_H
#define CPPJIEBA_LEVELSEGMENT_H

#include "MPSegment.hpp"

namespace cppjieba {

class LevelSegment: public SegmentBase{
 public:
  LevelSegment(const string& dictPath,
               const string& userDictPath = "")
    : mpSeg_(dictPath, userDictPath) {
  }
  LevelSegment(const DictTrie* dictTrie)
    : mpSeg_(dictTrie) {
  }
  ~LevelSegment() {
  }

  void Cut(Unicode::const_iterator begin,
           Unicode::const_iterator end,
           vector<pair<Unicode, size_t> >& res) const {
    res.clear();
    vector<Unicode> words;
    vector<Unicode> smallerWords;
    words.reserve(end - begin);
    mpSeg_.Cut(begin, end, words);
    smallerWords.reserve(words.size());
    res.reserve(words.size());

    size_t level = 0;
    while (!words.empty()) {
      smallerWords.clear();
      for (size_t i = 0; i < words.size(); i++) {
        if (words[i].size() >= 3) {
          size_t len = words[i].size() - 1;
          mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
        }
        if (words[i].size() > 1) {
          res.push_back(pair<Unicode, size_t>(words[i], level));
        }
      }

      words.swap(smallerWords);
      level++;
    }
  }

  void Cut(const string& sentence,
           vector<pair<string, size_t> >& words) const {
    words.clear();
    Unicode unicode;
    TransCode::Decode(sentence, unicode);
    vector<pair<Unicode, size_t> > unicodeWords;
    Cut(unicode.begin(), unicode.end(), unicodeWords);
    words.resize(unicodeWords.size());
    for (size_t i = 0; i < words.size(); i++) {
      TransCode::Encode(unicodeWords[i].first, words[i].first);
      words[i].second = unicodeWords[i].second;
    }
  }

  bool Cut(const string& sentence,
           vector<string>& res) const {
    vector<pair<string, size_t> > words;
    Cut(sentence, words);
    res.clear();
    res.reserve(words.size());
    for (size_t i = 0; i < words.size(); i++) {
      res.push_back(words[i].first);
    }
    return true;
  }

 private:
  MPSegment mpSeg_;
}; // class LevelSegment

} // namespace cppjieba

#endif // CPPJIEBA_LEVELSEGMENT_H
include/cppjieba/MPSegment.hpp

@@ -28,19 +28,28 @@ class MPSegment: public SegmentBase {
   void Cut(const string& sentence,
            vector<string>& words,
            size_t max_word_len = MAX_WORD_LENGTH) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, max_word_len);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence,
+           vector<Word>& words,
+           size_t max_word_len = MAX_WORD_LENGTH) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
-    vector<Unicode> uwords;
-    uwords.reserve(sentence.size());
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
     while (pre_filter.HasNext()) {
       range = pre_filter.Next();
-      Cut(range.begin, range.end, uwords, max_word_len);
+      Cut(range.begin, range.end, wrs, max_word_len);
     }
-    TransCode::Encode(uwords, words);
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
   }
-  void Cut(Unicode::const_iterator begin,
-           Unicode::const_iterator end,
-           vector<Unicode>& words,
+  void Cut(RuneStrArray::const_iterator begin,
+           RuneStrArray::const_iterator end,
+           vector<WordRange>& words,
            size_t max_word_len = MAX_WORD_LENGTH) const {
     vector<Dag> dags;
     dictTrie_->Find(begin,
@@ -48,7 +57,7 @@ class MPSegment: public SegmentBase {
                     dags,
                     max_word_len);
     CalcDP(dags);
-    CutByDag(dags, words);
+    CutByDag(begin, end, dags, words);
   }

   const DictTrie* GetDictTrie() const {
@@ -88,16 +97,21 @@ class MPSegment: public SegmentBase {
       }
     }
   }
-  void CutByDag(const vector<Dag>& dags,
-                vector<Unicode>& words) const {
+  void CutByDag(RuneStrArray::const_iterator begin,
+                RuneStrArray::const_iterator end,
+                const vector<Dag>& dags,
+                vector<WordRange>& words) const {
     size_t i = 0;
     while (i < dags.size()) {
       const DictUnit* p = dags[i].pInfo;
       if (p) {
-        words.push_back(p->word);
+        assert(p->word.size() >= 1);
+        WordRange wr(begin + i, begin + i + p->word.size() - 1);
+        words.push_back(wr);
         i += p->word.size();
       } else { //single chinese word
-        words.push_back(Unicode(1, dags[i].rune));
+        WordRange wr(begin + i, begin + i);
+        words.push_back(wr);
         i++;
       }
     }
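`CutSmall` runs this same DP cut with a cap on word length; dictionary words longer than `max_word_len` runes are simply never considered. A sketch, grounded in the updated unit test (paths assumed):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<cppjieba::Word> words;
  // With max_word_len = 3, the 4-rune 长江大桥 splits into 长江 / 大桥.
  jieba.CutSmall("南京市长江大桥", words, 3);
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "\n";
  }
  return 0;
}
```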
include/cppjieba/MixSegment.hpp

@@ -21,54 +21,59 @@ class MixSegment: public SegmentBase {
   }

   void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, hmm);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
-    vector<Unicode> uwords;
-    uwords.reserve(sentence.size());
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size() / 2);
     while (pre_filter.HasNext()) {
       range = pre_filter.Next();
-      Cut(range.begin, range.end, uwords, hmm);
+      Cut(range.begin, range.end, wrs, hmm);
     }
-    TransCode::Encode(uwords, words);
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
   }

-  void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
     if (!hmm) {
       mpSeg_.Cut(begin, end, res);
       return;
     }
-    vector<Unicode> words;
+    vector<WordRange> words;
     assert(end >= begin);
     words.reserve(end - begin);
     mpSeg_.Cut(begin, end, words);

-    vector<Unicode> hmmRes;
+    vector<WordRange> hmmRes;
     hmmRes.reserve(end - begin);
-    Unicode piece;
-    piece.reserve(end - begin);
-    for (size_t i = 0, j = 0; i < words.size(); i++) {
+    for (size_t i = 0; i < words.size(); i++) {
       //if mp Get a word, it's ok, put it into result
-      if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) {
+      if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
         res.push_back(words[i]);
         continue;
       }

       // if mp Get a single one and it is not in userdict, collect it in sequence
-      j = i;
-      while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) {
-        piece.push_back(words[j][0]);
+      size_t j = i;
+      while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
         j++;
       }

       // Cut the sequence with hmm
-      hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes);
+      assert(j - 1 >= i);
+      // TODO
+      hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
       //put hmm result to result
       for (size_t k = 0; k < hmmRes.size(); k++) {
         res.push_back(hmmRes[k]);
       }

       //clear tmp vars
-      piece.clear();
       hmmRes.clear();

       //let i jump over this piece
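The mix cut is the default behind `Jieba::Cut`: a dictionary (MP) pass first, then an HMM pass over any runs of leftover single runes; `hmm = false` skips the second pass. A sketch of the visible difference, using the README's example sentence (paths assumed):

```cpp
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<std::string> words;
  jieba.Cut("他来到了网易杭研大厦", words, true);   // HMM glues 杭研 together
  std::cout << limonp::Join(words.begin(), words.end(), "/") << "\n";
  jieba.Cut("他来到了网易杭研大厦", words, false);  // MP only: 杭 / 研 stay apart
  std::cout << limonp::Join(words.begin(), words.end(), "/") << "\n";
  return 0;
}
```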
include/cppjieba/PosTagger.hpp

@@ -30,17 +30,17 @@ class PosTagger {
     segment_.Cut(src, CutRes);

     const DictUnit *tmp = NULL;
-    Unicode unico;
+    RuneStrArray runes;
     const DictTrie * dict = segment_.GetDictTrie();
     assert(dict != NULL);
     for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
-      if (!TransCode::Decode(*itr, unico)) {
+      if (!DecodeRunesInString(*itr, runes)) {
         XLOG(ERROR) << "Decode failed.";
         return false;
       }
-      tmp = dict->Find(unico.begin(), unico.end());
+      tmp = dict->Find(runes.begin(), runes.end());
       if (tmp == NULL || tmp->tag.empty()) {
-        res.push_back(make_pair(*itr, SpecialRule(unico)));
+        res.push_back(make_pair(*itr, SpecialRule(runes)));
       } else {
         res.push_back(make_pair(*itr, tmp->tag));
       }
@@ -48,13 +48,13 @@ class PosTagger {
     return !res.empty();
   }
  private:
-  const char* SpecialRule(const Unicode& unicode) const {
+  const char* SpecialRule(const RuneStrArray& unicode) const {
     size_t m = 0;
     size_t eng = 0;
     for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
-      if (unicode[i] < 0x80) {
+      if (unicode[i].rune < 0x80) {
         eng ++;
-        if ('0' <= unicode[i] && unicode[i] <= '9') {
+        if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
           m++;
         }
       }
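Usage of the tagger is unchanged by the `RuneStrArray` migration; words missing from the dictionary fall back to `SpecialRule`, which labels digit and ASCII runs. A sketch (paths assumed):

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");
  std::vector<std::pair<std::string, std::string> > tagres;
  jieba.Tag("我是拖拉机学院手扶拖拉机专业的。", tagres);
  for (size_t i = 0; i < tagres.size(); i++) {
    std::cout << tagres[i].first << ":" << tagres[i].second << "\n";
  }
  return 0;
}
```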
include/cppjieba/PreFilter.hpp

@@ -1,32 +1,25 @@
 #ifndef CPPJIEBA_PRE_FILTER_H
 #define CPPJIEBA_PRE_FILTER_H

-#include "TransCode.hpp"
 #include "Trie.hpp"
 #include "limonp/Logging.hpp"

 namespace cppjieba {

-//class PreFilterIterator {
-// public:
-//  PreFilterIterator() {
-//  }
-//  ~PreFilterIterator() {
-//  }
-//
-// private:
-//  const unordered_set<Rune>& specialSymbols_;
-//}; // PreFilterIterator
-
 class PreFilter {
  public:
+  //TODO use WordRange instead of Range
   struct Range {
-    Unicode::const_iterator begin;
-    Unicode::const_iterator end;
+    RuneStrArray::const_iterator begin;
+    RuneStrArray::const_iterator end;
   }; // struct Range

   PreFilter(const unordered_set<Rune>& symbols,
             const string& sentence)
     : symbols_(symbols) {
-    TransCode::Decode(sentence, sentence_);
+    if (!DecodeRunesInString(sentence, sentence_)) {
+      XLOG(ERROR) << "decode failed. ";
+    }
     cursor_ = sentence_.begin();
   }
   ~PreFilter() {
@@ -38,7 +31,7 @@ class PreFilter {
     Range range;
     range.begin = cursor_;
     while (cursor_ != sentence_.end()) {
-      if (IsIn(symbols_, *cursor_)) {
+      if (IsIn(symbols_, cursor_->rune)) {
         if (range.begin == cursor_) {
           cursor_ ++;
         }
@@ -51,8 +44,8 @@ class PreFilter {
     return range;
   }
  private:
-  Unicode::const_iterator cursor_;
-  Unicode sentence_;
+  RuneStrArray::const_iterator cursor_;
+  RuneStrArray sentence_;
   const unordered_set<Rune>& symbols_;
 }; // class PreFilter
include/cppjieba/QuerySegment.hpp

@@ -9,61 +9,65 @@
 #include "SegmentBase.hpp"
 #include "FullSegment.hpp"
 #include "MixSegment.hpp"
-#include "TransCode.hpp"
+#include "Unicode.hpp"
 #include "DictTrie.hpp"

 namespace cppjieba {
 class QuerySegment: public SegmentBase {
  public:
-  QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4)
+  QuerySegment(const string& dict, const string& model, const string& userDict = "")
     : mixSeg_(dict, model, userDict),
-      fullSeg_(mixSeg_.GetDictTrie()),
-      maxWordLen_(maxWordLen) {
-    assert(maxWordLen_);
+      trie_(mixSeg_.GetDictTrie()) {
   }
-  QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
-    : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
+  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mixSeg_(dictTrie, model), trie_(dictTrie) {
   }
   ~QuerySegment() {
   }
   void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, hmm);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
-    vector<Unicode> uwords;
-    uwords.reserve(sentence.size());
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
     while (pre_filter.HasNext()) {
       range = pre_filter.Next();
-      Cut(range.begin, range.end, uwords, hmm);
+      Cut(range.begin, range.end, wrs, hmm);
     }
-    TransCode::Encode(uwords, words);
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
   }
-  void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
     //use mix Cut first
-    vector<Unicode> mixRes;
+    vector<WordRange> mixRes;
     mixSeg_.Cut(begin, end, mixRes, hmm);

-    vector<Unicode> fullRes;
-    for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
-      // if it's too long, Cut with fullSeg_, put fullRes in res
-      if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) {
-        fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
-        for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
-          res.push_back(*fullResItr);
-        }
-        //clear tmp res
-        fullRes.clear();
-      } else { // just use the mix result
-        res.push_back(*mixResItr);
-      }
+    vector<WordRange> fullRes;
+    for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
+      if (mixResItr->Length() > 2) {
+        for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
+          WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
+          if (trie_->Find(wr.left, wr.right + 1) != NULL) {
+            res.push_back(wr);
+          }
+        }
+      }
+      if (mixResItr->Length() > 3) {
+        for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
+          WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
+          if (trie_->Find(wr.left, wr.right + 1) != NULL) {
+            res.push_back(wr);
+          }
+        }
+      }
+      res.push_back(*mixResItr);
     }
   }
-  void SetMaxWordLen(size_t len) {
-    maxWordLen_ = len;
-  }
-  size_t GetMaxWordLen() const {
-    return maxWordLen_;
-  }
  private:
   bool IsAllAscii(const Unicode& s) const {
    for(size_t i = 0; i < s.size(); i++) {
@@ -74,8 +78,7 @@ class QuerySegment: public SegmentBase {
     return true;
   }
   MixSegment mixSeg_;
-  FullSegment fullSeg_;
-  size_t maxWordLen_;
+  const DictTrie* trie_;
 }; // QuerySegment

 } // namespace cppjieba
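The rewritten Cut above replaces the old FullSegment re-cut: for every mix-cut word longer than 2 (respectively 3) runes, it slides a 2-rune (3-rune) window, emits each window that is itself a dictionary entry, then appends the original word — the same candidate order as [jieba]'s `cut_for_search`. A standalone sketch of that enumeration, with a toy `std::set` standing in for the `trie_->Find` lookup (the dictionary entries here are assumptions for illustration):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  // Runes of 中国科学院, one UTF-8 character per element (toy RuneStrArray).
  std::vector<std::string> runes;
  runes.push_back("中"); runes.push_back("国"); runes.push_back("科");
  runes.push_back("学"); runes.push_back("院");

  // Toy dictionary standing in for trie_->Find(); entries are assumptions.
  std::set<std::string> dict;
  dict.insert("中国"); dict.insert("科学"); dict.insert("学院");
  dict.insert("科学院"); dict.insert("中国科学院");

  // Same order as QuerySegment::Cut: 2-rune windows, 3-rune windows, full word.
  for (size_t n = 2; n <= 3; n++) {
    if (runes.size() <= n) continue;  // mirrors Length() > 2 / Length() > 3
    for (size_t i = 0; i + n - 1 < runes.size(); i++) {
      std::string w;
      for (size_t k = 0; k < n; k++) w += runes[i + k];
      if (dict.count(w)) std::cout << w << "/";
    }
  }
  std::cout << "中国科学院" << std::endl;  // the mix-cut word itself comes last
  return 0;
}
// Prints: 中国/科学/学院/科学院/中国科学院 — matching the README demo output.
```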
include/cppjieba/TextRankExtractor.hpp

@@ -171,24 +171,16 @@ namespace cppjieba {
     assert(stopWords_.size());
   }

-  bool IsSingleWord(const string& str) const {
-    Unicode unicode;
-    TransCode::Decode(str, unicode);
-    if (unicode.size() == 1)
-      return true;
-    return false;
-  }
-
   static bool Compare(const Word &x,const Word &y){
     return x.weight > y.weight;
   }

   MixSegment segment_;
   unordered_set<string> stopWords_;
-};
+}; // class TextRankExtractor

 inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
-  return os << word.word << '|' << word.offsets << '|' << word.weight;
+  return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
 }
 } // namespace cppjieba
include/cppjieba/TransCode.hpp (deleted)

@@ -1,70 +0,0 @@
/************************************
 * file enc : utf-8
 * author   : wuyanyi09@gmail.com
 ************************************/
#ifndef CPPJIEBA_TRANSCODE_H
#define CPPJIEBA_TRANSCODE_H

#include "limonp/StringUtil.hpp"
#include "limonp/LocalVector.hpp"

namespace cppjieba {

using namespace limonp;

typedef uint32_t Rune;
typedef limonp::LocalVector<Rune> Unicode;

namespace TransCode {
inline bool Decode(const string& str, Unicode& res) {
#ifdef CPPJIEBA_GBK
  return gbkTrans(str, res);
#else
  return Utf8ToUnicode32(str, res);
#endif
}

inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) {
#ifdef CPPJIEBA_GBK
  gbkTrans(begin, end, res);
#else
  Unicode32ToUtf8(begin, end, res);
#endif
}

inline void Encode(const Unicode& uni, string& res) {
  Encode(uni.begin(), uni.end(), res);
}

// compiler is expected to optimized this function to avoid return value copy
inline string Encode(Unicode::const_iterator begin, Unicode::const_iterator end) {
  string res;
  res.reserve(end - begin);
  Encode(begin, end, res);
  return res;
}

inline string Encode(const Unicode& unicode) {
  return Encode(unicode.begin(), unicode.end());
}

// compiler is expected to optimized this function to avoid return value copy
inline Unicode Decode(const string& str) {
  Unicode unicode;
  unicode.reserve(str.size());
  Decode(str, unicode);
  return unicode;
}

inline void Encode(const vector<Unicode>& input, vector<string>& output) {
  output.resize(input.size());
  for (size_t i = 0; i < output.size(); i++) {
    Encode(input[i], output[i]);
  }
}

} // namespace TransCode
} // namespace cppjieba

#endif
include/cppjieba/Trie.hpp

@@ -4,7 +4,7 @@
 #include <vector>
 #include <queue>
 #include "limonp/StdExtension.hpp"
-#include "Trie.hpp"
+#include "Unicode.hpp"

 namespace cppjieba {

@@ -16,24 +16,25 @@ struct DictUnit {
   Unicode word;
   double weight;
   string tag;
-};
+}; // struct DictUnit

 // for debugging
-inline ostream & operator << (ostream& os, const DictUnit& unit) {
-  string s;
-  s << unit.word;
-  return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
-}
+// inline ostream & operator << (ostream& os, const DictUnit& unit) {
+//   string s;
+//   s << unit.word;
+//   return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
+// }

 struct Dag {
-  Rune rune;
-  LocalVector<pair<size_t, const DictUnit*> > nexts;
+  RuneStr runestr;
+  // [offset, nexts.first]
+  limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
   const DictUnit * pInfo;
   double weight;
-  size_t nextPos;
-  Dag():rune(0), pInfo(NULL), weight(0.0), nextPos(0) {
+  size_t nextPos; // TODO
+  Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
   }
-};
+}; // struct Dag

 typedef Rune TrieKey;

@@ -57,18 +58,18 @@ class Trie {
     DeleteNode(root_);
   }

-  const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
     if (begin == end) {
       return NULL;
     }

     const TrieNode* ptNode = root_;
     TrieNode::NextMap::const_iterator citer;
-    for (Unicode::const_iterator it = begin; it != end; it++) {
+    for (RuneStrArray::const_iterator it = begin; it != end; it++) {
       if (NULL == ptNode->next) {
         return NULL;
       }
-      citer = ptNode->next->find(*it);
+      citer = ptNode->next->find(it->rune);
       if (ptNode->next->end() == citer) {
         return NULL;
       }
@@ -77,8 +78,8 @@ class Trie {
     return ptNode->ptValue;
   }

-  void Find(Unicode::const_iterator begin,
-            Unicode::const_iterator end,
+  void Find(RuneStrArray::const_iterator begin,
+            RuneStrArray::const_iterator end,
            vector<struct Dag>&res,
            size_t max_word_len = MAX_WORD_LENGTH) const {
     assert(root_ != NULL);
@@ -87,10 +88,9 @@ class Trie {
     const TrieNode *ptNode = NULL;
     TrieNode::NextMap::const_iterator citer;
     for (size_t i = 0; i < size_t(end - begin); i++) {
-      Rune rune = *(begin + i);
-      res[i].rune = rune;
+      res[i].runestr = *(begin + i);

-      if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(rune))) {
+      if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
         ptNode = citer->second;
       } else {
         ptNode = NULL;
@@ -105,7 +105,7 @@ class Trie {
       if (ptNode == NULL || ptNode->next == NULL) {
         break;
       }
-      citer = ptNode->next->find(*(begin + j));
+      citer = ptNode->next->find((begin + j)->rune);
       if (ptNode->next->end() == citer) {
         break;
       }
include/cppjieba/Unicode.hpp | 215 (new file)
@@ -0,0 +1,215 @@
#ifndef CPPJIEBA_UNICODE_H
#define CPPJIEBA_UNICODE_H

#include <stdint.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <ostream>
#include "limonp/LocalVector.hpp"

namespace cppjieba {

using std::string;
using std::vector;

typedef uint32_t Rune;

struct Word {
  string word;
  uint32_t offset;
  Word(const string& w, uint32_t o)
    : word(w), offset(o) {
  }
}; // struct Word

inline std::ostream& operator << (std::ostream& os, const Word& w) {
  return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
}

struct RuneStr {
  Rune rune;
  uint32_t offset;
  uint32_t len;
  RuneStr(): rune(0), offset(0), len(0) {
  }
  RuneStr(Rune r, uint32_t o, uint32_t l)
    : rune(r), offset(o), len(l) {
  }
}; // struct RuneStr

inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
  return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
}

typedef limonp::LocalVector<Rune> Unicode;
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;

// [left, right]
struct WordRange {
  RuneStrArray::const_iterator left;
  RuneStrArray::const_iterator right;
  WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
    : left(l), right(r) {
  }
  size_t Length() const {
    return right - left + 1;
  }
  bool IsAllAscii() const {
    for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
      if (iter->rune >= 0x80) {
        return false;
      }
    }
    return true;
  }
}; // struct WordRange

struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  RuneStrLite rp(0, 0);
  if (str == NULL || len == 0) {
    return rp;
  }
  if (!(str[0] & 0x80)) { // 0xxxxxxx
    // 7bit, total 7bit
    rp.rune = (uint8_t)(str[0]) & 0x7f;
    rp.len = 1;
  } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
    // 110xxxxxx
    // 5bit, total 5bit
    rp.rune = (uint8_t)(str[0]) & 0x1f;

    // 6bit, total 11bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[1]) & 0x3f;
    rp.len = 2;
  } else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx
    // 4bit, total 4bit
    rp.rune = (uint8_t)(str[0]) & 0x0f;

    // 6bit, total 10bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[1]) & 0x3f;

    // 6bit, total 16bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[2]) & 0x3f;

    rp.len = 3;
  } else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx
    // 3bit, total 3bit
    rp.rune = (uint8_t)(str[0]) & 0x07;

    // 6bit, total 9bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[1]) & 0x3f;

    // 6bit, total 15bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[2]) & 0x3f;

    // 6bit, total 21bit
    rp.rune <<= 6;
    rp.rune |= (uint8_t)(str[3]) & 0x3f;

    rp.len = 4;
  } else {
    rp.rune = 0;
    rp.len = 0;
  }
  return rp;
}

inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
  runes.clear();
  runes.reserve(len / 2);
  for (size_t i = 0; i < len;) {
    RuneStrLite rp = DecodeRuneInString(s + i, len - i);
    if (rp.len == 0) {
      runes.clear();
      return false;
    }
    RuneStr x(rp.rune, i, rp.len);
    runes.push_back(x);
    i += rp.len;
  }
  return true;
}

inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
  return DecodeRunesInString(s.c_str(), s.size(), runes);
}

inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
  unicode.clear();
  RuneStrArray runes;
  if (!DecodeRunesInString(s, len, runes)) {
    return false;
  }
  unicode.reserve(runes.size());
  for (size_t i = 0; i < runes.size(); i++) {
    unicode.push_back(runes[i].rune);
  }
  return true;
}

inline bool IsSingleWord(const string& str) {
  RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
  return rp.len == str.size();
}

inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
  return DecodeRunesInString(s.c_str(), s.size(), unicode);
}

inline Unicode DecodeRunesInString(const string& s) {
  Unicode result;
  DecodeRunesInString(s, result);
  return result;
}

// [left, right]
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  uint32_t len = right->offset - left->offset + right->len;
  return Word(s.substr(left->offset, len), left->offset);
}

inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  uint32_t len = right->offset - left->offset + right->len;
  return s.substr(left->offset, len);
}

inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
  for (size_t i = 0; i < wrs.size(); i++) {
    words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
  }
}

inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
  vector<Word> result;
  GetWordsFromWordRanges(s, wrs, result);
  return result;
}

inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
  strs.resize(words.size());
  for (size_t i = 0; i < words.size(); ++i) {
    strs[i] = words[i].word;
  }
}

} // namespace cppjieba

#endif // CPPJIEBA_UNICODE_H
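The key point of the new header: each decoded rune carries its byte offset and byte length in the original string, so a `WordRange` maps back to an exact substring without re-encoding. A round-trip sketch using only the functions defined above:

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include "cppjieba/Unicode.hpp"

int main() {
  std::string s = "南京市长江大桥";
  cppjieba::RuneStrArray runes;
  bool ok = cppjieba::DecodeRunesInString(s, runes);
  assert(ok);
  std::cout << runes.size() << " runes\n";  // 7 runes, 3 bytes each in UTF-8

  // [left, right] is inclusive: runes 0..2 cover 南京市 (bytes 0..8).
  cppjieba::Word w = cppjieba::GetWordFromRunes(s, runes.begin(), runes.begin() + 2);
  std::cout << w << "\n";  // {"word": "南京市", "offset": 0}
  return 0;
}
```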
test/demo.cpp

@@ -14,9 +14,12 @@ int main(int argc, char** argv) {
                        HMM_PATH,
                        USER_DICT_PATH);
   vector<string> words;
+  vector<cppjieba::Word> jiebawords;
-  string s;
   string result;
+  string s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";

   s = "他来到了网易杭研大厦";
   cout << s << endl;
   cout << "[demo] Cut With HMM" << endl;
   jieba.Cut(s, words, true);
   cout << limonp::Join(words.begin(), words.end(), "/") << endl;
@@ -25,10 +28,14 @@ int main(int argc, char** argv) {
   jieba.Cut(s, words, false);
   cout << limonp::Join(words.begin(), words.end(), "/") << endl;

+  s = "我来到北京清华大学";
+  cout << s << endl;
   cout << "[demo] CutAll" << endl;
   jieba.CutAll(s, words);
   cout << limonp::Join(words.begin(), words.end(), "/") << endl;

+  s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
+  cout << s << endl;
   cout << "[demo] CutForSearch" << endl;
   jieba.CutForSearch(s, words);
   cout << limonp::Join(words.begin(), words.end(), "/") << endl;
@@ -40,19 +47,13 @@ int main(int argc, char** argv) {
   jieba.Cut("男默女泪", words);
   cout << limonp::Join(words.begin(), words.end(), "/") << endl;

-  cout << "[demo] Locate Words" << endl;
-  vector<cppjieba::Jieba::LocWord> loc_words;
-  jieba.Cut("南京市长江大桥", words, true);
-  cppjieba::Jieba::Locate(words, loc_words);
-  for (size_t i = 0; i < loc_words.size(); i++) {
-    cout << loc_words[i].word
-         << ", " << loc_words[i].begin
-         << ", " << loc_words[i].end
-         << endl;
-  }
+  cout << "[demo] CutForSearch Word With Offset" << endl;
+  jieba.CutForSearch(s, jiebawords, true);
+  cout << jiebawords << endl;

-  cout << "[demo] TAGGING" << endl;
+  cout << "[demo] Tagging" << endl;
   vector<pair<string, string> > tagres;
   s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
   jieba.Tag(s, tagres);
   cout << s << endl;
   cout << tagres << endl;
@@ -60,13 +61,11 @@ int main(int argc, char** argv) {
   cppjieba::KeywordExtractor extractor(jieba,
                                        IDF_PATH,
                                        STOP_WORD_PATH);
-  cout << "[demo] KEYWORD" << endl;
+  cout << "[demo] Keyword Extraction" << endl;
   const size_t topk = 5;
   vector<cppjieba::KeywordExtractor::Word> keywordres;
   extractor.Extract(s, keywordres, topk);
   cout << s << endl;
-  for (size_t i = 0; i < keywordres.size(); ++i) {
-    cout << keywordres[i].word << "|" << keywordres[i].weight << endl;
-  }
+  cout << keywordres << endl;
   return EXIT_SUCCESS;
 }
test/unittest/CMakeLists.txt

@@ -13,7 +13,12 @@ ADD_EXECUTABLE(test.run
   pos_tagger_test.cpp
   jieba_test.cpp
   pre_filter_test.cpp
+  unicode_test.cpp
   textrank_test.cpp
 )
-TARGET_LINK_LIBRARIES(test.run gtest pthread)
+
+if(MSVC)
+  TARGET_LINK_LIBRARIES(test.run gtest)
+else()
+  TARGET_LINK_LIBRARIES(test.run gtest pthread)
+endif()
test/unittest/jieba_test.cpp

@@ -37,35 +37,45 @@ TEST(JiebaTest, Test1) {
   result << words;
   ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);

-  jieba.CutLevel("南京市长江大桥", words);
 }
+TEST(JiebaTest, WordTest) {
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8");
+  vector<Word> words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
-  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result);
ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result);

-  vector<pair<string, size_t> > word_levels;
-  jieba.CutLevel("南京市长江大桥", word_levels);
-  result << word_levels;
-  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result);
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}]", result);

-  vector<Jieba::LocWord> loc_words;
-  jieba.Cut("南京市长江大桥", words);
-  jieba.Locate(words, loc_words);
-  ASSERT_EQ(loc_words.size(), 2u);
-  ASSERT_EQ(loc_words[0].word, "南京市");
-  ASSERT_EQ(loc_words[0].begin, 0u);
-  ASSERT_EQ(loc_words[0].end, 3u);
-  ASSERT_EQ(loc_words[1].word, "长江大桥");
-  ASSERT_EQ(loc_words[1].begin, 3u);
-  ASSERT_EQ(loc_words[1].end, 7u);
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
ASSERT_EQ("[{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}, {\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}, {\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}]", result << words);

   //vector<pair<string, string> > tagres;
   //jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
   //result << tagres;
   //ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result);
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}, {\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}]", result);
|
||||
|
||||
//vector<pair<string, double> > keywordres;
|
||||
//jieba.Extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5);
|
||||
//result << keywordres;
|
||||
//ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
|
||||
jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
|
||||
result << words;
|
||||
//ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
|
||||
ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}, {\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}, {\"word\": \"\xE7\x94\xA8\", \"offset\": 51}, {\"word\": \"AK47\", \"offset\": 54}]", result);
|
||||
|
||||
jieba.CutAll("我来自北京邮电大学", words);
|
||||
result << words;
|
||||
//ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
|
||||
ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}]", result);
|
||||
|
||||
jieba.CutForSearch("他来到了网易杭研大厦", words);
|
||||
result << words;
|
||||
//ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
|
||||
ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result);
|
||||
}
|
||||
|
||||
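A minimal standalone sketch of the offset-aware `Cut`/`Locate` flow that `WordTest` above exercises; the dictionary paths are assumed to match the repository layout used by the tests, and `Locate` is called exactly as in the test:

```
#include <iostream>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
                        "../dict/user.dict.utf8");
  std::vector<cppjieba::Word> words;
  jieba.Cut("南京市长江大桥", words); // each Word carries its byte offset

  std::vector<cppjieba::Jieba::LocWord> loc_words;
  jieba.Locate(words, loc_words); // maps byte offsets to rune begin/end positions
  for (size_t i = 0; i < loc_words.size(); i++) {
    std::cout << loc_words[i].word << " " << loc_words[i].begin
              << " " << loc_words[i].end << std::endl;
  }
  return 0;
}
```

Per the assertions above, this prints 南京市 at rune range [0, 3) and 长江大桥 at [3, 7).
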
TEST(JiebaTest, InsertUserWord) {

@ -22,14 +22,14 @@ TEST(KeywordExtractorTest, Test1) {
    vector<pair<string, double> > words;
    Extractor.Extract(s, words, topN);
    res << words;
    ASSERT_EQ(res, "[\"世界:8.73506\", \"你好:7.95788\"]");
    ASSERT_EQ(res, "[世界:8.73506, 你好:7.95788]");
  }

  {
    vector<KeywordExtractor::Word> words;
    Extractor.Extract(s, words, topN);
    res << words;
    ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|8.73506\", \"你好|[\"0\"]|7.95788\"]");
    ASSERT_EQ(res, "[{\"word\": \"\xE4\xB8\x96\xE7\x95\x8C\", \"offset\": [6, 12], \"weight\": 8.73506}, {\"word\": \"\xE4\xBD\xA0\xE5\xA5\xBD\", \"offset\": [0], \"weight\": 7.95788}]");
  }
}

@ -40,7 +40,7 @@ TEST(KeywordExtractorTest, Test1) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"CEO|[\"93\"]|11.7392\", \"\xE5\x8D\x87\xE8\x81\x8C|[\"72\"]|10.8562\", \"\xE5\x8A\xA0\xE8\x96\xAA|[\"78\"]|10.6426\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|10.0089\", \"\xE5\xB7\x85\xE5\xB3\xB0|[\"111\"]|9.49396\"]");
    ASSERT_EQ(res, "[{\"word\": \"CEO\", \"offset\": [93], \"weight\": 11.7392}, {\"word\": \"\xE5\x8D\x87\xE8\x81\x8C\", \"offset\": [72], \"weight\": 10.8562}, {\"word\": \"\xE5\x8A\xA0\xE8\x96\xAA\", \"offset\": [78], \"weight\": 10.6426}, {\"word\": \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA\", \"offset\": [21], \"weight\": 10.0089}, {\"word\": \"\xE5\xB7\x85\xE5\xB3\xB0\", \"offset\": [111], \"weight\": 9.49396}]");
  }

  {
@ -50,7 +50,7 @@ TEST(KeywordExtractorTest, Test1) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
  }
}

@ -64,7 +64,7 @@ TEST(KeywordExtractorTest, Test2) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|11.7392\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|8.13549\", \"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|6.78347\"]");
    ASSERT_EQ(res, "[{\"word\": \"\xE8\x93\x9D\xE7\xBF\x94\", \"offset\": [0], \"weight\": 11.7392}, {\"word\": \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F\", \"offset\": [12], \"weight\": 8.13549}, {\"word\": \"\xE4\xBC\x98\xE7\xA7\x80\", \"offset\": [6], \"weight\": 6.78347}]");
  }

  {
@ -74,6 +74,6 @@ TEST(KeywordExtractorTest, Test2) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
  }
}

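These assertions pin down the new JSON-style serialization of `KeywordExtractor::Word` (word, offsets, TF-IDF weight). A sketch of the call shape; the extractor's construction (dictionary, HMM, IDF and stop-word paths) is not visible in these hunks, so the helper below is hypothetical and takes the extractor by reference:

```
#include <iostream>
#include <string>
#include <vector>
#include "cppjieba/KeywordExtractor.hpp"
#include "limonp/StdExtension.hpp"

// Hypothetical helper: prints the topN keywords using the same
// Extract + string operator<< idiom as the tests above.
void PrintTopKeywords(cppjieba::KeywordExtractor& extractor,
                      const std::string& sentence, size_t topN) {
  std::vector<cppjieba::KeywordExtractor::Word> words;
  extractor.Extract(sentence, words, topN);
  std::string res;
  res << words; // limonp string serialization, as in the tests
  std::cout << res << std::endl;
}
```
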
@ -4,12 +4,12 @@
using namespace cppjieba;

static const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
static const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:eng\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
static const char * const ANS_TEST1 = "[我:r, 是:v, 蓝翔:x, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";
static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
static const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:eng\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
static const char * const ANS_TEST2 = "[我:r, 是:v, 蓝翔:nz, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";

static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。";
static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a, 特点:n, 是:v, 很:zg, 容易:a, 弯曲:v, 。:x]";
//static const char * const ANS_TEST3 = "";

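The constants above now compare against the unquoted `word:tag` rendering. A sketch of the tagging call that produces such pairs, mirroring the commented-out `Tag` usage in `jieba_test.cpp` (dictionary paths assumed to follow the repository layout):

```
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
                        "../dict/user.dict.utf8");
  // Each result pair is (word, part-of-speech tag), e.g. ("iPhone6", "eng").
  std::vector<std::pair<std::string, std::string> > tagres;
  jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres);
  for (size_t i = 0; i < tagres.size(); i++) {
    std::cout << tagres[i].first << ":" << tagres[i].second << std::endl;
  }
  return 0;
}
```
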
TEST(PosTaggerTest, Test) {

@ -1,5 +1,6 @@
#include "gtest/gtest.h"
#include "cppjieba/PreFilter.hpp"
#include "limonp/StringUtil.hpp"

using namespace cppjieba;

@ -11,32 +12,32 @@ TEST(PreFilterTest, Test1) {
  string res;

  {
    PreFilter filter(symbol, "你好,美丽的,世界");
    string s = "你好,美丽的,世界";
    PreFilter filter(symbol, s);
    expected = "你好/,/美丽的/,/世界";
    ASSERT_TRUE(filter.HasNext());
    vector<string> words;
    while (filter.HasNext()) {
      PreFilter::Range range;
      range = filter.Next();
      words.push_back(TransCode::Encode(range.begin, range.end));
      words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
    }
    res = Join(words.begin(), words.end(), "/");
    res = limonp::Join(words.begin(), words.end(), "/");
    ASSERT_EQ(res, expected);
  }

  {
    PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456,用AK47");
    string s = "我来自北京邮电大学。。。学号123456,用AK47";
    PreFilter filter(symbol, s);
    expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47";
    ASSERT_TRUE(filter.HasNext());
    vector<string> words;
    while (filter.HasNext()) {
      PreFilter::Range range;
      range = filter.Next();
      words.push_back(TransCode::Encode(range.begin, range.end));
    }
    res = Join(words.begin(), words.end(), "/");
    for (size_t i = 0; i < words.size(); i++) {
      words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
    }
    res = limonp::Join(words.begin(), words.end(), "/");
    ASSERT_EQ(res, expected);
  }
}

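The change above replaces `TransCode::Encode` on rune iterators with `GetStringFromRunes`, which slices the original string by the `Range` the filter yields. A sketch of that loop as a reusable helper; the separator set is assumed to be the `unordered_set<cppjieba::Rune>` built by the (not shown) test fixture:

```
#include <string>
#include <unordered_set>
#include <vector>
#include "cppjieba/PreFilter.hpp"

std::vector<std::string> SplitBySymbols(const std::unordered_set<cppjieba::Rune>& symbols,
                                        const std::string& s) {
  cppjieba::PreFilter filter(symbols, s);
  std::vector<std::string> words;
  while (filter.HasNext()) {
    // Each Range marks a rune span; map it back to the UTF-8 bytes of s.
    cppjieba::PreFilter::Range range = filter.Next();
    words.push_back(cppjieba::GetStringFromRunes(s, range.begin, range.end - 1));
  }
  return words;
}
```
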
@ -4,7 +4,6 @@
#include "cppjieba/HMMSegment.hpp"
#include "cppjieba/FullSegment.hpp"
#include "cppjieba/QuerySegment.hpp"
#include "cppjieba/LevelSegment.hpp"
#include "gtest/gtest.h"

using namespace cppjieba;
@ -104,6 +103,23 @@ TEST(MixSegmentTest, TestUserDict) {
  segment.Cut("忽如一夜春风来,千树万树梨花开", words);
  res = limonp::Join(words.begin(), words.end(), "/");
  ASSERT_EQ("忽如一夜春风来/,/千树/万树/梨花/开", res);

  // rand input
  {
    const size_t ITERATION = 16;
    const size_t MAX_LEN = 256;
    string s;
    srand(time(NULL));

    for (size_t i = 0; i < ITERATION; i++) {
      size_t len = rand() % MAX_LEN;
      s.resize(len);
      for (size_t j = 0; j < len; j++) {
        s[rand() % len] = rand();
      }
      segment.Cut(s, words);
    }
  }
}

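The `rand input` block is a smoke test: it feeds up to 256 arbitrary bytes (mostly invalid UTF-8) into `Cut` and passes as long as nothing crashes. The same idea, sketched as a standalone helper that takes an already-constructed segmenter:

```
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "cppjieba/MixSegment.hpp"

// Sketch: segmenting random bytes must not crash, whatever the output is.
void FuzzCut(cppjieba::MixSegment& segment, size_t iterations, size_t max_len) {
  std::vector<std::string> words;
  srand(time(NULL));
  for (size_t i = 0; i < iterations; i++) {
    std::string s(rand() % max_len, '\0');
    for (size_t j = 0; j < s.size(); j++) {
      s[j] = static_cast<char>(rand()); // arbitrary, often invalid UTF-8
    }
    segment.Cut(s, words);
  }
}
```
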
TEST(MixSegmentTest, TestMultiUserDict) {

@ -181,73 +197,52 @@ TEST(FullSegment, Test1) {
}

TEST(QuerySegment, Test1) {
  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "", 3);
  const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
  QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
  vector<string> words;

  segment.Cut(str, words);

  string s1, s2;
  s1 << words;
  s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \",\", \"后\", \"在\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";

  segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造";
  ASSERT_EQ(s1, s2);

  segment.Cut("亲口交代", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "亲口/交代";
  ASSERT_EQ(s1, s2);

  segment.Cut("他心理健康", words);
  s1 = Join(words.begin(), words.end(), "/");
  s2 = "他/心理/健康/心理健康";
  ASSERT_EQ(s1, s2);
}

TEST(QuerySegment, Test2) {
  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english", 3);
  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
  vector<string> words;
  string s1, s2;

  {
    const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
    vector<string> words;

    segment.Cut(str, words);

    string s1, s2;
    s1 << words;
    s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \",\", \"后\", \"在\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";
    segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/京都大学/深造";
    ASSERT_EQ(s1, s2);
  }

  {
    const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
    vector<string> words;

    segment.Cut(str, words);

    string s1, s2;
    s1 << words;
    s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]";
    segment.Cut("小明硕士毕业于中国科学院计算所iPhone6", words);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/iPhone6";
    ASSERT_EQ(s1, s2);
  }

  {
    vector<string> words;
    segment.Cut("internal", words);
    string s = Join(words.begin(), words.end(), "/");
    ASSERT_EQ("internal", s);
  }

  segment.SetMaxWordLen(5);

  {
    vector<string> words;
    segment.Cut("中国科学院", words);
    string s = Join(words.begin(), words.end(), "/");
    ASSERT_EQ("中国科学院", s);
    s1 = Join(words.begin(), words.end(), "/");
    s2 = "中国/科学/学院/科学院/中国科学院";
    ASSERT_EQ(s1, s2);
  }
}

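The rewritten `QuerySegment` drops the word-length threshold constructor argument and now emits each long word together with its enumerated sub-words, matching jieba's `cut_for_search`. A minimal sketch using the same constructor and `Cut` calls as `Test1` above (dictionary paths assumed):

```
#include <iostream>
#include <string>
#include <vector>
#include "cppjieba/QuerySegment.hpp"
#include "limonp/StringUtil.hpp"

int main() {
  cppjieba::QuerySegment segment("../dict/jieba.dict.utf8",
                                 "../dict/hmm_model.utf8", "");
  std::vector<std::string> words;
  segment.Cut("他心理健康", words);
  // Expected, per Test1 above: 他/心理/健康/心理健康
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  return 0;
}
```
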
TEST(LevelSegmentTest, Test0) {
  string s;
  LevelSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
  vector<pair<string, size_t> > words;
  segment.Cut("南京市长江大桥", words);
  ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", s << words);

  vector<string> res;
  segment.Cut("南京市长江大桥", res);
  ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", s << res);
}

TEST(MPSegmentTest, Unicode32) {

@ -24,16 +24,14 @@ TEST(TextRankExtractorTest, Test1) {
    vector<pair<string, double> > words;
    Extractor.Extract(s, words, topN);
    res << words;
    //ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]");
    ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C:1\", \"\xE4\xBD\xA0\xE5\xA5\xBD:0.519787\"]");
    ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
  }

  {
    vector<TextRankExtractor::Word> words;
    Extractor.Extract(s, words, topN);
    res << words;
    //ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]");
    ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C|[\"6\", \"12\"]|1\", \"\xE4\xBD\xA0\xE5\xA5\xBD|[\"0\"]|0.519787\"]");
    ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]");
  }
}

@ -44,8 +42,7 @@ TEST(TextRankExtractorTest, Test1) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.95375\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.801701\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.798968\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.775505\"]");
    // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]");
    ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
  }

  {
@ -55,8 +52,7 @@ TEST(TextRankExtractorTest, Test1) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]");
    //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
    ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
  }
}

@ -74,8 +70,7 @@ TEST(TextRankExtractorTest, Test2) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    ASSERT_EQ(res, "[\"蓝翔|[\"0\"]|1\", \"毕业生|[\"12\"]|0.996685\", \"优秀|[\"6\"]|0.992994\"]");
    //ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]");
    ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
  }

  {
@ -85,7 +80,6 @@ TEST(TextRankExtractorTest, Test2) {
    size_t topN = 5;
    Extractor.Extract(s, wordweights, topN);
    res << wordweights;
    //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
    ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]");
    ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
  }
}

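Like the keyword extractor, `TextRankExtractor` now serializes results as `{"word": ..., "offset": [...], "weight": ...}`. A sketch of the call shape; construction is elided because the constructor arguments sit outside these hunks, so the helper is hypothetical:

```
#include <iostream>
#include <string>
#include <vector>
#include "cppjieba/TextRankExtractor.hpp"
#include "limonp/StdExtension.hpp"

// Hypothetical helper mirroring the Extract + string operator<< idiom above.
void PrintTextRank(cppjieba::TextRankExtractor& extractor,
                   const std::string& sentence, size_t topN) {
  std::vector<cppjieba::TextRankExtractor::Word> words;
  extractor.Extract(sentence, words, topN);
  std::string res;
  res << words;
  std::cout << res << std::endl;
}
```
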
@ -15,7 +15,7 @@ TEST(TrieTest, Empty) {
TEST(TrieTest, Construct) {
  vector<Unicode> keys;
  vector<const DictUnit*> values;
  keys.push_back(TransCode::Decode("你"));
  keys.push_back(DecodeRunesInString("你"));
  values.push_back((const DictUnit*)(NULL));
  Trie trie(keys, values);
}
@ -31,27 +31,34 @@ TEST(DictTrieTest, Test1) {
  DictTrie trie(DICT_FILE);
  ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
  string word("来到");
  Unicode uni;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  DictUnit nodeInfo;
  nodeInfo.word = uni;
  nodeInfo.tag = "v";
  nodeInfo.weight = -8.87033;
  s1 << nodeInfo;
  s2 << (*trie.Find(uni.begin(), uni.end()));
  cppjieba::RuneStrArray uni;
  ASSERT_TRUE(DecodeRunesInString(word, uni));
  //DictUnit nodeInfo;
  //nodeInfo.word = uni;
  //nodeInfo.tag = "v";
  //nodeInfo.weight = -8.87033;
  //s1 << nodeInfo;
  //s2 << (*trie.Find(uni.begin(), uni.end()));
  const DictUnit* du = trie.Find(uni.begin(), uni.end());
  ASSERT_TRUE(du != NULL);
  ASSERT_EQ(2u, du->word.size());
  ASSERT_EQ(26469u, du->word[0]);
  ASSERT_EQ(21040u, du->word[1]);
  ASSERT_EQ("v", du->tag);
  ASSERT_NEAR(-8.870, du->weight, 0.001);

  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
  //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
  word = "清华大学";
  LocalVector<pair<size_t, const DictUnit*> > res;
  const char * words[] = {"清", "清华", "清华大学"};
  for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
    ASSERT_TRUE(TransCode::Decode(words[i], uni));
    ASSERT_TRUE(DecodeRunesInString(words[i], uni));
    res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
    //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
  }
  vector<pair<size_t, const DictUnit*> > vec;
  vector<struct Dag> dags;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  ASSERT_TRUE(DecodeRunesInString(word, uni));
  trie.Find(uni.begin(), uni.end(), dags);
  ASSERT_EQ(dags.size(), uni.size());
  ASSERT_NE(dags.size(), 0u);
@ -64,25 +71,21 @@ TEST(DictTrieTest, Test1) {
TEST(DictTrieTest, UserDict) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
  string word = "云计算";
  Unicode unicode;
  ASSERT_TRUE(TransCode::Decode(word, unicode));
  cppjieba::RuneStrArray unicode;
  ASSERT_TRUE(DecodeRunesInString(word, unicode));
  const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
  ASSERT_TRUE(unit);
  string res;
  res << *unit;
  ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -14.100", res);
  ASSERT_NEAR(unit->weight, -14.100, 0.001);
}

TEST(DictTrieTest, UserDictWithMaxWeight) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
  string word = "云计算";
  Unicode unicode;
  ASSERT_TRUE(TransCode::Decode(word, unicode));
  cppjieba::RuneStrArray unicode;
  ASSERT_TRUE(DecodeRunesInString(word, unicode));
  const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
  ASSERT_TRUE(unit);
  string res;
  res << *unit;
  ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res);
  ASSERT_NEAR(unit->weight, -2.975, 0.001);
}

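The pattern above (decode to a `RuneStrArray`, `Find`, then assert on fields instead of a serialized dump) can be condensed into a small sketch; the dictionary paths here are assumptions standing in for the `DICT_FILE` fixture:

```
#include <iostream>
#include <string>
#include "cppjieba/DictTrie.hpp"

int main() {
  cppjieba::DictTrie trie("../dict/jieba.dict.utf8",
                          "../test/testdata/userdict.utf8");
  std::string word = "云计算";
  cppjieba::RuneStrArray runes;
  if (!cppjieba::DecodeRunesInString(word, runes)) {
    return 1; // invalid UTF-8
  }
  const cppjieba::DictUnit* unit = trie.Find(runes.begin(), runes.end());
  if (unit != NULL) {
    // The test above expects -14.100 for this user-dict entry by default,
    // and -2.975 when the trie is built with DictTrie::WordWeightMax.
    std::cout << word << " weight=" << unit->weight << std::endl;
  }
  return 0;
}
```
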
TEST(DictTrieTest, Dag) {
@ -90,8 +93,8 @@ TEST(DictTrieTest, Dag) {

  {
    string word = "清华大学";
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));
    cppjieba::RuneStrArray unicode;
    ASSERT_TRUE(DecodeRunesInString(word, unicode));
    vector<struct Dag> res;
    trie.Find(unicode.begin(), unicode.end(), res);

@ -104,8 +107,8 @@ TEST(DictTrieTest, Dag) {

  {
    string word = "北京邮电大学";
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));
    cppjieba::RuneStrArray unicode;
    ASSERT_TRUE(DecodeRunesInString(word, unicode));
    vector<struct Dag> res;
    trie.Find(unicode.begin(), unicode.end(), res);

@ -118,8 +121,8 @@ TEST(DictTrieTest, Dag) {

  {
    string word = "长江大桥";
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));
    cppjieba::RuneStrArray unicode;
    ASSERT_TRUE(DecodeRunesInString(word, unicode));
    vector<struct Dag> res;
    trie.Find(unicode.begin(), unicode.end(), res);

@ -132,8 +135,8 @@ TEST(DictTrieTest, Dag) {

  {
    string word = "长江大桥";
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));
    cppjieba::RuneStrArray unicode;
    ASSERT_TRUE(DecodeRunesInString(word, unicode));
    vector<struct Dag> res;
    trie.Find(unicode.begin(), unicode.end(), res, 3);

@ -146,8 +149,8 @@ TEST(DictTrieTest, Dag) {

  {
    string word = "长江大桥";
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));
    cppjieba::RuneStrArray unicode;
    ASSERT_TRUE(DecodeRunesInString(word, unicode));
    vector<struct Dag> res;
    trie.Find(unicode.begin(), unicode.end(), res, 4);

43
test/unittest/unicode_test.cpp
Normal file
@ -0,0 +1,43 @@
#include "cppjieba/Unicode.hpp"
#include "limonp/StdExtension.hpp"
#include "gtest/gtest.h"

using namespace cppjieba;
using namespace std;

TEST(UnicodeTest, Test1) {
  string s = "你好世界";
  RuneStrArray runes;
  ASSERT_TRUE(DecodeRunesInString(s, runes));
  string actual;
  string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
  actual << runes;
  ASSERT_EQ(expected, actual);
}

TEST(UnicodeTest, Illegal) {
  string s = "123\x80";
  RuneStrArray runes;
  ASSERT_FALSE(DecodeRunesInString(s, runes));
  string actual;
  string expected = "[]";
  actual << runes;
  ASSERT_EQ(expected, actual);
}

TEST(UnicodeTest, Rand) {
  const size_t ITERATION = 1024;
  const size_t MAX_LEN = 256;
  string s;
  srand(time(NULL));

  for (size_t i = 0; i < ITERATION; i++) {
    size_t len = rand() % MAX_LEN;
    s.resize(len);
    for (size_t j = 0; j < len; j++) {
      s[rand() % len] = rand();
    }
    RuneStrArray runes;
    DecodeRunesInString(s, runes);
  }
}
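The new `unicode_test.cpp` pins down the `RuneStr` layout: each decoded rune carries the code point plus its byte `offset` and `len` within the original UTF-8 string, and decoding fails cleanly on malformed input. A compact sketch of the same API:

```
#include <iostream>
#include <string>
#include "cppjieba/Unicode.hpp"

int main() {
  std::string s = "你好世界";
  cppjieba::RuneStrArray runes;
  if (!cppjieba::DecodeRunesInString(s, runes)) {
    return 1; // rejected, as TEST(UnicodeTest, Illegal) expects for "123\x80"
  }
  for (size_t i = 0; i < runes.size(); i++) {
    std::cout << runes[i].rune << " offset=" << runes[i].offset
              << " len=" << runes[i].len << std::endl;
  }
  return 0;
}
```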