Mirror of https://github.com/yanyiwu/cppjieba.git, synced 2025-07-18 00:00:12 +08:00
Compare commits

34 commits:

294755fab1, 714a297823, c14131e3e2, 9cd64a1694, aa410a69bb, b5dc8e7a35,
8141d8f434, 9d8af2116e, 2185315643, 340de007f9, 940ea02eb4, 3732abc0e5,
9cda7f33e8, 338603b676, d93dda397c, 7730deee52, 588860b5b6, 0523949aa8,
b11fd29697, 15b8086a2a, 1d74caf705, 0c7c5228d0, 016fc17575, 39fc58f081,
42a93a4b98, 5ee74d788e, 9b45e084a3, aa1def5ddb, 732812cdfb, 6e167a30dd,
5ef74f335a, 6339262755, cc58d4f858, dbebc7cacb
.github/workflows/cmake-arm64.yml (vendored, new file, +40)
@@ -0,0 +1,40 @@
+name: CMake Windows ARM64
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+env:
+  BUILD_TYPE: Release
+
+jobs:
+  build-windows-arm64:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        cpp_version: [11, 14, 17, 20]
+
+    steps:
+    - name: Check out repository code
+      uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Configure CMake
+      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
+      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
+      # run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+      run: cmake -B ${{github.workspace}}/build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=${{matrix.cpp_version}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+
+    - name: Build
+      # Build your program with the given configuration
+      # run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+
+    - name: Test
+      working-directory: ${{github.workspace}}/build
+      # Execute tests defined by the CMake configuration.
+      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+      run: ctest -C ${{env.BUILD_TYPE}} --verbose
.github/workflows/cmake.yml (vendored)
@@ -17,13 +17,14 @@ jobs:
     strategy:
       matrix:
         os: [
-          ubuntu-20.04,
           ubuntu-22.04,
-          macos-12,
+          ubuntu-latest,
+          macos-13,
+          macos-14,
           macos-latest,
           windows-2019,
           windows-2022,
           windows-latest,
         ]
         cpp_version: [11, 14, 17, 20]

@@ -48,5 +49,5 @@ jobs:
       working-directory: ${{github.workspace}}/build
       # Execute tests defined by the CMake configuration.
       # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest -C ${{env.BUILD_TYPE}}
+      run: ctest -C ${{env.BUILD_TYPE}} --verbose
.github/workflows/stale-issues.yml (vendored)
@@ -2,7 +2,8 @@ name: Close Stale Issues

 on:
   schedule:
-    - cron: '0 0 * * 0' # Run weekly on Sunday at midnight
+    - cron: '0 0 3 */3 *' # Every three months on the 3rd day at midnight

 jobs:
   stale:
.gitignore (vendored)
@@ -15,3 +15,5 @@ tmp
 t.*
 *.pid
 build
+Testing/Temporary/CTestCostData.txt
+Testing/Temporary/LastTest.log
CHANGELOG.md
@@ -1,5 +1,20 @@
 # CHANGELOG

+## v5.5.0
+
++ feat: add Windows ARM64 build support
++ build: upgrade googletest from 1.11.0 to 1.12.1
++ build: update CMake minimum version requirement to 3.10
++ fix: make namespaces explicit and fix missing includes
++ ci: update stale-issues workflow configuration
+
 ## v5.4.0

 + unittest: class Jieba add default argument input
 + class Jieba: support default dictpath
 + cmake: avoid testing when FetchContent by other project
 + class DictTrie: removed unused var

 ## v5.3.2

 + removed test/demo.cpp and linked https://github.com/yanyiwu/cppjieba-demo
CMakeLists.txt
@@ -1,25 +1,31 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 3.5)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
 PROJECT(CPPJIEBA)

 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/limonp/include
   ${PROJECT_SOURCE_DIR}/include)

 if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set (CMAKE_INSTALL_PREFIX "/usr/local/cppjieba" CACHE PATH "default install path" FORCE )
 endif()

 if(NOT DEFINED CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 11)
 endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)

-ADD_DEFINITIONS(-O3 -Wall -g)
+ADD_DEFINITIONS(-O3 -g)

-ADD_SUBDIRECTORY(test)
+# Define a variable to check if this is the top-level project
+if(NOT DEFINED CPPJIEBA_TOP_LEVEL_PROJECT)
+  if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+    set(CPPJIEBA_TOP_LEVEL_PROJECT ON)
+  else()
+    set(CPPJIEBA_TOP_LEVEL_PROJECT OFF)
+  endif()
+endif()

-ENABLE_TESTING()
-if(NOT MSVC)
+if(CPPJIEBA_TOP_LEVEL_PROJECT)
+  ENABLE_TESTING()
+
+  message(STATUS "MSVC value: ${MSVC}")
+  ADD_SUBDIRECTORY(test)
   ADD_TEST(NAME ./test/test.run COMMAND ./test/test.run)
   ADD_TEST(NAME ./load_test COMMAND ./load_test)
 endif()
README.md
@@ -10,22 +10,25 @@

 CppJieba is the C++ version of the "Jieba" Chinese word segmentation library.

-## Features
+### Key Features

-+ All of the source code lives in header files (`include/cppjieba/*.hpp`); just `include` and use.
-+ Supports `utf8` encoding.
-+ Ships with a fairly complete unit test suite; the stability of the core UTF-8 Chinese segmentation has been verified in production environments.
-+ Supports loading custom user dictionaries; multiple dictionary paths may be separated by '|' or ';'.
-+ Supports `Linux`, `Mac OSX`, and `Windows`.
+- 🚀 High performance: stability and performance validated in production environments
+- 📦 Easy to integrate: the source is shipped as header files (`include/cppjieba/*.hpp`); include them and go
+- 🔍 Multiple segmentation modes: precise mode, full mode, search-engine mode, and more
+- 📚 Custom dictionaries: user dictionaries are supported, with multiple dictionary paths separated by '|' or ';'
+- 💻 Cross-platform: supports Linux, macOS, and Windows
+- 🌈 UTF-8: native support for UTF-8-encoded Chinese text

-## Usage
+## Quick Start

-### Dependencies
+### Requirements

-* `g++ (version >= 4.1 is recommended) or clang++`;
-* `cmake (version >= 2.6 is recommended)`;
+- A C++ compiler:
+  - g++ (4.1 or later recommended)
+  - or clang++
+- cmake (2.6 or later recommended)

-### Download and Build
+### Installation

 ```sh
 git clone https://github.com/yanyiwu/cppjieba.git

@@ -36,15 +39,11 @@ mkdir build
 cd build
 cmake ..
 make
 ```

-If you are interested, you can also run the tests (optional):
-
-```
-make test
-```

-## Demo
+## Usage Example

 ```
 ./demo

@@ -210,71 +209,37 @@ For more details, please see [demo](https://github.com/yanyiwu/cppjieba-demo).

 + [dict.367W.utf8] iLife(562193561 at qq.com)

-## Applications
+## Ecosystem

-+ [GoJieba] Jieba Chinese word segmentation in Go.
-+ [NodeJieba] Jieba Chinese word segmentation for Node.js.
-+ [simhash] Similarity computation for Chinese documents.
-+ [exjieba] Jieba Chinese word segmentation in Erlang.
-+ [jiebaR] Jieba Chinese word segmentation in R.
-+ [cjieba] Jieba word segmentation in C.
-+ [jieba_rb] Jieba word segmentation for Ruby.
-+ [iosjieba] Jieba word segmentation for iOS.
-+ [SqlJieba] Jieba plugin for MySQL full-text indexing.
-+ [pg_jieba] Segmentation plugin for PostgreSQL.
-+ [simple] Segmentation plugin for SQLite3 FTS5.
-+ [gitbook-plugin-search-pro] Gitbook plugin with Chinese search support.
-+ [ngx_http_cppjieba_module] Nginx segmentation plugin.
-+ [cppjiebapy] A project by [jannson] exposing CppJieba to Python; see the discussion at [cppjiebapy_discussion].
-+ [cppjieba-py] A Python module by [bung87] based on pybind11, close to the original jieba in usage.
-+ [KeywordServer] A Chinese keyword-extraction service in 50 lines.
-+ [cppjieba-server] CppJieba HTTP server.
-+ [phpjieba] Jieba segmentation extension for PHP.
-+ [perl5-jieba] Jieba segmentation extension for Perl.
-+ [jieba-dlang] Deimos bindings of Jieba for D.
+CppJieba has been widely used as the segmentation core of implementations across many languages:

-## Benchmark
+- [GoJieba](https://github.com/yanyiwu/gojieba) - Go version
+- [NodeJieba](https://github.com/yanyiwu/nodejieba) - Node.js version
+- [CJieba](https://github.com/yanyiwu/cjieba) - C version
+- [jiebaR](https://github.com/qinwf/jiebaR) - R version
+- [exjieba](https://github.com/falood/exjieba) - Erlang version
+- [jieba_rb](https://github.com/altkatz/jieba_rb) - Ruby version
+- [iosjieba](https://github.com/yanyiwu/iosjieba) - iOS version
+- [phpjieba](https://github.com/jonnywang/phpjieba) - PHP version
+- [perl5-jieba](https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod) - Perl version

-[Jieba中文分词系列性能评测]
+### Projects Using CppJieba

-## Sponsorship
+- [simhash](https://github.com/yanyiwu/simhash) - similarity computation for Chinese documents
+- [pg_jieba](https://github.com/jaiminpan/pg_jieba) - PostgreSQL segmentation plugin
+- [gitbook-plugin-search-pro](https://plugins.gitbook.com/plugin/search-pro) - Gitbook plugin with Chinese search support
+- [ngx_http_cppjieba_module](https://github.com/yanyiwu/ngx_http_cppjieba_module) - Nginx segmentation plugin

-[](https://tracking.gitads.io/?campaign=gitads&repo=cppjieba&redirect=gitads.io)
+## Contributing

-## Contributors
+Contributions of all kinds are welcome, including but not limited to:

-### Code Contributors
-
-This project exists thanks to all the people who contribute.
-<a href="https://github.com/yanyiwu/cppjieba/graphs/contributors"><img src="https://opencollective.com/cppjieba/contributors.svg?width=890&button=false" /></a>
+- Reporting issues and suggestions
+- Improving the documentation
+- Submitting bug fixes
+- Adding new features

-[GoJieba]:https://github.com/yanyiwu/gojieba
-[CppJieba]:https://github.com/yanyiwu/cppjieba
-[jannson]:https://github.com/jannson
-[cppjiebapy]:https://github.com/jannson/cppjiebapy
-[bung87]:https://github.com/bung87
-[cppjieba-py]:https://github.com/bung87/cppjieba-py
-[cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
-[NodeJieba]:https://github.com/yanyiwu/nodejieba
-[jiebaR]:https://github.com/qinwf/jiebaR
-[simhash]:https://github.com/yanyiwu/simhash
-[代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
-[issue25]:https://github.com/yanyiwu/cppjieba/issues/25
-[exjieba]:https://github.com/falood/exjieba
-[KeywordServer]:https://github.com/yanyiwu/keyword_server
-[ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
-[dict.367W.utf8]:https://github.com/qinwf/BigDict
-[cjieba]:http://github.com/yanyiwu/cjieba
-[jieba_rb]:https://github.com/altkatz/jieba_rb
-[iosjieba]:https://github.com/yanyiwu/iosjieba
-[SqlJieba]:https://github.com/yanyiwu/sqljieba
-[Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
-[pg_jieba]:https://github.com/jaiminpan/pg_jieba
-[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
-[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
-[phpjieba]:https://github.com/jonnywang/phpjieba
-[perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
-[jieba-dlang]:https://github.com/shove70/jieba
-[simple]:https://github.com/wangfenjin/simple
+
+If you find CppJieba helpful, please star ⭐️ the project!
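The README's demo section now points at the separate cppjieba-demo repository. For orientation, here is a minimal sketch of the header-only usage it describes; the dictionary paths are assumptions matching a source checkout's `dict/` directory, not part of this diff:

```cpp
#include <iostream>
#include "cppjieba/Jieba.hpp"

int main() {
  // Header-only: nothing to link against besides the standard library.
  // Paths assume the repository's dict/ directory; adjust as needed.
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8",
                        "dict/idf.utf8",
                        "dict/stop_words.utf8");
  std::vector<std::string> words;
  jieba.Cut("我来自北京邮电大学。", words);  // precise (mix) mode
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;  // 我/来自/北京邮电大学/。/
  return 0;
}
```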
deps/limonp (vendored submodule)
@@ -1 +1 @@
-Subproject commit ac32f1f287f65d5ce0ce295010c88026fae060ee
+Subproject commit 5c82a3f17e4e0adc6a5decfe245054b0ed533d1a
dict/jieba.dict.utf8
@@ -312698,7 +312698,6 @@ T恤 4 n
 部属 1126 n
 部属工作 3 n
 部属院校 3 n
 部手机 33 n
 部族 643 n
 部标 4 n
 部省级 2 n
include/cppjieba/DictTrie.hpp
@@ -1,15 +1,15 @@
 #ifndef CPPJIEBA_DICT_TRIE_HPP
 #define CPPJIEBA_DICT_TRIE_HPP

 #include <iostream>
+#include <algorithm>
 #include <fstream>
 #include <map>
 #include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <limits>
 #include <deque>
 #include <set>
-#include <string>
 #include <unordered_set>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"

@@ -17,8 +17,6 @@

 namespace cppjieba {

-using namespace limonp;
-
 const double MIN_DOUBLE = -3.14e+100;
 const double MAX_DOUBLE = 3.14e+100;
 const size_t DICT_COLUMN_NUM = 3;

@@ -32,7 +30,7 @@ class DictTrie {
     WordWeightMax,
   }; // enum UserWordWeightOption

-  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+  DictTrie(const std::string& dict_path, const std::string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
     Init(dict_path, user_dict_paths, user_word_weight_opt);
   }

@@ -40,7 +38,7 @@ class DictTrie {
     delete trie_;
   }

-  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;

@@ -50,7 +48,7 @@ class DictTrie {
     return true;
   }

-  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const std::string& word,int freq, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
     if (!MakeNodeInfo(node_info, word, weight , tag)) {

@@ -61,7 +59,7 @@ class DictTrie {
     return true;
   }

-  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+  bool DeleteUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;

@@ -76,16 +74,16 @@ class DictTrie {

   void Find(RuneStrArray::const_iterator begin,
       RuneStrArray::const_iterator end,
-      vector<struct Dag>&res,
+      std::vector<struct Dag>&res,
       size_t max_word_len = MAX_WORD_LENGTH) const {
     trie_->Find(begin, end, res, max_word_len);
   }

-  bool Find(const string& word)
+  bool Find(const std::string& word)
   {
     const DictUnit *tmp = NULL;
     RuneStrArray runes;
-    if (!DecodeRunesInString(word, runes))
+    if (!DecodeUTF8RunesInString(word, runes))
     {
       XLOG(ERROR) << "Decode failed.";
     }

@@ -108,10 +106,10 @@ class DictTrie {
     return min_weight_;
   }

-  void InserUserDictNode(const string& line) {
-    vector<string> buf;
+  void InserUserDictNode(const std::string& line) {
+    std::vector<std::string> buf;
     DictUnit node_info;
-    Split(line, buf, " ");
+    limonp::Split(line, buf, " ");
     if(buf.size() == 1){
       MakeNodeInfo(node_info,
           buf[0],

@@ -134,28 +132,27 @@ class DictTrie {
     }
   }

-  void LoadUserDict(const vector<string>& buf) {
+  void LoadUserDict(const std::vector<std::string>& buf) {
     for (size_t i = 0; i < buf.size(); i++) {
       InserUserDictNode(buf[i]);
     }
   }

-  void LoadUserDict(const set<string>& buf) {
-    std::set<string>::const_iterator iter;
+  void LoadUserDict(const std::set<std::string>& buf) {
+    std::set<std::string>::const_iterator iter;
     for (iter = buf.begin(); iter != buf.end(); iter++){
       InserUserDictNode(*iter);
     }
   }

-  void LoadUserDict(const string& filePaths) {
-    vector<string> files = limonp::Split(filePaths, "|;");
-    size_t lineno = 0;
+  void LoadUserDict(const std::string& filePaths) {
+    std::vector<std::string> files = limonp::Split(filePaths, "|;");
     for (size_t i = 0; i < files.size(); i++) {
-      ifstream ifs(files[i].c_str());
+      std::ifstream ifs(files[i].c_str());
       XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
-      string line;
+      std::string line;

-      for (; getline(ifs, line); lineno++) {
+      while(getline(ifs, line)) {
         if (line.size() == 0) {
           continue;
         }

@@ -166,7 +163,7 @@ class DictTrie {

 private:
-  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
+  void Init(const std::string& dict_path, const std::string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
     LoadDict(dict_path);
     freq_sum_ = CalcFreqSum(static_node_infos_);
     CalculateWeight(static_node_infos_, freq_sum_);

@@ -179,10 +176,10 @@ class DictTrie {
     CreateTrie(static_node_infos_);
   }

-  void CreateTrie(const vector<DictUnit>& dictUnits) {
+  void CreateTrie(const std::vector<DictUnit>& dictUnits) {
     assert(dictUnits.size());
-    vector<Unicode> words;
-    vector<const DictUnit*> valuePointers;
+    std::vector<Unicode> words;
+    std::vector<const DictUnit*> valuePointers;
     for (size_t i = 0 ; i < dictUnits.size(); i ++) {
       words.push_back(dictUnits[i].word);
       valuePointers.push_back(&dictUnits[i]);

@@ -191,15 +188,12 @@ class DictTrie {
     trie_ = new Trie(words, valuePointers);
   }

-
-
-
   bool MakeNodeInfo(DictUnit& node_info,
-      const string& word,
+      const std::string& word,
       double weight,
-      const string& tag) {
-    if (!DecodeRunesInString(word, node_info.word)) {
-      XLOG(ERROR) << "Decode " << word << " failed.";
+      const std::string& tag) {
+    if (!DecodeUTF8RunesInString(word, node_info.word)) {
+      XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word;
       return false;
     }
     node_info.weight = weight;

@@ -207,15 +201,15 @@ class DictTrie {
     return true;
   }

-  void LoadDict(const string& filePath) {
-    ifstream ifs(filePath.c_str());
+  void LoadDict(const std::string& filePath) {
+    std::ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
-    string line;
-    vector<string> buf;
+    std::string line;
+    std::vector<std::string> buf;

     DictUnit node_info;
-    for (size_t lineno = 0; getline(ifs, line); lineno++) {
-      Split(line, buf, " ");
+    while (getline(ifs, line)) {
+      limonp::Split(line, buf, " ");
       XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
       MakeNodeInfo(node_info,
           buf[0],

@@ -231,8 +225,8 @@ class DictTrie {

   void SetStaticWordWeights(UserWordWeightOption option) {
     XCHECK(!static_node_infos_.empty());
-    vector<DictUnit> x = static_node_infos_;
-    sort(x.begin(), x.end(), WeightCompare);
+    std::vector<DictUnit> x = static_node_infos_;
+    std::sort(x.begin(), x.end(), WeightCompare);
     min_weight_ = x[0].weight;
     max_weight_ = x[x.size() - 1].weight;
     median_weight_ = x[x.size() / 2].weight;

@@ -249,7 +243,7 @@ class DictTrie {
     }
   }

-  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
+  double CalcFreqSum(const std::vector<DictUnit>& node_infos) const {
     double sum = 0.0;
     for (size_t i = 0; i < node_infos.size(); i++) {
       sum += node_infos[i].weight;

@@ -257,7 +251,7 @@ class DictTrie {
     return sum;
   }

-  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
+  void CalculateWeight(std::vector<DictUnit>& node_infos, double sum) const {
     assert(sum > 0.0);
     for (size_t i = 0; i < node_infos.size(); i++) {
       DictUnit& node_info = node_infos[i];

@@ -266,12 +260,12 @@ class DictTrie {
     }
   }

-  void Shrink(vector<DictUnit>& units) const {
-    vector<DictUnit>(units.begin(), units.end()).swap(units);
+  void Shrink(std::vector<DictUnit>& units) const {
+    std::vector<DictUnit>(units.begin(), units.end()).swap(units);
   }

-  vector<DictUnit> static_node_infos_;
-  deque<DictUnit> active_node_infos_; // must not be vector
+  std::vector<DictUnit> static_node_infos_;
+  std::deque<DictUnit> active_node_infos_; // must not be std::vector
   Trie * trie_;

   double freq_sum_;

@@ -279,7 +273,7 @@ class DictTrie {
   double max_weight_;
   double median_weight_;
   double user_word_default_weight_;
-  unordered_set<Rune> user_dict_single_chinese_word_;
+  std::unordered_set<Rune> user_dict_single_chinese_word_;
 };
 }
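For orientation, a minimal sketch of driving the refactored `DictTrie` API directly; the dictionary paths are assumptions matching the repository's `dict/` layout, not part of the diff:

```cpp
#include <iostream>
#include "cppjieba/DictTrie.hpp"

int main() {
  // Load the main dictionary plus one user dictionary (paths assumed).
  cppjieba::DictTrie trie("dict/jieba.dict.utf8", "dict/user.dict.utf8");

  // Insert a user word at the default weight, tagged "nz", then look it up.
  trie.InsertUserWord("区块链", "nz");
  std::cout << std::boolalpha << trie.Find("区块链") << std::endl;  // true
  return 0;
}
```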
include/cppjieba/HMMModel.hpp
@@ -105,7 +105,7 @@ struct HMMModel {
       XLOG(ERROR) << "emitProb illegal.";
       return false;
     }
-    if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
+    if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) {
       XLOG(ERROR) << "TransCode failed.";
       return false;
     }
include/cppjieba/Jieba.hpp
@@ -8,19 +8,21 @@ namespace cppjieba {

 class Jieba {
  public:
-  Jieba(const string& dict_path,
-        const string& model_path,
-        const string& user_dict_path,
-        const string& idfPath,
-        const string& stopWordPath)
-    : dict_trie_(dict_path, user_dict_path),
-      model_(model_path),
+  Jieba(const string& dict_path = "",
+        const string& model_path = "",
+        const string& user_dict_path = "",
+        const string& idf_path = "",
+        const string& stop_word_path = "")
+    : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")),
+      model_(getPath(model_path, "hmm_model.utf8")),
       mp_seg_(&dict_trie_),
       hmm_seg_(&model_),
       mix_seg_(&dict_trie_, &model_),
       full_seg_(&dict_trie_),
       query_seg_(&dict_trie_, &model_),
-      extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
+      extractor(&dict_trie_, &model_,
+                getPath(idf_path, "idf.utf8"),
+                getPath(stop_word_path, "stop_words.utf8")) {
   }
   ~Jieba() {
   }

@@ -115,6 +117,39 @@ class Jieba {
   }

  private:
+  static string pathJoin(const string& dir, const string& filename) {
+    if (dir.empty()) {
+      return filename;
+    }
+
+    char last_char = dir[dir.length() - 1];
+    if (last_char == '/' || last_char == '\\') {
+      return dir + filename;
+    } else {
+#ifdef _WIN32
+      return dir + '\\' + filename;
+#else
+      return dir + '/' + filename;
+#endif
+    }
+  }
+
+  static string getCurrentDirectory() {
+    string path(__FILE__);
+    size_t pos = path.find_last_of("/\\");
+    return (pos == string::npos) ? "" : path.substr(0, pos);
+  }
+
+  static string getPath(const string& path, const string& default_file) {
+    if (path.empty()) {
+      string current_dir = getCurrentDirectory();
+      string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\"));
+      string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\"));
+      return pathJoin(pathJoin(grandparent_dir, "dict"), default_file);
+    }
+    return path;
+  }
+
   DictTrie dict_trie_;
   HMMModel model_;
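The effect of the new defaulted arguments: a `Jieba` can now be constructed with no paths at all, and `getPath` falls back to the `dict/` directory two levels above the header, resolved from `__FILE__`. A minimal sketch of the new default usage, assuming the checkout layout (and its dictionaries) is intact:

```cpp
#include <iostream>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba;  // all five dictionary paths fall back to <repo>/dict/

  std::vector<std::string> words;
  jieba.Cut("他来到了网易杭研大厦", words);  // mix-mode segmentation
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;  // 他/来到/了/网易/杭研/大厦/
  return 0;
}
```

This is what the new `JiebaTest.Test0` in this changeset exercises: the default-constructed instance must produce the same segmentation as one built from explicit paths.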
include/cppjieba/KeywordExtractor.hpp
@@ -1,37 +1,35 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H

 #include <cmath>
 #include <set>
 #include <algorithm>
 #include <unordered_map>
 #include <unordered_set>
 #include "MixSegment.hpp"

 namespace cppjieba {

-using namespace limonp;
-using namespace std;
-
 /*utf8*/
 class KeywordExtractor {
  public:
   struct Word {
-    string word;
-    vector<size_t> offsets;
+    std::string word;
+    std::vector<size_t> offsets;
     double weight;
   }; // struct Word

-  KeywordExtractor(const string& dictPath,
-        const string& hmmFilePath,
-        const string& idfPath,
-        const string& stopWordPath,
-        const string& userDict = "")
+  KeywordExtractor(const std::string& dictPath,
+        const std::string& hmmFilePath,
+        const std::string& idfPath,
+        const std::string& stopWordPath,
+        const std::string& userDict = "")
     : segment_(dictPath, hmmFilePath, userDict) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);
   }
   KeywordExtractor(const DictTrie* dictTrie,
         const HMMModel* model,
-        const string& idfPath,
-        const string& stopWordPath)
+        const std::string& idfPath,
+        const std::string& stopWordPath)
     : segment_(dictTrie, model) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);

@@ -39,27 +37,27 @@ class KeywordExtractor {
   ~KeywordExtractor() {
   }

-  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-    vector<Word> topWords;
+  void Extract(const std::string& sentence, std::vector<std::string>& keywords, size_t topN) const {
+    std::vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
       keywords.push_back(topWords[i].word);
     }
   }

-  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
-    vector<Word> topWords;
+  void Extract(const std::string& sentence, std::vector<pair<std::string, double> >& keywords, size_t topN) const {
+    std::vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
-      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+      keywords.push_back(pair<std::string, double>(topWords[i].word, topWords[i].weight));
     }
   }

-  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
-    vector<string> words;
+  void Extract(const std::string& sentence, std::vector<Word>& keywords, size_t topN) const {
+    std::vector<std::string> words;
     segment_.Cut(sentence, words);

-    map<string, Word> wordmap;
+    std::map<std::string, Word> wordmap;
     size_t offset = 0;
     for (size_t i = 0; i < words.size(); ++i) {
       size_t t = offset;

@@ -77,8 +75,8 @@ class KeywordExtractor {

     keywords.clear();
     keywords.reserve(wordmap.size());
-    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+    for (std::map<std::string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      std::unordered_map<std::string, double>::const_iterator cit = idfMap_.find(itr->first);
       if (cit != idfMap_.end()) {
         itr->second.weight *= cit->second;
       } else {

@@ -88,15 +86,15 @@ class KeywordExtractor {
       keywords.push_back(itr->second);
     }
     topN = min(topN, keywords.size());
-    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+    std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
     keywords.resize(topN);
   }
  private:
-  void LoadIdfDict(const string& idfPath) {
-    ifstream ifs(idfPath.c_str());
+  void LoadIdfDict(const std::string& idfPath) {
+    std::ifstream ifs(idfPath.c_str());
     XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
-    string line ;
-    vector<string> buf;
+    std::string line ;
+    std::vector<std::string> buf;
     double idf = 0.0;
     double idfSum = 0.0;
     size_t lineno = 0;

@@ -106,7 +104,7 @@ class KeywordExtractor {
       XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
       continue;
     }
-    Split(line, buf, " ");
+    limonp::Split(line, buf, " ");
     if (buf.size() != 2) {
       XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
       continue;

@@ -121,10 +119,10 @@ class KeywordExtractor {
     idfAverage_ = idfSum / lineno;
     assert(idfAverage_ > 0.0);
   }
-  void LoadStopWordDict(const string& filePath) {
-    ifstream ifs(filePath.c_str());
+  void LoadStopWordDict(const std::string& filePath) {
+    std::ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed";
-    string line ;
+    std::string line ;
     while (getline(ifs, line)) {
       stopWords_.insert(line);
     }

@@ -136,18 +134,16 @@ class KeywordExtractor {
   }

   MixSegment segment_;
-  unordered_map<string, double> idfMap_;
+  std::unordered_map<std::string, double> idfMap_;
   double idfAverage_;

-  unordered_set<string> stopWords_;
+  std::unordered_set<std::string> stopWords_;
 }; // class KeywordExtractor

-inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+inline std::ostream& operator << (std::ostream& os, const KeywordExtractor::Word& word) {
   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
 }

 } // namespace cppjieba

 #endif
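A minimal sketch of the extractor whose signatures this hunk qualifies: segment the sentence, weight each word by TF multiplied by its IDF, and keep the top N. The dictionary paths are assumptions matching the repository's `dict/` layout:

```cpp
#include <iostream>
#include "cppjieba/KeywordExtractor.hpp"

int main() {
  // Paths are assumptions matching the repository's dict/ directory.
  cppjieba::KeywordExtractor extractor("dict/jieba.dict.utf8",
                                       "dict/hmm_model.utf8",
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");

  std::vector<cppjieba::KeywordExtractor::Word> keywords;
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的", keywords, 5);  // top-5 by TF-IDF
  for (size_t i = 0; i < keywords.size(); i++) {
    std::cout << keywords[i] << std::endl;  // formatted by the operator<< above
  }
  return 0;
}
```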
include/cppjieba/PosTagger.hpp
@@ -34,8 +34,8 @@ class PosTagger {
     RuneStrArray runes;
     const DictTrie * dict = segment.GetDictTrie();
     assert(dict != NULL);
-    if (!DecodeRunesInString(str, runes)) {
-      XLOG(ERROR) << "Decode failed.";
+    if (!DecodeUTF8RunesInString(str, runes)) {
+      XLOG(ERROR) << "UTF-8 decode failed for word: " << str;
       return POS_X;
     }
     tmp = dict->Find(runes.begin(), runes.end());
include/cppjieba/PreFilter.hpp
@@ -17,8 +17,8 @@ class PreFilter {
   PreFilter(const unordered_set<Rune>& symbols,
         const string& sentence)
     : symbols_(symbols) {
-    if (!DecodeRunesInString(sentence, sentence_)) {
-      XLOG(ERROR) << "decode failed. ";
+    if (!DecodeUTF8RunesInString(sentence, sentence_)) {
+      XLOG(ERROR) << "UTF-8 decode failed for input sentence";
     }
     cursor_ = sentence_.begin();
   }
include/cppjieba/SegmentBase.hpp
@@ -25,8 +25,8 @@ class SegmentBase {
   bool ResetSeparators(const string& s) {
     symbols_.clear();
     RuneStrArray runes;
-    if (!DecodeRunesInString(s, runes)) {
-      XLOG(ERROR) << "decode " << s << " failed";
+    if (!DecodeUTF8RunesInString(s, runes)) {
+      XLOG(ERROR) << "UTF-8 decode failed for separators: " << s;
       return false;
     }
     for (size_t i = 0; i < runes.size(); i++) {
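`ResetSeparators` is how callers customize which runes the pre-filter treats as hard delimiters before segmentation; it is inherited by the concrete segmenters. A short sketch under that assumption (the separator string and paths here are illustrative):

```cpp
#include "cppjieba/MixSegment.hpp"

int main() {
  // Paths are assumptions matching the repository's dict/ layout.
  cppjieba::MixSegment seg("dict/jieba.dict.utf8", "dict/hmm_model.utf8");

  // Treat only space and newline as separators; returns false (and logs
  // through XLOG) if the separator string is not valid UTF-8.
  if (!seg.ResetSeparators(" \n")) {
    return 1;
  }

  std::vector<std::string> words;
  seg.Cut("小明硕士毕业于中国科学院计算所", words);
  return 0;
}
```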
include/cppjieba/Unicode.hpp
@@ -84,7 +84,7 @@ struct RuneStrLite {
   }
 }; // struct RuneStrLite

-inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
+inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
   RuneStrLite rp(0, 0);
   if (str == NULL || len == 0) {
     return rp;

@@ -139,11 +139,11 @@ inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
   return rp;
 }

-inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
+inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
   runes.clear();
   runes.reserve(len / 2);
   for (uint32_t i = 0, j = 0; i < len;) {
-    RuneStrLite rp = DecodeRuneInString(s + i, len - i);
+    RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
     if (rp.len == 0) {
       runes.clear();
       return false;

@@ -156,14 +156,14 @@ inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes)
   return true;
 }

-inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
-  return DecodeRunesInString(s.c_str(), s.size(), runes);
+inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
+  return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
 }

-inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
+inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
   unicode.clear();
   RuneStrArray runes;
-  if (!DecodeRunesInString(s, len, runes)) {
+  if (!DecodeUTF8RunesInString(s, len, runes)) {
     return false;
   }
   unicode.reserve(runes.size());

@@ -174,17 +174,17 @@ inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
 }

 inline bool IsSingleWord(const string& str) {
-  RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
+  RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
   return rp.len == str.size();
 }

-inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
-  return DecodeRunesInString(s.c_str(), s.size(), unicode);
+inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
+  return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
 }

-inline Unicode DecodeRunesInString(const string& s) {
+inline Unicode DecodeUTF8RunesInString(const string& s) {
   Unicode result;
-  DecodeRunesInString(s, result);
+  DecodeUTF8RunesInString(s, result);
   return result;
 }
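The rename makes the encoding explicit at every call site without changing behavior. A minimal sketch of the renamed API; the expected values come from the unit-test expectations later in this changeset:

```cpp
#include <iostream>
#include "cppjieba/Unicode.hpp"

int main() {
  std::string s = "你好世界";
  cppjieba::RuneStrArray runes;

  // Each decoded element carries the code point plus its byte offset and length.
  if (!cppjieba::DecodeUTF8RunesInString(s, runes)) {
    return 1;  // invalid UTF-8 input, e.g. "123\x80", returns false
  }
  for (size_t i = 0; i < runes.size(); i++) {
    std::cout << runes[i].rune << " @" << runes[i].offset
              << " len=" << runes[i].len << std::endl;  // 20320 @0 len=3, ...
  }
  return 0;
}
```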
test/CMakeLists.txt
@@ -1,6 +1,12 @@
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})

-if(NOT MSVC)
-  ADD_EXECUTABLE(load_test load_test.cpp)
-  ADD_SUBDIRECTORY(unittest)
-endif()
+# Configure test paths
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/test_paths.h.in" "${CMAKE_BINARY_DIR}/test/test_paths.h")
+
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_BINARY_DIR}/test
+)
+
+ADD_EXECUTABLE(load_test load_test.cpp)
+ADD_SUBDIRECTORY(unittest)
test/load_test.cpp
@@ -6,14 +6,15 @@
 #include "cppjieba/MixSegment.hpp"
 #include "cppjieba/KeywordExtractor.hpp"
 #include "limonp/Colors.hpp"
+#include "test_paths.h"

 using namespace cppjieba;

 void Cut(size_t times = 50) {
-  MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
+  MixSegment seg(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   vector<string> res;
   string doc;
-  ifstream ifs("../test/testdata/weicheng.utf8");
+  ifstream ifs(TEST_DATA_DIR "/weicheng.utf8");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();

@@ -29,10 +30,13 @@ void Cut(size_t times = 50) {
 }

 void Extract(size_t times = 400) {
-  KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+  KeywordExtractor Extractor(DICT_DIR "/jieba.dict.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8");
   vector<string> words;
   string doc;
-  ifstream ifs("../test/testdata/review.100");
+  ifstream ifs(TEST_DATA_DIR "/review.100");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();
test/test_paths.h.in (new file, +7)
@@ -0,0 +1,7 @@
+#ifndef TEST_PATHS_H
+#define TEST_PATHS_H
+
+#define TEST_DATA_DIR "@CMAKE_CURRENT_SOURCE_DIR@/testdata"
+#define DICT_DIR "@CMAKE_SOURCE_DIR@/dict"
+
+#endif // TEST_PATHS_H
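CMake's `configure_file` substitutes the `@...@` placeholders at configure time, so the generated `test_paths.h` contains absolute paths and the tests no longer depend on the working directory (which is what the `../dict/...` paths they replace assumed). The full path is then assembled by compile-time string-literal concatenation; a small sketch:

```cpp
#include <iostream>
#include "test_paths.h"  // generated into ${CMAKE_BINARY_DIR}/test by configure_file

int main() {
  // Adjacent string literals are merged by the compiler, so this is a single
  // constant, e.g. "/home/user/cppjieba/dict/jieba.dict.utf8" (path illustrative).
  const char* dict = DICT_DIR "/jieba.dict.utf8";
  std::cout << dict << std::endl;
  return 0;
}
```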
test/unittest/CMakeLists.txt
@@ -1,6 +1,8 @@
+message(STATUS "MSVC value: ${MSVC}")
 if (MSVC)
   set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
   set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  add_compile_options(/utf-8)
 endif()

 include(FetchContent)

@@ -8,7 +10,7 @@ include(FetchContent)
 FetchContent_Declare(
   googletest
   GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG release-1.11.0
+  GIT_TAG release-1.12.1
 )
 FetchContent_MakeAvailable(googletest)

@@ -18,6 +20,12 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

 ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)

+# Add include directories
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_BINARY_DIR}/test
+)
+
 ADD_EXECUTABLE(test.run
   gtest_main.cpp
   keyword_extractor_test.cpp
test/unittest/jieba_test.cpp
@@ -1,14 +1,11 @@
 #include "cppjieba/Jieba.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

-TEST(JiebaTest, Test1) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+TEST(JiebaTest, Test0) {
+  cppjieba::Jieba jieba;
   vector<string> words;
   string result;

@@ -38,14 +35,51 @@ TEST(JiebaTest, Test1) {
   jieba.CutForSearch("他来到了网易杭研大厦", words);
   result << words;
   ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-
 }

+TEST(JiebaTest, Test1) {
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
+  vector<string> words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
+  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+
+  jieba.CutAll("我来自北京邮电大学", words);
+  result << words;
+  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
+
+  jieba.CutForSearch("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+}
+
 TEST(JiebaTest, WordTest) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
   vector<Word> words;
   string result;

@@ -83,11 +117,11 @@ TEST(JiebaTest, WordTest) {
 }

 TEST(JiebaTest, InsertUserWord) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
   vector<string> words;
   string result;
test/unittest/keyword_extractor_test.cpp
@@ -1,10 +1,14 @@
 #include "cppjieba/KeywordExtractor.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(KeywordExtractorTest, Test1) {
-  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8");

   {
     string s("你好世界世界而且而且");

@@ -55,7 +59,11 @@ TEST(KeywordExtractorTest, Test1) {
 }

 TEST(KeywordExtractorTest, Test2) {
-  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
+  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8",
+                             TEST_DATA_DIR "/userdict.utf8");

   {
     string s("蓝翔优秀毕业生");
test/unittest/pos_tagger_test.cpp
@@ -1,5 +1,6 @@
 #include "cppjieba/MixSegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

@@ -13,7 +14,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
 //static const char * const ANS_TEST3 = "";

 TEST(PosTaggerTest, Test) {
-  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
+  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   {
     vector<pair<string, string> > res;
     tagger.Tag(QUERY_TEST1, res);

@@ -23,7 +24,7 @@ TEST(PosTaggerTest, Test) {
   }
 }
 TEST(PosTagger, TestUserDict) {
-  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
+  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", TEST_DATA_DIR "/userdict.utf8");
   {
     vector<pair<string, string> > res;
     tagger.Tag(QUERY_TEST2, res);
test/unittest/segments_test.cpp
@@ -5,11 +5,12 @@
 #include "cppjieba/FullSegment.hpp"
 #include "cppjieba/QuerySegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(MixSegmentTest, Test1) {
-  MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
+  MixSegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   string sentence;
   vector<string> words;
   string actual;

@@ -49,16 +50,18 @@ TEST(MixSegmentTest, Test1) {
 }

 TEST(MixSegmentTest, NoUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8");
   const char* str = "令狐冲是云计算方面的专家";
   vector<string> words;
   segment.Cut(str, words);
   string res;
   ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
-
 }

 TEST(MixSegmentTest, UserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                     DICT_DIR "/hmm_model.utf8",
+                     DICT_DIR "/user.dict.utf8");
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;

@@ -83,9 +86,10 @@ TEST(MixSegmentTest, UserDict) {
     ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
   }
 }

 TEST(MixSegmentTest, TestUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
-                     "../test/testdata/userdict.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
+                     TEST_DATA_DIR "/userdict.utf8");
   vector<string> words;
   string res;

@@ -123,8 +127,8 @@ TEST(MixSegmentTest, TestUserDict) {
 }

 TEST(MixSegmentTest, TestMultiUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
-                     "../test/testdata/userdict.utf8;../test/testdata/userdict.2.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
+                     TEST_DATA_DIR "/userdict.utf8;" TEST_DATA_DIR "/userdict.2.utf8");
   vector<string> words;
   string res;

@@ -134,7 +138,7 @@ TEST(MixSegmentTest, TestMultiUserDict) {
 }

 TEST(MPSegmentTest, Test1) {
-  MPSegment segment("../dict/jieba.dict.utf8");;
+  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
   string s;
   vector<string> words;
   segment.Cut("我来自北京邮电大学。", words);

@@ -163,7 +167,7 @@ TEST(MPSegmentTest, Test1) {
 }

 TEST(HMMSegmentTest, Test1) {
-  HMMSegment segment("../dict/hmm_model.utf8");;
+  HMMSegment segment(DICT_DIR "/hmm_model.utf8");
   {
     const char* str = "我来自北京邮电大学。。。学号123456";
     const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};

@@ -182,7 +186,7 @@ TEST(HMMSegmentTest, Test1) {
 }

 TEST(FullSegment, Test1) {
-  FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
+  FullSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8");
   vector<string> words;
   string s;

@@ -197,7 +201,7 @@ TEST(FullSegment, Test1) {
 }

 TEST(QuerySegment, Test1) {
-  QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
+  QuerySegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", "");
   vector<string> words;
   string s1, s2;

@@ -218,7 +222,9 @@ TEST(QuerySegment, Test1) {
 }

 TEST(QuerySegment, Test2) {
-  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
+  QuerySegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                       DICT_DIR "/hmm_model.utf8",
+                       TEST_DATA_DIR "/userdict.utf8|" TEST_DATA_DIR "/userdict.english");
   vector<string> words;
   string s1, s2;

@@ -242,14 +248,13 @@ TEST(QuerySegment, Test2) {
     s2 = "中国/科学/学院/科学院/中国科学院";
     ASSERT_EQ(s1, s2);
   }
-
 }

 TEST(MPSegmentTest, Unicode32) {
   string s("天气很好,🙋 我们去郊游。");
   vector<string> words;

-  MPSegment segment("../dict/jieba.dict.utf8");;
+  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
   segment.Cut(s, words);

   ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");
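The multi-dictionary tests above rely on the separator handling in `DictTrie::LoadUserDict`, which splits the user-dictionary argument on '|' or ';'. A minimal sketch outside the test harness, with illustrative paths:

```cpp
#include "cppjieba/MixSegment.hpp"

int main() {
  // Two user dictionaries in a single argument, ';'-separated ('|' also works).
  cppjieba::MixSegment seg("dict/jieba.dict.utf8",
                           "dict/hmm_model.utf8",
                           "test/testdata/userdict.utf8;test/testdata/userdict.2.utf8");
  std::vector<std::string> words;
  seg.Cut("令狐冲是云计算方面的专家", words);  // "云计算" stays whole via the user dict
  return 0;
}
```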
test/unittest/textrank_test.cpp
@@ -1,13 +1,14 @@
 #include "cppjieba/TextRankExtractor.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(TextRankExtractorTest, Test1) {
   TextRankExtractor Extractor(
-      "../test/testdata/extra_dict/jieba.dict.small.utf8",
-      "../dict/hmm_model.utf8",
-      "../dict/stop_words.utf8");
+      TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+      DICT_DIR "/hmm_model.utf8",
+      DICT_DIR "/stop_words.utf8");
   {
     string s("你好世界世界而且而且");
     string res;

@@ -59,10 +60,10 @@ TEST(TextRankExtractorTest, Test1) {

 TEST(TextRankExtractorTest, Test2) {
   TextRankExtractor Extractor(
-      "../test/testdata/extra_dict/jieba.dict.small.utf8",
-      "../dict/hmm_model.utf8",
-      "../dict/stop_words.utf8",
-      "../test/testdata/userdict.utf8");
+      TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+      DICT_DIR "/hmm_model.utf8",
+      DICT_DIR "/stop_words.utf8",
+      TEST_DATA_DIR "/userdict.utf8");

   {
     string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
test/unittest/trie_test.cpp
@@ -1,10 +1,11 @@
 #include "cppjieba/DictTrie.hpp"
 #include "cppjieba/MPSegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

-static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
+static const char* const DICT_FILE = TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8";

 TEST(TrieTest, Empty) {
   vector<Unicode> keys;

@@ -15,7 +16,7 @@ TEST(TrieTest, Empty) {
 TEST(TrieTest, Construct) {
   vector<Unicode> keys;
   vector<const DictUnit*> values;
-  keys.push_back(DecodeRunesInString("你"));
+  keys.push_back(DecodeUTF8RunesInString("你"));
   values.push_back((const DictUnit*)(NULL));
   Trie trie(keys, values);
 }

@@ -32,13 +33,7 @@ TEST(DictTrieTest, Test1) {
   ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
   string word("来到");
   cppjieba::RuneStrArray uni;
-  ASSERT_TRUE(DecodeRunesInString(word, uni));
-  //DictUnit nodeInfo;
-  //nodeInfo.word = uni;
-  //nodeInfo.tag = "v";
-  //nodeInfo.weight = -8.87033;
-  //s1 << nodeInfo;
-  //s2 << (*trie.Find(uni.begin(), uni.end()));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
   const DictUnit* du = trie.Find(uni.begin(), uni.end());
   ASSERT_TRUE(du != NULL);
   ASSERT_EQ(2u, du->word.size());

@@ -47,45 +42,42 @@ TEST(DictTrieTest, Test1) {
   ASSERT_EQ("v", du->tag);
   ASSERT_NEAR(-8.870, du->weight, 0.001);

-  //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
   word = "清华大学";
   LocalVector<pair<size_t, const DictUnit*> > res;
   const char * words[] = {"清", "清华", "清华大学"};
   for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
-    ASSERT_TRUE(DecodeRunesInString(words[i], uni));
+    ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
     res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
-    //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
   }
   vector<pair<size_t, const DictUnit*> > vec;
   vector<struct Dag> dags;
-  ASSERT_TRUE(DecodeRunesInString(word, uni));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
   trie.Find(uni.begin(), uni.end(), dags);
   ASSERT_EQ(dags.size(), uni.size());
   ASSERT_NE(dags.size(), 0u);
   s1 << res;
   s2 << dags[0].nexts;
   ASSERT_EQ(s1, s2);
-
 }

 TEST(DictTrieTest, UserDict) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");
   string word = "云计算";
   cppjieba::RuneStrArray unicode;
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_NEAR(unit->weight, -14.100, 0.001);

   word = "蓝翔";
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_EQ(unit->tag, "nz");
   ASSERT_NEAR(unit->weight, -14.100, 0.001);

   word = "区块链";
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_EQ(unit->tag, "nz");

@@ -93,22 +85,22 @@ TEST(DictTrieTest, UserDict) {
 }

 TEST(DictTrieTest, UserDictWithMaxWeight) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8", DictTrie::WordWeightMax);
   string word = "云计算";
   cppjieba::RuneStrArray unicode;
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit);
   ASSERT_NEAR(unit->weight, -2.975, 0.001);
 }

 TEST(DictTrieTest, Dag) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");

   {
     string word = "清华大学";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -122,7 +114,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "北京邮电大学";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -136,7 +128,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -150,7 +142,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res, 3);

@@ -164,7 +156,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res, 4);
test/unittest/unicode_test.cpp
@@ -8,7 +8,7 @@ using namespace std;
 TEST(UnicodeTest, Test1) {
   string s = "你好世界";
   RuneStrArray runes;
-  ASSERT_TRUE(DecodeRunesInString(s, runes));
+  ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
   string actual;
   string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
   actual << runes;

@@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) {
 TEST(UnicodeTest, Illegal) {
   string s = "123\x80";
   RuneStrArray runes;
-  ASSERT_FALSE(DecodeRunesInString(s, runes));
+  ASSERT_FALSE(DecodeUTF8RunesInString(s, runes));
   string actual;
   string expected = "[]";
   actual << runes;

@@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) {
     s[rand() % len] = rand();
   }
   RuneStrArray runes;
-  DecodeRunesInString(s, runes);
+  DecodeUTF8RunesInString(s, runes);
 }
 }