Compare commits


No commits in common. "master" and "v5.4.1" have entirely different histories.

21 changed files with 259 additions and 291 deletions

.github/workflows/cmake-windows-arm64.yml

@@ -1,40 +0,0 @@
-name: CMake Windows ARM64
-
-on:
-  push:
-  pull_request:
-  workflow_dispatch:
-
-env:
-  BUILD_TYPE: Release
-
-jobs:
-  build-windows-arm64:
-    runs-on: windows-2022
-    strategy:
-      matrix:
-        cpp_version: [11, 14, 17, 20]
-    steps:
-    - name: Check out repository code
-      uses: actions/checkout@v2
-      with:
-        submodules: recursive
-    - name: Configure CMake
-      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
-      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      # run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-      run: cmake -B ${{github.workspace}}/build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=${{matrix.cpp_version}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-    - name: Build
-      # Build your program with the given configuration
-      # run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
-      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
-    - name: Test
-      working-directory: ${{github.workspace}}/build
-      # Execute tests defined by the CMake configuration.
-      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest -C ${{env.BUILD_TYPE}} --verbose
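The deleted workflow exercised the same sources under C++11 through C++20 via `-DCMAKE_CXX_STANDARD=${{matrix.cpp_version}}`. As an illustrative sketch (not code from this repository), a translation unit can guard the minimum standard such a matrix assumes:

```cpp
// Illustrative only, not repository code: fail fast below C++11.
// Caveat: MSVC reports __cplusplus as 199711L unless /Zc:__cplusplus is set,
// so the guard is skipped for MSVC here.
#if !defined(_MSC_VER) && __cplusplus < 201103L
#error "cppjieba requires at least C++11"
#endif
```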

.github/workflows/cmake.yml

@@ -16,15 +16,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [
-          ubuntu-22.04,
-          ubuntu-latest,
+        os: [
+          ubuntu-20.04,
+          ubuntu-22.04,
           macos-13,
           macos-14,
           macos-latest,
           windows-2019,
           windows-2022,
           windows-latest,
         ]
         cpp_version: [11, 14, 17, 20]

.github/workflows/stale-issues.yml

@@ -2,8 +2,7 @@ name: Close Stale Issues
 on:
   schedule:
-    - cron: '0 0 3 */3 *' # Every three months on the 3rd day at midnight
+    - cron: '0 0 3 * *' # Monthly on the 3rd day of the month at midnight
 jobs:
   stale:

.gitignore (vendored)

@@ -15,5 +15,3 @@ tmp
 t.*
 *.pid
 build
-Testing/Temporary/CTestCostData.txt
-Testing/Temporary/LastTest.log

CHANGELOG.md

@@ -1,13 +1,5 @@
 # CHANGELOG

-## v5.5.0
-
-+ feat: add Windows ARM64 build support
-+ build: upgrade googletest from 1.11.0 to 1.12.1
-+ build: update CMake minimum version requirement to 3.10
-+ fix: make namespaces explicit and fix missing includes
-+ ci: update stale-issues workflow configuration

 ## v5.4.0

 + unittest: class Jiaba add default argument input
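The removed `fix: make namespaces explicit` entry is what drives the many `std::` changes in the header diffs below. A minimal sketch of the idea (illustrative, not repository code):

```cpp
// Illustrative sketch: a header should not depend on a using-directive
// (e.g. `using namespace std;`) leaking in from one of its includes.
#include <string>
#include <vector>

// Before (v5.4.1 style): `vector<string> buf;` compiles only because a
// using-directive happens to be in scope. After (master style): qualify.
std::vector<std::string> buf;
```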

CMakeLists.txt

@@ -1,9 +1,13 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.5)
 PROJECT(CPPJIEBA)

 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/limonp/include
   ${PROJECT_SOURCE_DIR}/include)

+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set (CMAKE_INSTALL_PREFIX "/usr/local/cppjieba" CACHE PATH "default install path" FORCE )
+endif()
+
 if(NOT DEFINED CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 11)
 endif()
@@ -21,11 +25,18 @@ if(NOT DEFINED CPPJIEBA_TOP_LEVEL_PROJECT)
   endif()
 endif()

 if(CPPJIEBA_TOP_LEVEL_PROJECT)
-  option(CPPJIEBA_BUILD_TESTS "Build cppjieba tests" ${CPPJIEBA_TOP_LEVEL_PROJECT})
-  if(CPPJIEBA_BUILD_TESTS)
   ENABLE_TESTING()
   message(STATUS "MSVC value: ${MSVC}")
   ADD_SUBDIRECTORY(test)
-    ADD_TEST(NAME ./test/test.run COMMAND ./test/test.run)
-    ADD_TEST(NAME ./load_test COMMAND ./load_test)
-  endif()
+  if(NOT MSVC)
+    ADD_TEST(NAME ./test/test.run COMMAND ./test/test.run)
+    ADD_TEST(NAME ./load_test COMMAND ./load_test)
+  endif()
 endif()
# ... other CMake configurations ...

README.md

@@ -10,25 +10,22 @@
 CppJieba is the C++ version of the "Jieba" Chinese word segmentation library.

-### Key Features
+## Features

-- 🚀 High performance: stability and performance proven in production environments
-- 📦 Easy integration: source code is provided as header files (`include/cppjieba/*.hpp`); just include and use
-- 🔍 Multiple segmentation modes: precise mode, full mode, search-engine mode, and more
-- 📚 Custom dictionaries: user-defined dictionaries are supported, with multiple dictionary paths separated by '|' or ';'
-- 💻 Cross-platform: supports Linux, macOS, and Windows
-- 🌈 UTF-8: native support for UTF-8 encoded Chinese text
++ All source code is written into the header files `include/cppjieba/*.hpp`; just `include` and use.
++ Supports `utf8` encoding.
++ Ships with fairly complete unit tests; the stability of the core Chinese segmentation (utf8) has been verified in production environments.
++ Supports loading custom user dictionaries, with multiple paths separated by '|' or ';'.
++ Supports the `Linux`, `Mac OSX`, and `Windows` operating systems.

-## Quick Start
+## Usage

-### Requirements
+### Dependencies

-- C++ compiler:
-  - g++ (version 4.1 or later recommended)
-  - or clang++
-- cmake (version 2.6 or later recommended)
+* `g++ (version >= 4.1 is recommended) or clang++`;
+* `cmake (version >= 2.6 is recommended)`;

-### Installation
+### Download and Compile
```sh
git clone https://github.com/yanyiwu/cppjieba.git
@@ -39,11 +36,15 @@ mkdir build
 cd build
 cmake ..
 make
 ```

+If you are interested, you can also run the tests (optional):
+```
+make test
+```

-## Usage Example
+## Demo
```
./demo
@@ -209,37 +210,71 @@ For more details, please see [demo](https://github.com/yanyiwu/cppjieba-demo).
 + [dict.367W.utf8] iLife(562193561 at qq.com)

-## Ecosystem
+## Applications

-CppJieba has been widely used in word-segmentation implementations across many programming languages:
-
-- [GoJieba](https://github.com/yanyiwu/gojieba) - Go version
-- [NodeJieba](https://github.com/yanyiwu/nodejieba) - Node.js version
-- [CJieba](https://github.com/yanyiwu/cjieba) - C version
-- [jiebaR](https://github.com/qinwf/jiebaR) - R version
-- [exjieba](https://github.com/falood/exjieba) - Erlang version
-- [jieba_rb](https://github.com/altkatz/jieba_rb) - Ruby version
-- [iosjieba](https://github.com/yanyiwu/iosjieba) - iOS version
-- [phpjieba](https://github.com/jonnywang/phpjieba) - PHP version
-- [perl5-jieba](https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod) - Perl version
-
-### Application Projects
-
-- [simhash](https://github.com/yanyiwu/simhash) - Chinese document similarity computation
-- [pg_jieba](https://github.com/jaiminpan/pg_jieba) - PostgreSQL segmentation plugin
-- [gitbook-plugin-search-pro](https://plugins.gitbook.com/plugin/search-pro) - Gitbook plugin for Chinese search
-- [ngx_http_cppjieba_module](https://github.com/yanyiwu/ngx_http_cppjieba_module) - Nginx segmentation plugin
-
-## Contributing
-
-We welcome contributions of all kinds, including but not limited to:
-
-- Filing issues and suggestions
-- Improving documentation
-- Submitting code fixes
-- Adding new features
-
-If you find CppJieba helpful, a star ⭐️ is appreciated!

++ [GoJieba] Go version of Jieba Chinese word segmentation.
++ [NodeJieba] Node.js version of Jieba Chinese word segmentation.
++ [simhash] Similarity computation for Chinese documents.
++ [exjieba] Erlang version of Jieba Chinese word segmentation.
++ [jiebaR] R version of Jieba Chinese word segmentation.
++ [cjieba] C version of Jieba word segmentation.
++ [jieba_rb] Ruby version of Jieba word segmentation.
++ [iosjieba] iOS version of Jieba word segmentation.
++ [SqlJieba] Jieba plugin for MySQL full-text indexing.
++ [pg_jieba] Word segmentation plugin for PostgreSQL.
++ [simple] Word segmentation plugin for SQLite3 FTS5.
++ [gitbook-plugin-search-pro] Gitbook plugin supporting Chinese search.
++ [ngx_http_cppjieba_module] Word segmentation plugin for Nginx.
++ [cppjiebapy] Project by [jannson] exposing cppjieba to Python modules; related discussion in [cppjiebapy_discussion].
++ [cppjieba-py] Python module by [bung87] wrapped with pybind11; close to the original jieba in usage.
++ [KeywordServer] A Chinese keyword-extraction service built in 50 lines.
++ [cppjieba-server] CppJieba HTTP server.
++ [phpjieba] PHP extension of Jieba word segmentation.
++ [perl5-jieba] Perl extension of Jieba word segmentation.
++ [jieba-dlang] Deimos bindings of Jieba word segmentation for D.
+
+## Performance Evaluation
+
+[Jieba中文分词系列性能评测]
+
+## Sponsorship
+
+[![sponsorship](http://images.gitads.io/cppjieba)](https://tracking.gitads.io/?campaign=gitads&repo=cppjieba&redirect=gitads.io)
+
+## Contributors
+
+### Code Contributors
+This project exists thanks to all the people who contribute.
+<a href="https://github.com/yanyiwu/cppjieba/graphs/contributors"><img src="https://opencollective.com/cppjieba/contributors.svg?width=890&button=false" /></a>
+
+[GoJieba]:https://github.com/yanyiwu/gojieba
+[CppJieba]:https://github.com/yanyiwu/cppjieba
+[jannson]:https://github.com/jannson
+[cppjiebapy]:https://github.com/jannson/cppjiebapy
+[bung87]:https://github.com/bung87
+[cppjieba-py]:https://github.com/bung87/cppjieba-py
+[cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
+[NodeJieba]:https://github.com/yanyiwu/nodejieba
+[jiebaR]:https://github.com/qinwf/jiebaR
+[simhash]:https://github.com/yanyiwu/simhash
+[代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
+[issue25]:https://github.com/yanyiwu/cppjieba/issues/25
+[exjieba]:https://github.com/falood/exjieba
+[KeywordServer]:https://github.com/yanyiwu/keyword_server
+[ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
+[dict.367W.utf8]:https://github.com/qinwf/BigDict
+[cjieba]:http://github.com/yanyiwu/cjieba
+[jieba_rb]:https://github.com/altkatz/jieba_rb
+[iosjieba]:https://github.com/yanyiwu/iosjieba
+[SqlJieba]:https://github.com/yanyiwu/sqljieba
+[Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
+[pg_jieba]:https://github.com/jaiminpan/pg_jieba
+[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
+[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
+[phpjieba]:https://github.com/jonnywang/phpjieba
+[perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
+[jieba-dlang]:https://github.com/shove70/jieba
+[simple]:https://github.com/wangfenjin/simple
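Putting the README sections above together, here is a minimal usage sketch mirroring the constructor calls in `test/unittest/jieba_test.cpp` later in this diff (the dictionary paths are assumptions that presume running from the repository root):

```cpp
// Minimal usage sketch; paths are assumptions based on the repo layout.
#include <iostream>
#include <string>
#include <vector>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8",
                        "dict/idf.utf8",
                        "dict/stop_words.utf8");
  std::vector<std::string> words;
  jieba.Cut("他来到了网易杭研大厦", words, true);  // mix segmentation, HMM enabled
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;
  return 0;
}
```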

deps/limonp (vendored submodule)

@@ -1 +1 @@
-Subproject commit 5c82a3f17e4e0adc6a5decfe245054b0ed533d1a
+Subproject commit ac32f1f287f65d5ce0ce295010c88026fae060ee

dict/jieba.dict.utf8

@@ -312698,6 +312698,7 @@ T恤 4 n
 部属 1126 n
 部属工作 3 n
 部属院校 3 n
+部手机 33 n
 部族 643 n
 部标 4 n
 部省级 2 n

include/cppjieba/DictTrie.hpp

@@ -1,15 +1,15 @@
 #ifndef CPPJIEBA_DICT_TRIE_HPP
 #define CPPJIEBA_DICT_TRIE_HPP

-#include <algorithm>
 #include <iostream>
 #include <fstream>
 #include <map>
-#include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <deque>
 #include <set>
+#include <string>
 #include <unordered_set>
 #include <limits>

 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"
@@ -17,6 +17,8 @@
 namespace cppjieba {

+using namespace limonp;
+
 const double MIN_DOUBLE = -3.14e+100;
 const double MAX_DOUBLE = 3.14e+100;
 const size_t DICT_COLUMN_NUM = 3;
@@ -30,7 +32,7 @@ class DictTrie {
     WordWeightMax,
   }; // enum UserWordWeightOption

-  DictTrie(const std::string& dict_path, const std::string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
     Init(dict_path, user_dict_paths, user_word_weight_opt);
   }
@@ -38,7 +40,7 @@ class DictTrie {
   ~DictTrie() {
     delete trie_;
   }

-  bool InsertUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;
@@ -48,7 +50,7 @@ class DictTrie {
     return true;
   }

-  bool InsertUserWord(const std::string& word,int freq, const std::string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
     if (!MakeNodeInfo(node_info, word, weight , tag)) {
@@ -59,7 +61,7 @@ class DictTrie {
     return true;
   }

-  bool DeleteUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
+  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;
@@ -67,19 +69,19 @@ class DictTrie {
     trie_->DeleteNode(node_info.word, &node_info);
     return true;
   }

   const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
     return trie_->Find(begin, end);
   }

-  void Find(RuneStrArray::const_iterator begin,
-            RuneStrArray::const_iterator end,
-            std::vector<struct Dag>&res,
+  void Find(RuneStrArray::const_iterator begin,
+            RuneStrArray::const_iterator end,
+            vector<struct Dag>&res,
             size_t max_word_len = MAX_WORD_LENGTH) const {
     trie_->Find(begin, end, res, max_word_len);
   }

-  bool Find(const std::string& word)
+  bool Find(const string& word)
   {
     const DictUnit *tmp = NULL;
     RuneStrArray runes;
@@ -106,18 +108,18 @@ class DictTrie {
     return min_weight_;
   }

-  void InserUserDictNode(const std::string& line) {
-    std::vector<std::string> buf;
+  void InserUserDictNode(const string& line) {
+    vector<string> buf;
     DictUnit node_info;
-    limonp::Split(line, buf, " ");
+    Split(line, buf, " ");
     if(buf.size() == 1){
-      MakeNodeInfo(node_info,
-                   buf[0],
+      MakeNodeInfo(node_info,
+                   buf[0],
                    user_word_default_weight_,
                    UNKNOWN_TAG);
     } else if (buf.size() == 2) {
-      MakeNodeInfo(node_info,
-                   buf[0],
+      MakeNodeInfo(node_info,
+                   buf[0],
                    user_word_default_weight_,
                    buf[1]);
     } else if (buf.size() == 3) {
@@ -131,27 +133,27 @@ class DictTrie {
       user_dict_single_chinese_word_.insert(node_info.word[0]);
     }
   }

-  void LoadUserDict(const std::vector<std::string>& buf) {
+  void LoadUserDict(const vector<string>& buf) {
     for (size_t i = 0; i < buf.size(); i++) {
       InserUserDictNode(buf[i]);
     }
   }

-  void LoadUserDict(const std::set<std::string>& buf) {
-    std::set<std::string>::const_iterator iter;
+  void LoadUserDict(const set<string>& buf) {
+    std::set<string>::const_iterator iter;
     for (iter = buf.begin(); iter != buf.end(); iter++){
       InserUserDictNode(*iter);
     }
   }

-  void LoadUserDict(const std::string& filePaths) {
-    std::vector<std::string> files = limonp::Split(filePaths, "|;");
+  void LoadUserDict(const string& filePaths) {
+    vector<string> files = limonp::Split(filePaths, "|;");
     for (size_t i = 0; i < files.size(); i++) {
-      std::ifstream ifs(files[i].c_str());
-      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
-      std::string line;
+      ifstream ifs(files[i].c_str());
+      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
+      string line;
       while(getline(ifs, line)) {
         if (line.size() == 0) {
           continue;
@@ -163,7 +165,7 @@ class DictTrie {
 private:
-  void Init(const std::string& dict_path, const std::string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
+  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
     LoadDict(dict_path);
     freq_sum_ = CalcFreqSum(static_node_infos_);
     CalculateWeight(static_node_infos_, freq_sum_);
@@ -175,11 +177,11 @@ class DictTrie {
     Shrink(static_node_infos_);
     CreateTrie(static_node_infos_);
   }

-  void CreateTrie(const std::vector<DictUnit>& dictUnits) {
+  void CreateTrie(const vector<DictUnit>& dictUnits) {
     assert(dictUnits.size());
-    std::vector<Unicode> words;
-    std::vector<const DictUnit*> valuePointers;
+    vector<Unicode> words;
+    vector<const DictUnit*> valuePointers;
     for (size_t i = 0 ; i < dictUnits.size(); i ++) {
       words.push_back(dictUnits[i].word);
       valuePointers.push_back(&dictUnits[i]);
@@ -188,10 +190,13 @@ class DictTrie {
     trie_ = new Trie(words, valuePointers);
   }

   bool MakeNodeInfo(DictUnit& node_info,
-                    const std::string& word,
-                    double weight,
-                    const std::string& tag) {
+                    const string& word,
+                    double weight,
+                    const string& tag) {
     if (!DecodeUTF8RunesInString(word, node_info.word)) {
       XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word;
       return false;
@@ -201,19 +206,19 @@ class DictTrie {
     return true;
   }

-  void LoadDict(const std::string& filePath) {
-    std::ifstream ifs(filePath.c_str());
+  void LoadDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
-    std::string line;
-    std::vector<std::string> buf;
+    string line;
+    vector<string> buf;

     DictUnit node_info;
     while (getline(ifs, line)) {
-      limonp::Split(line, buf, " ");
+      Split(line, buf, " ");
       XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
-      MakeNodeInfo(node_info,
-                   buf[0],
-                   atof(buf[1].c_str()),
+      MakeNodeInfo(node_info,
+                   buf[0],
+                   atof(buf[1].c_str()),
                    buf[2]);
       static_node_infos_.push_back(node_info);
     }
@@ -225,8 +230,8 @@ class DictTrie {
   void SetStaticWordWeights(UserWordWeightOption option) {
     XCHECK(!static_node_infos_.empty());
-    std::vector<DictUnit> x = static_node_infos_;
-    std::sort(x.begin(), x.end(), WeightCompare);
+    vector<DictUnit> x = static_node_infos_;
+    sort(x.begin(), x.end(), WeightCompare);
     min_weight_ = x[0].weight;
     max_weight_ = x[x.size() - 1].weight;
     median_weight_ = x[x.size() / 2].weight;
@@ -243,7 +248,7 @@ class DictTrie {
     }
   }

-  double CalcFreqSum(const std::vector<DictUnit>& node_infos) const {
+  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
     double sum = 0.0;
     for (size_t i = 0; i < node_infos.size(); i++) {
       sum += node_infos[i].weight;
@@ -251,7 +256,7 @@ class DictTrie {
     return sum;
   }

-  void CalculateWeight(std::vector<DictUnit>& node_infos, double sum) const {
+  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
     assert(sum > 0.0);
     for (size_t i = 0; i < node_infos.size(); i++) {
       DictUnit& node_info = node_infos[i];
@@ -260,12 +265,12 @@ class DictTrie {
     }
   }

-  void Shrink(std::vector<DictUnit>& units) const {
-    std::vector<DictUnit>(units.begin(), units.end()).swap(units);
+  void Shrink(vector<DictUnit>& units) const {
+    vector<DictUnit>(units.begin(), units.end()).swap(units);
   }

-  std::vector<DictUnit> static_node_infos_;
-  std::deque<DictUnit> active_node_infos_; // must not be std::vector
+  vector<DictUnit> static_node_infos_;
+  deque<DictUnit> active_node_infos_; // must not be vector

   Trie * trie_;

   double freq_sum_;
@@ -273,7 +278,7 @@ class DictTrie {
   double max_weight_;
   double median_weight_;
   double user_word_default_weight_;
-  std::unordered_set<Rune> user_dict_single_chinese_word_;
+  unordered_set<Rune> user_dict_single_chinese_word_;
 };
}
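For context, a short sketch of the `DictTrie` API touched above; note that `LoadUserDict` splits `user_dict_paths` on '|' or ';', and `InsertUserWord(word, freq, tag)` computes its weight as `log(freq / freq_sum_)`. Paths below are assumptions:

```cpp
// Usage sketch under assumptions: run from the repository root.
#include "cppjieba/DictTrie.hpp"

int main() {
  cppjieba::DictTrie trie("dict/jieba.dict.utf8", "dict/user.dict.utf8");
  trie.InsertUserWord("云计算", "n");       // falls back to user_word_default_weight_
  trie.InsertUserWord("区块链", 100, "n");  // weight = log(1.0 * 100 / freq_sum_)
  return trie.Find("云计算") ? 0 : 1;       // the word is now findable in the trie
}
```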

include/cppjieba/KeywordExtractor.hpp

@@ -1,35 +1,37 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H

-#include <algorithm>
-#include <unordered_map>
-#include <unordered_set>
-#include <cmath>
 #include <set>
 #include "MixSegment.hpp"

 namespace cppjieba {

+using namespace limonp;
+using namespace std;
+
 /*utf8*/
 class KeywordExtractor {
 public:
   struct Word {
-    std::string word;
-    std::vector<size_t> offsets;
+    string word;
+    vector<size_t> offsets;
     double weight;
   }; // struct Word

-  KeywordExtractor(const std::string& dictPath,
-                   const std::string& hmmFilePath,
-                   const std::string& idfPath,
-                   const std::string& stopWordPath,
-                   const std::string& userDict = "")
+  KeywordExtractor(const string& dictPath,
+                   const string& hmmFilePath,
+                   const string& idfPath,
+                   const string& stopWordPath,
+                   const string& userDict = "")
     : segment_(dictPath, hmmFilePath, userDict) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);
   }

   KeywordExtractor(const DictTrie* dictTrie,
                    const HMMModel* model,
-                   const std::string& idfPath,
-                   const std::string& stopWordPath)
+                   const string& idfPath,
+                   const string& stopWordPath)
     : segment_(dictTrie, model) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);
@@ -37,27 +39,27 @@ class KeywordExtractor {
   ~KeywordExtractor() {
   }

-  void Extract(const std::string& sentence, std::vector<std::string>& keywords, size_t topN) const {
-    std::vector<Word> topWords;
+  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+    vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
       keywords.push_back(topWords[i].word);
     }
   }

-  void Extract(const std::string& sentence, std::vector<pair<std::string, double> >& keywords, size_t topN) const {
-    std::vector<Word> topWords;
+  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+    vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
-      keywords.push_back(pair<std::string, double>(topWords[i].word, topWords[i].weight));
+      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
     }
   }

-  void Extract(const std::string& sentence, std::vector<Word>& keywords, size_t topN) const {
-    std::vector<std::string> words;
+  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
+    vector<string> words;
     segment_.Cut(sentence, words);

-    std::map<std::string, Word> wordmap;
+    map<string, Word> wordmap;
     size_t offset = 0;
     for (size_t i = 0; i < words.size(); ++i) {
       size_t t = offset;
@@ -75,8 +77,8 @@ class KeywordExtractor {
     keywords.clear();
     keywords.reserve(wordmap.size());
-    for (std::map<std::string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-      std::unordered_map<std::string, double>::const_iterator cit = idfMap_.find(itr->first);
+    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
     if (cit != idfMap_.end()) {
       itr->second.weight *= cit->second;
     } else {
@@ -86,15 +88,15 @@ class KeywordExtractor {
       keywords.push_back(itr->second);
     }

     topN = min(topN, keywords.size());
-    std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
     keywords.resize(topN);
   }
private:
-  void LoadIdfDict(const std::string& idfPath) {
-    std::ifstream ifs(idfPath.c_str());
+  void LoadIdfDict(const string& idfPath) {
+    ifstream ifs(idfPath.c_str());
     XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
-    std::string line ;
-    std::vector<std::string> buf;
+    string line ;
+    vector<string> buf;
     double idf = 0.0;
     double idfSum = 0.0;
     size_t lineno = 0;
@@ -104,7 +106,7 @@ class KeywordExtractor {
         XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
         continue;
       }
-      limonp::Split(line, buf, " ");
+      Split(line, buf, " ");
       if (buf.size() != 2) {
         XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
         continue;
@@ -119,10 +121,10 @@ class KeywordExtractor {
     idfAverage_ = idfSum / lineno;
     assert(idfAverage_ > 0.0);
   }

-  void LoadStopWordDict(const std::string& filePath) {
-    std::ifstream ifs(filePath.c_str());
+  void LoadStopWordDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed";
-    std::string line ;
+    string line ;
     while (getline(ifs, line)) {
       stopWords_.insert(line);
     }
@@ -134,16 +136,18 @@ class KeywordExtractor {
   }

   MixSegment segment_;
-  std::unordered_map<std::string, double> idfMap_;
+  unordered_map<string, double> idfMap_;
   double idfAverage_;

-  std::unordered_set<std::string> stopWords_;
+  unordered_set<string> stopWords_;
 }; // class KeywordExtractor

-inline std::ostream& operator << (std::ostream& os, const KeywordExtractor::Word& word) {
+inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
 }

 } // namespace cppjieba
#endif
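A usage sketch for the extractor above, mirroring `keyword_extractor_test.cpp` later in this diff; each keyword's weight is its in-sentence frequency multiplied by the word's IDF. The dictionary paths are assumptions:

```cpp
// Usage sketch under assumptions: run from the repository root.
#include <iostream>
#include <vector>
#include "cppjieba/KeywordExtractor.hpp"

int main() {
  cppjieba::KeywordExtractor extractor("dict/jieba.dict.utf8",
                                       "dict/hmm_model.utf8",
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");
  std::vector<cppjieba::KeywordExtractor::Word> keywords;
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的。", keywords, 5);  // top-5 by TF-IDF
  for (size_t i = 0; i < keywords.size(); i++) {
    std::cout << keywords[i] << std::endl;  // printed via the operator<< defined above
  }
  return 0;
}
```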

test/CMakeLists.txt

@@ -1,12 +1,4 @@
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})

-# Configure test paths
-configure_file("${CMAKE_CURRENT_SOURCE_DIR}/test_paths.h.in" "${CMAKE_BINARY_DIR}/test/test_paths.h")
-
-INCLUDE_DIRECTORIES(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_BINARY_DIR}/test
-)

 ADD_EXECUTABLE(load_test load_test.cpp)
-ADD_SUBDIRECTORY(unittest)
+ADD_SUBDIRECTORY(unittest)

test/load_test.cpp

@@ -6,15 +6,14 @@
 #include "cppjieba/MixSegment.hpp"
 #include "cppjieba/KeywordExtractor.hpp"
 #include "limonp/Colors.hpp"
-#include "test_paths.h"

 using namespace cppjieba;

 void Cut(size_t times = 50) {
-  MixSegment seg(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
+  MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
   vector<string> res;
   string doc;
-  ifstream ifs(TEST_DATA_DIR "/weicheng.utf8");
+  ifstream ifs("../test/testdata/weicheng.utf8");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();
@@ -30,13 +29,10 @@ void Cut(size_t times = 50) {
 }

 void Extract(size_t times = 400) {
-  KeywordExtractor Extractor(DICT_DIR "/jieba.dict.utf8",
-                             DICT_DIR "/hmm_model.utf8",
-                             DICT_DIR "/idf.utf8",
-                             DICT_DIR "/stop_words.utf8");
+  KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
   vector<string> words;
   string doc;
-  ifstream ifs(TEST_DATA_DIR "/review.100");
+  ifstream ifs("../test/testdata/review.100");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();
@@ -55,4 +51,4 @@ int main(int argc, char ** argv) {
   Cut();
   Extract();
   return EXIT_SUCCESS;
-}
+}

test/test_paths.h.in

@@ -1,7 +0,0 @@
-#ifndef TEST_PATHS_H
-#define TEST_PATHS_H
-
-#define TEST_DATA_DIR "@CMAKE_CURRENT_SOURCE_DIR@/testdata"
-#define DICT_DIR "@CMAKE_SOURCE_DIR@/dict"
-
-#endif // TEST_PATHS_H
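This deleted template was consumed by the `configure_file()` call removed from `test/CMakeLists.txt` above: CMake substitutes the `@VAR@` placeholders and writes `test_paths.h` into the build tree, so the tests could locate dictionaries through compile-time string-literal concatenation. A sketch of a generated result (the absolute path is a made-up example):

```cpp
// Hypothetical generated test_paths.h after configure_file substitution.
#ifndef TEST_PATHS_H
#define TEST_PATHS_H

#define TEST_DATA_DIR "/home/user/cppjieba/test/testdata"
#define DICT_DIR "/home/user/cppjieba/dict"

#endif // TEST_PATHS_H

// Adjacent string literals concatenate at compile time, e.g.:
//   MixSegment seg(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
// expands to "/home/user/cppjieba/dict/jieba.dict.utf8", and so on.
```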

test/unittest/CMakeLists.txt

@@ -1,8 +1,6 @@
 message(STATUS "MSVC value: ${MSVC}")
 if (MSVC)
-  set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
   set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-  add_compile_options(/utf-8)
 endif()
include(FetchContent)
@@ -10,7 +8,7 @@ include(FetchContent)
 FetchContent_Declare(
   googletest
   GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG release-1.12.1
+  GIT_TAG release-1.11.0
 )
FetchContent_MakeAvailable(googletest)
@@ -20,12 +18,6 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)

-# Add include directories
-INCLUDE_DIRECTORIES(
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_BINARY_DIR}/test
-)
-
 ADD_EXECUTABLE(test.run
   gtest_main.cpp
   keyword_extractor_test.cpp

test/unittest/jieba_test.cpp

@@ -1,6 +1,5 @@
 #include "cppjieba/Jieba.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"
using namespace cppjieba;
@@ -38,11 +37,11 @@ TEST(JiebaTest, Test0) {
 }

 TEST(JiebaTest, Test1) {
-  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
-                        DICT_DIR "/hmm_model.utf8",
-                        DICT_DIR "/user.dict.utf8",
-                        DICT_DIR "/idf.utf8",
-                        DICT_DIR "/stop_words.utf8");
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
vector<string> words;
string result;
@@ -72,14 +71,14 @@ TEST(JiebaTest, Test1) {
   jieba.CutForSearch("他来到了网易杭研大厦", words);
   result << words;
   ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-}
+}

 TEST(JiebaTest, WordTest) {
-  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
-                        DICT_DIR "/hmm_model.utf8",
-                        DICT_DIR "/user.dict.utf8",
-                        DICT_DIR "/idf.utf8",
-                        DICT_DIR "/stop_words.utf8");
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
vector<Word> words;
string result;
@@ -117,11 +116,11 @@ TEST(JiebaTest, WordTest) {
 }

 TEST(JiebaTest, InsertUserWord) {
-  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
-                        DICT_DIR "/hmm_model.utf8",
-                        DICT_DIR "/user.dict.utf8",
-                        DICT_DIR "/idf.utf8",
-                        DICT_DIR "/stop_words.utf8");
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+                        "../dict/hmm_model.utf8",
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
vector<string> words;
string result;

test/unittest/keyword_extractor_test.cpp

@@ -1,14 +1,10 @@
 #include "cppjieba/KeywordExtractor.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"

 using namespace cppjieba;

 TEST(KeywordExtractorTest, Test1) {
-  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-                             DICT_DIR "/hmm_model.utf8",
-                             DICT_DIR "/idf.utf8",
-                             DICT_DIR "/stop_words.utf8");
+  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
{
string s("你好世界世界而且而且");
@@ -59,11 +55,7 @@ TEST(KeywordExtractorTest, Test1) {
 }

 TEST(KeywordExtractorTest, Test2) {
-  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-                             DICT_DIR "/hmm_model.utf8",
-                             DICT_DIR "/idf.utf8",
-                             DICT_DIR "/stop_words.utf8",
-                             TEST_DATA_DIR "/userdict.utf8");
+  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
{
string s("蓝翔优秀毕业生");

test/unittest/pos_tagger_test.cpp

@@ -1,6 +1,5 @@
 #include "cppjieba/MixSegment.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"
using namespace cppjieba;
@@ -14,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
 //static const char * const ANS_TEST3 = "";

 TEST(PosTaggerTest, Test) {
-  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
+  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
{
vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST1, res);
@@ -24,7 +23,7 @@ TEST(PosTaggerTest, Test) {
   }
 }

 TEST(PosTagger, TestUserDict) {
-  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", TEST_DATA_DIR "/userdict.utf8");
+  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{
vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST2, res);

test/unittest/segments_test.cpp

@@ -5,12 +5,11 @@
 #include "cppjieba/FullSegment.hpp"
 #include "cppjieba/QuerySegment.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"

 using namespace cppjieba;

 TEST(MixSegmentTest, Test1) {
-  MixSegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
+  MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
string sentence;
vector<string> words;
string actual;
@@ -50,18 +49,16 @@ TEST(MixSegmentTest, Test1) {
 }

 TEST(MixSegmentTest, NoUserDict) {
-  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8");
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
   const char* str = "令狐冲是云计算方面的专家";
   vector<string> words;
   segment.Cut(str, words);
   string res;
   ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
-}
+}

 TEST(MixSegmentTest, UserDict) {
-  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-                     DICT_DIR "/hmm_model.utf8",
-                     DICT_DIR "/user.dict.utf8");
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
@@ -86,10 +83,9 @@ TEST(MixSegmentTest, UserDict) {
     ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
   }
 }

 TEST(MixSegmentTest, TestUserDict) {
-  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
-                     TEST_DATA_DIR "/userdict.utf8");
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
+                     "../test/testdata/userdict.utf8");
vector<string> words;
string res;
@@ -127,8 +123,8 @@ TEST(MixSegmentTest, TestUserDict) {
 }

 TEST(MixSegmentTest, TestMultiUserDict) {
-  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
-                     TEST_DATA_DIR "/userdict.utf8;" TEST_DATA_DIR "/userdict.2.utf8");
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
+                     "../test/testdata/userdict.utf8;../test/testdata/userdict.2.utf8");
vector<string> words;
string res;
@@ -138,7 +134,7 @@ TEST(MixSegmentTest, TestMultiUserDict) {
 }

 TEST(MPSegmentTest, Test1) {
-  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
+  MPSegment segment("../dict/jieba.dict.utf8");;
string s;
vector<string> words;
segment.Cut("我来自北京邮电大学。", words);
@@ -167,7 +163,7 @@ TEST(MPSegmentTest, Test1) {
 }

 TEST(HMMSegmentTest, Test1) {
-  HMMSegment segment(DICT_DIR "/hmm_model.utf8");
+  HMMSegment segment("../dict/hmm_model.utf8");;
   {
     const char* str = "我来自北京邮电大学。。。学号123456";
     const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
@@ -186,7 +182,7 @@ TEST(HMMSegmentTest, Test1) {
 }

 TEST(FullSegment, Test1) {
-  FullSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8");
+  FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
vector<string> words;
string s;
@@ -201,7 +197,7 @@ TEST(FullSegment, Test1) {
 }

 TEST(QuerySegment, Test1) {
-  QuerySegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", "");
+  QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
vector<string> words;
string s1, s2;
@@ -222,9 +218,7 @@ TEST(QuerySegment, Test1) {
 }

 TEST(QuerySegment, Test2) {
-  QuerySegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-                       DICT_DIR "/hmm_model.utf8",
-                       TEST_DATA_DIR "/userdict.utf8|" TEST_DATA_DIR "/userdict.english");
+  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
vector<string> words;
string s1, s2;
@@ -248,13 +242,14 @@ TEST(QuerySegment, Test2) {
     s2 = "中国/科学/学院/科学院/中国科学院";
     ASSERT_EQ(s1, s2);
   }
-}
+}

 TEST(MPSegmentTest, Unicode32) {
   string s("天气很好,🙋 我们去郊游。");
   vector<string> words;
-  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
+  MPSegment segment("../dict/jieba.dict.utf8");;
   segment.Cut(s, words);
   ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");

test/unittest/textrank_test.cpp

@@ -1,14 +1,13 @@
 #include "cppjieba/TextRankExtractor.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"

 using namespace cppjieba;

 TEST(TextRankExtractorTest, Test1) {
   TextRankExtractor Extractor(
-    TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-    DICT_DIR "/hmm_model.utf8",
-    DICT_DIR "/stop_words.utf8");
+    "../test/testdata/extra_dict/jieba.dict.small.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/stop_words.utf8");
{
string s("你好世界世界而且而且");
string res;
@@ -60,10 +59,10 @@ TEST(TextRankExtractorTest, Test1) {
 TEST(TextRankExtractorTest, Test2) {
   TextRankExtractor Extractor(
-    TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
-    DICT_DIR "/hmm_model.utf8",
-    DICT_DIR "/stop_words.utf8",
-    TEST_DATA_DIR "/userdict.utf8");
+    "../test/testdata/extra_dict/jieba.dict.small.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/stop_words.utf8",
+    "../test/testdata/userdict.utf8");
{
string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");

test/unittest/trie_test.cpp

@@ -1,11 +1,10 @@
 #include "cppjieba/DictTrie.hpp"
 #include "cppjieba/MPSegment.hpp"
 #include "gtest/gtest.h"
-#include "test_paths.h"

 using namespace cppjieba;

-static const char* const DICT_FILE = TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8";
+static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
TEST(TrieTest, Empty) {
vector<Unicode> keys;
@@ -34,6 +33,12 @@ TEST(DictTrieTest, Test1) {
   string word("来到");
   cppjieba::RuneStrArray uni;
   ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
+  //DictUnit nodeInfo;
+  //nodeInfo.word = uni;
+  //nodeInfo.tag = "v";
+  //nodeInfo.weight = -8.87033;
+  //s1 << nodeInfo;
+  //s2 << (*trie.Find(uni.begin(), uni.end()));
const DictUnit* du = trie.Find(uni.begin(), uni.end());
ASSERT_TRUE(du != NULL);
ASSERT_EQ(2u, du->word.size());
@@ -42,12 +47,14 @@ TEST(DictTrieTest, Test1) {
   ASSERT_EQ("v", du->tag);
   ASSERT_NEAR(-8.870, du->weight, 0.001);
+  //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);

   word = "清华大学";
   LocalVector<pair<size_t, const DictUnit*> > res;
   const char * words[] = {"清", "清华", "清华大学"};
   for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
     ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
     res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
+    //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
   }
vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags;
@@ -58,10 +65,11 @@ TEST(DictTrieTest, Test1) {
   s1 << res;
   s2 << dags[0].nexts;
   ASSERT_EQ(s1, s2);
 }

 TEST(DictTrieTest, UserDict) {
-  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
@@ -85,7 +93,7 @@ TEST(DictTrieTest, UserDict) {
 }

 TEST(DictTrieTest, UserDictWithMaxWeight) {
-  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8", DictTrie::WordWeightMax);
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
@@ -95,7 +103,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) {
 }

 TEST(DictTrieTest, Dag) {
-  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
{
string word = "清华大学";