Mirror of https://github.com/yanyiwu/cppjieba.git, synced 2025-07-18 00:00:12 +08:00
Compare commits

34 commits:

294755fab1, 714a297823, c14131e3e2, 9cd64a1694, aa410a69bb, b5dc8e7a35,
8141d8f434, 9d8af2116e, 2185315643, 340de007f9, 940ea02eb4, 3732abc0e5,
9cda7f33e8, 338603b676, d93dda397c, 7730deee52, 588860b5b6, 0523949aa8,
b11fd29697, 15b8086a2a, 1d74caf705, 0c7c5228d0, 016fc17575, 39fc58f081,
42a93a4b98, 5ee74d788e, 9b45e084a3, aa1def5ddb, 732812cdfb, 6e167a30dd,
5ef74f335a, 6339262755, cc58d4f858, dbebc7cacb
.github/workflows/cmake-arm64.yml (vendored, new file, +40)
@@ -0,0 +1,40 @@
+name: CMake Windows ARM64
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+env:
+  BUILD_TYPE: Release
+
+jobs:
+  build-windows-arm64:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        cpp_version: [11, 14, 17, 20]
+
+    steps:
+    - name: Check out repository code
+      uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Configure CMake
+      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
+      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
+      # run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+      run: cmake -B ${{github.workspace}}/build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=${{matrix.cpp_version}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+
+    - name: Build
+      # Build your program with the given configuration
+      # run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+
+    - name: Test
+      working-directory: ${{github.workspace}}/build
+      # Execute tests defined by the CMake configuration.
+      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+      run: ctest -C ${{env.BUILD_TYPE}} --verbose
.github/workflows/cmake.yml (vendored)
@@ -17,13 +17,14 @@ jobs:
     strategy:
       matrix:
         os: [
-          ubuntu-20.04,
           ubuntu-22.04,
-          macos-12,
+          ubuntu-latest,
+          macos-13,
+          macos-14,
           macos-latest,
           windows-2019,
           windows-2022,
           windows-latest,
         ]
         cpp_version: [11, 14, 17, 20]

@@ -48,5 +49,5 @@ jobs:
       working-directory: ${{github.workspace}}/build
       # Execute tests defined by the CMake configuration.
       # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest -C ${{env.BUILD_TYPE}}
+      run: ctest -C ${{env.BUILD_TYPE}} --verbose
.github/workflows/stale-issues.yml (vendored)
@@ -2,7 +2,8 @@ name: Close Stale Issues

 on:
   schedule:
-    - cron: '0 0 * * 0' # Run weekly on Sunday at midnight
+    - cron: '0 0 3 */3 *' # Every three months on the 3rd day at midnight

 jobs:
   stale:
.gitignore (vendored)
@@ -15,3 +15,5 @@ tmp
 t.*
 *.pid
 build
+Testing/Temporary/CTestCostData.txt
+Testing/Temporary/LastTest.log
CHANGELOG.md
@@ -1,5 +1,20 @@
 # CHANGELOG

+## v5.5.0
+
++ feat: add Windows ARM64 build support
++ build: upgrade googletest from 1.11.0 to 1.12.1
++ build: update CMake minimum version requirement to 3.10
++ fix: make namespaces explicit and fix missing includes
++ ci: update stale-issues workflow configuration
+
 ## v5.4.0

 + unittest: class Jieba add default argument input
 + class Jieba: support default dictpath
 + cmake: avoid testing when FetchContent by other project
 + class DictTrie: removed unused var

 ## v5.3.2

 + removed test/demo.cpp and linked https://github.com/yanyiwu/cppjieba-demo
CMakeLists.txt
@@ -1,25 +1,31 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 3.5)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
 PROJECT(CPPJIEBA)

 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/limonp/include
   ${PROJECT_SOURCE_DIR}/include)

 if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set (CMAKE_INSTALL_PREFIX "/usr/local/cppjieba" CACHE PATH "default install path" FORCE )
 endif()

 if(NOT DEFINED CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 11)
 endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)

-ADD_DEFINITIONS(-O3 -Wall -g)
+ADD_DEFINITIONS(-O3 -g)

-ADD_SUBDIRECTORY(test)
+# Define a variable to check if this is the top-level project
+if(NOT DEFINED CPPJIEBA_TOP_LEVEL_PROJECT)
+  if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+    set(CPPJIEBA_TOP_LEVEL_PROJECT ON)
+  else()
+    set(CPPJIEBA_TOP_LEVEL_PROJECT OFF)
+  endif()
+endif()

-ENABLE_TESTING()
-if(NOT MSVC)
+if(CPPJIEBA_TOP_LEVEL_PROJECT)
+  ENABLE_TESTING()
+
+  message(STATUS "MSVC value: ${MSVC}")
+  ADD_SUBDIRECTORY(test)
   ADD_TEST(NAME ./test/test.run COMMAND ./test/test.run)
   ADD_TEST(NAME ./load_test COMMAND ./load_test)
 endif()
README.md
@@ -10,22 +10,25 @@

 CppJieba is the C++ version of the "Jieba" Chinese word segmentation library.

-## Features
+### Key Features

-+ All of the source code lives in header files (`include/cppjieba/*.hpp`); just `include` and use.
-+ Supports `utf8` encoding.
-+ Ships with a fairly complete unit test suite; the stability of the core UTF-8 Chinese segmentation has been verified in production environments.
-+ Supports loading custom user dictionaries; multiple dictionary paths may be separated by '|' or ';'.
-+ Supports `Linux`, `Mac OSX`, and `Windows`.
+- 🚀 High performance: stability and performance validated in production environments
+- 📦 Easy to integrate: the source is shipped as header files (`include/cppjieba/*.hpp`); include them and go
+- 🔍 Multiple segmentation modes: precise mode, full mode, search-engine mode, and more
+- 📚 Custom dictionaries: user dictionaries are supported, with multiple dictionary paths separated by '|' or ';'
+- 💻 Cross-platform: supports Linux, macOS, and Windows
+- 🌈 UTF-8: native support for UTF-8-encoded Chinese text

-## Usage
+## Quick Start

-### Dependencies
+### Requirements

-* `g++ (version >= 4.1 is recommended) or clang++`;
-* `cmake (version >= 2.6 is recommended)`;
+- A C++ compiler:
+  - g++ (4.1 or later recommended)
+  - or clang++
+- cmake (2.6 or later recommended)

-### Download and Build
+### Installation

 ```sh
 git clone https://github.com/yanyiwu/cppjieba.git

@@ -36,15 +39,11 @@ mkdir build
 cd build
 cmake ..
 make
 ```

-If you are interested, you can also run the tests (optional):
-
-```
-make test
-```

-## Demo
+## Usage Example

 ```
 ./demo

@@ -210,71 +209,37 @@ For more details, please see [demo](https://github.com/yanyiwu/cppjieba-demo).

 + [dict.367W.utf8] iLife(562193561 at qq.com)

-## Applications
+## Ecosystem

-+ [GoJieba] Jieba Chinese word segmentation in Go.
-+ [NodeJieba] Jieba Chinese word segmentation for Node.js.
-+ [simhash] Similarity computation for Chinese documents.
-+ [exjieba] Jieba Chinese word segmentation in Erlang.
-+ [jiebaR] Jieba Chinese word segmentation in R.
-+ [cjieba] Jieba word segmentation in C.
-+ [jieba_rb] Jieba word segmentation for Ruby.
-+ [iosjieba] Jieba word segmentation for iOS.
-+ [SqlJieba] Jieba plugin for MySQL full-text indexing.
-+ [pg_jieba] Segmentation plugin for PostgreSQL.
-+ [simple] Segmentation plugin for SQLite3 FTS5.
-+ [gitbook-plugin-search-pro] Gitbook plugin with Chinese search support.
-+ [ngx_http_cppjieba_module] Nginx segmentation plugin.
-+ [cppjiebapy] A project by [jannson] exposing CppJieba to Python; see the discussion at [cppjiebapy_discussion].
-+ [cppjieba-py] A Python module by [bung87] based on pybind11, close to the original jieba in usage.
-+ [KeywordServer] A Chinese keyword-extraction service in 50 lines.
-+ [cppjieba-server] CppJieba HTTP server.
-+ [phpjieba] Jieba segmentation extension for PHP.
-+ [perl5-jieba] Jieba segmentation extension for Perl.
-+ [jieba-dlang] Deimos bindings of Jieba for D.
+CppJieba has been widely used as the segmentation core of implementations across many languages:

-## Benchmark
+- [GoJieba](https://github.com/yanyiwu/gojieba) - Go version
+- [NodeJieba](https://github.com/yanyiwu/nodejieba) - Node.js version
+- [CJieba](https://github.com/yanyiwu/cjieba) - C version
+- [jiebaR](https://github.com/qinwf/jiebaR) - R version
+- [exjieba](https://github.com/falood/exjieba) - Erlang version
+- [jieba_rb](https://github.com/altkatz/jieba_rb) - Ruby version
+- [iosjieba](https://github.com/yanyiwu/iosjieba) - iOS version
+- [phpjieba](https://github.com/jonnywang/phpjieba) - PHP version
+- [perl5-jieba](https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod) - Perl version

-[Jieba中文分词系列性能评测]
+### Projects Using CppJieba

-## Sponsorship
+- [simhash](https://github.com/yanyiwu/simhash) - similarity computation for Chinese documents
+- [pg_jieba](https://github.com/jaiminpan/pg_jieba) - PostgreSQL segmentation plugin
+- [gitbook-plugin-search-pro](https://plugins.gitbook.com/plugin/search-pro) - Gitbook plugin with Chinese search support
+- [ngx_http_cppjieba_module](https://github.com/yanyiwu/ngx_http_cppjieba_module) - Nginx segmentation plugin

-[](https://tracking.gitads.io/?campaign=gitads&repo=cppjieba&redirect=gitads.io)
+## Contributing

-## Contributors
+Contributions of all kinds are welcome, including but not limited to:

-### Code Contributors
-
-This project exists thanks to all the people who contribute.
-<a href="https://github.com/yanyiwu/cppjieba/graphs/contributors"><img src="https://opencollective.com/cppjieba/contributors.svg?width=890&button=false" /></a>
+- Reporting issues and suggestions
+- Improving the documentation
+- Submitting bug fixes
+- Adding new features

-[GoJieba]:https://github.com/yanyiwu/gojieba
-[CppJieba]:https://github.com/yanyiwu/cppjieba
-[jannson]:https://github.com/jannson
-[cppjiebapy]:https://github.com/jannson/cppjiebapy
-[bung87]:https://github.com/bung87
-[cppjieba-py]:https://github.com/bung87/cppjieba-py
-[cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
-[NodeJieba]:https://github.com/yanyiwu/nodejieba
-[jiebaR]:https://github.com/qinwf/jiebaR
-[simhash]:https://github.com/yanyiwu/simhash
-[代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
-[issue25]:https://github.com/yanyiwu/cppjieba/issues/25
-[exjieba]:https://github.com/falood/exjieba
-[KeywordServer]:https://github.com/yanyiwu/keyword_server
-[ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
-[dict.367W.utf8]:https://github.com/qinwf/BigDict
-[cjieba]:http://github.com/yanyiwu/cjieba
-[jieba_rb]:https://github.com/altkatz/jieba_rb
-[iosjieba]:https://github.com/yanyiwu/iosjieba
-[SqlJieba]:https://github.com/yanyiwu/sqljieba
-[Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
-[pg_jieba]:https://github.com/jaiminpan/pg_jieba
-[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
-[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
-[phpjieba]:https://github.com/jonnywang/phpjieba
-[perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
-[jieba-dlang]:https://github.com/shove70/jieba
-[simple]:https://github.com/wangfenjin/simple
+
+If you find CppJieba helpful, please star ⭐️ the project!
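The README's demo section now points at the separate cppjieba-demo repository. For orientation, here is a minimal sketch of the header-only usage it describes; the dictionary paths are assumptions matching a source checkout's `dict/` directory, not part of this diff:

```cpp
#include <iostream>
#include "cppjieba/Jieba.hpp"

int main() {
  // Header-only: nothing to link against besides the standard library.
  // Paths assume the repository's dict/ directory; adjust as needed.
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8",
                        "dict/idf.utf8",
                        "dict/stop_words.utf8");
  std::vector<std::string> words;
  jieba.Cut("我来自北京邮电大学。", words);  // precise (mix) mode
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;  // 我/来自/北京邮电大学/。/
  return 0;
}
```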
deps/limonp (vendored submodule)
@@ -1 +1 @@
-Subproject commit ac32f1f287f65d5ce0ce295010c88026fae060ee
+Subproject commit 5c82a3f17e4e0adc6a5decfe245054b0ed533d1a
dict/jieba.dict.utf8
@@ -312698,7 +312698,6 @@ T恤 4 n
 部属 1126 n
 部属工作 3 n
 部属院校 3 n
 部手机 33 n
 部族 643 n
 部标 4 n
 部省级 2 n
include/cppjieba/DictTrie.hpp
@@ -1,15 +1,15 @@
 #ifndef CPPJIEBA_DICT_TRIE_HPP
 #define CPPJIEBA_DICT_TRIE_HPP

 #include <iostream>
+#include <algorithm>
 #include <fstream>
 #include <map>
 #include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <limits>
 #include <deque>
 #include <set>
-#include <string>
 #include <unordered_set>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"

@@ -17,8 +17,6 @@

 namespace cppjieba {

-using namespace limonp;
-
 const double MIN_DOUBLE = -3.14e+100;
 const double MAX_DOUBLE = 3.14e+100;
 const size_t DICT_COLUMN_NUM = 3;

@@ -32,7 +30,7 @@ class DictTrie {
     WordWeightMax,
   }; // enum UserWordWeightOption

-  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+  DictTrie(const std::string& dict_path, const std::string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
     Init(dict_path, user_dict_paths, user_word_weight_opt);
   }

@@ -40,7 +38,7 @@ class DictTrie {
     delete trie_;
   }

-  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;

@@ -50,7 +48,7 @@ class DictTrie {
     return true;
   }

-  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+  bool InsertUserWord(const std::string& word,int freq, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
     if (!MakeNodeInfo(node_info, word, weight , tag)) {

@@ -61,7 +59,7 @@ class DictTrie {
     return true;
   }

-  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+  bool DeleteUserWord(const std::string& word, const std::string& tag = UNKNOWN_TAG) {
     DictUnit node_info;
     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
       return false;

@@ -76,16 +74,16 @@ class DictTrie {

   void Find(RuneStrArray::const_iterator begin,
       RuneStrArray::const_iterator end,
-      vector<struct Dag>&res,
+      std::vector<struct Dag>&res,
       size_t max_word_len = MAX_WORD_LENGTH) const {
     trie_->Find(begin, end, res, max_word_len);
   }

-  bool Find(const string& word)
+  bool Find(const std::string& word)
   {
     const DictUnit *tmp = NULL;
     RuneStrArray runes;
-    if (!DecodeRunesInString(word, runes))
+    if (!DecodeUTF8RunesInString(word, runes))
     {
       XLOG(ERROR) << "Decode failed.";
     }

@@ -108,10 +106,10 @@ class DictTrie {
     return min_weight_;
   }

-  void InserUserDictNode(const string& line) {
-    vector<string> buf;
+  void InserUserDictNode(const std::string& line) {
+    std::vector<std::string> buf;
     DictUnit node_info;
-    Split(line, buf, " ");
+    limonp::Split(line, buf, " ");
     if(buf.size() == 1){
       MakeNodeInfo(node_info,
           buf[0],

@@ -134,28 +132,27 @@ class DictTrie {
     }
   }

-  void LoadUserDict(const vector<string>& buf) {
+  void LoadUserDict(const std::vector<std::string>& buf) {
     for (size_t i = 0; i < buf.size(); i++) {
       InserUserDictNode(buf[i]);
     }
   }

-  void LoadUserDict(const set<string>& buf) {
-    std::set<string>::const_iterator iter;
+  void LoadUserDict(const std::set<std::string>& buf) {
+    std::set<std::string>::const_iterator iter;
     for (iter = buf.begin(); iter != buf.end(); iter++){
       InserUserDictNode(*iter);
     }
   }

-  void LoadUserDict(const string& filePaths) {
-    vector<string> files = limonp::Split(filePaths, "|;");
-    size_t lineno = 0;
+  void LoadUserDict(const std::string& filePaths) {
+    std::vector<std::string> files = limonp::Split(filePaths, "|;");
     for (size_t i = 0; i < files.size(); i++) {
-      ifstream ifs(files[i].c_str());
+      std::ifstream ifs(files[i].c_str());
       XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
-      string line;
+      std::string line;

-      for (; getline(ifs, line); lineno++) {
+      while(getline(ifs, line)) {
         if (line.size() == 0) {
           continue;
         }

@@ -166,7 +163,7 @@ class DictTrie {

 private:
-  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
+  void Init(const std::string& dict_path, const std::string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
     LoadDict(dict_path);
     freq_sum_ = CalcFreqSum(static_node_infos_);
     CalculateWeight(static_node_infos_, freq_sum_);

@@ -179,10 +176,10 @@ class DictTrie {
     CreateTrie(static_node_infos_);
   }

-  void CreateTrie(const vector<DictUnit>& dictUnits) {
+  void CreateTrie(const std::vector<DictUnit>& dictUnits) {
     assert(dictUnits.size());
-    vector<Unicode> words;
-    vector<const DictUnit*> valuePointers;
+    std::vector<Unicode> words;
+    std::vector<const DictUnit*> valuePointers;
     for (size_t i = 0 ; i < dictUnits.size(); i ++) {
       words.push_back(dictUnits[i].word);
       valuePointers.push_back(&dictUnits[i]);

@@ -191,15 +188,12 @@ class DictTrie {
     trie_ = new Trie(words, valuePointers);
   }

-
-
-
   bool MakeNodeInfo(DictUnit& node_info,
-      const string& word,
+      const std::string& word,
       double weight,
-      const string& tag) {
-    if (!DecodeRunesInString(word, node_info.word)) {
-      XLOG(ERROR) << "Decode " << word << " failed.";
+      const std::string& tag) {
+    if (!DecodeUTF8RunesInString(word, node_info.word)) {
+      XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word;
       return false;
     }
     node_info.weight = weight;

@@ -207,15 +201,15 @@ class DictTrie {
     return true;
   }

-  void LoadDict(const string& filePath) {
-    ifstream ifs(filePath.c_str());
+  void LoadDict(const std::string& filePath) {
+    std::ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
-    string line;
-    vector<string> buf;
+    std::string line;
+    std::vector<std::string> buf;

     DictUnit node_info;
-    for (size_t lineno = 0; getline(ifs, line); lineno++) {
-      Split(line, buf, " ");
+    while (getline(ifs, line)) {
+      limonp::Split(line, buf, " ");
       XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
       MakeNodeInfo(node_info,
           buf[0],

@@ -231,8 +225,8 @@ class DictTrie {

   void SetStaticWordWeights(UserWordWeightOption option) {
     XCHECK(!static_node_infos_.empty());
-    vector<DictUnit> x = static_node_infos_;
-    sort(x.begin(), x.end(), WeightCompare);
+    std::vector<DictUnit> x = static_node_infos_;
+    std::sort(x.begin(), x.end(), WeightCompare);
     min_weight_ = x[0].weight;
     max_weight_ = x[x.size() - 1].weight;
     median_weight_ = x[x.size() / 2].weight;

@@ -249,7 +243,7 @@ class DictTrie {
     }
   }

-  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
+  double CalcFreqSum(const std::vector<DictUnit>& node_infos) const {
     double sum = 0.0;
     for (size_t i = 0; i < node_infos.size(); i++) {
       sum += node_infos[i].weight;

@@ -257,7 +251,7 @@ class DictTrie {
     return sum;
   }

-  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
+  void CalculateWeight(std::vector<DictUnit>& node_infos, double sum) const {
     assert(sum > 0.0);
     for (size_t i = 0; i < node_infos.size(); i++) {
       DictUnit& node_info = node_infos[i];

@@ -266,12 +260,12 @@ class DictTrie {
     }
   }

-  void Shrink(vector<DictUnit>& units) const {
-    vector<DictUnit>(units.begin(), units.end()).swap(units);
+  void Shrink(std::vector<DictUnit>& units) const {
+    std::vector<DictUnit>(units.begin(), units.end()).swap(units);
   }

-  vector<DictUnit> static_node_infos_;
-  deque<DictUnit> active_node_infos_; // must not be vector
+  std::vector<DictUnit> static_node_infos_;
+  std::deque<DictUnit> active_node_infos_; // must not be std::vector
   Trie * trie_;

   double freq_sum_;

@@ -279,7 +273,7 @@ class DictTrie {
   double max_weight_;
   double median_weight_;
   double user_word_default_weight_;
-  unordered_set<Rune> user_dict_single_chinese_word_;
+  std::unordered_set<Rune> user_dict_single_chinese_word_;
 };
 }
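For orientation, a minimal sketch of driving the refactored `DictTrie` API directly; the dictionary paths are assumptions matching the repository's `dict/` layout, not part of the diff:

```cpp
#include <iostream>
#include "cppjieba/DictTrie.hpp"

int main() {
  // Load the main dictionary plus one user dictionary (paths assumed).
  cppjieba::DictTrie trie("dict/jieba.dict.utf8", "dict/user.dict.utf8");

  // Insert a user word at the default weight, tagged "nz", then look it up.
  trie.InsertUserWord("区块链", "nz");
  std::cout << std::boolalpha << trie.Find("区块链") << std::endl;  // true
  return 0;
}
```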
include/cppjieba/HMMModel.hpp
@@ -105,7 +105,7 @@ struct HMMModel {
       XLOG(ERROR) << "emitProb illegal.";
       return false;
     }
-    if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
+    if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) {
       XLOG(ERROR) << "TransCode failed.";
       return false;
     }
include/cppjieba/Jieba.hpp
@@ -8,19 +8,21 @@ namespace cppjieba {

 class Jieba {
  public:
-  Jieba(const string& dict_path,
-        const string& model_path,
-        const string& user_dict_path,
-        const string& idfPath,
-        const string& stopWordPath)
-    : dict_trie_(dict_path, user_dict_path),
-      model_(model_path),
+  Jieba(const string& dict_path = "",
+        const string& model_path = "",
+        const string& user_dict_path = "",
+        const string& idf_path = "",
+        const string& stop_word_path = "")
+    : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")),
+      model_(getPath(model_path, "hmm_model.utf8")),
       mp_seg_(&dict_trie_),
       hmm_seg_(&model_),
       mix_seg_(&dict_trie_, &model_),
       full_seg_(&dict_trie_),
       query_seg_(&dict_trie_, &model_),
-      extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
+      extractor(&dict_trie_, &model_,
+                getPath(idf_path, "idf.utf8"),
+                getPath(stop_word_path, "stop_words.utf8")) {
   }
   ~Jieba() {
   }

@@ -115,6 +117,39 @@ class Jieba {
   }

  private:
+  static string pathJoin(const string& dir, const string& filename) {
+    if (dir.empty()) {
+      return filename;
+    }
+
+    char last_char = dir[dir.length() - 1];
+    if (last_char == '/' || last_char == '\\') {
+      return dir + filename;
+    } else {
+#ifdef _WIN32
+      return dir + '\\' + filename;
+#else
+      return dir + '/' + filename;
+#endif
+    }
+  }
+
+  static string getCurrentDirectory() {
+    string path(__FILE__);
+    size_t pos = path.find_last_of("/\\");
+    return (pos == string::npos) ? "" : path.substr(0, pos);
+  }
+
+  static string getPath(const string& path, const string& default_file) {
+    if (path.empty()) {
+      string current_dir = getCurrentDirectory();
+      string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\"));
+      string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\"));
+      return pathJoin(pathJoin(grandparent_dir, "dict"), default_file);
+    }
+    return path;
+  }
+
   DictTrie dict_trie_;
   HMMModel model_;
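The effect of the new defaulted arguments: a `Jieba` can now be constructed with no paths at all, and `getPath` falls back to the `dict/` directory two levels above the header, resolved from `__FILE__`. A minimal sketch of the new default usage, assuming the checkout layout (and its dictionaries) is intact:

```cpp
#include <iostream>
#include "cppjieba/Jieba.hpp"

int main() {
  cppjieba::Jieba jieba;  // all five dictionary paths fall back to <repo>/dict/

  std::vector<std::string> words;
  jieba.Cut("他来到了网易杭研大厦", words);  // mix-mode segmentation
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;  // 他/来到/了/网易/杭研/大厦/
  return 0;
}
```

This is what the new `JiebaTest.Test0` in this changeset exercises: the default-constructed instance must produce the same segmentation as one built from explicit paths.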
include/cppjieba/KeywordExtractor.hpp
@@ -1,37 +1,35 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H

 #include <cmath>
 #include <set>
 #include <algorithm>
 #include <unordered_map>
 #include <unordered_set>
 #include "MixSegment.hpp"

 namespace cppjieba {

-using namespace limonp;
-using namespace std;
-
 /*utf8*/
 class KeywordExtractor {
  public:
   struct Word {
-    string word;
-    vector<size_t> offsets;
+    std::string word;
+    std::vector<size_t> offsets;
     double weight;
   }; // struct Word

-  KeywordExtractor(const string& dictPath,
-        const string& hmmFilePath,
-        const string& idfPath,
-        const string& stopWordPath,
-        const string& userDict = "")
+  KeywordExtractor(const std::string& dictPath,
+        const std::string& hmmFilePath,
+        const std::string& idfPath,
+        const std::string& stopWordPath,
+        const std::string& userDict = "")
     : segment_(dictPath, hmmFilePath, userDict) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);
   }
   KeywordExtractor(const DictTrie* dictTrie,
         const HMMModel* model,
-        const string& idfPath,
-        const string& stopWordPath)
+        const std::string& idfPath,
+        const std::string& stopWordPath)
     : segment_(dictTrie, model) {
     LoadIdfDict(idfPath);
     LoadStopWordDict(stopWordPath);

@@ -39,27 +37,27 @@ class KeywordExtractor {
   ~KeywordExtractor() {
   }

-  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-    vector<Word> topWords;
+  void Extract(const std::string& sentence, std::vector<std::string>& keywords, size_t topN) const {
+    std::vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
       keywords.push_back(topWords[i].word);
     }
   }

-  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
-    vector<Word> topWords;
+  void Extract(const std::string& sentence, std::vector<pair<std::string, double> >& keywords, size_t topN) const {
+    std::vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
-      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+      keywords.push_back(pair<std::string, double>(topWords[i].word, topWords[i].weight));
     }
   }

-  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
-    vector<string> words;
+  void Extract(const std::string& sentence, std::vector<Word>& keywords, size_t topN) const {
+    std::vector<std::string> words;
     segment_.Cut(sentence, words);

-    map<string, Word> wordmap;
+    std::map<std::string, Word> wordmap;
     size_t offset = 0;
     for (size_t i = 0; i < words.size(); ++i) {
       size_t t = offset;

@@ -77,8 +75,8 @@ class KeywordExtractor {

     keywords.clear();
     keywords.reserve(wordmap.size());
-    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+    for (std::map<std::string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      std::unordered_map<std::string, double>::const_iterator cit = idfMap_.find(itr->first);
       if (cit != idfMap_.end()) {
         itr->second.weight *= cit->second;
       } else {

@@ -88,15 +86,15 @@ class KeywordExtractor {
       keywords.push_back(itr->second);
     }
     topN = min(topN, keywords.size());
-    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+    std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
     keywords.resize(topN);
   }
  private:
-  void LoadIdfDict(const string& idfPath) {
-    ifstream ifs(idfPath.c_str());
+  void LoadIdfDict(const std::string& idfPath) {
+    std::ifstream ifs(idfPath.c_str());
     XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
-    string line ;
-    vector<string> buf;
+    std::string line ;
+    std::vector<std::string> buf;
     double idf = 0.0;
     double idfSum = 0.0;
     size_t lineno = 0;

@@ -106,7 +104,7 @@ class KeywordExtractor {
       XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
       continue;
     }
-    Split(line, buf, " ");
+    limonp::Split(line, buf, " ");
     if (buf.size() != 2) {
       XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
       continue;

@@ -121,10 +119,10 @@ class KeywordExtractor {
     idfAverage_ = idfSum / lineno;
     assert(idfAverage_ > 0.0);
   }
-  void LoadStopWordDict(const string& filePath) {
-    ifstream ifs(filePath.c_str());
+  void LoadStopWordDict(const std::string& filePath) {
+    std::ifstream ifs(filePath.c_str());
     XCHECK(ifs.is_open()) << "open " << filePath << " failed";
-    string line ;
+    std::string line ;
     while (getline(ifs, line)) {
       stopWords_.insert(line);
     }

@@ -136,18 +134,16 @@ class KeywordExtractor {
   }

   MixSegment segment_;
-  unordered_map<string, double> idfMap_;
+  std::unordered_map<std::string, double> idfMap_;
   double idfAverage_;

-  unordered_set<string> stopWords_;
+  std::unordered_set<std::string> stopWords_;
 }; // class KeywordExtractor

-inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+inline std::ostream& operator << (std::ostream& os, const KeywordExtractor::Word& word) {
   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
 }

 } // namespace cppjieba

 #endif
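A minimal sketch of the extractor whose signatures this hunk qualifies: segment the sentence, weight each word by TF multiplied by its IDF, and keep the top N. The dictionary paths are assumptions matching the repository's `dict/` layout:

```cpp
#include <iostream>
#include "cppjieba/KeywordExtractor.hpp"

int main() {
  // Paths are assumptions matching the repository's dict/ directory.
  cppjieba::KeywordExtractor extractor("dict/jieba.dict.utf8",
                                       "dict/hmm_model.utf8",
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");

  std::vector<cppjieba::KeywordExtractor::Word> keywords;
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的", keywords, 5);  // top-5 by TF-IDF
  for (size_t i = 0; i < keywords.size(); i++) {
    std::cout << keywords[i] << std::endl;  // formatted by the operator<< above
  }
  return 0;
}
```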
include/cppjieba/PosTagger.hpp
@@ -34,8 +34,8 @@ class PosTagger {
     RuneStrArray runes;
     const DictTrie * dict = segment.GetDictTrie();
     assert(dict != NULL);
-    if (!DecodeRunesInString(str, runes)) {
-      XLOG(ERROR) << "Decode failed.";
+    if (!DecodeUTF8RunesInString(str, runes)) {
+      XLOG(ERROR) << "UTF-8 decode failed for word: " << str;
       return POS_X;
     }
     tmp = dict->Find(runes.begin(), runes.end());
include/cppjieba/PreFilter.hpp
@@ -17,8 +17,8 @@ class PreFilter {
   PreFilter(const unordered_set<Rune>& symbols,
         const string& sentence)
     : symbols_(symbols) {
-    if (!DecodeRunesInString(sentence, sentence_)) {
-      XLOG(ERROR) << "decode failed. ";
+    if (!DecodeUTF8RunesInString(sentence, sentence_)) {
+      XLOG(ERROR) << "UTF-8 decode failed for input sentence";
     }
     cursor_ = sentence_.begin();
   }
include/cppjieba/SegmentBase.hpp
@@ -25,8 +25,8 @@ class SegmentBase {
   bool ResetSeparators(const string& s) {
     symbols_.clear();
     RuneStrArray runes;
-    if (!DecodeRunesInString(s, runes)) {
-      XLOG(ERROR) << "decode " << s << " failed";
+    if (!DecodeUTF8RunesInString(s, runes)) {
+      XLOG(ERROR) << "UTF-8 decode failed for separators: " << s;
       return false;
     }
     for (size_t i = 0; i < runes.size(); i++) {
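`ResetSeparators` is how callers customize which runes the pre-filter treats as hard delimiters before segmentation; it is inherited by the concrete segmenters. A short sketch under that assumption (the separator string and paths here are illustrative):

```cpp
#include "cppjieba/MixSegment.hpp"

int main() {
  // Paths are assumptions matching the repository's dict/ layout.
  cppjieba::MixSegment seg("dict/jieba.dict.utf8", "dict/hmm_model.utf8");

  // Treat only space and newline as separators; returns false (and logs
  // through XLOG) if the separator string is not valid UTF-8.
  if (!seg.ResetSeparators(" \n")) {
    return 1;
  }

  std::vector<std::string> words;
  seg.Cut("小明硕士毕业于中国科学院计算所", words);
  return 0;
}
```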
include/cppjieba/Unicode.hpp
@@ -84,7 +84,7 @@ struct RuneStrLite {
   }
 }; // struct RuneStrLite

-inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
+inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
   RuneStrLite rp(0, 0);
   if (str == NULL || len == 0) {
     return rp;

@@ -139,11 +139,11 @@ inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
   return rp;
 }

-inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
+inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
   runes.clear();
   runes.reserve(len / 2);
   for (uint32_t i = 0, j = 0; i < len;) {
-    RuneStrLite rp = DecodeRuneInString(s + i, len - i);
+    RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
     if (rp.len == 0) {
       runes.clear();
       return false;

@@ -156,14 +156,14 @@ inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes)
   return true;
 }

-inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
-  return DecodeRunesInString(s.c_str(), s.size(), runes);
+inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
+  return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
 }

-inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
+inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
   unicode.clear();
   RuneStrArray runes;
-  if (!DecodeRunesInString(s, len, runes)) {
+  if (!DecodeUTF8RunesInString(s, len, runes)) {
     return false;
   }
   unicode.reserve(runes.size());

@@ -174,17 +174,17 @@ inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
 }

 inline bool IsSingleWord(const string& str) {
-  RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
+  RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
   return rp.len == str.size();
 }

-inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
-  return DecodeRunesInString(s.c_str(), s.size(), unicode);
+inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
+  return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
 }

-inline Unicode DecodeRunesInString(const string& s) {
+inline Unicode DecodeUTF8RunesInString(const string& s) {
   Unicode result;
-  DecodeRunesInString(s, result);
+  DecodeUTF8RunesInString(s, result);
   return result;
 }
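The rename makes the encoding explicit at every call site without changing behavior. A minimal sketch of the renamed API; the expected values come from the unit-test expectations later in this changeset:

```cpp
#include <iostream>
#include "cppjieba/Unicode.hpp"

int main() {
  std::string s = "你好世界";
  cppjieba::RuneStrArray runes;

  // Each decoded element carries the code point plus its byte offset and length.
  if (!cppjieba::DecodeUTF8RunesInString(s, runes)) {
    return 1;  // invalid UTF-8 input, e.g. "123\x80", returns false
  }
  for (size_t i = 0; i < runes.size(); i++) {
    std::cout << runes[i].rune << " @" << runes[i].offset
              << " len=" << runes[i].len << std::endl;  // 20320 @0 len=3, ...
  }
  return 0;
}
```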
test/CMakeLists.txt
@@ -1,6 +1,12 @@
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})

-if(NOT MSVC)
-  ADD_EXECUTABLE(load_test load_test.cpp)
-  ADD_SUBDIRECTORY(unittest)
-endif()
+# Configure test paths
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/test_paths.h.in" "${CMAKE_BINARY_DIR}/test/test_paths.h")
+
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_BINARY_DIR}/test
+)
+
+ADD_EXECUTABLE(load_test load_test.cpp)
+ADD_SUBDIRECTORY(unittest)
test/load_test.cpp
@@ -6,14 +6,15 @@
 #include "cppjieba/MixSegment.hpp"
 #include "cppjieba/KeywordExtractor.hpp"
 #include "limonp/Colors.hpp"
+#include "test_paths.h"

 using namespace cppjieba;

 void Cut(size_t times = 50) {
-  MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
+  MixSegment seg(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   vector<string> res;
   string doc;
-  ifstream ifs("../test/testdata/weicheng.utf8");
+  ifstream ifs(TEST_DATA_DIR "/weicheng.utf8");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();

@@ -29,10 +30,13 @@ void Cut(size_t times = 50) {
 }

 void Extract(size_t times = 400) {
-  KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+  KeywordExtractor Extractor(DICT_DIR "/jieba.dict.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8");
   vector<string> words;
   string doc;
-  ifstream ifs("../test/testdata/review.100");
+  ifstream ifs(TEST_DATA_DIR "/review.100");
   assert(ifs);
   doc << ifs;
   long beginTime = clock();
test/test_paths.h.in (new file, +7)
@@ -0,0 +1,7 @@
+#ifndef TEST_PATHS_H
+#define TEST_PATHS_H
+
+#define TEST_DATA_DIR "@CMAKE_CURRENT_SOURCE_DIR@/testdata"
+#define DICT_DIR "@CMAKE_SOURCE_DIR@/dict"
+
+#endif // TEST_PATHS_H
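CMake's `configure_file` substitutes the `@...@` placeholders at configure time, so the generated `test_paths.h` contains absolute paths and the tests no longer depend on the working directory (which is what the `../dict/...` paths they replace assumed). The full path is then assembled by compile-time string-literal concatenation; a small sketch:

```cpp
#include <iostream>
#include "test_paths.h"  // generated into ${CMAKE_BINARY_DIR}/test by configure_file

int main() {
  // Adjacent string literals are merged by the compiler, so this is a single
  // constant, e.g. "/home/user/cppjieba/dict/jieba.dict.utf8" (path illustrative).
  const char* dict = DICT_DIR "/jieba.dict.utf8";
  std::cout << dict << std::endl;
  return 0;
}
```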
test/unittest/CMakeLists.txt
@@ -1,6 +1,8 @@
+message(STATUS "MSVC value: ${MSVC}")
 if (MSVC)
   set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
   set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  add_compile_options(/utf-8)
 endif()

 include(FetchContent)

@@ -8,7 +10,7 @@ include(FetchContent)
 FetchContent_Declare(
   googletest
   GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG release-1.11.0
+  GIT_TAG release-1.12.1
 )
 FetchContent_MakeAvailable(googletest)

@@ -18,6 +20,12 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

 ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)

+# Add include directories
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_BINARY_DIR}/test
+)
+
 ADD_EXECUTABLE(test.run
   gtest_main.cpp
   keyword_extractor_test.cpp
test/unittest/jieba_test.cpp
@@ -1,14 +1,11 @@
 #include "cppjieba/Jieba.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

-TEST(JiebaTest, Test1) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+TEST(JiebaTest, Test0) {
+  cppjieba::Jieba jieba;
   vector<string> words;
   string result;

@@ -38,14 +35,51 @@ TEST(JiebaTest, Test1) {
   jieba.CutForSearch("他来到了网易杭研大厦", words);
   result << words;
   ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
-
 }

+TEST(JiebaTest, Test1) {
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
+  vector<string> words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
+  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+
+  jieba.CutAll("我来自北京邮电大学", words);
+  result << words;
+  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
+
+  jieba.CutForSearch("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+}
+
 TEST(JiebaTest, WordTest) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
   vector<Word> words;
   string result;

@@ -83,11 +117,11 @@ TEST(JiebaTest, WordTest) {
 }

 TEST(JiebaTest, InsertUserWord) {
-  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
-                        "../dict/hmm_model.utf8",
-                        "../dict/user.dict.utf8",
-                        "../dict/idf.utf8",
-                        "../dict/stop_words.utf8");
+  cppjieba::Jieba jieba(DICT_DIR "/jieba.dict.utf8",
+                        DICT_DIR "/hmm_model.utf8",
+                        DICT_DIR "/user.dict.utf8",
+                        DICT_DIR "/idf.utf8",
+                        DICT_DIR "/stop_words.utf8");
   vector<string> words;
   string result;
test/unittest/keyword_extractor_test.cpp
@@ -1,10 +1,14 @@
 #include "cppjieba/KeywordExtractor.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(KeywordExtractorTest, Test1) {
-  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8");

   {
     string s("你好世界世界而且而且");

@@ -55,7 +59,11 @@ TEST(KeywordExtractorTest, Test1) {
 }

 TEST(KeywordExtractorTest, Test2) {
-  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
+  KeywordExtractor Extractor(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                             DICT_DIR "/hmm_model.utf8",
+                             DICT_DIR "/idf.utf8",
+                             DICT_DIR "/stop_words.utf8",
+                             TEST_DATA_DIR "/userdict.utf8");

   {
     string s("蓝翔优秀毕业生");
test/unittest/pos_tagger_test.cpp
@@ -1,5 +1,6 @@
 #include "cppjieba/MixSegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

@@ -13,7 +14,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
 //static const char * const ANS_TEST3 = "";

 TEST(PosTaggerTest, Test) {
-  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
+  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   {
     vector<pair<string, string> > res;
     tagger.Tag(QUERY_TEST1, res);

@@ -23,7 +24,7 @@ TEST(PosTaggerTest, Test) {
   }
 }
 TEST(PosTagger, TestUserDict) {
-  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
+  MixSegment tagger(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", TEST_DATA_DIR "/userdict.utf8");
   {
     vector<pair<string, string> > res;
     tagger.Tag(QUERY_TEST2, res);
test/unittest/segments_test.cpp
@@ -5,11 +5,12 @@
 #include "cppjieba/FullSegment.hpp"
 #include "cppjieba/QuerySegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(MixSegmentTest, Test1) {
-  MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
+  MixSegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
   string sentence;
   vector<string> words;
   string actual;

@@ -49,16 +50,18 @@ TEST(MixSegmentTest, Test1) {
 }

 TEST(MixSegmentTest, NoUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8");
   const char* str = "令狐冲是云计算方面的专家";
   vector<string> words;
   segment.Cut(str, words);
   string res;
   ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
-
 }

 TEST(MixSegmentTest, UserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                     DICT_DIR "/hmm_model.utf8",
+                     DICT_DIR "/user.dict.utf8");
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;

@@ -83,9 +86,10 @@ TEST(MixSegmentTest, UserDict) {
     ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
   }
 }

 TEST(MixSegmentTest, TestUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
-                     "../test/testdata/userdict.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
+                     TEST_DATA_DIR "/userdict.utf8");
   vector<string> words;
   string res;

@@ -123,8 +127,8 @@ TEST(MixSegmentTest, TestUserDict) {
 }

 TEST(MixSegmentTest, TestMultiUserDict) {
-  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
-                     "../test/testdata/userdict.utf8;../test/testdata/userdict.2.utf8");
+  MixSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8", DICT_DIR "/hmm_model.utf8",
+                     TEST_DATA_DIR "/userdict.utf8;" TEST_DATA_DIR "/userdict.2.utf8");
   vector<string> words;
   string res;

@@ -134,7 +138,7 @@ TEST(MixSegmentTest, TestMultiUserDict) {
 }

 TEST(MPSegmentTest, Test1) {
-  MPSegment segment("../dict/jieba.dict.utf8");;
+  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
   string s;
   vector<string> words;
   segment.Cut("我来自北京邮电大学。", words);

@@ -163,7 +167,7 @@ TEST(MPSegmentTest, Test1) {
 }

 TEST(HMMSegmentTest, Test1) {
-  HMMSegment segment("../dict/hmm_model.utf8");;
+  HMMSegment segment(DICT_DIR "/hmm_model.utf8");
   {
     const char* str = "我来自北京邮电大学。。。学号123456";
     const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};

@@ -182,7 +186,7 @@ TEST(HMMSegmentTest, Test1) {
 }

 TEST(FullSegment, Test1) {
-  FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
+  FullSegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8");
   vector<string> words;
   string s;

@@ -197,7 +201,7 @@ TEST(FullSegment, Test1) {
 }

 TEST(QuerySegment, Test1) {
-  QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
+  QuerySegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8", "");
   vector<string> words;
   string s1, s2;

@@ -218,7 +222,9 @@ TEST(QuerySegment, Test1) {
 }

 TEST(QuerySegment, Test2) {
-  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
+  QuerySegment segment(TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+                       DICT_DIR "/hmm_model.utf8",
+                       TEST_DATA_DIR "/userdict.utf8|" TEST_DATA_DIR "/userdict.english");
   vector<string> words;
   string s1, s2;

@@ -242,14 +248,13 @@ TEST(QuerySegment, Test2) {
     s2 = "中国/科学/学院/科学院/中国科学院";
     ASSERT_EQ(s1, s2);
   }
-
 }

 TEST(MPSegmentTest, Unicode32) {
   string s("天气很好,🙋 我们去郊游。");
   vector<string> words;

-  MPSegment segment("../dict/jieba.dict.utf8");;
+  MPSegment segment(DICT_DIR "/jieba.dict.utf8");
   segment.Cut(s, words);

   ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");
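The multi-dictionary tests above rely on the separator handling in `DictTrie::LoadUserDict`, which splits the user-dictionary argument on '|' or ';'. A minimal sketch outside the test harness, with illustrative paths:

```cpp
#include "cppjieba/MixSegment.hpp"

int main() {
  // Two user dictionaries in a single argument, ';'-separated ('|' also works).
  cppjieba::MixSegment seg("dict/jieba.dict.utf8",
                           "dict/hmm_model.utf8",
                           "test/testdata/userdict.utf8;test/testdata/userdict.2.utf8");
  std::vector<std::string> words;
  seg.Cut("令狐冲是云计算方面的专家", words);  // "云计算" stays whole via the user dict
  return 0;
}
```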
test/unittest/textrank_test.cpp
@@ -1,13 +1,14 @@
 #include "cppjieba/TextRankExtractor.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

 TEST(TextRankExtractorTest, Test1) {
   TextRankExtractor Extractor(
-      "../test/testdata/extra_dict/jieba.dict.small.utf8",
-      "../dict/hmm_model.utf8",
-      "../dict/stop_words.utf8");
+      TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+      DICT_DIR "/hmm_model.utf8",
+      DICT_DIR "/stop_words.utf8");
   {
     string s("你好世界世界而且而且");
     string res;

@@ -59,10 +60,10 @@ TEST(TextRankExtractorTest, Test1) {

 TEST(TextRankExtractorTest, Test2) {
   TextRankExtractor Extractor(
-      "../test/testdata/extra_dict/jieba.dict.small.utf8",
-      "../dict/hmm_model.utf8",
-      "../dict/stop_words.utf8",
-      "../test/testdata/userdict.utf8");
+      TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8",
+      DICT_DIR "/hmm_model.utf8",
+      DICT_DIR "/stop_words.utf8",
+      TEST_DATA_DIR "/userdict.utf8");

   {
     string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
test/unittest/trie_test.cpp
@@ -1,10 +1,11 @@
 #include "cppjieba/DictTrie.hpp"
 #include "cppjieba/MPSegment.hpp"
 #include "gtest/gtest.h"
+#include "test_paths.h"

 using namespace cppjieba;

-static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
+static const char* const DICT_FILE = TEST_DATA_DIR "/extra_dict/jieba.dict.small.utf8";

 TEST(TrieTest, Empty) {
   vector<Unicode> keys;

@@ -15,7 +16,7 @@ TEST(TrieTest, Empty) {
 TEST(TrieTest, Construct) {
   vector<Unicode> keys;
   vector<const DictUnit*> values;
-  keys.push_back(DecodeRunesInString("你"));
+  keys.push_back(DecodeUTF8RunesInString("你"));
   values.push_back((const DictUnit*)(NULL));
   Trie trie(keys, values);
 }

@@ -32,13 +33,7 @@ TEST(DictTrieTest, Test1) {
   ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
   string word("来到");
   cppjieba::RuneStrArray uni;
-  ASSERT_TRUE(DecodeRunesInString(word, uni));
-  //DictUnit nodeInfo;
-  //nodeInfo.word = uni;
-  //nodeInfo.tag = "v";
-  //nodeInfo.weight = -8.87033;
-  //s1 << nodeInfo;
-  //s2 << (*trie.Find(uni.begin(), uni.end()));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
   const DictUnit* du = trie.Find(uni.begin(), uni.end());
   ASSERT_TRUE(du != NULL);
   ASSERT_EQ(2u, du->word.size());

@@ -47,45 +42,42 @@ TEST(DictTrieTest, Test1) {
   ASSERT_EQ("v", du->tag);
   ASSERT_NEAR(-8.870, du->weight, 0.001);

-  //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
   word = "清华大学";
   LocalVector<pair<size_t, const DictUnit*> > res;
   const char * words[] = {"清", "清华", "清华大学"};
   for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
-    ASSERT_TRUE(DecodeRunesInString(words[i], uni));
+    ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
     res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
-    //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
   }
   vector<pair<size_t, const DictUnit*> > vec;
   vector<struct Dag> dags;
-  ASSERT_TRUE(DecodeRunesInString(word, uni));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
   trie.Find(uni.begin(), uni.end(), dags);
   ASSERT_EQ(dags.size(), uni.size());
   ASSERT_NE(dags.size(), 0u);
   s1 << res;
   s2 << dags[0].nexts;
   ASSERT_EQ(s1, s2);
-
 }

 TEST(DictTrieTest, UserDict) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");
   string word = "云计算";
   cppjieba::RuneStrArray unicode;
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_NEAR(unit->weight, -14.100, 0.001);

   word = "蓝翔";
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_EQ(unit->tag, "nz");
   ASSERT_NEAR(unit->weight, -14.100, 0.001);

   word = "区块链";
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit != NULL);
   ASSERT_EQ(unit->tag, "nz");

@@ -93,22 +85,22 @@ TEST(DictTrieTest, UserDict) {
 }

 TEST(DictTrieTest, UserDictWithMaxWeight) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8", DictTrie::WordWeightMax);
   string word = "云计算";
   cppjieba::RuneStrArray unicode;
-  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
   const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
   ASSERT_TRUE(unit);
   ASSERT_NEAR(unit->weight, -2.975, 0.001);
 }

 TEST(DictTrieTest, Dag) {
-  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+  DictTrie trie(DICT_FILE, TEST_DATA_DIR "/userdict.utf8");

   {
     string word = "清华大学";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -122,7 +114,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "北京邮电大学";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -136,7 +128,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res);

@@ -150,7 +142,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res, 3);

@@ -164,7 +156,7 @@ TEST(DictTrieTest, Dag) {
   {
     string word = "长江大桥";
     cppjieba::RuneStrArray unicode;
-    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
     vector<struct Dag> res;
     trie.Find(unicode.begin(), unicode.end(), res, 4);
test/unittest/unicode_test.cpp
@@ -8,7 +8,7 @@ using namespace std;
 TEST(UnicodeTest, Test1) {
   string s = "你好世界";
   RuneStrArray runes;
-  ASSERT_TRUE(DecodeRunesInString(s, runes));
+  ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
   string actual;
   string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
   actual << runes;

@@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) {
 TEST(UnicodeTest, Illegal) {
   string s = "123\x80";
   RuneStrArray runes;
-  ASSERT_FALSE(DecodeRunesInString(s, runes));
+  ASSERT_FALSE(DecodeUTF8RunesInString(s, runes));
   string actual;
   string expected = "[]";
   actual << runes;

@@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) {
     s[rand() % len] = rand();
   }
   RuneStrArray runes;
-  DecodeRunesInString(s, runes);
+  DecodeUTF8RunesInString(s, runes);
 }
 }