细粒度分词功能

2025-07-18 00:00:12 +08:00 · 2015-08-30 16:35:21 +08:00 · 2015-08-30 16:35:21 +08:00 · 1babe57ebc
commit 1babe57ebc
parent 3c60c35906
3 changed files with 10 additions and 0 deletions
--- a/src/Application.hpp
+++ b/src/Application.hpp
@@ -67,6 +67,10 @@ class Application {
        vector<pair<string, size_t> >& words) const {
    levelSeg_.cut(sentence, words);
  }
+  void cut(const string& sentence,
+        vector<string>& words, size_t max_word_len) const {
+    mpSeg_.cut(sentence, words, max_word_len);
+  }
  bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
    return dictTrie_.insertUserWord(word, tag);
  }
--- a/test/unittest/TApplication.cpp
+++ b/test/unittest/TApplication.cpp
@@ -20,6 +20,9 @@ TEST(ApplicationTest, Test1) {
  result << words;
  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);

+  app.cut("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+
  app.cut("我来自北京邮电大学。。。学号123456", words, METHOD_HMM);
  result << words;
  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -101,6 +101,9 @@ TEST(MPSegmentTest, Test1) {
  // MaxWordLen
  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
+
+  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
+  ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
 }

 //TEST(MPSegmentTest, Test2) {