make QuerySegment support user.dict.utf8

2025-07-18 00:00:12 +08:00 · 2015-01-23 01:10:12 +08:00 · 2015-01-23 01:10:12 +08:00 · 269bc0fd0d
commit 269bc0fd0d
parent a406c0f8cc
5 changed files with 40 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -344,7 +344,9 @@ http://cppjieba-webdemo.herokuapp.com/

 ## 客服

-`wuyanyi09@foxmail.com`
+`i@yanyiwu.com`
+
+![image](http://yanyiwu.com/weedfs/2/5a7d1b5c0d/yanyiwu_personal_qrcodes.jpg)

 ## 鸣谢

@@ -353,7 +355,7 @@ https://github.com/fxsjy/jieba

 ## 作者

- aszxqw https://github.com/aszxqw i@yanyiwu.com
+- yanyiwu https://github.com/aszxqw i@yanyiwu.com
 - aholic https://github.com/aholic ruochen.xu@gmail.com

 [CppJieba]:https://github.com/aszxqw/cppjieba
--- a/src/QuerySegment.hpp
+++ b/src/QuerySegment.hpp
@@ -24,15 +24,15 @@ namespace CppJieba

    public:
        QuerySegment(){};
-        QuerySegment(const string& dict, const string& model, size_t maxWordLen)
+        QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
        {
-            init(dict, model, maxWordLen);
+            init(dict, model, maxWordLen, userDict);
        };
        virtual ~QuerySegment(){};
    public:
-        bool init(const string& dict, const string& model, size_t maxWordLen)
+        bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
        {
-            LIMONP_CHECK(_mixSeg.init(dict, model));
+            LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
            LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
            assert(maxWordLen);
            _maxWordLen = maxWordLen;
--- a/test/testdata/userdict.utf8
+++ b/test/testdata/userdict.utf8
@@ -2,5 +2,5 @@
 韩玉鉴赏
 A
 B
-iphone6
+iPhone6
 蓝翔 nz
--- a/test/unittest/TPosTagger.cpp
+++ b/test/unittest/TPosTagger.cpp
@@ -8,8 +8,8 @@ static const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"
 static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上总经理，出任CEO，迎娶白富美，走上人生巅峰。";
 static const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \"，:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \"，:x\", \"当上:t\", \"总经理:n\", \"，:x\", \"出任:v\", \"CEO:eng\", \"，:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \"，:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";

-static const char * const QUERY_TEST3 = "iphone6手机的最大特点是很容易弯曲。";
-static const char * const ANS_TEST3 = "[\"iphone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
+static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。";
+static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
 //static const char * const ANS_TEST3 = "";

 TEST(PosTaggerTest, Test)
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -185,3 +185,32 @@ TEST(QuerySegment, Test1)

 }

+TEST(QuerySegment, Test2)
+{
+    QuerySegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8");
+
+    {
+        const char* str = "小明硕士毕业于中国科学院计算所，后在日本京都大学深造";
+        vector<string> words;
+
+        ASSERT_TRUE(segment.cut(str, words));
+
+        string s1, s2;
+        s1 << words;
+        s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"，\", \"后\", \"在\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]";
+        ASSERT_EQ(s1, s2);
+    }
+
+    {
+        const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
+        vector<string> words;
+
+        ASSERT_TRUE(segment.cut(str, words));
+
+        string s1, s2;
+        s1 << words;
+        s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]";
+        ASSERT_EQ(s1, s2);
+    }
+    
+}