diff --git a/README.md b/README.md index a7d8724..3e6b508 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,13 @@ Output: ``` 我来到北京清华大学 我来/到/北京/清华大学 + 他来到了网易杭研大厦 他来/到/了/网易/杭/研大厦 + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造 + 我来自北京邮电大学。。。学号091111xx。。。 我来/自北京/邮电大学/。。。/学号/091111xx/。。。 ``` @@ -94,23 +97,60 @@ Output: ``` 我来到北京清华大学 我/来到/北京/清华大学 + 他来到了网易杭研大厦 他/来到/了/网易/杭研/大厦 -杭研 -杭研 + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造 + 我来自北京邮电大学。。。学号091111xx。。。 我/来自/北京邮电大学/。。。/学号/091111xx/。。。 ``` +### FullSegment's demo + +Output: +``` +我来到北京清华大学 +我/来到/北京/清华/清华大学/华大/大学 + +他来到了网易杭研大厦 +他/来到/了/网易/杭/研/大厦 + +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +小/明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造 + +我来自北京邮电大学。。。 学号 091111xx。。。 +我/来自/北京/北京邮电/北京邮电大学/邮电/邮电大学/电大/大学/。/。/。/ /学号/ 091111xx/。/。/。 +``` + +### QuerySegment's demo + +Output: +``` +我来到北京清华大学 +我/来到/北京/清华/清华大学/华大/大学 + +他来到了网易杭研大厦 +他/来到/了/网易/杭研/大厦 + +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/中国/中国科学院/科学/科学院/学院/日本/日本京都大学/京都/京都大学/大学/深造 + +我来自北京邮电大学。。。 学号 091111xx。。。 +我/来自/北京/北京邮电/北京邮电大学/邮电/邮电大学/电大/大学/。/。/。/ /学号/ 091111xx/。/。/。 +``` + ### 效果分析 以上依次是MP,HMM,Mix三种方法的效果。 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。即可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。 +Full方法切出所有字典里的词语。 +Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。 ## 模块详解 @@ -120,6 +160,10 @@ Output: 核心目录,包含主要源代码。 +#### TrieManager模块 +TrieManager.hpp 提供一个单例TrieManager,负责管理trie树。 +通过该单例获取trie树时,会先判断是否已经由该字典文件生成了一颗trie树,如果已有则返回已有的trie树,否则重新创建一颗trie树返回。 + #### Trie树 Trie.hpp 负责载入词典的trie树,主要供Segment模块使用。 @@ -133,6 +177,9 @@ HMMSegment.hpp HMM模型由dicts/下面的`hmm_model.utf8`提供。 分词算法即viterbi算法。 +FullSegment.hpp +枚举句子中所有可能成词的情况,找出字典里存在的即可。 + #### TransCode模块 TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。 @@ -154,8 +201,8 @@ TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint ### MixSegment -分词速度大概是 62M / 54sec = 1.15M/sec -测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz` +分词速度大概是 = 2M/sec +测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz` 电脑下开的ubuntu虚拟机 ## 联系客服 diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index d67f2f3..d4bfa31 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -9,13 +9,14 @@ #include "ISegment.hpp" #include "SegmentBase.hpp" #include "TransCode.hpp" +#include "TrieManager.hpp" namespace CppJieba { class FullSegment: public SegmentBase { private: - Trie _trie; + Trie* _trie; const string _dictPath; public: @@ -24,36 +25,25 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if(_getInitFlag()) { LogError("already inited before now."); return false; } -#endif - if(!_trie.init()) + _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); + if (NULL == _trie) { - LogError("_trie.init failed."); + LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str()); return false; } - LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); - if(!_trie.loadDict(_dictPath.c_str())) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); return _setInitFlag(true); } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif - _trie.dispose(); _setInitFlag(false); return true; } @@ -65,18 +55,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } -#endif + //resut of searching in trie tree vector > tRes; @@ -91,7 +75,7 @@ namespace CppJieba for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { //find word start from uItr - if (_trie.find(uItr, end, tRes)) + if (_trie->find(uItr, end, tRes)) { for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { @@ -123,26 +107,21 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); + LogError("begin >= end"); return false; } - if (begin > end) - { - LogError("begin > end"); - return false; - } -#endif + vector uRes; if (!cut(begin, end, uRes)) { LogError("get unicode cut result error."); return false; } - string tmp; + string tmp; for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { if (TransCode::encode(*uItr, tmp)) diff --git a/src/Limonp/md5.hpp b/src/Limonp/md5.hpp new file mode 100644 index 0000000..8b861e1 --- /dev/null +++ b/src/Limonp/md5.hpp @@ -0,0 +1,435 @@ +#ifndef __MD5_H__ +#define __MD5_H__ + +// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +// rights reserved. + +// License to copy and use this software is granted provided that it +// is identified as the "RSA Data Security, Inc. MD5 Message-Digest +// Algorithm" in all material mentioning or referencing this software +// or this function. +// +// License is also granted to make and use derivative works provided +// that such works are identified as "derived from the RSA Data +// Security, Inc. MD5 Message-Digest Algorithm" in all material +// mentioning or referencing the derived work. +// +// RSA Data Security, Inc. makes no representations concerning either +// the merchantability of this software or the suitability of this +// software for any particular purpose. It is provided "as is" +// without express or implied warranty of any kind. +// +// These notices must be retained in any copies of any part of this +// documentation and/or software. + + + +// The original md5 implementation avoids external libraries. +// This version has dependency on stdio.h for file input and +// string.h for memcpy. +#include +#include +#include + +namespace Limonp +{ + +#pragma region MD5 defines +// Constants for MD5Transform routine. +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + + +// F, G, H and I are basic MD5 functions. +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +// ROTATE_LEFT rotates x left n bits. +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#pragma endregion + + +typedef unsigned char BYTE ; + +// POINTER defines a generic pointer type +typedef unsigned char *POINTER; + +// UINT2 defines a two byte word +typedef unsigned short int UINT2; + +// UINT4 defines a four byte word +typedef unsigned long int UINT4; + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +// convenient object that wraps +// the C-functions for use in C++ only +class MD5 +{ +private: + struct __context_t { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ + } context ; + + #pragma region static helper functions + // The core of the MD5 algorithm is here. + // MD5 basic transformation. Transforms state based on block. + static void MD5Transform( UINT4 state[4], unsigned char block[64] ) + { + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + // Zeroize sensitive information. + memset((POINTER)x, 0, sizeof (x)); + } + + // Encodes input (UINT4) into output (unsigned char). Assumes len is + // a multiple of 4. + static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) + { + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } + } + + // Decodes input (unsigned char) into output (UINT4). Assumes len is + // a multiple of 4. + static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) + { + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); + } + #pragma endregion + + +public: + // MAIN FUNCTIONS + MD5() + { + Init() ; + } + + // MD5 initialization. Begins an MD5 operation, writing a new context. + void Init() + { + context.count[0] = context.count[1] = 0; + + // Load magic initialization constants. + context.state[0] = 0x67452301; + context.state[1] = 0xefcdab89; + context.state[2] = 0x98badcfe; + context.state[3] = 0x10325476; + } + + // MD5 block update operation. Continues an MD5 message-digest + // operation, processing another message block, and updating the + // context. + void Update( + unsigned char *input, // input block + unsigned int inputLen ) // length of input block + { + unsigned int i, index, partLen; + + // Compute number of bytes mod 64 + index = (unsigned int)((context.count[0] >> 3) & 0x3F); + + // Update number of bits + if ((context.count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context.count[1]++; + context.count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + // Transform as many times as possible. + if (inputLen >= partLen) { + memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen); + MD5Transform (context.state, context.buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context.state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i); + } + + // MD5 finalization. Ends an MD5 message-digest operation, writing the + // the message digest and zeroizing the context. + // Writes to digestRaw + void Final() + { + unsigned char bits[8]; + unsigned int index, padLen; + + // Save number of bits + Encode( bits, context.count, 8 ); + + // Pad out to 56 mod 64. + index = (unsigned int)((context.count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + Update( PADDING, padLen ); + + // Append length (before padding) + Update( bits, 8 ); + + // Store state in digest + Encode( digestRaw, context.state, 16); + + // Zeroize sensitive information. + memset((POINTER)&context, 0, sizeof (context)); + + writeToString() ; + } + + /// Buffer must be 32+1 (nul) = 33 chars long at least + void writeToString() + { + int pos ; + + for( pos = 0 ; pos < 16 ; pos++ ) + sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ; + } + + +public: + // an MD5 digest is a 16-byte number (32 hex digits) + BYTE digestRaw[ 16 ] ; + + // This version of the digest is actually + // a "printf'd" version of the digest. + char digestChars[ 33 ] ; + + /// Load a file from disk and digest it + // Digests a file and returns the result. + const char* digestFile( const char *filename ) + { + if (NULL == filename || strcmp(filename, "") == 0) + return NULL; + + Init() ; + + FILE *file; + + int len; + unsigned char buffer[1024] ; + + if((file = fopen (filename, "rb")) == NULL) + { + return NULL; + } + else + { + while( len = fread( buffer, 1, 1024, file ) ) + Update( buffer, len ) ; + Final(); + + fclose( file ); + } + + return digestChars ; + } + + /// Digests a byte-array already in memory + const char* digestMemory( BYTE *memchunk, int len ) + { + if (NULL == memchunk) + return NULL; + + Init() ; + Update( memchunk, len ) ; + Final() ; + + return digestChars ; + } + + // Digests a string and prints the result. + const char* digestString(const char *string ) + { + if (string == NULL) + return NULL; + + Init() ; + Update( (unsigned char*)string, strlen(string) ) ; + Final() ; + + return digestChars ; + } +}; + +inline bool md5String(const char* str, std::string& res) +{ + if (NULL == str) + { + res = ""; + return false; + } + + MD5 md5; + const char *pRes = md5.digestString(str); + if (NULL == pRes) + { + res = ""; + return false; + } + + res = pRes; + return true; +} + +inline bool md5File(const char* filepath, std::string& res) +{ + if (NULL == filepath || strcmp(filepath, "") == 0) + { + res = ""; + return false; + } + + MD5 md5; + const char *pRes = md5.digestFile(filepath); + + if (NULL == pRes) + { + res = ""; + return false; + } + + res = pRes; + return true; +} +} +#endif diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 14b8f21..b0a7bed 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -10,6 +10,7 @@ #include #include "Limonp/logger.hpp" #include "Trie.hpp" +#include "TrieManager.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" @@ -32,7 +33,7 @@ namespace CppJieba class MPSegment: public SegmentBase { private: - Trie _trie; + Trie* _trie; private: const string _dictPath; @@ -47,18 +48,12 @@ namespace CppJieba LogError("already inited before now."); return false; } - if(!_trie.init()) + _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); + if (_trie == NULL) { - LogError("_trie.init failed."); + LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str()); return false; } - LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); - if(!_trie.loadDict(_dictPath.c_str())) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); return _setInitFlag(true); } virtual bool dispose() @@ -67,18 +62,12 @@ namespace CppJieba { return true; } - _trie.dispose(); _setInitFlag(false); return true; } public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); vector segWordInfos; @@ -145,7 +134,7 @@ namespace CppJieba { SegmentChar schar(*it); uint i = it - begin; - _trie.find(it, end, i, schar.dag); + _trie->find(it, end, i, schar.dag); //DagType::iterator dagIter; if(schar.dag.end() == schar.dag.find(i)) { @@ -183,7 +172,7 @@ namespace CppJieba } else { - val += _trie.getMinLogFreq(); + val += _trie->getMinLogFreq(); } if(val > segContext[i].weight) { @@ -211,7 +200,7 @@ namespace CppJieba TrieNodeInfo nodeInfo; nodeInfo.word.push_back(segContext[i].uniCh); nodeInfo.freq = 0; - nodeInfo.logFreq = _trie.getMinLogFreq(); + nodeInfo.logFreq = _trie->getMinLogFreq(); res.push_back(nodeInfo); i++; } diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 5f35031..79c40d1 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,72 +55,95 @@ namespace CppJieba public: using SegmentBase::cut; public: - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); - if(begin == end) + if(begin >= end) { + LogError("begin >= end"); return false; } + vector infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } - Unicode unico; + vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) + Unicode piece; + for (uint i = 0, j = 0; i < infos.size(); i++) { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) + //if mp get a word, it's ok, put it into result + if (1 != infos[i].word.size()) { - unico.push_back(infos[i].word[0]); + res.push_back(infos[i].word); + continue; } - else + + // if mp get a single one, collect it in sequence + j = i; + while (j < infos.size() && infos[j].word.size() == 1) { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); + piece.push_back(infos[j].word[0]); + j++; } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + + // cut the sequence with hmm + if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } - for(uint j = 0; j < hmmRes.size(); j++) + + //put hmm result to return + for (uint k = 0; k < hmmRes.size(); k++) { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); + res.push_back(hmmRes[k]); } + + //clear tmp vars + piece.clear(); + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; } return true; } + + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + assert(_getInitFlag()); + if(begin >= end) + { + LogError("begin >= end"); + return false; + } + + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 4487339..a57f710 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -8,34 +8,33 @@ #include "Trie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" -#include "HMMSegment.hpp" #include "FullSegment.hpp" +#include "MixSegment.hpp" #include "TransCode.hpp" +#include "TrieManager.hpp" namespace CppJieba { class QuerySegment: public SegmentBase { private: - HMMSegment _hmmSeg; + MixSegment _mixSeg; FullSegment _fullSeg; int _maxWordLen; public: - QuerySegment(const char* fullSegDict, const char* hmmSegDict, int maxWordLen): _hmmSeg(hmmSegDict), _fullSeg(fullSegDict), _maxWordLen(maxWordLen){}; + QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){}; virtual ~QuerySegment(){dispose();}; public: bool init() { -#ifndef NO_CODING_LOG if (_getInitFlag()) { LogError("inited."); } -#endif - if (!_hmmSeg.init()) + if (!_mixSeg.init()) { - LogError("_hmmSeg init"); + LogError("_mixSeg init"); return false; } if (!_fullSeg.init()) @@ -47,14 +46,12 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _fullSeg.dispose(); - _hmmSeg.dispose(); + _mixSeg.dispose(); _setInitFlag(false); return true; } @@ -66,34 +63,28 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } -#endif - //use hmm cut first - vector hmmRes; - if (!_hmmSeg.cut(begin, end, hmmRes)) + + //use mix cut first + vector mixRes; + if (!_mixSeg.cut(begin, end, mixRes)) { - LogError("_hmmSeg cut failed."); + LogError("_mixSeg cut failed."); return false; } vector fullRes; - for (vector::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++) + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, cut with _fullSeg, put fullRes in res - if (hmmResItr->size() > _maxWordLen) + if (mixResItr->size() > _maxWordLen) { - if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes)) + if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) { for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { @@ -101,9 +92,9 @@ namespace CppJieba } } } - else // just use the hmm result + else // just use the mix result { - res.push_back(*hmmResItr); + res.push_back(*mixResItr); } } @@ -113,18 +104,13 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); + LogError("begin >= end"); return false; } - if (begin > end) - { - LogError("begin > end"); - return false; - } -#endif + vector uRes; if (!cut(begin, end, uRes)) { diff --git a/src/TrieManager.hpp b/src/TrieManager.hpp new file mode 100644 index 0000000..2590c89 --- /dev/null +++ b/src/TrieManager.hpp @@ -0,0 +1,77 @@ +#ifndef CPPJIEBA_TRIEMANAGER_H +#define CPPJIEBA_TRIEMANAGER_H + +#include "Trie.hpp" +#include "Limonp/md5.hpp" +#include "Limonp/logger.hpp" + +namespace CppJieba +{ + using namespace Limonp; + class TrieManager + { + private: + unordered_map _tries; + TrieManager(){}; + TrieManager(TrieManager& tm){}; + public: + Trie* getTrie(const char* dictpath) + { + string md5; + if (!md5File(dictpath, md5)) + { + LogError("error when getting md5 for file '%s'", dictpath); + return NULL; + } + else + { + LogInfo("md5 for file '%s': %s", dictpath, md5.c_str()); + if (_tries.find(md5) == _tries.end()) + { + LogInfo("create a new trie for md5: '%s'", md5.c_str()); + Trie* trie = NULL; + try + { + trie = new Trie(); + } + catch (const bad_alloc& e) + { + LogError("error when new a trie for file '%s'", dictpath); + return NULL; + } + if (NULL == trie) + return NULL; + + if (!trie->init()) + { + LogError("trie init error for file '%s'", dictpath); + return NULL; + } + + LogInfo("trie->loadDict(%s) start...", dictpath); + if (!trie->loadDict(dictpath)) + { + LogError("trie->loadDict(%s) failed...", dictpath); + return NULL; + } + LogInfo("trie->loadDict end..."); + + _tries[md5.c_str()] = trie; + return trie; + } + else + { + LogInfo("find a exits trie for md5: '%s'", md5.c_str()); + return _tries[md5.c_str()]; + } + } + } + + static TrieManager& getInstance() + { + static TrieManager _this; + return _this; + } + }; +} +#endif diff --git a/src/segment.cpp b/src/segment.cpp index 2eb61f4..37cad63 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -43,12 +43,13 @@ int main(int argc, char ** argv) <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutQuery, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" - <<"\t--maxlen\tspecify the granularity of cut used in cutQuery, If not specified, the default is 3\n" + <<"\t--maxlen\tspecify the granularity of cut used in cutQuery. \n\t\t\tIf not specified, the default is 3\n" <<"example:\n" - <<"\t"<