From 12328a3a7e9c847303343b72d3302e84c1696a08 Mon Sep 17 00:00:00 2001 From: aholic Date: Thu, 28 Nov 2013 09:17:29 +0800 Subject: [PATCH 1/8] remove macro NO_CODING_LOG --- src/FullSegment.hpp | 8 -------- src/MixSegment.hpp | 1 + src/QuerySegment.hpp | 8 -------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index d67f2f3..5fc3604 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -24,13 +24,11 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if(_getInitFlag()) { LogError("already inited before now."); return false; } -#endif if(!_trie.init()) { LogError("_trie.init failed."); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _trie.dispose(); _setInitFlag(false); return true; @@ -65,7 +61,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG //if (!_getInitFlag()) //{ // LogError("not inited."); @@ -76,7 +71,6 @@ namespace CppJieba LogError("begin >= end"); return false; } -#endif //resut of searching in trie tree vector > tRes; @@ -123,7 +117,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG if (!_getInitFlag()) { LogError("not inited."); @@ -134,7 +127,6 @@ namespace CppJieba LogError("begin > end"); return false; } -#endif vector uRes; if (!cut(begin, end, uRes)) { diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 5f35031..272a57f 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,6 +55,7 @@ namespace CppJieba public: using SegmentBase::cut; public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { //if(!_getInitFlag()) diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 4487339..de268c5 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -27,12 +27,10 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if (_getInitFlag()) { LogError("inited."); } -#endif if (!_hmmSeg.init()) { LogError("_hmmSeg init"); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _fullSeg.dispose(); _hmmSeg.dispose(); _setInitFlag(false); @@ -66,7 +62,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG //if (!_getInitFlag()) //{ // LogError("not inited."); @@ -77,7 +72,6 @@ namespace CppJieba LogError("begin >= end"); return false; } -#endif //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -113,7 +107,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG if (!_getInitFlag()) { LogError("not inited."); @@ -124,7 +117,6 @@ namespace CppJieba LogError("begin > end"); return false; } -#endif vector uRes; if (!cut(begin, end, uRes)) { From 599c130bd9a5790740e575777bda05aecc98b483 Mon Sep 17 00:00:00 2001 From: aholic Date: Thu, 28 Nov 2013 10:49:40 +0800 Subject: [PATCH 2/8] make MixSegment looks better --- src/FullSegment.hpp | 19 +++----- src/MixSegment.hpp | 102 ++++++++++++++++++++++++++----------------- src/QuerySegment.hpp | 17 +++----- 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 5fc3604..f1182fc 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -61,16 +61,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //resut of searching in trie tree vector > tRes; @@ -117,24 +113,21 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) { LogError("get unicode cut result error."); return false; } - string tmp; + string tmp; for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { if (TransCode::encode(*uItr, tmp)) diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 272a57f..79c40d1 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,73 +55,95 @@ namespace CppJieba public: using SegmentBase::cut; public: - - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); - if(begin == end) + if(begin >= end) { + LogError("begin >= end"); return false; } + vector infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } - Unicode unico; + vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) + Unicode piece; + for (uint i = 0, j = 0; i < infos.size(); i++) { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) + //if mp get a word, it's ok, put it into result + if (1 != infos[i].word.size()) { - unico.push_back(infos[i].word[0]); + res.push_back(infos[i].word); + continue; } - else + + // if mp get a single one, collect it in sequence + j = i; + while (j < infos.size() && infos[j].word.size() == 1) { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); + piece.push_back(infos[j].word[0]); + j++; } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + + // cut the sequence with hmm + if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } - for(uint j = 0; j < hmmRes.size(); j++) + + //put hmm result to return + for (uint k = 0; k < hmmRes.size(); k++) { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); + res.push_back(hmmRes[k]); } + + //clear tmp vars + piece.clear(); + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; } return true; } + + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + assert(_getInitFlag()); + if(begin >= end) + { + LogError("begin >= end"); + return false; + } + + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index de268c5..ae5987e 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -62,16 +62,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -107,16 +103,13 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) { From bcc2329a0eec8b2891ed380407821b47da8e5b19 Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 7 Dec 2013 08:11:43 -0800 Subject: [PATCH 3/8] modify README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3db03fd..a0482f8 100644 --- a/README.md +++ b/README.md @@ -154,8 +154,8 @@ TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint ### MixSegment -分词速度大概是 62M / 54sec = 1.15M/sec -测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz` +分词速度大概是 = 2M/sec +测试环境: `Intel(R) Xeon(R) CPU E5506 @ 2.13GHz` 电脑下开的ubuntu虚拟机 ## 联系客服 From 7c7d5e29bc69e0f3d77e78d92a12fda4e5d14c0c Mon Sep 17 00:00:00 2001 From: aholic Date: Mon, 16 Dec 2013 06:03:04 +0800 Subject: [PATCH 4/8] update Limonp, add TrieManager to manage tries --- src/Limonp/md5.hpp | 423 ++++++++++++++++++++++++++++++++++++++++++++ src/TrieManager.hpp | 74 ++++++++ 2 files changed, 497 insertions(+) create mode 100644 src/Limonp/md5.hpp create mode 100644 src/TrieManager.hpp diff --git a/src/Limonp/md5.hpp b/src/Limonp/md5.hpp new file mode 100644 index 0000000..3673d57 --- /dev/null +++ b/src/Limonp/md5.hpp @@ -0,0 +1,423 @@ +#ifndef __MD5_H__ +#define __MD5_H__ + +// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +// rights reserved. + +// License to copy and use this software is granted provided that it +// is identified as the "RSA Data Security, Inc. MD5 Message-Digest +// Algorithm" in all material mentioning or referencing this software +// or this function. +// +// License is also granted to make and use derivative works provided +// that such works are identified as "derived from the RSA Data +// Security, Inc. MD5 Message-Digest Algorithm" in all material +// mentioning or referencing the derived work. +// +// RSA Data Security, Inc. makes no representations concerning either +// the merchantability of this software or the suitability of this +// software for any particular purpose. It is provided "as is" +// without express or implied warranty of any kind. +// +// These notices must be retained in any copies of any part of this +// documentation and/or software. + + + +// The original md5 implementation avoids external libraries. +// This version has dependency on stdio.h for file input and +// string.h for memcpy. +#include +#include +#include + +namespace Limonp +{ + +#pragma region MD5 defines +// Constants for MD5Transform routine. +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + + +// F, G, H and I are basic MD5 functions. +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +// ROTATE_LEFT rotates x left n bits. +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#pragma endregion + + +typedef unsigned char BYTE ; + +// POINTER defines a generic pointer type +typedef unsigned char *POINTER; + +// UINT2 defines a two byte word +typedef unsigned short int UINT2; + +// UINT4 defines a four byte word +typedef unsigned long int UINT4; + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +// convenient object that wraps +// the C-functions for use in C++ only +class MD5 +{ +private: + struct __context_t { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ + } context ; + + #pragma region static helper functions + // The core of the MD5 algorithm is here. + // MD5 basic transformation. Transforms state based on block. + static void MD5Transform( UINT4 state[4], unsigned char block[64] ) + { + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + // Zeroize sensitive information. + memset((POINTER)x, 0, sizeof (x)); + } + + // Encodes input (UINT4) into output (unsigned char). Assumes len is + // a multiple of 4. + static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) + { + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } + } + + // Decodes input (unsigned char) into output (UINT4). Assumes len is + // a multiple of 4. + static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) + { + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); + } + #pragma endregion + + +public: + // MAIN FUNCTIONS + MD5() + { + Init() ; + } + + // MD5 initialization. Begins an MD5 operation, writing a new context. + void Init() + { + context.count[0] = context.count[1] = 0; + + // Load magic initialization constants. + context.state[0] = 0x67452301; + context.state[1] = 0xefcdab89; + context.state[2] = 0x98badcfe; + context.state[3] = 0x10325476; + } + + // MD5 block update operation. Continues an MD5 message-digest + // operation, processing another message block, and updating the + // context. + void Update( + unsigned char *input, // input block + unsigned int inputLen ) // length of input block + { + unsigned int i, index, partLen; + + // Compute number of bytes mod 64 + index = (unsigned int)((context.count[0] >> 3) & 0x3F); + + // Update number of bits + if ((context.count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context.count[1]++; + context.count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + // Transform as many times as possible. + if (inputLen >= partLen) { + memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen); + MD5Transform (context.state, context.buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context.state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i); + } + + // MD5 finalization. Ends an MD5 message-digest operation, writing the + // the message digest and zeroizing the context. + // Writes to digestRaw + void Final() + { + unsigned char bits[8]; + unsigned int index, padLen; + + // Save number of bits + Encode( bits, context.count, 8 ); + + // Pad out to 56 mod 64. + index = (unsigned int)((context.count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + Update( PADDING, padLen ); + + // Append length (before padding) + Update( bits, 8 ); + + // Store state in digest + Encode( digestRaw, context.state, 16); + + // Zeroize sensitive information. + memset((POINTER)&context, 0, sizeof (context)); + + writeToString() ; + } + + /// Buffer must be 32+1 (nul) = 33 chars long at least + void writeToString() + { + int pos ; + + for( pos = 0 ; pos < 16 ; pos++ ) + sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ; + } + + +public: + // an MD5 digest is a 16-byte number (32 hex digits) + BYTE digestRaw[ 16 ] ; + + // This version of the digest is actually + // a "printf'd" version of the digest. + char digestChars[ 33 ] ; + + /// Load a file from disk and digest it + // Digests a file and returns the result. + char* digestFile( const char *filename ) + { + Init() ; + + FILE *file; + + int len; + unsigned char buffer[1024] ; + + if(NULL == filename || (file = fopen (filename, "rb")) == NULL ) + { + return ""; + } + else + { + while( len = fread( buffer, 1, 1024, file ) ) + Update( buffer, len ) ; + Final(); + + fclose( file ); + } + + return digestChars ; + } + + /// Digests a byte-array already in memory + char* digestMemory( BYTE *memchunk, int len ) + { + if (NULL == memchunk) + return ""; + + Init() ; + Update( memchunk, len ) ; + Final() ; + + return digestChars ; + } + + // Digests a string and prints the result. + char* digestString(const char *string ) + { + if (string == NULL) + return ""; + + Init() ; + Update( (unsigned char*)string, strlen(string) ) ; + Final() ; + + return digestChars ; + } +}; + +inline bool md5String(const char* str, std::string& res) +{ + if (NULL == str) + { + res = ""; + return false; + } + + MD5 md5; + res = md5.digestString(str); + + if (res == "") + return false; + return true; +} + +inline bool md5File(const char* filepath, std::string& res) +{ + if (NULL == filepath || strcmp(filepath, "") == 0) + { + res = ""; + return false; + } + + MD5 md5; + res = md5.digestFile(filepath); + + if (res == "") + return false; + return true; +} +} +#endif diff --git a/src/TrieManager.hpp b/src/TrieManager.hpp new file mode 100644 index 0000000..612fbc9 --- /dev/null +++ b/src/TrieManager.hpp @@ -0,0 +1,74 @@ +#ifndef CPPJIEBA_TRIEMANAGER_H +#define CPPJIEBA_TRIEMANAGER_H + +#include "Trie.hpp" +#include "Limonp/md5.hpp" +#include "Limonp/logger.hpp" + +namespace CppJieba +{ + using namespace Limonp; + class TrieManager + { + private: + unordered_map _tries; + TrieManager(){}; + TrieManager(TrieManager& tm){}; + public: + Trie* getTrie(const char* dictpath) + { + string md5; + if (!md5File(dictpath, md5)) + { + LogError("error when getting md5 for file '%s'", dictpath); + return NULL; + } + else + { + if (_tries.find(md5.c_str()) == _tries.end()) + { + Trie* trie = NULL; + try + { + trie = new Trie(); + } + catch (const bad_alloc& e) + { + LogError("error when new a trie for file '%s'", dictpath); + return NULL; + } + if (NULL == trie) + return NULL; + + if (!trie->init()) + { + LogError("trie init error for file '%s'", dictpath); + return NULL; + } + + LogInfo("trie->loadDict(%s) start...", dictpath); + if (!trie->loadDict(dictpath)) + { + LogError("trie->loadDict(%s) failed...", dictpath); + return NULL; + } + LogInfo("trie->loadDict end..."); + + _tries[md5.c_str()] = trie; + return trie; + } + else + { + return _tries[md5.c_str()]; + } + } + } + + static TrieManager& getInstance() + { + static TrieManager _this; + return _this; + } + }; +} +#endif From a0f588a8af189e0265f3c2da86ef1536e2175edd Mon Sep 17 00:00:00 2001 From: aholic Date: Mon, 16 Dec 2013 07:01:50 +0800 Subject: [PATCH 5/8] update md5.hpp in limonp | change map type in TrieManager.hpp --- src/Limonp/md5.hpp | 31 ++++++++++++++++++++----------- src/TrieManager.hpp | 7 +++++-- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/Limonp/md5.hpp b/src/Limonp/md5.hpp index 3673d57..b01be20 100644 --- a/src/Limonp/md5.hpp +++ b/src/Limonp/md5.hpp @@ -336,7 +336,7 @@ public: /// Load a file from disk and digest it // Digests a file and returns the result. - char* digestFile( const char *filename ) + const char* digestFile( const char *filename ) { Init() ; @@ -347,7 +347,7 @@ public: if(NULL == filename || (file = fopen (filename, "rb")) == NULL ) { - return ""; + return NULL; } else { @@ -362,10 +362,10 @@ public: } /// Digests a byte-array already in memory - char* digestMemory( BYTE *memchunk, int len ) + const char* digestMemory( BYTE *memchunk, int len ) { if (NULL == memchunk) - return ""; + return NULL; Init() ; Update( memchunk, len ) ; @@ -375,10 +375,10 @@ public: } // Digests a string and prints the result. - char* digestString(const char *string ) + const char* digestString(const char *string ) { if (string == NULL) - return ""; + return NULL; Init() ; Update( (unsigned char*)string, strlen(string) ) ; @@ -397,10 +397,14 @@ inline bool md5String(const char* str, std::string& res) } MD5 md5; - res = md5.digestString(str); - - if (res == "") + const char *pRes = md5.digestString(str); + if (NULL == pRes) + { + res = ""; return false; + } + + res = pRes; return true; } @@ -413,10 +417,15 @@ inline bool md5File(const char* filepath, std::string& res) } MD5 md5; - res = md5.digestFile(filepath); + const char *pRes = md5.digestFile(filepath); - if (res == "") + if (NULL == pRes) + { + res = ""; return false; + } + + res = pRes; return true; } } diff --git a/src/TrieManager.hpp b/src/TrieManager.hpp index 612fbc9..2590c89 100644 --- a/src/TrieManager.hpp +++ b/src/TrieManager.hpp @@ -11,7 +11,7 @@ namespace CppJieba class TrieManager { private: - unordered_map _tries; + unordered_map _tries; TrieManager(){}; TrieManager(TrieManager& tm){}; public: @@ -25,8 +25,10 @@ namespace CppJieba } else { - if (_tries.find(md5.c_str()) == _tries.end()) + LogInfo("md5 for file '%s': %s", dictpath, md5.c_str()); + if (_tries.find(md5) == _tries.end()) { + LogInfo("create a new trie for md5: '%s'", md5.c_str()); Trie* trie = NULL; try { @@ -59,6 +61,7 @@ namespace CppJieba } else { + LogInfo("find a exits trie for md5: '%s'", md5.c_str()); return _tries[md5.c_str()]; } } From 7add684a8aa936d1b9f426032c60666ba0742a28 Mon Sep 17 00:00:00 2001 From: aholic Date: Mon, 16 Dec 2013 14:18:44 +0800 Subject: [PATCH 6/8] change algorithm for QuerySegment(now is mix+full) | use TrieManager to get a trie for all Segment --- src/FullSegment.hpp | 18 ++++++------------ src/MPSegment.hpp | 27 ++++++++------------------- src/QuerySegment.hpp | 31 ++++++++++++++++--------------- 3 files changed, 30 insertions(+), 46 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index f1182fc..d4bfa31 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -9,13 +9,14 @@ #include "ISegment.hpp" #include "SegmentBase.hpp" #include "TransCode.hpp" +#include "TrieManager.hpp" namespace CppJieba { class FullSegment: public SegmentBase { private: - Trie _trie; + Trie* _trie; const string _dictPath; public: @@ -29,18 +30,12 @@ namespace CppJieba LogError("already inited before now."); return false; } - if(!_trie.init()) + _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); + if (NULL == _trie) { - LogError("_trie.init failed."); + LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str()); return false; } - LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); - if(!_trie.loadDict(_dictPath.c_str())) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); return _setInitFlag(true); } bool dispose() @@ -49,7 +44,6 @@ namespace CppJieba { return true; } - _trie.dispose(); _setInitFlag(false); return true; } @@ -81,7 +75,7 @@ namespace CppJieba for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { //find word start from uItr - if (_trie.find(uItr, end, tRes)) + if (_trie->find(uItr, end, tRes)) { for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 14b8f21..b0a7bed 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -10,6 +10,7 @@ #include #include "Limonp/logger.hpp" #include "Trie.hpp" +#include "TrieManager.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" @@ -32,7 +33,7 @@ namespace CppJieba class MPSegment: public SegmentBase { private: - Trie _trie; + Trie* _trie; private: const string _dictPath; @@ -47,18 +48,12 @@ namespace CppJieba LogError("already inited before now."); return false; } - if(!_trie.init()) + _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); + if (_trie == NULL) { - LogError("_trie.init failed."); + LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str()); return false; } - LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); - if(!_trie.loadDict(_dictPath.c_str())) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); return _setInitFlag(true); } virtual bool dispose() @@ -67,18 +62,12 @@ namespace CppJieba { return true; } - _trie.dispose(); _setInitFlag(false); return true; } public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); vector segWordInfos; @@ -145,7 +134,7 @@ namespace CppJieba { SegmentChar schar(*it); uint i = it - begin; - _trie.find(it, end, i, schar.dag); + _trie->find(it, end, i, schar.dag); //DagType::iterator dagIter; if(schar.dag.end() == schar.dag.find(i)) { @@ -183,7 +172,7 @@ namespace CppJieba } else { - val += _trie.getMinLogFreq(); + val += _trie->getMinLogFreq(); } if(val > segContext[i].weight) { @@ -211,7 +200,7 @@ namespace CppJieba TrieNodeInfo nodeInfo; nodeInfo.word.push_back(segContext[i].uniCh); nodeInfo.freq = 0; - nodeInfo.logFreq = _trie.getMinLogFreq(); + nodeInfo.logFreq = _trie->getMinLogFreq(); res.push_back(nodeInfo); i++; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index ae5987e..a57f710 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -8,21 +8,22 @@ #include "Trie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" -#include "HMMSegment.hpp" #include "FullSegment.hpp" +#include "MixSegment.hpp" #include "TransCode.hpp" +#include "TrieManager.hpp" namespace CppJieba { class QuerySegment: public SegmentBase { private: - HMMSegment _hmmSeg; + MixSegment _mixSeg; FullSegment _fullSeg; int _maxWordLen; public: - QuerySegment(const char* fullSegDict, const char* hmmSegDict, int maxWordLen): _hmmSeg(hmmSegDict), _fullSeg(fullSegDict), _maxWordLen(maxWordLen){}; + QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){}; virtual ~QuerySegment(){dispose();}; public: bool init() @@ -31,9 +32,9 @@ namespace CppJieba { LogError("inited."); } - if (!_hmmSeg.init()) + if (!_mixSeg.init()) { - LogError("_hmmSeg init"); + LogError("_mixSeg init"); return false; } if (!_fullSeg.init()) @@ -50,7 +51,7 @@ namespace CppJieba return true; } _fullSeg.dispose(); - _hmmSeg.dispose(); + _mixSeg.dispose(); _setInitFlag(false); return true; } @@ -68,22 +69,22 @@ namespace CppJieba return false; } - //use hmm cut first - vector hmmRes; - if (!_hmmSeg.cut(begin, end, hmmRes)) + //use mix cut first + vector mixRes; + if (!_mixSeg.cut(begin, end, mixRes)) { - LogError("_hmmSeg cut failed."); + LogError("_mixSeg cut failed."); return false; } vector fullRes; - for (vector::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++) + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, cut with _fullSeg, put fullRes in res - if (hmmResItr->size() > _maxWordLen) + if (mixResItr->size() > _maxWordLen) { - if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes)) + if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) { for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { @@ -91,9 +92,9 @@ namespace CppJieba } } } - else // just use the hmm result + else // just use the mix result { - res.push_back(*hmmResItr); + res.push_back(*mixResItr); } } From 82424cc7f5a69e36e1314cb4e20a2bb06bdada01 Mon Sep 17 00:00:00 2001 From: aholic Date: Mon, 16 Dec 2013 14:42:53 +0800 Subject: [PATCH 7/8] add FullSegment QuerySegment TrieManger to README.md --- README.md | 51 +++++++++++++++++++++++++++++++++++++++++++++++-- src/segment.cpp | 11 ++++++----- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a2979f4..8386cee 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,13 @@ Output: ``` 我来到北京清华大学 我来/到/北京/清华大学 + 他来到了网易杭研大厦 他来/到/了/网易/杭/研大厦 + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造 + 我来自北京邮电大学。。。学号091111xx。。。 我来/自北京/邮电大学/。。。/学号/091111xx/。。。 ``` @@ -94,23 +97,60 @@ Output: ``` 我来到北京清华大学 我/来到/北京/清华大学 + 他来到了网易杭研大厦 他/来到/了/网易/杭研/大厦 -杭研 -杭研 + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造 + 我来自北京邮电大学。。。学号091111xx。。。 我/来自/北京邮电大学/。。。/学号/091111xx/。。。 ``` +### FullSegment's demo + +Output: +``` +我来到北京清华大学 +我/来到/北京/清华/清华大学/华大/大学 + +他来到了网易杭研大厦 +他/来到/了/网易/杭/研/大厦 + +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +小/明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造 + +我来自北京邮电大学。。。 学号 091111xx。。。 +我/来自/北京/北京邮电/北京邮电大学/邮电/邮电大学/电大/大学/。/。/。/ /学号/ 091111xx/。/。/。 +``` + +### QuerySegment's demo + +Output: +``` +我来到北京清华大学 +我/来到/北京/清华/清华大学/华大/大学 + +他来到了网易杭研大厦 +他/来到/了/网易/杭研/大厦 + +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/中国/中国科学院/科学/科学院/学院/日本/日本京都大学/京都/京都大学/大学/深造 + +我来自北京邮电大学。。。 学号 091111xx。。。 +我/来自/北京/北京邮电/北京邮电大学/邮电/邮电大学/电大/大学/。/。/。/ /学号/ 091111xx/。/。/。 +``` + ### 效果分析 以上依次是MP,HMM,Mix三种方法的效果。 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。即可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。 +Full方法切出所有字典里的词语。 +Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。 ## 模块详解 @@ -120,6 +160,10 @@ Output: 核心目录,包含主要源代码。 +#### TrieManager模块 +TrieManager.hpp 提供一个单例TrieManager,负责管理trie树。 +通过该单例获取trie树时,会先判断是否已经由该字典文件生成了一颗trie树,如果已有则返回已有的trie树,否则重新创建一颗trie树返回。 + #### Trie树 Trie.hpp 负责载入词典的trie树,主要供Segment模块使用。 @@ -133,6 +177,9 @@ HMMSegment.hpp HMM模型由dicts/下面的`hmm_model.utf8`提供。 分词算法即viterbi算法。 +FullSegment.hpp +枚举句子中所有可能成词的情况,找出字典里存在的即可。 + #### TransCode模块 TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk转换成`uint16_t`类型,也负责逆转换。 diff --git a/src/segment.cpp b/src/segment.cpp index 2eb61f4..37cad63 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -43,12 +43,13 @@ int main(int argc, char ** argv) <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutQuery, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" - <<"\t--maxlen\tspecify the granularity of cut used in cutQuery, If not specified, the default is 3\n" + <<"\t--maxlen\tspecify the granularity of cut used in cutQuery. \n\t\t\tIf not specified, the default is 3\n" <<"example:\n" - <<"\t"< Date: Mon, 16 Dec 2013 16:32:24 +0800 Subject: [PATCH 8/8] update md5.hpp in limonp --- src/Limonp/md5.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Limonp/md5.hpp b/src/Limonp/md5.hpp index b01be20..8b861e1 100644 --- a/src/Limonp/md5.hpp +++ b/src/Limonp/md5.hpp @@ -338,6 +338,9 @@ public: // Digests a file and returns the result. const char* digestFile( const char *filename ) { + if (NULL == filename || strcmp(filename, "") == 0) + return NULL; + Init() ; FILE *file; @@ -345,7 +348,7 @@ public: int len; unsigned char buffer[1024] ; - if(NULL == filename || (file = fopen (filename, "rb")) == NULL ) + if((file = fopen (filename, "rb")) == NULL) { return NULL; }