big reconstruction: replace string word with Unicode in structs.h and others

2025-07-18 00:00:12 +08:00 · 2013-09-09 14:22:25 +08:00 · 2013-09-09 14:22:25 +08:00 · e8a98d4e4d
commit e8a98d4e4d
parent 70f12f2c97
11 changed files with 338 additions and 280 deletions
--- a/demo/keywordext_demo.cpp
+++ b/demo/keywordext_demo.cpp
@ -8,7 +8,7 @@ using namespace CppJieba;
 void testKeyWordExt(const char * dictPath, const char * filePath)
 {
 	KeyWordExt ext;
-	if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0"))
+	if(!ext.init(dictPath))
 	{
 		return;
 	}
--- a/demo/segment_demo.cpp
+++ b/demo/segment_demo.cpp
@ -32,7 +32,7 @@ void cut(const char * const filePath)
 	{
 		if(!line.empty())
 		{
-			seg.cutDAG(line, res);
+			seg.cut(line, res);
 			cout<<line<<"\n"<<joinStr(res,"/")<<endl;
 		}
 	}
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
 	string line;
 	while(getline(ifs, line))
 	{
-		seg.cutDAG(line, res);
+		seg.cut(line, res);
 	}
 }
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@ -16,7 +16,7 @@ namespace CppJieba
 	{
 	}
-	bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile)
+	bool KeyWordExt::init(const char* const segDictFile)
 	{
 		LogInfo("KeyWordExt init start ...");
 		if(!_segment.init(segDictFile))
@ -24,40 +24,34 @@ namespace CppJieba
 			LogError("_segment.init failed.");
 			return false;
 		}
 		if(!_loadStopWords(stopWordDictFile))
 		{
 			LogError("_loadStopWords failed.");
 			return false;
 		}
 		LogInfo("KeyWordExt init OK.");
 		return true;
 	}
-	bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
+	//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
-	{
+	//{
-		LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
+	//	LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
-		if(!checkFileExist(filePath))
+	//	if(!checkFileExist(filePath))
-		{
+	//	{
-			LogError(string_format("cann't find file[%s].",filePath));
+	//		LogError(string_format("cann't find file[%s].",filePath));
-			return false;
+	//		return false;
-		}
+	//	}
-		if(!_priorSubWords.empty())
+	//	if(!_priorSubWords.empty())
-		{
+	//	{
-			LogError("_priorSubWords has been initted before");
+	//		LogError("_priorSubWords has been initted before");
-			return false;
+	//		return false;
-		}
+	//	}
-		ifstream infile(filePath);
+	//	ifstream infile(filePath);
-		string subword;
+	//	string subword;
-		while(getline(infile, subword))
+	//	while(getline(infile, subword))
-		{
+	//	{
-			_priorSubWords.push_back(subword);
+	//		_priorSubWords.push_back(subword);
-		}
+	//	}
-		LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
+	//	LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
-		infile.close();
+	//	infile.close();
-		return true;
+	//	return true;
-	}
+	//}
-	bool KeyWordExt::_loadStopWords(const char * const filePath)
+	bool KeyWordExt::loadStopWords(const char * const filePath)
 	{
 		LogInfo(string_format("_loadStopWords(%s) start", filePath));
@ -74,9 +68,15 @@ namespace CppJieba
 		ifstream ifile(filePath);
 		string line;
        Unicode word;
 		while(getline(ifile, line))
 		{
-			_stopWords.insert(line);
+            if(!TransCode::strToVec(line, word))
            {
                LogError("strToVec failed .");
                return false;
            }
 			_stopWords.insert(word);
 		}
 		LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
@ -100,12 +100,7 @@ namespace CppJieba
 		{
 			KeyWordInfo& wInfo = wordInfos[i];
 			wInfo.idf = - wInfo.logFreq;
-			if(0 == wInfo.wLen)
+			wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
 			{
 				LogFatal("wLen is 0!");
 				return false;
 			}
 			wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
 		}
 		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
 		return true;
@ -143,14 +138,16 @@ namespace CppJieba
 			return false;
 		}
 #ifdef DEBU
 		LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
 #endif
 		keyWordInfos.clear();
 		for(uint i = 0; i < words.size(); i++)
 		{
-			keyWordInfos.push_back(words[i]);
+            Unicode uniWord;
            if(!TransCode::strToVec(words[i], uniWord))
            {
                LogError("strToVec failed");
                return false;
            }
 			keyWordInfos.push_back(uniWord);
 		}
 		return _extract(keyWordInfos, topN);
@ -164,7 +161,7 @@ namespace CppJieba
 		}
 		vector<TrieNodeInfo> trieNodeInfos; 
-		_segment.cutDAG(title, trieNodeInfos);
+		_segment.cut(title, trieNodeInfos);
 		keyWordInfos.clear();
 		for(uint i = 0; i < trieNodeInfos.size(); i++)
@ -249,7 +246,7 @@ namespace CppJieba
 	bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
 	{
-		set<string> st;
+		set<Unicode> st;
 		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
 		{
 			if(st.find(it->word) != st.end())
@ -271,7 +268,7 @@ namespace CppJieba
 		{
 			// filter single word
-			if(1 == it->wLen)
+			if(1 == it->word.size())
 			{
 				it = wordInfos.erase(it);
 			}
@ -285,79 +282,68 @@ namespace CppJieba
 	bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
 	{
-		vector<string> tmp ;
+		vector<Unicode> tmp ;
 		for(uint i = 0; i < wordInfos.size(); i++)
 		{
 			tmp.push_back(wordInfos[i].word);
 		}
 		set<string> subs;
 		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
 		{
 			for(uint j = 0; j < tmp.size(); j++)
 			{
 				if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
 				{
 					subs.insert(it->word);
 				}
 			}
 		}
 		//erase subs from strs
 		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
 		{
-			if(subs.end() != subs.find(it->word))
+            if(_isSubIn(tmp, it->word))
-			{
+            {
-				it =  wordInfos.erase(it);
+                it = wordInfos.erase(it);
-			}
+            }
-			else
+            else
-			{
+            {
-				it ++;
+                it++;
-			}
+            }
 		}
 		return true;
 	}
-	bool KeyWordExt::_isContainSubWords(const string& word)
+	//bool KeyWordExt::_isContainSubWords(const string& word)
-	{
+	//{
-		for(uint i = 0; i < _priorSubWords.size(); i++)
+	//	for(uint i = 0; i < _priorSubWords.size(); i++)
-		{
+	//	{
-			if(string::npos != word.find(_priorSubWords[i]))
+	//		if(string::npos != word.find(_priorSubWords[i]))
-			{
+	//		{
-				return true;
+	//			return true;
-			}
+	//		}
-		}
+	//	}
-		return false;
+	//	return false;
-	}
+	//}
-	bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
+	//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
-	{
+	//{
-		if(2 > wordInfos.size())
+	//	if(2 > wordInfos.size())
-		{
+	//	{
-			return true;
+	//		return true;
-		}
+	//	}
-		KeyWordInfo prior;
+	//	KeyWordInfo prior;
-		bool flag = false;
+	//	bool flag = false;
-		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
+	//	for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
-		{
+	//	{
-			if(_isContainSubWords(it->word))
+	//		if(_isContainSubWords(it->word))
-			{
+	//		{
-				prior = *it;
+	//			prior = *it;
-				it = wordInfos.erase(it);
+	//			it = wordInfos.erase(it);
-				flag = true;
+	//			flag = true;
-				break;
+	//			break;
-			}
+	//		}
-			else
+	//		else
-			{
+	//		{
-				it ++;
+	//			it ++;
-			}
+	//		}
-		}
+	//	}
-		if(flag)
+	//	if(flag)
-		{
+	//	{
-			wordInfos.insert(wordInfos.begin(), prior);
+	//		wordInfos.insert(wordInfos.begin(), prior);
-		}
+	//	}
-		return true;
+	//	return true;
-	}
+	//}
 }
@ -375,12 +361,6 @@ int main()
 	}
 	ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
 	if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
 	{
 		cerr<<"err"<<endl;
 		return 1;
 	}
 	ifstream ifile("testtitle.gbk");
 	vector<string> res;
 	string line;
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@ -1,7 +1,7 @@
 /************************************
 * file enc : ASCII
 * author   : wuyanyi09@gmail.com
-************************************/
+ ************************************/
 #ifndef CPPJIEBA_KEYWORDEXT_H
 #define CPPJIEBA_KEYWORDEXT_H
@ -11,45 +11,56 @@
 namespace CppJieba
 {
-	class KeyWordExt
+    class KeyWordExt
-	{
+    {
-		private:
+        private:
-			MPSegment _segment;
+            MPSegment _segment;
-			vector<string> _priorSubWords;
+            //vector<string> _priorSubWords;
-			set<string> _stopWords;
+            set<Unicode> _stopWords;
-		public:
+        public:
-			KeyWordExt();
+            KeyWordExt();
-			~KeyWordExt();
+            ~KeyWordExt();
-			bool init(const char* const segDictFile, const char* const stopWordDictFile);
+            bool init(const char* const segDictFile);
-			bool dispose();
+            bool dispose();
-
+            bool loadStopWords(const char * const filePath);
-		private:
+        private:
-			bool _loadStopWords(const char * const filePath);
+            //bool _loadPriorSubWords(const char * const filePath);
 			bool _loadPriorSubWords(const char * const filePath);
-		public:
+        public:
-			bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
+            bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
-			bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
+            bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
-		private:
+        private:
-			static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
+            static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
-		private:
+        private:
-			bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
+            bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
-			bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
+            bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
-		private:
+        private:
-			//sort by word len - idf
+            //sort by word len - idf
-			bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
+            bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
-		private:
+        private:
-			bool _filter(vector<KeyWordInfo>& );
+            bool _filter(vector<KeyWordInfo>& );
-			bool _filterDuplicate(vector<KeyWordInfo>& );
+            bool _filterDuplicate(vector<KeyWordInfo>& );
-			bool _filterSingleWord(vector<KeyWordInfo>& );
+            bool _filterSingleWord(vector<KeyWordInfo>& );
-			bool _filterSubstr(vector<KeyWordInfo>& );
+            bool _filterSubstr(vector<KeyWordInfo>& );
-			bool _filterStopWords(vector<KeyWordInfo>& );
+            bool _filterStopWords(vector<KeyWordInfo>& );
-		private:
+        private:
-			bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
+            inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
-			bool _isContainSubWords(const string& word);
+            {
-	};
+                for(uint j = 0; j < words.size(); j++)
                {
                    if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
                    {
                        return true;
                    }
                }
                return false;
            }
            //bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
            //bool _isContainSubWords(const string& word);
    };
 }
--- a/src/MPSegment.cpp
+++ b/src/MPSegment.cpp
@ -36,22 +36,22 @@ namespace CppJieba
 		return _trie.dispose();
 	}
-	bool MPSegment::cutDAG(const string& str, vector<string>& res)
+	bool MPSegment::cut(const string& str, vector<string>& res)
 	{
 		vector<TrieNodeInfo> segWordInfos;
-		if(!cutDAG(str, segWordInfos))
+		if(!cut(str, segWordInfos))
 		{
 			return false;
 		}
 		res.clear();
 		for(uint i = 0; i < segWordInfos.size(); i++)
 		{
-			res.push_back(segWordInfos[i].word);
+			res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
 		}
 		return true;
 	}
-	bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
+	bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
 	{
 		if(str.empty())
 		{
@ -59,13 +59,19 @@ namespace CppJieba
 		}
 		segWordInfos.clear();
 		SegmentContext segContext;
        Unicode sentence;
-		if(!TransCode::strToVec(str, segContext.uintVec))
+		if(!TransCode::strToVec(str, sentence))
 		{
 			LogError("TransCode::strToVec failed.");
 			return false;
 		}
        for(uint i = 0; i < sentence.size(); i++)
        {
            segContext.push_back(SegmentChar(sentence[i]));
        }
 		//calc DAG
 		if(!_calcDAG(segContext))
 		{
@ -79,9 +85,9 @@ namespace CppJieba
 			return false;
 		}
-		if(!_cutDAG(segContext, segWordInfos))
+		if(!_cut(segContext, segWordInfos))
 		{
-			LogError("_cutDAG failed.");
+			LogError("_cut failed.");
 			return false;
 		}
@ -90,111 +96,150 @@ namespace CppJieba
 	bool MPSegment::_calcDAG(SegmentContext& segContext)
 	{
-		if(segContext.uintVec.empty())
+		if(segContext.empty())
 		{
            LogError("segContext empty.");
 			return false;
 		}
-		vector<pair<uint, const TrieNodeInfo*> > vec;
+
-		Unicode::const_iterator beginIter = segContext.uintVec.begin();
+        Unicode unicode;
-		for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
+        for(uint i = 0; i < segContext.size(); i++)
-		{
+        {
-			vec.clear();
+            unicode.clear();
-			vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
+            for(uint j = i ; j < segContext.size(); j++)
-			for(Unicode::const_iterator iterJ = iterI + 1;  iterJ != segContext.uintVec.end(); iterJ++)
+            {
-			{
+                unicode.push_back(segContext[j].uniCh);
-				//care: the iterJ exceed iterEnd
+                const TrieNodeInfo* pInfo = _trie.find(unicode);
-				const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
+                if(pInfo)
-				if(NULL != ptNodeInfo)
+                {
-				{
+                    segContext[i].dag[j] = pInfo;
-					vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
+                }
-				}
+            }
-			}
+            if(segContext[i].dag.end() == segContext[i].dag.find(i))
-			segContext.dag.push_back(vec);
+            {
-		}
+                segContext[i].dag[i] = NULL;
-		return true;
+            }
        }
        return true;
 		//vector<pair<uint, const TrieNodeInfo*> > vec;
 		//Unicode::const_iterator beginIter = segContext.uintVec.begin();
 		//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
 		//{
 		//	vec.clear();
 		//	vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
 		//	for(Unicode::const_iterator iterJ = iterI + 1;  iterJ != segContext.uintVec.end(); iterJ++)
 		//	{
 		//		//care: the iterJ exceed iterEnd
 		//		const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
 		//		if(NULL != ptNodeInfo)
 		//		{
 		//			vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
 		//		}
 		//	}
 		//	segContext.dag.push_back(vec);
 		//}
 		//return true;
 	}
 	bool MPSegment::_calcDP(SegmentContext& segContext)
 	{
-		if(segContext.uintVec.empty())
+		if(segContext.empty())
 		{
-			LogError("uintVec illegal");
+			LogError("segContext empty");
 			return false;
 		}
-		if(segContext.uintVec.size() != segContext.dag.size())
+        for(int i = segContext.size() - 1; i >= 0; i--)
-		{
+        {
-			LogError("dag is illegal!");
+            segContext[i].pInfo = NULL;
-			return false;
+            segContext[i].weight = MIN_DOUBLE;
-		}
+            for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
            {
                uint nextPos = it->first;
                const TrieNodeInfo* p = it->second;
                double val = 0.0;
                if(nextPos + 1 < segContext.size())
                {
                    val += segContext[nextPos + 1].weight;
                }
-		segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
+                if(p)
-		segContext.dp[segContext.uintVec.size()].first = NULL;
+                {
-		segContext.dp[segContext.uintVec.size()].second = 0.0;
+					val += p->logFreq; 
-
+                }
-		for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
+                else
-		{
+                {
 			// calc max
 			segContext.dp[i].first = NULL;
 			segContext.dp[i].second = MIN_DOUBLE;
 			for(uint j = 0; j < segContext.dag[i].size(); j++)
 			{
 				const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
 				int pos = p.first;
 				double val = segContext.dp[pos+1].second;
 				if(NULL != p.second)
 				{
 					val += (p.second)->logFreq; 
 				}
 				else
 				{
 				    val += _trie.getMinLogFreq();
-				}
+                }
-
+				if(val > segContext[i].weight)
 				if(val > segContext.dp[i].second)
 				{
-					segContext.dp[i].first = p.second;
+					segContext[i].pInfo = p;
-					segContext.dp[i].second = val;
+					segContext[i].weight = val;
 				}
-			}
+            }
-		}
+        }
-		segContext.dp.pop_back();
+        return true;
-		return true;
+
 		//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
 		//segContext.dp[segContext.uintVec.size()].first = NULL;
 		//segContext.dp[segContext.uintVec.size()].second = 0.0;
 		//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
 		//{
 		//	// calc max
 		//	segContext.dp[i].first = NULL;
 		//	segContext.dp[i].second = MIN_DOUBLE;
 		//	for(uint j = 0; j < segContext.dag[i].size(); j++)
 		//	{
 		//		const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
 		//		int pos = p.first;
 		//		double val = segContext.dp[pos+1].second;
 		//		if(NULL != p.second)
 		//		{
 		//			val += (p.second)->logFreq; 
 		//		}
 		//		else
 		//		{
 		//		    val += _trie.getMinLogFreq();
 		//		}
 		//		if(val > segContext.dp[i].second)
 		//		{
 		//			segContext.dp[i].first = p.second;
 		//			segContext.dp[i].second = val;
 		//		}
 		//	}
 		//}
 		//segContext.dp.pop_back();
 		//return true;
 	}
-	bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
+	bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
 	{
-		if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
+		//if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
-		{
+		//{
-			LogError("dp or uintVec illegal!");
+		//	LogFatal("dp or uintVec illegal!");
-			return false;
+		//	return false;
-		}
+		//}
 		res.clear();
 		Unicode::const_iterator iterBegin = segContext.uintVec.begin();
 		uint i = 0;
-		while(i < segContext.dp.size())
+		while(i < segContext.size())
 		{
-			const TrieNodeInfo* p = segContext.dp[i].first;
+			const TrieNodeInfo* p = segContext[i].pInfo;
-			if(NULL == p)
+			if(p)
 			{
 				res.push_back(*p);
 				i += p->word.size();
 			}
 			else//single chinese word
 			{
 				TrieNodeInfo nodeInfo;
-				nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1);
+				nodeInfo.word.push_back(segContext[i].uniCh);
 				nodeInfo.wLen = 1;
 				nodeInfo.freq = 0;
 				nodeInfo.logFreq = _trie.getMinLogFreq();
 				res.push_back(nodeInfo);
-				i ++;
+				i++;
 			}
 			else
 			{
 				res.push_back(*p);
 				if(0 == p->wLen)
 				{
 					LogFatal("TrieNodeInfo's wLen is 0!");
 					return false;
 				}
 				i += p->wLen;
 			}
 		}
 		return true;
@ -223,7 +268,7 @@ int main()
 	while(getline(ifile, line))
 	{
 		res.clear();
-		segment.cutDAG(line, res);
+		segment.cut(line, res);
 		PRINT_VECTOR(res);
 		getchar();
 	}
--- a/src/MPSegment.h
+++ b/src/MPSegment.h
@ -13,6 +13,9 @@
 namespace CppJieba
 {
    typedef vector<SegmentChar> SegmentContext;
 	class MPSegment
 	{
 		private:
@ -25,15 +28,14 @@ namespace CppJieba
 			bool init(const char* const filePath);
 			bool dispose();
 		public:
-			bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos);
+			bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
-			bool cutDAG(const string& str, vector<string>& res);
+			bool cut(const string& str, vector<string>& res);
 		private:
 			bool _calcDAG(SegmentContext& segContext);
 			bool _calcDP(SegmentContext& segContext);
-			bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res);
+			bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);
 			//bool _fill(const string& )
 	};
 }
--- a/src/MixSegment.cpp
+++ b/src/MixSegment.cpp
@ -35,12 +35,12 @@ namespace CppJieba
    bool MixSegment::cut(const string& str, vector<string>& res)
    {
        vector<TrieNodeInfo> infos;
-        if(!_mpSeg.cutDAG(str, infos))
+        if(!_mpSeg.cut(str, infos))
        {
            LogError("_mpSeg cutDAG failed.");
            return false;
        }
-        for(uint = 0; i < infos.size(); i++)
+        for(uint i= 0; i < infos.size(); i++)
        {
        }
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@ -109,9 +109,11 @@ namespace CppJieba
                LogError(string_format("line[%s] illegal.", line.c_str()));
                return false;
            }
-			nodeInfo.word = vecBuf[0];
+			if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
            {
                return false;
            }
 			nodeInfo.freq = atoi(vecBuf[1].c_str());
 			nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
 			if(3 == vecBuf.size())
 			{
 				nodeInfo.tag = vecBuf[2];
@ -193,7 +195,7 @@ namespace CppJieba
 		return res;
 	}
-	const TrieNodeInfo* Trie::find(const string& str)
+	TrieNodeInfo* Trie::find(const string& str)
 	{
 		Unicode uintVec;
 		bool retFlag = TransCode::strToVec(str, uintVec);
@ -204,7 +206,7 @@ namespace CppJieba
 		return find(uintVec);
 	}
-	const TrieNodeInfo* Trie::find(const Unicode& uintVec)
+	TrieNodeInfo* Trie::find(const Unicode& uintVec)
 	{
 		if(uintVec.empty())
 		{
@ -213,7 +215,7 @@ namespace CppJieba
 		return find(uintVec.begin(), uintVec.end());
 	}
-	const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
+	TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
 	{
 		if(!_getInitFlag())
@ -324,16 +326,8 @@ namespace CppJieba
 			return false;
 		}
 		const string& word = nodeInfo.word;
 		Unicode uintVec;
 		bool retFlag = TransCode::strToVec(word, uintVec);
 		if(!retFlag)
 		{
 			LogError("TransCode::strToVec error.");
 			return false;
 		}
 		const Unicode& uintVec = nodeInfo.word;
        TrieNode* p = _root;
        for(uint i = 0; i < uintVec.size(); i++)
        {
--- a/src/Trie.h
+++ b/src/Trie.h
@ -66,9 +66,9 @@ namespace CppJieba
 			bool _getInitFlag();
 		public:
-			const TrieNodeInfo* find(const string& str);
+			TrieNodeInfo* find(const string& str);
-			const TrieNodeInfo* find(const Unicode& uintVec);
+			TrieNodeInfo* find(const Unicode& uintVec);
-			const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
+			TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
 			const TrieNodeInfo* findPrefix(const string& str);
 		public:
--- a/src/globals.h
+++ b/src/globals.h
@ -27,7 +27,6 @@ namespace CppJieba
 	typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
 	typedef unordered_map<uint16_t, double> EmitProbMap;
 	const double MIN_DOUBLE = -3.14e+100;
 	const double MAX_DOUBLE = 3.14e+100;
 }
--- a/src/structs.h
+++ b/src/structs.h
@ -4,35 +4,63 @@
 #include <limits>
 #include "globals.h"
 #include "Trie.h"
 #include "TransCode.h"
 namespace CppJieba
 {
 	struct TrieNodeInfo
 	{
-		string word;
+		//string word;
-		size_t wLen;// the word's len , not string.length(), 
+		//size_t wLen;// the word's len , not string.length(), 
        Unicode word;
 		size_t freq;
 		string tag;
 		double logFreq; //logFreq = log(freq/sum(freq));
-		TrieNodeInfo():wLen(0),freq(0),logFreq(0.0)
+		TrieNodeInfo():freq(0),logFreq(0.0)
 		{
 		}
-		TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
+		TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
 		{
 		}
-		TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
+		TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
 		{
 			wLen = TransCode::getWordLength(_word);
 		}
 	};
-	struct SegmentContext//: public TrieNodeInfo
+    typedef unordered_map<uint, const TrieNodeInfo*> DagType;
-	{
+    struct SegmentChar 
-		vector<uint16_t> uintVec;
+    {
-		vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
+        uint16_t uniCh;
-		vector< pair<const TrieNodeInfo*, double> > dp;
+        DagType dag;
-	};
+        const TrieNodeInfo * pInfo;
        double weight;
        SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
        {
        }
        /*const TrieNodeInfo* pInfo;
        double weight;
        SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
        {
        }*/
    };
    /*
    struct SegmentContext
    {
        vector<SegmentChar> context;
        bool getDA
    };*/
    typedef vector<SegmentChar> SegmentContext;
 	//struct SegmentContext
 	//{
    //    vector<SegmentChar> context;
 	//	//vector<uint16_t> uintVec;
 	//	//vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
 	//	//vector< pair<const TrieNodeInfo*, double> > dp;
 	//};
 	/*
 	struct SegmentWordInfo: public TrieNodeInfo
@ -48,7 +76,7 @@ namespace CppJieba
 		KeyWordInfo():idf(0.0),weight(0.0)
 		{
 		}
-		KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
+		KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
 		{ 
 		}
 		KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
@ -56,13 +84,12 @@ namespace CppJieba
 		}
 		string toString() const
 		{
-			return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
+			return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
 		}
 		KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
 		{
 			word = trieNodeInfo.word;
 			freq = trieNodeInfo.freq;
 			wLen = trieNodeInfo.wLen;
 			tag = trieNodeInfo.tag;
 			logFreq = trieNodeInfo.logFreq;
 			return *this;