Replace the prior-word prefix Trie with a list of prior sub-words matched by substring

This commit is contained in:
gwdwyy 2013-07-18 14:53:27 +08:00
parent 089a63bf2c
commit 03581fcabe
4 changed files with 36 additions and 20 deletions

View File

@ -26,21 +26,27 @@ namespace CppJieba
} }
bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath) bool KeyWordExt::loadPriorSubWord(const char * const filePath)
{ {
LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath)); LogInfo(string_format("loadPriorSubWord(%s) start", filePath));
if(!checkFileExist(filePath)) if(!checkFileExist(filePath))
{ {
LogError(string_format("cann't find file[%s].",filePath)); LogError(string_format("cann't find file[%s].",filePath));
return false; return false;
} }
bool retFlag = _priorPrefixTrie.init(filePath); if(!_priorSubWords.empty())
if(!retFlag)
{ {
LogError("_priorPrefixTrie.init return false."); LogError("_priorSubWords has been initted before");
return false; return false;
} }
LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath)); ifstream infile(filePath);
string subword;
while(getline(infile, subword))
{
_priorSubWords.push_back(subword);
}
LogInfo(string_format("loadPriorSubWord(%s) end", filePath));
infile.close();
return true; return true;
} }
@ -73,7 +79,6 @@ namespace CppJieba
bool KeyWordExt::destroy() bool KeyWordExt::destroy()
{ {
_segment.destroy(); _segment.destroy();
_priorPrefixTrie.destroy();
return true; return true;
} }
@ -132,8 +137,8 @@ namespace CppJieba
_sortWLIDF(wordInfos); _sortWLIDF(wordInfos);
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str())); LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
_priorWordPrefixes(wordInfos); _prioritizeSubWords(wordInfos);
LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str())); LogDebug(string_format("_prioritizeSubWords res:\n%s", joinWordInfos(wordInfos).c_str()));
//extract TopN //extract TopN
for(uint i = 0; i < topN && i < wordInfos.size(); i++) for(uint i = 0; i < topN && i < wordInfos.size(); i++)
{ {
@ -308,7 +313,19 @@ namespace CppJieba
return true; return true;
} }
bool KeyWordExt::_priorWordPrefixes(vector<WordInfo>& wordInfos) bool KeyWordExt::_isContainSubWords(const string& word)
{
for(uint i = 0; i < _priorSubWords.size(); i++)
{
if(string::npos != word.find(_priorSubWords[i]))
{
return true;
}
}
return false;
}
bool KeyWordExt::_prioritizeSubWords(vector<WordInfo>& wordInfos)
{ {
if(2 > wordInfos.size()) if(2 > wordInfos.size())
{ {
@ -319,7 +336,7 @@ namespace CppJieba
bool flag = false; bool flag = false;
for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ) for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ {
if(NULL != _priorPrefixTrie.findPrefix(it->word)) if(_isContainSubWords(it->word))
{ {
prior = *it; prior = *it;
it = wordInfos.erase(it); it = wordInfos.erase(it);
@ -353,7 +370,7 @@ int main()
} }
ext.loadStopWords("stopwords.tmp"); ext.loadStopWords("stopwords.tmp");
if(!ext.loadPriorWordPrefixes("prior.utf8")) if(!ext.loadPriorSubWord("prior.utf8"))
{ {
cerr<<"err"<<endl; cerr<<"err"<<endl;
return 1; return 1;
@ -375,7 +392,7 @@ int main()
*/ */
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装"; title = "2013夏季新款韩版女装甜美雪纺长裙连衣裙碎花裙蕾丝无袖连衣裙子";
res.clear(); res.clear();
ext.extract(title, res, 5); ext.extract(title, res, 5);
PRINT_VECTOR(res); PRINT_VECTOR(res);

View File

@ -31,7 +31,7 @@ namespace CppJieba
for(uint i = 0; i < vec.size(); i++) for(uint i = 0; i < vec.size(); i++)
{ {
tmp.push_back(vec[i].getInfoStr()); tmp.push_back(vec[i].getInfoStr());
}; }
return joinStr(tmp, ","); return joinStr(tmp, ",");
} }
@ -39,7 +39,7 @@ namespace CppJieba
{ {
private: private:
Segment _segment; Segment _segment;
Trie _priorPrefixTrie; vector<string> _priorSubWords;
set<string> _stopWords; set<string> _stopWords;
public: public:
KeyWordExt(); KeyWordExt();
@ -50,7 +50,7 @@ namespace CppJieba
bool loadStopWords(const char * const filePath); bool loadStopWords(const char * const filePath);
//load prior words' prefix //load prior words' prefix
bool loadPriorWordPrefixes( const char * const filePath); bool loadPriorSubWord( const char * const filePath);
bool destroy(); bool destroy();
@ -70,7 +70,8 @@ namespace CppJieba
bool _filterSubstr(vector<string>& utf8Strs); bool _filterSubstr(vector<string>& utf8Strs);
bool _filterStopWords(vector<string>& utf8Strs); bool _filterStopWords(vector<string>& utf8Strs);
private: private:
bool _priorWordPrefixes(vector<WordInfo>& wordInfos); bool _prioritizeSubWords(vector<WordInfo>& wordInfos);
bool _isContainSubWords(const string& word);
}; };

View File

@ -31,7 +31,7 @@ namespace CppJieba
{ {
string word;// utf8 string word string word;// utf8 string word
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 . size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
unsigned int count; size_t count;
string tag; string tag;
double weight; double weight;
TrieNodeInfo() TrieNodeInfo()

View File

@ -7,8 +7,6 @@
namespace CppJieba namespace CppJieba
{ {
//file path
//const char * const DICT_FILE_PATH = "dict.txt";
//typedefs //typedefs
typedef uint16_t ChUnicode; typedef uint16_t ChUnicode;