mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
change the priorSubWord
This commit is contained in:
parent
089a63bf2c
commit
03581fcabe
@ -26,21 +26,27 @@ namespace CppJieba
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath)
|
bool KeyWordExt::loadPriorSubWord(const char * const filePath)
|
||||||
{
|
{
|
||||||
LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath));
|
LogInfo(string_format("loadPriorSubWord(%s) start", filePath));
|
||||||
if(!checkFileExist(filePath))
|
if(!checkFileExist(filePath))
|
||||||
{
|
{
|
||||||
LogError(string_format("cann't find file[%s].",filePath));
|
LogError(string_format("cann't find file[%s].",filePath));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool retFlag = _priorPrefixTrie.init(filePath);
|
if(!_priorSubWords.empty())
|
||||||
if(!retFlag)
|
|
||||||
{
|
{
|
||||||
LogError("_priorPrefixTrie.init return false.");
|
LogError("_priorSubWords has been initted before");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath));
|
ifstream infile(filePath);
|
||||||
|
string subword;
|
||||||
|
while(getline(infile, subword))
|
||||||
|
{
|
||||||
|
_priorSubWords.push_back(subword);
|
||||||
|
}
|
||||||
|
LogInfo(string_format("loadPriorSubWord(%s) end", filePath));
|
||||||
|
infile.close();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -73,7 +79,6 @@ namespace CppJieba
|
|||||||
bool KeyWordExt::destroy()
|
bool KeyWordExt::destroy()
|
||||||
{
|
{
|
||||||
_segment.destroy();
|
_segment.destroy();
|
||||||
_priorPrefixTrie.destroy();
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,8 +137,8 @@ namespace CppJieba
|
|||||||
_sortWLIDF(wordInfos);
|
_sortWLIDF(wordInfos);
|
||||||
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
||||||
|
|
||||||
_priorWordPrefixes(wordInfos);
|
_prioritizeSubWords(wordInfos);
|
||||||
LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str()));
|
LogDebug(string_format("_prioritizeSubWords res:\n%s", joinWordInfos(wordInfos).c_str()));
|
||||||
//extract TopN
|
//extract TopN
|
||||||
for(uint i = 0; i < topN && i < wordInfos.size(); i++)
|
for(uint i = 0; i < topN && i < wordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
@ -308,7 +313,19 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KeyWordExt::_priorWordPrefixes(vector<WordInfo>& wordInfos)
|
bool KeyWordExt::_isContainSubWords(const string& word)
|
||||||
|
{
|
||||||
|
for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||||
|
{
|
||||||
|
if(string::npos != word.find(_priorSubWords[i]))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool KeyWordExt::_prioritizeSubWords(vector<WordInfo>& wordInfos)
|
||||||
{
|
{
|
||||||
if(2 > wordInfos.size())
|
if(2 > wordInfos.size())
|
||||||
{
|
{
|
||||||
@ -319,7 +336,7 @@ namespace CppJieba
|
|||||||
bool flag = false;
|
bool flag = false;
|
||||||
for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||||
{
|
{
|
||||||
if(NULL != _priorPrefixTrie.findPrefix(it->word))
|
if(_isContainSubWords(it->word))
|
||||||
{
|
{
|
||||||
prior = *it;
|
prior = *it;
|
||||||
it = wordInfos.erase(it);
|
it = wordInfos.erase(it);
|
||||||
@ -353,7 +370,7 @@ int main()
|
|||||||
}
|
}
|
||||||
ext.loadStopWords("stopwords.tmp");
|
ext.loadStopWords("stopwords.tmp");
|
||||||
|
|
||||||
if(!ext.loadPriorWordPrefixes("prior.utf8"))
|
if(!ext.loadPriorSubWord("prior.utf8"))
|
||||||
{
|
{
|
||||||
cerr<<"err"<<endl;
|
cerr<<"err"<<endl;
|
||||||
return 1;
|
return 1;
|
||||||
@ -375,7 +392,7 @@ int main()
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装";
|
title = "2013夏季新款韩版女装甜美雪纺长裙连衣裙碎花裙蕾丝无袖连衣裙子";
|
||||||
res.clear();
|
res.clear();
|
||||||
ext.extract(title, res, 5);
|
ext.extract(title, res, 5);
|
||||||
PRINT_VECTOR(res);
|
PRINT_VECTOR(res);
|
||||||
|
@ -31,7 +31,7 @@ namespace CppJieba
|
|||||||
for(uint i = 0; i < vec.size(); i++)
|
for(uint i = 0; i < vec.size(); i++)
|
||||||
{
|
{
|
||||||
tmp.push_back(vec[i].getInfoStr());
|
tmp.push_back(vec[i].getInfoStr());
|
||||||
};
|
}
|
||||||
return joinStr(tmp, ",");
|
return joinStr(tmp, ",");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -39,7 +39,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
Segment _segment;
|
Segment _segment;
|
||||||
Trie _priorPrefixTrie;
|
vector<string> _priorSubWords;
|
||||||
set<string> _stopWords;
|
set<string> _stopWords;
|
||||||
public:
|
public:
|
||||||
KeyWordExt();
|
KeyWordExt();
|
||||||
@ -50,7 +50,7 @@ namespace CppJieba
|
|||||||
bool loadStopWords(const char * const filePath);
|
bool loadStopWords(const char * const filePath);
|
||||||
|
|
||||||
//load prior words' prefix
|
//load prior words' prefix
|
||||||
bool loadPriorWordPrefixes( const char * const filePath);
|
bool loadPriorSubWord( const char * const filePath);
|
||||||
|
|
||||||
bool destroy();
|
bool destroy();
|
||||||
|
|
||||||
@ -70,7 +70,8 @@ namespace CppJieba
|
|||||||
bool _filterSubstr(vector<string>& utf8Strs);
|
bool _filterSubstr(vector<string>& utf8Strs);
|
||||||
bool _filterStopWords(vector<string>& utf8Strs);
|
bool _filterStopWords(vector<string>& utf8Strs);
|
||||||
private:
|
private:
|
||||||
bool _priorWordPrefixes(vector<WordInfo>& wordInfos);
|
bool _prioritizeSubWords(vector<WordInfo>& wordInfos);
|
||||||
|
bool _isContainSubWords(const string& word);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
string word;// utf8 string word
|
string word;// utf8 string word
|
||||||
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
|
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
|
||||||
unsigned int count;
|
size_t count;
|
||||||
string tag;
|
string tag;
|
||||||
double weight;
|
double weight;
|
||||||
TrieNodeInfo()
|
TrieNodeInfo()
|
||||||
|
@ -7,8 +7,6 @@
|
|||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
//file path
|
|
||||||
//const char * const DICT_FILE_PATH = "dict.txt";
|
|
||||||
|
|
||||||
//typedefs
|
//typedefs
|
||||||
typedef uint16_t ChUnicode;
|
typedef uint16_t ChUnicode;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user