mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
[code style] uppercase function name
This commit is contained in:
parent
1a9a37aa64
commit
f17c2d10e2
@ -5,6 +5,7 @@
|
|||||||
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
||||||
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
||||||
3. 【兼容性预警】修改一些代码风格,比如命名空间小写化,从CppJieba变成cppjieba。
|
3. 【兼容性预警】修改一些代码风格,比如命名空间小写化,从CppJieba变成cppjieba。
|
||||||
|
4. 【兼容性预警】弃用Application.hpp, 取而代之使用Jieba.hpp ,接口也进行了大幅修改,函数风格更统一,和python版本的Jieba分词更一致。
|
||||||
|
|
||||||
## v3.2.1
|
## v3.2.1
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ class DictTrie {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
active_node_infos_.push_back(node_info);
|
active_node_infos_.push_back(node_info);
|
||||||
trie_->insertNode(node_info.word, &active_node_infos_.back());
|
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,7 +120,7 @@ class DictTrie {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LogInfo("load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno);
|
LogInfo("Load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MakeNodeInfo(DictUnit& node_info,
|
bool MakeNodeInfo(DictUnit& node_info,
|
||||||
|
@ -26,7 +26,7 @@ class FullSegment: public SegmentBase {
|
|||||||
delete dictTrie_;
|
delete dictTrie_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void cut(const string& sentence,
|
void Cut(const string& sentence,
|
||||||
vector<string>& words) const {
|
vector<string>& words) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
@ -34,11 +34,11 @@ class FullSegment: public SegmentBase {
|
|||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords);
|
Cut(range.begin, range.end, uwords);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin,
|
void Cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Unicode>& res) const {
|
vector<Unicode>& res) const {
|
||||||
//result of searching in trie tree
|
//result of searching in trie tree
|
||||||
|
@ -26,11 +26,11 @@ struct HMMModel {
|
|||||||
emitProbVec.push_back(&emitProbE);
|
emitProbVec.push_back(&emitProbE);
|
||||||
emitProbVec.push_back(&emitProbM);
|
emitProbVec.push_back(&emitProbM);
|
||||||
emitProbVec.push_back(&emitProbS);
|
emitProbVec.push_back(&emitProbS);
|
||||||
loadModel(modelPath);
|
LoadModel(modelPath);
|
||||||
}
|
}
|
||||||
~HMMModel() {
|
~HMMModel() {
|
||||||
}
|
}
|
||||||
void loadModel(const string& filePath) {
|
void LoadModel(const string& filePath) {
|
||||||
ifstream ifile(filePath.c_str());
|
ifstream ifile(filePath.c_str());
|
||||||
if (!ifile.is_open()) {
|
if (!ifile.is_open()) {
|
||||||
LogFatal("open %s failed.", filePath.c_str());
|
LogFatal("open %s failed.", filePath.c_str());
|
||||||
@ -38,9 +38,9 @@ struct HMMModel {
|
|||||||
string line;
|
string line;
|
||||||
vector<string> tmp;
|
vector<string> tmp;
|
||||||
vector<string> tmp2;
|
vector<string> tmp2;
|
||||||
//load startProb
|
//Load startProb
|
||||||
if (!getLine(ifile, line)) {
|
if (!GetLine(ifile, line)) {
|
||||||
LogFatal("load startProb");
|
LogFatal("Load startProb");
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if (tmp.size() != STATUS_SUM) {
|
if (tmp.size() != STATUS_SUM) {
|
||||||
@ -50,10 +50,10 @@ struct HMMModel {
|
|||||||
startProb[j] = atof(tmp[j].c_str());
|
startProb[j] = atof(tmp[j].c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
//load transProb
|
//Load transProb
|
||||||
for (size_t i = 0; i < STATUS_SUM; i++) {
|
for (size_t i = 0; i < STATUS_SUM; i++) {
|
||||||
if (!getLine(ifile, line)) {
|
if (!GetLine(ifile, line)) {
|
||||||
LogFatal("load transProb failed.");
|
LogFatal("Load transProb failed.");
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if (tmp.size() != STATUS_SUM) {
|
if (tmp.size() != STATUS_SUM) {
|
||||||
@ -64,27 +64,27 @@ struct HMMModel {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbB
|
//Load emitProbB
|
||||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbB)) {
|
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbB)) {
|
||||||
LogFatal("load emitProbB failed.");
|
LogFatal("Load emitProbB failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbE
|
//Load emitProbE
|
||||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbE)) {
|
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbE)) {
|
||||||
LogFatal("load emitProbE failed.");
|
LogFatal("Load emitProbE failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbM
|
//Load emitProbM
|
||||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbM)) {
|
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbM)) {
|
||||||
LogFatal("load emitProbM failed.");
|
LogFatal("Load emitProbM failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbS
|
//Load emitProbS
|
||||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbS)) {
|
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbS)) {
|
||||||
LogFatal("load emitProbS failed.");
|
LogFatal("Load emitProbS failed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double getEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
double GetEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
||||||
double defVal)const {
|
double defVal)const {
|
||||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||||
if (cit == ptMp->end()) {
|
if (cit == ptMp->end()) {
|
||||||
@ -92,7 +92,7 @@ struct HMMModel {
|
|||||||
}
|
}
|
||||||
return cit->second;
|
return cit->second;
|
||||||
}
|
}
|
||||||
bool getLine(ifstream& ifile, string& line) {
|
bool GetLine(ifstream& ifile, string& line) {
|
||||||
while (getline(ifile, line)) {
|
while (getline(ifile, line)) {
|
||||||
trim(line);
|
trim(line);
|
||||||
if (line.empty()) {
|
if (line.empty()) {
|
||||||
@ -105,7 +105,7 @@ struct HMMModel {
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool loadEmitProb(const string& line, EmitProbMap& mp) {
|
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
||||||
if (line.empty()) {
|
if (line.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -23,7 +23,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const string& sentence,
|
void Cut(const string& sentence,
|
||||||
vector<string>& words) const {
|
vector<string>& words) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
@ -31,11 +31,11 @@ class HMMSegment: public SegmentBase {
|
|||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords);
|
Cut(range.begin, range.end, uwords);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right = begin;
|
Unicode::const_iterator right = begin;
|
||||||
while (right != end) {
|
while (right != end) {
|
||||||
@ -132,7 +132,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
|
|
||||||
//start
|
//start
|
||||||
for (size_t y = 0; y < Y; y++) {
|
for (size_t y = 0; y < Y; y++) {
|
||||||
weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
||||||
path[0 + y * X] = -1;
|
path[0 + y * X] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,7 +143,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
now = x + y*X;
|
now = x + y*X;
|
||||||
weight[now] = MIN_DOUBLE;
|
weight[now] = MIN_DOUBLE;
|
||||||
path[now] = HMMModel::E; // warning
|
path[now] = HMMModel::E; // warning
|
||||||
emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||||
for (size_t preY = 0; preY < Y; preY++) {
|
for (size_t preY = 0; preY < Y; preY++) {
|
||||||
old = x - 1 + preY * X;
|
old = x - 1 + preY * X;
|
||||||
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
||||||
|
@ -23,25 +23,25 @@ class Jieba {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
mix_seg_.cut(sentence, words, hmm);
|
mix_seg_.Cut(sentence, words, hmm);
|
||||||
}
|
}
|
||||||
void CutAll(const string& sentence, vector<string>& words) const {
|
void CutAll(const string& sentence, vector<string>& words) const {
|
||||||
full_seg_.cut(sentence, words);
|
full_seg_.Cut(sentence, words);
|
||||||
}
|
}
|
||||||
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
query_seg_.cut(sentence, words, hmm);
|
query_seg_.Cut(sentence, words, hmm);
|
||||||
}
|
}
|
||||||
void CutHMM(const string& sentence, vector<string>& words) const {
|
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||||
hmm_seg_.cut(sentence, words);
|
hmm_seg_.Cut(sentence, words);
|
||||||
}
|
}
|
||||||
void CutLevel(const string& sentence, vector<string>& words) const {
|
void CutLevel(const string& sentence, vector<string>& words) const {
|
||||||
level_seg_.cut(sentence, words);
|
level_seg_.Cut(sentence, words);
|
||||||
}
|
}
|
||||||
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
|
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
|
||||||
level_seg_.cut(sentence, words);
|
level_seg_.Cut(sentence, words);
|
||||||
}
|
}
|
||||||
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||||
mp_seg_.cut(sentence, words, max_word_len);
|
mp_seg_.Cut(sentence, words, max_word_len);
|
||||||
}
|
}
|
||||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
return dict_trie_.InsertUserWord(word, tag);
|
return dict_trie_.InsertUserWord(word, tag);
|
||||||
|
@ -44,7 +44,7 @@ class KeywordExtractor {
|
|||||||
|
|
||||||
bool extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
bool extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
segment_.cut(sentence, words);
|
segment_.Cut(sentence, words);
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||||
|
@ -18,14 +18,14 @@ class LevelSegment: public SegmentBase{
|
|||||||
~LevelSegment() {
|
~LevelSegment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(Unicode::const_iterator begin,
|
void Cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<pair<Unicode, size_t> >& res) const {
|
vector<pair<Unicode, size_t> >& res) const {
|
||||||
res.clear();
|
res.clear();
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
vector<Unicode> smallerWords;
|
vector<Unicode> smallerWords;
|
||||||
words.reserve(end - begin);
|
words.reserve(end - begin);
|
||||||
mpSeg_.cut(begin, end, words);
|
mpSeg_.Cut(begin, end, words);
|
||||||
smallerWords.reserve(words.size());
|
smallerWords.reserve(words.size());
|
||||||
res.reserve(words.size());
|
res.reserve(words.size());
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ class LevelSegment: public SegmentBase{
|
|||||||
for (size_t i = 0; i < words.size(); i++) {
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
if (words[i].size() >= 3) {
|
if (words[i].size() >= 3) {
|
||||||
size_t len = words[i].size() - 1;
|
size_t len = words[i].size() - 1;
|
||||||
mpSeg_.cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
|
mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
|
||||||
}
|
}
|
||||||
if (words[i].size() > 1) {
|
if (words[i].size() > 1) {
|
||||||
res.push_back(pair<Unicode, size_t>(words[i], level));
|
res.push_back(pair<Unicode, size_t>(words[i], level));
|
||||||
@ -47,13 +47,13 @@ class LevelSegment: public SegmentBase{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const string& sentence,
|
void Cut(const string& sentence,
|
||||||
vector<pair<string, size_t> >& words) const {
|
vector<pair<string, size_t> >& words) const {
|
||||||
words.clear();
|
words.clear();
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
TransCode::decode(sentence, unicode);
|
TransCode::decode(sentence, unicode);
|
||||||
vector<pair<Unicode, size_t> > unicodeWords;
|
vector<pair<Unicode, size_t> > unicodeWords;
|
||||||
cut(unicode.begin(), unicode.end(), unicodeWords);
|
Cut(unicode.begin(), unicode.end(), unicodeWords);
|
||||||
words.resize(unicodeWords.size());
|
words.resize(unicodeWords.size());
|
||||||
for (size_t i = 0; i < words.size(); i++) {
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
TransCode::encode(unicodeWords[i].first, words[i].first);
|
TransCode::encode(unicodeWords[i].first, words[i].first);
|
||||||
@ -61,10 +61,10 @@ class LevelSegment: public SegmentBase{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cut(const string& sentence,
|
bool Cut(const string& sentence,
|
||||||
vector<string>& res) const {
|
vector<string>& res) const {
|
||||||
vector<pair<string, size_t> > words;
|
vector<pair<string, size_t> > words;
|
||||||
cut(sentence, words);
|
Cut(sentence, words);
|
||||||
res.clear();
|
res.clear();
|
||||||
res.reserve(words.size());
|
res.reserve(words.size());
|
||||||
for (size_t i = 0; i < words.size(); i++) {
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
@ -27,7 +27,7 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const string& sentence,
|
void Cut(const string& sentence,
|
||||||
vector<string>& words,
|
vector<string>& words,
|
||||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
@ -36,11 +36,11 @@ class MPSegment: public SegmentBase {
|
|||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords, max_word_len);
|
Cut(range.begin, range.end, uwords, max_word_len);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin,
|
void Cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Unicode>& words,
|
vector<Unicode>& words,
|
||||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||||
@ -53,7 +53,7 @@ class MPSegment: public SegmentBase {
|
|||||||
CutByDag(dags, words);
|
CutByDag(dags, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictTrie* getDictTrie() const {
|
const DictTrie* GetDictTrie() const {
|
||||||
return dictTrie_;
|
return dictTrie_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,47 +21,47 @@ class MixSegment: public SegmentBase {
|
|||||||
~MixSegment() {
|
~MixSegment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
vector<Unicode> uwords;
|
vector<Unicode> uwords;
|
||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords, hmm);
|
Cut(range.begin, range.end, uwords, hmm);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||||
if (!hmm) {
|
if (!hmm) {
|
||||||
mpSeg_.cut(begin, end, res);
|
mpSeg_.Cut(begin, end, res);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
words.reserve(end - begin);
|
words.reserve(end - begin);
|
||||||
mpSeg_.cut(begin, end, words);
|
mpSeg_.Cut(begin, end, words);
|
||||||
|
|
||||||
vector<Unicode> hmmRes;
|
vector<Unicode> hmmRes;
|
||||||
hmmRes.reserve(end - begin);
|
hmmRes.reserve(end - begin);
|
||||||
Unicode piece;
|
Unicode piece;
|
||||||
piece.reserve(end - begin);
|
piece.reserve(end - begin);
|
||||||
for (size_t i = 0, j = 0; i < words.size(); i++) {
|
for (size_t i = 0, j = 0; i < words.size(); i++) {
|
||||||
//if mp get a word, it's ok, put it into result
|
//if mp Get a word, it's ok, put it into result
|
||||||
if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) {
|
if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) {
|
||||||
res.push_back(words[i]);
|
res.push_back(words[i]);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if mp get a single one and it is not in userdict, collect it in sequence
|
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||||
j = i;
|
j = i;
|
||||||
while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) {
|
while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) {
|
||||||
piece.push_back(words[j][0]);
|
piece.push_back(words[j][0]);
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// cut the sequence with hmm
|
// Cut the sequence with hmm
|
||||||
hmmSeg_.cut(piece.begin(), piece.end(), hmmRes);
|
hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes);
|
||||||
|
|
||||||
//put hmm result to result
|
//put hmm result to result
|
||||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||||
@ -77,8 +77,8 @@ class MixSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictTrie* getDictTrie() const {
|
const DictTrie* GetDictTrie() const {
|
||||||
return mpSeg_.getDictTrie();
|
return mpSeg_.GetDictTrie();
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
MPSegment mpSeg_;
|
MPSegment mpSeg_;
|
||||||
|
@ -26,14 +26,14 @@ class PosTagger {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
||||||
vector<string> cutRes;
|
vector<string> CutRes;
|
||||||
segment_.cut(src, cutRes);
|
segment_.Cut(src, CutRes);
|
||||||
|
|
||||||
const DictUnit *tmp = NULL;
|
const DictUnit *tmp = NULL;
|
||||||
Unicode unico;
|
Unicode unico;
|
||||||
const DictTrie * dict = segment_.getDictTrie();
|
const DictTrie * dict = segment_.GetDictTrie();
|
||||||
assert(dict != NULL);
|
assert(dict != NULL);
|
||||||
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
|
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||||
if (!TransCode::decode(*itr, unico)) {
|
if (!TransCode::decode(*itr, unico)) {
|
||||||
LogError("decode failed.");
|
LogError("decode failed.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -17,7 +17,7 @@ class QuerySegment: public SegmentBase {
|
|||||||
public:
|
public:
|
||||||
QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4)
|
QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4)
|
||||||
: mixSeg_(dict, model, userDict),
|
: mixSeg_(dict, model, userDict),
|
||||||
fullSeg_(mixSeg_.getDictTrie()),
|
fullSeg_(mixSeg_.GetDictTrie()),
|
||||||
maxWordLen_(maxWordLen) {
|
maxWordLen_(maxWordLen) {
|
||||||
assert(maxWordLen_);
|
assert(maxWordLen_);
|
||||||
}
|
}
|
||||||
@ -26,27 +26,27 @@ class QuerySegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
~QuerySegment() {
|
~QuerySegment() {
|
||||||
}
|
}
|
||||||
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
vector<Unicode> uwords;
|
vector<Unicode> uwords;
|
||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords, hmm);
|
Cut(range.begin, range.end, uwords, hmm);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||||
//use mix cut first
|
//use mix Cut first
|
||||||
vector<Unicode> mixRes;
|
vector<Unicode> mixRes;
|
||||||
mixSeg_.cut(begin, end, mixRes, hmm);
|
mixSeg_.Cut(begin, end, mixRes, hmm);
|
||||||
|
|
||||||
vector<Unicode> fullRes;
|
vector<Unicode> fullRes;
|
||||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||||
// if it's too long, cut with fullSeg_, put fullRes in res
|
// if it's too long, Cut with fullSeg_, put fullRes in res
|
||||||
if (mixResItr->size() > maxWordLen_) {
|
if (mixResItr->size() > maxWordLen_) {
|
||||||
fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
||||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
||||||
res.push_back(*fullResItr);
|
res.push_back(*fullResItr);
|
||||||
}
|
}
|
||||||
|
@ -117,7 +117,7 @@ class Trie {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||||
if (key.begin() == key.end()) {
|
if (key.begin() == key.end()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -150,7 +150,7 @@ class Trie {
|
|||||||
assert(keys.size() == valuePointers.size());
|
assert(keys.size() == valuePointers.size());
|
||||||
|
|
||||||
for (size_t i = 0; i < keys.size(); i++) {
|
for (size_t i = 0; i < keys.size(); i++) {
|
||||||
insertNode(keys[i], valuePointers[i]);
|
InsertNode(keys[i], valuePointers[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user