mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
[code style] uppercase function name
This commit is contained in:
parent
1a9a37aa64
commit
f17c2d10e2
@ -5,6 +5,7 @@
|
||||
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
||||
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
||||
3. 【兼容性预警】修改一些代码风格,比如命名空间小写化,从CppJieba变成cppjieba。
|
||||
4. 【兼容性预警】弃用Application.hpp, 取而代之使用Jieba.hpp ,接口也进行了大幅修改,函数风格更统一,和python版本的Jieba分词更一致。
|
||||
|
||||
## v3.2.1
|
||||
|
||||
|
@ -44,7 +44,7 @@ class DictTrie {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->insertNode(node_info.word, &active_node_infos_.back());
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -120,7 +120,7 @@ class DictTrie {
|
||||
}
|
||||
}
|
||||
}
|
||||
LogInfo("load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno);
|
||||
LogInfo("Load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno);
|
||||
}
|
||||
|
||||
bool MakeNodeInfo(DictUnit& node_info,
|
||||
|
@ -26,7 +26,7 @@ class FullSegment: public SegmentBase {
|
||||
delete dictTrie_;
|
||||
}
|
||||
}
|
||||
void cut(const string& sentence,
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
@ -34,11 +34,11 @@ class FullSegment: public SegmentBase {
|
||||
uwords.reserve(sentence.size());
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
cut(range.begin, range.end, uwords);
|
||||
Cut(range.begin, range.end, uwords);
|
||||
}
|
||||
TransCode::encode(uwords, words);
|
||||
}
|
||||
void cut(Unicode::const_iterator begin,
|
||||
void Cut(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<Unicode>& res) const {
|
||||
//result of searching in trie tree
|
||||
|
@ -26,11 +26,11 @@ struct HMMModel {
|
||||
emitProbVec.push_back(&emitProbE);
|
||||
emitProbVec.push_back(&emitProbM);
|
||||
emitProbVec.push_back(&emitProbS);
|
||||
loadModel(modelPath);
|
||||
LoadModel(modelPath);
|
||||
}
|
||||
~HMMModel() {
|
||||
}
|
||||
void loadModel(const string& filePath) {
|
||||
void LoadModel(const string& filePath) {
|
||||
ifstream ifile(filePath.c_str());
|
||||
if (!ifile.is_open()) {
|
||||
LogFatal("open %s failed.", filePath.c_str());
|
||||
@ -38,9 +38,9 @@ struct HMMModel {
|
||||
string line;
|
||||
vector<string> tmp;
|
||||
vector<string> tmp2;
|
||||
//load startProb
|
||||
if (!getLine(ifile, line)) {
|
||||
LogFatal("load startProb");
|
||||
//Load startProb
|
||||
if (!GetLine(ifile, line)) {
|
||||
LogFatal("Load startProb");
|
||||
}
|
||||
split(line, tmp, " ");
|
||||
if (tmp.size() != STATUS_SUM) {
|
||||
@ -50,10 +50,10 @@ struct HMMModel {
|
||||
startProb[j] = atof(tmp[j].c_str());
|
||||
}
|
||||
|
||||
//load transProb
|
||||
//Load transProb
|
||||
for (size_t i = 0; i < STATUS_SUM; i++) {
|
||||
if (!getLine(ifile, line)) {
|
||||
LogFatal("load transProb failed.");
|
||||
if (!GetLine(ifile, line)) {
|
||||
LogFatal("Load transProb failed.");
|
||||
}
|
||||
split(line, tmp, " ");
|
||||
if (tmp.size() != STATUS_SUM) {
|
||||
@ -64,27 +64,27 @@ struct HMMModel {
|
||||
}
|
||||
}
|
||||
|
||||
//load emitProbB
|
||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbB)) {
|
||||
LogFatal("load emitProbB failed.");
|
||||
//Load emitProbB
|
||||
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbB)) {
|
||||
LogFatal("Load emitProbB failed.");
|
||||
}
|
||||
|
||||
//load emitProbE
|
||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbE)) {
|
||||
LogFatal("load emitProbE failed.");
|
||||
//Load emitProbE
|
||||
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbE)) {
|
||||
LogFatal("Load emitProbE failed.");
|
||||
}
|
||||
|
||||
//load emitProbM
|
||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbM)) {
|
||||
LogFatal("load emitProbM failed.");
|
||||
//Load emitProbM
|
||||
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbM)) {
|
||||
LogFatal("Load emitProbM failed.");
|
||||
}
|
||||
|
||||
//load emitProbS
|
||||
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbS)) {
|
||||
LogFatal("load emitProbS failed.");
|
||||
//Load emitProbS
|
||||
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbS)) {
|
||||
LogFatal("Load emitProbS failed.");
|
||||
}
|
||||
}
|
||||
double getEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
||||
double GetEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
||||
double defVal)const {
|
||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||
if (cit == ptMp->end()) {
|
||||
@ -92,7 +92,7 @@ struct HMMModel {
|
||||
}
|
||||
return cit->second;
|
||||
}
|
||||
bool getLine(ifstream& ifile, string& line) {
|
||||
bool GetLine(ifstream& ifile, string& line) {
|
||||
while (getline(ifile, line)) {
|
||||
trim(line);
|
||||
if (line.empty()) {
|
||||
@ -105,7 +105,7 @@ struct HMMModel {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool loadEmitProb(const string& line, EmitProbMap& mp) {
|
||||
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
||||
if (line.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
void cut(const string& sentence,
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
@ -31,11 +31,11 @@ class HMMSegment: public SegmentBase {
|
||||
uwords.reserve(sentence.size());
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
cut(range.begin, range.end, uwords);
|
||||
Cut(range.begin, range.end, uwords);
|
||||
}
|
||||
TransCode::encode(uwords, words);
|
||||
}
|
||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
Unicode::const_iterator left = begin;
|
||||
Unicode::const_iterator right = begin;
|
||||
while (right != end) {
|
||||
@ -132,7 +132,7 @@ class HMMSegment: public SegmentBase {
|
||||
|
||||
//start
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
||||
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
||||
path[0 + y * X] = -1;
|
||||
}
|
||||
|
||||
@ -143,7 +143,7 @@ class HMMSegment: public SegmentBase {
|
||||
now = x + y*X;
|
||||
weight[now] = MIN_DOUBLE;
|
||||
path[now] = HMMModel::E; // warning
|
||||
emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||
emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||
for (size_t preY = 0; preY < Y; preY++) {
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
||||
|
@ -23,25 +23,25 @@ class Jieba {
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
mix_seg_.cut(sentence, words, hmm);
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void CutAll(const string& sentence, vector<string>& words) const {
|
||||
full_seg_.cut(sentence, words);
|
||||
full_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
query_seg_.cut(sentence, words, hmm);
|
||||
query_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||
hmm_seg_.cut(sentence, words);
|
||||
hmm_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutLevel(const string& sentence, vector<string>& words) const {
|
||||
level_seg_.cut(sentence, words);
|
||||
level_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
|
||||
level_seg_.cut(sentence, words);
|
||||
level_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||
mp_seg_.cut(sentence, words, max_word_len);
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
}
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word, tag);
|
||||
|
@ -44,7 +44,7 @@ class KeywordExtractor {
|
||||
|
||||
bool extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<string> words;
|
||||
segment_.cut(sentence, words);
|
||||
segment_.Cut(sentence, words);
|
||||
|
||||
map<string, double> wordmap;
|
||||
for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||
|
@ -18,14 +18,14 @@ class LevelSegment: public SegmentBase{
|
||||
~LevelSegment() {
|
||||
}
|
||||
|
||||
void cut(Unicode::const_iterator begin,
|
||||
void Cut(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<pair<Unicode, size_t> >& res) const {
|
||||
res.clear();
|
||||
vector<Unicode> words;
|
||||
vector<Unicode> smallerWords;
|
||||
words.reserve(end - begin);
|
||||
mpSeg_.cut(begin, end, words);
|
||||
mpSeg_.Cut(begin, end, words);
|
||||
smallerWords.reserve(words.size());
|
||||
res.reserve(words.size());
|
||||
|
||||
@ -35,7 +35,7 @@ class LevelSegment: public SegmentBase{
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
if (words[i].size() >= 3) {
|
||||
size_t len = words[i].size() - 1;
|
||||
mpSeg_.cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
|
||||
mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
|
||||
}
|
||||
if (words[i].size() > 1) {
|
||||
res.push_back(pair<Unicode, size_t>(words[i], level));
|
||||
@ -47,13 +47,13 @@ class LevelSegment: public SegmentBase{
|
||||
}
|
||||
}
|
||||
|
||||
void cut(const string& sentence,
|
||||
void Cut(const string& sentence,
|
||||
vector<pair<string, size_t> >& words) const {
|
||||
words.clear();
|
||||
Unicode unicode;
|
||||
TransCode::decode(sentence, unicode);
|
||||
vector<pair<Unicode, size_t> > unicodeWords;
|
||||
cut(unicode.begin(), unicode.end(), unicodeWords);
|
||||
Cut(unicode.begin(), unicode.end(), unicodeWords);
|
||||
words.resize(unicodeWords.size());
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
TransCode::encode(unicodeWords[i].first, words[i].first);
|
||||
@ -61,10 +61,10 @@ class LevelSegment: public SegmentBase{
|
||||
}
|
||||
}
|
||||
|
||||
bool cut(const string& sentence,
|
||||
bool Cut(const string& sentence,
|
||||
vector<string>& res) const {
|
||||
vector<pair<string, size_t> > words;
|
||||
cut(sentence, words);
|
||||
Cut(sentence, words);
|
||||
res.clear();
|
||||
res.reserve(words.size());
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
|
@ -27,7 +27,7 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
void cut(const string& sentence,
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
@ -36,11 +36,11 @@ class MPSegment: public SegmentBase {
|
||||
uwords.reserve(sentence.size());
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
cut(range.begin, range.end, uwords, max_word_len);
|
||||
Cut(range.begin, range.end, uwords, max_word_len);
|
||||
}
|
||||
TransCode::encode(uwords, words);
|
||||
}
|
||||
void cut(Unicode::const_iterator begin,
|
||||
void Cut(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<Unicode>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
@ -53,7 +53,7 @@ class MPSegment: public SegmentBase {
|
||||
CutByDag(dags, words);
|
||||
}
|
||||
|
||||
const DictTrie* getDictTrie() const {
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return dictTrie_;
|
||||
}
|
||||
|
||||
|
@ -21,47 +21,47 @@ class MixSegment: public SegmentBase {
|
||||
~MixSegment() {
|
||||
}
|
||||
|
||||
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<Unicode> uwords;
|
||||
uwords.reserve(sentence.size());
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
cut(range.begin, range.end, uwords, hmm);
|
||||
Cut(range.begin, range.end, uwords, hmm);
|
||||
}
|
||||
TransCode::encode(uwords, words);
|
||||
}
|
||||
|
||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||
if (!hmm) {
|
||||
mpSeg_.cut(begin, end, res);
|
||||
mpSeg_.Cut(begin, end, res);
|
||||
return;
|
||||
}
|
||||
vector<Unicode> words;
|
||||
words.reserve(end - begin);
|
||||
mpSeg_.cut(begin, end, words);
|
||||
mpSeg_.Cut(begin, end, words);
|
||||
|
||||
vector<Unicode> hmmRes;
|
||||
hmmRes.reserve(end - begin);
|
||||
Unicode piece;
|
||||
piece.reserve(end - begin);
|
||||
for (size_t i = 0, j = 0; i < words.size(); i++) {
|
||||
//if mp get a word, it's ok, put it into result
|
||||
//if mp gets a word, it's ok, put it into result
|
||||
if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) {
|
||||
res.push_back(words[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp get a single one and it is not in userdict, collect it in sequence
|
||||
// if mp gets a single one and it is not in userdict, collect it in sequence
|
||||
j = i;
|
||||
while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) {
|
||||
piece.push_back(words[j][0]);
|
||||
j++;
|
||||
}
|
||||
|
||||
// cut the sequence with hmm
|
||||
hmmSeg_.cut(piece.begin(), piece.end(), hmmRes);
|
||||
// Cut the sequence with hmm
|
||||
hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes);
|
||||
|
||||
//put hmm result to result
|
||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||
@ -77,8 +77,8 @@ class MixSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
const DictTrie* getDictTrie() const {
|
||||
return mpSeg_.getDictTrie();
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return mpSeg_.GetDictTrie();
|
||||
}
|
||||
private:
|
||||
MPSegment mpSeg_;
|
||||
|
@ -26,14 +26,14 @@ class PosTagger {
|
||||
}
|
||||
|
||||
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
vector<string> cutRes;
|
||||
segment_.cut(src, cutRes);
|
||||
vector<string> CutRes;
|
||||
segment_.Cut(src, CutRes);
|
||||
|
||||
const DictUnit *tmp = NULL;
|
||||
Unicode unico;
|
||||
const DictTrie * dict = segment_.getDictTrie();
|
||||
const DictTrie * dict = segment_.GetDictTrie();
|
||||
assert(dict != NULL);
|
||||
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
|
||||
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||
if (!TransCode::decode(*itr, unico)) {
|
||||
LogError("decode failed.");
|
||||
return false;
|
||||
|
@ -17,7 +17,7 @@ class QuerySegment: public SegmentBase {
|
||||
public:
|
||||
QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4)
|
||||
: mixSeg_(dict, model, userDict),
|
||||
fullSeg_(mixSeg_.getDictTrie()),
|
||||
fullSeg_(mixSeg_.GetDictTrie()),
|
||||
maxWordLen_(maxWordLen) {
|
||||
assert(maxWordLen_);
|
||||
}
|
||||
@ -26,27 +26,27 @@ class QuerySegment: public SegmentBase {
|
||||
}
|
||||
~QuerySegment() {
|
||||
}
|
||||
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<Unicode> uwords;
|
||||
uwords.reserve(sentence.size());
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
cut(range.begin, range.end, uwords, hmm);
|
||||
Cut(range.begin, range.end, uwords, hmm);
|
||||
}
|
||||
TransCode::encode(uwords, words);
|
||||
}
|
||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||
//use mix cut first
|
||||
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||
//use mix Cut first
|
||||
vector<Unicode> mixRes;
|
||||
mixSeg_.cut(begin, end, mixRes, hmm);
|
||||
mixSeg_.Cut(begin, end, mixRes, hmm);
|
||||
|
||||
vector<Unicode> fullRes;
|
||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
// if it's too long, cut with fullSeg_, put fullRes in res
|
||||
// if it's too long, Cut with fullSeg_, put fullRes in res
|
||||
if (mixResItr->size() > maxWordLen_) {
|
||||
fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
||||
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
||||
res.push_back(*fullResItr);
|
||||
}
|
||||
|
@ -117,7 +117,7 @@ class Trie {
|
||||
}
|
||||
}
|
||||
|
||||
void insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
if (key.begin() == key.end()) {
|
||||
return;
|
||||
}
|
||||
@ -150,7 +150,7 @@ class Trie {
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
for (size_t i = 0; i < keys.size(); i++) {
|
||||
insertNode(keys[i], valuePointers[i]);
|
||||
InsertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user