upgrade basic functions

This commit is contained in:
yanyiwu 2015-12-12 21:25:57 +08:00
parent 8bf70127c2
commit bcb112a4b1
19 changed files with 90 additions and 118 deletions

View File

@ -1,5 +1,9 @@
# CppJieba ChangeLog
## next version
1. Upgrade [limonp] to version v0.4.1, [husky] to version v0.2.0
## v4.2.0
1. 修复[issue50]提到的多词典分隔符在Windows环境下存在的问题:分隔符从':'修改成'|'或';'。
@ -66,14 +70,14 @@
## v2.4.3
1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。
1. 更新 [husky] 服务代码,新 [husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。
2. 修改 PosTagger 的参数结构,删除暂时无用的参数。并添加使用自定义字典的参数,也就是支持 **自定义词性**
3. 更好的支持 `mac osx` (原谅作者如此屌丝,这么晚才买 `mac` )。
4. 支持 `Docker` ,具体请见 `Dockerfile`
## v2.4.2
1. 适当使用 `vector` 的基础上,使用`Limonp/LocalVector.hpp`作为`Unicode`的类型等优化,约提高性能 `30%`
1. 适当使用 `vector` 的基础上,使用`limonp/LocalVector.hpp`作为`Unicode`的类型等优化,约提高性能 `30%`
2. 使 `cjserver` 支持用户自定义词典,通过在 `conf/server.conf` 里面配置 `user_dict_path` 来实现。
3. 修复 `MPSegment` 切词时,当句子中含有特殊字符时,切词结果不完整的问题。
4. 修改 `FullSegment` 减少内存使用。
@ -131,9 +135,9 @@
1. 完成__最大概率分词算法__和__HMM分词算法__并且将他们结合起来成效果最好的`MixSegment`
2. 进行大量的代码重构将主要的功能性代码都写成了hpp文件。
3. 使用`cmake`工具来管理项目。
4. 使用 [Limonp]作为工具函数库,比如日志,字符串操作等常用函数。
5. 使用 [Husky] 搭简易分词服务的服务器框架。
4. 使用 [limonp]作为工具函数库,比如日志,字符串操作等常用函数。
5. 使用 [husky] 搭简易分词服务的服务器框架。
[Limonp]:http://github.com/yanyiwu/limonp.git
[Husky]:http://github.com/yanyiwu/husky.git
[limonp]:http://github.com/yanyiwu/limonp.git
[husky]:http://github.com/yanyiwu/husky.git
[issue50]:https://github.com/yanyiwu/cppjieba/issues/50

View File

@ -19,7 +19,7 @@ class ReqHandler: public IRequestHandler {
virtual ~ReqHandler() {
}
virtual bool doGET(const HttpReqInfo& httpReq, string& strSnd) {
virtual bool DoGET(const HttpReqInfo& httpReq, string& strSnd) {
string sentence, method, format;
string tmp;
vector<string> words;
@ -32,9 +32,9 @@ class ReqHandler: public IRequestHandler {
return true;
}
virtual bool doPOST(const HttpReqInfo& httpReq, string& strSnd) {
virtual bool DoPOST(const HttpReqInfo& httpReq, string& strSnd) {
vector<string> words;
Run(httpReq.getBody(), "MIX", "simple", strSnd);
Run(httpReq.GetBody(), "MIX", "simple", strSnd);
return true;
}
@ -57,7 +57,7 @@ class ReqHandler: public IRequestHandler {
jieba_.Cut(sentence, words, false);
}
if (format == "simple") {
join(words.begin(), words.end(), strSnd, " ");
Join(words.begin(), words.end(), strSnd, " ");
} else {
strSnd << words;
}
@ -74,22 +74,22 @@ bool Run(int argc, char** argv) {
if (!conf) {
return false;
}
int port = conf.get("port", 1339);
int threadNumber = conf.get("thread_number", 4);
int queueMaxSize = conf.get("queue_max_size", 1024);
string dictPath = conf.get("dict_path", "");
string modelPath = conf.get("model_path", "");
string userDictPath = conf.get("user_dict_path", "");
int port = conf.Get("port", 1339);
int threadNumber = conf.Get("thread_number", 4);
int queueMaxSize = conf.Get("queue_max_size", 1024);
string dictPath = conf.Get("dict_path", "");
string modelPath = conf.Get("model_path", "");
string userDictPath = conf.Get("user_dict_path", "");
LogInfo("config info: %s", conf.getConfigInfo().c_str());
LOG(INFO) << "config info: " << conf.GetConfigInfo();
cppjieba::Jieba jieba(dictPath,
modelPath,
userDictPath);
ReqHandler reqHandler(jieba);
ThreadPoolServer sf(threadNumber, queueMaxSize, port, reqHandler);
return sf.start();
ThreadPoolServer server(threadNumber, queueMaxSize, port, reqHandler);
return server.Start();
}
int main(int argc, char* argv[]) {

View File

@ -9,7 +9,7 @@
#include <cmath>
#include <limits>
#include "limonp/StringUtil.hpp"
#include "limonp/Logger.hpp"
#include "limonp/Logging.hpp"
#include "TransCode.hpp"
#include "Trie.hpp"
@ -60,7 +60,7 @@ class DictTrie {
}
bool IsUserDictSingleChineseWord(const Rune& word) const {
return isIn(user_dict_single_chinese_word_, word);
return IsIn(user_dict_single_chinese_word_, word);
}
double GetMinWeight() const {
@ -93,24 +93,20 @@ class DictTrie {
}
void LoadUserDict(const string& filePaths) {
vector<string> files = limonp::split(filePaths, "|;");
vector<string> files = limonp::Split(filePaths, "|;");
size_t lineno = 0;
for (size_t i = 0; i < files.size(); i++) {
ifstream ifs(files[i].c_str());
if (!ifs.is_open()) {
LogFatal("file %s open failed.", files[i].c_str());
}
CHECK(ifs.is_open()) << "open " << files[i] << " failed";
string line;
DictUnit node_info;
vector<string> buf;
for (; getline(ifs, line); lineno++) {
if (line.size() == 0)
if (line.size() == 0) {
continue;
buf.clear();
split(line, buf, " ");
if (buf.size() < 1) {
LogFatal("split [%s] result illegal", line.c_str());
}
buf.clear();
Split(line, buf, " ");
DictUnit node_info;
MakeNodeInfo(node_info,
buf[0],
@ -122,7 +118,7 @@ class DictTrie {
}
}
}
LogInfo("Load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno);
LOG(INFO) << "load userdicts " << filePaths << ", lines: " << lineno;
}
bool MakeNodeInfo(DictUnit& node_info,
@ -130,7 +126,7 @@ class DictTrie {
double weight,
const string& tag) {
if (!TransCode::Decode(word, node_info.word)) {
LogError("Decode %s failed.", word.c_str());
LOG(ERROR) << "Decode " << word << " failed.";
return false;
}
node_info.weight = weight;
@ -140,18 +136,14 @@ class DictTrie {
void LoadDict(const string& filePath) {
ifstream ifs(filePath.c_str());
if (!ifs.is_open()) {
LogFatal("file %s open failed.", filePath.c_str());
}
CHECK(ifs.is_open()) << "open " << filePath << " failed.";
string line;
vector<string> buf;
DictUnit node_info;
for (size_t lineno = 0; getline(ifs, line); lineno++) {
split(line, buf, " ");
if (buf.size() != DICT_COLUMN_NUM) {
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
}
Split(line, buf, " ");
CHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
MakeNodeInfo(node_info,
buf[0],
atof(buf[1].c_str()),
@ -165,9 +157,7 @@ class DictTrie {
}
void SetStaticWordWeights(UserWordWeightOption option) {
if (static_node_infos_.empty()) {
LogFatal("something must be wrong");
}
CHECK(!static_node_infos_.empty());
vector<DictUnit> x = static_node_infos_;
sort(x.begin(), x.end(), WeightCompare);
min_weight_ = x[0].weight;

View File

@ -4,7 +4,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "limonp/Logger.hpp"
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "TransCode.hpp"
@ -15,7 +15,6 @@ class FullSegment: public SegmentBase {
FullSegment(const string& dictPath) {
dictTrie_ = new DictTrie(dictPath);
isNeedDestroy_ = true;
LogInfo("FullSegment init %s ok", dictPath.c_str());
}
FullSegment(const DictTrie* dictTrie)
: dictTrie_(dictTrie), isNeedDestroy_(false) {

View File

@ -32,57 +32,43 @@ struct HMMModel {
}
void LoadModel(const string& filePath) {
ifstream ifile(filePath.c_str());
if (!ifile.is_open()) {
LogFatal("open %s failed.", filePath.c_str());
}
CHECK(ifile.is_open()) << "open " << filePath << " failed";
string line;
vector<string> tmp;
vector<string> tmp2;
//Load startProb
if (!GetLine(ifile, line)) {
LogFatal("Load startProb");
}
split(line, tmp, " ");
if (tmp.size() != STATUS_SUM) {
LogFatal("start_p illegal");
}
CHECK(GetLine(ifile, line));
Split(line, tmp, " ");
CHECK(tmp.size() == STATUS_SUM);
for (size_t j = 0; j< tmp.size(); j++) {
startProb[j] = atof(tmp[j].c_str());
}
//Load transProb
for (size_t i = 0; i < STATUS_SUM; i++) {
if (!GetLine(ifile, line)) {
LogFatal("Load transProb failed.");
}
split(line, tmp, " ");
if (tmp.size() != STATUS_SUM) {
LogFatal("trans_p illegal");
}
CHECK(GetLine(ifile, line));
Split(line, tmp, " ");
CHECK(tmp.size() == STATUS_SUM);
for (size_t j =0; j < STATUS_SUM; j++) {
transProb[i][j] = atof(tmp[j].c_str());
}
}
//Load emitProbB
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbB)) {
LogFatal("Load emitProbB failed.");
}
CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbB));
//Load emitProbE
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbE)) {
LogFatal("Load emitProbE failed.");
}
CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbE));
//Load emitProbM
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbM)) {
LogFatal("Load emitProbM failed.");
}
CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbM));
//Load emitProbS
if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbS)) {
LogFatal("Load emitProbS failed.");
}
CHECK(GetLine(ifile, line));
CHECK(LoadEmitProb(line, emitProbS));
}
double GetEmitProb(const EmitProbMap* ptMp, uint16_t key,
double defVal)const {
@ -94,11 +80,11 @@ struct HMMModel {
}
bool GetLine(ifstream& ifile, string& line) {
while (getline(ifile, line)) {
trim(line);
Trim(line);
if (line.empty()) {
continue;
}
if (startsWith(line, "#")) {
if (StartsWith(line, "#")) {
continue;
}
return true;
@ -111,15 +97,15 @@ struct HMMModel {
}
vector<string> tmp, tmp2;
Unicode unicode;
split(line, tmp, ",");
Split(line, tmp, ",");
for (size_t i = 0; i < tmp.size(); i++) {
split(tmp[i], tmp2, ":");
Split(tmp[i], tmp2, ":");
if (2 != tmp2.size()) {
LogError("emitProb illegal.");
LOG(ERROR) << "emitProb illegal.";
return false;
}
if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) {
LogError("TransCode failed.");
LOG(ERROR) << "TransCode failed.";
return false;
}
mp[unicode[0]] = atof(tmp2[1].c_str());

View File

@ -79,9 +79,7 @@ class KeywordExtractor {
private:
void LoadIdfDict(const string& idfPath) {
ifstream ifs(idfPath.c_str());
if (!ifs.is_open()) {
LogFatal("open %s failed.", idfPath.c_str());
}
CHECK(ifs.is_open()) << "open " << idfPath << " failed";
string line ;
vector<string> buf;
double idf = 0.0;
@ -90,12 +88,12 @@ class KeywordExtractor {
for (; getline(ifs, line); lineno++) {
buf.clear();
if (line.empty()) {
LogError("line[%d] empty. skipped.", lineno);
LOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
continue;
}
split(line, buf, " ");
Split(line, buf, " ");
if (buf.size() != 2) {
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
LOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
continue;
}
idf = atof(buf[1].c_str());
@ -110,9 +108,7 @@ class KeywordExtractor {
}
void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str());
if (!ifs.is_open()) {
LogFatal("open %s failed.", filePath.c_str());
}
CHECK(ifs.is_open()) << "open " << filePath << " failed";
string line ;
while (getline(ifs, line)) {
stopWords_.insert(line);

View File

@ -10,7 +10,6 @@ class LevelSegment: public SegmentBase{
LevelSegment(const string& dictPath,
const string& userDictPath = "")
: mpSeg_(dictPath, userDictPath) {
LogInfo("LevelSegment init");
}
LevelSegment(const DictTrie* dictTrie)
: mpSeg_(dictTrie) {

View File

@ -4,7 +4,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "limonp/Logger.hpp"
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
@ -15,7 +15,6 @@ class MPSegment: public SegmentBase {
MPSegment(const string& dictPath, const string& userDictPath = "") {
dictTrie_ = new DictTrie(dictPath, userDictPath);
isNeedDestroy_ = true;
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
}
MPSegment(const DictTrie* dictTrie)
: dictTrie_(dictTrie), isNeedDestroy_(false) {

View File

@ -13,7 +13,6 @@ class MixSegment: public SegmentBase {
const string& userDict = "")
: mpSeg_(mpSegDict, userDict),
hmmSeg_(hmmSegDict) {
LogInfo("MixSegment init %s, %s", mpSegDict.c_str(), hmmSegDict.c_str());
}
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
: mpSeg_(dictTrie), hmmSeg_(model) {

View File

@ -35,7 +35,7 @@ class PosTagger {
assert(dict != NULL);
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
if (!TransCode::Decode(*itr, unico)) {
LogError("Decode failed.");
LOG(ERROR) << "Decode failed.";
return false;
}
tmp = dict->Find(unico.begin(), unico.end());

View File

@ -38,7 +38,7 @@ class PreFilter {
Range range;
range.begin = cursor_;
while (cursor_ != sentence_.end()) {
if (isIn(symbols_, *cursor_)) {
if (IsIn(symbols_, *cursor_)) {
if (range.begin == cursor_) {
cursor_ ++;
}

View File

@ -4,7 +4,7 @@
#include <algorithm>
#include <set>
#include <cassert>
#include "limonp/Logger.hpp"
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "FullSegment.hpp"

View File

@ -1,7 +1,7 @@
#ifndef CPPJIEBA_SEGMENTBASE_H
#define CPPJIEBA_SEGMENTBASE_H
#include "limonp/Logger.hpp"
#include "limonp/Logging.hpp"
#include "PreFilter.hpp"
#include <cassert>

View File

@ -21,7 +21,7 @@ inline bool Decode(const string& str, Unicode& res) {
#ifdef CPPJIEBA_GBK
return gbkTrans(str, res);
#else
return utf8ToUnicode(str, res);
return Utf8ToUnicode(str, res);
#endif
}
@ -29,7 +29,7 @@ inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, s
#ifdef CPPJIEBA_GBK
gbkTrans(begin, end, res);
#else
unicodeToUtf8(begin, end, res);
UnicodeToUtf8(begin, end, res);
#endif
}

View File

@ -20,7 +20,7 @@ struct DictUnit {
inline ostream & operator << (ostream& os, const DictUnit& unit) {
string s;
s << unit.word;
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
}
struct Dag {

View File

@ -12,26 +12,26 @@ int main(int argc, char** argv) {
cout << "[demo] Cut With HMM" << endl;
jieba.Cut(s, words, true);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Cut Without HMM " << endl;
jieba.Cut(s, words, false);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
cout << "[demo] CutAll" << endl;
jieba.CutAll(s, words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
cout << "[demo] CutForSearch" << endl;
jieba.CutForSearch(s, words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Insert User Word" << endl;
jieba.Cut("男默女泪", words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
jieba.InsertUserWord("男默女泪");
jieba.Cut("男默女泪", words);
cout << limonp::join(words.begin(), words.end(), "/") << endl;
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
cout << "[demo] Locate Words" << endl;
vector<cppjieba::Jieba::LocWord> loc_words;

View File

@ -91,6 +91,6 @@ TEST(JiebaTest, InsertUserWord) {
ASSERT_TRUE(jieba.InsertUserWord(newWord));
jieba.Cut(newWord, words);
result << words;
ASSERT_EQ(result, string_format("[\"%s\"]", newWord.c_str()));
ASSERT_EQ(result, StringFormat("[\"%s\"]", newWord.c_str()));
}
}

View File

@ -20,7 +20,7 @@ TEST(PreFilterTest, Test1) {
range = filter.Next();
words.push_back(TransCode::Encode(range.begin, range.end));
}
res = join(words.begin(), words.end(), "/");
res = Join(words.begin(), words.end(), "/");
ASSERT_EQ(res, expected);
}
@ -34,7 +34,7 @@ TEST(PreFilterTest, Test1) {
range = filter.Next();
words.push_back(TransCode::Encode(range.begin, range.end));
}
res = join(words.begin(), words.end(), "/");
res = Join(words.begin(), words.end(), "/");
for (size_t i = 0; i < words.size(); i++) {
}
ASSERT_EQ(res, expected);

View File

@ -20,7 +20,7 @@ TEST(MixSegmentTest, Test1) {
sentence = "我来自北京邮电大学。。。学号123456用AK47";
expected = "我/来自/北京邮电大学/。/。/。/学号/123456//用/AK47";
segment.Cut(sentence, words);
actual = join(words.begin(), words.end(), "/");
actual = Join(words.begin(), words.end(), "/");
ASSERT_EQ(actual, expected);
}
@ -28,7 +28,7 @@ TEST(MixSegmentTest, Test1) {
sentence = "B超 T恤";
expected = "B超/ /T恤";
segment.Cut(sentence, words);
actual = join(words.begin(), words.end(), "/");
actual = Join(words.begin(), words.end(), "/");
ASSERT_EQ(actual, expected);
}
@ -36,7 +36,7 @@ TEST(MixSegmentTest, Test1) {
sentence = "他来到了网易杭研大厦";
expected = "他/来到/了/网易/杭/研/大厦";
segment.Cut(sentence, words, false);
actual = join(words.begin(), words.end(), "/");
actual = Join(words.begin(), words.end(), "/");
ASSERT_EQ(actual, expected);
}
@ -44,7 +44,7 @@ TEST(MixSegmentTest, Test1) {
sentence = "他来到了网易杭研大厦";
expected = "他/来到/了/网易/杭研/大厦";
segment.Cut(sentence, words);
actual = join(words.begin(), words.end(), "/");
actual = Join(words.begin(), words.end(), "/");
ASSERT_EQ(actual, expected);
}
}
@ -102,7 +102,7 @@ TEST(MixSegmentTest, TestUserDict) {
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
segment.Cut("忽如一夜春风来,千树万树梨花开", words);
res = limonp::join(words.begin(), words.end(), "/");
res = limonp::Join(words.begin(), words.end(), "/");
ASSERT_EQ("忽如一夜春风来//千树/万树/梨花/开", res);
}
@ -113,7 +113,7 @@ TEST(MixSegmentTest, TestMultiUserDict) {
string res;
segment.Cut("忽如一夜春风来,千树万树梨花开", words);
res = limonp::join(words.begin(), words.end(), "/");
res = limonp::Join(words.begin(), words.end(), "/");
ASSERT_EQ("忽如一夜春风来//千树万树梨花开", res);
}
@ -138,11 +138,11 @@ TEST(MPSegmentTest, Test1) {
ASSERT_EQ("[\"\", \"\", \"\", \"\", \"\", \"\", \"\"]", s << words);
segment.Cut("湖南长沙市天心区", words);
s = join(words.begin(), words.end(), "/");
s = Join(words.begin(), words.end(), "/");
ASSERT_EQ("湖南长沙市/天心区", s);
segment.Cut("湖南长沙市天心区", words, 3);
s = join(words.begin(), words.end(), "/");
s = Join(words.begin(), words.end(), "/");
ASSERT_EQ("湖南/长沙市/天心区", s);
}
@ -254,7 +254,7 @@ TEST(QuerySegment, Test2) {
{
vector<string> words;
segment.Cut("internal", words);
string s = join(words.begin(), words.end(), "/");
string s = Join(words.begin(), words.end(), "/");
ASSERT_EQ("internal", s);
}
@ -263,7 +263,7 @@ TEST(QuerySegment, Test2) {
{
vector<string> words;
segment.Cut("中国科学院", words);
string s = join(words.begin(), words.end(), "/");
string s = Join(words.begin(), words.end(), "/");
ASSERT_EQ("中国科学院", s);
}
}