default encoding is changed to utf8

This commit is contained in:
gwdwyy 2013-08-20 02:11:09 +08:00
parent fda9e910ed
commit 07d27e250d
3 changed files with 49 additions and 14 deletions

View File

@ -6,7 +6,7 @@ n的jieba分词源码写的。
#Detail
========
1.现在支持gbk编码的分词。
1.现在支持utf8,gbk编码的分词。默认编码是utf8。
2.分词算法上还没增加HMM模型这部分。
3.关键词抽取暂时是针对类似title之类的超短语句使用的,基本上没有普适性。
@ -19,6 +19,7 @@ cd ./demo;
make;
./segment_demo testlines.gbk
```
Run `./segment_demo` to get help.
#Contact
wuyanyi09@gmail.com

View File

@ -73,26 +73,39 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
ext.dispose();
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
int main(int argc, char ** argv)
{
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("" == arg[1])
if(2 > argc)
{
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n"
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf-8.\n"
<<"examples:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl;
return -1;
}
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
string encoding = arg["--encoding"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
return 0;
}

View File

@ -47,23 +47,44 @@ bool dispose()
return true;
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
int main(int argc, char ** argv)
{
map<string, string> mpss;
/*map<string, string> mpss;
getArgvMap(argc, argv, mpss);
string enc = getMap<string, string>(mpss, "--encoding", "");
string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk");
*/
if(argc < 2)
{
cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n"
<<"options:\n"
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.gbk\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.utf8\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf8.\n"
<<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl;
return -1;
}
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
string encoding = arg["--encoding"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
init(dictPath.c_str());
run(argv[1]);
dispose();