default encoding is changed to utf8

This commit is contained in:
gwdwyy 2013-08-20 02:11:09 +08:00
parent fda9e910ed
commit 07d27e250d
3 changed files with 49 additions and 14 deletions

View File

@ -6,7 +6,7 @@ n的jieba分词源码写的。
#Detail #Detail
======== ========
1.现在支持gbk编码的分词。 1.现在支持utf8,gbk编码的分词。默认编码是utf8。
2.分词算法上还没增加HMM模型这部分。 2.分词算法上还没增加HMM模型这部分。
3.关键词抽取是暂时是针对类似title之类的超短语句使用的基本上没有普适性。 3.关键词抽取是暂时是针对类似title之类的超短语句使用的基本上没有普适性。
@ -19,6 +19,7 @@ cd ./demo;
make; make;
./segment_demo testlines.gbk ./segment_demo testlines.gbk
``` ```
run `./segment_demo` to get help.
#Contact #Contact
wuyanyi09@gmail.com wuyanyi09@gmail.com

View File

@ -73,25 +73,38 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
ext.dispose(); ext.dispose();
} }
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk"; const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
ArgvContext arg(argc, argv); if(2 > argc)
string dictPath = arg["--dictpath"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("" == arg[1])
{ {
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n" <<"options:\n"
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n" <<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk." <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf-8.\n"
<<"examples:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl; <<endl;
return -1; return -1;
} }
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
string encoding = arg["--encoding"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
testKeyWordExt(dictPath.c_str(), arg[1].c_str()); testKeyWordExt(dictPath.c_str(), arg[1].c_str());
return 0; return 0;

View File

@ -47,23 +47,44 @@ bool dispose()
return true; return true;
} }
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
map<string, string> mpss; /*map<string, string> mpss;
getArgvMap(argc, argv, mpss); getArgvMap(argc, argv, mpss);
string enc = getMap<string, string>(mpss, "--encoding", ""); string enc = getMap<string, string>(mpss, "--encoding", "");
string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk"); string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk");
*/
if(argc < 2) if(argc < 2)
{ {
cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n" cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n"
<<"options:\n" <<"options:\n"
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.gbk\n" <<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.utf8\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk." <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf8.\n"
<<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl; <<endl;
return -1; return -1;
} }
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
string encoding = arg["--encoding"];
if("" == dictPath)
{
dictPath = DEFAULT_DICTPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
init(dictPath.c_str()); init(dictPath.c_str());
run(argv[1]); run(argv[1]);
dispose(); dispose();