mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
default encoding is changed to utf8
This commit is contained in:
parent
fda9e910ed
commit
07d27e250d
@ -6,7 +6,7 @@ n的jieba分词源码写的。
|
||||
|
||||
#Detail
|
||||
========
|
||||
1.现在只支持gbk编码的分词。
|
||||
1.现在支持utf8,gbk编码的分词。默认编码是utf8。
|
||||
2.分词算法上还没增加HMM模型这部分。
|
||||
3.关键词抽取暂时是针对类似title之类的超短语句使用的,基本上没有普适性。
|
||||
|
||||
@ -19,6 +19,7 @@ cd ./demo;
|
||||
make;
|
||||
./segment_demo testlines.gbk
|
||||
```
|
||||
run `./segment_demo` to get help.
|
||||
|
||||
#Contact
|
||||
wuyanyi09@gmail.com
|
||||
|
@ -73,25 +73,38 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
|
||||
ext.dispose();
|
||||
}
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
if("" == dictPath)
|
||||
{
|
||||
dictPath = DEFAULT_DICTPATH;
|
||||
}
|
||||
if("" == arg[1])
|
||||
if(2 > argc)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
|
||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf-8.\n"
|
||||
<<"examples:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||
<<endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
string encoding = arg["--encoding"];
|
||||
if("" == dictPath)
|
||||
{
|
||||
dictPath = DEFAULT_DICTPATH;
|
||||
}
|
||||
if("gbk" == encoding)
|
||||
{
|
||||
TransCode::setGbkEnc();
|
||||
}
|
||||
else
|
||||
{
|
||||
TransCode::setUtf8Enc();
|
||||
}
|
||||
|
||||
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
||||
return 0;
|
||||
|
@ -47,23 +47,44 @@ bool dispose()
|
||||
return true;
|
||||
}
|
||||
|
||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
map<string, string> mpss;
|
||||
/*map<string, string> mpss;
|
||||
getArgvMap(argc, argv, mpss);
|
||||
string enc = getMap<string, string>(mpss, "--encoding", "");
|
||||
string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk");
|
||||
|
||||
*/
|
||||
if(argc < 2)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n"
|
||||
<<"options:\n"
|
||||
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.gbk\n"
|
||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
|
||||
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.utf8\n"
|
||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf8.\n"
|
||||
<<"example:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||
<<endl;
|
||||
|
||||
return -1;
|
||||
}
|
||||
ArgvContext arg(argc, argv);
|
||||
string dictPath = arg["--dictpath"];
|
||||
string encoding = arg["--encoding"];
|
||||
if("" == dictPath)
|
||||
{
|
||||
dictPath = DEFAULT_DICTPATH;
|
||||
}
|
||||
if("gbk" == encoding)
|
||||
{
|
||||
TransCode::setGbkEnc();
|
||||
}
|
||||
else
|
||||
{
|
||||
TransCode::setUtf8Enc();
|
||||
}
|
||||
|
||||
init(dictPath.c_str());
|
||||
run(argv[1]);
|
||||
dispose();
|
||||
|
Loading…
x
Reference in New Issue
Block a user