add config to enable/disable lowercase and remote_dict, Closes #241

parent b662596939
commit 341b586373
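For reference, the three analyzer settings this commit reads, wired up end to end — a minimal sketch, not part of the commit itself; `Settings.settingsBuilder()` is the ES 2.x-era builder this plugin targets, and the `Environment` is assumed to come from the node:

```java
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;

public class IkConfigSketch {
    // Builds an IK Configuration with the three flags this commit reads.
    static Configuration build(Environment env) {
        Settings settings = Settings.settingsBuilder()
                .put("use_smart", "true")            // smart segmentation (default: false)
                .put("enable_lowercase", "false")    // keep A-Z unchanged (default: true)
                .put("enable_remote_dict", "false")  // skip remote-dict monitors (default: true)
                .build();
        return new Configuration(env, settings);
    }
}
```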
README.md
@@ -230,7 +230,12 @@ mvn compile
 mvn package
 ```
 
-copy & unzip file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip to your elasticsearch's folder: plugins/ik
+Copy and unzip the release file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip into your elasticsearch plugin directory, e.g.: plugins/ik
+Restart elasticsearch
+
+3. Tokenization test fails
+Please test by calling the analyze API under a concrete index, rather than calling the analyze API directly
+e.g.: http://localhost:9200/your_index/_analyze?text=中华人民共和国MN&tokenizer=my_ik
 
 
 Thanks
IkAnalysisBinderProcessor.java
@@ -1,7 +1,6 @@
 package org.elasticsearch.index.analysis;
 
 
-@Deprecated
 public class IkAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
 
 
IkAnalyzerProvider.java
@@ -10,17 +10,16 @@ import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKAnalyzer;
 
-@Deprecated
 public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
     private final IKAnalyzer analyzer;
-    private boolean useSmart=false;
 
     @Inject
     public IkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettingsService.getSettings(), name, settings);
-        Dictionary.initial(new Configuration(env));
-        useSmart = settings.get("use_smart", "false").equals("true");
-        analyzer=new IKAnalyzer(useSmart);
+        Configuration configuration=new Configuration(env,settings);
+
+        analyzer=new IKAnalyzer(configuration);
     }
 
     @Override public IKAnalyzer get() {
IkTokenizerFactory.java
@@ -8,25 +8,18 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettingsService;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKTokenizer;
 
-@Deprecated
 public class IkTokenizerFactory extends AbstractTokenizerFactory {
-    private final Settings settings;
-    private boolean useSmart=false;
+    private Configuration configuration;
 
     @Inject
     public IkTokenizerFactory(Index index, IndexSettingsService indexSettingsService,Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettingsService.getSettings(), name, settings);
-        this.settings=settings;
-        Dictionary.initial(new Configuration(env));
+        configuration=new Configuration(env,settings);
     }
 
     @Override
     public Tokenizer create() {
-        this.useSmart = settings.get("use_smart", "false").equals("true");
-
-        return new IKTokenizer(useSmart); }
+        return new IKTokenizer(configuration); }
 }
IKIndicesAnalysis.java
@@ -3,6 +3,7 @@ package org.elasticsearch.indices.analysis;
 import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.analysis.AnalyzerScope;
@@ -26,21 +27,20 @@ public class IKIndicesAnalysis extends AbstractComponent {
     public IKIndicesAnalysis(final Settings settings,
                              IndicesAnalysisService indicesAnalysisService,Environment env) {
         super(settings);
-        Dictionary.initial(new Configuration(env));
-        this.useSmart = settings.get("use_smart", "false").equals("true");
+        final Configuration configuration=new Configuration(env,settings).setUseSmart(false);
+        final Configuration smartConfiguration=new Configuration(env,settings).setUseSmart(true);
 
         indicesAnalysisService.analyzerProviderFactories().put("ik",
                 new PreBuiltAnalyzerProviderFactory("ik", AnalyzerScope.GLOBAL,
-                        new IKAnalyzer(useSmart)));
+                        new IKAnalyzer(configuration)));
 
         indicesAnalysisService.analyzerProviderFactories().put("ik_smart",
                 new PreBuiltAnalyzerProviderFactory("ik_smart", AnalyzerScope.GLOBAL,
-                        new IKAnalyzer(true)));
+                        new IKAnalyzer(smartConfiguration)));
 
         indicesAnalysisService.analyzerProviderFactories().put("ik_max_word",
                 new PreBuiltAnalyzerProviderFactory("ik_max_word", AnalyzerScope.GLOBAL,
-                        new IKAnalyzer(false)));
+                        new IKAnalyzer(configuration)));
 
         indicesAnalysisService.tokenizerFactories().put("ik",
                 new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
@@ -51,7 +51,7 @@ public class IKIndicesAnalysis extends AbstractComponent {
 
                     @Override
                     public Tokenizer create() {
-                        return new IKTokenizer(false);
+                        return new IKTokenizer(configuration);
                     }
                 }));
 
@@ -64,7 +64,7 @@ public class IKIndicesAnalysis extends AbstractComponent {
 
                     @Override
                     public Tokenizer create() {
-                        return new IKTokenizer(true);
+                        return new IKTokenizer(smartConfiguration);
                     }
                 }));
 
@@ -77,8 +77,8 @@ public class IKIndicesAnalysis extends AbstractComponent {
 
                     @Override
                     public Tokenizer create() {
-                        return new IKTokenizer(false);
+                        return new IKTokenizer(configuration);
                     }
                 }));
     }
 }
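A note on the pair of prebuilt configurations above: `setUseSmart` (added to `Configuration` below) mutates the instance and returns `this`, so the smart and non-smart analyzer families cannot share one object. A sketch of that contract, with `env` and `settings` assumed to be in scope:

```java
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;

class PrebuiltConfigSketch {
    static void wire(Environment env, Settings settings) {
        // setUseSmart mutates in place and returns `this`, so the smart and
        // non-smart prebuilt analyzers each need their own Configuration.
        Configuration nonSmart = new Configuration(env, settings).setUseSmart(false); // ik, ik_max_word
        Configuration smart = new Configuration(env, settings).setUseSmart(true);     // ik_smart
        assert nonSmart.isUseSmart() != smart.isUseSmart();
    }
}
```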
Configuration.java
@@ -7,8 +7,10 @@ import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
+import org.wltea.analyzer.dic.Dictionary;
 
 import java.io.*;
 import java.net.URL;
@@ -20,132 +22,61 @@ import java.util.Properties;
 
 public class Configuration {
 
-    private static String FILE_NAME = "IKAnalyzer.cfg.xml";
-    private static final String EXT_DICT = "ext_dict";
-    private static final String REMOTE_EXT_DICT = "remote_ext_dict";
-    private static final String EXT_STOP = "ext_stopwords";
-    private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
-    private static ESLogger logger = Loggers.getLogger("ik-analyzer");
-    private Path conf_dir;
-    private Properties props;
     private Environment environment;
+    private Settings settings;
 
+    // whether smart segmentation is enabled
+    private boolean useSmart;
+
+    // whether remote dictionary loading is enabled
+    private boolean enableRemoteDict = false;
+
+    // whether lowercase folding is enabled
+    private boolean enableLowercase = true;
 
     @Inject
-    public Configuration(Environment env) {
-        props = new Properties();
-        environment = env;
-
-        conf_dir = environment.configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
-        Path configFile = conf_dir.resolve(FILE_NAME);
-
-        InputStream input = null;
-        try {
-            logger.info("try load config from {}", configFile);
-            input = new FileInputStream(configFile.toFile());
-        } catch (FileNotFoundException e) {
-            conf_dir = this.getConfigInPluginDir();
-            configFile = conf_dir.resolve(FILE_NAME);
-            try {
-                logger.info("try load config from {}", configFile);
-                input = new FileInputStream(configFile.toFile());
-            } catch (FileNotFoundException ex) {
-                // We should report origin exception
-                logger.error("ik-analyzer", e);
-            }
-        }
-        if (input != null) {
-            try {
-                props.loadFromXML(input);
-            } catch (InvalidPropertiesFormatException e) {
-                logger.error("ik-analyzer", e);
-            } catch (IOException e) {
-                logger.error("ik-analyzer", e);
-            }
-        }
+    public Configuration(Environment env, Settings settings) {
+        this.environment = env;
+        this.settings = settings;
+
+        this.useSmart = settings.get("use_smart", "false").equals("true");
+        this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
+        this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
+
+        Dictionary.initial(this);
     }
 
-    public List<String> getExtDictionarys() {
-        List<String> extDictFiles = new ArrayList<String>(2);
-        String extDictCfg = props.getProperty(EXT_DICT);
-        if (extDictCfg != null) {
-
-            String[] filePaths = extDictCfg.split(";");
-            if (filePaths != null) {
-                for (String filePath : filePaths) {
-                    if (filePath != null && !"".equals(filePath.trim())) {
-                        Path file = PathUtils.get(filePath.trim());
-                        extDictFiles.add(file.toString());
-
-                    }
-                }
-            }
-        }
-        return extDictFiles;
-    }
-
-    public List<String> getRemoteExtDictionarys() {
-        List<String> remoteExtDictFiles = new ArrayList<String>(2);
-        String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
-        if (remoteExtDictCfg != null) {
-
-            String[] filePaths = remoteExtDictCfg.split(";");
-            if (filePaths != null) {
-                for (String filePath : filePaths) {
-                    if (filePath != null && !"".equals(filePath.trim())) {
-                        remoteExtDictFiles.add(filePath);
-
-                    }
-                }
-            }
-        }
-        return remoteExtDictFiles;
-    }
-
-    public List<String> getExtStopWordDictionarys() {
-        List<String> extStopWordDictFiles = new ArrayList<String>(2);
-        String extStopWordDictCfg = props.getProperty(EXT_STOP);
-        if (extStopWordDictCfg != null) {
-
-            String[] filePaths = extStopWordDictCfg.split(";");
-            if (filePaths != null) {
-                for (String filePath : filePaths) {
-                    if (filePath != null && !"".equals(filePath.trim())) {
-                        Path file = PathUtils.get(filePath.trim());
-                        extStopWordDictFiles.add(file.toString());
-
-                    }
-                }
-            }
-        }
-        return extStopWordDictFiles;
-    }
-
-    public List<String> getRemoteExtStopWordDictionarys() {
-        List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
-        String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
-        if (remoteExtStopWordDictCfg != null) {
-
-            String[] filePaths = remoteExtStopWordDictCfg.split(";");
-            if (filePaths != null) {
-                for (String filePath : filePaths) {
-                    if (filePath != null && !"".equals(filePath.trim())) {
-                        remoteExtStopWordDictFiles.add(filePath);
-
-                    }
-                }
-            }
-        }
-        return remoteExtStopWordDictFiles;
-    }
-
-    public String getDictRoot() {
-        return conf_dir.toAbsolutePath().toString();
-    }
-
-    private Path getConfigInPluginDir() {
+    public Path getConfigInPluginDir() {
         return PathUtils
                 .get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
                         .getParent(), "config")
                 .toAbsolutePath();
     }
 
+    public boolean isUseSmart() {
+        return useSmart;
+    }
+
+    public Configuration setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+        return this;
+    }
+
+    public Environment getEnvironment() {
+        return environment;
+    }
+
+    public Settings getSettings() {
+        return settings;
+    }
+
+    public boolean isEnableRemoteDict() {
+        return enableRemoteDict;
+    }
+
+    public boolean isEnableLowercase() {
+        return enableLowercase;
+    }
 }
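To make the defaults concrete — a sketch, not part of the commit, assuming only an `Environment` is at hand; `Settings.EMPTY` stands in for an analyzer defined with no explicit options:

```java
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;

class DefaultsSketch {
    static void check(Environment env) {
        // With no settings given, the commit's defaults apply:
        // smart mode off, lowercase folding on, remote dictionaries on.
        // Note: the constructor also triggers Dictionary.initial(this).
        Configuration cfg = new Configuration(env, Settings.EMPTY);
        assert !cfg.isUseSmart();
        assert cfg.isEnableLowercase();
        assert cfg.isEnableRemoteDict();
    }
}
```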
AnalyzeContext.java
@@ -32,6 +32,7 @@ import java.util.LinkedList;
 import java.util.Map;
 import java.util.Set;
 
+import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;
 
 /**
@@ -72,12 +73,11 @@ class AnalyzeContext {
     private Map<Integer , LexemePath> pathMap;
     // final segmentation result set
     private LinkedList<Lexeme> results;
-    private boolean useSmart;
     // segmenter configuration
-    // private Configuration cfg;
+    private Configuration cfg;
 
-    public AnalyzeContext(boolean useSmart){
-        this.useSmart = useSmart;
+    public AnalyzeContext(Configuration configuration){
+        this.cfg = configuration;
         this.segmentBuff = new char[BUFF_SIZE];
         this.charTypes = new int[BUFF_SIZE];
         this.buffLocker = new HashSet<String>();
@@ -139,7 +139,7 @@ class AnalyzeContext {
      */
     void initCursor(){
         this.cursor = 0;
-        this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+        this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
         this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
     }
 
@@ -151,7 +151,7 @@ class AnalyzeContext {
     boolean moveCursor(){
         if(this.cursor < this.available - 1){
             this.cursor++;
-            this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+            this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
             this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
             return true;
         }else{
@@ -345,7 +345,7 @@ class AnalyzeContext {
      */
     private void compound(Lexeme result){
 
-        if(!this.useSmart){
+        if(!this.cfg.isUseSmart()){
            return ;
        }
        // merge numerals and quantifiers
CharacterUtil.java
@@ -86,14 +86,14 @@ class CharacterUtil {
     * @param input
     * @return char
     */
-    static char regularize(char input){
+    static char regularize(char input,boolean lowercase){
        if (input == 12288) {
            input = (char) 32;
 
        }else if (input > 65280 && input < 65375) {
            input = (char) (input - 65248);
 
-       }else if (input >= 'A' && input <= 'Z') {
+       }else if (input >= 'A' && input <= 'Z' && lowercase) {
            input += 32;
        }
 
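The branches in `regularize` are exclusive (`else if`), which the new `lowercase` flag makes worth spelling out — a sketch of the resulting behavior; it has to live in `org.wltea.analyzer.core` because `regularize` is package-private:

```java
package org.wltea.analyzer.core;

class RegularizeSketch {
    static void demo() {
        char a = CharacterUtil.regularize('A', true);   // 'a' — ASCII uppercase folds when enabled
        char b = CharacterUtil.regularize('A', false);  // 'A' — case preserved when disabled
        // Because the chain is else-if, a full-width letter is only
        // width-normalized in one call; it is not also lowercased.
        char c = CharacterUtil.regularize('Ａ', true);  // 'A'
        char d = CharacterUtil.regularize('　', true);  // ' ' — ideographic space → ASCII space
        System.out.println("" + a + b + c + d);
    }
}
```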
IKSegmenter.java
@@ -23,10 +23,7 @@
  */
 package org.wltea.analyzer.core;
 
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -47,16 +44,16 @@ public final class IKSegmenter {
     private List<ISegmenter> segmenters;
     // arbitrator for ambiguous segmentations
     private IKArbitrator arbitrator;
-    private boolean useSmart = false;
+    private Configuration configuration;
 
 
     /**
      * IK segmenter constructor
      * @param input
      */
-    public IKSegmenter(Reader input ,boolean useSmart){
+    public IKSegmenter(Reader input ,Configuration configuration){
         this.input = input;
-        this.useSmart = useSmart;
+        this.configuration = configuration;
         this.init();
     }
 
@@ -66,7 +63,7 @@ public final class IKSegmenter {
      */
     private void init(){
         // initialize the analysis context
-        this.context = new AnalyzeContext(useSmart);
+        this.context = new AnalyzeContext(configuration);
         // load the sub-segmenters
         this.segmenters = this.loadSegmenters();
         // load the ambiguity arbitrator
@@ -127,7 +124,7 @@ public final class IKSegmenter {
             }
         }
         // resolve segmentation ambiguities
-        this.arbitrator.process(context, useSmart);
+        this.arbitrator.process(context, configuration.isUseSmart());
         // output results and handle unsegmented single CJK characters
         context.outputToResult();
         // record the buffer offset of this pass
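Driving the segmenter directly now takes a `Configuration` instead of a boolean. A sketch using the existing `next()`/`getLexemeText()` API, with the `Configuration` assumed to be built as shown earlier:

```java
import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

class SegmenterSketch {
    static void run(Configuration cfg) throws IOException {
        // Segment a short mixed CJK/Latin string and print each lexeme.
        IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国MN"), cfg);
        Lexeme lex;
        while ((lex = seg.next()) != null) {
            System.out.println(lex.getLexemeText());
        }
    }
}
```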
Dictionary.java
@@ -33,9 +33,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
+import java.util.*;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
@@ -49,6 +47,7 @@ import org.apache.http.impl.client.HttpClients;
 import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
 
 /**
@@ -88,10 +87,53 @@ public class Dictionary {
     public static final String PATH_DIC_PREP = "preposition.dic";
     public static final String PATH_DIC_STOP = "stopword.dic";
 
-    private Dictionary() {
+    private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
+    private final static String EXT_DICT = "ext_dict";
+    private final static String REMOTE_EXT_DICT = "remote_ext_dict";
+    private final static String EXT_STOP = "ext_stopwords";
+    private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
+
+    private Path conf_dir;
+    private Properties props;
+
+    private Dictionary(Configuration cfg) {
+        this.configuration = cfg;
+        this.props = new Properties();
+        this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
+        Path configFile = conf_dir.resolve(FILE_NAME);
+
+        InputStream input = null;
+        try {
+            logger.info("try load config from {}", configFile);
+            input = new FileInputStream(configFile.toFile());
+        } catch (FileNotFoundException e) {
+            conf_dir = cfg.getConfigInPluginDir();
+            configFile = conf_dir.resolve(FILE_NAME);
+            try {
+                logger.info("try load config from {}", configFile);
+                input = new FileInputStream(configFile.toFile());
+            } catch (FileNotFoundException ex) {
+                // We should report origin exception
+                logger.error("ik-analyzer", e);
+            }
+        }
+        if (input != null) {
+            try {
+                props.loadFromXML(input);
+            } catch (InvalidPropertiesFormatException e) {
+                logger.error("ik-analyzer", e);
+            } catch (IOException e) {
+                logger.error("ik-analyzer", e);
+            }
+        }
     }
 
+    public String getProperty(String key){
+        if(props!=null){
+            return props.getProperty(key);
+        }
+        return null;
+    }
+
     /**
      * Dictionary initialization. IK Analyzer initializes its dictionaries through static methods of the Dictionary class;
      * they are only loaded when the class is actually used, which lengthens the first segmentation call. This method provides a way to initialize the dictionaries during application startup.
@@ -102,8 +144,8 @@ public class Dictionary {
         if (singleton == null) {
             synchronized (Dictionary.class) {
                 if (singleton == null) {
-                    singleton = new Dictionary();
-                    singleton.configuration = cfg;
+                    singleton = new Dictionary(cfg);
                     singleton.loadMainDict();
                     singleton.loadSurnameDict();
                     singleton.loadQuantifierDict();
@@ -111,13 +153,15 @@ public class Dictionary {
                     singleton.loadPrepDict();
                     singleton.loadStopWordDict();
 
-                    // start the monitor threads
-                    for (String location : cfg.getRemoteExtDictionarys()) {
-                        // 10s is the initial delay (adjustable); 60 is the interval, in seconds
-                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
-                    }
-                    for (String location : cfg.getRemoteExtStopWordDictionarys()) {
-                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+                    if(cfg.isEnableRemoteDict()){
+                        // start the monitor threads
+                        for (String location : singleton.getRemoteExtDictionarys()) {
+                            // 10s is the initial delay (adjustable); 60 is the interval, in seconds
+                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+                        }
+                        for (String location : singleton.getRemoteExtStopWordDictionarys()) {
+                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
+                        }
                     }
 
                     return singleton;
@@ -127,6 +171,77 @@ public class Dictionary {
         return singleton;
     }
 
+    public List<String> getExtDictionarys() {
+        List<String> extDictFiles = new ArrayList<String>(2);
+        String extDictCfg = getProperty(EXT_DICT);
+        if (extDictCfg != null) {
+
+            String[] filePaths = extDictCfg.split(";");
+            for (String filePath : filePaths) {
+                if (filePath != null && !"".equals(filePath.trim())) {
+                    Path file = PathUtils.get(filePath.trim());
+                    extDictFiles.add(file.toString());
+
+                }
+            }
+        }
+        return extDictFiles;
+    }
+
+    public List<String> getRemoteExtDictionarys() {
+        List<String> remoteExtDictFiles = new ArrayList<String>(2);
+        String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
+        if (remoteExtDictCfg != null) {
+
+            String[] filePaths = remoteExtDictCfg.split(";");
+            for (String filePath : filePaths) {
+                if (filePath != null && !"".equals(filePath.trim())) {
+                    remoteExtDictFiles.add(filePath);
+
+                }
+            }
+        }
+        return remoteExtDictFiles;
+    }
+
+    public List<String> getExtStopWordDictionarys() {
+        List<String> extStopWordDictFiles = new ArrayList<String>(2);
+        String extStopWordDictCfg = getProperty(EXT_STOP);
+        if (extStopWordDictCfg != null) {
+
+            String[] filePaths = extStopWordDictCfg.split(";");
+            for (String filePath : filePaths) {
+                if (filePath != null && !"".equals(filePath.trim())) {
+                    Path file = PathUtils.get(filePath.trim());
+                    extStopWordDictFiles.add(file.toString());
+
+                }
+            }
+        }
+        return extStopWordDictFiles;
+    }
+
+    public List<String> getRemoteExtStopWordDictionarys() {
+        List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
+        String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
+        if (remoteExtStopWordDictCfg != null) {
+
+            String[] filePaths = remoteExtStopWordDictCfg.split(";");
+            for (String filePath : filePaths) {
+                if (filePath != null && !"".equals(filePath.trim())) {
+                    remoteExtStopWordDictFiles.add(filePath);
+
+                }
+            }
+        }
+        return remoteExtStopWordDictFiles;
+    }
+
+    public String getDictRoot() {
+        return conf_dir.toAbsolutePath().toString();
+    }
+
+
     /**
      * Get the dictionary singleton instance
      *
@@ -139,6 +254,7 @@ public class Dictionary {
         return singleton;
     }
 
+
     /**
      * Batch-load new words
      *
@@ -224,7 +340,7 @@ public class Dictionary {
         _MainDict = new DictSegment((char) 0);
 
         // read the main dictionary file
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
 
         InputStream is = null;
         try {
@@ -267,13 +383,13 @@ public class Dictionary {
      */
     private void loadExtDict() {
         // load the extension dictionary configuration
-        List<String> extDictFiles = configuration.getExtDictionarys();
+        List<String> extDictFiles = getExtDictionarys();
         if (extDictFiles != null) {
             InputStream is = null;
             for (String extDictName : extDictFiles) {
                 // read each extension dictionary file
                 logger.info("[Dict Loading] " + extDictName);
-                Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
+                Path file = PathUtils.get(getDictRoot(), extDictName);
                 try {
                     is = new FileInputStream(file.toFile());
                 } catch (FileNotFoundException e) {
@@ -315,7 +431,7 @@ public class Dictionary {
      * Load remote extension dictionaries into the main lexicon
      */
     private void loadRemoteExtDict() {
-        List<String> remoteExtDictFiles = configuration.getRemoteExtDictionarys();
+        List<String> remoteExtDictFiles = getRemoteExtDictionarys();
         for (String location : remoteExtDictFiles) {
             logger.info("[Dict Loading] " + location);
             List<String> lists = getRemoteWords(location);
@@ -386,7 +502,7 @@ public class Dictionary {
         _StopWords = new DictSegment((char) 0);
 
         // read the main dictionary file
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
 
         InputStream is = null;
         try {
@@ -420,14 +536,14 @@ public class Dictionary {
         }
 
         // load extension stopword dictionaries
-        List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
+        List<String> extStopWordDictFiles = getExtStopWordDictionarys();
         if (extStopWordDictFiles != null) {
             is = null;
             for (String extStopWordDictName : extStopWordDictFiles) {
                 logger.info("[Dict Loading] " + extStopWordDictName);
 
                 // read each extension dictionary file
-                file = PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
+                file = PathUtils.get(getDictRoot(), extStopWordDictName);
                 try {
                     is = new FileInputStream(file.toFile());
                 } catch (FileNotFoundException e) {
@@ -465,7 +581,7 @@ public class Dictionary {
         }
 
         // load remote stopword dictionaries
-        List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
+        List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
         for (String location : remoteExtStopWordDictFiles) {
             logger.info("[Dict Loading] " + location);
             List<String> lists = getRemoteWords(location);
@@ -492,7 +608,7 @@ public class Dictionary {
         // create a quantifier dictionary instance
         _QuantifierDict = new DictSegment((char) 0);
         // read the quantifier dictionary file
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
         InputStream is = null;
         try {
             is = new FileInputStream(file.toFile());
@@ -527,7 +643,7 @@ public class Dictionary {
     private void loadSurnameDict() {
 
         _SurnameDict = new DictSegment((char) 0);
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
         InputStream is = null;
         try {
             is = new FileInputStream(file.toFile());
@@ -563,7 +679,7 @@ public class Dictionary {
     private void loadSuffixDict() {
 
         _SuffixDict = new DictSegment((char) 0);
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
         InputStream is = null;
         try {
             is = new FileInputStream(file.toFile());
@@ -598,7 +714,7 @@ public class Dictionary {
     private void loadPrepDict() {
 
         _PrepDict = new DictSegment((char) 0);
-        Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
+        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
         InputStream is = null;
         try {
             is = new FileInputStream(file.toFile());
@@ -634,7 +750,7 @@ public class Dictionary {
     public void reLoadMainDict() {
         logger.info("重新加载词典...");
         // reload in a fresh instance to limit impact on the dictionary currently in use
-        Dictionary tmpDict = new Dictionary();
+        Dictionary tmpDict = new Dictionary(configuration);
         tmpDict.configuration = getSingleton().configuration;
         tmpDict.loadMainDict();
         tmpDict.loadStopWordDict();
@@ -643,4 +759,4 @@ public class Dictionary {
         logger.info("重新加载词典完毕...");
     }
 
 }
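Worth noting how initialization now flows: `Configuration`'s constructor calls `Dictionary.initial(this)`, so callers no longer invoke `Dictionary.initial` themselves. A sketch of the resulting bootstrap, with `env` and `settings` assumed:

```java
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

class BootstrapSketch {
    static Dictionary boot(Environment env, Settings settings) {
        // Building the Configuration is now enough: its constructor invokes
        // Dictionary.initial(this), which loads the dictionaries once and,
        // when enable_remote_dict is true, schedules the Monitor threads.
        new Configuration(env, settings);
        return Dictionary.getSingleton();
    }
}
```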
IKAnalyzer.java
@@ -26,6 +26,7 @@ package org.wltea.analyzer.lucene;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.wltea.analyzer.cfg.Configuration;
 
 /**
  * IK analyzer, an implementation of the Lucene Analyzer interface
@@ -33,15 +34,7 @@ import org.apache.lucene.analysis.Tokenizer;
  */
 public final class IKAnalyzer extends Analyzer{
 
-    private boolean useSmart;
-
-    public boolean useSmart() {
-        return useSmart;
-    }
-
-    public void setUseSmart(boolean useSmart) {
-        this.useSmart = useSmart;
-    }
+    private Configuration configuration;
 
     /**
      * IK analyzer — Lucene Analyzer implementation class
@@ -54,11 +47,11 @@ public final class IKAnalyzer extends Analyzer{
     /**
      * IK analyzer — Lucene Analyzer implementation class
      *
-     * @param useSmart when true, the analyzer performs smart segmentation
+     * @param configuration IK configuration
      */
-    public IKAnalyzer(boolean useSmart){
+    public IKAnalyzer(Configuration configuration){
         super();
-        this.useSmart = useSmart;
+        this.configuration = configuration;
     }
 
 
@@ -67,7 +60,7 @@ public final class IKAnalyzer extends Analyzer{
      */
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer _IKTokenizer = new IKTokenizer(useSmart);
+        Tokenizer _IKTokenizer = new IKTokenizer(configuration);
         return new TokenStreamComponents(_IKTokenizer);
     }
 
|
|||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.wltea.analyzer.cfg.Configuration;
|
||||||
import org.wltea.analyzer.core.IKSegmenter;
|
import org.wltea.analyzer.core.IKSegmenter;
|
||||||
import org.wltea.analyzer.core.Lexeme;
|
import org.wltea.analyzer.core.Lexeme;
|
||||||
|
|
||||||
@ -64,16 +65,15 @@ public final class IKTokenizer extends Tokenizer {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Lucene 4.0 Tokenizer适配器类构造函数
|
* Lucene 4.0 Tokenizer适配器类构造函数
|
||||||
* @param in
|
|
||||||
*/
|
*/
|
||||||
public IKTokenizer(boolean useSmart){
|
public IKTokenizer(Configuration configuration){
|
||||||
super();
|
super();
|
||||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
termAtt = addAttribute(CharTermAttribute.class);
|
termAtt = addAttribute(CharTermAttribute.class);
|
||||||
typeAtt = addAttribute(TypeAttribute.class);
|
typeAtt = addAttribute(TypeAttribute.class);
|
||||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
_IKImplement = new IKSegmenter(input,useSmart);
|
_IKImplement = new IKSegmenter(input,configuration);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* (non-Javadoc)
|
/* (non-Javadoc)
|
||||||
|
IKAnalzyerDemo.java (deleted file)
@@ -1,90 +0,0 @@
-/**
- * IK Chinese word segmentation, version 5.0.1
- * IK Analyzer release 5.0.1
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
- * Copyright 2012, Oolong Studio
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- *
- */
-package org.wltea.analyzer.sample;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-import org.wltea.analyzer.lucene.IKAnalyzer;
-
-/**
- * Demo of word segmentation with IKAnalyzer
- * 2012-10-22
- *
- */
-public class IKAnalzyerDemo {
-
-    public static ESLogger logger= Loggers.getLogger("ik-analyzer");
-
-    public static void main(String[] args){
-        // build the IK analyzer, using smart segmentation mode
-        Analyzer analyzer = new IKAnalyzer(true);
-
-        // obtain Lucene's TokenStream object
-        TokenStream ts = null;
-        try {
-            ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
-            // ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
-            // token offset attribute
-            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
-            // token text attribute
-            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
-            // token type attribute
-            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
-
-
-            // reset the TokenStream (resets the StringReader)
-            ts.reset();
-            // iterate over the segmentation results
-            while (ts.incrementToken()) {
-                System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
-            }
-            // close the TokenStream (closes the StringReader)
-            ts.end();   // Perform end-of-stream operations, e.g. set the final offset.
-
-        } catch (IOException e) {
-            logger.error(e.getMessage(), e);
-        } finally {
-            // release all TokenStream resources
-            if(ts != null){
-                try {
-                    ts.close();
-                } catch (IOException e) {
-                    logger.error(e.getMessage(), e);
-                }
-            }
-        }
-
-    }
-
-}
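The deleted demo constructed the analyzer with `new IKAnalyzer(true)`, which no longer compiles after this commit; the equivalent under the new API — a sketch, with `env` and `settings` assumed in scope — is:

```java
import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKAnalyzer;

class DemoMigrationSketch {
    static Analyzer smartAnalyzer(Environment env, Settings settings) {
        // was: new IKAnalyzer(true)
        return new IKAnalyzer(new Configuration(env, settings).setUseSmart(true));
    }
}
```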
LuceneIndexAndSearchDemo.java (deleted file)
@@ -1,150 +0,0 @@
-/**
- * IK Chinese word segmentation, version 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
- * Copyright 2012, Oolong Studio
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- *
- */
-package org.wltea.analyzer.sample;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.queryparser.classic.ParseException;
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-import org.wltea.analyzer.lucene.IKAnalyzer;
-
-
-
-
-/**
- * Demo of Lucene indexing and search with IKAnalyzer
- * 2012-3-2
- *
- * Written against the Lucene 4.0 API
- *
- */
-public class LuceneIndexAndSearchDemo {
-
-    public static ESLogger logger= Loggers.getLogger("ik-analyzer");
-
-    /**
-     * Simulation:
-     * create an index holding a single record, then search it
-     * @param args
-     */
-    public static void main(String[] args){
-        // Lucene document field name
-        String fieldName = "text";
-        // content to index
-        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
-
-        // instantiate the IKAnalyzer
-        Analyzer analyzer = new IKAnalyzer(true);
-
-        Directory directory = null;
-        IndexWriter iwriter = null;
-        IndexReader ireader = null;
-        IndexSearcher isearcher = null;
-        try {
-            // create an in-memory index
-            directory = new RAMDirectory();
-
-            // configure IndexWriterConfig
-            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
-            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
-            iwriter = new IndexWriter(directory , iwConfig);
-            // write the index
-            Document doc = new Document();
-            doc.add(new StringField("ID", "10000", Field.Store.YES));
-            doc.add(new TextField(fieldName, text, Field.Store.YES));
-            iwriter.addDocument(doc);
-            iwriter.close();
-
-
-            // search phase **********************************
-            // instantiate the searcher
-            ireader = DirectoryReader.open(directory);
-            isearcher = new IndexSearcher(ireader);
-
-            String keyword = "中文分词工具包";
-            // build a Query object with the QueryParser
-            QueryParser qp = new QueryParser(fieldName, analyzer);
-            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-            Query query = qp.parse(keyword);
-            System.out.println("Query = " + query);
-
-            // fetch the 5 highest-scoring hits
-            TopDocs topDocs = isearcher.search(query , 5);
-            System.out.println("命中:" + topDocs.totalHits);
-            // print the results
-            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-            for (int i = 0; i < topDocs.totalHits; i++){
-                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
-                System.out.println("内容:" + targetDoc.toString());
-            }
-
-        } catch (CorruptIndexException e) {
-            logger.error(e.getMessage(), e);
-        } catch (LockObtainFailedException e) {
-            logger.error(e.getMessage(), e);
-        } catch (IOException e) {
-            logger.error(e.getMessage(), e);
-        } catch (ParseException e) {
-            logger.error(e.getMessage(), e);
-        } finally{
-            if(ireader != null){
-                try {
-                    ireader.close();
-                } catch (IOException e) {
-                    logger.error(e.getMessage(), e);
-                }
-            }
-            if(directory != null){
-                try {
-                    directory.close();
-                } catch (IOException e) {
-                    logger.error(e.getMessage(), e);
-                }
-            }
-        }
-    }
-}
(deleted IntelliJ UML class-diagram file; original file name not shown on this page)
@@ -1,83 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<Diagram>
-  <ID>JAVA</ID>
-  <OriginalElement>org.elasticsearch.index.analysis.IKAnalysisBinderProcessor</OriginalElement>
-  <nodes>
-    <node x="1244.0" y="553.0">org.elasticsearch.index.analysis.IKAnalysisBinderProcessor</node>
-    <node x="2212.0" y="489.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings</node>
-    <node x="1316.0" y="0.0">java.lang.Object</node>
-    <node x="1244.0" y="329.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor</node>
-    <node x="616.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings</node>
-    <node x="0.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings</node>
-    <node x="1608.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings</node>
-  </nodes>
-  <notes />
-  <edges>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
-      <point x="152.0" y="-77.0" />
-      <point x="1072.0" y="469.0" />
-      <point x="1347.2" y="469.0" />
-      <point x="-68.79999999999995" y="55.0" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings" target="java.lang.Object">
-      <point x="-149.0" y="-77.0" />
-      <point x="149.0" y="299.0" />
-      <point x="1336.0" y="299.0" />
-      <point x="-80.0" y="139.5" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor" target="java.lang.Object">
-      <point x="0.0" y="-55.0" />
-      <point x="0.0" y="139.5" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
-      <point x="-180.5" y="-98.0" />
-      <point x="2392.5" y="459.0" />
-      <point x="1553.6" y="459.0" />
-      <point x="137.5999999999999" y="55.0" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
-      <point x="149.0" y="-77.0" />
-      <point x="447.0" y="459.0" />
-      <point x="1278.4" y="459.0" />
-      <point x="-137.5999999999999" y="55.0" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.IKAnalysisBinderProcessor" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
-      <point x="0.0" y="-34.0" />
-      <point x="0.0" y="55.0" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings" target="java.lang.Object">
-      <point x="-152.0" y="-77.0" />
-      <point x="768.0" y="309.0" />
-      <point x="1376.0" y="309.0" />
-      <point x="-40.0" y="139.5" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings" target="java.lang.Object">
-      <point x="180.5" y="-98.0" />
-      <point x="2753.5" y="299.0" />
-      <point x="1496.0" y="299.0" />
-      <point x="80.0" y="139.5" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings" target="java.lang.Object">
-      <point x="146.0" y="-77.0" />
-      <point x="2046.0" y="309.0" />
-      <point x="1456.0" y="309.0" />
-      <point x="40.0" y="139.5" />
-    </edge>
-    <edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
-      <point x="-146.0" y="-77.0" />
-      <point x="1754.0" y="469.0" />
-      <point x="1484.8" y="469.0" />
-      <point x="68.79999999999995" y="55.0" />
-    </edge>
-  </edges>
-  <settings layout="Hierarchic Group" zoom="1.0" x="110.5" y="89.0" />
-  <SelectedNodes />
-  <Categories>
-    <Category>Fields</Category>
-    <Category>Methods</Category>
-    <Category>Constructors</Category>
-    <Category>Inner Classes</Category>
-    <Category>Properties</Category>
-  </Categories>
-</Diagram>