Compare commits

106 commits (SHA1):

9338c19104  0fb53ac32c  b637708ba0  9c47725ea0  8e36b3240e  e0157d5f39  0fccc038e2  5a1b8c8da6
1375ca6d39  4619effa15  5f53f1a5bf  904a7493ea  06e8a23d18  a1d6ba8ca2  90c9b58354  ba8bb85f31
125ac3c5e5  f0dd522e60  9eaa2b90eb  9873489ba7  949531572b  1d750a9bdd  3a7a81c29d  1422d5b96c
9fff6379ef  5190dba198  83fa2ff8b2  0222529290  5e8d0df2be  36e6d2d00b  de1da42d38  3dcedde9e4
21a859a48d  816b8ddd4b  7028b9ea05  4ab2616a96  7c9b4771b3  22de5be444  0e8ddbd749  7a1445fdda
353cefd5b8  0922152fb8  cc01c881af  eb21c796d8  5828cb1c72  dc739d2cee  2851cc2501  b32366489b
6a55e3af76  7636e1a234  1f2dfbffd5  2541e35991  55a4f05666  6309787f94  c4c498a3aa  8da12f3492
50230bfa64  adf282f115  1a62eb1651  455b672e5a  2d16b56728  1987d6ace4  60e5e7768f  7dfeb25c8f
e7d968ffa8  a1fea66be8  dbb45eec56  c5a1553850  400206511d  d1d216a195  494576998a  b85b487569
ffb88ee0fa  e08d9d9be5  754572b2b9  e0ada4440e  17f6e982a5  b6ec9c0a00  7c92a10fc0  f28ec3c3c2
bfcebccd0f  82c6369501  e637c4b1b2  ac2b78acd0  168a798da8  efb393f3d7  2df086b082  b11aec33c2
7e86d7390a  341b586373  b662596939  82432f1059  4373cf7c94  f1d59921fe  c94ef3884a  26fe905cc6
7e29998ab9  cd9dfdf225  2dfe76375a  ca2bfe5732  3bd1490225  c2244ccd80  800097d776  e14b7a0df7
ffe9d9b8e7  5ec0bd5bd6
.github/FUNDING.yml (vendored, new file, +2 lines)

```diff
@@ -0,0 +1,2 @@
+patreon: medcl
+custom: ["https://www.buymeacoffee.com/medcl"]
```
.travis.yml (16 changed lines)

```diff
@@ -1,11 +1,9 @@
+sudo: required
+jdk:
+  - oraclejdk8
+install: true
+script:
+  - sudo apt-get update && sudo apt-get install oracle-java8-installer
+  - java -version
 language: java
 script: mvn clean package
-deploy:
-  provider: releases
-  api_key:
-    secure: llxJZlRYBIWINl5XI42RpEe+jTxlmSP6MX+oTNZa4oFjEeN9Kdd1G8+S3HSIhCc31RoF/2zeNsM9OehRi1O6bweNSQ9vjlKZQPD8FYcHaHpYW0U7h/OMbEeC794fAghm9ZsmOTNymdvbAXL14nJTrwOW9W8VqoZT9Jx7Ejad63Y=
-  file: target/releases/elasticsearch-analysis-ik-*.zip
-  file_glob: true
-  on:
-    repo: medcl/elasticsearch-analysis-ik
-    tags: true
```
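For context on the removed deploy stanza: with `tags: true` under `on:`, Travis only ran the GitHub Releases upload for tagged builds, so a release was cut by pushing a tag — a small sketch of that trigger, assuming push rights to the repository:

```bash
# Tagged builds were the only ones the (now removed) deploy step would
# upload to GitHub Releases.
git tag v1.8.1
git push origin v1.8.1
```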
README.md (98 changed lines)

````diff
@@ -10,43 +10,41 @@ Versions

 IK version | ES version
 -----------|-----------
-master | 2.2.1 -> master
+master | 7.x -> master
+6.x| 6.x
+5.x| 5.x
+1.10.6 | 2.4.6
+1.9.5 | 2.3.5
 1.8.1 | 2.2.1
-1.8.0 | 2.2.0
 1.7.0 | 2.1.1
-1.6.1 | 2.1.0
 1.5.0 | 2.0.0
-1.4.1 | 1.7.2
-1.4.0 | 1.6.0
-1.3.0 | 1.5.0
-1.2.9 | 1.4.0
-1.2.8 | 1.3.2
-1.2.7 | 1.2.1
 1.2.6 | 1.0.0
 1.2.5 | 0.90.x
 1.1.3 | 0.20.x
-1.1.2 | 0.19.x
 1.0.0 | 0.16.2 -> 0.19.0

 Install
 -------

-1.compile
+1.download or compile

-`mvn package`
+* optional 1 - download pre-built package from here: https://github.com/medcl/elasticsearch-analysis-ik/releases

-copy and unzip `target/releases/elasticsearch-analysis-ik-{version}.zip` to `your-es-root/plugins/ik`
+create plugin folder `cd your-es-root/plugins/ && mkdir ik`
+
+unzip plugin to folder `your-es-root/plugins/ik`
+
+* optional 2 - use elasticsearch-plugin to install (supported from version v5.5.1):
+
+```
+./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.3.0/elasticsearch-analysis-ik-6.3.0.zip
+```
+
+NOTE: replace `6.3.0` with your own elasticsearch version

 2.restart elasticsearch

-Tips:
-
-ik_max_word: performs the finest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”, exhausting all plausible combinations;
-
-ik_smart: performs the coarsest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国,国歌”.

 #### Quick Example
````
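Either install route above can be sanity-checked before restarting; `elasticsearch-plugin list` prints the plugins Elasticsearch has registered — a minimal check, assuming a default layout where the command lives in `bin/`:

```bash
# "analysis-ik" should appear once the plugin folder or zip install succeeded.
./bin/elasticsearch-plugin list
```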
````diff
@@ -59,52 +57,41 @@ curl -XPUT http://localhost:9200/index

 2.create a mapping

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_mapping -d'
+curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
 {
-    "fulltext": {
-        "_all": {
-            "analyzer": "ik_max_word",
-            "search_analyzer": "ik_max_word",
-            "term_vector": "no",
-            "store": "false"
-        },
         "properties": {
             "content": {
-                "type": "string",
+                "type": "text",
-                "store": "no",
-                "term_vector": "with_positions_offsets",
                 "analyzer": "ik_max_word",
-                "search_analyzer": "ik_max_word",
+                "search_analyzer": "ik_smart"
-                "include_in_all": "true",
-                "boost": 8
-            }
         }
 }

 }'
 ```

 3.index some docs

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/1 -d'
+curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
 {"content":"美国留给伊拉克的是个烂摊子吗"}
 '
 ```

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/2 -d'
+curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
 {"content":"公安部:各地校车将享最高路权"}
 '
 ```

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/3 -d'
+curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
 {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
 '
 ```

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/4 -d'
+curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
 {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
 '
 ```
````
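Before moving on to queries, it can help to confirm which tokens the `content` field will actually index; a small sketch using the standard `_analyze` API with one of the sample documents above:

```bash
# Show the ik_max_word token stream for the first sample document.
curl -XGET "http://localhost:9200/index/_analyze" -H 'Content-Type: application/json' -d'
{
  "analyzer": "ik_max_word",
  "text": "美国留给伊拉克的是个烂摊子吗"
}'
```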
````diff
@@ -112,9 +99,9 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -d'

 4.query with highlighting

 ```bash
-curl -XPOST http://localhost:9200/index/fulltext/_search -d'
+curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
 {
-    "query" : { "term" : { "content" : "中国" }},
+    "query" : { "match" : { "content" : "中国" }},
     "highlight" : {
         "pre_tags" : ["<tag1>", "<tag2>"],
         "post_tags" : ["</tag1>", "</tag2>"],
````
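Because the mapping pairs `ik_max_word` at index time with `ik_smart` at search time, phrase queries work naturally as well; a hedged sketch (not part of the original README) using the stock `match_phrase` query:

```bash
# match_phrase requires the coarser ik_smart search tokens to appear in order.
curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
{
    "query" : { "match_phrase" : { "content" : "中国渔船" } }
}'
```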
````diff
@@ -176,7 +163,8 @@ Result

 ### Dictionary Configuration

-#### `plugins/elasticsearch-analysis-ik-*/config/ik/IKAnalyzer.cfg.xml`
+`IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml`
+or `{plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml`

 ```xml
 <?xml version="1.0" encoding="UTF-8"?>
````
````diff
@@ -230,12 +218,38 @@ have fun.

 ```bash
 git clone https://github.com/medcl/elasticsearch-analysis-ik
 cd elasticsearch-analysis-ik
+git checkout tags/{version}
 mvn clean
 mvn compile
 mvn package
 ```

-copy & unzip file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip to your elasticsearch's folder: plugins/ik
+copy and unzip the release file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip into your elasticsearch plugin directory, e.g. plugins/ik
+restart elasticsearch

+3. the tokenization test fails
+please test through the analyze API of a concrete index rather than calling the analyze API directly,
+e.g.:
+```bash
+curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
+{
+   "text":"中华人民共和国MN","tokenizer": "my_ik"
+}'
+```

+4. what is the difference between ik_max_word and ik_smart?

+ik_max_word: performs the finest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”, exhausting all plausible combinations; suitable for Term Query;

+ik_smart: performs the coarsest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国,国歌”; suitable for Phrase queries.
````
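The contrast is easiest to see by running the same text through both analyzers — a minimal sketch, assuming the `index` from the Quick Example exists:

```bash
# Fine-grained segmentation: enumerates every plausible sub-word.
curl -XGET "http://localhost:9200/index/_analyze" -H 'Content-Type: application/json' -d'
{ "analyzer": "ik_max_word", "text": "中华人民共和国国歌" }'

# Coarse-grained segmentation: longest non-overlapping terms only.
curl -XGET "http://localhost:9200/index/_analyze" -H 'Content-Type: application/json' -d'
{ "analyzer": "ik_smart", "text": "中华人民共和国国歌" }'
```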
````diff
+Changes
+------
+*since v5.0.0*
+
+- removed the analyzer and tokenizer named `ik`; please use `ik_smart` and `ik_max_word` instead

 Thanks
````
IKAnalyzer.cfg.xml

```diff
@@ -3,9 +3,9 @@
 <properties>
 	<comment>IK Analyzer extension configuration</comment>
 	<!-- users can configure their own extension dictionary here -->
-	<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
+	<entry key="ext_dict"></entry>
 	<!-- users can configure their own extension stopword dictionary here -->
-	<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
+	<entry key="ext_stopwords"></entry>
 	<!-- users can configure a remote extension dictionary here -->
 	<!-- <entry key="remote_ext_dict">words_location</entry> -->
 	<!-- users can configure a remote extension stopword dictionary here -->
```
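An extension dictionary is a plain UTF-8 text file with one entry per line, referenced relative to the config folder; a sketch of wiring one back in (the path `custom/mydict.dic` mirrors the old default and is only an example):

```bash
# Create a one-word-per-line dictionary and point ext_dict at it, then restart ES.
mkdir -p custom
printf '云计算\n区块链\n' > custom/mydict.dic
# In IKAnalyzer.cfg.xml:
#   <entry key="ext_dict">custom/mydict.dic</entry>
```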
Deleted file (a custom extension dictionary, 14 lines)

```diff
@@ -1,14 +0,0 @@
-medcl
-elastic
-elasticsearch
-kogstash
-kibana
-marvel
-shield
-watcher
-beats
-packetbeat
-filebeat
-topbeat
-metrixbeat
-kimchy
```
licenses/lucene-LICENSE.txt (new file, +475 lines)

```
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.



Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
derived from unicode conversion examples available at
http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
from those sources:

/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */


Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
derived from Python 2.4.2 sources available at
http://www.python.org. Full license is here:

http://www.python.org/download/releases/2.4.2/license/

Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
derived from Python 3.1.2 sources available at
http://www.python.org. Full license is here:

http://www.python.org/download/releases/3.1.2/license/

Some code in core/src/java/org/apache/lucene/util/automaton was
derived from Brics automaton sources available at
www.brics.dk/automaton/. Here is the copyright from those sources:

/*
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton
were automatically generated with the moman/finenight FSA package.
Here is the copyright for those sources:

# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, <jpb@rrette.com>
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
derived from ICU (http://www.icu-project.org)
The full license is available here:
http://source.icu-project.org/repos/icu/icu/trunk/license.html

/*
 * Copyright (C) 1999-2010, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
 * Software, and to permit persons to whom the Software is furnished to do so,
 * provided that the above copyright notice(s) and this permission notice appear
 * in all copies of the Software and that both the above copyright notice(s) and
 * this permission notice appear in supporting documentation.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
 * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Except as contained in this notice, the name of a copyright holder shall not
 * be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization of the
 * copyright holder.
 */

The following license applies to the Snowball stemmers:

Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  * Neither the name of the copyright holders nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The following license applies to the KStemmer:

Copyright © 2003,
Center for Intelligent Information Retrieval,
University of Massachusetts, Amherst.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. The names "Center for Intelligent Information Retrieval" and
"University of Massachusetts" must not be used to endorse or promote products
derived from this software without prior written permission. To obtain
permission, contact info@ciir.cs.umass.edu.

THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.

The following license applies to the Morfologik project:

Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

---

The dictionary comes from Morfologik project. Morfologik uses data from
Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
is licenced on the terms of (inter alia) LGPL and Creative Commons
ShareAlike. The part-of-speech tags were added in Morfologik project and
are not found in the data from sjp.pl. The tagset is similar to IPI PAN
tagset.

---

The following license applies to the Morfeusz project,
used by org.apache.lucene.analysis.morfologik.

BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/

Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
Marcin Woliński, Robert Wołosz

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.

THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
licenses/lucene-NOTICE.txt (new file, +191 lines)

```
Apache Lucene
Copyright 2014 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

Includes software from other Apache Software Foundation projects,
including, but not limited to:
 - Apache Ant
 - Apache Jakarta Regexp
 - Apache Commons
 - Apache Xerces

ICU4J, (under analysis/icu) is licensed under an MIT styles license
and Copyright (c) 1995-2008 International Business Machines Corporation and others

Some data files (under analysis/icu/src/data) are derived from Unicode data such
as the Unicode Character Database. See http://unicode.org/copyright.html for more
details.

Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/

The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
automatically generated with the moman/finenight FSA library, created by
Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
see http://sites.google.com/site/rrettesite/moman and
http://bitbucket.org/jpbarrette/moman/overview/

The class org.apache.lucene.util.WeakIdentityMap was derived from
the Apache CXF project and is Apache License 2.0.

The Google Code Prettify is Apache License 2.0.
See http://code.google.com/p/google-code-prettify/

JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
See http://junit.sourceforge.net/cpl-v10.html

This product includes code (JaspellTernarySearchTrie) from Java Spelling
Checking Package (jaspell): http://jaspell.sourceforge.net/
License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)

The snowball stemmers in
  analysis/common/src/java/net/sf/snowball
were developed by Martin Porter and Richard Boulton.
The snowball stopword lists in
  analysis/common/src/resources/org/apache/lucene/analysis/snowball
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
  http://snowball.tartarus.org/

The KStem stemmer in
  analysis/common/src/org/apache/lucene/analysis/en
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license.

The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html.

The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
(common) are based on BSD-licensed reference implementations created by Jacques Savoy and
Ljiljana Dolamic. These files reside in:
analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java

The Stempel analyzer (stempel) includes BSD-licensed software developed
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
and Edmond Nolan.

The Polish analyzer (stempel) comes with a default
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
See http://project.carrot2.org/license.html.

The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.

The Morfologik analyzer (morfologik) includes BSD-licensed software
developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).

Morfologik uses data from Polish ispell/myspell dictionary
(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
LGPL and Creative Commons ShareAlike.

Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)

Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
source code for this can be found at http://www.eclipse.org/jetty/downloads.php

===========================================================================
Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
===========================================================================

This software includes a binary and/or source version of data from

  mecab-ipadic-2.7.0-20070801

which can be obtained from

  http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz

or

  http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz

===========================================================================
mecab-ipadic-2.7.0-20070801 Notice
===========================================================================

Nara Institute of Science and Technology (NAIST),
the copyright holders, disclaims all warranties with regard to this
software, including all implied warranties of merchantability and
fitness, in no event shall NAIST be liable for
any special, indirect or consequential damages or any damages
whatsoever resulting from loss of use, data or profits, whether in an
action of contract, negligence or other tortuous action, arising out
of or in connection with the use or performance of this software.

A large portion of the dictionary entries
originate from ICOT Free Software. The following conditions for ICOT
Free Software applies to the current dictionary as well.

Each User may also freely distribute the Program, whether in its
original form or modified, to any third party or parties, PROVIDED
that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
on, or be attached to, the Program, which is distributed substantially
in the same form as set out herein and that such intended
distribution, if actually made, will neither violate or otherwise
contravene any of the laws and regulations of the countries having
jurisdiction over the User or the intended distribution itself.

NO WARRANTY

The program was produced on an experimental basis in the course of the
research and development conducted during the project and is provided
to users as so produced on an experimental basis. Accordingly, the
program is provided without any warranty whatsoever, whether express,
implied, statutory or otherwise. The term "warranty" used herein
includes, but is not limited to, any warranty of the quality,
performance, merchantability and fitness for a particular purpose of
the program and the nonexistence of any infringement or violation of
any right of any third party.

Each user of the program will agree and understand, and be deemed to
have agreed and understood, that there is no warranty whatsoever for
the program and, accordingly, the entire risk arising from or
otherwise connected with the program is assumed by the user.

Therefore, neither ICOT, the copyright holder, or any other
organization that participated in or was otherwise related to the
development of the program and their respective officials, directors,
officers and other employees shall be held liable for any and all
damages, including, without limitation, general, special, incidental
and consequential damages, arising out of or otherwise in connection
with the use or inability to use the program or any product, material
or result produced or otherwise obtained by using the program,
regardless of whether they have been advised of, or otherwise had
knowledge of, the possibility of such damages at any time during the
project or thereafter. Each user will be deemed to have agreed to the
foregoing by his or her commencement of use of the program. The term
"use" as used herein includes, but is not limited to, the use,
modification, copying and distribution of the program and the
production of secondary products from the program.

In the case where the program, whether in its original form or
modified, was distributed or delivered to or received by a user from
any person, organization or entity other than ICOT, unless it makes or
grants independently of ICOT any specific warranty to the user in
writing, such person, organization or entity, will also be exempted
from and not be held liable to the user for any such damages as noted
above as far as the program is concerned.
```
pom.xml (145 changed lines, mode changed: Normal file → Executable file)

```diff
@@ -6,20 +6,22 @@
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>org.elasticsearch</groupId>
 	<artifactId>elasticsearch-analysis-ik</artifactId>
-	<version>1.8.1</version>
+	<version>${elasticsearch.version}</version>
 	<packaging>jar</packaging>
 	<description>IK Analyzer for Elasticsearch</description>
 	<inceptionYear>2011</inceptionYear>

 	<properties>
-		<elasticsearch.version>2.2.1</elasticsearch.version>
+		<elasticsearch.version>8.4.1</elasticsearch.version>
-		<maven.compiler.target>1.7</maven.compiler.target>
+		<maven.compiler.target>1.8</maven.compiler.target>
 		<elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
 		<elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
 		<elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin</elasticsearch.plugin.classname>
 		<elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
 		<tests.rest.load_packaged>false</tests.rest.load_packaged>
 		<skip.unit.tests>true</skip.unit.tests>
+		<gpg.keyname>4E899B30</gpg.keyname>
+		<gpg.useagent>true</gpg.useagent>
 	</properties>

 	<licenses>
@@ -30,6 +32,15 @@
 		</license>
 	</licenses>

+	<developers>
+		<developer>
+			<name>INFINI Labs</name>
+			<email>hello@infini.ltd</email>
+			<organization>INFINI Labs</organization>
+			<organizationUrl>https://infinilabs.com</organizationUrl>
+		</developer>
+	</developers>
+
 	<scm>
 		<connection>scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git</connection>
 		<developerConnection>scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git
@@ -40,16 +51,27 @@
 	<parent>
 		<groupId>org.sonatype.oss</groupId>
 		<artifactId>oss-parent</artifactId>
-		<version>7</version>
+		<version>9</version>
 	</parent>

+	<distributionManagement>
+		<snapshotRepository>
+			<id>oss.sonatype.org</id>
+			<url>https://oss.sonatype.org/content/repositories/snapshots</url>
+		</snapshotRepository>
+		<repository>
+			<id>oss.sonatype.org</id>
+			<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+		</repository>
+	</distributionManagement>
+
 	<repositories>
 		<repository>
 			<id>oss.sonatype.org</id>
 			<name>OSS Sonatype</name>
 			<releases><enabled>true</enabled></releases>
 			<snapshots><enabled>true</enabled></snapshots>
-			<url>http://oss.sonatype.org/content/repositories/releases/</url>
+			<url>https://oss.sonatype.org/content/repositories/releases/</url>
 		</repository>
 	</repositories>

@@ -61,36 +83,36 @@
 			<scope>compile</scope>
 		</dependency>

 		<dependency>
 			<groupId>org.apache.httpcomponents</groupId>
 			<artifactId>httpclient</artifactId>
-			<version>4.4.1</version>
+			<version>4.5.2</version>
 		</dependency>

 		<dependency>
-			<groupId>log4j</groupId>
+			<groupId>org.apache.logging.log4j</groupId>
-			<artifactId>log4j</artifactId>
+			<artifactId>log4j-api</artifactId>
-			<version>1.2.16</version>
+			<version>2.18.0</version>
-			<scope>runtime</scope>
 		</dependency>

 		<dependency>
 			<groupId>org.hamcrest</groupId>
 			<artifactId>hamcrest-core</artifactId>
-			<version>1.3.RC2</version>
+			<version>1.3</version>
 			<scope>test</scope>
 		</dependency>

 		<dependency>
 			<groupId>org.hamcrest</groupId>
 			<artifactId>hamcrest-library</artifactId>
-			<version>1.3.RC2</version>
+			<version>1.3</version>
 			<scope>test</scope>
 		</dependency>
 		<dependency>
 			<groupId>junit</groupId>
 			<artifactId>junit</artifactId>
-			<version>4.10</version>
+			<version>4.12</version>
 			<scope>test</scope>
 		</dependency>
 	</dependencies>
@@ -100,10 +122,10 @@
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-compiler-plugin</artifactId>
-				<version>2.3.2</version>
+				<version>3.5.1</version>
 				<configuration>
-					<source>1.6</source>
+					<source>${maven.compiler.target}</source>
-					<target>1.6</target>
+					<target>${maven.compiler.target}</target>
 				</configuration>
 			</plugin>
 			<plugin>
@@ -131,7 +153,9 @@
 			</plugin>
 			<plugin>
 				<artifactId>maven-assembly-plugin</artifactId>
+
 				<configuration>
+					<appendAssemblyId>false</appendAssemblyId>
 					<outputDirectory>${project.build.directory}/releases/</outputDirectory>
 					<descriptors>
 						<descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
@@ -153,4 +177,93 @@
 			</plugin>
 		</plugins>
 	</build>
+	<profiles>
+		<profile>
+			<id>disable-java8-doclint</id>
+			<activation>
+				<jdk>[1.8,)</jdk>
+			</activation>
+			<properties>
+				<additionalparam>-Xdoclint:none</additionalparam>
+			</properties>
+		</profile>
+		<profile>
+			<id>release</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.sonatype.plugins</groupId>
+						<artifactId>nexus-staging-maven-plugin</artifactId>
+						<version>1.6.3</version>
+						<extensions>true</extensions>
+						<configuration>
+							<serverId>oss</serverId>
+							<nexusUrl>https://oss.sonatype.org/</nexusUrl>
+							<autoReleaseAfterClose>true</autoReleaseAfterClose>
+						</configuration>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-release-plugin</artifactId>
+						<version>2.1</version>
+						<configuration>
+							<autoVersionSubmodules>true</autoVersionSubmodules>
+							<useReleaseProfile>false</useReleaseProfile>
+							<releaseProfiles>release</releaseProfiles>
+							<goals>deploy</goals>
+						</configuration>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<version>3.5.1</version>
```
||||||
|
<configuration>
|
||||||
|
<source>${maven.compiler.target}</source>
|
||||||
|
<target>${maven.compiler.target}</target>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-gpg-plugin</artifactId>
|
||||||
|
<version>1.5</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>sign-artifacts</id>
|
||||||
|
<phase>verify</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>sign</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-source-plugin</artifactId>
|
||||||
|
<version>2.2.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>attach-sources</id>
|
||||||
|
<goals>
|
||||||
|
<goal>jar-no-fork</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
<version>2.9</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>attach-javadocs</id>
|
||||||
|
<goals>
|
||||||
|
<goal>jar</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
</project>
|
</project>
|
||||||
src/main/assemblies/plugin.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
 <assembly>
+	<id>analysis-ik-release</id>
 	<formats>
 		<format>zip</format>
 	</formats>
@@ -7,20 +8,25 @@
 	<fileSets>
 		<fileSet>
 			<directory>${project.basedir}/config</directory>
-			<outputDirectory>/config</outputDirectory>
+			<outputDirectory>config</outputDirectory>
 		</fileSet>
 	</fileSets>

 	<files>
 		<file>
 			<source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
-			<outputDirectory></outputDirectory>
+			<outputDirectory/>
+			<filtered>true</filtered>
+		</file>
+		<file>
+			<source>${project.basedir}/src/main/resources/plugin-security.policy</source>
+			<outputDirectory/>
 			<filtered>true</filtered>
 		</file>
 	</files>
 	<dependencySets>
 		<dependencySet>
-			<outputDirectory>/</outputDirectory>
+			<outputDirectory/>
 			<useProjectArtifact>true</useProjectArtifact>
 			<useTransitiveFiltering>true</useTransitiveFiltering>
 			<excludes>
@@ -28,7 +34,7 @@
 			</excludes>
 		</dependencySet>
 		<dependencySet>
-			<outputDirectory>/</outputDirectory>
+			<outputDirectory/>
 			<useProjectArtifact>true</useProjectArtifact>
 			<useTransitiveFiltering>true</useTransitiveFiltering>
 			<includes>
src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java (deleted)
@@ -1,24 +0,0 @@
-package org.elasticsearch.index.analysis;
-
-
-@Deprecated
-public class IkAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
-
-
-	@Override
-	public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
-
-	}
-
-
-	@Override
-	public void processAnalyzers(AnalyzersBindings analyzersBindings) {
-		analyzersBindings.processAnalyzer("ik", IkAnalyzerProvider.class);
-	}
-
-
-	@Override
-	public void processTokenizers(TokenizersBindings tokenizersBindings) {
-		tokenizersBindings.processTokenizer("ik", IkTokenizerFactory.class);
-	}
-}
src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java
@@ -1,26 +1,28 @@
 package org.elasticsearch.index.analysis;

-import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.Index;
-import org.elasticsearch.index.settings.IndexSettingsService;
+import org.elasticsearch.index.IndexSettings;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKAnalyzer;

-@Deprecated
 public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
 	private final IKAnalyzer analyzer;
-	private boolean useSmart=false;

-	@Inject
-	public IkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
-		super(index, indexSettingsService.getSettings(), name, settings);
-		Dictionary.initial(new Configuration(env));
-		useSmart = settings.get("use_smart", "false").equals("true");
-		analyzer=new IKAnalyzer(useSmart);
+	public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
+		super(name, settings);
+		Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
+		analyzer=new IKAnalyzer(configuration);
+	}
+
+	public static IkAnalyzerProvider getIkSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+		return new IkAnalyzerProvider(indexSettings,env,name,settings,true);
+	}
+
+	public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+		return new IkAnalyzerProvider(indexSettings,env,name,settings,false);
 	}

 	@Override public IKAnalyzer get() {
src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java
@@ -1,32 +1,34 @@
 package org.elasticsearch.index.analysis;

 import org.apache.lucene.analysis.Tokenizer;
-import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.Index;
-import org.elasticsearch.index.settings.IndexSettingsService;
+import org.elasticsearch.index.IndexSettings;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 import org.wltea.analyzer.lucene.IKTokenizer;

-@Deprecated
 public class IkTokenizerFactory extends AbstractTokenizerFactory {
-	private final Settings settings;
-	private boolean useSmart=false;
+	private Configuration configuration;

-	@Inject
-	public IkTokenizerFactory(Index index, IndexSettingsService indexSettingsService,Environment env, @Assisted String name, @Assisted Settings settings) {
-		super(index, indexSettingsService.getSettings(), name, settings);
-		this.settings=settings;
-		Dictionary.initial(new Configuration(env));
+	public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+		super(indexSettings, settings,name);
+		configuration=new Configuration(env,settings);
 	}

+	public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+		return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false);
+	}
+
+	public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+		return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(true);
+	}
+
+	public IkTokenizerFactory setSmart(boolean smart){
+		this.configuration.setUseSmart(smart);
+		return this;
+	}
+
 	@Override
 	public Tokenizer create() {
-		this.useSmart = settings.get("use_smart", "false").equals("true");
-
-		return new IKTokenizer(useSmart); }
+		return new IKTokenizer(configuration); }
 }
src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysis.java (deleted)
@@ -1,84 +0,0 @@
-package org.elasticsearch.indices.analysis;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.elasticsearch.common.component.AbstractComponent;
-import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalyzerScope;
-import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
-import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.lucene.IKAnalyzer;
-import org.wltea.analyzer.lucene.IKTokenizer;
-
-/**
- * Registers indices level analysis components so, if not explicitly configured,
- * will be shared among all indices.
- */
-public class IKIndicesAnalysis extends AbstractComponent {
-
-	private boolean useSmart=false;
-
-	@Inject
-	public IKIndicesAnalysis(final Settings settings,
-			IndicesAnalysisService indicesAnalysisService,Environment env) {
-		super(settings);
-		Dictionary.initial(new Configuration(env));
-
-		this.useSmart = settings.get("use_smart", "false").equals("true");
-
-		indicesAnalysisService.analyzerProviderFactories().put("ik",
-				new PreBuiltAnalyzerProviderFactory("ik", AnalyzerScope.GLOBAL,
-						new IKAnalyzer(useSmart)));
-
-		indicesAnalysisService.analyzerProviderFactories().put("ik_smart",
-				new PreBuiltAnalyzerProviderFactory("ik_smart", AnalyzerScope.GLOBAL,
-						new IKAnalyzer(true)));
-
-		indicesAnalysisService.analyzerProviderFactories().put("ik_max_word",
-				new PreBuiltAnalyzerProviderFactory("ik_max_word", AnalyzerScope.GLOBAL,
-						new IKAnalyzer(false)));
-
-		indicesAnalysisService.tokenizerFactories().put("ik",
-				new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
-					@Override
-					public String name() {
-						return "ik";
-					}
-
-					@Override
-					public Tokenizer create() {
-						return new IKTokenizer(false);
-					}
-				}));
-
-		indicesAnalysisService.tokenizerFactories().put("ik_smart",
-				new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
-					@Override
-					public String name() {
-						return "ik_smart";
-					}
-
-					@Override
-					public Tokenizer create() {
-						return new IKTokenizer(true);
-					}
-				}));
-
-		indicesAnalysisService.tokenizerFactories().put("ik_max_word",
-				new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
-					@Override
-					public String name() {
-						return "ik_max_word";
-					}
-
-					@Override
-					public Tokenizer create() {
-						return new IKTokenizer(false);
-					}
-				}));
-	}
-}
src/main/java/org/elasticsearch/indices/analysis/IKIndicesAnalysisModule.java (deleted)
@@ -1,32 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.indices.analysis;
-
-import org.elasticsearch.common.inject.AbstractModule;
-
-/**
- */
-public class IKIndicesAnalysisModule extends AbstractModule {
-
-	@Override
-	protected void configure() {
-		bind(IKIndicesAnalysis.class).asEagerSingleton();
-	}
-}
src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java
@@ -1,44 +1,41 @@
 package org.elasticsearch.plugin.analysis.ik;

-import org.elasticsearch.common.inject.AbstractModule;
-import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.inject.Module;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.ESLoggerFactory;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisModule;
-import org.elasticsearch.index.analysis.IkAnalysisBinderProcessor;
-import org.elasticsearch.indices.analysis.IKIndicesAnalysisModule;
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.index.analysis.AnalyzerProvider;
+import org.elasticsearch.index.analysis.IkAnalyzerProvider;
+import org.elasticsearch.index.analysis.IkTokenizerFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
-import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;

-import java.nio.file.Path;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.logging.Logger;
-
-import static java.rmi.Naming.bind;
+import java.util.HashMap;
+import java.util.Map;


-public class AnalysisIkPlugin extends Plugin {
+public class AnalysisIkPlugin extends Plugin implements AnalysisPlugin {

-	@Override public String name() {
-		return "analysis-ik";
-	}
+	public static String PLUGIN_NAME = "analysis-ik";

+	@Override
+	public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+		Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();

-	@Override public String description() {
-		return "ik analysis";
-	}
+		extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory);
+		extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory);
+
+		return extra;
+	}

 	@Override
-	public Collection<Module> nodeModules() {
-		return Collections.<Module>singletonList(new IKIndicesAnalysisModule());
-	}
+	public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
+		Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();

-	public void onModule(AnalysisModule module) {
-		module.addProcessor(new IkAnalysisBinderProcessor());
-	}
+		extra.put("ik_smart", IkAnalyzerProvider::getIkSmartAnalyzerProvider);
+		extra.put("ik_max_word", IkAnalyzerProvider::getIkAnalyzerProvider);
+
+		return extra;
+	}

 }
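Note on the registration change above: from ES 5.x on, analysis components are discovered through the `AnalysisPlugin` SPI instead of Guice bindings. The method references compile because `AnalysisModule.AnalysisProvider<T>` is a functional interface whose single method takes `(IndexSettings, Environment, String, Settings)`, matching the new static factories. A minimal sketch under those assumptions (`MyPlugin` and the `"my_tok"` key are hypothetical; it reuses the real `IkTokenizerFactory` from this diff):

```java
package org.example; // hypothetical package

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IkTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

public class MyPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
        // Explicit lambda form of the method reference used in the diff:
        // AnalysisProvider<T> declares T get(IndexSettings, Environment, String, Settings).
        extra.put("my_tok", (IndexSettings idx, Environment env, String name, Settings s) ->
                IkTokenizerFactory.getIkTokenizerFactory(idx, env, name, s));
        return extra;
    }
}
```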
src/main/java/org/wltea/analyzer/cfg/Configuration.java (152 lines changed, Normal file → Executable file)
@@ -4,134 +4,72 @@
 package org.wltea.analyzer.cfg;

 import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.io.PathUtils;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.core.PathUtils;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
+import org.wltea.analyzer.dic.Dictionary;

-import java.io.*;
-import java.net.URL;
+import java.io.File;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.InvalidPropertiesFormatException;
-import java.util.List;
-import java.util.Properties;

 public class Configuration {

-	private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml";
-	private static final String EXT_DICT = "ext_dict";
-	private static final String REMOTE_EXT_DICT = "remote_ext_dict";
-	private static final String EXT_STOP = "ext_stopwords";
-	private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
-	private static ESLogger logger = Loggers.getLogger("ik-analyzer");
-	private Properties props;
 	private Environment environment;
+	private Settings settings;
+
+	//是否启用智能分词
+	private boolean useSmart;
+
+	//是否启用远程词典加载
+	private boolean enableRemoteDict=false;
+
+	//是否启用小写处理
+	private boolean enableLowercase=true;

 	@Inject
-	public Configuration(Environment env){
-		props = new Properties();
-		environment = env;
-
-		Path fileConfig = PathUtils.get(getDictRoot(), FILE_NAME);
-
-		InputStream input = null;
-		try {
-			input = new FileInputStream(fileConfig.toFile());
-		} catch (FileNotFoundException e) {
-			logger.error("ik-analyzer",e);
-		}
-		if(input != null){
-			try {
-				props.loadFromXML(input);
-			} catch (InvalidPropertiesFormatException e) {
-				logger.error("ik-analyzer", e);
-			} catch (IOException e) {
-				logger.error("ik-analyzer",e);
-			}
-		}
-	}
-
-	public List<String> getExtDictionarys(){
-		List<String> extDictFiles = new ArrayList<String>(2);
-		String extDictCfg = props.getProperty(EXT_DICT);
-		if(extDictCfg != null){
-
-			String[] filePaths = extDictCfg.split(";");
-			if(filePaths != null){
-				for(String filePath : filePaths){
-					if(filePath != null && !"".equals(filePath.trim())){
-						Path file = PathUtils.get("ik", filePath.trim());
-						extDictFiles.add(file.toString());
-					}
-				}
-			}
-		}
-		return extDictFiles;
-	}
-
-	public List<String> getRemoteExtDictionarys(){
-		List<String> remoteExtDictFiles = new ArrayList<String>(2);
-		String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
-		if(remoteExtDictCfg != null){
-
-			String[] filePaths = remoteExtDictCfg.split(";");
-			if(filePaths != null){
-				for(String filePath : filePaths){
-					if(filePath != null && !"".equals(filePath.trim())){
-						remoteExtDictFiles.add(filePath);
-					}
-				}
-			}
-		}
-		return remoteExtDictFiles;
-	}
-
-	public List<String> getExtStopWordDictionarys(){
-		List<String> extStopWordDictFiles = new ArrayList<String>(2);
-		String extStopWordDictCfg = props.getProperty(EXT_STOP);
-		if(extStopWordDictCfg != null){
-
-			String[] filePaths = extStopWordDictCfg.split(";");
-			if(filePaths != null){
-				for(String filePath : filePaths){
-					if(filePath != null && !"".equals(filePath.trim())){
-						Path file = PathUtils.get("ik", filePath.trim());
-						extStopWordDictFiles.add(file.toString());
-					}
-				}
-			}
-		}
-		return extStopWordDictFiles;
-	}
-
-	public List<String> getRemoteExtStopWordDictionarys(){
-		List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
-		String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
-		if(remoteExtStopWordDictCfg != null){
-
-			String[] filePaths = remoteExtStopWordDictCfg.split(";");
-			if(filePaths != null){
-				for(String filePath : filePaths){
-					if(filePath != null && !"".equals(filePath.trim())){
-						remoteExtStopWordDictFiles.add(filePath);
-					}
-				}
-			}
-		}
-		return remoteExtStopWordDictFiles;
-	}
-
-	public String getDictRoot() {
-		return PathUtils.get(
-				new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath()).getParent(),"config")
-				.toAbsolutePath().toString();
+	public Configuration(Environment env,Settings settings) {
+		this.environment = env;
+		this.settings=settings;
+
+		this.useSmart = settings.get("use_smart", "false").equals("true");
+		this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
+		this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
+
+		Dictionary.initial(this);
+	}
+
+	public Path getConfigInPluginDir() {
+		return PathUtils
+				.get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
+						.getParent(), "config")
+				.toAbsolutePath();
+	}
+
+	public boolean isUseSmart() {
+		return useSmart;
+	}
+
+	public Configuration setUseSmart(boolean useSmart) {
+		this.useSmart = useSmart;
+		return this;
+	}
+
+	public Environment getEnvironment() {
+		return environment;
+	}
+
+	public Settings getSettings() {
+		return settings;
+	}
+
+	public boolean isEnableRemoteDict() {
+		return enableRemoteDict;
+	}
+
+	public boolean isEnableLowercase() {
+		return enableLowercase;
 	}
 }
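The rewritten `Configuration` above is driven entirely by index settings. A hedged, test-style sketch of how the three flags map to the keys the constructor reads (`ConfigurationSketch` is a hypothetical helper; `env` is assumed to come from the node context):

```java
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;

class ConfigurationSketch { // hypothetical, not part of the plugin
    static Configuration smartConfig(Environment env) {
        // The keys match those read in the constructor above.
        Settings settings = Settings.builder()
                .put("use_smart", true)            // smart vs. max-word segmentation
                .put("enable_lowercase", false)    // keep the original case
                .put("enable_remote_dict", true)   // poll remote dictionaries
                .build();
        return new Configuration(env, settings);   // isUseSmart() == true, etc.
    }
}
```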
src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
@@ -32,6 +32,7 @@ import java.util.LinkedList;
 import java.util.Map;
 import java.util.Set;

+import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.dic.Dictionary;

 /**
@@ -47,7 +48,7 @@ class AnalyzeContext {
 	private static final int BUFF_EXHAUST_CRITICAL = 100;


-	//字符窜读取缓冲
+	//字符串读取缓冲
 	private char[] segmentBuff;
 	//字符类型数组
 	private int[] charTypes;
@@ -72,12 +73,11 @@ class AnalyzeContext {
 	private Map<Integer , LexemePath> pathMap;
 	//最终分词结果集
 	private LinkedList<Lexeme> results;
-	private boolean useSmart;
 	//分词器配置项
-	// private Configuration cfg;
+	private Configuration cfg;

-	public AnalyzeContext(boolean useSmart){
-		this.useSmart = useSmart;
+	public AnalyzeContext(Configuration configuration){
+		this.cfg = configuration;
 		this.segmentBuff = new char[BUFF_SIZE];
 		this.charTypes = new int[BUFF_SIZE];
 		this.buffLocker = new HashSet<String>();
@@ -139,7 +139,7 @@ class AnalyzeContext {
 	 */
 	void initCursor(){
 		this.cursor = 0;
-		this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+		this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
 		this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
 	}

@@ -151,7 +151,7 @@ class AnalyzeContext {
 	boolean moveCursor(){
 		if(this.cursor < this.available - 1){
 			this.cursor++;
-			this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+			this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
 			this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
 			return true;
 		}else{
@@ -267,6 +267,15 @@ class AnalyzeContext {
 			Lexeme l = path.pollFirst();
 			while(l != null){
 				this.results.add(l);
+				//字典中无单字,但是词元冲突了,切分出相交词元的前一个词元中的单字
+				/*int innerIndex = index + 1;
+				for (; innerIndex < index + l.getLength(); innerIndex++) {
+					Lexeme innerL = path.peekFirst();
+					if (innerL != null && innerIndex == innerL.getBegin()) {
+						this.outputSingleCJK(innerIndex - 1);
+					}
+				}*/
+
 				//将index移至lexeme后
 				index = l.getBegin() + l.getLength();
 				l = path.pollFirst();
@@ -345,7 +354,7 @@ class AnalyzeContext {
 	 */
 	private void compound(Lexeme result){

-		if(!this.useSmart){
+		if(!this.cfg.isUseSmart()){
 			return ;
 		}
 		//数量词合并处理
src/main/java/org/wltea/analyzer/core/CharacterUtil.java
@@ -86,14 +86,14 @@ class CharacterUtil {
 	 * @param input
 	 * @return char
 	 */
-	static char regularize(char input){
+	static char regularize(char input,boolean lowercase){
 		if (input == 12288) {
 			input = (char) 32;

 		}else if (input > 65280 && input < 65375) {
 			input = (char) (input - 65248);

-		}else if (input >= 'A' && input <= 'Z') {
+		}else if (input >= 'A' && input <= 'Z' && lowercase) {
 			input += 32;
 		}
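For reference, the magic numbers in `regularize()` above: 12288 is U+3000 (the ideographic space), and the range 65281–65374 covers the fullwidth forms, which map to their ASCII counterparts by subtracting 65248; uppercase ASCII is now lowercased only when the new `lowercase` flag is set. A self-contained sketch mirroring the diff's logic (`RegularizeDemo` is a hypothetical name):

```java
public final class RegularizeDemo {
    static char regularize(char input, boolean lowercase) {
        if (input == 12288) {                       // U+3000 ideographic space -> ASCII space
            input = (char) 32;
        } else if (input > 65280 && input < 65375) { // fullwidth forms -> ASCII
            input = (char) (input - 65248);
        } else if (input >= 'A' && input <= 'Z' && lowercase) {
            input += 32;                             // lowercase only when enabled
        }
        return input;
    }

    public static void main(String[] args) {
        System.out.println(regularize('A', true));       // a
        System.out.println(regularize('A', false));      // A (enable_lowercase=false keeps case)
        System.out.println(regularize('\uFF21', false)); // fullwidth A (65313) -> 'A'
    }
}
```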
src/main/java/org/wltea/analyzer/core/IKSegmenter.java
@@ -23,10 +23,7 @@
  */
 package org.wltea.analyzer.core;

-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;

 import java.io.IOException;
 import java.io.Reader;
@@ -47,16 +44,16 @@ public final class IKSegmenter {
 	private List<ISegmenter> segmenters;
 	//分词歧义裁决器
 	private IKArbitrator arbitrator;
-	private boolean useSmart = false;
+	private Configuration configuration;


 	/**
 	 * IK分词器构造函数
 	 * @param input
 	 */
-	public IKSegmenter(Reader input ,boolean useSmart){
+	public IKSegmenter(Reader input ,Configuration configuration){
 		this.input = input;
-		this.useSmart = useSmart;
+		this.configuration = configuration;
 		this.init();
 	}

@@ -66,7 +63,7 @@ public final class IKSegmenter {
 	 */
 	private void init(){
 		//初始化分词上下文
-		this.context = new AnalyzeContext(useSmart);
+		this.context = new AnalyzeContext(configuration);
 		//加载子分词器
 		this.segmenters = this.loadSegmenters();
 		//加载歧义裁决器
@@ -127,7 +124,7 @@ public final class IKSegmenter {
 			}
 		}
 		//对分词进行歧义处理
-		this.arbitrator.process(context, useSmart);
+		this.arbitrator.process(context, configuration.isUseSmart());
 		//将分词结果输出到结果集,并处理未切分的单个CJK字符
 		context.outputToResult();
 		//记录本次分词的缓冲区位移
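With the constructor change above, `IKSegmenter` is driven by a `Configuration` instead of a bare `useSmart` flag. A hedged usage sketch (`SegmenterSketch` is hypothetical; `cfg` is assumed to come from `new Configuration(env, settings)` as shown earlier):

```java
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

class SegmenterSketch { // hypothetical
    static void dump(Configuration cfg) throws IOException {
        IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国"), cfg);
        // next() returns one Lexeme per token, or null when the input is exhausted.
        for (Lexeme lex = seg.next(); lex != null; lex = seg.next()) {
            System.out.println(lex.getLexemeText());
        }
    }
}
```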
src/main/java/org/wltea/analyzer/dic/DictSegment.java
@@ -57,7 +57,7 @@ class DictSegment implements Comparable<DictSegment>{

 	DictSegment(Character nodeChar){
 		if(nodeChar == null){
-			throw new IllegalArgumentException("参数为空异常,字符不能为空");
+			throw new IllegalArgumentException("node char cannot be empty");
 		}
 		this.nodeChar = nodeChar;
 	}
src/main/java/org/wltea/analyzer/dic/Dictionary.java (593 lines changed, Normal file → Executable file)
@@ -26,37 +26,44 @@
 package org.wltea.analyzer.dic;

 import java.io.BufferedReader;
-import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.nio.file.Files;
+import java.nio.file.FileVisitResult;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
+import java.nio.file.SimpleFileVisitor;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.*;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;

+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
 import org.apache.http.client.ClientProtocolException;
 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
-import org.elasticsearch.common.io.PathUtils;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.SpecialPermission;
+import org.elasticsearch.core.PathUtils;
+import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
+import org.apache.logging.log4j.Logger;
+import org.wltea.analyzer.help.ESPluginLoggerFactory;

 /**
  * 词典管理类,单子模式
  */
 public class Dictionary {


 	/*
 	 * 词典单子实例
 	 */
@@ -64,50 +71,83 @@ public class Dictionary {

 	private DictSegment _MainDict;

-	private DictSegment _SurnameDict;
-
 	private DictSegment _QuantifierDict;

-	private DictSegment _SuffixDict;
-
-	private DictSegment _PrepDict;
-
 	private DictSegment _StopWords;

 	/**
 	 * 配置对象
 	 */
 	private Configuration configuration;
-	public static final ESLogger logger=Loggers.getLogger("ik-analyzer");
+
+	private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());

 	private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);

-	public static final String PATH_DIC_MAIN = "ik/main.dic";
-	public static final String PATH_DIC_SURNAME = "ik/surname.dic";
-	public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
-	public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
-	public static final String PATH_DIC_PREP = "ik/preposition.dic";
-	public static final String PATH_DIC_STOP = "ik/stopword.dic";
+	private static final String PATH_DIC_MAIN = "main.dic";
+	private static final String PATH_DIC_SURNAME = "surname.dic";
+	private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
+	private static final String PATH_DIC_SUFFIX = "suffix.dic";
+	private static final String PATH_DIC_PREP = "preposition.dic";
+	private static final String PATH_DIC_STOP = "stopword.dic";

-	private Dictionary(){
+	private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
+	private final static String EXT_DICT = "ext_dict";
+	private final static String REMOTE_EXT_DICT = "remote_ext_dict";
+	private final static String EXT_STOP = "ext_stopwords";
+	private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
+
+	private Path conf_dir;
+	private Properties props;
+
+	private Dictionary(Configuration cfg) {
+		this.configuration = cfg;
+		this.props = new Properties();
+		this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
+		Path configFile = conf_dir.resolve(FILE_NAME);
+
+		InputStream input = null;
+		try {
+			logger.info("try load config from {}", configFile);
+			input = new FileInputStream(configFile.toFile());
+		} catch (FileNotFoundException e) {
+			conf_dir = cfg.getConfigInPluginDir();
+			configFile = conf_dir.resolve(FILE_NAME);
+			try {
+				logger.info("try load config from {}", configFile);
+				input = new FileInputStream(configFile.toFile());
+			} catch (FileNotFoundException ex) {
+				// We should report origin exception
+				logger.error("ik-analyzer", e);
+			}
+		}
+		if (input != null) {
+			try {
+				props.loadFromXML(input);
+			} catch (IOException e) {
+				logger.error("ik-analyzer", e);
+			}
+		}
 	}
+
+	private String getProperty(String key){
+		if(props!=null){
+			return props.getProperty(key);
+		}
+		return null;
+	}

 	/**
-	 * 词典初始化
-	 * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
-	 * 只有当Dictionary类被实际调用时,才会开始载入词典,
-	 * 这将延长首次分词操作的时间
-	 * 该方法提供了一个在应用加载阶段就初始化字典的手段
+	 * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
+	 * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
+	 *
 	 * @return Dictionary
 	 */
-	public static synchronized Dictionary initial(Configuration cfg){
+	public static synchronized void initial(Configuration cfg) {
+		if (singleton == null) {
 			synchronized (Dictionary.class) {
 				if (singleton == null) {
-					singleton = new Dictionary();
-					singleton.configuration=cfg;
+
+					singleton = new Dictionary(cfg);
 					singleton.loadMainDict();
 					singleton.loadSurnameDict();
 					singleton.loadQuantifierDict();
@@ -115,35 +155,156 @@ public class Dictionary {
 					singleton.loadPrepDict();
 					singleton.loadStopWordDict();

+					if(cfg.isEnableRemoteDict()){
 					// 建立监控线程
-					for(String location:cfg.getRemoteExtDictionarys()){
+					for (String location : singleton.getRemoteExtDictionarys()) {
 						// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
 						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
 					}
-					for(String location:cfg.getRemoteExtStopWordDictionarys()){
+					for (String location : singleton.getRemoteExtStopWordDictionarys()) {
 						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
 					}
+					}

-					return singleton;
 				}
 			}
-		return singleton;
+		}
 	}
+
+	private void walkFileTree(List<String> files, Path path) {
+		if (Files.isRegularFile(path)) {
+			files.add(path.toString());
+		} else if (Files.isDirectory(path)) try {
+			Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
+				@Override
+				public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+					files.add(file.toString());
+					return FileVisitResult.CONTINUE;
+				}
+				@Override
+				public FileVisitResult visitFileFailed(Path file, IOException e) {
+					logger.error("[Ext Loading] listing files", e);
+					return FileVisitResult.CONTINUE;
+				}
+			});
+		} catch (IOException e) {
+			logger.error("[Ext Loading] listing files", e);
+		} else {
+			logger.warn("[Ext Loading] file not found: " + path);
+		}
+	}
+
+	private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
+		try (InputStream is = new FileInputStream(file.toFile())) {
+			BufferedReader br = new BufferedReader(
+					new InputStreamReader(is, "UTF-8"), 512);
+			String word = br.readLine();
+			if (word != null) {
+				if (word.startsWith("\uFEFF"))
+					word = word.substring(1);
+				for (; word != null; word = br.readLine()) {
+					word = word.trim();
+					if (word.isEmpty()) continue;
+					dict.fillSegment(word.toCharArray());
+				}
+			}
+		} catch (FileNotFoundException e) {
+			logger.error("ik-analyzer: " + name + " not found", e);
+			if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
+		} catch (IOException e) {
+			logger.error("ik-analyzer: " + name + " loading failed", e);
+		}
+	}
+
+	private List<String> getExtDictionarys() {
+		List<String> extDictFiles = new ArrayList<String>(2);
+		String extDictCfg = getProperty(EXT_DICT);
+		if (extDictCfg != null) {
+
+			String[] filePaths = extDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					Path file = PathUtils.get(getDictRoot(), filePath.trim());
+					walkFileTree(extDictFiles, file);
+
+				}
+			}
+		}
+		return extDictFiles;
+	}
+
+	private List<String> getRemoteExtDictionarys() {
+		List<String> remoteExtDictFiles = new ArrayList<String>(2);
+		String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
+		if (remoteExtDictCfg != null) {
+
+			String[] filePaths = remoteExtDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					remoteExtDictFiles.add(filePath);
+
+				}
+			}
+		}
+		return remoteExtDictFiles;
+	}
+
+	private List<String> getExtStopWordDictionarys() {
+		List<String> extStopWordDictFiles = new ArrayList<String>(2);
+		String extStopWordDictCfg = getProperty(EXT_STOP);
+		if (extStopWordDictCfg != null) {
+
+			String[] filePaths = extStopWordDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					Path file = PathUtils.get(getDictRoot(), filePath.trim());
+					walkFileTree(extStopWordDictFiles, file);
+
+				}
+			}
+		}
+		return extStopWordDictFiles;
+	}
+
+	private List<String> getRemoteExtStopWordDictionarys() {
+		List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
+		String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
+		if (remoteExtStopWordDictCfg != null) {
+
+			String[] filePaths = remoteExtStopWordDictCfg.split(";");
+			for (String filePath : filePaths) {
+				if (filePath != null && !"".equals(filePath.trim())) {
+					remoteExtStopWordDictFiles.add(filePath);
+
+				}
+			}
+		}
+		return remoteExtStopWordDictFiles;
+	}
+
+	private String getDictRoot() {
+		return conf_dir.toAbsolutePath().toString();
+	}

 	/**
 	 * 获取词典单子实例
+	 *
 	 * @return Dictionary 单例对象
 	 */
 	public static Dictionary getSingleton() {
 		if (singleton == null) {
-			throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
+			throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
 		}
 		return singleton;
 	}

@@ -172,6 +333,7 @@ public class Dictionary {

 	/**
 	 * 批量加载新词条
-	 * @param words Collection<String>词条列表
+	 *
+	 * @param words
+	 *            Collection<String>词条列表
 	 */
 	public void addWords(Collection<String> words) {
 		if (words != null) {
@@ -180,6 +342,7 @@ public class Dictionary {

 	/**
 	 * 检索匹配主词典
+	 *
 	 * @return Hit 匹配结果描述
 	 */
 	public Hit matchInMainDict(char[] charArray) {
@@ -188,15 +351,16 @@ public class Dictionary {

 	/**
 	 * 检索匹配量词词典
+	 *
 	 * @return Hit 匹配结果描述
 	 */
 	public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
 		return singleton._QuantifierDict.match(charArray, begin, length);
 	}


 	/**
 	 * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
+	 *
 	 * @return Hit
 	 */
 	public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
@@ -204,9 +368,9 @@ public class Dictionary {
 		return ds.match(charArray, currentIndex, 1, matchedHit);
 	}


 	/**
 	 * 判断是否是停止词
+	 *
 	 * @return boolean
 	 */
 	public boolean isStopWord(char[] charArray, int begin, int length) {
@@ -221,38 +385,8 @@ public class Dictionary {
 		_MainDict = new DictSegment((char) 0);

 		// 读取主词典文件
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
-
-		InputStream is = null;
-		try {
-			is = new FileInputStream(file.toFile());
-		} catch (FileNotFoundException e) {
-			e.printStackTrace();
-		}
-
-		try {
-			BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-			String theWord = null;
-			do {
-				theWord = br.readLine();
-				if (theWord != null && !"".equals(theWord.trim())) {
-					_MainDict.fillSegment(theWord.trim().toCharArray());
-				}
-			} while (theWord != null);
-
-		} catch (IOException e) {
-			logger.error("ik-analyzer",e);
-
-		}finally{
-			try {
-				if(is != null){
-					is.close();
-					is = null;
-				}
-			} catch (IOException e) {
-				logger.error("ik-analyzer",e);
-			}
-		}
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
+		loadDictFile(_MainDict, file, false, "Main Dict");
 		// 加载扩展词典
 		this.loadExtDict();
 		// 加载远程自定义词库
@@ -264,65 +398,30 @@ public class Dictionary {
 	 */
 	private void loadExtDict() {
 		// 加载扩展词典配置
-		List<String> extDictFiles = configuration.getExtDictionarys();
+		List<String> extDictFiles = getExtDictionarys();
 		if (extDictFiles != null) {
-			InputStream is = null;
 			for (String extDictName : extDictFiles) {
 				// 读取扩展词典文件
 				logger.info("[Dict Loading] " + extDictName);
-				Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
-				try {
-					is = new FileInputStream(file.toFile());
-				} catch (FileNotFoundException e) {
-					logger.error("ik-analyzer",e);
-				}
-
-				//如果找不到扩展的字典,则忽略
-				if(is == null){
-					continue;
-				}
-				try {
-					BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-					String theWord = null;
-					do {
-						theWord = br.readLine();
-						if (theWord != null && !"".equals(theWord.trim())) {
-							//加载扩展词典数据到主内存词典中
-							_MainDict.fillSegment(theWord.trim().toCharArray());
-						}
-					} while (theWord != null);
-
-				} catch (IOException e) {
-					logger.error("ik-analyzer",e);
-				}finally{
-					try {
-						is.close();
-						is = null;
-					} catch (IOException e) {
-						logger.error("ik-analyzer",e);
-					}
-				}
+				Path file = PathUtils.get(extDictName);
+				loadDictFile(_MainDict, file, false, "Extra Dict");
 			}
 		}
 	}

 	/**
 	 * 加载远程扩展词典到主词库表
 	 */
 	private void loadRemoteExtDict() {
-		List<String> remoteExtDictFiles = configuration.getRemoteExtDictionarys();
+		List<String> remoteExtDictFiles = getRemoteExtDictionarys();
 		for (String location : remoteExtDictFiles) {
 			logger.info("[Dict Loading] " + location);
 			List<String> lists = getRemoteWords(location);

-			/** Redundant Nullcheck as the list is initialized in the getRemoteWords method
 			// 如果找不到扩展的字典,则忽略
 			if (lists == null) {
-				logger.error("[Dict Loading] "+location+"加载失败");
+				logger.error("[Dict Loading] " + location + " load failed");
 				continue;
-			}*/
+			}

 			for (String theWord : lists) {
 				if (theWord != null && !"".equals(theWord.trim())) {
 					// 加载扩展词典数据到主内存词典中
@@ -334,14 +433,21 @@ public class Dictionary {

 	}

+	private static List<String> getRemoteWords(String location) {
+		SpecialPermission.check();
+		return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
+			return getRemoteWordsUnprivileged(location);
+		});
+	}
+
 	/**
 	 * 从远程服务器上下载自定义词条
 	 */
-	private static List<String> getRemoteWords(String location){
+	private static List<String> getRemoteWordsUnprivileged(String location) {
+
 		List<String> buffer = new ArrayList<String>();
-		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
-				.setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
+		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
+				.setSocketTimeout(60 * 1000).build();
 		CloseableHttpClient httpclient = HttpClients.createDefault();
 		CloseableHttpResponse response;
 		BufferedReader in;
@@ -353,11 +459,18 @@ public class Dictionary {

 			String charset = "UTF-8";
 			// 获取编码,默认为utf-8
-			if(response.getEntity().getContentType().getValue().contains("charset=")){
-				String contentType=response.getEntity().getContentType().getValue();
-				charset=contentType.substring(contentType.lastIndexOf("=")+1);
+			HttpEntity entity = response.getEntity();
+			if(entity!=null){
+				Header contentType = entity.getContentType();
+				if(contentType!=null&&contentType.getValue()!=null){
+					String typeValue = contentType.getValue();
+					if(typeValue!=null&&typeValue.contains("charset=")){
+						charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
+					}
 				}
-			in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(),charset));
+
+				if (entity.getContentLength() > 0 || entity.isChunked()) {
+					in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
 				String line;
 				while ((line = in.readLine()) != null) {
 					buffer.add(line);
@@ -366,19 +479,15 @@ public class Dictionary {
 				response.close();
 				return buffer;
 			}
+			}
+		}
 			response.close();
-		} catch (ClientProtocolException e) {
-			logger.error( "getRemoteWords {} error" , e , location);
-		} catch (IllegalStateException e) {
-			logger.error( "getRemoteWords {} error" , e , location );
-		} catch (IOException e) {
+		} catch (IllegalStateException | IOException e) {
 			logger.error("getRemoteWords {} error", e, location);
 		}
 		return buffer;
 	}


 	/**
 	 * 加载用户扩展的停止词词典
 	 */
@@ -387,96 +496,31 @@ public class Dictionary {
 		_StopWords = new DictSegment((char) 0);

 		// 读取主词典文件
-		Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
+		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
+		loadDictFile(_StopWords, file, false, "Main Stopwords");
-		InputStream is = null;
|
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord = null;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
if(is != null){
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// 加载扩展停止词典
|
// 加载扩展停止词典
|
||||||
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
|
List<String> extStopWordDictFiles = getExtStopWordDictionarys();
|
||||||
if (extStopWordDictFiles != null) {
|
if (extStopWordDictFiles != null) {
|
||||||
is = null;
|
|
||||||
for (String extStopWordDictName : extStopWordDictFiles) {
|
for (String extStopWordDictName : extStopWordDictFiles) {
|
||||||
logger.info("[Dict Loading] " + extStopWordDictName);
|
logger.info("[Dict Loading] " + extStopWordDictName);
|
||||||
|
|
||||||
// 读取扩展词典文件
|
// 读取扩展词典文件
|
||||||
file=PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
|
file = PathUtils.get(extStopWordDictName);
|
||||||
try {
|
loadDictFile(_StopWords, file, false, "Extra Stopwords");
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
//如果找不到扩展的字典,则忽略
|
|
||||||
if(is == null){
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord = null;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
//加载扩展停止词典数据到内存中
|
|
||||||
_StopWords.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 加载远程停用词典
|
// 加载远程停用词典
|
||||||
List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
|
List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
|
||||||
for (String location : remoteExtStopWordDictFiles) {
|
for (String location : remoteExtStopWordDictFiles) {
|
||||||
logger.info("[Dict Loading] " + location);
|
logger.info("[Dict Loading] " + location);
|
||||||
List<String> lists = getRemoteWords(location);
|
List<String> lists = getRemoteWords(location);
|
||||||
|
|
||||||
/** Redundant Nullcheck as the list is initialized in the getRemoteWords method
|
|
||||||
// 如果找不到扩展的字典,则忽略
|
// 如果找不到扩展的字典,则忽略
|
||||||
if (lists == null) {
|
if (lists == null) {
|
||||||
logger.error("[Dict Loading] "+location+"加载失败");
|
logger.error("[Dict Loading] " + location + " load failed");
|
||||||
continue;
|
continue;
|
||||||
}*/
|
}
|
||||||
|
|
||||||
for (String theWord : lists) {
|
for (String theWord : lists) {
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
if (theWord != null && !"".equals(theWord.trim())) {
|
||||||
// 加载远程词典数据到主内存中
|
// 加载远程词典数据到主内存中
|
||||||
@ -486,7 +530,6 @@ public class Dictionary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -496,156 +539,38 @@ public class Dictionary {
|
|||||||
// 建立一个量词典实例
|
// 建立一个量词典实例
|
||||||
_QuantifierDict = new DictSegment((char) 0);
|
_QuantifierDict = new DictSegment((char) 0);
|
||||||
// 读取量词词典文件
|
// 读取量词词典文件
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
|
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
|
||||||
InputStream is = null;
|
loadDictFile(_QuantifierDict, file, false, "Quantifier");
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord = null;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
|
|
||||||
} catch (IOException ioe) {
|
|
||||||
logger.error("Quantifier Dictionary loading exception.");
|
|
||||||
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
if(is != null){
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void loadSurnameDict() {
|
private void loadSurnameDict() {
|
||||||
|
DictSegment _SurnameDict = new DictSegment((char) 0);
|
||||||
_SurnameDict = new DictSegment((char)0);
|
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
|
loadDictFile(_SurnameDict, file, true, "Surname");
|
||||||
InputStream is = null;
|
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
}
|
||||||
if(is == null){
|
|
||||||
throw new RuntimeException("Surname Dictionary not found!!!");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
_SurnameDict.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void loadSuffixDict() {
|
private void loadSuffixDict() {
|
||||||
|
DictSegment _SuffixDict = new DictSegment((char) 0);
|
||||||
_SuffixDict = new DictSegment((char)0);
|
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
|
loadDictFile(_SuffixDict, file, true, "Suffix");
|
||||||
InputStream is = null;
|
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
}
|
||||||
if(is == null){
|
|
||||||
throw new RuntimeException("Suffix Dictionary not found!!!");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
_SuffixDict.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void loadPrepDict() {
|
private void loadPrepDict() {
|
||||||
|
DictSegment _PrepDict = new DictSegment((char) 0);
|
||||||
_PrepDict = new DictSegment((char)0);
|
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
|
||||||
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
|
loadDictFile(_PrepDict, file, true, "Preposition");
|
||||||
InputStream is = null;
|
|
||||||
try {
|
|
||||||
is = new FileInputStream(file.toFile());
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
if(is == null){
|
|
||||||
throw new RuntimeException("Preposition Dictionary not found!!!");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
|
|
||||||
String theWord;
|
|
||||||
do {
|
|
||||||
theWord = br.readLine();
|
|
||||||
if (theWord != null && !"".equals(theWord.trim())) {
|
|
||||||
|
|
||||||
_PrepDict.fillSegment(theWord.trim().toCharArray());
|
|
||||||
}
|
|
||||||
} while (theWord != null);
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}finally{
|
|
||||||
try {
|
|
||||||
is.close();
|
|
||||||
is = null;
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("ik-analyzer",e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reLoadMainDict(){
|
void reLoadMainDict() {
|
||||||
logger.info("重新加载词典...");
|
logger.info("start to reload ik dict.");
|
||||||
// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
|
// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
|
||||||
Dictionary tmpDict = new Dictionary();
|
Dictionary tmpDict = new Dictionary(configuration);
|
||||||
tmpDict.configuration = getSingleton().configuration;
|
tmpDict.configuration = getSingleton().configuration;
|
||||||
tmpDict.loadMainDict();
|
tmpDict.loadMainDict();
|
||||||
tmpDict.loadStopWordDict();
|
tmpDict.loadStopWordDict();
|
||||||
_MainDict = tmpDict._MainDict;
|
_MainDict = tmpDict._MainDict;
|
||||||
_StopWords = tmpDict._StopWords;
|
_StopWords = tmpDict._StopWords;
|
||||||
logger.info("重新加载词典完毕...");
|
logger.info("reload ik dict finished.");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
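The structural change running through all of these hunks is that remote fetches now execute as privileged actions, which Elasticsearch's security manager requires of plugin code doing network I/O. A minimal, self-contained sketch of that pattern, using the `SpecialPermission.check()` / `doPrivileged` calls shown in the diff; `fetchWords()` here is a hypothetical stand-in for the actual HTTP download:

```java
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Collections;
import java.util.List;

import org.elasticsearch.SpecialPermission;

class PrivilegedFetch {
    static List<String> getWords(String location) {
        SpecialPermission.check(); // fails fast if the caller itself lacks permission
        // run the sensitive work with the plugin's own (granted) permissions
        return AccessController.doPrivileged(
                (PrivilegedAction<List<String>>) () -> fetchWords(location));
    }

    private static List<String> fetchWords(String location) {
        // hypothetical placeholder for the real HTTP download logic
        return Collections.emptyList();
    }
}
```

Splitting the method into a privileged wrapper plus an `...Unprivileged` worker, as the diff does, keeps the security-sensitive boundary small and obvious.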
@@ -1,15 +1,22 @@
 package org.wltea.analyzer.dic;

 import java.io.IOException;
+import java.security.AccessController;
+import java.security.PrivilegedAction;

 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpHead;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.SpecialPermission;
+import org.wltea.analyzer.help.ESPluginLoggerFactory;

 public class Monitor implements Runnable {

+    private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
+
     private static CloseableHttpClient httpclient = HttpClients.createDefault();
     /*
      * last modification time
@@ -30,6 +37,15 @@ public class Monitor implements Runnable {
         this.last_modified = null;
         this.eTags = null;
     }
+
+    public void run() {
+        SpecialPermission.check();
+        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
+            this.runUnprivileged();
+            return null;
+        });
+    }

     /**
      * Monitoring flow:
      * 1. send a HEAD request to the dictionary server
@@ -39,7 +55,7 @@ public class Monitor implements Runnable {
      * 5. sleep for one minute, then go back to step 1
      */
-    public void run() {
+    public void runUnprivileged() {

         // timeout settings
         RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
@@ -64,8 +80,8 @@ public class Monitor implements Runnable {
             // only act on HTTP 200
             if(response.getStatusLine().getStatusCode()==200){

-                if (!response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)
-                        ||!response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) {
+                if (((response.getLastHeader("Last-Modified")!=null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified))
+                        ||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) {

                     // the remote dictionary changed: reload it and update last_modified and eTags
                     Dictionary.getSingleton().reLoadMainDict();
@@ -76,18 +92,18 @@ public class Monitor implements Runnable {
                 // not modified, nothing to do
                 //noop
             }else{
-                Dictionary.logger.info("remote_ext_dict {} return bad code {}" , location , response.getStatusLine().getStatusCode() );
+                logger.info("remote_ext_dict {} return bad code {}" , location , response.getStatusLine().getStatusCode() );
             }
         } catch (Exception e) {
-            Dictionary.logger.error("remote_ext_dict {} error!",e , location);
+            logger.error("remote_ext_dict {} error!",e , location);
         }finally{
             try {
                 if (response != null) {
                     response.close();
                 }
             } catch (IOException e) {
-                e.printStackTrace();
+                logger.error(e.getMessage(), e);
             }
         }
     }
 }
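Monitor polls the remote dictionary with an HTTP HEAD request and treats a changed `Last-Modified` or `ETag` as a signal to reload. A minimal sketch of that conditional-polling idea with Apache HttpClient 4.x; the conditional request headers are standard HTTP, but the exact headers Monitor sends are not visible in these hunks, so treat this as an illustration rather than the plugin's code:

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

class HeadProbe {
    /** Returns true when the server reports content newer than what we cached. */
    static boolean changed(String location, String lastModified, String eTag) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpHead head = new HttpHead(location);
            // ask the server to answer 304 if nothing changed since our cached state
            if (lastModified != null) head.setHeader("If-Modified-Since", lastModified);
            if (eTag != null) head.setHeader("If-None-Match", eTag);
            try (CloseableHttpResponse resp = client.execute(head)) {
                if (resp.getStatusLine().getStatusCode() != 200) {
                    return false; // 304 (or an error) means: do not reload
                }
                // a 200 with fresh validators; the caller compares header values,
                // exactly as Monitor does with last_modified and eTags
                return resp.getLastHeader("Last-Modified") != null
                        || resp.getLastHeader("ETag") != null;
            }
        }
    }
}
```

The null guards the diff adds around `getLastHeader(...)` matter because a server is free to omit either validator, and the old code would throw a `NullPointerException` in that case.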
@@ -0,0 +1,27 @@
package org.wltea.analyzer.help;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.spi.ExtendedLogger;

public class ESPluginLoggerFactory {

    private ESPluginLoggerFactory() {
    }

    static public Logger getLogger(String name) {
        return getLogger("", LogManager.getLogger(name));
    }

    static public Logger getLogger(String prefix, String name) {
        return getLogger(prefix, LogManager.getLogger(name));
    }

    static public Logger getLogger(String prefix, Class<?> clazz) {
        return getLogger(prefix, LogManager.getLogger(clazz.getName()));
    }

    static public Logger getLogger(String prefix, Logger logger) {
        return (Logger) (prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger) logger, logger.getName(), prefix) : logger);
    }
}
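Usage is exactly what the Monitor and Sleep diffs show: classes hold a static logger obtained through the factory instead of the removed `ESLogger`/`Loggers` API.

```java
// Grounded in the diff: the same pattern Monitor now uses for its logger field.
import org.apache.logging.log4j.Logger;
import org.wltea.analyzer.help.ESPluginLoggerFactory;

class LoggingUser {
    private static final Logger logger = ESPluginLoggerFactory.getLogger(LoggingUser.class.getName());

    void doWork() {
        logger.info("ik dictionary monitor started");
    }
}
```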
@@ -0,0 +1,48 @@
package org.wltea.analyzer.help;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Marker;
import org.apache.logging.log4j.MarkerManager;
import org.apache.logging.log4j.message.Message;
import org.apache.logging.log4j.message.MessageFactory;
import org.apache.logging.log4j.spi.ExtendedLogger;
import org.apache.logging.log4j.spi.ExtendedLoggerWrapper;

import java.util.WeakHashMap;

public class PrefixPluginLogger extends ExtendedLoggerWrapper {
    private static final WeakHashMap<String, Marker> markers = new WeakHashMap();
    private final Marker marker;

    static int markersSize() {
        return markers.size();
    }

    public String prefix() {
        return this.marker.getName();
    }

    PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) {
        super(logger, name, (MessageFactory) null);
        String actualPrefix = prefix == null ? "" : prefix;
        MarkerManager.Log4jMarker actualMarker;
        synchronized (markers) {
            MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker) markers.get(actualPrefix);
            if (maybeMarker == null) {
                actualMarker = new MarkerManager.Log4jMarker(actualPrefix);
                markers.put(new String(actualPrefix), actualMarker);
            } else {
                actualMarker = maybeMarker;
            }
        }

        this.marker = (Marker) actualMarker;
    }

    public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) {
        assert marker == null;

        super.logMessage(fqcn, level, this.marker, message, t);
    }
}
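The class works by attaching a cached Log4j 2 `Marker` to every event it forwards; a pattern layout can then render that marker as a message prefix. A minimal sketch of the underlying mechanism using plain Log4j 2 APIs, independent of the wrapper above:

```java
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.Marker;
import org.apache.logging.log4j.MarkerManager;

class MarkerDemo {
    private static final Logger logger = LogManager.getLogger(MarkerDemo.class);
    // markers are interned by name, much like the WeakHashMap cache above
    private static final Marker IK = MarkerManager.getMarker("ik-analyzer");

    void run() {
        // rendered by a layout containing %marker, e.g. "[%marker] %msg%n"
        logger.info(IK, "loaded {} words", 42);
    }
}
```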
@@ -1,13 +1,15 @@
 package org.wltea.analyzer.help;

-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
+import org.apache.logging.log4j.Logger;

 public class Sleep {

-    public static final ESLogger logger= Loggers.getLogger("ik-analyzer");
-
-    public enum Type{MSEC,SEC,MIN,HOUR};
+    private static final Logger logger = ESPluginLoggerFactory.getLogger(Sleep.class.getName());
+
+    public enum Type {MSEC, SEC, MIN, HOUR}
+
+    ;

     public static void sleep(Type type, int num) {
         try {
             switch (type) {
@@ -15,20 +17,20 @@ public class Sleep {
                 Thread.sleep(num);
                 return;
             case SEC:
-                Thread.sleep(num*1000L);
+                Thread.sleep(num * 1000);
                 return;
             case MIN:
-                Thread.sleep(num*60*1000L);
+                Thread.sleep(num * 60 * 1000);
                 return;
             case HOUR:
-                Thread.sleep(num*60*60*1000L);
+                Thread.sleep(num * 60 * 60 * 1000);
                 return;
             default:
-                logger.error("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
+                System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
                 return;
             }
         } catch (InterruptedException e) {
-            e.printStackTrace();
+            logger.error(e.getMessage(), e);
         }
     }
 }
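One side effect worth noting: dropping the `L` suffix means `num * 60 * 60 * 1000` is now evaluated in `int` arithmetic, which silently overflows once the product exceeds `Integer.MAX_VALUE` (for the HOUR case, at `num >= 597`). A minimal sketch of the overflow-safe form the old code used:

```java
class SafeSleep {
    static void sleepHours(int num) throws InterruptedException {
        // the 1000L literal promotes the whole multiplication to long,
        // so no int overflow regardless of how large num is
        Thread.sleep(num * 60 * 60 * 1000L);
    }
}
```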
@@ -26,6 +26,7 @@ package org.wltea.analyzer.lucene;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.wltea.analyzer.cfg.Configuration;

 /**
  * IK analyzer, a Lucene Analyzer implementation
@@ -33,15 +34,7 @@ import org.apache.lucene.analysis.Tokenizer;
  */
 public final class IKAnalyzer extends Analyzer{

-    private boolean useSmart;
-
-    public boolean useSmart() {
-        return useSmart;
-    }
-
-    public void setUseSmart(boolean useSmart) {
-        this.useSmart = useSmart;
-    }
+    private Configuration configuration;

     /**
      * IK analyzer, Lucene Analyzer implementation class
@@ -54,11 +47,11 @@ public final class IKAnalyzer extends Analyzer{
     /**
      * IK analyzer, Lucene Analyzer implementation class
      *
-     * @param useSmart when true, the analyzer performs smart segmentation
+     * @param configuration the IK configuration
      */
-    public IKAnalyzer(boolean useSmart){
+    public IKAnalyzer(Configuration configuration){
         super();
-        this.useSmart = useSmart;
+        this.configuration = configuration;
     }

@@ -67,7 +60,7 @@ public final class IKAnalyzer extends Analyzer{
      */
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer _IKTokenizer = new IKTokenizer(useSmart);
+        Tokenizer _IKTokenizer = new IKTokenizer(configuration);
         return new TokenStreamComponents(_IKTokenizer);
     }
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.core.IKSegmenter;
 import org.wltea.analyzer.core.Lexeme;

@@ -64,16 +65,15 @@ public final class IKTokenizer extends Tokenizer {

     /**
      * Constructor for the Lucene 4.0 Tokenizer adapter
-     * @param in
      */
-    public IKTokenizer(boolean useSmart){
+    public IKTokenizer(Configuration configuration){
         super();
         offsetAtt = addAttribute(OffsetAttribute.class);
         termAtt = addAttribute(CharTermAttribute.class);
         typeAtt = addAttribute(TypeAttribute.class);
         posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-        _IKImplement = new IKSegmenter(input,useSmart);
+        _IKImplement = new IKSegmenter(input,configuration);
     }

     /* (non-Javadoc)
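Both classes now take the IK `Configuration` instead of a bare `useSmart` flag, so all tokenizer options travel through one object. A minimal usage sketch grounded in the constructors shown above; how a `Configuration` instance is actually built is not visible in these hunks, so the `config` parameter here is just a placeholder:

```java
import org.apache.lucene.analysis.Analyzer;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKAnalyzer;

class AnalyzerFactoryExample {
    Analyzer create(Configuration config) {
        // replaces the removed new IKAnalyzer(useSmart) constructor
        return new IKAnalyzer(config);
    }
}
```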
@@ -1,716 +0,0 @@
/**
 * IK Chinese word segmentation, release 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Linliangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
package org.wltea.analyzer.query;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;

/**
 * Simple IK query expression parser,
 * combined with the SWMCQuery algorithm
 *
 * Example expression:
 * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
 * @author linliangyi
 *
 */
public class IKQueryExpressionParser {

    public static final ESLogger logger= Loggers.getLogger("ik-analyzer");

    //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";

    private List<Element> elements = new ArrayList<Element>();

    private Stack<Query> querys = new Stack<Query>();

    private Stack<Element> operates = new Stack<Element>();

    /**
     * Parse a query expression and build a Lucene Query object
     *
     * @param expression
     * @param quickMode
     * @return Lucene query
     */
    public Query parseExp(String expression , boolean quickMode){
        Query lucenceQuery = null;
        if(expression != null && !"".equals(expression)){
            try{
                // lexical analysis
                this.splitElements(expression);
                // syntax analysis
                this.parseSyntax(quickMode);
                if(this.querys.size() == 1){
                    lucenceQuery = this.querys.pop();
                }else{
                    throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
                }
            }finally{
                elements.clear();
                querys.clear();
                operates.clear();
            }
        }
        return lucenceQuery;
    }

    /**
     * Lexical analysis of the expression
     * @param expression
     */
    private void splitElements(String expression){

        if(expression == null){
            return;
        }
        Element curretElement = null;

        char[] expChars = expression.toCharArray();
        for(int i = 0 ; i < expChars.length ; i++){
            switch(expChars[i]){
            case '&' :
                if(curretElement == null){
                    curretElement = new Element();
                    curretElement.type = '&';
                    curretElement.append(expChars[i]);
                }else if(curretElement.type == '&'){
                    curretElement.append(expChars[i]);
                    this.elements.add(curretElement);
                    curretElement = null;
                }else if(curretElement.type == '\''){
                    curretElement.append(expChars[i]);
                }else {
                    this.elements.add(curretElement);
                    curretElement = new Element();
                    curretElement.type = '&';
                    curretElement.append(expChars[i]);
                }
                break;

            case '|' :
                if(curretElement == null){
                    curretElement = new Element();
                    curretElement.type = '|';
                    curretElement.append(expChars[i]);
                }else if(curretElement.type == '|'){
                    curretElement.append(expChars[i]);
                    this.elements.add(curretElement);
                    curretElement = null;
                }else if(curretElement.type == '\''){
                    curretElement.append(expChars[i]);
                }else {
                    this.elements.add(curretElement);
                    curretElement = new Element();
                    curretElement.type = '|';
                    curretElement.append(expChars[i]);
                }
                break;

            case '-' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '-';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case '(' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '(';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case ')' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = ')';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case ':' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = ':';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case '=' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '=';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case ' ' :
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                    }else{
                        this.elements.add(curretElement);
                        curretElement = null;
                    }
                }

                break;

            case '\'' :
                if(curretElement == null){
                    curretElement = new Element();
                    curretElement.type = '\'';

                }else if(curretElement.type == '\''){
                    this.elements.add(curretElement);
                    curretElement = null;

                }else{
                    this.elements.add(curretElement);
                    curretElement = new Element();
                    curretElement.type = '\'';

                }
                break;

            case '[':
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '[';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case ']':
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = ']';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;

                break;

            case '{':
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '{';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;
                break;

            case '}':
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = '}';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;

                break;
            case ',':
                if(curretElement != null){
                    if(curretElement.type == '\''){
                        curretElement.append(expChars[i]);
                        continue;
                    }else{
                        this.elements.add(curretElement);
                    }
                }
                curretElement = new Element();
                curretElement.type = ',';
                curretElement.append(expChars[i]);
                this.elements.add(curretElement);
                curretElement = null;

                break;

            default :
                if(curretElement == null){
                    curretElement = new Element();
                    curretElement.type = 'F';
                    curretElement.append(expChars[i]);

                }else if(curretElement.type == 'F'){
                    curretElement.append(expChars[i]);

                }else if(curretElement.type == '\''){
                    curretElement.append(expChars[i]);

                }else{
                    this.elements.add(curretElement);
                    curretElement = new Element();
                    curretElement.type = 'F';
                    curretElement.append(expChars[i]);
                }
            }
        }

        if(curretElement != null){
            this.elements.add(curretElement);
            curretElement = null;
        }
    }

    /**
     * Syntax analysis
     *
     */
    private void parseSyntax(boolean quickMode){
        for(int i = 0 ; i < this.elements.size() ; i++){
            Element e = this.elements.get(i);
            if('F' == e.type){
                Element e2 = this.elements.get(i + 1);
                if('=' != e2.type && ':' != e2.type){
                    throw new IllegalStateException("表达式异常: = 或 : 号丢失");
                }
                Element e3 = this.elements.get(i + 2);
                // handle = and : operators
                if('\'' == e3.type){
                    i+=2;
                    if('=' == e2.type){
                        TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
                        this.querys.push(tQuery);
                    }else if(':' == e2.type){
                        String keyword = e3.toString();
                        //SWMCQuery Here
                        Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
                        this.querys.push(_SWMCQuery);
                    }

                }else if('[' == e3.type || '{' == e3.type){
                    i+=2;
                    // handle [] and {}
                    LinkedList<Element> eQueue = new LinkedList<Element>();
                    eQueue.add(e3);
                    for( i++ ; i < this.elements.size() ; i++){
                        Element eN = this.elements.get(i);
                        eQueue.add(eN);
                        if(']' == eN.type || '}' == eN.type){
                            break;
                        }
                    }
                    // translate into a RangeQuery
                    Query rangeQuery = this.toTermRangeQuery(e , eQueue);
                    this.querys.push(rangeQuery);
                }else{
                    throw new IllegalStateException("表达式异常:匹配值丢失");
                }

            }else if('(' == e.type){
                this.operates.push(e);

            }else if(')' == e.type){
                boolean doPop = true;
                while(doPop && !this.operates.empty()){
                    Element op = this.operates.pop();
                    if('(' == op.type){
                        doPop = false;
                    }else {
                        Query q = toBooleanQuery(op);
                        this.querys.push(q);
                    }

                }
            }else{

                if(this.operates.isEmpty()){
                    this.operates.push(e);
                }else{
                    boolean doPeek = true;
                    while(doPeek && !this.operates.isEmpty()){
                        Element eleOnTop = this.operates.peek();
                        if('(' == eleOnTop.type){
                            doPeek = false;
                            this.operates.push(e);
                        }else if(compare(e , eleOnTop) == 1){
                            this.operates.push(e);
                            doPeek = false;
                        }else if(compare(e , eleOnTop) == 0){
                            Query q = toBooleanQuery(eleOnTop);
                            this.operates.pop();
                            this.querys.push(q);
                        }else{
                            Query q = toBooleanQuery(eleOnTop);
                            this.operates.pop();
                            this.querys.push(q);
                        }
                    }

                    if(doPeek && this.operates.empty()){
                        this.operates.push(e);
                    }
                }
            }
        }

        while(!this.operates.isEmpty()){
            Element eleOnTop = this.operates.pop();
            Query q = toBooleanQuery(eleOnTop);
            this.querys.push(q);
        }
    }

    /**
     * Build a BooleanQuery from the logical operator
     * @param op
     * @return
     */
    private Query toBooleanQuery(Element op){
        if(this.querys.size() == 0){
            return null;
        }

        BooleanQuery resultQuery = new BooleanQuery();

        if(this.querys.size() == 1){
            return this.querys.get(0);
        }

        Query q2 = this.querys.pop();
        Query q1 = this.querys.pop();
        if('&' == op.type){
            if(q1 != null){
                if(q1 instanceof BooleanQuery){
                    BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
                    if(clauses.length > 0
                            && clauses[0].getOccur() == Occur.MUST){
                        for(BooleanClause c : clauses){
                            resultQuery.add(c);
                        }
                    }else{
                        resultQuery.add(q1,Occur.MUST);
                    }

                }else{
                    //q1 instanceof TermQuery
                    //q1 instanceof TermRangeQuery
                    //q1 instanceof PhraseQuery
                    //others
                    resultQuery.add(q1,Occur.MUST);
                }
            }

            if(q2 != null){
                if(q2 instanceof BooleanQuery){
                    BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
                    if(clauses.length > 0
                            && clauses[0].getOccur() == Occur.MUST){
                        for(BooleanClause c : clauses){
                            resultQuery.add(c);
                        }
                    }else{
                        resultQuery.add(q2,Occur.MUST);
                    }

                }else{
                    //q1 instanceof TermQuery
                    //q1 instanceof TermRangeQuery
                    //q1 instanceof PhraseQuery
                    //others
                    resultQuery.add(q2,Occur.MUST);
                }
            }

        }else if('|' == op.type){
            if(q1 != null){
                if(q1 instanceof BooleanQuery){
                    BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
                    if(clauses.length > 0
                            && clauses[0].getOccur() == Occur.SHOULD){
                        for(BooleanClause c : clauses){
                            resultQuery.add(c);
                        }
                    }else{
                        resultQuery.add(q1,Occur.SHOULD);
                    }

                }else{
                    //q1 instanceof TermQuery
                    //q1 instanceof TermRangeQuery
                    //q1 instanceof PhraseQuery
                    //others
                    resultQuery.add(q1,Occur.SHOULD);
                }
            }

            if(q2 != null){
                if(q2 instanceof BooleanQuery){
                    BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
                    if(clauses.length > 0
                            && clauses[0].getOccur() == Occur.SHOULD){
                        for(BooleanClause c : clauses){
                            resultQuery.add(c);
                        }
                    }else{
                        resultQuery.add(q2,Occur.SHOULD);
                    }
                }else{
                    //q2 instanceof TermQuery
                    //q2 instanceof TermRangeQuery
                    //q2 instanceof PhraseQuery
                    //others
                    resultQuery.add(q2,Occur.SHOULD);

                }
            }

        }else if('-' == op.type){
            if(q1 == null || q2 == null){
                throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
            }

            if(q1 instanceof BooleanQuery){
                BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
                if(clauses.length > 0){
                    for(BooleanClause c : clauses){
                        resultQuery.add(c);
                    }
                }else{
                    resultQuery.add(q1,Occur.MUST);
                }

            }else{
                //q1 instanceof TermQuery
                //q1 instanceof TermRangeQuery
                //q1 instanceof PhraseQuery
                //others
                resultQuery.add(q1,Occur.MUST);
            }

            resultQuery.add(q2,Occur.MUST_NOT);
        }
        return resultQuery;
    }

    /**
     * Assemble a TermRangeQuery
     * @param elements
     * @return
     */
    private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){

        boolean includeFirst = false;
        boolean includeLast = false;
        String firstValue = null;
        String lastValue = null;
        // check whether the first element is [ or {
        Element first = elements.getFirst();
        if('[' == first.type){
            includeFirst = true;
        }else if('{' == first.type){
            includeFirst = false;
        }else {
            throw new IllegalStateException("表达式异常");
        }
        // check whether the last element is ] or }
        Element last = elements.getLast();
        if(']' == last.type){
            includeLast = true;
        }else if('}' == last.type){
            includeLast = false;
        }else {
            throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
        }
        if(elements.size() < 4 || elements.size() > 5){
            throw new IllegalStateException("表达式异常, RangeQuery 错误");
        }
        // read the middle part
        Element e2 = elements.get(1);
        if('\'' == e2.type){
            firstValue = e2.toString();
            //
            Element e3 = elements.get(2);
            if(',' != e3.type){
                throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
            }
            //
            Element e4 = elements.get(3);
            if('\'' == e4.type){
                lastValue = e4.toString();
            }else if(e4 != last){
                throw new IllegalStateException("表达式异常,RangeQuery格式错误");
            }
        }else if(',' == e2.type){
            firstValue = null;
            //
            Element e3 = elements.get(2);
            if('\'' == e3.type){
                lastValue = e3.toString();
            }else{
                throw new IllegalStateException("表达式异常,RangeQuery格式错误");
            }

        }else {
            throw new IllegalStateException("表达式异常, RangeQuery格式错误");
        }

        return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast);
    }

    /**
     * Compare operator precedence
     * @param e1
     * @param e2
     * @return
     */
    private int compare(Element e1 , Element e2){
        if('&' == e1.type){
            if('&' == e2.type){
                return 0;
            }else {
                return 1;
            }
        }else if('|' == e1.type){
            if('&' == e2.type){
                return -1;
            }else if('|' == e2.type){
                return 0;
            }else{
                return 1;
            }
        }else{
            if('-' == e2.type){
                return 0;
            }else{
                return -1;
            }
        }
    }

    /**
     * Expression element (operator, FieldName, or FieldValue)
     * @author linliangyi
     * May 20, 2010
     */
    private class Element{
        char type = 0;
        StringBuffer eleTextBuff;

        public Element(){
            eleTextBuff = new StringBuffer();
        }

        public void append(char c){
            this.eleTextBuff.append(c);
        }

        public String toString(){
            return this.eleTextBuff.toString();
        }
    }

    public static void main(String[] args){
        IKQueryExpressionParser parser = new IKQueryExpressionParser();
        //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
        String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
        Query result = parser.parseExp(ikQueryExp , true);
        logger.info(result.toString());

    }

}
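The deleted parser was built on the old mutable `BooleanQuery` (`new BooleanQuery()` plus repeated `add(...)`), which later Lucene versions replaced with an immutable builder. For reference, a minimal sketch of the equivalent clause combination with the `BooleanQuery.Builder` API (available from Lucene 5.3 on, an assumption about why this class could simply be dropped), using the field names from the class's own example expression:

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

class BooleanQueryExample {
    static Query idAndNotName() {
        // equivalent of the parser's '&' (MUST) and '-' (MUST_NOT) handling
        return new BooleanQuery.Builder()
                .add(new TermQuery(new Term("id", "1231231")), Occur.MUST)
                .add(new TermQuery(new Term("name", "helloword")), Occur.MUST_NOT)
                .build();
    }
}
```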
@@ -1,154 +0,0 @@
/**
 * IK Chinese word segmentation, release 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Linliangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
package org.wltea.analyzer.query;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Single Word Multi Char Query Builder,
 * dedicated to the IK segmentation algorithm
 * @author linliangyi
 *
 */
public class SWMCQueryBuilder {

    /**
     * Build the SWMCQuery
     * @param fieldName
     * @param keywords
     * @param quickMode
     * @return Lucene Query
     */
    public static Query create(String fieldName ,String keywords , boolean quickMode){
        if(fieldName == null || keywords == null){
            throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
        }
        // 1. segment the keywords
        List<Lexeme> lexemes = doAnalyze(keywords);
        // 2. build the SWMCQuery from the segmentation result
        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
        return _SWMCQuery;
    }

    /**
     * Segment the keywords and return the lexeme list
     * @param keywords
     * @return
     */
    private static List<Lexeme> doAnalyze(String keywords){
        List<Lexeme> lexemes = new ArrayList<Lexeme>();

        IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords),true);
        try{
            Lexeme l = null;
            while( (l = ikSeg.next()) != null){
                lexemes.add(l);
            }
        }catch(IOException e){
            e.printStackTrace();
        }
        return lexemes;
    }

    /**
     * Build the SWMC query from the segmentation result
     * @param fieldName
     * @param quickMode
     * @return
     */
    private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
        // the full SWMC query expression
        StringBuffer keywordBuffer = new StringBuffer();
        // the condensed SWMC query expression
        StringBuffer keywordBuffer_Short = new StringBuffer();
        // length of the previous lexeme
        int lastLexemeLength = 0;
        // end position of the previous lexeme
        int lastLexemeEnd = -1;

        int shortCount = 0;
        int totalCount = 0;
        for(Lexeme l : lexemes){
            totalCount += l.getLength();
            // condensed expression
            if(l.getLength() > 1){
                keywordBuffer_Short.append(' ').append(l.getLexemeText());
                shortCount += l.getLength();
            }

            if(lastLexemeLength == 0){
                keywordBuffer.append(l.getLexemeText());
            }else if(lastLexemeLength == 1 && l.getLength() == 1
                    && lastLexemeEnd == l.getBeginPosition()){ // adjacent single-character lexemes are merged
                keywordBuffer.append(l.getLexemeText());
            }else{
                keywordBuffer.append(' ').append(l.getLexemeText());

            }
            lastLexemeLength = l.getLength();
            lastLexemeEnd = l.getEndPosition();
        }

        // build the SWMC query via Lucene's QueryParser
        QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        qp.setAutoGeneratePhraseQueries(true);

        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
            try {
                //System.out.println(keywordBuffer.toString());
                Query q = qp.parse(keywordBuffer_Short.toString());
                return q;
            } catch (ParseException e) {
                e.printStackTrace();
            }

        }else{
            if(keywordBuffer.length() > 0){
                try {
                    //System.out.println(keywordBuffer.toString());
                    Query q = qp.parse(keywordBuffer.toString());
                    return q;
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }
}
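The core step of the deleted `getSWMCQuery()` survives as a pattern: hand the whitespace-joined lexemes to Lucene's classic `QueryParser` with AND semantics and automatic phrase queries. A minimal standalone sketch of just that step, lifted directly from the code above:

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

class SwmcParseExample {
    static Query parse(String fieldName, String joinedTokens) throws ParseException {
        QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);   // every token must match
        qp.setAutoGeneratePhraseQueries(true);             // multi-term tokens become phrases
        return qp.parse(joinedTokens);                     // e.g. "中文 分词 例子"
    }
}
```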
@@ -1,90 +0,0 @@
/**
 * IK Chinese Analyzer, release 5.0.1
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 */
package org.wltea.analyzer.sample;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Demo of tokenizing with IKAnalyzer
 * 2012-10-22
 */
public class IKAnalzyerDemo {

	public static final ESLogger logger = Loggers.getLogger("ik-analyzer");

	public static void main(String[] args){
		// build the IK analyzer in smart segmentation mode
		Analyzer analyzer = new IKAnalyzer(true);

		// obtain a Lucene TokenStream
		TokenStream ts = null;
		try {
			ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
			// ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
			// token offset attribute
			OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
			// token text attribute
			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
			// token type attribute
			TypeAttribute type = ts.addAttribute(TypeAttribute.class);

			// reset the TokenStream (resets the underlying StringReader)
			ts.reset();
			// iterate over the tokenization results
			while (ts.incrementToken()) {
				logger.info(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
			}
			// perform end-of-stream operations, e.g. set the final offset
			ts.end();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// release all TokenStream resources (closes the StringReader)
			if(ts != null){
				try {
					ts.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}
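A note on the `new IKAnalyzer(true)` call in the demo above: in the IK analyzer the boolean constructor flag selects the segmentation mode. A minimal sketch of the two modes, assuming the smart/max-word behavior the plugin documents:

	// true  -> smart mode: coarser, query-oriented segmentation (ik_smart)
	Analyzer smartAnalyzer = new IKAnalyzer(true);
	// false -> finest-grained mode: maximal word splitting, index-oriented (ik_max_word)
	Analyzer fineAnalyzer = new IKAnalyzer(false);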
@@ -1,150 +0,0 @@
/**
 * IK Chinese Analyzer, release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 */
package org.wltea.analyzer.sample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Demo of Lucene indexing and search with IKAnalyzer
 * 2012-3-2
 *
 * Written against the Lucene 4.0 API
 */
public class LuceneIndexAndSearchDemo {

	public static final ESLogger logger = Loggers.getLogger("ik-analyzer");

	/**
	 * Simulation: create an index holding a single record, then search it.
	 * @param args
	 */
	public static void main(String[] args){
		// name of the Lucene document field
		String fieldName = "text";
		// content to index
		String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

		// instantiate the IK analyzer (smart mode)
		Analyzer analyzer = new IKAnalyzer(true);

		Directory directory = null;
		IndexWriter iwriter = null;
		IndexReader ireader = null;
		IndexSearcher isearcher = null;
		try {
			// build an in-memory index
			directory = new RAMDirectory();

			// configure the IndexWriter
			IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
			iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
			iwriter = new IndexWriter(directory, iwConfig);
			// write the index
			Document doc = new Document();
			doc.add(new StringField("ID", "10000", Field.Store.YES));
			doc.add(new TextField(fieldName, text, Field.Store.YES));
			iwriter.addDocument(doc);
			iwriter.close();

			// search phase **********************************
			// instantiate the searcher
			ireader = DirectoryReader.open(directory);
			isearcher = new IndexSearcher(ireader);

			String keyword = "中文分词工具包";
			// construct the Query object with the QueryParser
			QueryParser qp = new QueryParser(fieldName, analyzer);
			qp.setDefaultOperator(QueryParser.AND_OPERATOR);
			Query query = qp.parse(keyword);
			logger.info("Query = " + query);

			// retrieve the 5 most relevant records
			TopDocs topDocs = isearcher.search(query, 5);
			logger.info("Hits: " + topDocs.totalHits);
			// print the results (bounded by scoreDocs.length, not totalHits,
			// since totalHits may exceed the number of returned documents)
			ScoreDoc[] scoreDocs = topDocs.scoreDocs;
			for (int i = 0; i < scoreDocs.length; i++){
				Document targetDoc = isearcher.doc(scoreDocs[i].doc);
				logger.info("Content: " + targetDoc.toString());
			}

		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (LockObtainFailedException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		} finally {
			if(ireader != null){
				try {
					ireader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if(directory != null){
				try {
					directory.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}
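As an aside, the manual cleanup in the demo's finally block can be written more compactly with try-with-resources (Java 7+). A sketch under the same Lucene APIs, preserving the demo's order of operations:

	// assumes the enclosing method declares `throws IOException, ParseException`
	try (Directory dir = new RAMDirectory()) {
	    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
	        writer.addDocument(doc);
	    } // the writer is closed (and commits) here, so the reader below sees the document
	    try (IndexReader reader = DirectoryReader.open(dir)) {
	        IndexSearcher searcher = new IndexSearcher(reader);
	        // ... parse the keyword and search exactly as in the demo
	    }
	}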
@@ -38,21 +38,6 @@ version=${project.version}
 #
 # 'name': the plugin name
 name=${elasticsearch.plugin.name}
-
-### mandatory elements for site plugins:
-#
-# 'site': set to true to indicate contents of the _site/
-#  directory in the root of the plugin should be served.
-site=${elasticsearch.plugin.site}
-#
-### mandatory elements for jvm plugins :
-#
-# 'jvm': true if the 'classname' class should be loaded
-# from jar files in the root directory of the plugin.
-# Note that only jar files in the root directory are
-# added to the classpath for the plugin! If you need
-# other resources, package them into a resources jar.
-jvm=${elasticsearch.plugin.jvm}
 #
 # 'classname': the name of the class to load, fully-qualified.
 classname=${elasticsearch.plugin.classname}
@@ -69,12 +54,3 @@ java.version=${maven.compiler.target}
 # is loaded so Elasticsearch will refuse to start in the presence of
 # plugins with the incorrect elasticsearch.version.
 elasticsearch.version=${elasticsearch.version}
-#
-### deprecated elements for jvm plugins :
-#
-# 'isolated': true if the plugin should have its own classloader.
-# passing false is deprecated, and only intended to support plugins
-# that have hard dependencies against each other. If this is
-# not specified, then the plugin is isolated by default.
-isolated=${elasticsearch.plugin.isolated}
-#
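The two hunks above remove the `site`, `jvm`, and `isolated` keys, leaving only the fields that newer Elasticsearch releases still read from a plugin descriptor. A minimal sketch of the resulting file, using only keys visible in this diff (the Maven placeholders are resolved at build time):

	version=${project.version}
	name=${elasticsearch.plugin.name}
	classname=${elasticsearch.plugin.classname}
	java.version=${maven.compiler.target}
	elasticsearch.version=${elasticsearch.version}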
src/main/resources/plugin-security.policy (new file, 4 lines)
@@ -0,0 +1,4 @@
grant {
  // needed because of the hot reload functionality
  permission java.net.SocketPermission "*", "connect,resolve";
};
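For context on why this grant exists: with the Java security manager active, an Elasticsearch plugin must exercise a granted permission from inside a privileged block. A hedged sketch of the usual pattern around the hot-reload dictionary download (the helper method is hypothetical, not the plugin's actual code):

	// Illustrative only: run the network call in a privileged frame so the
	// SocketPermission granted in plugin-security.policy applies to it.
	// needs: java.security.AccessController, java.security.PrivilegedAction,
	//        org.elasticsearch.SpecialPermission
	SpecialPermission.check();
	AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
	    downloadRemoteDictionary(); // hypothetical helper that opens the HTTP connection
	    return null;
	});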
@@ -1,83 +0,0 @@
[Deleted: an IntelliJ IDEA UML diagram file (JAVA diagram, "Hierarchic Group" layout) rooted at org.elasticsearch.index.analysis.IKAnalysisBinderProcessor. It depicted IKAnalysisBinderProcessor extending AnalysisModule.AnalysisBinderProcessor, together with that class's nested binding types AnalyzersBindings, TokenizersBindings, TokenFiltersBindings, and CharFiltersBindings, each deriving from java.lang.Object; the remainder of the file was auto-generated node and edge layout coordinates.]