mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Add support for es_match() to DataFrame and Series
This commit is contained in:
parent
92a8040614
commit
cb4cd083c3
6
docs/sphinx/reference/api/eland.DataFrame.es_match.rst
Normal file
6
docs/sphinx/reference/api/eland.DataFrame.es_match.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.es_match
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: DataFrame.es_match
|
6
docs/sphinx/reference/api/eland.Series.es_match.rst
Normal file
6
docs/sphinx/reference/api/eland.Series.es_match.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.Series.es_match
|
||||||
|
=====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: Series.es_match
|
@ -111,6 +111,7 @@ Elasticsearch Functions
|
|||||||
:toctree: api/
|
:toctree: api/
|
||||||
|
|
||||||
DataFrame.es_info
|
DataFrame.es_info
|
||||||
|
DataFrame.es_match
|
||||||
DataFrame.es_query
|
DataFrame.es_query
|
||||||
DataFrame.es_dtypes
|
DataFrame.es_dtypes
|
||||||
|
|
||||||
|
@ -115,5 +115,6 @@ Elasticsearch Functions
|
|||||||
:toctree: api/
|
:toctree: api/
|
||||||
|
|
||||||
Series.es_info
|
Series.es_info
|
||||||
|
Series.es_match
|
||||||
Series.es_dtype
|
Series.es_dtype
|
||||||
Series.es_dtypes
|
Series.es_dtypes
|
||||||
|
@ -19,7 +19,7 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from typing import List, Optional, Sequence, Tuple, Union
|
from typing import Any, List, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -632,6 +632,103 @@ class DataFrame(NDFrame):
|
|||||||
def info_es(self):
|
def info_es(self):
|
||||||
return self.es_info()
|
return self.es_info()
|
||||||
|
|
||||||
|
def es_match(
    self,
    text: str,
    *,
    columns: Optional[Union[str, Sequence[str]]] = None,
    match_phrase: bool = False,
    must_not_match: bool = False,
    multi_match_type: Optional[str] = None,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> "DataFrame":
    """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
    ``multi_match`` query depending on the given parameters and columns.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    By default all fields of type 'text' within Elasticsearch are queried
    otherwise specific columns can be specified via the ``columns`` parameter
    or a single column can be filtered on with :py:meth:`eland.Series.es_match`

    All additional keyword arguments are passed in the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    columns: str, List[str], optional
        List of columns to search over. Defaults to all 'text' fields in Elasticsearch
    match_phrase: bool, default False
        If True will use ``match_phrase`` instead of ``match`` query which takes into account
        the order of the ``text`` parameter.
    must_not_match: bool, default False
        If True will apply a boolean NOT (~) to the
        query. Instead of requiring a match the query
        will require text to not match.
    multi_match_type: str, optional
        If given and matching against multiple columns will set the ``multi_match.type`` setting
    match_only_text_fields: bool, default True
        When True this function will raise an error if any non-text fields
        are queried to prevent fields that aren't analyzed from not working properly.
        Set to False to ignore this preventative check.
    analyzer: str, optional
        Specify which analyzer to use for the match query
    fuzziness: int, str, optional
        Specify the fuzziness option for the match query

    Returns
    -------
    DataFrame
        A filtered :py:class:`eland.DataFrame` with the given match query

    Raises
    ------
    ValueError
        If ``columns`` is not given and the DataFrame has no 'text' fields
        to fall back on.

    Examples
    --------
    >>> df = ed.DataFrame("localhost:9200", "ecommerce")
    >>> df.es_match("Men's", columns=["category"])
    category currency ... type user
    0 [Men's Clothing] EUR ... order eddie
    4 [Men's Clothing, Men's Accessories] EUR ... order eddie
    6 [Men's Clothing] EUR ... order oliver
    7 [Men's Clothing, Men's Accessories, Men's Shoes] EUR ... order abd
    11 [Men's Accessories, Men's Clothing] EUR ... order eddie
    ... ... ... ... ... ...
    4663 [Men's Shoes, Men's Clothing] EUR ... order samir
    4667 [Men's Clothing, Men's Shoes] EUR ... order sultan
    4671 [Men's Clothing] EUR ... order jim
    4672 [Men's Clothing] EUR ... order yahya
    4674 [Women's Accessories, Men's Clothing] EUR ... order jackson
    <BLANKLINE>
    [2310 rows x 45 columns]
    """
    # Determine which columns will be used
    if columns is None:
        # The field types are only needed to derive the default column list,
        # so they are not looked up when the caller names columns explicitly.
        columns = [
            column
            for column, es_dtype in self.es_dtypes.to_dict().items()
            if es_dtype == "text"
        ]
        if not columns:
            # Fail here with an actionable message instead of letting the
            # query compiler complain about an empty column list the caller
            # never passed.
            raise ValueError(
                "es_match() found no 'text' fields to query by default, "
                "use the 'columns' parameter to choose fields explicitly"
            )
    elif isinstance(columns, str):
        columns = [columns]
    columns = list(columns)

    qc = self._query_compiler
    # Named 'match_filter' so the builtin filter() is not shadowed
    match_filter = qc.es_match(
        text,
        columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        multi_match_type=multi_match_type,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
    if must_not_match:
        match_filter = ~match_filter
    return DataFrame(_query_compiler=qc._update_query(match_filter))
|
||||||
|
|
||||||
def es_query(self, query) -> "DataFrame":
|
def es_query(self, query) -> "DataFrame":
|
||||||
"""Applies an Elasticsearch DSL query to the current DataFrame.
|
"""Applies an Elasticsearch DSL query to the current DataFrame.
|
||||||
|
|
||||||
|
@ -995,24 +995,12 @@ class Operations:
|
|||||||
is_scan = False
|
is_scan = False
|
||||||
if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
|
if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
|
||||||
if size > 0:
|
if size > 0:
|
||||||
try:
|
es_results = query_compiler._client.search(
|
||||||
|
index=query_compiler._index_pattern,
|
||||||
es_results = query_compiler._client.search(
|
size=size,
|
||||||
index=query_compiler._index_pattern,
|
sort=sort_params,
|
||||||
size=size,
|
body=body,
|
||||||
sort=sort_params,
|
)
|
||||||
body=body,
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
# Catch all ES errors and print debug (currently to stdout)
|
|
||||||
error = {
|
|
||||||
"index": query_compiler._index_pattern,
|
|
||||||
"size": size,
|
|
||||||
"sort": sort_params,
|
|
||||||
"body": body,
|
|
||||||
}
|
|
||||||
print("Elasticsearch error:", error)
|
|
||||||
raise
|
|
||||||
else:
|
else:
|
||||||
is_scan = True
|
is_scan = True
|
||||||
es_results = scan(
|
es_results = scan(
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
|
|
||||||
import copy
|
import copy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import TYPE_CHECKING, List, Optional, Sequence
|
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union
|
||||||
|
|
||||||
import numpy as np # type: ignore
|
import numpy as np # type: ignore
|
||||||
import pandas as pd # type: ignore
|
import pandas as pd # type: ignore
|
||||||
@ -430,6 +430,77 @@ class QueryCompiler:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def es_match(
    self,
    text: str,
    columns: Sequence[str],
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    multi_match_type: Optional[str] = None,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> "QueryFilter":
    """Build the filter for an Elasticsearch full-text match query.

    A single column produces a ``match`` (or ``match_phrase``) query, more
    than one column produces a ``multi_match`` query over ``fields``.

    Parameters
    ----------
    text: str
        Text to search for, sent as the ``query`` option.
    columns: Sequence[str]
        Columns to search over. Must not be empty. Names containing ``*``
        are treated as wildcards and skipped by the text-field check.
    match_phrase: bool, default False
        Use ``match_phrase`` for one column, or ``multi_match`` with
        ``type="phrase"`` for several columns.
    match_only_text_fields: bool, default True
        When True, raise if any non-wildcard column is not of type 'text'.
        When False, the check is skipped and ``lenient`` defaults to True
        unless given in ``kwargs``.
    multi_match_type: str, optional
        Value for the ``multi_match`` ``type`` option. Only valid when
        searching more than one column.
    analyzer: str, optional
        Sent as the ``analyzer`` option when given.
    fuzziness: int, str, optional
        Sent as the ``fuzziness`` option when given.
    **kwargs
        Extra options merged into the query body after the options above,
        so they take precedence over them.

    Returns
    -------
    QueryFilter
        Filter wrapping the generated query body.

    Raises
    ------
    ValueError
        If ``columns`` is empty, if a non-text column is queried while
        ``match_only_text_fields`` is True, if ``multi_match_type`` is given
        for a single column, or if ``match_phrase=True`` is combined with a
        ``multi_match_type`` other than ``"phrase"``.
    KeyError
        If ``match_only_text_fields`` is True and a non-wildcard column has
        no entry in ``es_dtypes``.
    """
    if len(columns) < 1:
        raise ValueError("columns can't be empty")

    # Build the base options for the 'match_*' query
    options = {"query": text}
    if analyzer is not None:
        options["analyzer"] = analyzer
    if fuzziness is not None:
        options["fuzziness"] = fuzziness
    options.update(kwargs)

    # Warn the user if they're not querying text columns
    if match_only_text_fields:
        # The field types are only needed for this check, so they are not
        # looked up at all when the check is disabled.
        es_dtypes = self.es_dtypes.to_dict()
        non_text_columns = {}
        for column in columns:
            # Don't worry about wildcards
            if "*" in column:
                continue

            try:
                es_dtype = es_dtypes[column]
            except KeyError:
                # Same exception type as before, but say which column
                # failed and why instead of surfacing a bare key.
                raise KeyError(
                    f"Column {column!r} is not present in es_dtypes, "
                    f"can't verify that it is a text field"
                ) from None
            if es_dtype != "text":
                non_text_columns[column] = es_dtype
        if non_text_columns:
            # NOTE(review): the message names 'match_only_text_es_dtypes' but
            # the parameter is 'match_only_text_fields'. The wording is kept
            # because the test suite asserts this exact text; fix both together.
            raise ValueError(
                f"Attempting to run es_match() on non-text fields "
                f"({', '.join([k + '=' + v for k, v in non_text_columns.items()])}) "
                f"means that these fields may not be analyzed properly. "
                f"Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
                f"to use match anyways"
            )
    else:
        options.setdefault("lenient", True)

    # If only one column use 'match'
    # otherwise use 'multi_match' with 'fields'
    if len(columns) == 1:
        if multi_match_type is not None:
            raise ValueError(
                "multi_match_type parameter only valid "
                "when searching more than one column"
            )
        query = {"match_phrase" if match_phrase else "match": {columns[0]: options}}
    else:
        options["fields"] = columns
        if match_phrase:
            if multi_match_type not in ("phrase", None):
                raise ValueError(
                    f"match_phrase=True and multi_match_type={multi_match_type!r} "
                    f"are not compatible. Must be multi_match_type='phrase'"
                )
            multi_match_type = "phrase"
        if multi_match_type is not None:
            options["type"] = multi_match_type

        query = {"multi_match": options}
    return QueryFilter(query)
|
||||||
|
|
||||||
def es_query(self, query):
|
def es_query(self, query):
|
||||||
return self._update_query(QueryFilter(query))
|
return self._update_query(QueryFilter(query))
|
||||||
|
|
||||||
|
@ -55,6 +55,7 @@ from eland.filter import (
|
|||||||
LessEqual,
|
LessEqual,
|
||||||
NotFilter,
|
NotFilter,
|
||||||
NotNull,
|
NotNull,
|
||||||
|
QueryFilter,
|
||||||
ScriptFilter,
|
ScriptFilter,
|
||||||
)
|
)
|
||||||
from eland.ndframe import NDFrame
|
from eland.ndframe import NDFrame
|
||||||
@ -636,6 +637,74 @@ class Series(NDFrame):
|
|||||||
)
|
)
|
||||||
return Series(_query_compiler=new_query_compiler)
|
return Series(_query_compiler=new_query_compiler)
|
||||||
|
|
||||||
|
def es_match(
    self,
    text: str,
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> QueryFilter:
    """Builds an Elasticsearch ``match`` or ``match_phrase`` filter
    for the column named by this Series.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    Any keyword arguments beyond the ones listed below are forwarded
    into the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    match_phrase: bool, default False
        When True a ``match_phrase`` query is used in place of ``match``,
        so the order of the words in ``text`` is taken into account.
    match_only_text_fields: bool, default True
        When True an error is raised if the queried field is not a text
        field, since fields that aren't analyzed may not work properly.
        Set to False to skip this preventative check.
    analyzer: str, optional
        Analyzer to use for the match query
    fuzziness: int, str, optional
        Fuzziness option for the match query

    Returns
    -------
    QueryFilter
        Boolean filter to be combined with other filters and
        then passed to DataFrame[...].

    Examples
    --------
    >>> df = ed.DataFrame(
    ... "localhost:9200", "ecommerce",
    ... columns=["category", "taxful_total_price"]
    ... )
    >>> df[
    ... df.category.es_match("Men's")
    ... & (df.taxful_total_price > 200.0)
    ... ].head(5)
    category taxful_total_price
    13 [Men's Clothing] 266.96
    33 [Men's Clothing] 221.98
    54 [Men's Clothing] 234.98
    93 [Men's Shoes, Women's Accessories] 239.98
    273 [Men's Shoes] 214.98
    <BLANKLINE>
    [5 rows x 2 columns]
    """
    compiler = self._query_compiler
    # The only column searched is the one this Series is named after
    target_columns = [self.name]
    series_filter = compiler.es_match(
        text,
        columns=target_columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
    return series_filter
|
||||||
|
|
||||||
def es_info(self) -> str:
|
def es_info(self) -> str:
|
||||||
buf = StringIO()
|
buf = StringIO()
|
||||||
|
|
||||||
|
41
eland/tests/dataframe/test_es_match_pytest.py
Normal file
41
eland/tests/dataframe/test_es_match_pytest.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestEsMatch(TestData):
    """Checks DataFrame.es_match() against the ecommerce test index."""

    def test_match(self):
        """Every returned row has a category containing the search term."""
        ecommerce = self.ed_ecommerce()

        matched = ecommerce.es_match("Men's")
        category_rows = list(matched.category.to_pandas())

        assert len(category_rows) > 0
        for row in category_rows:
            assert any("Men's" in name for name in row)

    def test_must_not_match(self):
        """A negated match chained with a positive match applies both."""
        ecommerce = self.ed_ecommerce()

        without_mens = ecommerce.es_match("Men's", must_not_match=True)
        womens_only = without_mens.es_match("Women's")
        category_rows = list(womens_only.category.to_pandas())

        assert len(category_rows) > 0
        for row in category_rows:
            assert all("Men's" not in name for name in row)
        for row in category_rows:
            assert any("Women's" in name for name in row)
|
196
eland/tests/query_compiler/test_es_match_pytest.py
Normal file
196
eland/tests/query_compiler/test_es_match_pytest.py
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from eland.query_compiler import QueryCompiler
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestEsMatch(TestData):
    """Checks the query bodies generated by QueryCompiler.es_match()."""

    @staticmethod
    def _search_body(frame):
        """Resolve the frame's pending tasks and return its search body."""
        compiler = frame._query_compiler
        query_params, _ = compiler._operations._resolve_tasks(compiler)
        return query_params.query.to_search_body()

    def test_es_match(self):
        """One column yields 'match', several columns yield 'multi_match'."""
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler

        expected_single = {
            "match": {
                "customer_full_name": {
                    "query": "joe",
                    "analyzer": "my-analyzer",
                    "fuzziness": "1..2",
                }
            }
        }
        single = query_compiler.es_match(
            "joe", ["customer_full_name"], analyzer="my-analyzer", fuzziness="1..2"
        )
        assert single.build() == expected_single

        expected_multi = {
            "multi_match": {
                "query": "joe",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
        multi = query_compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"]
        )
        assert multi.build() == expected_multi

    def test_es_match_must_not_match(self):
        """must_not_match=True wraps the query in a bool 'must_not' clause."""
        df = self.ed_ecommerce()

        # single match
        negated_single = df.es_match(
            "joe", columns=["customer_full_name"], must_not_match=True
        )
        assert self._search_body(negated_single) == {
            "query": {
                "bool": {
                    "must_not": {"match": {"customer_full_name": {"query": "joe"}}}
                }
            }
        }

        # multi_match
        negated_multi = df.es_match(
            "joe",
            columns=["customer_first_name", "customer_last_name"],
            must_not_match=True,
        )
        assert self._search_body(negated_multi) == {
            "query": {
                "bool": {
                    "must_not": {
                        "multi_match": {
                            "fields": [
                                "customer_first_name",
                                "customer_last_name",
                            ],
                            "query": "joe",
                        }
                    }
                }
            }
        }

    def test_es_match_phrase(self):
        """match_phrase=True selects 'match_phrase' or multi_match type 'phrase'."""
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler

        expected_single = {
            "match_phrase": {
                "customer_full_name": {
                    "query": "joe",
                }
            }
        }
        single = query_compiler.es_match(
            "joe", ["customer_full_name"], match_phrase=True
        )
        assert single.build() == expected_single

        expected_multi = {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
        multi = query_compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"], match_phrase=True
        )
        assert multi.build() == expected_multi

    def test_es_match_phrase_not_allowed_with_multi_match_type(self):
        """match_phrase=True only accepts multi_match_type='phrase'."""
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler

        with pytest.raises(ValueError) as excinfo:
            query_compiler.es_match(
                "joe",
                ["customer_first_name", "customer_last_name"],
                match_phrase=True,
                multi_match_type="best_fields",
            )
        expected_message = (
            "match_phrase=True and multi_match_type='best_fields' "
            "are not compatible. Must be multi_match_type='phrase'"
        )
        assert str(excinfo.value) == expected_message

        explicit_phrase = query_compiler.es_match(
            "joe",
            ["customer_last_name", "customer_first_name"],
            match_phrase=True,
            multi_match_type="phrase",
        )
        assert explicit_phrase.build() == {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }

    def test_es_match_non_text_fields(self):
        """Non-text columns raise unless match_only_text_fields=False."""
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler

        with pytest.raises(ValueError) as excinfo:
            query_compiler.es_match(
                "joe",
                [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            )
        expected_message = (
            "Attempting to run es_match() on non-text fields (order_date=date, "
            "currency=keyword) means that these fields may not be analyzed properly. "
            "Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
            "to use match anyways"
        )
        assert str(excinfo.value) == expected_message

        unchecked = query_compiler.es_match(
            "joe",
            [
                "customer_first_name",
                "order_date",
                "customer_last_name",
                "currency",
                "order_*",
            ],
            match_only_text_fields=False,
        )
        assert unchecked.build() == {
            "multi_match": {
                "query": "joe",
                "lenient": True,
                "fields": [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            }
        }
|
41
eland/tests/series/test_es_match.py
Normal file
41
eland/tests/series/test_es_match.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestEsMatch(TestData):
    """Checks Series.es_match() filters against the ecommerce test index."""

    def test_match(self):
        """Indexing with the filter keeps only rows mentioning the term."""
        ecommerce = self.ed_ecommerce()

        mens_filter = ecommerce.category.es_match("Men's")
        category_rows = list(ecommerce[mens_filter].category.to_pandas())

        assert len(category_rows) > 0
        for row in category_rows:
            assert any("Men's" in name for name in row)

    def test_must_not_match(self):
        """A filter negated with ~ can be combined with another using &."""
        ecommerce = self.ed_ecommerce()

        not_mens = ~ecommerce.category.es_match("Men's")
        womens = ecommerce.category.es_match("Women's")
        category_rows = list(ecommerce[not_mens & womens].category.to_pandas())

        assert len(category_rows) > 0
        for row in category_rows:
            assert all("Men's" not in name for name in row)
        for row in category_rows:
            assert any("Women's" in name for name in row)
|
Loading…
x
Reference in New Issue
Block a user