diff --git a/docs/sphinx/reference/api/eland.DataFrame.es_match.rst b/docs/sphinx/reference/api/eland.DataFrame.es_match.rst new file mode 100644 index 0000000..7c57624 --- /dev/null +++ b/docs/sphinx/reference/api/eland.DataFrame.es_match.rst @@ -0,0 +1,6 @@ +eland.DataFrame.es_match +======================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.es_match diff --git a/docs/sphinx/reference/api/eland.Series.es_match.rst b/docs/sphinx/reference/api/eland.Series.es_match.rst new file mode 100644 index 0000000..f12c43d --- /dev/null +++ b/docs/sphinx/reference/api/eland.Series.es_match.rst @@ -0,0 +1,6 @@ +eland.Series.es_match +===================== + +.. currentmodule:: eland + +.. automethod:: Series.es_match diff --git a/docs/sphinx/reference/dataframe.rst b/docs/sphinx/reference/dataframe.rst index 1c0c86c..72869aa 100644 --- a/docs/sphinx/reference/dataframe.rst +++ b/docs/sphinx/reference/dataframe.rst @@ -111,6 +111,7 @@ Elasticsearch Functions :toctree: api/ DataFrame.es_info + DataFrame.es_match DataFrame.es_query DataFrame.es_dtypes diff --git a/docs/sphinx/reference/series.rst b/docs/sphinx/reference/series.rst index ac809dd..3e34a2f 100644 --- a/docs/sphinx/reference/series.rst +++ b/docs/sphinx/reference/series.rst @@ -115,5 +115,6 @@ Elasticsearch Functions :toctree: api/ Series.es_info + Series.es_match Series.es_dtype Series.es_dtypes diff --git a/eland/dataframe.py b/eland/dataframe.py index 550c7e4..f13beeb 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,7 +19,7 @@ import re import sys import warnings from io import StringIO -from typing import List, Optional, Sequence, Tuple, Union +from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -632,6 +632,103 @@ class DataFrame(NDFrame): def info_es(self): return self.es_info() + def es_match( + self, + text: str, + *, + columns: Optional[Union[str, Sequence[str]]] = None, + match_phrase: bool = False, + must_not_match: 
bool = False, + multi_match_type: Optional[str] = None, + match_only_text_fields: bool = True, + analyzer: Optional[str] = None, + fuzziness: Optional[Union[int, str]] = None, + **kwargs: Any, + ) -> "DataFrame": + """Filters data with an Elasticsearch ``match``, ``match_phrase``, or + ``multi_match`` query depending on the given parameters and columns. + + Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_ + + By default all fields of type 'text' within Elasticsearch are queried + otherwise specific columns can be specified via the ``columns`` parameter + or a single column can be filtered on with :py:meth:`eland.Series.es_match` + + All additional keyword arguments are passed in the body of the match query. + + Parameters + ---------- + text: str + String of text to search for + columns: str, List[str], optional + List of columns to search over. Defaults to all 'text' fields in Elasticsearch + match_phrase: bool, default False + If True will use ``match_phrase`` instead of ``match`` query which takes into account + the order of the ``text`` parameter. + must_not_match: bool, default False + If True will apply a boolean NOT (~) to the + query. Instead of requiring a match the query + will require text to not match. + multi_match_type: str, optional + If given and matching against multiple columns will set the ``multi_match.type`` setting + match_only_text_fields: bool, default True + When True this function will raise an error if any non-text fields + are queried to prevent fields that aren't analyzed from not working properly. + Set to False to ignore this preventative check. 
+ analyzer: str, optional + Specify which analyzer to use for the match query + fuzziness: int, str, optional + Specify the fuzziness option for the match query + + Returns + ------- + DataFrame + A filtered :py:class:`eland.DataFrame` with the given match query + + Examples + -------- + >>> df = ed.DataFrame("localhost:9200", "ecommerce") + >>> df.es_match("Men's", columns=["category"]) + category currency ... type user + 0 [Men's Clothing] EUR ... order eddie + 4 [Men's Clothing, Men's Accessories] EUR ... order eddie + 6 [Men's Clothing] EUR ... order oliver + 7 [Men's Clothing, Men's Accessories, Men's Shoes] EUR ... order abd + 11 [Men's Accessories, Men's Clothing] EUR ... order eddie + ... ... ... ... ... ... + 4663 [Men's Shoes, Men's Clothing] EUR ... order samir + 4667 [Men's Clothing, Men's Shoes] EUR ... order sultan + 4671 [Men's Clothing] EUR ... order jim + 4672 [Men's Clothing] EUR ... order yahya + 4674 [Women's Accessories, Men's Clothing] EUR ... order jackson + + [2310 rows x 45 columns] + """ + # Determine which columns will be used + es_dtypes = self.es_dtypes.to_dict() + if columns is None: + columns = [ + column for column, es_dtype in es_dtypes.items() if es_dtype == "text" + ] + elif isinstance(columns, str): + columns = [columns] + columns = list(columns) + + qc = self._query_compiler + filter = qc.es_match( + text, + columns, + match_phrase=match_phrase, + match_only_text_fields=match_only_text_fields, + multi_match_type=multi_match_type, + analyzer=analyzer, + fuzziness=fuzziness, + **kwargs, + ) + if must_not_match: + filter = ~filter + return DataFrame(_query_compiler=qc._update_query(filter)) + def es_query(self, query) -> "DataFrame": """Applies an Elasticsearch DSL query to the current DataFrame. 
diff --git a/eland/operations.py b/eland/operations.py index 909ea0c..ea376ae 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -995,24 +995,12 @@ class Operations: is_scan = False if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW: if size > 0: - try: - - es_results = query_compiler._client.search( - index=query_compiler._index_pattern, - size=size, - sort=sort_params, - body=body, - ) - except Exception: - # Catch all ES errors and print debug (currently to stdout) - error = { - "index": query_compiler._index_pattern, - "size": size, - "sort": sort_params, - "body": body, - } - print("Elasticsearch error:", error) - raise + es_results = query_compiler._client.search( + index=query_compiler._index_pattern, + size=size, + sort=sort_params, + body=body, + ) else: is_scan = True es_results = scan( diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 85b2c66..f3a4b07 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -17,7 +17,7 @@ import copy from datetime import datetime -from typing import TYPE_CHECKING, List, Optional, Sequence +from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union import numpy as np # type: ignore import pandas as pd # type: ignore @@ -430,6 +430,77 @@ class QueryCompiler: return result + def es_match( + self, + text: str, + columns: Sequence[str], + *, + match_phrase: bool = False, + match_only_text_fields: bool = True, + multi_match_type: Optional[str] = None, + analyzer: Optional[str] = None, + fuzziness: Optional[Union[int, str]] = None, + **kwargs: Any, + ) -> QueryFilter: + if len(columns) < 1: + raise ValueError("columns can't be empty") + + es_dtypes = self.es_dtypes.to_dict() + + # Build the base options for the 'match_*' query + options = {"query": text} + if analyzer is not None: + options["analyzer"] = analyzer + if fuzziness is not None: + options["fuzziness"] = fuzziness + options.update(kwargs) + + # Warn the user if they're not querying text columns + if 
match_only_text_fields: + non_text_columns = {} + for column in columns: + # Don't worry about wildcards + if "*" in column: + continue + + es_dtype = es_dtypes[column] + if es_dtype != "text": + non_text_columns[column] = es_dtype + if non_text_columns: + raise ValueError( + f"Attempting to run es_match() on non-text fields " + f"({', '.join([k + '=' + v for k, v in non_text_columns.items()])}) " + f"means that these fields may not be analyzed properly. " + f"Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' " + f"to use match anyways" + ) + else: + options.setdefault("lenient", True) + + # If only one column use 'match' + # otherwise use 'multi_match' with 'fields' + if len(columns) == 1: + if multi_match_type is not None: + raise ValueError( + "multi_match_type parameter only valid " + "when searching more than one column" + ) + query = {"match_phrase" if match_phrase else "match": {columns[0]: options}} + else: + options["fields"] = columns + if match_phrase: + if multi_match_type not in ("phrase", None): + raise ValueError( + f"match_phrase=True and multi_match_type={multi_match_type!r} " + f"are not compatible. 
Must be multi_match_type='phrase'" + ) + multi_match_type = "phrase" + if multi_match_type is not None: + options["type"] = multi_match_type + + query = {"multi_match": options} + return QueryFilter(query) + def es_query(self, query): return self._update_query(QueryFilter(query)) diff --git a/eland/series.py b/eland/series.py index a2089bc..392da82 100644 --- a/eland/series.py +++ b/eland/series.py @@ -55,6 +55,7 @@ from eland.filter import ( LessEqual, NotFilter, NotNull, + QueryFilter, ScriptFilter, ) from eland.ndframe import NDFrame @@ -636,6 +637,74 @@ class Series(NDFrame): ) return Series(_query_compiler=new_query_compiler) + def es_match( + self, + text: str, + *, + match_phrase: bool = False, + match_only_text_fields: bool = True, + analyzer: Optional[str] = None, + fuzziness: Optional[Union[int, str]] = None, + **kwargs: Any, + ) -> QueryFilter: + """Filters data with an Elasticsearch ``match`` or ``match_phrase`` + query depending on the given parameters. + + Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_ + + All additional keyword arguments are passed in the body of the match query. + + Parameters + ---------- + text: str + String of text to search for + match_phrase: bool, default False + If True will use ``match_phrase`` instead of ``match`` query which takes into account + the order of the ``text`` parameter. + match_only_text_fields: bool, default True + When True this function will raise an error if any non-text fields + are queried to prevent fields that aren't analyzed from not working properly. + Set to False to ignore this preventative check. + analyzer: str, optional + Specify which analyzer to use for the match query + fuzziness: int, str, optional + Specify the fuzziness option for the match query + + Returns + ------- + QueryFilter + Boolean filter to be combined with other filters and + then passed to DataFrame[...]. + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost:9200", "ecommerce", + ... 
columns=["category", "taxful_total_price"] + ... ) + >>> df[ + ... df.category.es_match("Men's") + ... & (df.taxful_total_price > 200.0) + ... ].head(5) + category taxful_total_price + 13 [Men's Clothing] 266.96 + 33 [Men's Clothing] 221.98 + 54 [Men's Clothing] 234.98 + 93 [Men's Shoes, Women's Accessories] 239.98 + 273 [Men's Shoes] 214.98 + + [5 rows x 2 columns] + """ + return self._query_compiler.es_match( + text, + columns=[self.name], + match_phrase=match_phrase, + match_only_text_fields=match_only_text_fields, + analyzer=analyzer, + fuzziness=fuzziness, + **kwargs, + ) + def es_info(self) -> str: buf = StringIO() diff --git a/eland/tests/dataframe/test_es_match_pytest.py b/eland/tests/dataframe/test_es_match_pytest.py new file mode 100644 index 0000000..67923df --- /dev/null +++ b/eland/tests/dataframe/test_es_match_pytest.py @@ -0,0 +1,41 @@ +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# File called _pytest for PyCharm compatibility + +from eland.tests.common import TestData + + +class TestEsMatch(TestData): + def test_match(self): + df = self.ed_ecommerce() + + categories = list(df.es_match("Men's").category.to_pandas()) + assert len(categories) > 0 + assert all(any("Men's" in y for y in x) for x in categories) + + def test_must_not_match(self): + df = self.ed_ecommerce() + + categories = list( + df.es_match("Men's", must_not_match=True) + .es_match("Women's") + .category.to_pandas() + ) + assert len(categories) > 0 + assert all(all("Men's" not in y for y in x) for x in categories) + assert all(any("Women's" in y for y in x) for x in categories) diff --git a/eland/tests/query_compiler/test_es_match_pytest.py b/eland/tests/query_compiler/test_es_match_pytest.py new file mode 100644 index 0000000..f0625d7 --- /dev/null +++ b/eland/tests/query_compiler/test_es_match_pytest.py @@ -0,0 +1,196 @@ +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# File called _pytest for PyCharm compatibility + +import pytest + +from eland.query_compiler import QueryCompiler +from eland.tests.common import TestData + + +class TestEsMatch(TestData): + def test_es_match(self): + df = self.ed_ecommerce() + query_compiler: QueryCompiler = df._query_compiler + + filter = query_compiler.es_match( + "joe", ["customer_full_name"], analyzer="my-analyzer", fuzziness="1..2" + ) + assert filter.build() == { + "match": { + "customer_full_name": { + "query": "joe", + "analyzer": "my-analyzer", + "fuzziness": "1..2", + } + } + } + + filter = query_compiler.es_match( + "joe", ["customer_last_name", "customer_first_name"] + ) + assert filter.build() == { + "multi_match": { + "query": "joe", + "fields": ["customer_last_name", "customer_first_name"], + } + } + + def test_es_match_must_not_match(self): + df = self.ed_ecommerce() + + # single match + df2 = df.es_match("joe", columns=["customer_full_name"], must_not_match=True) + query_params, _ = df2._query_compiler._operations._resolve_tasks( + df2._query_compiler + ) + assert query_params.query.to_search_body() == { + "query": { + "bool": { + "must_not": {"match": {"customer_full_name": {"query": "joe"}}} + } + } + } + + # multi_match + df2 = df.es_match( + "joe", + columns=["customer_first_name", "customer_last_name"], + must_not_match=True, + ) + query_params, _ = df2._query_compiler._operations._resolve_tasks( + df2._query_compiler + ) + assert query_params.query.to_search_body() == { + "query": { + "bool": { + "must_not": { + "multi_match": { + "fields": [ + "customer_first_name", + "customer_last_name", + ], + "query": "joe", + } + } + } + } + } + + def test_es_match_phrase(self): + df = self.ed_ecommerce() + query_compiler: QueryCompiler = df._query_compiler + + filter = query_compiler.es_match( + "joe", ["customer_full_name"], match_phrase=True + ) + assert filter.build() == { + "match_phrase": { + "customer_full_name": { + "query": "joe", + } + } + } + + filter = 
query_compiler.es_match( + "joe", ["customer_last_name", "customer_first_name"], match_phrase=True + ) + assert filter.build() == { + "multi_match": { + "query": "joe", + "type": "phrase", + "fields": ["customer_last_name", "customer_first_name"], + } + } + + def test_es_match_phrase_not_allowed_with_multi_match_type(self): + df = self.ed_ecommerce() + query_compiler: QueryCompiler = df._query_compiler + + with pytest.raises(ValueError) as e: + query_compiler.es_match( + "joe", + ["customer_first_name", "customer_last_name"], + match_phrase=True, + multi_match_type="best_fields", + ) + assert str(e.value) == ( + "match_phrase=True and multi_match_type='best_fields' " + "are not compatible. Must be multi_match_type='phrase'" + ) + + filter = query_compiler.es_match( + "joe", + ["customer_last_name", "customer_first_name"], + match_phrase=True, + multi_match_type="phrase", + ) + assert filter.build() == { + "multi_match": { + "query": "joe", + "type": "phrase", + "fields": ["customer_last_name", "customer_first_name"], + } + } + + def test_es_match_non_text_fields(self): + df = self.ed_ecommerce() + query_compiler: QueryCompiler = df._query_compiler + + with pytest.raises(ValueError) as e: + query_compiler.es_match( + "joe", + [ + "customer_first_name", + "order_date", + "customer_last_name", + "currency", + "order_*", + ], + ) + assert str(e.value) == ( + "Attempting to run es_match() on non-text fields (order_date=date, " + "currency=keyword) means that these fields may not be analyzed properly. 
" + "Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' " + "to use match anyways" + ) + + filter = query_compiler.es_match( + "joe", + [ + "customer_first_name", + "order_date", + "customer_last_name", + "currency", + "order_*", + ], + match_only_text_fields=False, + ) + assert filter.build() == { + "multi_match": { + "query": "joe", + "lenient": True, + "fields": [ + "customer_first_name", + "order_date", + "customer_last_name", + "currency", + "order_*", + ], + } + } diff --git a/eland/tests/series/test_es_match.py b/eland/tests/series/test_es_match.py new file mode 100644 index 0000000..d29b1a1 --- /dev/null +++ b/eland/tests/series/test_es_match.py @@ -0,0 +1,41 @@ +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# File called _pytest for PyCharm compatibility + +from eland.tests.common import TestData + + +class TestEsMatch(TestData): + def test_match(self): + df = self.ed_ecommerce() + + categories = list(df[df.category.es_match("Men's")].category.to_pandas()) + assert len(categories) > 0 + assert all(any("Men's" in y for y in x) for x in categories) + + def test_must_not_match(self): + df = self.ed_ecommerce() + + categories = list( + df[ + ~df.category.es_match("Men's") & df.category.es_match("Women's") + ].category.to_pandas() + ) + assert len(categories) > 0 + assert all(all("Men's" not in y for y in x) for x in categories) + assert all(any("Women's" in y for y in x) for x in categories)