Add support for es_match() to DataFrame and Series

2025-07-11 00:02:14 +08:00 · 2020-10-29 10:16:50 -05:00 · 2020-10-29 10:16:50 -05:00 · cb4cd083c3
commit cb4cd083c3
parent 92a8040614
11 changed files with 537 additions and 20 deletions
--- a/docs/sphinx/reference/api/eland.DataFrame.es_match.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.es_match.rst
@ -0,0 +1,6 @@
+eland.DataFrame.es_match
+========================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.es_match
--- a/docs/sphinx/reference/api/eland.Series.es_match.rst
+++ b/docs/sphinx/reference/api/eland.Series.es_match.rst
@ -0,0 +1,6 @@
+eland.Series.es_match
+=====================
+
+.. currentmodule:: eland
+
+.. automethod:: Series.es_match
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@ -111,6 +111,7 @@ Elasticsearch Functions
   :toctree: api/

   DataFrame.es_info
+   DataFrame.es_match
   DataFrame.es_query
   DataFrame.es_dtypes

--- a/docs/sphinx/reference/series.rst
+++ b/docs/sphinx/reference/series.rst
@ -115,5 +115,6 @@ Elasticsearch Functions
   :toctree: api/

   Series.es_info
+   Series.es_match
   Series.es_dtype
   Series.es_dtypes
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -19,7 +19,7 @@ import re
 import sys
 import warnings
 from io import StringIO
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Any, List, Optional, Sequence, Tuple, Union

 import numpy as np
 import pandas as pd
@ -632,6 +632,103 @@ class DataFrame(NDFrame):
    def info_es(self):
        return self.es_info()

+    def es_match(
+        self,
+        text: str,
+        *,
+        columns: Optional[Union[str, Sequence[str]]] = None,
+        match_phrase: bool = False,
+        must_not_match: bool = False,
+        multi_match_type: Optional[str] = None,
+        match_only_text_fields: bool = True,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> "DataFrame":
+        """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
+        ``multi_match`` query depending on the given parameters and columns.
+
+        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
+
+        By default all fields of type 'text' within Elasticsearch are queried;
+        otherwise specific columns can be specified via the ``columns`` parameter,
+        or a single column can be filtered on with :py:meth:`eland.Series.es_match`.
+
+        All additional keyword arguments are passed in the body of the match query.
+
+        Parameters
+        ----------
+        text: str
+            String of text to search for
+        columns: str, List[str], optional
+            Column or list of columns to search over. Defaults to all 'text' fields in Elasticsearch
+        match_phrase: bool, default False
+            If True will use ``match_phrase`` instead of ``match`` query which takes into account
+            the order of the ``text`` parameter.
+        must_not_match: bool, default False
+            If True will apply a boolean NOT (~) to the
+            query. Instead of requiring a match the query
+            will require text to not match.
+        multi_match_type: str, optional
+            If given and matching against multiple columns will set the ``multi_match.type`` setting
+        match_only_text_fields: bool, default True
+            When True this function will raise an error if any non-text fields
+            are queried to prevent fields that aren't analyzed from not working properly.
+            Set to False to ignore this preventative check.
+        analyzer: str, optional
+            Specify which analyzer to use for the match query
+        fuzziness: int, str, optional
+            Specify the fuzziness option for the match query
+        **kwargs: Any
+            Additional options merged into the body of the match query
+
+        Returns
+        -------
+        DataFrame
+            A filtered :py:class:`eland.DataFrame` with the given match query
+
+        Raises
+        ------
+        ValueError
+            If there are no columns to search, if a non-text column is queried
+            while ``match_only_text_fields`` is True, if ``multi_match_type`` is
+            given when only one column is searched, or if ``match_phrase=True`` is
+            combined with a ``multi_match_type`` other than ``'phrase'``.
+
+        Examples
+        --------
+        >>> df = ed.DataFrame("localhost:9200", "ecommerce")
+        >>> df.es_match("Men's", columns=["category"])
+                                                      category currency  ...   type     user
+        0                                     [Men's Clothing]      EUR  ...  order    eddie
+        4                  [Men's Clothing, Men's Accessories]      EUR  ...  order    eddie
+        6                                     [Men's Clothing]      EUR  ...  order   oliver
+        7     [Men's Clothing, Men's Accessories, Men's Shoes]      EUR  ...  order      abd
+        11                 [Men's Accessories, Men's Clothing]      EUR  ...  order    eddie
+        ...                                                ...      ...  ...    ...      ...
+        4663                     [Men's Shoes, Men's Clothing]      EUR  ...  order    samir
+        4667                     [Men's Clothing, Men's Shoes]      EUR  ...  order   sultan
+        4671                                  [Men's Clothing]      EUR  ...  order      jim
+        4672                                  [Men's Clothing]      EUR  ...  order    yahya
+        4674             [Women's Accessories, Men's Clothing]      EUR  ...  order  jackson
+        <BLANKLINE>
+        [2310 rows x 45 columns]
+        """
+        # Determine which columns will be used
+        es_dtypes = self.es_dtypes.to_dict()
+        if columns is None:
+            # No explicit selection: search every field whose ES dtype is 'text'
+            columns = [
+                column for column, es_dtype in es_dtypes.items() if es_dtype == "text"
+            ]
+        elif isinstance(columns, str):
+            # A single column name was given; wrap it so it is not iterated
+            # character by character below
+            columns = [columns]
+        # Normalize any other sequence type to a plain list
+        columns = list(columns)
+
+        # Build the match filter through the query compiler; validation of the
+        # column/option combination happens there
+        qc = self._query_compiler
+        filter = qc.es_match(
+            text,
+            columns,
+            match_phrase=match_phrase,
+            match_only_text_fields=match_only_text_fields,
+            multi_match_type=multi_match_type,
+            analyzer=analyzer,
+            fuzziness=fuzziness,
+            **kwargs,
+        )
+        if must_not_match:
+            # Negate the filter so that rows matching the text are excluded
+            filter = ~filter
+        # Wrap the updated query compiler in a new DataFrame
+        return DataFrame(_query_compiler=qc._update_query(filter))
+
    def es_query(self, query) -> "DataFrame":
        """Applies an Elasticsearch DSL query to the current DataFrame.

--- a/eland/operations.py
+++ b/eland/operations.py
@ -995,24 +995,12 @@ class Operations:
        is_scan = False
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
-                try:
-
-                    es_results = query_compiler._client.search(
-                        index=query_compiler._index_pattern,
-                        size=size,
-                        sort=sort_params,
-                        body=body,
-                    )
-                except Exception:
-                    # Catch all ES errors and print debug (currently to stdout)
-                    error = {
-                        "index": query_compiler._index_pattern,
-                        "size": size,
-                        "sort": sort_params,
-                        "body": body,
-                    }
-                    print("Elasticsearch error:", error)
-                    raise
+                es_results = query_compiler._client.search(
+                    index=query_compiler._index_pattern,
+                    size=size,
+                    sort=sort_params,
+                    body=body,
+                )
        else:
            is_scan = True
            es_results = scan(
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -17,7 +17,7 @@

 import copy
 from datetime import datetime
-from typing import TYPE_CHECKING, List, Optional, Sequence
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union

 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@ -430,6 +430,77 @@ class QueryCompiler:

        return result

+    def es_match(
+        self,
+        text: str,
+        columns: Sequence[str],
+        *,
+        match_phrase: bool = False,
+        match_only_text_fields: bool = True,
+        multi_match_type: Optional[str] = None,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> QueryFilter:
+        """Builds a ``QueryFilter`` wrapping an Elasticsearch ``match``,
+        ``match_phrase`` or ``multi_match`` query.
+
+        A single column produces ``match`` (or ``match_phrase`` when
+        ``match_phrase`` is True); more than one column produces
+        ``multi_match`` with the columns listed under ``fields``.
+
+        Parameters
+        ----------
+        text: str
+            String of text to search for, sent as the ``query`` option
+        columns: Sequence[str]
+            Columns to search over, must not be empty
+        match_phrase: bool, default False
+            Use ``match_phrase`` for one column, or ``multi_match`` with
+            ``type='phrase'`` for several columns
+        match_only_text_fields: bool, default True
+            When True, raise if any non-wildcard column is not of ES dtype
+            'text'. When False, skip the check and default ``lenient`` to True.
+        multi_match_type: str, optional
+            Value for the ``multi_match`` ``type`` option, only valid when
+            searching more than one column
+        analyzer: str, optional
+            Value for the ``analyzer`` option
+        fuzziness: int, str, optional
+            Value for the ``fuzziness`` option
+        **kwargs: Any
+            Additional options merged into the query options
+
+        Returns
+        -------
+        QueryFilter
+            Filter wrapping the generated query
+
+        Raises
+        ------
+        ValueError
+            If ``columns`` is empty, if a non-text column is queried while
+            ``match_only_text_fields`` is True, if ``multi_match_type`` is given
+            with a single column, or if ``match_phrase=True`` is combined with
+            a ``multi_match_type`` other than ``'phrase'``.
+        """
+        if len(columns) < 1:
+            raise ValueError("columns can't be empty")
+
+        es_dtypes = self.es_dtypes.to_dict()
+
+        # Build the base options for the 'match_*' query
+        options = {"query": text}
+        if analyzer is not None:
+            options["analyzer"] = analyzer
+        if fuzziness is not None:
+            options["fuzziness"] = fuzziness
+        # Extra keyword arguments are merged last, so they take precedence
+        # over the keys set above
+        options.update(kwargs)
+
+        # Raise an error if non-text columns are queried, unless the caller
+        # disabled the check with match_only_text_fields=False
+        if match_only_text_fields:
+            non_text_columns = {}
+            for column in columns:
+                # Don't worry about wildcards
+                if "*" in column:
+                    continue
+
+                # NOTE(review): a column name missing from es_dtypes would raise
+                # KeyError here, assuming to_dict() returns a plain dict — confirm
+                es_dtype = es_dtypes[column]
+                if es_dtype != "text":
+                    non_text_columns[column] = es_dtype
+            if non_text_columns:
+                # NOTE(review): the message below names 'match_only_text_es_dtypes',
+                # but the parameter that disables this check is
+                # 'match_only_text_fields'. Tests assert the current wording, so
+                # any correction has to update both together.
+                raise ValueError(
+                    f"Attempting to run es_match() on non-text fields "
+                    f"({', '.join([k + '=' + v for k, v in non_text_columns.items()])}) "
+                    f"means that these fields may not be analyzed properly. "
+                    f"Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
+                    f"to use match anyways"
+                )
+        else:
+            # Check disabled: default 'lenient' to True unless the caller already
+            # supplied it via kwargs. Presumably this lets Elasticsearch tolerate
+            # a text query against non-text fields — see the match query docs.
+            options.setdefault("lenient", True)
+
+        # If only one column use 'match'
+        # otherwise use 'multi_match' with 'fields'
+        if len(columns) == 1:
+            if multi_match_type is not None:
+                raise ValueError(
+                    "multi_match_type parameter only valid "
+                    "when searching more than one column"
+                )
+            query = {"match_phrase" if match_phrase else "match": {columns[0]: options}}
+        else:
+            options["fields"] = columns
+            if match_phrase:
+                # Phrase matching over several fields is expressed as
+                # multi_match type 'phrase'; any other explicit type conflicts
+                if multi_match_type not in ("phrase", None):
+                    raise ValueError(
+                        f"match_phrase=True and multi_match_type={multi_match_type!r} "
+                        f"are not compatible. Must be multi_match_type='phrase'"
+                    )
+                multi_match_type = "phrase"
+            if multi_match_type is not None:
+                options["type"] = multi_match_type
+
+            query = {"multi_match": options}
+        return QueryFilter(query)
+
    def es_query(self, query):
        return self._update_query(QueryFilter(query))

--- a/eland/series.py
+++ b/eland/series.py
@ -55,6 +55,7 @@ from eland.filter import (
    LessEqual,
    NotFilter,
    NotNull,
+    QueryFilter,
    ScriptFilter,
 )
 from eland.ndframe import NDFrame
@ -636,6 +637,74 @@ class Series(NDFrame):
        )
        return Series(_query_compiler=new_query_compiler)

+    def es_match(
+        self,
+        text: str,
+        *,
+        match_phrase: bool = False,
+        match_only_text_fields: bool = True,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> QueryFilter:
+        """Builds a filter for this column from an Elasticsearch ``match`` or
+        ``match_phrase`` query depending on the given parameters.
+
+        The returned filter does not filter this Series by itself; it is meant
+        to be passed to ``DataFrame[...]``, optionally combined with other filters.
+
+        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
+
+        All additional keyword arguments are passed in the body of the match query.
+
+        Parameters
+        ----------
+        text: str
+            String of text to search for
+        match_phrase: bool, default False
+            If True will use ``match_phrase`` instead of ``match`` query which takes into account
+            the order of the ``text`` parameter.
+        match_only_text_fields: bool, default True
+            When True this function will raise an error if any non-text fields
+            are queried to prevent fields that aren't analyzed from not working properly.
+            Set to False to ignore this preventative check.
+        analyzer: str, optional
+            Specify which analyzer to use for the match query
+        fuzziness: int, str, optional
+            Specify the fuzziness option for the match query
+        **kwargs: Any
+            Additional options merged into the body of the match query
+
+        Returns
+        -------
+        QueryFilter
+            Boolean filter to be combined with other filters and
+            then passed to DataFrame[...].
+
+        Raises
+        ------
+        ValueError
+            If this column is not a 'text' field while
+            ``match_only_text_fields`` is True.
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...   "localhost:9200", "ecommerce",
+        ...   columns=["category", "taxful_total_price"]
+        ... )
+        >>> df[
+        ...     df.category.es_match("Men's")
+        ...     & (df.taxful_total_price > 200.0)
+        ... ].head(5)
+                                       category  taxful_total_price
+        13                     [Men's Clothing]              266.96
+        33                     [Men's Clothing]              221.98
+        54                     [Men's Clothing]              234.98
+        93   [Men's Shoes, Women's Accessories]              239.98
+        273                       [Men's Shoes]              214.98
+        <BLANKLINE>
+        [5 rows x 2 columns]
+        """
+        # Delegate to the query compiler with this Series' name as the only
+        # column, so a single-column 'match' / 'match_phrase' query is built
+        return self._query_compiler.es_match(
+            text,
+            columns=[self.name],
+            match_phrase=match_phrase,
+            match_only_text_fields=match_only_text_fields,
+            analyzer=analyzer,
+            fuzziness=fuzziness,
+            **kwargs,
+        )
+
    def es_info(self) -> str:
        buf = StringIO()

--- a/eland/tests/dataframe/test_es_match_pytest.py
+++ b/eland/tests/dataframe/test_es_match_pytest.py
@ -0,0 +1,41 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+
+
+class TestEsMatch(TestData):
+    """Tests for ``DataFrame.es_match()`` run against the ecommerce test data."""
+
+    def test_match(self):
+        """Every returned row has at least one category containing "Men's"."""
+        df = self.ed_ecommerce()
+
+        # No columns given, so all 'text' fields are searched
+        categories = list(df.es_match("Men's").category.to_pandas())
+        assert len(categories) > 0
+        assert all(any("Men's" in y for y in x) for x in categories)
+
+    def test_must_not_match(self):
+        """Chaining a negated match with a positive match applies both."""
+        df = self.ed_ecommerce()
+
+        categories = list(
+            df.es_match("Men's", must_not_match=True)
+            .es_match("Women's")
+            .category.to_pandas()
+        )
+        assert len(categories) > 0
+        # No category entry may contain "Men's" ...
+        assert all(all("Men's" not in y for y in x) for x in categories)
+        # ... while at least one entry per row must contain "Women's"
+        assert all(any("Women's" in y for y in x) for x in categories)
--- a/eland/tests/query_compiler/test_es_match_pytest.py
+++ b/eland/tests/query_compiler/test_es_match_pytest.py
@ -0,0 +1,196 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+# File called _pytest for PyCharm compatibility
+
+import pytest
+
+from eland.query_compiler import QueryCompiler
+from eland.tests.common import TestData
+
+
+class TestEsMatch(TestData):
+    """Tests for the query bodies built by ``QueryCompiler.es_match()``."""
+
+    def test_es_match(self):
+        """One column builds ``match``; several columns build ``multi_match``."""
+        df = self.ed_ecommerce()
+        query_compiler: QueryCompiler = df._query_compiler
+
+        # Single column: analyzer and fuzziness are copied into the options
+        filter = query_compiler.es_match(
+            "joe", ["customer_full_name"], analyzer="my-analyzer", fuzziness="1..2"
+        )
+        assert filter.build() == {
+            "match": {
+                "customer_full_name": {
+                    "query": "joe",
+                    "analyzer": "my-analyzer",
+                    "fuzziness": "1..2",
+                }
+            }
+        }
+
+        # Multiple columns: the columns are listed under 'fields'
+        filter = query_compiler.es_match(
+            "joe", ["customer_last_name", "customer_first_name"]
+        )
+        assert filter.build() == {
+            "multi_match": {
+                "query": "joe",
+                "fields": ["customer_last_name", "customer_first_name"],
+            }
+        }
+
+    def test_es_match_must_not_match(self):
+        """``must_not_match=True`` wraps the query in ``bool.must_not``."""
+        df = self.ed_ecommerce()
+
+        # single match
+        df2 = df.es_match("joe", columns=["customer_full_name"], must_not_match=True)
+        # Inspect the search body generated for the filtered DataFrame
+        query_params, _ = df2._query_compiler._operations._resolve_tasks(
+            df2._query_compiler
+        )
+        assert query_params.query.to_search_body() == {
+            "query": {
+                "bool": {
+                    "must_not": {"match": {"customer_full_name": {"query": "joe"}}}
+                }
+            }
+        }
+
+        # multi_match
+        df2 = df.es_match(
+            "joe",
+            columns=["customer_first_name", "customer_last_name"],
+            must_not_match=True,
+        )
+        query_params, _ = df2._query_compiler._operations._resolve_tasks(
+            df2._query_compiler
+        )
+        assert query_params.query.to_search_body() == {
+            "query": {
+                "bool": {
+                    "must_not": {
+                        "multi_match": {
+                            "fields": [
+                                "customer_first_name",
+                                "customer_last_name",
+                            ],
+                            "query": "joe",
+                        }
+                    }
+                }
+            }
+        }
+
+    def test_es_match_phrase(self):
+        """``match_phrase=True`` builds ``match_phrase`` or phrase-type ``multi_match``."""
+        df = self.ed_ecommerce()
+        query_compiler: QueryCompiler = df._query_compiler
+
+        # Single column: 'match_phrase' replaces 'match'
+        filter = query_compiler.es_match(
+            "joe", ["customer_full_name"], match_phrase=True
+        )
+        assert filter.build() == {
+            "match_phrase": {
+                "customer_full_name": {
+                    "query": "joe",
+                }
+            }
+        }
+
+        # Multiple columns: 'multi_match' with type 'phrase'
+        filter = query_compiler.es_match(
+            "joe", ["customer_last_name", "customer_first_name"], match_phrase=True
+        )
+        assert filter.build() == {
+            "multi_match": {
+                "query": "joe",
+                "type": "phrase",
+                "fields": ["customer_last_name", "customer_first_name"],
+            }
+        }
+
+    def test_es_match_phrase_not_allowed_with_multi_match_type(self):
+        """``match_phrase=True`` only accepts ``multi_match_type='phrase'``."""
+        df = self.ed_ecommerce()
+        query_compiler: QueryCompiler = df._query_compiler
+
+        # Any other explicit type conflicts with match_phrase=True
+        with pytest.raises(ValueError) as e:
+            query_compiler.es_match(
+                "joe",
+                ["customer_first_name", "customer_last_name"],
+                match_phrase=True,
+                multi_match_type="best_fields",
+            )
+        assert str(e.value) == (
+            "match_phrase=True and multi_match_type='best_fields' "
+            "are not compatible. Must be multi_match_type='phrase'"
+        )
+
+        # Explicitly passing 'phrase' is accepted and gives the same query
+        filter = query_compiler.es_match(
+            "joe",
+            ["customer_last_name", "customer_first_name"],
+            match_phrase=True,
+            multi_match_type="phrase",
+        )
+        assert filter.build() == {
+            "multi_match": {
+                "query": "joe",
+                "type": "phrase",
+                "fields": ["customer_last_name", "customer_first_name"],
+            }
+        }
+
+    def test_es_match_non_text_fields(self):
+        """Non-text columns raise by default and are allowed when the check is off."""
+        df = self.ed_ecommerce()
+        query_compiler: QueryCompiler = df._query_compiler
+
+        # The wildcard column 'order_*' is skipped by the dtype check, so only
+        # 'order_date' and 'currency' are reported
+        with pytest.raises(ValueError) as e:
+            query_compiler.es_match(
+                "joe",
+                [
+                    "customer_first_name",
+                    "order_date",
+                    "customer_last_name",
+                    "currency",
+                    "order_*",
+                ],
+            )
+        assert str(e.value) == (
+            "Attempting to run es_match() on non-text fields (order_date=date, "
+            "currency=keyword) means that these fields may not be analyzed properly. "
+            "Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
+            "to use match anyways"
+        )
+
+        # With the check disabled the query is built and 'lenient' defaults to True
+        filter = query_compiler.es_match(
+            "joe",
+            [
+                "customer_first_name",
+                "order_date",
+                "customer_last_name",
+                "currency",
+                "order_*",
+            ],
+            match_only_text_fields=False,
+        )
+        assert filter.build() == {
+            "multi_match": {
+                "query": "joe",
+                "lenient": True,
+                "fields": [
+                    "customer_first_name",
+                    "order_date",
+                    "customer_last_name",
+                    "currency",
+                    "order_*",
+                ],
+            }
+        }
--- a/eland/tests/series/test_es_match.py
+++ b/eland/tests/series/test_es_match.py
@ -0,0 +1,41 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+
+
+class TestEsMatch(TestData):
+    """Tests for ``Series.es_match()`` filters run against the ecommerce test data."""
+
+    def test_match(self):
+        """Indexing with the filter keeps only rows with a "Men's" category."""
+        df = self.ed_ecommerce()
+
+        categories = list(df[df.category.es_match("Men's")].category.to_pandas())
+        assert len(categories) > 0
+        assert all(any("Men's" in y for y in x) for x in categories)
+
+    def test_must_not_match(self):
+        """Filters can be negated with ``~`` and combined with ``&``."""
+        df = self.ed_ecommerce()
+
+        categories = list(
+            df[
+                ~df.category.es_match("Men's") & df.category.es_match("Women's")
+            ].category.to_pandas()
+        )
+        assert len(categories) > 0
+        # No category entry may contain "Men's" ...
+        assert all(all("Men's" not in y for y in x) for x in categories)
+        # ... while at least one entry per row must contain "Women's"
+        assert all(any("Women's" in y for y in x) for x in categories)