Add support for es_match() to DataFrame and Series

2025-07-11 00:02:14 +08:00 · 2020-10-29 10:16:50 -05:00 · 2020-10-29 10:16:50 -05:00 · cb4cd083c3
commit cb4cd083c3
parent 92a8040614
11 changed files with 537 additions and 20 deletions
--- a/docs/sphinx/reference/api/eland.DataFrame.es_match.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.es_match.rst
@ -0,0 +1,6 @@
 eland.DataFrame.es_match
 ========================
 .. currentmodule:: eland
 .. automethod:: DataFrame.es_match
--- a/docs/sphinx/reference/api/eland.Series.es_match.rst
+++ b/docs/sphinx/reference/api/eland.Series.es_match.rst
@ -0,0 +1,6 @@
 eland.Series.es_match
 =====================
 .. currentmodule:: eland
 .. automethod:: Series.es_match
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@ -111,6 +111,7 @@ Elasticsearch Functions
   :toctree: api/
   DataFrame.es_info
   DataFrame.es_match
   DataFrame.es_query
   DataFrame.es_dtypes
--- a/docs/sphinx/reference/series.rst
+++ b/docs/sphinx/reference/series.rst
@ -115,5 +115,6 @@ Elasticsearch Functions
   :toctree: api/
   Series.es_info
   Series.es_match
   Series.es_dtype
   Series.es_dtypes
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -19,7 +19,7 @@ import re
 import sys
 import warnings
 from io import StringIO
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Any, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas as pd
@ -632,6 +632,103 @@ class DataFrame(NDFrame):
    def info_es(self):
        return self.es_info()
    def es_match(
        self,
        text: str,
        *,
        columns: Optional[Union[str, Sequence[str]]] = None,
        match_phrase: bool = False,
        must_not_match: bool = False,
        multi_match_type: Optional[str] = None,
        match_only_text_fields: bool = True,
        analyzer: Optional[str] = None,
        fuzziness: Optional[Union[int, str]] = None,
        **kwargs: Any,
    ) -> "DataFrame":
        """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
        ``multi_match`` query depending on the given parameters and columns.
        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
        By default all fields of type 'text' within Elasticsearch are queried
        otherwise specific columns can be specified via the ``columns`` parameter
        or a single column can be filtered on with :py:meth:`eland.Series.es_match`
        All additional keyword arguments are passed in the body of the match query.
        Parameters
        ----------
        text: str
            String of text to search for
        columns: str, List[str], optional
            List of columns to search over. Defaults to all 'text' fields in Elasticsearch
        match_phrase: bool, default False
            If True will use ``match_phrase`` instead of ``match`` query which takes into account
            the order of the ``text`` parameter.
        must_not_match: bool, default False
            If True will apply a boolean NOT (~) to the
            query. Instead of requiring a match the query
            will require text to not match.
        multi_match_type: str, optional
            If given and matching against multiple columns will set the ``multi_match.type`` setting
        match_only_text_fields: bool, default True
            When True this function will raise an error if any non-text fields
            are queried to prevent fields that aren't analyzed from not working properly.
            Set to False to ignore this preventative check.
        analyzer: str, optional
            Specify which analyzer to use for the match query
        fuzziness: int, str, optional
            Specify the fuzziness option for the match query
        Returns
        -------
        DataFrame
            A filtered :py:class:`eland.DataFrame` with the given match query
        Examples
        --------
        >>> df = ed.DataFrame("localhost:9200", "ecommerce")
        >>> df.es_match("Men's", columns=["category"])
                                                      category currency  ...   type     user
        0                                     [Men's Clothing]      EUR  ...  order    eddie
        4                  [Men's Clothing, Men's Accessories]      EUR  ...  order    eddie
        6                                     [Men's Clothing]      EUR  ...  order   oliver
        7     [Men's Clothing, Men's Accessories, Men's Shoes]      EUR  ...  order      abd
        11                 [Men's Accessories, Men's Clothing]      EUR  ...  order    eddie
        ...                                                ...      ...  ...    ...      ...
        4663                     [Men's Shoes, Men's Clothing]      EUR  ...  order    samir
        4667                     [Men's Clothing, Men's Shoes]      EUR  ...  order   sultan
        4671                                  [Men's Clothing]      EUR  ...  order      jim
        4672                                  [Men's Clothing]      EUR  ...  order    yahya
        4674             [Women's Accessories, Men's Clothing]      EUR  ...  order  jackson
        <BLANKLINE>
        [2310 rows x 45 columns]
        """
        # Determine which columns will be used
        es_dtypes = self.es_dtypes.to_dict()
        if columns is None:
            columns = [
                column for column, es_dtype in es_dtypes.items() if es_dtype == "text"
            ]
        elif isinstance(columns, str):
            columns = [columns]
        columns = list(columns)
        qc = self._query_compiler
        filter = qc.es_match(
            text,
            columns,
            match_phrase=match_phrase,
            match_only_text_fields=match_only_text_fields,
            multi_match_type=multi_match_type,
            analyzer=analyzer,
            fuzziness=fuzziness,
            **kwargs,
        )
        if must_not_match:
            filter = ~filter
        return DataFrame(_query_compiler=qc._update_query(filter))
    def es_query(self, query) -> "DataFrame":
        """Applies an Elasticsearch DSL query to the current DataFrame.
--- a/eland/operations.py
+++ b/eland/operations.py
@ -995,24 +995,12 @@ class Operations:
        is_scan = False
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
-                try:
+                es_results = query_compiler._client.search(
-
+                    index=query_compiler._index_pattern,
-                    es_results = query_compiler._client.search(
+                    size=size,
-                        index=query_compiler._index_pattern,
+                    sort=sort_params,
-                        size=size,
+                    body=body,
-                        sort=sort_params,
+                )
                        body=body,
                    )
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {
                        "index": query_compiler._index_pattern,
                        "size": size,
                        "sort": sort_params,
                        "body": body,
                    }
                    print("Elasticsearch error:", error)
                    raise
        else:
            is_scan = True
            es_results = scan(
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -17,7 +17,7 @@
 import copy
 from datetime import datetime
-from typing import TYPE_CHECKING, List, Optional, Sequence
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@ -430,6 +430,77 @@ class QueryCompiler:
        return result
    def es_match(
        self,
        text: str,
        columns: Sequence[str],
        *,
        match_phrase: bool = False,
        match_only_text_fields: bool = True,
        multi_match_type: Optional[str] = None,
        analyzer: Optional[str] = None,
        fuzziness: Optional[Union[int, str]] = None,
        **kwargs: Any,
    ) -> QueryFilter:
        if len(columns) < 1:
            raise ValueError("columns can't be empty")
        es_dtypes = self.es_dtypes.to_dict()
        # Build the base options for the 'match_*' query
        options = {"query": text}
        if analyzer is not None:
            options["analyzer"] = analyzer
        if fuzziness is not None:
            options["fuzziness"] = fuzziness
        options.update(kwargs)
        # Warn the user if they're not querying text columns
        if match_only_text_fields:
            non_text_columns = {}
            for column in columns:
                # Don't worry about wildcards
                if "*" in column:
                    continue
                es_dtype = es_dtypes[column]
                if es_dtype != "text":
                    non_text_columns[column] = es_dtype
            if non_text_columns:
                raise ValueError(
                    f"Attempting to run es_match() on non-text fields "
                    f"({', '.join([k + '=' + v for k, v in non_text_columns.items()])}) "
                    f"means that these fields may not be analyzed properly. "
                    f"Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
                    f"to use match anyways"
                )
        else:
            options.setdefault("lenient", True)
        # If only one column use 'match'
        # otherwise use 'multi_match' with 'fields'
        if len(columns) == 1:
            if multi_match_type is not None:
                raise ValueError(
                    "multi_match_type parameter only valid "
                    "when searching more than one column"
                )
            query = {"match_phrase" if match_phrase else "match": {columns[0]: options}}
        else:
            options["fields"] = columns
            if match_phrase:
                if multi_match_type not in ("phrase", None):
                    raise ValueError(
                        f"match_phrase=True and multi_match_type={multi_match_type!r} "
                        f"are not compatible. Must be multi_match_type='phrase'"
                    )
                multi_match_type = "phrase"
            if multi_match_type is not None:
                options["type"] = multi_match_type
            query = {"multi_match": options}
        return QueryFilter(query)
    def es_query(self, query):
        return self._update_query(QueryFilter(query))
--- a/eland/series.py
+++ b/eland/series.py
@ -55,6 +55,7 @@ from eland.filter import (
    LessEqual,
    NotFilter,
    NotNull,
    QueryFilter,
    ScriptFilter,
 )
 from eland.ndframe import NDFrame
@ -636,6 +637,74 @@ class Series(NDFrame):
        )
        return Series(_query_compiler=new_query_compiler)
    def es_match(
        self,
        text: str,
        *,
        match_phrase: bool = False,
        match_only_text_fields: bool = True,
        analyzer: Optional[str] = None,
        fuzziness: Optional[Union[int, str]] = None,
        **kwargs: Any,
    ) -> QueryFilter:
        """Filters data with an Elasticsearch ``match`` or ``match_phrase``
        query depending on the given parameters.
        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
        All additional keyword arguments are passed in the body of the match query.
        Parameters
        ----------
        text: str
            String of text to search for
        match_phrase: bool, default False
            If True will use ``match_phrase`` instead of ``match`` query which takes into account
            the order of the ``text`` parameter.
        match_only_text_fields: bool, default True
            When True this function will raise an error if any non-text fields
            are queried to prevent fields that aren't analyzed from not working properly.
            Set to False to ignore this preventative check.
        analyzer: str, optional
            Specify which analyzer to use for the match query
        fuzziness: int, str, optional
            Specify the fuzziness option for the match query
        Returns
        -------
        QueryFilter
            Boolean filter to be combined with other filters and
            then passed to DataFrame[...].
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost:9200", "ecommerce",
        ...   columns=["category", "taxful_total_price"]
        ... )
        >>> df[
        ...     df.category.es_match("Men's")
        ...     & (df.taxful_total_price > 200.0)
        ... ].head(5)
                                       category  taxful_total_price
        13                     [Men's Clothing]              266.96
        33                     [Men's Clothing]              221.98
        54                     [Men's Clothing]              234.98
        93   [Men's Shoes, Women's Accessories]              239.98
        273                       [Men's Shoes]              214.98
        <BLANKLINE>
        [5 rows x 2 columns]
        """
        return self._query_compiler.es_match(
            text,
            columns=[self.name],
            match_phrase=match_phrase,
            match_only_text_fields=match_only_text_fields,
            analyzer=analyzer,
            fuzziness=fuzziness,
            **kwargs,
        )
    def es_info(self) -> str:
        buf = StringIO()
--- a/eland/tests/dataframe/test_es_match_pytest.py
+++ b/eland/tests/dataframe/test_es_match_pytest.py
@ -0,0 +1,41 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 # File called _pytest for PyCharm compatability
 from eland.tests.common import TestData
 class TestEsMatch(TestData):
    def test_match(self):
        df = self.ed_ecommerce()
        categories = list(df.es_match("Men's").category.to_pandas())
        assert len(categories) > 0
        assert all(any("Men's" in y for y in x) for x in categories)
    def test_must_not_match(self):
        df = self.ed_ecommerce()
        categories = list(
            df.es_match("Men's", must_not_match=True)
            .es_match("Women's")
            .category.to_pandas()
        )
        assert len(categories) > 0
        assert all(all("Men's" not in y for y in x) for x in categories)
        assert all(any("Women's" in y for y in x) for x in categories)
--- a/eland/tests/query_compiler/test_es_match_pytest.py
+++ b/eland/tests/query_compiler/test_es_match_pytest.py
@ -0,0 +1,196 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 # File called _pytest for PyCharm compatability
 import pytest
 from eland.query_compiler import QueryCompiler
 from eland.tests.common import TestData
 class TestEsMatch(TestData):
    def test_es_match(self):
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler
        filter = query_compiler.es_match(
            "joe", ["customer_full_name"], analyzer="my-analyzer", fuzziness="1..2"
        )
        assert filter.build() == {
            "match": {
                "customer_full_name": {
                    "query": "joe",
                    "analyzer": "my-analyzer",
                    "fuzziness": "1..2",
                }
            }
        }
        filter = query_compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"]
        )
        assert filter.build() == {
            "multi_match": {
                "query": "joe",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
    def test_es_match_must_not_match(self):
        df = self.ed_ecommerce()
        # single match
        df2 = df.es_match("joe", columns=["customer_full_name"], must_not_match=True)
        query_params, _ = df2._query_compiler._operations._resolve_tasks(
            df2._query_compiler
        )
        assert query_params.query.to_search_body() == {
            "query": {
                "bool": {
                    "must_not": {"match": {"customer_full_name": {"query": "joe"}}}
                }
            }
        }
        # multi_match
        df2 = df.es_match(
            "joe",
            columns=["customer_first_name", "customer_last_name"],
            must_not_match=True,
        )
        query_params, _ = df2._query_compiler._operations._resolve_tasks(
            df2._query_compiler
        )
        assert query_params.query.to_search_body() == {
            "query": {
                "bool": {
                    "must_not": {
                        "multi_match": {
                            "fields": [
                                "customer_first_name",
                                "customer_last_name",
                            ],
                            "query": "joe",
                        }
                    }
                }
            }
        }
    def test_es_match_phrase(self):
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler
        filter = query_compiler.es_match(
            "joe", ["customer_full_name"], match_phrase=True
        )
        assert filter.build() == {
            "match_phrase": {
                "customer_full_name": {
                    "query": "joe",
                }
            }
        }
        filter = query_compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"], match_phrase=True
        )
        assert filter.build() == {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
    def test_es_match_phrase_not_allowed_with_multi_match_type(self):
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler
        with pytest.raises(ValueError) as e:
            query_compiler.es_match(
                "joe",
                ["customer_first_name", "customer_last_name"],
                match_phrase=True,
                multi_match_type="best_fields",
            )
        assert str(e.value) == (
            "match_phrase=True and multi_match_type='best_fields' "
            "are not compatible. Must be multi_match_type='phrase'"
        )
        filter = query_compiler.es_match(
            "joe",
            ["customer_last_name", "customer_first_name"],
            match_phrase=True,
            multi_match_type="phrase",
        )
        assert filter.build() == {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
    def test_es_match_non_text_fields(self):
        df = self.ed_ecommerce()
        query_compiler: QueryCompiler = df._query_compiler
        with pytest.raises(ValueError) as e:
            query_compiler.es_match(
                "joe",
                [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            )
        assert str(e.value) == (
            "Attempting to run es_match() on non-text fields (order_date=date, "
            "currency=keyword) means that these fields may not be analyzed properly. "
            "Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
            "to use match anyways"
        )
        filter = query_compiler.es_match(
            "joe",
            [
                "customer_first_name",
                "order_date",
                "customer_last_name",
                "currency",
                "order_*",
            ],
            match_only_text_fields=False,
        )
        assert filter.build() == {
            "multi_match": {
                "query": "joe",
                "lenient": True,
                "fields": [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            }
        }
--- a/eland/tests/series/test_es_match.py
+++ b/eland/tests/series/test_es_match.py
@ -0,0 +1,41 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 # File called _pytest for PyCharm compatability
 from eland.tests.common import TestData
 class TestEsMatch(TestData):
    def test_match(self):
        df = self.ed_ecommerce()
        categories = list(df[df.category.es_match("Men's")].category.to_pandas())
        assert len(categories) > 0
        assert all(any("Men's" in y for y in x) for x in categories)
    def test_must_not_match(self):
        df = self.ed_ecommerce()
        categories = list(
            df[
                ~df.category.es_match("Men's") & df.category.es_match("Women's")
            ].category.to_pandas()
        )
        assert len(categories) > 0
        assert all(all("Men's" not in y for y in x) for x in categories)
        assert all(any("Women's" in y for y in x) for x in categories)