Add support for es_match() to DataFrame and Series

This commit is contained in:
Seth Michael Larson 2020-10-29 10:16:50 -05:00 committed by GitHub
parent 92a8040614
commit cb4cd083c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 537 additions and 20 deletions

View File

@ -0,0 +1,6 @@
eland.DataFrame.es_match
========================
.. currentmodule:: eland
.. automethod:: DataFrame.es_match

View File

@ -0,0 +1,6 @@
eland.Series.es_match
=====================
.. currentmodule:: eland
.. automethod:: Series.es_match

View File

@ -111,6 +111,7 @@ Elasticsearch Functions
:toctree: api/
DataFrame.es_info
DataFrame.es_match
DataFrame.es_query
DataFrame.es_dtypes

View File

@ -115,5 +115,6 @@ Elasticsearch Functions
:toctree: api/
Series.es_info
Series.es_match
Series.es_dtype
Series.es_dtypes

View File

@ -19,7 +19,7 @@ import re
import sys
import warnings
from io import StringIO
from typing import List, Optional, Sequence, Tuple, Union
from typing import Any, List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
@ -632,6 +632,103 @@ class DataFrame(NDFrame):
def info_es(self):
return self.es_info()
def es_match(
    self,
    text: str,
    *,
    columns: Optional[Union[str, Sequence[str]]] = None,
    match_phrase: bool = False,
    must_not_match: bool = False,
    multi_match_type: Optional[str] = None,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> "DataFrame":
    """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
    ``multi_match`` query depending on the given parameters and columns.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    By default all fields of type 'text' within Elasticsearch are queried,
    otherwise specific columns can be specified via the ``columns`` parameter,
    or a single column can be filtered on with :py:meth:`eland.Series.es_match`.

    All additional keyword arguments are passed in the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    columns: str, List[str], optional
        Column or list of columns to search over.
        Defaults to all 'text' fields in Elasticsearch
    match_phrase: bool, default False
        If True will use ``match_phrase`` instead of ``match`` query which
        takes into account the order of the ``text`` parameter.
    must_not_match: bool, default False
        If True will apply a boolean NOT (~) to the query. Instead of
        requiring a match the query will require text to not match.
    multi_match_type: str, optional
        If given and matching against multiple columns will set the
        ``multi_match.type`` setting
    match_only_text_fields: bool, default True
        When True this function will raise an error if any non-text fields
        are queried to prevent fields that aren't analyzed from not working
        properly. Set to False to ignore this preventative check.
    analyzer: str, optional
        Specify which analyzer to use for the match query
    fuzziness: int, str, optional
        Specify the fuzziness option for the match query

    Returns
    -------
    DataFrame
        A filtered :py:class:`eland.DataFrame` with the given match query

    Raises
    ------
    ValueError
        Propagated from the query compiler when no columns are left to
        search, when a non-text column is queried while
        ``match_only_text_fields`` is True, or when ``multi_match_type``
        conflicts with the other parameters.

    Examples
    --------
    >>> df = ed.DataFrame("localhost:9200", "ecommerce")
    >>> df.es_match("Men's", columns=["category"])
    category currency ... type user
    0 [Men's Clothing] EUR ... order eddie
    4 [Men's Clothing, Men's Accessories] EUR ... order eddie
    6 [Men's Clothing] EUR ... order oliver
    7 [Men's Clothing, Men's Accessories, Men's Shoes] EUR ... order abd
    11 [Men's Accessories, Men's Clothing] EUR ... order eddie
    ... ... ... ... ... ...
    4663 [Men's Shoes, Men's Clothing] EUR ... order samir
    4667 [Men's Clothing, Men's Shoes] EUR ... order sultan
    4671 [Men's Clothing] EUR ... order jim
    4672 [Men's Clothing] EUR ... order yahya
    4674 [Women's Accessories, Men's Clothing] EUR ... order jackson
    <BLANKLINE>
    [2310 rows x 45 columns]
    """
    # Determine which columns will be used. The field mapping is only
    # consulted when the caller did not name any columns, so it is looked
    # up lazily instead of on every call.
    if columns is None:
        columns = [
            column
            for column, es_dtype in self.es_dtypes.to_dict().items()
            if es_dtype == "text"
        ]
    elif isinstance(columns, str):
        # A single column name is accepted as shorthand for a one-item list
        columns = [columns]
    columns = list(columns)

    qc = self._query_compiler
    match_filter = qc.es_match(
        text,
        columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        multi_match_type=multi_match_type,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
    if must_not_match:
        # Negating the filter turns the query into a 'must_not' clause
        match_filter = ~match_filter
    return DataFrame(_query_compiler=qc._update_query(match_filter))
def es_query(self, query) -> "DataFrame":
"""Applies an Elasticsearch DSL query to the current DataFrame.

View File

@ -995,24 +995,12 @@ class Operations:
is_scan = False
if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
if size > 0:
try:
es_results = query_compiler._client.search(
index=query_compiler._index_pattern,
size=size,
sort=sort_params,
body=body,
)
except Exception:
# Catch all ES errors and print debug (currently to stdout)
error = {
"index": query_compiler._index_pattern,
"size": size,
"sort": sort_params,
"body": body,
}
print("Elasticsearch error:", error)
raise
es_results = query_compiler._client.search(
index=query_compiler._index_pattern,
size=size,
sort=sort_params,
body=body,
)
else:
is_scan = True
es_results = scan(

View File

@ -17,7 +17,7 @@
import copy
from datetime import datetime
from typing import TYPE_CHECKING, List, Optional, Sequence
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union
import numpy as np # type: ignore
import pandas as pd # type: ignore
@ -430,6 +430,77 @@ class QueryCompiler:
return result
def es_match(
    self,
    text: str,
    columns: Sequence[str],
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    multi_match_type: Optional[str] = None,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> "QueryFilter":
    """Build the filter for an Elasticsearch full-text match query.

    A single column produces a ``match`` (or ``match_phrase``) query, more
    than one column produces a ``multi_match`` query over ``fields``.

    Parameters
    ----------
    text: str
        String of text to search for
    columns: Sequence[str]
        Columns to search over, must not be empty. A bare string is
        treated as a single column name.
    match_phrase: bool, default False
        Use ``match_phrase`` for one column, or ``multi_match`` with
        ``type='phrase'`` for several columns.
    match_only_text_fields: bool, default True
        When True raise an error if any non-wildcard column is not of
        Elasticsearch type 'text'. When False the check is skipped and
        ``lenient`` defaults to True in the query options.
    multi_match_type: str, optional
        Value for ``multi_match.type``, only valid with several columns
    analyzer: str, optional
        Analyzer to use for the match query
    fuzziness: int, str, optional
        Fuzziness option for the match query
    **kwargs
        Additional options merged into the body of the match query

    Returns
    -------
    QueryFilter
        Filter wrapping the generated query

    Raises
    ------
    ValueError
        If ``columns`` is empty, if a non-text column is queried while
        ``match_only_text_fields`` is True, if ``multi_match_type`` is
        given for a single column, or if ``match_phrase=True`` is combined
        with a ``multi_match_type`` other than 'phrase'.
    KeyError
        If a non-wildcard column is missing from the field mapping while
        ``match_only_text_fields`` is True.
    """
    if len(columns) < 1:
        raise ValueError("columns can't be empty")

    # A bare string is itself a Sequence[str]; without this it would be
    # searched character by character. Otherwise copy so the caller's
    # sequence is never aliased inside the query body.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    # Build the base options for the 'match_*' query
    options = {"query": text}
    if analyzer is not None:
        options["analyzer"] = analyzer
    if fuzziness is not None:
        options["fuzziness"] = fuzziness
    options.update(kwargs)

    # Refuse to query non-text columns unless the caller opted out
    if match_only_text_fields:
        # The field mapping is only needed for this validation step
        es_dtypes = self.es_dtypes.to_dict()
        # Wildcard patterns can't be looked up in the mapping, skip them
        non_text_columns = {
            column: es_dtypes[column]
            for column in columns
            if "*" not in column and es_dtypes[column] != "text"
        }
        if non_text_columns:
            described = ", ".join(
                f"{column}={es_dtype}"
                for column, es_dtype in non_text_columns.items()
            )
            # NOTE(review): the message names 'match_only_text_es_dtypes'
            # but the parameter is 'match_only_text_fields'. The text is
            # kept byte-identical because the test suite asserts on it.
            raise ValueError(
                f"Attempting to run es_match() on non-text fields "
                f"({described}) "
                f"means that these fields may not be analyzed properly. "
                f"Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
                f"to use match anyways"
            )
    else:
        options.setdefault("lenient", True)

    # If only one column use 'match'
    # otherwise use 'multi_match' with 'fields'
    if len(columns) == 1:
        if multi_match_type is not None:
            raise ValueError(
                "multi_match_type parameter only valid "
                "when searching more than one column"
            )
        query_type = "match_phrase" if match_phrase else "match"
        query = {query_type: {columns[0]: options}}
    else:
        options["fields"] = columns
        if match_phrase:
            if multi_match_type not in ("phrase", None):
                raise ValueError(
                    f"match_phrase=True and multi_match_type={multi_match_type!r} "
                    f"are not compatible. Must be multi_match_type='phrase'"
                )
            multi_match_type = "phrase"
        if multi_match_type is not None:
            options["type"] = multi_match_type
        query = {"multi_match": options}

    return QueryFilter(query)
def es_query(self, query):
return self._update_query(QueryFilter(query))

View File

@ -55,6 +55,7 @@ from eland.filter import (
LessEqual,
NotFilter,
NotNull,
QueryFilter,
ScriptFilter,
)
from eland.ndframe import NDFrame
@ -636,6 +637,74 @@ class Series(NDFrame):
)
return Series(_query_compiler=new_query_compiler)
def es_match(
    self,
    text: str,
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> QueryFilter:
    """Create a boolean filter from an Elasticsearch ``match`` or
    ``match_phrase`` query on this column.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    Any extra keyword arguments are placed in the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    match_phrase: bool, default False
        If True will use ``match_phrase`` instead of ``match`` query which
        takes into account the order of the ``text`` parameter.
    match_only_text_fields: bool, default True
        When True this function will raise an error if any non-text fields
        are queried to prevent fields that aren't analyzed from not working
        properly. Set to False to ignore this preventative check.
    analyzer: str, optional
        Specify which analyzer to use for the match query
    fuzziness: int, str, optional
        Specify the fuzziness option for the match query

    Returns
    -------
    QueryFilter
        Boolean filter to be combined with other filters and
        then passed to DataFrame[...].

    Examples
    --------
    >>> df = ed.DataFrame(
    ...     "localhost:9200", "ecommerce",
    ...     columns=["category", "taxful_total_price"]
    ... )
    >>> df[
    ...     df.category.es_match("Men's")
    ...     & (df.taxful_total_price > 200.0)
    ... ].head(5)
    category taxful_total_price
    13 [Men's Clothing] 266.96
    33 [Men's Clothing] 221.98
    54 [Men's Clothing] 234.98
    93 [Men's Shoes, Women's Accessories] 239.98
    273 [Men's Shoes] 214.98
    <BLANKLINE>
    [5 rows x 2 columns]
    """
    query_compiler = self._query_compiler
    # A Series searches exactly one column: its own
    own_column = [self.name]
    return query_compiler.es_match(
        text,
        columns=own_column,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
def es_info(self) -> str:
buf = StringIO()

View File

@ -0,0 +1,41 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
class TestEsMatch(TestData):
    """Checks ``DataFrame.es_match`` results against the ecommerce test data."""

    def test_match(self):
        """Every matched row has at least one category containing "Men's"."""
        ed_df = self.ed_ecommerce()
        matched = ed_df.es_match("Men's")
        categories = list(matched.category.to_pandas())

        assert len(categories) > 0
        assert all(
            any("Men's" in label for label in row_categories)
            for row_categories in categories
        )

    def test_must_not_match(self):
        """Chaining a negated match with a match keeps only "Women's" rows."""
        ed_df = self.ed_ecommerce()
        without_mens = ed_df.es_match("Men's", must_not_match=True)
        womens_only = without_mens.es_match("Women's")
        categories = list(womens_only.category.to_pandas())

        assert len(categories) > 0
        assert all(
            all("Men's" not in label for label in row_categories)
            for row_categories in categories
        )
        assert all(
            any("Women's" in label for label in row_categories)
            for row_categories in categories
        )

View File

@ -0,0 +1,196 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# File called _pytest for PyCharm compatibility
import pytest
from eland.query_compiler import QueryCompiler
from eland.tests.common import TestData
class TestEsMatch(TestData):
    """Checks the Elasticsearch query bodies generated by ``es_match``."""

    def test_es_match(self):
        """One column yields 'match', several columns yield 'multi_match'."""
        ed_df = self.ed_ecommerce()
        compiler: QueryCompiler = ed_df._query_compiler

        single_column = compiler.es_match(
            "joe", ["customer_full_name"], analyzer="my-analyzer", fuzziness="1..2"
        )
        expected_match = {
            "match": {
                "customer_full_name": {
                    "query": "joe",
                    "analyzer": "my-analyzer",
                    "fuzziness": "1..2",
                }
            }
        }
        assert single_column.build() == expected_match

        several_columns = compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"]
        )
        expected_multi_match = {
            "multi_match": {
                "query": "joe",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
        assert several_columns.build() == expected_multi_match

    def test_es_match_must_not_match(self):
        """``must_not_match=True`` wraps the query in a bool 'must_not'."""
        ed_df = self.ed_ecommerce()

        # single match
        negated = ed_df.es_match(
            "joe", columns=["customer_full_name"], must_not_match=True
        )
        query_params, _ = negated._query_compiler._operations._resolve_tasks(
            negated._query_compiler
        )
        expected_single = {
            "query": {
                "bool": {
                    "must_not": {"match": {"customer_full_name": {"query": "joe"}}}
                }
            }
        }
        assert query_params.query.to_search_body() == expected_single

        # multi_match
        negated = ed_df.es_match(
            "joe",
            columns=["customer_first_name", "customer_last_name"],
            must_not_match=True,
        )
        query_params, _ = negated._query_compiler._operations._resolve_tasks(
            negated._query_compiler
        )
        expected_multi = {
            "query": {
                "bool": {
                    "must_not": {
                        "multi_match": {
                            "fields": [
                                "customer_first_name",
                                "customer_last_name",
                            ],
                            "query": "joe",
                        }
                    }
                }
            }
        }
        assert query_params.query.to_search_body() == expected_multi

    def test_es_match_phrase(self):
        """``match_phrase=True`` selects the phrase variant of each query."""
        ed_df = self.ed_ecommerce()
        compiler: QueryCompiler = ed_df._query_compiler

        single_column = compiler.es_match(
            "joe", ["customer_full_name"], match_phrase=True
        )
        expected_phrase = {
            "match_phrase": {
                "customer_full_name": {
                    "query": "joe",
                }
            }
        }
        assert single_column.build() == expected_phrase

        several_columns = compiler.es_match(
            "joe", ["customer_last_name", "customer_first_name"], match_phrase=True
        )
        expected_multi_phrase = {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
        assert several_columns.build() == expected_multi_phrase

    def test_es_match_phrase_not_allowed_with_multi_match_type(self):
        """A non-'phrase' ``multi_match_type`` conflicts with ``match_phrase``."""
        ed_df = self.ed_ecommerce()
        compiler: QueryCompiler = ed_df._query_compiler

        with pytest.raises(ValueError) as excinfo:
            compiler.es_match(
                "joe",
                ["customer_first_name", "customer_last_name"],
                match_phrase=True,
                multi_match_type="best_fields",
            )
        expected_message = (
            "match_phrase=True and multi_match_type='best_fields' "
            "are not compatible. Must be multi_match_type='phrase'"
        )
        assert str(excinfo.value) == expected_message

        # Explicitly asking for 'phrase' is accepted
        phrase_filter = compiler.es_match(
            "joe",
            ["customer_last_name", "customer_first_name"],
            match_phrase=True,
            multi_match_type="phrase",
        )
        expected_multi_phrase = {
            "multi_match": {
                "query": "joe",
                "type": "phrase",
                "fields": ["customer_last_name", "customer_first_name"],
            }
        }
        assert phrase_filter.build() == expected_multi_phrase

    def test_es_match_non_text_fields(self):
        """Non-text columns are rejected unless the check is switched off."""
        ed_df = self.ed_ecommerce()
        compiler: QueryCompiler = ed_df._query_compiler

        with pytest.raises(ValueError) as excinfo:
            compiler.es_match(
                "joe",
                [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            )
        expected_message = (
            "Attempting to run es_match() on non-text fields (order_date=date, "
            "currency=keyword) means that these fields may not be analyzed properly. "
            "Consider reindexing these fields as text or use 'match_only_text_es_dtypes=False' "
            "to use match anyways"
        )
        assert str(excinfo.value) == expected_message

        # With the check disabled the query is built and made lenient
        lenient_filter = compiler.es_match(
            "joe",
            [
                "customer_first_name",
                "order_date",
                "customer_last_name",
                "currency",
                "order_*",
            ],
            match_only_text_fields=False,
        )
        expected_lenient = {
            "multi_match": {
                "query": "joe",
                "lenient": True,
                "fields": [
                    "customer_first_name",
                    "order_date",
                    "customer_last_name",
                    "currency",
                    "order_*",
                ],
            }
        }
        assert lenient_filter.build() == expected_lenient

View File

@ -0,0 +1,41 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
class TestEsMatch(TestData):
    """Checks ``Series.es_match`` filters against the ecommerce test data."""

    def test_match(self):
        """Indexing with the filter keeps only rows mentioning "Men's"."""
        ed_df = self.ed_ecommerce()
        mens_filter = ed_df.category.es_match("Men's")
        categories = list(ed_df[mens_filter].category.to_pandas())

        assert len(categories) > 0
        assert all(
            any("Men's" in label for label in row_categories)
            for row_categories in categories
        )

    def test_must_not_match(self):
        """Filters combine with ``~`` and ``&`` like other boolean filters."""
        ed_df = self.ed_ecommerce()
        combined_filter = ~ed_df.category.es_match(
            "Men's"
        ) & ed_df.category.es_match("Women's")
        categories = list(ed_df[combined_filter].category.to_pandas())

        assert len(categories) > 0
        assert all(
            all("Men's" not in label for label in row_categories)
            for row_categories in categories
        )
        assert all(
            any("Women's" in label for label in row_categories)
            for row_categories in categories
        )