mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Support the v8.0 Elasticsearch client
This commit is contained in:
parent
1ffbe002c4
commit
109387184a
@ -11,7 +11,7 @@ without overloading your machine.
|
||||
-------------------------------------
|
||||
>>> import eland as ed
|
||||
>>> # Connect to 'flights' index via localhost Elasticsearch node
|
||||
>>> df = ed.DataFrame('localhost:9200', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
|
||||
# eland.DataFrame instance has the same API as pandas.DataFrame
|
||||
# except all data is in Elasticsearch. See .info() memory usage.
|
||||
|
@ -19,7 +19,7 @@ model in Elasticsearch
|
||||
|
||||
# Import the model into Elasticsearch
|
||||
>>> es_model = MLModel.import_model(
|
||||
es_client="localhost:9200",
|
||||
es_client="http://localhost:9200",
|
||||
model_id="xgb-classifier",
|
||||
model=xgb_model,
|
||||
feature_names=["f0", "f1", "f2", "f3", "f4"],
|
||||
|
@ -20,13 +20,13 @@ The recommended way to set your requirements in your `setup.py` or
|
||||
[discrete]
|
||||
=== Getting Started
|
||||
|
||||
Create a `DataFrame` object connected to an {es} cluster running on `localhost:9200`:
|
||||
Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`:
|
||||
|
||||
[source,python]
|
||||
------------------------------------
|
||||
>>> import eland as ed
|
||||
>>> df = ed.DataFrame(
|
||||
... es_client="localhost:9200",
|
||||
... es_client="http://localhost:9200",
|
||||
... es_index_pattern="flights",
|
||||
... )
|
||||
>>> df
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -309,8 +309,10 @@ def elasticsearch_date_to_pandas_date(
|
||||
def ensure_es_client(
|
||||
es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
|
||||
) -> Elasticsearch:
|
||||
if isinstance(es_client, tuple):
|
||||
es_client = list(es_client)
|
||||
if not isinstance(es_client, Elasticsearch):
|
||||
es_client = Elasticsearch(es_client)
|
||||
es_client = Elasticsearch(es_client) # type: ignore[arg-type]
|
||||
return es_client
|
||||
|
||||
|
||||
@ -334,16 +336,3 @@ def es_version(es_client: Elasticsearch) -> Tuple[int, int, int]:
|
||||
else:
|
||||
eland_es_version = es_client._eland_es_version # type: ignore
|
||||
return eland_es_version
|
||||
|
||||
|
||||
def es_api_compat(
|
||||
method: Callable[..., Dict[str, Any]], **kwargs: Any
|
||||
) -> Dict[str, Any]:
|
||||
"""Expands the 'body' parameter to top-level parameters
|
||||
on clients that would raise DeprecationWarnings if used.
|
||||
"""
|
||||
if ES_CLIENT_HAS_V8_0_DEPRECATIONS:
|
||||
body = kwargs.pop("body", None)
|
||||
if body:
|
||||
kwargs.update(body)
|
||||
return method(**kwargs)
|
||||
|
@ -56,7 +56,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
es_client: Elasticsearch client argument(s) (e.g. 'localhost:9200')
|
||||
es_client: Elasticsearch client argument(s) (e.g. 'http://localhost:9200')
|
||||
- elasticsearch-py parameters or
|
||||
- elasticsearch-py instance
|
||||
es_index_pattern: str
|
||||
@ -74,7 +74,7 @@ class DataFrame(NDFrame):
|
||||
--------
|
||||
Constructing DataFrame from an Elasticsearch configuration arguments and an Elasticsearch index
|
||||
|
||||
>>> df = ed.DataFrame('localhost:9200', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df.head()
|
||||
AvgTicketPrice Cancelled ... dayOfWeek timestamp
|
||||
0 841.265642 False ... 0 2018-01-01 00:00:00
|
||||
@ -89,7 +89,7 @@ class DataFrame(NDFrame):
|
||||
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
|
||||
|
||||
>>> from elasticsearch import Elasticsearch
|
||||
>>> es = Elasticsearch("localhost:9200")
|
||||
>>> es = Elasticsearch("http://localhost:9200")
|
||||
>>> df = ed.DataFrame(es_client=es, es_index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled'])
|
||||
>>> df.head()
|
||||
AvgTicketPrice Cancelled
|
||||
@ -106,7 +106,7 @@ class DataFrame(NDFrame):
|
||||
(TODO - currently index_field must also be a field if not _id)
|
||||
|
||||
>>> df = ed.DataFrame(
|
||||
... es_client='localhost',
|
||||
... es_client='http://localhost:9200',
|
||||
... es_index_pattern='flights',
|
||||
... columns=['AvgTicketPrice', 'timestamp'],
|
||||
... es_index_field='timestamp'
|
||||
@ -170,7 +170,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> assert isinstance(df.columns, pd.Index)
|
||||
>>> df.columns
|
||||
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
|
||||
@ -198,7 +198,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df.empty
|
||||
False
|
||||
"""
|
||||
@ -228,7 +228,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'Dest'])
|
||||
>>> df.head(3)
|
||||
Origin Dest
|
||||
0 Frankfurt am Main Airport Sydney Kingsford Smith International Airport
|
||||
@ -263,7 +263,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'Dest'])
|
||||
>>> df.tail()
|
||||
Origin \\
|
||||
13054 Pisa International Airport...
|
||||
@ -365,7 +365,7 @@ class DataFrame(NDFrame):
|
||||
--------
|
||||
Drop a column
|
||||
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'email', 'user'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'email', 'user'])
|
||||
>>> df.drop(columns=['user'])
|
||||
customer_first_name email
|
||||
0 Eddie eddie@underwood-family.zzz
|
||||
@ -575,7 +575,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df.count()
|
||||
customer_first_name 4675
|
||||
geoip.city_name 4094
|
||||
@ -597,7 +597,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
|
||||
>>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
|
||||
>>> df = df.tail()
|
||||
@ -692,7 +692,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame("localhost:9200", "ecommerce")
|
||||
>>> df = ed.DataFrame("http://localhost:9200", "ecommerce")
|
||||
>>> df.es_match("Men's", columns=["category"])
|
||||
category currency ... type user
|
||||
0 [Men's Clothing] EUR ... order eddie
|
||||
@ -754,7 +754,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
.. _geo-distance query: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-geo-distance-query.html
|
||||
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df.es_query({"bool": {"filter": {"geo_distance": {"distance": "1km", "geoip.location": [55.3, 25.3]}}}}).head()
|
||||
customer_first_name geoip.city_name
|
||||
1 Mary Dubai
|
||||
@ -830,7 +830,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
|
||||
>>> df.info()
|
||||
<class 'eland.dataframe.DataFrame'>
|
||||
Index: 4675 entries, 0 to 4674
|
||||
@ -1366,7 +1366,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights',
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights',
|
||||
... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
|
||||
>>> df.dtypes
|
||||
AvgTicketPrice float64
|
||||
@ -1407,7 +1407,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce')
|
||||
>>> df.shape
|
||||
(4675, 45)
|
||||
"""
|
||||
@ -1462,7 +1462,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head()
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head()
|
||||
>>> df
|
||||
AvgTicketPrice Cancelled
|
||||
0 841.265642 False
|
||||
@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head()
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head()
|
||||
>>> df
|
||||
AvgTicketPrice Cancelled
|
||||
0 841.265642 False
|
||||
@ -1614,7 +1614,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
|
||||
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
|
||||
AvgTicketPrice DistanceKilometers
|
||||
sum 8204364 92616288
|
||||
@ -1689,7 +1689,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_flights = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> ed_flights = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> ed_flights.groupby(["DestCountry", "Cancelled"]).agg(["min", "max"], numeric_only=True) # doctest: +NORMALIZE_WHITESPACE
|
||||
AvgTicketPrice dayOfWeek
|
||||
min max min max
|
||||
@ -1784,7 +1784,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_ecommerce = ed.DataFrame('localhost', 'ecommerce')
|
||||
>>> ed_ecommerce = ed.DataFrame('http://localhost:9200', 'ecommerce')
|
||||
>>> ed_df = ed_ecommerce.filter(["total_quantity", "geoip.city_name", "customer_birth_date", "day_of_week", "taxful_total_price"])
|
||||
>>> ed_df.mode(numeric_only=False)
|
||||
total_quantity geoip.city_name customer_birth_date day_of_week taxful_total_price
|
||||
@ -1849,7 +1849,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights')
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
|
||||
>>> ed_flights.quantile() # doctest: +SKIP
|
||||
AvgTicketPrice 640.387285
|
||||
@ -1892,7 +1892,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights')
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
|
||||
>>> ed_flights.idxmax()
|
||||
AvgTicketPrice 1843
|
||||
@ -1924,7 +1924,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights')
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
|
||||
>>> ed_flights.idxmin()
|
||||
AvgTicketPrice 5454
|
||||
@ -1960,7 +1960,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df.shape
|
||||
(13059, 27)
|
||||
>>> df.query('FlightDelayMin > 60').shape
|
||||
@ -2004,7 +2004,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df.get('Carrier')
|
||||
0 Kibana Airlines
|
||||
1 Logstash Airways
|
||||
@ -2135,7 +2135,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5)
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5)
|
||||
>>> pd_df = ed.eland_to_pandas(ed_df)
|
||||
>>> print(f"type(ed_df)={type(ed_df)}\\ntype(pd_df)={type(pd_df)}")
|
||||
type(ed_df)=<class 'eland.dataframe.DataFrame'>
|
||||
|
19
eland/etl.py
19
eland/etl.py
@ -24,12 +24,7 @@ from elasticsearch import Elasticsearch
|
||||
from elasticsearch.helpers import parallel_bulk
|
||||
|
||||
from eland import DataFrame
|
||||
from eland.common import (
|
||||
DEFAULT_CHUNK_SIZE,
|
||||
PANDAS_VERSION,
|
||||
ensure_es_client,
|
||||
es_api_compat,
|
||||
)
|
||||
from eland.common import DEFAULT_CHUNK_SIZE, PANDAS_VERSION, ensure_es_client
|
||||
from eland.field_mappings import FieldMappings, verify_mapping_compatibility
|
||||
|
||||
try:
|
||||
@ -128,7 +123,7 @@ def pandas_to_eland(
|
||||
|
||||
|
||||
>>> ed_df = ed.pandas_to_eland(pd_df,
|
||||
... 'localhost',
|
||||
... 'http://localhost:9200',
|
||||
... 'pandas_to_eland',
|
||||
... es_if_exists="replace",
|
||||
... es_refresh=True,
|
||||
@ -175,7 +170,7 @@ def pandas_to_eland(
|
||||
|
||||
elif es_if_exists == "replace":
|
||||
es_client.indices.delete(index=es_dest_index)
|
||||
es_api_compat(es_client.indices.create, index=es_dest_index, body=mapping)
|
||||
es_client.indices.create(index=es_dest_index, mappings=mapping["mappings"])
|
||||
|
||||
elif es_if_exists == "append":
|
||||
dest_mapping = es_client.indices.get_mapping(index=es_dest_index)[
|
||||
@ -187,7 +182,7 @@ def pandas_to_eland(
|
||||
es_type_overrides=es_type_overrides,
|
||||
)
|
||||
else:
|
||||
es_api_compat(es_client.indices.create, index=es_dest_index, body=mapping)
|
||||
es_client.indices.create(index=es_dest_index, mappings=mapping["mappings"])
|
||||
|
||||
def action_generator(
|
||||
pd_df: pd.DataFrame,
|
||||
@ -252,7 +247,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights').head()
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights').head()
|
||||
>>> type(ed_df)
|
||||
<class 'eland.dataframe.DataFrame'>
|
||||
>>> ed_df
|
||||
@ -282,7 +277,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
|
||||
|
||||
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
|
||||
|
||||
>>> pd_df = ed.eland_to_pandas(ed.DataFrame('localhost', 'flights'), show_progress=True) # doctest: +SKIP
|
||||
>>> pd_df = ed.eland_to_pandas(ed.DataFrame('http://localhost:9200', 'flights'), show_progress=True) # doctest: +SKIP
|
||||
2020-01-29 12:43:36.572395: read 10000 rows
|
||||
2020-01-29 12:43:37.309031: read 13059 rows
|
||||
|
||||
@ -420,7 +415,7 @@ def csv_to_eland( # type: ignore
|
||||
|
||||
>>> ed.csv_to_eland(
|
||||
... "churn.csv",
|
||||
... es_client='localhost',
|
||||
... es_client='http://localhost:9200',
|
||||
... es_dest_index='churn',
|
||||
... es_refresh=True,
|
||||
... index_col=0
|
||||
|
@ -515,7 +515,7 @@ class FieldMappings:
|
||||
@staticmethod
|
||||
def _generate_es_mappings(
|
||||
dataframe: "pd.DataFrame", es_type_overrides: Optional[Mapping[str, str]] = None
|
||||
) -> Dict[str, Dict[str, Dict[str, Any]]]:
|
||||
) -> Dict[str, Dict[str, Any]]:
|
||||
"""Given a pandas dataframe, generate the associated Elasticsearch mapping
|
||||
|
||||
Parameters
|
||||
@ -894,20 +894,20 @@ def verify_mapping_compatibility(
|
||||
problems = []
|
||||
es_type_overrides = es_type_overrides or {}
|
||||
|
||||
ed_mapping = ed_mapping["mappings"]["properties"]
|
||||
es_mapping = es_mapping["mappings"]["properties"]
|
||||
ed_props = ed_mapping["mappings"]["properties"]
|
||||
es_props = es_mapping["mappings"]["properties"]
|
||||
|
||||
for key in sorted(es_mapping.keys()):
|
||||
if key not in ed_mapping:
|
||||
for key in sorted(es_props.keys()):
|
||||
if key not in ed_props:
|
||||
problems.append(f"- {key!r} is missing from DataFrame columns")
|
||||
|
||||
for key, key_def in sorted(ed_mapping.items()):
|
||||
if key not in es_mapping:
|
||||
for key, key_def in sorted(ed_props.items()):
|
||||
if key not in es_props:
|
||||
problems.append(f"- {key!r} is missing from ES index mapping")
|
||||
continue
|
||||
|
||||
key_type = es_type_overrides.get(key, key_def["type"])
|
||||
es_key_type = es_mapping[key]["type"]
|
||||
es_key_type = es_props[key]["type"]
|
||||
if key_type != es_key_type and es_key_type not in ES_COMPATIBLE_TYPES.get(
|
||||
key_type, ()
|
||||
):
|
||||
|
@ -68,7 +68,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
|
||||
@ -119,7 +119,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -170,7 +170,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -221,7 +221,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
|
||||
@ -272,7 +272,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
|
||||
@ -323,7 +323,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -374,7 +374,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -425,7 +425,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -476,7 +476,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -526,7 +526,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_df = ed.DataFrame('localhost', 'flights')
|
||||
>>> ed_df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
|
||||
>>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP
|
||||
AvgTicketPrice FlightDelayMin
|
||||
@ -616,7 +616,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
|
||||
@ -670,7 +670,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost", "flights",
|
||||
... "http://localhost:9200", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
|
||||
|
@ -79,7 +79,7 @@ class MLModel:
|
||||
model_id: str
|
||||
The unique identifier of the trained inference model in Elasticsearch.
|
||||
"""
|
||||
self._client = ensure_es_client(es_client)
|
||||
self._client: Elasticsearch = ensure_es_client(es_client)
|
||||
self._model_id = model_id
|
||||
self._trained_model_config_cache: Optional[Dict[str, Any]] = None
|
||||
|
||||
@ -120,7 +120,7 @@ class MLModel:
|
||||
>>> # Serialise the model to Elasticsearch
|
||||
>>> feature_names = ["f0", "f1", "f2", "f3", "f4", "f5"]
|
||||
>>> model_id = "test_xgb_regressor"
|
||||
>>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')
|
||||
>>> es_model = MLModel.import_model('http://localhost:9200', model_id, regressor, feature_names, es_if_exists='replace')
|
||||
|
||||
>>> # Get some test results from Elasticsearch model
|
||||
>>> es_model.predict(test_data) # doctest: +SKIP
|
||||
@ -167,20 +167,18 @@ class MLModel:
|
||||
)
|
||||
|
||||
results = self._client.ingest.simulate(
|
||||
body={
|
||||
"pipeline": {
|
||||
"processors": [
|
||||
{
|
||||
"inference": {
|
||||
"model_id": self._model_id,
|
||||
"inference_config": {self.model_type: {}},
|
||||
field_map_name: {},
|
||||
}
|
||||
pipeline={
|
||||
"processors": [
|
||||
{
|
||||
"inference": {
|
||||
"model_id": self._model_id,
|
||||
"inference_config": {self.model_type: {}},
|
||||
field_map_name: {},
|
||||
}
|
||||
]
|
||||
},
|
||||
"docs": docs,
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
docs=docs,
|
||||
)
|
||||
|
||||
# Unpack results into an array. Errors can be present
|
||||
@ -342,7 +340,7 @@ class MLModel:
|
||||
>>> feature_names = ["f0", "f1", "f2", "f3", "f4"]
|
||||
>>> model_id = "test_decision_tree_classifier"
|
||||
>>> es_model = MLModel.import_model(
|
||||
... 'localhost',
|
||||
... 'http://localhost:9200',
|
||||
... model_id=model_id,
|
||||
... model=classifier,
|
||||
... feature_names=feature_names,
|
||||
@ -383,22 +381,21 @@ class MLModel:
|
||||
elif es_if_exists == "replace":
|
||||
ml_model.delete_model()
|
||||
|
||||
body: Dict[str, Any] = {
|
||||
"input": {"field_names": feature_names},
|
||||
}
|
||||
# 'inference_config' is required in 7.8+ but isn't available in <=7.7
|
||||
if es_version(es_client) >= (7, 8):
|
||||
body["inference_config"] = {model_type: {}}
|
||||
|
||||
if es_compress_model_definition:
|
||||
body["compressed_definition"] = serializer.serialize_and_compress_model()
|
||||
ml_model._client.ml.put_trained_model(
|
||||
model_id=model_id,
|
||||
input={"field_names": feature_names},
|
||||
inference_config={model_type: {}},
|
||||
compressed_definition=serializer.serialize_and_compress_model(),
|
||||
)
|
||||
else:
|
||||
body["definition"] = serializer.serialize_model()
|
||||
ml_model._client.ml.put_trained_model(
|
||||
model_id=model_id,
|
||||
input={"field_names": feature_names},
|
||||
inference_config={model_type: {}},
|
||||
definition=serializer.serialize_model(),
|
||||
)
|
||||
|
||||
ml_model._client.ml.put_trained_model(
|
||||
model_id=model_id,
|
||||
body=body,
|
||||
)
|
||||
return ml_model
|
||||
|
||||
def delete_model(self) -> None:
|
||||
@ -408,7 +405,9 @@ class MLModel:
|
||||
If model doesn't exist, ignore failure.
|
||||
"""
|
||||
try:
|
||||
self._client.ml.delete_trained_model(model_id=self._model_id, ignore=(404,))
|
||||
self._client.options(ignore_status=404).ml.delete_trained_model(
|
||||
model_id=self._model_id
|
||||
)
|
||||
except elasticsearch.NotFoundError:
|
||||
pass
|
||||
|
||||
@ -426,16 +425,7 @@ class MLModel:
|
||||
def _trained_model_config(self) -> Dict[str, Any]:
|
||||
"""Lazily loads an ML models 'trained_model_config' information"""
|
||||
if self._trained_model_config_cache is None:
|
||||
|
||||
# In Elasticsearch 7.7 and earlier you can't get
|
||||
# target type without pulling the model definition
|
||||
# so we check the version first.
|
||||
if es_version(self._client) < (7, 8):
|
||||
resp = self._client.ml.get_trained_models(
|
||||
model_id=self._model_id, include_model_definition=True
|
||||
)
|
||||
else:
|
||||
resp = self._client.ml.get_trained_models(model_id=self._model_id)
|
||||
resp = self._client.ml.get_trained_models(model_id=self._model_id)
|
||||
|
||||
if resp["count"] > 1:
|
||||
raise ValueError(f"Model ID {self._model_id!r} wasn't unambiguous")
|
||||
|
@ -46,21 +46,19 @@ class PyTorchModel:
|
||||
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
|
||||
model_id: str,
|
||||
):
|
||||
self._client = ensure_es_client(es_client)
|
||||
self._client: Elasticsearch = ensure_es_client(es_client)
|
||||
self.model_id = model_id
|
||||
|
||||
def put_config(self, path: str) -> None:
|
||||
with open(path) as f:
|
||||
config = json.load(f)
|
||||
self._client.ml.put_trained_model(model_id=self.model_id, body=config)
|
||||
self._client.ml.put_trained_model(model_id=self.model_id, **config)
|
||||
|
||||
def put_vocab(self, path: str) -> None:
|
||||
with open(path) as f:
|
||||
vocab = json.load(f)
|
||||
self._client.transport.perform_request(
|
||||
"PUT",
|
||||
f"/_ml/trained_models/{self.model_id}/vocabulary",
|
||||
body=vocab,
|
||||
self._client.ml.put_trained_model_vocabulary(
|
||||
model_id=self.model_id, vocabulary=vocab["vocabulary"]
|
||||
)
|
||||
|
||||
def put_model(self, model_path: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> None:
|
||||
@ -76,15 +74,12 @@ class PyTorchModel:
|
||||
yield base64.b64encode(data).decode()
|
||||
|
||||
for i, data in tqdm(enumerate(model_file_chunk_generator()), total=total_parts):
|
||||
body = {
|
||||
"total_definition_length": model_size,
|
||||
"total_parts": total_parts,
|
||||
"definition": data,
|
||||
}
|
||||
self._client.transport.perform_request(
|
||||
"PUT",
|
||||
f"/_ml/trained_models/{self.model_id}/definition/{i}",
|
||||
body=body,
|
||||
self._client.ml.put_trained_model_definition_part(
|
||||
model_id=self.model_id,
|
||||
part=i,
|
||||
total_definition_length=model_size,
|
||||
total_parts=total_parts,
|
||||
definition=data,
|
||||
)
|
||||
|
||||
def import_model(
|
||||
@ -100,42 +95,41 @@ class PyTorchModel:
|
||||
self.put_vocab(vocab_path)
|
||||
|
||||
def infer(
|
||||
self, body: Dict[str, Any], timeout: str = DEFAULT_TIMEOUT
|
||||
) -> Union[bool, Any]:
|
||||
return self._client.transport.perform_request(
|
||||
"POST",
|
||||
f"/_ml/trained_models/{self.model_id}/deployment/_infer",
|
||||
body=body,
|
||||
params={"timeout": timeout, "request_timeout": 60},
|
||||
self,
|
||||
docs: List[Dict[str, str]],
|
||||
timeout: str = DEFAULT_TIMEOUT,
|
||||
) -> Any:
|
||||
return self._client.options(
|
||||
request_timeout=60
|
||||
).ml.infer_trained_model_deployment(
|
||||
model_id=self.model_id,
|
||||
timeout=timeout,
|
||||
docs=docs,
|
||||
)
|
||||
|
||||
def start(self, timeout: str = DEFAULT_TIMEOUT) -> None:
|
||||
self._client.transport.perform_request(
|
||||
"POST",
|
||||
f"/_ml/trained_models/{self.model_id}/deployment/_start",
|
||||
params={"timeout": timeout, "request_timeout": 60, "wait_for": "started"},
|
||||
self._client.options(request_timeout=60).ml.start_trained_model_deployment(
|
||||
model_id=self.model_id, timeout=timeout, wait_for="started"
|
||||
)
|
||||
|
||||
def stop(self) -> None:
|
||||
self._client.transport.perform_request(
|
||||
"POST",
|
||||
f"/_ml/trained_models/{self.model_id}/deployment/_stop",
|
||||
params={"ignore": 404},
|
||||
)
|
||||
self._client.ml.stop_trained_model_deployment(model_id=self.model_id)
|
||||
|
||||
def delete(self) -> None:
|
||||
self._client.ml.delete_trained_model(model_id=self.model_id, ignore=(404,))
|
||||
self._client.options(ignore_status=404).ml.delete_trained_model(
|
||||
model_id=self.model_id
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list(
|
||||
cls, es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"]
|
||||
) -> Set[str]:
|
||||
client = ensure_es_client(es_client)
|
||||
res = client.ml.get_trained_models(model_id="*", allow_no_match=True)
|
||||
resp = client.ml.get_trained_models(model_id="*", allow_no_match=True)
|
||||
return set(
|
||||
[
|
||||
model["model_id"]
|
||||
for model in res["trained_model_configs"]
|
||||
for model in resp["trained_model_configs"]
|
||||
if model["model_type"] == "pytorch"
|
||||
]
|
||||
)
|
||||
|
@ -99,7 +99,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> assert isinstance(df.index, ed.Index)
|
||||
>>> df.index.es_index_field
|
||||
'_id'
|
||||
@ -127,7 +127,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
|
||||
>>> df.dtypes
|
||||
Origin object
|
||||
AvgTicketPrice float64
|
||||
@ -149,7 +149,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
|
||||
>>> df.es_dtypes
|
||||
Origin keyword
|
||||
AvgTicketPrice float
|
||||
@ -213,7 +213,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.mean() # doctest: +SKIP
|
||||
AvgTicketPrice 628.254
|
||||
Cancelled 0.128494
|
||||
@ -262,7 +262,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.sum() # doctest: +SKIP
|
||||
AvgTicketPrice 8.20436e+06
|
||||
Cancelled 1678
|
||||
@ -310,7 +310,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.min() # doctest: +SKIP
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
@ -357,7 +357,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.var() # doctest: +SKIP
|
||||
AvgTicketPrice 70964.570234
|
||||
Cancelled 0.111987
|
||||
@ -403,7 +403,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.std() # doctest: +SKIP
|
||||
AvgTicketPrice 266.407061
|
||||
Cancelled 0.334664
|
||||
@ -449,7 +449,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.median() # doctest: +SKIP
|
||||
AvgTicketPrice 640.363
|
||||
Cancelled False
|
||||
@ -498,7 +498,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.max() # doctest: +SKIP
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
@ -557,7 +557,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=columns)
|
||||
>>> df.nunique()
|
||||
category 6
|
||||
currency 1
|
||||
@ -583,7 +583,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.mad() # doctest: +SKIP
|
||||
AvgTicketPrice 213.35497
|
||||
dayOfWeek 2.00000
|
||||
@ -628,7 +628,7 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
|
||||
>>> df.describe() # doctest: +SKIP
|
||||
AvgTicketPrice FlightDelayMin
|
||||
count 13059.000000 13059.000000
|
||||
|
@ -34,7 +34,6 @@ from typing import (
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd # type: ignore
|
||||
from elasticsearch.exceptions import NotFoundError
|
||||
|
||||
from eland.actions import PostProcessingAction
|
||||
from eland.common import (
|
||||
@ -45,8 +44,6 @@ from eland.common import (
|
||||
SortOrder,
|
||||
build_pd_series,
|
||||
elasticsearch_date_to_pandas_date,
|
||||
es_api_compat,
|
||||
es_version,
|
||||
)
|
||||
from eland.index import Index
|
||||
from eland.query import Query
|
||||
@ -173,7 +170,7 @@ class Operations:
|
||||
body.exists(field, must=True)
|
||||
|
||||
field_exists_count = query_compiler._client.count(
|
||||
index=query_compiler._index_pattern, body=body.to_count_body()
|
||||
index=query_compiler._index_pattern, **body.to_count_body()
|
||||
)["count"]
|
||||
counts[field] = field_exists_count
|
||||
|
||||
@ -240,7 +237,7 @@ class Operations:
|
||||
|
||||
# Fetch Response
|
||||
response = query_compiler._client.search(
|
||||
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
|
||||
index=query_compiler._index_pattern, size=0, **body.to_search_body()
|
||||
)
|
||||
response = response["aggregations"]
|
||||
|
||||
@ -404,7 +401,7 @@ class Operations:
|
||||
)
|
||||
|
||||
response = query_compiler._client.search(
|
||||
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
|
||||
index=query_compiler._index_pattern, size=0, **body.to_search_body()
|
||||
)
|
||||
|
||||
"""
|
||||
@ -1275,7 +1272,7 @@ class Operations:
|
||||
body.exists(field, must=True)
|
||||
|
||||
count: int = query_compiler._client.count(
|
||||
index=query_compiler._index_pattern, body=body.to_count_body()
|
||||
index=query_compiler._index_pattern, **body.to_count_body()
|
||||
)["count"]
|
||||
return count
|
||||
|
||||
@ -1313,7 +1310,7 @@ class Operations:
|
||||
body.terms(field, items, must=True)
|
||||
|
||||
count: int = query_compiler._client.count(
|
||||
index=query_compiler._index_pattern, body=body.to_count_body()
|
||||
index=query_compiler._index_pattern, **body.to_count_body()
|
||||
)["count"]
|
||||
return count
|
||||
|
||||
@ -1488,99 +1485,16 @@ def _search_yield_hits(
|
||||
[[{'_index': 'flights', '_type': '_doc', '_id': '0', '_score': None, '_source': {...}, 'sort': [...]},
|
||||
{'_index': 'flights', '_type': '_doc', '_id': '1', '_score': None, '_source': {...}, 'sort': [...]}]]
|
||||
"""
|
||||
# No documents, no reason to send a search.
|
||||
if max_number_of_hits == 0:
|
||||
return
|
||||
|
||||
# Make a copy of 'body' to avoid mutating it outside this function.
|
||||
body = body.copy()
|
||||
|
||||
# Use the default search size
|
||||
body.setdefault("size", DEFAULT_SEARCH_SIZE)
|
||||
|
||||
# Elasticsearch 7.12 added '_shard_doc' sort tiebreaker for PITs which
|
||||
# means we're guaranteed to be safe on documents with a duplicate sort rank.
|
||||
if es_version(query_compiler._client) >= (7, 12, 0):
|
||||
yield from _search_with_pit_and_search_after(
|
||||
query_compiler=query_compiler,
|
||||
body=body,
|
||||
max_number_of_hits=max_number_of_hits,
|
||||
)
|
||||
|
||||
# Otherwise we use 'scroll' like we used to.
|
||||
else:
|
||||
yield from _search_with_scroll(
|
||||
query_compiler=query_compiler,
|
||||
body=body,
|
||||
max_number_of_hits=max_number_of_hits,
|
||||
)
|
||||
|
||||
|
||||
def _search_with_scroll(
|
||||
query_compiler: "QueryCompiler",
|
||||
body: Dict[str, Any],
|
||||
max_number_of_hits: Optional[int],
|
||||
) -> Generator[List[Dict[str, Any]], None, None]:
|
||||
# No documents, no reason to send a search.
|
||||
if max_number_of_hits == 0:
|
||||
return
|
||||
|
||||
client = query_compiler._client
|
||||
hits_yielded = 0
|
||||
|
||||
# Make the initial search with 'scroll' set
|
||||
resp = es_api_compat(
|
||||
client.search,
|
||||
index=query_compiler._index_pattern,
|
||||
body=body,
|
||||
scroll=DEFAULT_PIT_KEEP_ALIVE,
|
||||
)
|
||||
scroll_id: Optional[str] = resp.get("_scroll_id", None)
|
||||
|
||||
try:
|
||||
while scroll_id and (
|
||||
max_number_of_hits is None or hits_yielded < max_number_of_hits
|
||||
):
|
||||
hits: List[Dict[str, Any]] = resp["hits"]["hits"]
|
||||
|
||||
# If we didn't receive any hits it means we've reached the end.
|
||||
if not hits:
|
||||
break
|
||||
|
||||
# Calculate which hits should be yielded from this batch
|
||||
if max_number_of_hits is None:
|
||||
hits_to_yield = len(hits)
|
||||
else:
|
||||
hits_to_yield = min(len(hits), max_number_of_hits - hits_yielded)
|
||||
|
||||
# Yield the hits we need to and then track the total number.
|
||||
# Never yield an empty list as that makes things simpler for
|
||||
# downstream consumers.
|
||||
if hits and hits_to_yield > 0:
|
||||
yield hits[:hits_to_yield]
|
||||
hits_yielded += hits_to_yield
|
||||
|
||||
# Retrieve the next set of results
|
||||
resp = client.scroll(
|
||||
body={"scroll_id": scroll_id, "scroll": DEFAULT_PIT_KEEP_ALIVE},
|
||||
)
|
||||
scroll_id = resp.get("_scroll_id", None) # Update the scroll ID.
|
||||
|
||||
finally:
|
||||
# Close the scroll if we have one open
|
||||
if scroll_id is not None:
|
||||
try:
|
||||
client.clear_scroll(body={"scroll_id": [scroll_id]})
|
||||
except NotFoundError:
|
||||
pass
|
||||
|
||||
|
||||
def _search_with_pit_and_search_after(
|
||||
query_compiler: "QueryCompiler",
|
||||
body: Dict[str, Any],
|
||||
max_number_of_hits: Optional[int],
|
||||
) -> Generator[List[Dict[str, Any]], None, None]:
|
||||
|
||||
# No documents, no reason to send a search.
|
||||
if max_number_of_hits == 0:
|
||||
return
|
||||
|
||||
client = query_compiler._client
|
||||
hits_yielded = 0 # Track the total number of hits yielded.
|
||||
pit_id: Optional[str] = None
|
||||
@ -1603,7 +1517,7 @@ def _search_with_pit_and_search_after(
|
||||
body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
|
||||
|
||||
while max_number_of_hits is None or hits_yielded < max_number_of_hits:
|
||||
resp = es_api_compat(client.search, body=body)
|
||||
resp = client.search(**body)
|
||||
hits: List[Dict[str, Any]] = resp["hits"]["hits"]
|
||||
|
||||
# The point in time ID can change between searches so we
|
||||
@ -1636,8 +1550,4 @@ def _search_with_pit_and_search_after(
|
||||
# We want to cleanup the point in time if we allocated one
|
||||
# to keep our memory footprint low.
|
||||
if pit_id is not None:
|
||||
try:
|
||||
client.close_point_in_time(body={"id": pit_id})
|
||||
except NotFoundError:
|
||||
# If a point in time is already closed Elasticsearch throws NotFoundError
|
||||
pass
|
||||
client.options(ignore_status=404).close_point_in_time(id=pit_id)
|
||||
|
@ -43,7 +43,7 @@ def ed_hist_series(
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df[df.OriginWeather == 'Sunny']['FlightTimeMin'].hist(alpha=0.5, density=True) # doctest: +SKIP
|
||||
>>> df[df.OriginWeather != 'Sunny']['FlightTimeMin'].hist(alpha=0.5, density=True) # doctest: +SKIP
|
||||
>>> plt.show() # doctest: +SKIP
|
||||
@ -109,7 +109,7 @@ def ed_hist_frame(
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) # doctest: +SKIP
|
||||
"""
|
||||
return hist_frame(
|
||||
|
@ -97,7 +97,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed.Series(es_client='localhost', es_index_pattern='flights', name='Carrier')
|
||||
>>> ed.Series(es_client='http://localhost:9200', es_index_pattern='flights', name='Carrier')
|
||||
0 Kibana Airlines
|
||||
1 Logstash Airways
|
||||
2 Logstash Airways
|
||||
@ -165,7 +165,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.Series('localhost', 'ecommerce', name='total_quantity')
|
||||
>>> df = ed.Series('http://localhost:9200', 'ecommerce', name='total_quantity')
|
||||
>>> df.shape
|
||||
(4675, 1)
|
||||
"""
|
||||
@ -214,7 +214,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df.Carrier
|
||||
0 Kibana Airlines
|
||||
1 Logstash Airways
|
||||
@ -290,7 +290,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df['Carrier'].value_counts()
|
||||
Logstash Airways 3331
|
||||
JetBeats 3274
|
||||
@ -587,7 +587,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_flights = ed.DataFrame('localhost', 'flights')
|
||||
>>> ed_flights = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> ed_flights["timestamp"].quantile([.2,.5,.75]) # doctest: +SKIP
|
||||
0.20 2018-01-09 04:30:57.289159912
|
||||
0.50 2018-01-21 23:39:27.031627441
|
||||
@ -691,7 +691,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_ecommerce = ed.DataFrame('localhost', 'ecommerce')
|
||||
>>> ed_ecommerce = ed.DataFrame('http://localhost:9200', 'ecommerce')
|
||||
>>> ed_ecommerce["day_of_week"].mode()
|
||||
0 Thursday
|
||||
dtype: object
|
||||
@ -760,7 +760,7 @@ class Series(NDFrame):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame(
|
||||
... "localhost:9200", "ecommerce",
|
||||
... "http://localhost:9200", "ecommerce",
|
||||
... columns=["category", "taxful_total_price"]
|
||||
... )
|
||||
>>> df[
|
||||
@ -807,7 +807,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -867,7 +867,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -906,7 +906,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -945,7 +945,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -984,7 +984,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1023,7 +1023,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1062,7 +1062,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1101,7 +1101,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1133,7 +1133,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1165,7 +1165,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1197,7 +1197,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1229,7 +1229,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1261,7 +1261,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.total_quantity
|
||||
0 2
|
||||
1 2
|
||||
@ -1293,7 +1293,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5)
|
||||
>>> df.taxful_total_price
|
||||
0 36.98
|
||||
1 53.98
|
||||
@ -1415,7 +1415,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.max())
|
||||
1199
|
||||
"""
|
||||
@ -1439,7 +1439,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.mean())
|
||||
628
|
||||
"""
|
||||
@ -1463,7 +1463,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.median())
|
||||
640
|
||||
"""
|
||||
@ -1487,7 +1487,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.min())
|
||||
100
|
||||
"""
|
||||
@ -1511,7 +1511,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.sum())
|
||||
8204364
|
||||
"""
|
||||
@ -1533,7 +1533,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['Carrier']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['Carrier']
|
||||
>>> s.nunique()
|
||||
4
|
||||
"""
|
||||
@ -1555,7 +1555,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.var())
|
||||
70964
|
||||
"""
|
||||
@ -1577,7 +1577,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.std())
|
||||
266
|
||||
"""
|
||||
@ -1599,7 +1599,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
|
||||
>>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice']
|
||||
>>> int(s.mad())
|
||||
213
|
||||
"""
|
||||
@ -1627,7 +1627,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights') # ignoring percentiles as they don't generate consistent results
|
||||
>>> df.AvgTicketPrice.describe() # doctest: +SKIP
|
||||
count 13059.000000
|
||||
mean 628.253689
|
||||
@ -1660,7 +1660,7 @@ class Series(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ed_s = ed.Series('localhost', 'flights', name='Carrier').head(5)
|
||||
>>> ed_s = ed.Series('http://localhost:9200', 'flights', name='Carrier').head(5)
|
||||
>>> pd_s = ed.eland_to_pandas(ed_s)
|
||||
>>> print(f"type(ed_s)={type(ed_s)}\\ntype(pd_s)={type(pd_s)}")
|
||||
type(ed_s)=<class 'eland.series.Series'>
|
||||
|
42
noxfile.py
42
noxfile.py
@ -71,19 +71,19 @@ def lint(session):
|
||||
# Install numpy to use its mypy plugin
|
||||
# https://numpy.org/devdocs/reference/typing.html#mypy-plugin
|
||||
session.install("black", "flake8", "mypy", "isort", "numpy")
|
||||
session.install("--pre", "elasticsearch")
|
||||
session.install("--pre", "elasticsearch>=8.0.0a1,<9")
|
||||
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
|
||||
session.run("black", "--check", "--target-version=py37", *SOURCE_FILES)
|
||||
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
|
||||
session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES)
|
||||
|
||||
# TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
|
||||
session.log("mypy --strict eland/")
|
||||
session.log("mypy --show-error-codes --strict eland/")
|
||||
for typed_file in TYPED_FILES:
|
||||
if not os.path.isfile(typed_file):
|
||||
session.error(f"The file {typed_file!r} couldn't be found")
|
||||
process = subprocess.run(
|
||||
["mypy", "--strict", typed_file],
|
||||
["mypy", "--show-error-codes", "--strict", typed_file],
|
||||
env=session.env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
@ -100,14 +100,15 @@ def lint(session):
|
||||
session.error("\n" + "\n".join(sorted(set(errors))))
|
||||
|
||||
|
||||
@nox.session(python=["3.7", "3.8", "3.9"])
|
||||
@nox.session(python=["3.7", "3.8", "3.9", "3.10"])
|
||||
@nox.parametrize("pandas_version", ["1.2.0", "1.3.0"])
|
||||
def test(session, pandas_version: str):
|
||||
session.install("-r", "requirements-dev.txt")
|
||||
session.install(".")
|
||||
session.run("python", "-m", "pip", "install", f"pandas~={pandas_version}")
|
||||
session.run("python", "-m", "tests.setup_tests")
|
||||
session.run(
|
||||
|
||||
pytest_args = (
|
||||
"python",
|
||||
"-m",
|
||||
"pytest",
|
||||
@ -116,6 +117,13 @@ def test(session, pandas_version: str):
|
||||
"--cov-config=setup.cfg",
|
||||
"--doctest-modules",
|
||||
"--nbval",
|
||||
)
|
||||
|
||||
# PyTorch doesn't support Python 3.10 yet
|
||||
if session.python == "3.10":
|
||||
pytest_args += ("--ignore=eland/ml/pytorch",)
|
||||
session.run(
|
||||
*pytest_args,
|
||||
*(session.posargs or ("eland/", "tests/")),
|
||||
)
|
||||
|
||||
@ -144,21 +152,25 @@ def docs(session):
|
||||
|
||||
# See if we have an Elasticsearch cluster active
|
||||
# to rebuild the Jupyter notebooks with.
|
||||
es_active = False
|
||||
try:
|
||||
import elasticsearch
|
||||
from elasticsearch import ConnectionError, Elasticsearch
|
||||
|
||||
es = elasticsearch.Elasticsearch("localhost:9200")
|
||||
es.info()
|
||||
if not es.indices.exists("flights"):
|
||||
session.run("python", "-m", "tests.setup_tests")
|
||||
es_active = True
|
||||
except Exception:
|
||||
es_active = False
|
||||
try:
|
||||
es = Elasticsearch("http://localhost:9200")
|
||||
es.info()
|
||||
if not es.indices.exists(index="flights"):
|
||||
session.run("python", "-m", "tests.setup_tests")
|
||||
es_active = True
|
||||
except ConnectionError:
|
||||
pass
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Rebuild all the example notebooks inplace
|
||||
if es_active:
|
||||
session.install("jupyter-client", "ipykernel")
|
||||
for filename in os.listdir(BASE_DIR / "docs/source/examples"):
|
||||
for filename in os.listdir(BASE_DIR / "docs/sphinx/examples"):
|
||||
if (
|
||||
filename.endswith(".ipynb")
|
||||
and filename != "introduction_to_eland_webinar.ipynb"
|
||||
@ -170,7 +182,7 @@ def docs(session):
|
||||
"notebook",
|
||||
"--inplace",
|
||||
"--execute",
|
||||
str(BASE_DIR / "docs/source/examples" / filename),
|
||||
str(BASE_DIR / "docs/sphinx/examples" / filename),
|
||||
)
|
||||
|
||||
session.cd("docs")
|
||||
|
@ -1,4 +1,4 @@
|
||||
elasticsearch>=7.7
|
||||
elasticsearch>=8.0.0a1,<9
|
||||
pandas>=1.2.0
|
||||
matplotlib
|
||||
pytest>=5.2.1
|
||||
@ -11,6 +11,9 @@ nox
|
||||
lightgbm
|
||||
pytest-cov
|
||||
mypy
|
||||
sentence-transformers>=2.1.0
|
||||
torch>=1.9.0
|
||||
transformers[torch]>=4.12.0
|
||||
huggingface-hub>=0.0.17
|
||||
|
||||
# Torch doesn't support Python 3.10 yet (pytorch/pytorch#66424)
|
||||
sentence-transformers>=2.1.0; python_version<'3.10'
|
||||
torch>=1.9.0; python_version<'3.10'
|
||||
transformers[torch]>=4.12.0; python_version<'3.10'
|
||||
|
@ -1,3 +1,3 @@
|
||||
elasticsearch>=7.7
|
||||
elasticsearch>=8.0.0a1,<9
|
||||
pandas>=1
|
||||
matplotlib
|
||||
|
2
setup.py
2
setup.py
@ -82,7 +82,7 @@ setup(
|
||||
keywords="elastic eland pandas python",
|
||||
packages=find_packages(include=["eland", "eland.*"]),
|
||||
install_requires=[
|
||||
"elasticsearch>=7.11,<8",
|
||||
"elasticsearch>=8.0.0a1,<9",
|
||||
"pandas>=1.2,<1.4",
|
||||
"matplotlib",
|
||||
"numpy",
|
||||
|
@ -25,17 +25,12 @@ from eland.common import es_version
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Define test files and indices
|
||||
ELASTICSEARCH_HOST = os.environ.get("ELASTICSEARCH_HOST") or "localhost"
|
||||
ELASTICSEARCH_HOST = os.environ.get(
|
||||
"ELASTICSEARCH_URL", os.environ.get("ELASTICSEARCH_HOST", "http://localhost:9200")
|
||||
)
|
||||
|
||||
# Define client to use in tests
|
||||
TEST_SUITE = os.environ.get("TEST_SUITE", "xpack")
|
||||
if TEST_SUITE == "xpack":
|
||||
ES_TEST_CLIENT = Elasticsearch(
|
||||
ELASTICSEARCH_HOST,
|
||||
http_auth=("elastic", "changeme"),
|
||||
)
|
||||
else:
|
||||
ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)
|
||||
ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)
|
||||
|
||||
ES_VERSION = es_version(ES_TEST_CLIENT)
|
||||
|
||||
|
@ -42,7 +42,7 @@ class TestDataFrameDateTime(TestData):
|
||||
usually contains tests).
|
||||
"""
|
||||
es = ES_TEST_CLIENT
|
||||
if es.indices.exists(cls.time_index_name):
|
||||
if es.indices.exists(index=cls.time_index_name):
|
||||
es.indices.delete(index=cls.time_index_name)
|
||||
dts = [datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%f%z") for time in cls.times]
|
||||
|
||||
@ -58,11 +58,11 @@ class TestDataFrameDateTime(TestData):
|
||||
|
||||
body = {"mappings": mappings}
|
||||
index = "test_time_formats"
|
||||
es.indices.delete(index=index, ignore=[400, 404])
|
||||
es.options(ignore_status=[400, 404]).indices.delete(index=index)
|
||||
es.indices.create(index=index, body=body)
|
||||
|
||||
for i, time_formats in enumerate(time_formats_docs):
|
||||
es.index(index=index, body=time_formats, id=i)
|
||||
es.index(index=index, id=i, document=time_formats)
|
||||
es.indices.refresh(index=index)
|
||||
|
||||
@classmethod
|
||||
|
@ -69,7 +69,7 @@ class TestDataFrameQuery(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
|
||||
|
||||
ES_TEST_CLIENT.indices.delete(index_name)
|
||||
ES_TEST_CLIENT.indices.delete(index=index_name)
|
||||
|
||||
def test_simple_query(self):
|
||||
ed_flights = self.ed_flights()
|
||||
@ -141,4 +141,4 @@ class TestDataFrameQuery(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
|
||||
|
||||
ES_TEST_CLIENT.indices.delete(index_name)
|
||||
ES_TEST_CLIENT.indices.delete(index=index_name)
|
||||
|
@ -99,7 +99,7 @@ class TestDataFrameToCSV(TestData):
|
||||
print(pd_flights_from_csv.head())
|
||||
|
||||
# clean up index
|
||||
ES_TEST_CLIENT.indices.delete(test_index)
|
||||
ES_TEST_CLIENT.indices.delete(index=test_index)
|
||||
|
||||
def test_pd_to_csv_without_filepath(self):
|
||||
|
||||
|
@ -122,7 +122,7 @@ class TestDataFrameUtils(TestData):
|
||||
}
|
||||
}
|
||||
|
||||
mapping = ES_TEST_CLIENT.indices.get_mapping(index_name)
|
||||
mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)
|
||||
|
||||
assert expected_mapping == mapping
|
||||
|
||||
|
@ -195,7 +195,7 @@ class TestPandasToEland:
|
||||
)
|
||||
|
||||
# Assert that the value 128 caused the index error
|
||||
assert "Value [128] is out of range for a byte" in str(e.value)
|
||||
assert "Value [128] is out of range for a byte" in str(e.value.errors)
|
||||
|
||||
def test_pandas_to_eland_text_inserts_keyword(self):
|
||||
es = ES_TEST_CLIENT
|
||||
|
@ -32,7 +32,7 @@ class TestDateTime(TestData):
|
||||
usually contains tests).
|
||||
"""
|
||||
es = ES_TEST_CLIENT
|
||||
if es.indices.exists(cls.time_index_name):
|
||||
if es.indices.exists(index=cls.time_index_name):
|
||||
es.indices.delete(index=cls.time_index_name)
|
||||
dts = [datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%f%z") for time in cls.times]
|
||||
|
||||
@ -46,13 +46,12 @@ class TestDateTime(TestData):
|
||||
mappings["properties"][field_name]["type"] = "date"
|
||||
mappings["properties"][field_name]["format"] = field_name
|
||||
|
||||
body = {"mappings": mappings}
|
||||
index = "test_time_formats"
|
||||
es.indices.delete(index=index, ignore=[400, 404])
|
||||
es.indices.create(index=index, body=body)
|
||||
es.options(ignore_status=[400, 404]).indices.delete(index=index)
|
||||
es.indices.create(index=index, mappings=mappings)
|
||||
|
||||
for i, time_formats in enumerate(time_formats_docs):
|
||||
es.index(index=index, body=time_formats, id=i)
|
||||
es.index(index=index, id=i, document=time_formats)
|
||||
es.indices.refresh(index=index)
|
||||
|
||||
@classmethod
|
||||
|
@ -90,5 +90,5 @@ class TestPytorchModel:
|
||||
def test_text_classification(self, model_id, task, text_input, value):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
ptm = download_model_and_start_deployment(tmp_dir, True, model_id, task)
|
||||
result = ptm.infer({"docs": [{"text_field": text_input}]})
|
||||
result = ptm.infer(docs=[{"text_field": text_input}])
|
||||
assert result["predicted_value"] == value
|
||||
|
File diff suppressed because one or more lines are too long
@ -18,7 +18,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "False"
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
@ -27,7 +29,7 @@
|
||||
],
|
||||
"source": [
|
||||
"es = Elasticsearch()\n",
|
||||
"ed_df = ed.DataFrame('localhost', 'flights', columns = [\"AvgTicketPrice\", \"Cancelled\", \"dayOfWeek\", \"timestamp\", \"DestCountry\"])\n",
|
||||
"ed_df = ed.DataFrame('http://localhost:9200', 'flights', columns = [\"AvgTicketPrice\", \"Cancelled\", \"dayOfWeek\", \"timestamp\", \"DestCountry\"])\n",
|
||||
"es.indices.exists(index=\"churn\")"
|
||||
]
|
||||
},
|
||||
@ -57,7 +59,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "pandas.core.frame.DataFrame"
|
||||
"text/plain": [
|
||||
"pandas.core.frame.DataFrame"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
@ -75,8 +79,125 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " account length area code churn customer service calls \\\n0 128 415 0 1 \n1 107 415 0 1 \n\n international plan number vmail messages phone number state \\\n0 no 25 382-4657 KS \n1 no 26 371-7191 OH \n\n total day calls total day charge ... total eve calls total eve charge \\\n0 110 45.07 ... 99 16.78 \n1 123 27.47 ... 103 16.62 \n\n total eve minutes total intl calls total intl charge total intl minutes \\\n0 197.4 3 2.7 10.0 \n1 195.5 3 3.7 13.7 \n\n total night calls total night charge total night minutes voice mail plan \n0 91 11.01 244.7 yes \n1 103 11.45 254.4 yes \n\n[2 rows x 21 columns]",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>account length</th>\n <th>area code</th>\n <th>churn</th>\n <th>customer service calls</th>\n <th>international plan</th>\n <th>number vmail messages</th>\n <th>phone number</th>\n <th>state</th>\n <th>total day calls</th>\n <th>total day charge</th>\n <th>...</th>\n <th>total eve calls</th>\n <th>total eve charge</th>\n <th>total eve minutes</th>\n <th>total intl calls</th>\n <th>total intl charge</th>\n <th>total intl minutes</th>\n <th>total night calls</th>\n <th>total night charge</th>\n <th>total night minutes</th>\n <th>voice mail plan</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>128</td>\n <td>415</td>\n <td>0</td>\n <td>1</td>\n <td>no</td>\n <td>25</td>\n <td>382-4657</td>\n <td>KS</td>\n <td>110</td>\n <td>45.07</td>\n <td>...</td>\n <td>99</td>\n <td>16.78</td>\n <td>197.4</td>\n <td>3</td>\n <td>2.7</td>\n <td>10.0</td>\n <td>91</td>\n <td>11.01</td>\n <td>244.7</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>107</td>\n <td>415</td>\n <td>0</td>\n <td>1</td>\n <td>no</td>\n <td>26</td>\n <td>371-7191</td>\n <td>OH</td>\n <td>123</td>\n <td>27.47</td>\n <td>...</td>\n <td>103</td>\n <td>16.62</td>\n <td>195.5</td>\n <td>3</td>\n <td>3.7</td>\n <td>13.7</td>\n <td>103</td>\n <td>11.45</td>\n <td>254.4</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>\n<p>2 rows × 21 columns</p>"
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>account length</th>\n",
|
||||
" <th>area code</th>\n",
|
||||
" <th>churn</th>\n",
|
||||
" <th>customer service calls</th>\n",
|
||||
" <th>international plan</th>\n",
|
||||
" <th>number vmail messages</th>\n",
|
||||
" <th>phone number</th>\n",
|
||||
" <th>state</th>\n",
|
||||
" <th>total day calls</th>\n",
|
||||
" <th>total day charge</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>total eve calls</th>\n",
|
||||
" <th>total eve charge</th>\n",
|
||||
" <th>total eve minutes</th>\n",
|
||||
" <th>total intl calls</th>\n",
|
||||
" <th>total intl charge</th>\n",
|
||||
" <th>total intl minutes</th>\n",
|
||||
" <th>total night calls</th>\n",
|
||||
" <th>total night charge</th>\n",
|
||||
" <th>total night minutes</th>\n",
|
||||
" <th>voice mail plan</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>128</td>\n",
|
||||
" <td>415</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>no</td>\n",
|
||||
" <td>25</td>\n",
|
||||
" <td>382-4657</td>\n",
|
||||
" <td>KS</td>\n",
|
||||
" <td>110</td>\n",
|
||||
" <td>45.07</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>99</td>\n",
|
||||
" <td>16.78</td>\n",
|
||||
" <td>197.4</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>2.7</td>\n",
|
||||
" <td>10.0</td>\n",
|
||||
" <td>91</td>\n",
|
||||
" <td>11.01</td>\n",
|
||||
" <td>244.7</td>\n",
|
||||
" <td>yes</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>107</td>\n",
|
||||
" <td>415</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>no</td>\n",
|
||||
" <td>26</td>\n",
|
||||
" <td>371-7191</td>\n",
|
||||
" <td>OH</td>\n",
|
||||
" <td>123</td>\n",
|
||||
" <td>27.47</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>103</td>\n",
|
||||
" <td>16.62</td>\n",
|
||||
" <td>195.5</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3.7</td>\n",
|
||||
" <td>13.7</td>\n",
|
||||
" <td>103</td>\n",
|
||||
" <td>11.45</td>\n",
|
||||
" <td>254.4</td>\n",
|
||||
" <td>yes</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>\n",
|
||||
"<p>2 rows × 21 columns</p>"
|
||||
],
|
||||
"text/plain": [
|
||||
" account length area code churn customer service calls \\\n",
|
||||
"0 128 415 0 1 \n",
|
||||
"1 107 415 0 1 \n",
|
||||
"\n",
|
||||
" international plan number vmail messages phone number state \\\n",
|
||||
"0 no 25 382-4657 KS \n",
|
||||
"1 no 26 371-7191 OH \n",
|
||||
"\n",
|
||||
" total day calls total day charge ... total eve calls total eve charge \\\n",
|
||||
"0 110 45.07 ... 99 16.78 \n",
|
||||
"1 123 27.47 ... 103 16.62 \n",
|
||||
"\n",
|
||||
" total eve minutes total intl calls total intl charge total intl minutes \\\n",
|
||||
"0 197.4 3 2.7 10.0 \n",
|
||||
"1 195.5 3 3.7 13.7 \n",
|
||||
"\n",
|
||||
" total night calls total night charge total night minutes voice mail plan \n",
|
||||
"0 91 11.01 244.7 yes \n",
|
||||
"1 103 11.45 254.4 yes \n",
|
||||
"\n",
|
||||
"[2 rows x 21 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
@ -85,7 +206,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_IGNORE_OUTPUT\n",
|
||||
"ed.csv_to_eland(\"./test_churn.csv\", es_client='localhost', es_dest_index='churn', es_refresh=True, index_col=0)"
|
||||
"ed.csv_to_eland(\"./test_churn.csv\", es_client='http://localhost:9200', es_dest_index='churn', es_refresh=True, index_col=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -95,7 +216,37 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "{'took': 0,\n 'timed_out': False,\n '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n 'hits': {'total': {'value': 2, 'relation': 'eq'},\n 'max_score': 1.0,\n 'hits': [{'_index': 'churn',\n '_id': '0',\n '_score': 1.0,\n '_source': {'state': 'KS',\n 'account length': 128,\n 'area code': 415,\n 'phone number': '382-4657',\n 'international plan': 'no',\n 'voice mail plan': 'yes',\n 'number vmail messages': 25,\n 'total day minutes': 265.1,\n 'total day calls': 110,\n 'total day charge': 45.07,\n 'total eve minutes': 197.4,\n 'total eve calls': 99,\n 'total eve charge': 16.78,\n 'total night minutes': 244.7,\n 'total night calls': 91,\n 'total night charge': 11.01,\n 'total intl minutes': 10.0,\n 'total intl calls': 3,\n 'total intl charge': 2.7,\n 'customer service calls': 1,\n 'churn': 0}}]}}"
|
||||
"text/plain": [
|
||||
"{'took': 0,\n",
|
||||
" 'timed_out': False,\n",
|
||||
" '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
|
||||
" 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
|
||||
" 'max_score': 1.0,\n",
|
||||
" 'hits': [{'_index': 'churn',\n",
|
||||
" '_id': '0',\n",
|
||||
" '_score': 1.0,\n",
|
||||
" '_source': {'state': 'KS',\n",
|
||||
" 'account length': 128,\n",
|
||||
" 'area code': 415,\n",
|
||||
" 'phone number': '382-4657',\n",
|
||||
" 'international plan': 'no',\n",
|
||||
" 'voice mail plan': 'yes',\n",
|
||||
" 'number vmail messages': 25,\n",
|
||||
" 'total day minutes': 265.1,\n",
|
||||
" 'total day calls': 110,\n",
|
||||
" 'total day charge': 45.07,\n",
|
||||
" 'total eve minutes': 197.4,\n",
|
||||
" 'total eve calls': 99,\n",
|
||||
" 'total eve charge': 16.78,\n",
|
||||
" 'total night minutes': 244.7,\n",
|
||||
" 'total night calls': 91,\n",
|
||||
" 'total night charge': 11.01,\n",
|
||||
" 'total intl minutes': 10.0,\n",
|
||||
" 'total intl calls': 3,\n",
|
||||
" 'total intl charge': 2.7,\n",
|
||||
" 'customer service calls': 1,\n",
|
||||
" 'churn': 0}}]}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
@ -114,7 +265,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "{'acknowledged': True}"
|
||||
"text/plain": [
|
||||
"{'acknowledged': True}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
@ -147,4 +300,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ed_df = ed.DataFrame('localhost', 'flights', columns=[\"AvgTicketPrice\", \"Cancelled\", \"dayOfWeek\", \"timestamp\", \"DestCountry\"])"
|
||||
"ed_df = ed.DataFrame('http://localhost:9200', 'flights', columns=[\"AvgTicketPrice\", \"Cancelled\", \"dayOfWeek\", \"timestamp\", \"DestCountry\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -32,7 +32,13 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 640.387285\nCancelled False\ndayOfWeek 3\ntimestamp 2018-01-21 23:43:19.256498944\ndtype: object"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 640.387285\n",
|
||||
"Cancelled False\n",
|
||||
"dayOfWeek 3\n",
|
||||
"timestamp 2018-01-21 23:43:19.256498944\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
@ -51,7 +57,12 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 640.387285\nCancelled 0.000000\ndayOfWeek 3.000000\ndtype: float64"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 640.387285\n",
|
||||
"Cancelled 0.000000\n",
|
||||
"dayOfWeek 3.000000\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
@ -70,7 +81,14 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 640.387285\nCancelled False\ndayOfWeek 3\ntimestamp 2018-01-21 23:43:19.256498944\nDestCountry NaN\ndtype: object"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 640.387285\n",
|
||||
"Cancelled False\n",
|
||||
"dayOfWeek 3\n",
|
||||
"timestamp 2018-01-21 23:43:19.256498944\n",
|
||||
"DestCountry NaN\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
@ -89,7 +107,11 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 213.430365\ndayOfWeek 2.000000\ndtype: float64"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 213.430365\n",
|
||||
"dayOfWeek 2.000000\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
@ -108,7 +130,11 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 213.430365\ndayOfWeek 2.000000\ndtype: float64"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 213.430365\n",
|
||||
"dayOfWeek 2.000000\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
@ -127,7 +153,14 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "AvgTicketPrice 213.430365\nCancelled NaN\ndayOfWeek 2.0\ntimestamp NaT\nDestCountry NaN\ndtype: object"
|
||||
"text/plain": [
|
||||
"AvgTicketPrice 213.430365\n",
|
||||
"Cancelled NaN\n",
|
||||
"dayOfWeek 2.0\n",
|
||||
"timestamp NaT\n",
|
||||
"DestCountry NaN\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
@ -161,4 +194,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
@ -17,8 +17,8 @@
|
||||
|
||||
import pandas as pd
|
||||
from elasticsearch import helpers
|
||||
from elasticsearch._sync.client import Elasticsearch
|
||||
|
||||
from eland.common import es_version
|
||||
from tests import (
|
||||
ECOMMERCE_FILE_NAME,
|
||||
ECOMMERCE_INDEX_NAME,
|
||||
@ -53,9 +53,9 @@ def _setup_data(es):
|
||||
|
||||
# Delete index
|
||||
print("Deleting index:", index_name)
|
||||
es.indices.delete(index=index_name, ignore=[400, 404])
|
||||
es.options(ignore_status=[400, 404]).indices.delete(index=index_name)
|
||||
print("Creating index:", index_name)
|
||||
es.indices.create(index=index_name, body=mapping)
|
||||
es.indices.create(index=index_name, **mapping)
|
||||
|
||||
df = pd.read_json(json_file_name, lines=True)
|
||||
|
||||
@ -85,30 +85,28 @@ def _setup_data(es):
|
||||
print("Done", index_name)
|
||||
|
||||
|
||||
def _update_max_compilations_limit(es, limit="10000/1m"):
|
||||
def _update_max_compilations_limit(es: Elasticsearch, limit="10000/1m"):
|
||||
print("Updating script.max_compilations_rate to ", limit)
|
||||
if es_version(es) < (7, 8):
|
||||
body = {"transient": {"script.max_compilations_rate": limit}}
|
||||
else:
|
||||
body = {
|
||||
"transient": {
|
||||
"script.max_compilations_rate": "use-context",
|
||||
"script.context.field.max_compilations_rate": limit,
|
||||
}
|
||||
es.cluster.put_settings(
|
||||
transient={
|
||||
"script.max_compilations_rate": "use-context",
|
||||
"script.context.field.max_compilations_rate": limit,
|
||||
}
|
||||
es.cluster.put_settings(body=body)
|
||||
)
|
||||
|
||||
|
||||
def _setup_test_mappings(es):
|
||||
def _setup_test_mappings(es: Elasticsearch):
|
||||
# Create a complex mapping containing many Elasticsearch features
|
||||
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
|
||||
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
|
||||
es.options(ignore_status=[400, 404]).indices.delete(index=TEST_MAPPING1_INDEX_NAME)
|
||||
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, **TEST_MAPPING1)
|
||||
|
||||
|
||||
def _setup_test_nested(es):
|
||||
es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
|
||||
es.options(ignore_status=[400, 404]).indices.delete(
|
||||
index=TEST_NESTED_USER_GROUP_INDEX_NAME
|
||||
)
|
||||
es.indices.create(
|
||||
index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING
|
||||
index=TEST_NESTED_USER_GROUP_INDEX_NAME, **TEST_NESTED_USER_GROUP_MAPPING
|
||||
)
|
||||
|
||||
helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
|
||||
|
Loading…
x
Reference in New Issue
Block a user