diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 0000000..389f4bc --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,52 @@ +eland source code +Copyright 2019 Elasticsearch BV + +--- +This product includes code that is adapted from pandas, which is +available under a "BSD-3-Clause" license. + +https://github.com/pandas-dev/pandas + +BSD 3-Clause License + +Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---- +This product includes code that is adapted from pandasticsearch, which is +available under a "MIT" license. + +https://github.com/onesuper/pandasticsearch + +Copyright (C) 2016 onesuper + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index e5d3d4f..13bede9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # What is it? -eland is a elasticsearch client Python package to analyse, explore and manipulate data that resides in elasticsearch. 
Where possible the package uses existing Python APIs and data structures to make it easy to switch between Numpy, Pandas, Scikit-learn to their elasticsearch powered equivalents. In general, the data resides in elasticsearch and not in memory, which allows eland to access large datasets stored in elasticsearch. +eland is an elasticsearch client Python package to analyse, explore and manipulate data that resides in elasticsearch. +Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy, +pandas, scikit-learn to their elasticsearch powered equivalents. In general, the data resides in elasticsearch and +not in memory, which allows eland to access large datasets stored in elasticsearch. For example, to explore data in a large elasticsearch index, simply create an eland DataFrame from an elasticsearch index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API: @@ -29,6 +32,21 @@ min 0.000000 0.000000 0.000000 max 400140.000000 246.000000 5.000000 ``` +## Connecting to Elasticsearch Cloud + +```python +>>> es = Elasticsearch(cloud_id="", http_auth=('','')) + +>>> es.info() +{'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'} + +>>> import eland as ed + +>>> df = ed.read_es(es, 'reviews') +``` + + + ## Development Setup 1. 
Create a virtual environment in Python diff --git a/eland/filter.py b/eland/filter.py index 078e4d7..2534bba 100644 --- a/eland/filter.py +++ b/eland/filter.py @@ -1,4 +1,4 @@ -# Derived from pandasticsearch filters +# Originally based on code in pandasticsearch filters # Es filter builder for BooleanCond class BooleanFilter: @@ -8,20 +8,34 @@ class BooleanFilter: def __and__(self, x): # Combine results if isinstance(self, AndFilter): - self.subtree['must'].append(x.subtree) + if 'must_not' in x.subtree: + # nest a must_not under a must + self.subtree['must'].append(x.build()) # 'build includes bool' + else: + # append a must to a must + self.subtree['must'].append(x.subtree) # 'subtree strips bool' return self elif isinstance(x, AndFilter): - x.subtree['must'].append(self.subtree) + if 'must_not' in self.subtree: + x.subtree['must'].append(self.build()) + else: + x.subtree['must'].append(self.subtree) return x return AndFilter(self, x) def __or__(self, x): # Combine results if isinstance(self, OrFilter): - self.subtree['should'].append(x.subtree) + if 'must_not' in x.subtree: + self.subtree['should'].append(x.build()) + else: + self.subtree['should'].append(x.subtree) return self elif isinstance(x, OrFilter): - x.subtree['should'].append(self.subtree) + if 'must_not' in self.subtree: + x.subtree['should'].append(self.build()) + else: + x.subtree['should'].append(self.subtree) return x return OrFilter(self, x) diff --git a/eland/ndframe.py b/eland/ndframe.py index 323ff6c..6a7ddeb 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -217,7 +217,6 @@ class NDFrame: [4673 rows x 3 columns] """ - # (derived from modin.base.BasePandasDataset) # Level not supported if level is not None: raise NotImplementedError("level not supported {}".format(level)) diff --git a/eland/operations.py b/eland/operations.py index b888f86..2be5217 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -99,6 +99,7 @@ class Operations: for task in reversed(self._tasks): if task[0] 
== 'field_names': return task[1] + return None def __repr__(self): diff --git a/eland/query_compiler.py b/eland/query_compiler.py index cc7ce6b..07c93b3 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -309,8 +309,9 @@ class ElandQueryCompiler: # if the field is numeric. This implementation will currently map # any script field with "Infinity" as a string to np.inf if x == 'Infinity': - x = np.inf - out[name[:-1]] = x + out[name[:-1]] = np.inf + else: + out[name[:-1]] = x flatten(y) diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py index d2add40..47eda76 100644 --- a/eland/tests/dataframe/test_query_pytest.py +++ b/eland/tests/dataframe/test_query_pytest.py @@ -1,6 +1,7 @@ # File called _pytest for PyCharm compatability import pandas as pd +from elasticsearch import Elasticsearch import eland as ed from eland.tests.common import ELASTICSEARCH_HOST @@ -15,11 +16,22 @@ class TestDataFrameQuery(TestData): # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)}, index=['0', '1', '2', '3', '4']) + """ + >>> pd_df + A B C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + """ + + es = Elasticsearch(ELASTICSEARCH_HOST) # Now create index index_name = 'eland_test_query' - ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) + ed_df = ed.pandas_to_eland(pd_df, es, index_name, if_exists="replace", refresh=True) assert_pandas_eland_frame_equal(pd_df, ed_df) @@ -43,6 +55,8 @@ class TestDataFrameQuery(TestData): assert_pandas_eland_frame_equal(pd_q4, ed_q4) + es.indices.delete(index_name) + def test_simple_query(self): ed_flights = self.ed_flights() pd_flights = self.pd_flights() @@ -55,3 +69,54 @@ class TestDataFrameQuery(TestData): assert pd_flights[pd_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape == \ 
ed_flights[ed_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape + + def test_multiitem_query(self): + # Examples from: + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html + pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)}, + index=['0', '1', '2', '3', '4']) + """ + >>> pd_df + A B C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + """ + es = Elasticsearch(ELASTICSEARCH_HOST) + + # Now create index + index_name = 'eland_test_query' + + ed_df = ed.pandas_to_eland(pd_df, es, index_name, if_exists="replace", refresh=True) + + assert_pandas_eland_frame_equal(pd_df, ed_df) + + pd_df.info() + ed_df.info() + + pd_q1 = pd_df[pd_df.A > 2] + pd_q2 = pd_df[pd_df.A > pd_df.B] + pd_q3 = pd_df[pd_df.B == pd_df.C] + + ed_q1 = ed_df[ed_df.A > 2] + ed_q2 = ed_df[ed_df.A > ed_df.B] + ed_q3 = ed_df[ed_df.B == ed_df.C] + + assert_pandas_eland_frame_equal(pd_q1, ed_q1) + assert_pandas_eland_frame_equal(pd_q2, ed_q2) + assert_pandas_eland_frame_equal(pd_q3, ed_q3) + + ed_q4 = ed_q1.query('B > 2') + pd_q4 = pd_q1.query('B > 2') + + assert_pandas_eland_frame_equal(pd_q4, ed_q4) + + # Drop rows by index + ed_q4 = ed_q4.drop(['2']) + pd_q4 = pd_q4.drop(['2']) + + assert_pandas_eland_frame_equal(pd_q4, ed_q4) + + #es.indices.delete(index_name) \ No newline at end of file diff --git a/eland/tests/dataframe/test_to_csv_pytest.py b/eland/tests/dataframe/test_to_csv_pytest.py index fd2ae5e..4ff8290 100644 --- a/eland/tests/dataframe/test_to_csv_pytest.py +++ b/eland/tests/dataframe/test_to_csv_pytest.py @@ -65,3 +65,6 @@ class TestDataFrameToCSV(TestData): # TODO - there is a 'bug' where the Elasticsearch index returns data in a different order to the CSV print(ed_flights_from_csv.head()) print(pd_flights_from_csv.head()) + + # clean up index + es.indices.delete(test_index) diff --git a/eland/tests/operators/test_operators_pytest.py b/eland/tests/operators/test_operators_pytest.py index be1ba61..6992d3c 100644 --- 
a/eland/tests/operators/test_operators_pytest.py +++ b/eland/tests/operators/test_operators_pytest.py @@ -1,4 +1,3 @@ -# -*- coding: UTF-8 -*- from eland.filter import * @@ -170,3 +169,83 @@ class TestOperators: ] } } + + def test_must_and_must_not_filter(self): + exp = (GreaterEqual('a', 2) & GreaterEqual('b', 2)) & ~(IsIn('ids', [1, 2, 3])) + a = exp.build() + b = { + 'bool': { + 'must': [ + {'range': {'a': {'gte': 2}}}, + {'range': {'b': {'gte': 2}}}, + { + 'bool': { + 'must_not': { + 'ids': {'values': [1, 2, 3]} + } + } + } + ] + } + } + assert a == b + + def test_must_not_and_must_filter(self): + exp = ~(IsIn('ids', [1, 2, 3])) & (GreaterEqual('a', 2) & GreaterEqual('b', 2)) + a = exp.build() + b = { + 'bool': { + 'must': [ + {'range': {'a': {'gte': 2}}}, + {'range': {'b': {'gte': 2}}}, + { + 'bool': { + 'must_not': { + 'ids': {'values': [1, 2, 3]} + } + } + } + ] + } + } + assert a == b + + def test_must_not_or_must_filter(self): + exp = ~(IsIn('ids', [1, 2, 3])) | (GreaterEqual('a', 2) | GreaterEqual('b', 2)) + a = exp.build() + b = { + 'bool': { + 'should': [ + {'range': {'a': {'gte': 2}}}, + {'range': {'b': {'gte': 2}}}, + { + 'bool': { + 'must_not': { + 'ids': {'values': [1, 2, 3]} + } + } + } + ] + } + } + assert a == b + + def test_must_or_must_not_filter(self): + exp = (GreaterEqual('a', 2) | GreaterEqual('b', 2)) | ~(IsIn('ids', [1, 2, 3])) + a = exp.build() + b = { + 'bool': { + 'should': [ + {'range': {'a': {'gte': 2}}}, + {'range': {'b': {'gte': 2}}}, + { + 'bool': { + 'must_not': { + 'ids': {'values': [1, 2, 3]} + } + } + } + ] + } + } + assert a == b