Correcting license files + fixing bug in filter

LICENSE and NOTICE conform to Elastic policy. Bug in
nested negated filters fixed.

Also, some limited cleanup.
This commit is contained in:
Stephen Dodson 2019-12-03 13:56:49 +00:00
parent a3dd86075a
commit bf6c56878a
10 changed files with 243 additions and 11 deletions

52
NOTICE.txt Normal file
View File

@ -0,0 +1,52 @@
eland source code
Copyright 2019 Elasticsearch BV
---
This product includes code that is adapted from pandas, which is
available under a "BSD-3-Clause" license.
https://github.com/pandas-dev/pandas
BSD 3-Clause License
Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----
This product includes code that is adapted from pandasticsearch, which is
available under a "MIT" license.
https://github.com/onesuper/pandasticsearch
Copyright (C) 2016 onesuper
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,6 +1,9 @@
# What is it?
eland is a elasticsearch client Python package to analyse, explore and manipulate data that resides in elasticsearch. Where possible the package uses existing Python APIs and data structures to make it easy to switch between Numpy, Pandas, Scikit-learn to their elasticsearch powered equivalents. In general, the data resides in elasticsearch and not in memory, which allows eland to access large datasets stored in elasticsearch.
eland is an elasticsearch client [Python](https://www.python.org/) package to analyse, explore and manipulate data that resides in elasticsearch.
Where possible the package uses existing Python APIs and data structures to make it easy to switch from numpy,
pandas and scikit-learn to their elasticsearch-powered equivalents. In general, the data resides in elasticsearch and
not in memory, which allows eland to access large datasets stored in elasticsearch.
For example, to explore data in a large elasticsearch index, simply create an eland DataFrame from an elasticsearch index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API:
@ -29,6 +32,21 @@ min 0.000000 0.000000 0.000000
max 400140.000000 246.000000 5.000000
```
## Connecting to Elasticsearch Cloud
```python
>>> es = Elasticsearch(cloud_id="<cloud_id>", http_auth=('<user>','<password>'))
>>> es.info()
{'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
>>> import eland as ed
>>> df = ed.read_es(es, 'reviews')
```
## Development Setup
1. Create a virtual environment in Python

View File

@ -1,4 +1,4 @@
# Derived from pandasticsearch filters
# Originally based on code in pandasticsearch filters
# Es filter builder for BooleanCond
class BooleanFilter:
@ -8,20 +8,34 @@ class BooleanFilter:
def __and__(self, x):
# Combine results
if isinstance(self, AndFilter):
self.subtree['must'].append(x.subtree)
if 'must_not' in x.subtree:
# nest a must_not under a must
self.subtree['must'].append(x.build()) # 'build includes bool'
else:
# append a must to a must
self.subtree['must'].append(x.subtree) # 'subtree strips bool'
return self
elif isinstance(x, AndFilter):
x.subtree['must'].append(self.subtree)
if 'must_not' in self.subtree:
x.subtree['must'].append(self.build())
else:
x.subtree['must'].append(self.subtree)
return x
return AndFilter(self, x)
def __or__(self, x):
# Combine results
if isinstance(self, OrFilter):
self.subtree['should'].append(x.subtree)
if 'must_not' in x.subtree:
self.subtree['should'].append(x.build())
else:
self.subtree['should'].append(x.subtree)
return self
elif isinstance(x, OrFilter):
x.subtree['should'].append(self.subtree)
if 'must_not' in self.subtree:
x.subtree['should'].append(self.build())
else:
x.subtree['should'].append(self.subtree)
return x
return OrFilter(self, x)

View File

@ -217,7 +217,6 @@ class NDFrame:
<BLANKLINE>
[4673 rows x 3 columns]
"""
# (derived from modin.base.BasePandasDataset)
# Level not supported
if level is not None:
raise NotImplementedError("level not supported {}".format(level))

View File

@ -99,6 +99,7 @@ class Operations:
for task in reversed(self._tasks):
if task[0] == 'field_names':
return task[1]
return None
def __repr__(self):

View File

@ -309,8 +309,9 @@ class ElandQueryCompiler:
# if the field is numeric. This implementation will currently map
# any script field with "Infinity" as a string to np.inf
if x == 'Infinity':
x = np.inf
out[name[:-1]] = x
out[name[:-1]] = np.inf
else:
out[name[:-1]] = x
flatten(y)

View File

@ -1,6 +1,7 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
from elasticsearch import Elasticsearch
import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST
@ -15,11 +16,22 @@ class TestDataFrameQuery(TestData):
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
index=['0', '1', '2', '3', '4'])
"""
>>> pd_df
A B C
0 1 10 10
1 2 8 9
2 3 6 8
3 4 4 7
4 5 2 6
"""
es = Elasticsearch(ELASTICSEARCH_HOST)
# Now create index
index_name = 'eland_test_query'
ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df = ed.pandas_to_eland(pd_df, es, index_name, if_exists="replace", refresh=True)
assert_pandas_eland_frame_equal(pd_df, ed_df)
@ -43,6 +55,8 @@ class TestDataFrameQuery(TestData):
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
es.indices.delete(index_name)
def test_simple_query(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -55,3 +69,54 @@ class TestDataFrameQuery(TestData):
assert pd_flights[pd_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape == \
ed_flights[ed_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape
def test_multiitem_query(self):
    """Check that chained filters, query() and drop() match between pandas and eland.

    A small 5-row frame is written to the 'eland_test_query' index via
    ``ed.pandas_to_eland``; each derived eland frame is then compared with
    the equivalent pandas frame. The index is deleted at the end, including
    when one of the comparisons fails.
    """
    # Examples from:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
    pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
                         index=['0', '1', '2', '3', '4'])
    # >>> pd_df
    #    A   B   C
    # 0  1  10  10
    # 1  2   8   9
    # 2  3   6   8
    # 3  4   4   7
    # 4  5   2   6

    es = Elasticsearch(ELASTICSEARCH_HOST)

    # Now create index
    index_name = 'eland_test_query'
    ed_df = ed.pandas_to_eland(pd_df, es, index_name, if_exists="replace", refresh=True)

    try:
        assert_pandas_eland_frame_equal(pd_df, ed_df)

        pd_df.info()
        ed_df.info()

        # Boolean-mask selections: scalar comparison and column-vs-column comparisons
        pd_q1 = pd_df[pd_df.A > 2]
        pd_q2 = pd_df[pd_df.A > pd_df.B]
        pd_q3 = pd_df[pd_df.B == pd_df.C]

        ed_q1 = ed_df[ed_df.A > 2]
        ed_q2 = ed_df[ed_df.A > ed_df.B]
        ed_q3 = ed_df[ed_df.B == ed_df.C]

        assert_pandas_eland_frame_equal(pd_q1, ed_q1)
        assert_pandas_eland_frame_equal(pd_q2, ed_q2)
        assert_pandas_eland_frame_equal(pd_q3, ed_q3)

        # Stack a query() on top of an already filtered frame
        ed_q4 = ed_q1.query('B > 2')
        pd_q4 = pd_q1.query('B > 2')

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)

        # Drop rows by index
        ed_q4 = ed_q4.drop(['2'])
        pd_q4 = pd_q4.drop(['2'])

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)
    finally:
        # Clean up the test index so it is not left behind for later tests,
        # whether or not the comparisons above passed.
        es.indices.delete(index=index_name)

View File

@ -65,3 +65,6 @@ class TestDataFrameToCSV(TestData):
# TODO - there is a 'bug' where the Elasticsearch index returns data in a different order to the CSV
print(ed_flights_from_csv.head())
print(pd_flights_from_csv.head())
# clean up index
es.indices.delete(test_index)

View File

@ -1,4 +1,3 @@
# -*- coding: UTF-8 -*-
from eland.filter import *
@ -170,3 +169,83 @@ class TestOperators:
]
}
}
def test_must_and_must_not_filter(self):
    """(a >= 2 AND b >= 2) AND NOT ids: the negation is nested as its own bool clause."""
    range_pair = GreaterEqual('a', 2) & GreaterEqual('b', 2)
    not_ids = ~(IsIn('ids', [1, 2, 3]))
    built = (range_pair & not_ids).build()

    # The negated filter keeps its own 'bool' wrapper inside the 'must' list.
    nested_must_not = {'bool': {'must_not': {'ids': {'values': [1, 2, 3]}}}}
    expected = {
        'bool': {
            'must': [
                {'range': {'a': {'gte': 2}}},
                {'range': {'b': {'gte': 2}}},
                nested_must_not,
            ]
        }
    }

    assert built == expected
def test_must_not_and_must_filter(self):
    """NOT ids AND (a >= 2 AND b >= 2): the negation is appended after the range clauses."""
    not_ids = ~(IsIn('ids', [1, 2, 3]))
    range_pair = GreaterEqual('a', 2) & GreaterEqual('b', 2)
    built = (not_ids & range_pair).build()

    # Although the negation is the left operand, the expected 'must' list
    # holds the two range clauses first and the nested must_not last.
    nested_must_not = {'bool': {'must_not': {'ids': {'values': [1, 2, 3]}}}}
    expected = {
        'bool': {
            'must': [
                {'range': {'a': {'gte': 2}}},
                {'range': {'b': {'gte': 2}}},
                nested_must_not,
            ]
        }
    }

    assert built == expected
def test_must_not_or_must_filter(self):
    """NOT ids OR (a >= 2 OR b >= 2): the negation is appended after the range clauses."""
    not_ids = ~(IsIn('ids', [1, 2, 3]))
    either_range = GreaterEqual('a', 2) | GreaterEqual('b', 2)
    built = (not_ids | either_range).build()

    # Although the negation is the left operand, the expected 'should' list
    # holds the two range clauses first and the nested must_not last.
    nested_must_not = {'bool': {'must_not': {'ids': {'values': [1, 2, 3]}}}}
    expected = {
        'bool': {
            'should': [
                {'range': {'a': {'gte': 2}}},
                {'range': {'b': {'gte': 2}}},
                nested_must_not,
            ]
        }
    }

    assert built == expected
def test_must_or_must_not_filter(self):
    """(a >= 2 OR b >= 2) OR NOT ids: the negation is nested as its own bool clause."""
    either_range = GreaterEqual('a', 2) | GreaterEqual('b', 2)
    not_ids = ~(IsIn('ids', [1, 2, 3]))
    built = (either_range | not_ids).build()

    # The negated filter keeps its own 'bool' wrapper inside the 'should' list.
    nested_must_not = {'bool': {'must_not': {'ids': {'values': [1, 2, 3]}}}}
    expected = {
        'bool': {
            'should': [
                {'range': {'a': {'gte': 2}}},
                {'range': {'b': {'gte': 2}}},
                nested_must_not,
            ]
        }
    }

    assert built == expected