Too long frame exception fixes (#135)

* Updating test matrix for 7.6 + removing oss for now.

* Resolving 7.6.0 docs issues

* Updating ML docs

* Resolving too_long_frame_exception on large mappings

- Embedded _source parameters in body rather than url (see the sketch below)
- Fixed bug in DataFrame.info on empty DataFrame
- Removed warning from TestImportedMLModel

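A minimal sketch of the idea behind the _source change, assuming elasticsearch-py 7.x and a locally reachable cluster (the index name "thousand_fields" comes from the new test and is only for illustration):

from elasticsearch import Elasticsearch

es = Elasticsearch()

fields = ["long_field_name_%d" % i for i in range(1000)]
body = {"query": {"match_all": {}}}

# As a keyword argument, every field name is encoded into the request URL;
# with 1000+ fields the URL can exceed the HTTP line-length limit and
# Elasticsearch answers with a too_long_frame_exception.
# es.search(index="thousand_fields", body=body, _source=fields)

# Embedded in the request body, the field list travels in the payload instead,
# so the URL stays short no matter how many fields are projected.
body["_source"] = fields
results = es.search(index="thousand_fields", body=body)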
Stephen Dodson 2020-02-26 12:50:14 +00:00 committed by GitHub
parent 206677818f
commit a33ff45ebc
5 changed files with 90 additions and 7 deletions


@@ -555,8 +555,13 @@ class DataFrame(NDFrame):
        # Print index summary e.g.
        # Index: 103 entries, 0 to 102
        # Do this by getting head and tail of dataframe
        head = self.head(1)._to_pandas().index[0]
        tail = self.tail(1)._to_pandas().index[0]
        if self.empty:
            # index[0] is out of bounds for empty df
            head = self.head(1)._to_pandas()
            tail = self.tail(1)._to_pandas()
        else:
            head = self.head(1)._to_pandas().index[0]
            tail = self.tail(1)._to_pandas().index[0]
        index_summary = ', %s to %s' % (pprint_thing(head),
                                        pprint_thing(tail))
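For the DataFrame.info change, a quick plain-pandas illustration (independent of eland) of why the empty case needs its own branch:

import pandas as pd

empty = pd.DataFrame()

# head(1) of an empty frame is itself empty, so its index has no element 0.
print(empty.head(1).index)    # RangeIndex(start=0, stop=0, step=1)
# empty.head(1).index[0]      # raises IndexError: index 0 is out of bounds

# The new branch therefore falls back to the (empty) frame itself, which
# pprint_thing can still render in the index summary.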


@@ -16,7 +16,7 @@ import warnings
from collections import OrderedDict
import pandas as pd
from pandas.core.dtypes.common import is_bool_dtype, is_datetime_or_timedelta_dtype
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
from eland import Index, SortOrder, DEFAULT_CSV_BATCH_OUTPUT_SIZE, DEFAULT_ES_MAX_RESULT_WINDOW, \
elasticsearch_date_to_pandas_date
@@ -602,12 +602,22 @@ class Operations:
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
                try:
                    # For query_compiler._client.search we could add _source
                    # as a parameter, or add this value in body.
                    #
                    # If _source is a parameter it is encoded into the url.
                    #
                    # If _source is a large number of fields (1000+) then this can result in an
                    # extremely long url and a `too_long_frame_exception`. Therefore, add
                    # _source to the body rather than as a _source parameter
                    if _source:
                        body['_source'] = _source
                    es_results = query_compiler._client.search(
                        index=query_compiler._index_pattern,
                        size=size,
                        sort=sort_params,
                        body=body,
                        _source=_source)
                        body=body)
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {


@@ -0,0 +1,49 @@
# Copyright 2020 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatibility
import eland as ed
from eland.tests.common import ES_TEST_CLIENT
from eland.tests.common import TestData
class TestDataFrameBigMapping(TestData):

    def test_big_mapping(self):
        mapping = {'mappings': {'properties': {}}}
        for i in range(0, 1000):
            field_name = "long_field_name_" + str(i)
            mapping['mappings']['properties'][field_name] = {'type': 'float'}
        ES_TEST_CLIENT.indices.delete(index='thousand_fields', ignore=[400, 404])
        ES_TEST_CLIENT.indices.create(index='thousand_fields', body=mapping)
        ed_df = ed.DataFrame(ES_TEST_CLIENT, 'thousand_fields')
        ed_df.info()
        ES_TEST_CLIENT.indices.delete(index='thousand_fields')


@@ -15,6 +15,10 @@
# File called _pytest for PyCharm compatibility
from io import StringIO
import eland as ed
from eland.tests import ES_TEST_CLIENT
from eland.tests.common import TestData
@@ -39,3 +43,18 @@ class TestDataFrameInfo(TestData):
        # NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
        print(self.ed_ecommerce().info())

    def test_empty_info(self):
        mapping = {'mappings': {'properties': {}}}
        for i in range(0, 10):
            field_name = "field_name_" + str(i)
            mapping['mappings']['properties'][field_name] = {'type': 'float'}
        ES_TEST_CLIENT.indices.delete(index='empty_index', ignore=[400, 404])
        ES_TEST_CLIENT.indices.create(index='empty_index', body=mapping)
        ed_df = ed.DataFrame(ES_TEST_CLIENT, 'empty_index')
        ed_df.info()
        ES_TEST_CLIENT.indices.delete(index='empty_index')


@@ -120,7 +120,7 @@ class TestImportedMLModel:
        # Get some test results
        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        test_results = classifier.predict(test_data)
        test_results = classifier.predict(np.asarray(test_data))

        # Serialise the models to Elasticsearch
        feature_names = ["f0", "f1", "f2", "f3", "f4"]

@@ -142,7 +142,7 @@ class TestImportedMLModel:
        # Get some test results
        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        test_results = regressor.predict(test_data)
        test_results = regressor.predict(np.asarray(test_data))

        # Serialise the models to Elasticsearch
        feature_names = ["f0", "f1", "f2", "f3", "f4"]
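The ML tests now hand predict() a NumPy array instead of a nested Python list; a small illustration of that conversion (the diff itself does not show which warning this silences):

import numpy as np

test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]

# np.asarray gives scikit-learn a ready-made 2-D float64 array of shape (2, 5),
# so predict() no longer has to coerce the nested list itself.
X = np.asarray(test_data)
print(X.shape, X.dtype)    # (2, 5) float64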