Too long frame exception fixes (#135)

* Updating test matrix for 7.6 + removing oss for now.

* Resolving 7.6.0 docs issues

* Updating ML docs

* Resolving too_long_frame_exception on large mappings

- Embedded _source parameters in body rather than url (see the sketch below)
- Fixed bug in DataFrame.info on empty DataFrame
- Removed warning from TestImportedMLModel

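A minimal sketch of the idea behind the _source change, assuming elasticsearch-py 7.x and a locally reachable cluster (the index name "thousand_fields" comes from the new test and is only for illustration):

from elasticsearch import Elasticsearch

es = Elasticsearch()

fields = ["long_field_name_%d" % i for i in range(1000)]
body = {"query": {"match_all": {}}}

# As a keyword argument, every field name is encoded into the request URL;
# with 1000+ fields the URL can exceed the HTTP line-length limit and
# Elasticsearch answers with a too_long_frame_exception.
# es.search(index="thousand_fields", body=body, _source=fields)

# Embedded in the request body, the field list travels in the payload instead,
# so the URL stays short no matter how many fields are projected.
body["_source"] = fields
results = es.search(index="thousand_fields", body=body)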
Stephen Dodson 2020-02-26 12:50:14 +00:00 committed by GitHub
parent 206677818f
commit a33ff45ebc
5 changed files with 90 additions and 7 deletions


@@ -555,8 +555,13 @@ class DataFrame(NDFrame):
        # Print index summary e.g.
        # Index: 103 entries, 0 to 102
        # Do this by getting head and tail of dataframe
        head = self.head(1)._to_pandas().index[0]
        tail = self.tail(1)._to_pandas().index[0]
        if self.empty:
            # index[0] is out of bounds for empty df
            head = self.head(1)._to_pandas()
            tail = self.tail(1)._to_pandas()
        else:
            head = self.head(1)._to_pandas().index[0]
            tail = self.tail(1)._to_pandas().index[0]
        index_summary = ', %s to %s' % (pprint_thing(head),
                                        pprint_thing(tail))
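For the DataFrame.info change, a quick plain-pandas illustration (independent of eland) of why the empty case needs its own branch:

import pandas as pd

empty = pd.DataFrame()

# head(1) of an empty frame is itself empty, so its index has no element 0.
print(empty.head(1).index)    # RangeIndex(start=0, stop=0, step=1)
# empty.head(1).index[0]      # raises IndexError: index 0 is out of bounds

# The new branch therefore falls back to the (empty) frame itself, which
# pprint_thing can still render in the index summary.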


@@ -16,7 +16,7 @@ import warnings
from collections import OrderedDict
import pandas as pd
from pandas.core.dtypes.common import is_bool_dtype, is_datetime_or_timedelta_dtype
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
from eland import Index, SortOrder, DEFAULT_CSV_BATCH_OUTPUT_SIZE, DEFAULT_ES_MAX_RESULT_WINDOW, \
elasticsearch_date_to_pandas_date
@@ -602,12 +602,22 @@ class Operations:
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
                try:
                    # For query_compiler._client.search we could add _source
                    # as a parameter, or add this value in body.
                    #
                    # If _source is a parameter it is encoded into the url.
                    #
                    # If _source is a large number of fields (1000+) then this can result in an
                    # extremely long url and a `too_long_frame_exception`. Therefore, add
                    # _source to the body rather than as a _source parameter
                    if _source:
                        body['_source'] = _source
                    es_results = query_compiler._client.search(
                        index=query_compiler._index_pattern,
                        size=size,
                        sort=sort_params,
                        body=body,
                        _source=_source)
                        body=body)
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {


@@ -0,0 +1,49 @@
# Copyright 2020 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatibility
import eland as ed
from eland.tests.common import ES_TEST_CLIENT
from eland.tests.common import TestData
class TestDataFrameBigMapping(TestData):

    def test_big_mapping(self):
        mapping = {'mappings': {'properties': {}}}
        for i in range(0, 1000):
            field_name = "long_field_name_" + str(i)
            mapping['mappings']['properties'][field_name] = {'type': 'float'}
        ES_TEST_CLIENT.indices.delete(index='thousand_fields', ignore=[400, 404])
        ES_TEST_CLIENT.indices.create(index='thousand_fields', body=mapping)
        ed_df = ed.DataFrame(ES_TEST_CLIENT, 'thousand_fields')
        ed_df.info()
        ES_TEST_CLIENT.indices.delete(index='thousand_fields')


@@ -15,6 +15,10 @@
# File called _pytest for PyCharm compatibility
from io import StringIO
import eland as ed
from eland.tests import ES_TEST_CLIENT
from eland.tests.common import TestData
@@ -39,3 +43,18 @@ class TestDataFrameInfo(TestData):
        # NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
        print(self.ed_ecommerce().info())

    def test_empty_info(self):
        mapping = {'mappings': {'properties': {}}}
        for i in range(0, 10):
            field_name = "field_name_" + str(i)
            mapping['mappings']['properties'][field_name] = {'type': 'float'}
        ES_TEST_CLIENT.indices.delete(index='empty_index', ignore=[400, 404])
        ES_TEST_CLIENT.indices.create(index='empty_index', body=mapping)
        ed_df = ed.DataFrame(ES_TEST_CLIENT, 'empty_index')
        ed_df.info()
        ES_TEST_CLIENT.indices.delete(index='empty_index')


@@ -120,7 +120,7 @@ class TestImportedMLModel:
        # Get some test results
        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        test_results = classifier.predict(test_data)
        test_results = classifier.predict(np.asarray(test_data))

        # Serialise the models to Elasticsearch
        feature_names = ["f0", "f1", "f2", "f3", "f4"]

@@ -142,7 +142,7 @@ class TestImportedMLModel:
        # Get some test results
        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        test_results = regressor.predict(test_data)
        test_results = regressor.predict(np.asarray(test_data))

        # Serialise the models to Elasticsearch
        feature_names = ["f0", "f1", "f2", "f3", "f4"]
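The ML tests now hand predict() a NumPy array instead of a nested Python list; a small illustration of that conversion (the diff itself does not show which warning this silences):

import numpy as np

test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]

# np.asarray gives scikit-learn a ready-made 2-D float64 array of shape (2, 5),
# so predict() no longer has to coerce the nested list itself.
X = np.asarray(test_data)
print(X.shape, X.dtype)    # (2, 5) float64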