diff --git a/eland/dataframe.py b/eland/dataframe.py index c398806..071853d 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -555,8 +555,13 @@ class DataFrame(NDFrame): # Print index summary e.g. # Index: 103 entries, 0 to 102 # Do this by getting head and tail of dataframe - head = self.head(1)._to_pandas().index[0] - tail = self.tail(1)._to_pandas().index[0] + if self.empty: + # index[0] is out of bounds for empty df + head = self.head(1)._to_pandas() + tail = self.tail(1)._to_pandas() + else: + head = self.head(1)._to_pandas().index[0] + tail = self.tail(1)._to_pandas().index[0] index_summary = ', %s to %s' % (pprint_thing(head), pprint_thing(tail)) diff --git a/eland/operations.py b/eland/operations.py index d7f61bb..2ab7a5e 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -16,7 +16,7 @@ import warnings from collections import OrderedDict import pandas as pd -from pandas.core.dtypes.common import is_bool_dtype, is_datetime_or_timedelta_dtype +from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype from eland import Index, SortOrder, DEFAULT_CSV_BATCH_OUTPUT_SIZE, DEFAULT_ES_MAX_RESULT_WINDOW, \ elasticsearch_date_to_pandas_date @@ -602,12 +602,22 @@ class Operations: if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW: if size > 0: try: + # For query_compiler._client.search we could add _source + # as a parameter, or add this value in body. + # + # If _source is a parameter it is encoded into the url. + # + # If _source is a large number of fields (1000+) then this can result in an + # extremely long url and a `too_long_frame_exception`.
Therefore, add + # _source to the body rather than as a _source parameter + if _source: + body['_source'] = _source + es_results = query_compiler._client.search( index=query_compiler._index_pattern, size=size, sort=sort_params, - body=body, - _source=_source) + body=body) except Exception: # Catch all ES errors and print debug (currently to stdout) error = { diff --git a/eland/tests/dataframe/test_big_mapping_pytest.py b/eland/tests/dataframe/test_big_mapping_pytest.py new file mode 100644 index 0000000..74b5d97 --- /dev/null +++ b/eland/tests/dataframe/test_big_mapping_pytest.py @@ -0,0 +1,49 @@ +# Copyright 2020 Elasticsearch BV +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# File called _pytest for PyCharm compatibility + +import eland as ed +from eland.tests.common import ES_TEST_CLIENT +from eland.tests.common import TestData + + +class TestDataFrameBigMapping(TestData): + + def test_big_mapping(self): + mapping = {'mappings': {'properties': {}}} + + for i in range(0, 1000): + field_name = "long_field_name_" + str(i) + mapping['mappings']['properties'][field_name] = {'type': 'float'} + + ES_TEST_CLIENT.indices.delete(index='thousand_fields', ignore=[400, 404]) + ES_TEST_CLIENT.indices.create(index='thousand_fields', body=mapping) + + ed_df = ed.DataFrame(ES_TEST_CLIENT, 'thousand_fields') + ed_df.info() + + ES_TEST_CLIENT.indices.delete(index='thousand_fields') diff --git a/eland/tests/dataframe/test_info_pytest.py b/eland/tests/dataframe/test_info_pytest.py index 03d57a2..2200ec1 100644 --- a/eland/tests/dataframe/test_info_pytest.py +++ b/eland/tests/dataframe/test_info_pytest.py @@ -15,6 +15,10 @@ # File called _pytest for PyCharm compatability from io import StringIO +import eland as ed + +from eland.tests import ES_TEST_CLIENT + from eland.tests.common import TestData @@ -39,3 +43,18 @@ class TestDataFrameInfo(TestData): # NOTE: info does not work on truncated data frames (e.g.
head/tail) TODO print(self.ed_ecommerce().info()) + + def test_empty_info(self): + mapping = {'mappings': {'properties': {}}} + + for i in range(0, 10): + field_name = "field_name_" + str(i) + mapping['mappings']['properties'][field_name] = {'type': 'float'} + + ES_TEST_CLIENT.indices.delete(index='empty_index', ignore=[400, 404]) + ES_TEST_CLIENT.indices.create(index='empty_index', body=mapping) + + ed_df = ed.DataFrame(ES_TEST_CLIENT, 'empty_index') + ed_df.info() + + ES_TEST_CLIENT.indices.delete(index='empty_index') diff --git a/eland/tests/ml/test_imported_ml_model_pytest.py b/eland/tests/ml/test_imported_ml_model_pytest.py index 987214a..93d12b8 100644 --- a/eland/tests/ml/test_imported_ml_model_pytest.py +++ b/eland/tests/ml/test_imported_ml_model_pytest.py @@ -120,7 +120,7 @@ class TestImportedMLModel: # Get some test results test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]] - test_results = classifier.predict(test_data) + test_results = classifier.predict(np.asarray(test_data)) # Serialise the models to Elasticsearch feature_names = ["f0", "f1", "f2", "f3", "f4"] @@ -142,7 +142,7 @@ class TestImportedMLModel: # Get some test results test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]] - test_results = regressor.predict(test_data) + test_results = regressor.predict(np.asarray(test_data)) # Serialise the models to Elasticsearch feature_names = ["f0", "f1", "f2", "f3", "f4"]