diff --git a/eland/__init__.py b/eland/__init__.py
index 7b48ae3..094154d 100644
--- a/eland/__init__.py
+++ b/eland/__init__.py
@@ -1,4 +1,4 @@
 from .utils import *
 from .frame import *
 from .client import *
-from .mappings import *
\ No newline at end of file
+from .mappings import *
diff --git a/eland/client.py b/eland/client.py
index a3207e2..5359e15 100644
--- a/eland/client.py
+++ b/eland/client.py
@@ -21,3 +21,9 @@ class Client():
 
     def field_caps(self, **kwargs):
         return self.es.field_caps(**kwargs)
+
+    def count(self, **kwargs):
+        """Return the 'count' entry of the response from self.es.count(**kwargs)."""
+        count_json = self.es.count(**kwargs)
+        return count_json['count']
+
diff --git a/eland/frame.py b/eland/frame.py
index 47b331a..6631781 100644
--- a/eland/frame.py
+++ b/eland/frame.py
@@ -30,6 +30,8 @@ from elasticsearch_dsl import Search
 
 import pandas as pd
 
+from pandas.core.arrays.sparse import BlockIndex
+
 class DataFrame():
     """
     pandas.DataFrame like API that proxies into Elasticsearch index(es).
@@ -251,3 +253,87 @@ class DataFrame():
         df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
 
         return df
+
+    @property
+    def shape(self):
+        """
+        Return a tuple representing the dimensionality of the DataFrame.
+
+        Returns
+        -------
+        shape: tuple
+            0 - number of rows
+            1 - number of columns
+        """
+        num_rows = len(self)
+        num_columns = len(self.columns)
+
+        return num_rows, num_columns
+
+    @property
+    def columns(self):
+        """
+        Returns
+        -------
+        columns: list of str
+            Source field names reported by the mappings
+        """
+        return self.mappings.source_fields()
+
+    def __getitem__(self, item):
+        """
+        Select a column by name, or apply a boolean filter.
+
+        Raises
+        ------
+        TypeError
+            If the column does not exist or item is of an unsupported type
+        """
+        if isinstance(item, str):
+            if item not in self.columns:
+                raise TypeError('Column does not exist: [{0}]'.format(item))
+            return Column(item)
+        elif isinstance(item, BooleanFilter):
+            # NOTE: the filter is stored on this object, which is then returned
+            self._filter = item.build()
+            return self
+        else:
+            raise TypeError('Unsupported expr: [{0}]'.format(item))
+
+    def __len__(self):
+        """
+        Returns length of info axis, but here we use the index.
+        """
+        return self.client.count(index=self.index_pattern)
+
+    # ----------------------------------------------------------------------
+    # Rendering Methods
+
+    def __repr__(self):
+        # The return for this is display.options.max_rows
+        max_rows = 60
+        # Floor division keeps the head/tail row counts integral
+        head_rows = max_rows // 2
+        tail_rows = max_rows - head_rows
+
+        head = self.head(max_rows)
+
+        num_rows = len(self)
+
+        if num_rows > max_rows:
+            # If we have a lot of rows, create a SparseDataFrame and use
+            # pandas to_string logic
+            # NOTE: this sparse DataFrame can't be used as the middle
+            # section is all NaNs. However, it gives us potentially a nice way
+            # to use the pandas IO methods.
+            sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
+                                                     sparse_index=
+                                                     BlockIndex(
+                                                         num_rows, [0, num_rows-tail_rows], [head_rows, tail_rows]))
+                                for item in self.columns})
+
+            # TODO - don't hard code max_rows - use pandas default/ES default
+            return sdf.to_string(max_rows=max_rows)
+
+        return head.to_string(max_rows=max_rows)
+
diff --git a/eland/mappings.py b/eland/mappings.py
index b96d7c1..a884307 100644
--- a/eland/mappings.py
+++ b/eland/mappings.py
@@ -298,3 +298,21 @@ class Mappings():
                                           ((self.mappings_capabilities.pd_dtype == 'int64') |
                                            (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
 
+    def source_fields(self):
+        """
+        Returns
+        -------
+        source_fields: list of str
+            List of source fields
+        """
+        return self.mappings_capabilities[self.mappings_capabilities._source == True].index.tolist()
+
+    def count_source_fields(self):
+        """
+        Returns
+        -------
+        count_source_fields: int
+            Number of source fields in mapping
+        """
+        return len(self.source_fields())
+
diff --git a/eland/tests/__init__.py b/eland/tests/__init__.py
index 14c8b54..26d8423 100644
--- a/eland/tests/__init__.py
+++ b/eland/tests/__init__.py
@@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
 }
 TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED,
                                                    orient='index', columns=['es_dtype'])
+TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4
 
 TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
 TEST_NESTED_USER_GROUP_MAPPING = {
diff --git a/eland/tests/client/test_mappings_pytest.py b/eland/tests/client/test_mappings_pytest.py
index be3a9cf..34aac5b 100644
--- a/eland/tests/client/test_mappings_pytest.py
+++ b/eland/tests/client/test_mappings_pytest.py
@@ -12,11 +12,13 @@ class TestMapping():
 
     # Requires 'setup_tests.py' to be run prior to this
     def test_mapping(self):
-        mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
+        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
 
-        assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
+        assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
 
-        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
+        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
+
+        assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
 
 
 
diff --git a/eland/tests/frame/test_indexing_pytest.py b/eland/tests/frame/test_indexing_pytest.py
index bd230f1..9548f13 100644
--- a/eland/tests/frame/test_indexing_pytest.py
+++ b/eland/tests/frame/test_indexing_pytest.py
@@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
         # don't match the mapping types. This is mainly because the products field is
         # nested and so can be treated as a multi-field in ES, but not in pandas
 
+    def test_size(self):
+        assert self.pd_flights().shape == self.ed_flights().shape
+        assert len(self.pd_flights()) == len(self.ed_flights())
+
+    def test_to_string(self):
+        print(self.ed_flights())
+