Renamed modules and added mapping store

Stephen Dodson 2019-06-18 11:48:56 +00:00
parent a3ea4db772
commit 674ac129e6
9 changed files with 379 additions and 123 deletions

View File

@ -1,108 +0,0 @@
import eland
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import json
import pandas as pd
class DataFrame():
def __init__(self, client, index_pattern):
self.client = eland.Client(client)
self.index_pattern = index_pattern
self.client.indices().exists(index_pattern)
@staticmethod
def _flatten_results(prefix, results, result):
# TODO
return prefix
@staticmethod
def _es_results_to_pandas(results):
# TODO - resolve nested fields
rows = []
for hit in results['hits']['hits']:
row = hit['_source']
rows.append(row)
#return pd.DataFrame(data=rows)
# Converting the list of dicts to a dataframe doesn't convert datetimes
# effectively compared to read_json. TODO - https://github.com/elastic/eland/issues/2
json_rows = json.dumps(rows)
return pd.read_json(json_rows)
@staticmethod
def _flatten_mapping(prefix, properties, result):
for k, v in properties.items():
if 'properties' in v:
if(prefix == ''):
prefix = k
else:
prefix = prefix + '.' + k
DataFrame._flatten_mapping(prefix, v['properties'], result)
else:
if(prefix == ''):
key = k
else:
key = prefix + '.' + k
type = v['type']
result.append((key, type))
@staticmethod
def _es_mappings_to_pandas(mappings):
fields = []
for index in mappings:
if 'properties' in mappings[index]['mappings']:
properties = mappings[index]['mappings']['properties']
DataFrame._flatten_mapping('', properties, fields)
return pd.DataFrame(data=fields, columns=['field', 'datatype'])
def head(self, n=5):
results = self.client.search(index=self.index_pattern, size=n)
return DataFrame._es_results_to_pandas(results)
def describe(self):
# First get all types
#mapping = self.client.indices().get_mapping(index=self.index_pattern)
mapping = self.client.indices().get_mapping(index=self.index_pattern)
fields = DataFrame._es_mappings_to_pandas(mapping)
# Get numeric types (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking)
# https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
# TODO refactor this list out of method
numeric_fields = fields.query('datatype == ["long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float"]')
# for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max
search = Search(using=self.client, index=self.index_pattern).extra(size=0)
for field in numeric_fields.field:
search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field)
search.aggs.metric('percentiles_'+field, 'percentiles', field=field)
response = search.execute()
results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
for field in numeric_fields.field:
values = []
values.append(response.aggregations['extended_stats_'+field]['count'])
values.append(response.aggregations['extended_stats_'+field]['avg'])
values.append(response.aggregations['extended_stats_'+field]['std_deviation'])
values.append(response.aggregations['extended_stats_'+field]['min'])
values.append(response.aggregations['percentiles_'+field]['values']['25.0'])
values.append(response.aggregations['percentiles_'+field]['values']['50.0'])
values.append(response.aggregations['percentiles_'+field]['values']['75.0'])
values.append(response.aggregations['extended_stats_'+field]['max'])
# only add the column if at least one aggregation returned a value
if (values.count(None) < len(values)):
results = results.assign(**{field: values})
return results

View File

@ -1,3 +1,3 @@
from .utils import *
-from .DataFrame import *
+from .frame import *
-from .Client import *
+from .client import *

View File

@ -8,7 +8,7 @@ class Client(object):
self.es = es
else:
self.es = Elasticsearch(es)
def info(self):
return self.es.info()
@ -17,3 +17,7 @@ class Client(object):
def search(self, **kwargs):
return self.es.search(**kwargs)
def field_caps(self, **kwargs):
return self.es.field_caps(**kwargs)
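The new field_caps passthrough mirrors the existing search wrapper, and is what the (currently commented-out) aggregatable/searchable bookkeeping in frame.py would consume. A minimal usage sketch; the 'flights' index name is illustrative:

from elasticsearch import Elasticsearch
import eland

client = eland.Client(Elasticsearch())  # connects to localhost:9200 by default
caps = client.field_caps(index='flights', fields='*')
# For each field, the response reports per-datatype capabilities, including
# whether the field is searchable and aggregatable
for field, types in caps['fields'].items():
    print(field, types)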

201
eland/frame.py Normal file
View File

@ -0,0 +1,201 @@
"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.
The underlying data resides in Elasticsearch and the API aligns as much as
possible with pandas.DataFrame API.
This allows the eland.DataFrame to access large datasets stored in Elasticsearch
without loading the dataset into local memory.
Implementation Details
----------------------
Elasticsearch indexes can be configured in many different ways, and these indexes
use data structures that differ from those underlying a pandas.DataFrame.
eland.DataFrame operations that return individual rows (e.g. df.head()) return
_source data. If _source is not enabled, this data is not accessible.
Similarly, only Elasticsearch searchable fields can be searched or filtered, and
only Elasticsearch aggregatable fields can be aggregated or grouped.
"""
import eland
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd
class DataFrame():
"""
pandas.DataFrame-like API that proxies into Elasticsearch index(es).
Parameters
----------
client : eland.Client
A reference to an Elasticsearch Python client
index_pattern : str
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*).
See Also
--------
Examples
--------
>>> import eland as ed
>>> client = ed.Client(Elasticsearch())
>>> df = ed.DataFrame(client, 'reviews')
>>> df.head()
reviewerId vendorId rating date
0 0 0 5 2006-04-07 17:08
1 1 1 5 2006-05-04 12:16
2 2 2 4 2006-04-21 12:26
3 3 3 5 2006-04-18 15:48
4 3 4 5 2006-04-18 15:49
Notice that the types are based on Elasticsearch mappings
Notes
-----
If the Elasticsearch index is deleted or index mappings are changed after this
object is created, the object is not rebuilt and so inconsistencies can occur.
Mapping Elasticsearch types to pandas dtypes
--------------------------------------------
Elasticsearch field datatype             | Pandas dtype
-----------------------------------------|----------------
text                                     | object
keyword                                  | object
long, integer, short, byte, binary       | int64
double, float, half_float, scaled_float  | float64
date, date_nanos                         | datetime64[ns]
boolean                                  | bool
TODO - add additional mapping types
"""
def __init__(self, client, index_pattern):
self.client = eland.Client(client)
self.index_pattern = index_pattern
# Get and persist mappings; this allows us to correctly
# map returned types from Elasticsearch to pandas datatypes
mapping = self.client.indices().get_mapping(index=self.index_pattern)
#field_caps = self.client.field_caps(index=self.index_pattern, fields='*')
#self.fields, self.aggregatable_fields, self.searchable_fields = \
# DataFrame._es_mappings_to_pandas(mapping, field_caps)
self.fields = DataFrame._es_mappings_to_pandas(mapping)
@staticmethod
def _flatten_results(prefix, results, result):
# TODO
return prefix
def _es_results_to_pandas(self, results):
# TODO - resolve nested fields
rows = []
for hit in results['hits']['hits']:
row = hit['_source']
rows.append(row)
df = pd.DataFrame(data=rows)
return df
@staticmethod
def _extract_types_from_mapping(y):
"""
Extract data types from mapping for DataFrame columns.
Elasticsearch _source data is transformed into pandas DataFrames. Notes:
- This strategy is not compatible with all Elasticsearch configurations. If _source is disabled
(https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-source-field.html#disable-source-field)
no data values will be populated
- Sub-fields (e.g. english.text in {"mappings":{"properties":{"text":{"type":"text","fields":{"english":{"type":"text","analyzer":"english"}}}}}})
are not used
"""
out = {}
# Recurse until we reach a 'type: xxx' entry; 'properties' and 'fields' nodes
# are flattened into dot-separated names
def flatten(x, name=''):
if type(x) is dict:
for a in x:
if a == 'type' and type(x[a]) is str: # 'type' can be a name of a field
out[name[:-1]] = x[a]
if a == 'properties' or a == 'fields':
flatten(x[a], name)
else:
flatten(x[a], name + a + '.')
flatten(y)
return out
@staticmethod
def _es_mappings_to_pandas(mappings):
fields = {}
for index in mappings:
if 'properties' in mappings[index]['mappings']:
properties = mappings[index]['mappings']['properties']
datatypes = DataFrame._extract_types_from_mapping(properties)
# Note there could be conflicts here - e.g. the same field name with different semantics in
# different indexes - currently the last one wins TODO: review this
fields.update(datatypes)
return pd.DataFrame.from_dict(data=fields, orient='index', columns=['datatype'])
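To illustrate the two helpers above, a sketch with a hypothetical mapping fragment; both sub-fields and nested properties collapse into dot-separated index labels:

import eland

# Hypothetical get_mapping() response for a single index named 'my-index'
mappings = {
    'my-index': {
        'mappings': {
            'properties': {
                'content': {'type': 'text',
                            'fields': {'english': {'type': 'text', 'analyzer': 'english'}}},
                'location': {'properties': {'lat': {'type': 'float'},
                                            'lon': {'type': 'float'}}},
            }
        }
    }
}
fields = eland.DataFrame._es_mappings_to_pandas(mappings)
print(fields)
#                  datatype
# content              text
# content.english      text
# location.lat        float
# location.lon        float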
def head(self, n=5):
results = self.client.search(index=self.index_pattern, size=n)
return self._es_results_to_pandas(results)
def describe(self):
# First get all types
#mapping = self.client.indices().get_mapping(index=self.index_pattern)
mapping = self.client.indices().get_mapping(index=self.index_pattern)
fields = DataFrame._es_mappings_to_pandas(mapping)
# Get numeric types (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking)
# https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
# TODO refactor this list out of method
numeric_fields = fields.query('datatype == ["long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float"]')
# for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max
search = Search(using=self.client, index=self.index_pattern).extra(size=0)
for field in numeric_fields.field:
search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field)
search.aggs.metric('percentiles_'+field, 'percentiles', field=field)
response = search.execute()
results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
for field in numeric_fields.field:
values = []
values.append(response.aggregations['extended_stats_'+field]['count'])
values.append(response.aggregations['extended_stats_'+field]['avg'])
values.append(response.aggregations['extended_stats_'+field]['std_deviation'])
values.append(response.aggregations['extended_stats_'+field]['min'])
values.append(response.aggregations['percentiles_'+field]['values']['25.0'])
values.append(response.aggregations['percentiles_'+field]['values']['50.0'])
values.append(response.aggregations['percentiles_'+field]['values']['75.0'])
values.append(response.aggregations['extended_stats_'+field]['max'])
# only add the column if at least one aggregation returned a value
if (values.count(None) < len(values)):
results = results.assign(**{field: values})
return results
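describe() pushes all computation into Elasticsearch, so only the summary statistics travel back to the client. For reference, a sketch of the aggregation body the Search DSL builds for one numeric field (the field name 'AvgTicketPrice' is illustrative):

# Equivalent request body for a single numeric field (sketch)
body = {
    'size': 0,  # no hits, aggregations only
    'aggs': {
        'extended_stats_AvgTicketPrice': {'extended_stats': {'field': 'AvgTicketPrice'}},
        'percentiles_AvgTicketPrice': {'percentiles': {'field': 'AvgTicketPrice'}},
    },
}
# count/mean/std/min/max are read from extended_stats; the 25%/50%/75% rows
# come from the percentiles aggregation (whose default percents include them)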

View File

@ -1,4 +1,5 @@
import os
import pandas as pd
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -10,3 +11,150 @@ FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
ECOMMERCE_INDEX_NAME = 'ecommerce'
ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
TEST_MAPPING1 = {
'mappings': {
'properties': {
'city': {
'type': 'text',
'fields': {
'raw': {
'type': 'keyword'
}
}
},
'text': {
'type': 'text',
'fields': {
'english': {
'type': 'text',
'analyzer': 'english'
}
}
},
'origin_location': {
'properties': {
'lat': {
'type': 'text',
'index_prefixes': {},
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
},
'lon': {
'type': 'text',
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
}
}
},
'maps-telemetry': {
'properties': {
'attributesPerMap': {
'properties': {
'dataSourcesCount': {
'properties': {
'avg': {
'type': 'long'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
},
'emsVectorLayersCount': {
'dynamic': 'true',
'properties': {
'france_departments': {
'properties': {
'avg': {
'type': 'float'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
}
}
}
}
}
}
},
'type': {
'type': 'keyword'
},
'name': {
'type': 'text'
},
'user_name': {
'type': 'keyword'
},
'email': {
'type': 'keyword'
},
'content': {
'type': 'text'
},
'tweeted_at': {
'type': 'date'
},
'dest_location': {
'type': 'geo_point'
},
'user': {
'type': 'nested'
},
'my_join_field': {
'type': 'join',
'relations': {
'question': ['answer', 'comment'],
'answer': 'vote'
}
}
}
}
}
TEST_MAPPING1_INDEX_NAME = 'mapping1'
TEST_MAPPING1_EXPECTED = {
'city': 'text',
'city.raw': 'keyword',
'content': 'text',
'dest_location': 'geo_point',
'email': 'keyword',
'maps-telemetry.attributesPerMap.dataSourcesCount.avg': 'long',
'maps-telemetry.attributesPerMap.dataSourcesCount.max': 'long',
'maps-telemetry.attributesPerMap.dataSourcesCount.min': 'long',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.avg': 'float',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.max': 'long',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.min': 'long',
'my_join_field': 'join',
'name': 'text',
'origin_location.lat': 'text',
'origin_location.lat.keyword': 'keyword',
'origin_location.lon': 'text',
'origin_location.lon.keyword': 'keyword',
'text': 'text',
'text.english': 'text',
'tweeted_at': 'date',
'type': 'keyword',
'user': 'nested',
'user_name': 'keyword'
}
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['datatype'])

View File

@ -1,5 +1,10 @@
# File called _pytest for PyCharm compatibility
-from eland.tests.dataframe.common import TestData
+from eland.tests.frame.common import TestData
from eland.tests import *
import eland as ed
import pandas as pd
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
@ -15,10 +20,9 @@ class TestDataFrameIndexing(TestData):
pd_ecommerce_head = self.pd_ecommerce().head()
ed_ecommerce_head = self.ed_ecommerce().head()
#print(pd_ecommerce_head.dtypes)
#print(ed_ecommerce_head.dtypes)
assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head)
def test_mappings(self):
test_mapping1 = ed.read_es(ELASTICSEARCH_HOST, TEST_MAPPING1_INDEX_NAME)
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, test_mapping1.fields)
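The new test exercises the mapping store end-to-end: read_es builds an eland.DataFrame from the fixture index, and its persisted fields frame is compared against the hand-written expectation. It would typically be run with pytest after loading the fixture data via the setup script below, e.g. python -m pytest eland/tests (invocation and path are assumptions).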

View File

@ -2,24 +2,19 @@ import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
-from eland.tests import FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME
+from eland.tests import *
DATA_LIST = [
(FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME),
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME)
]
-if __name__ == '__main__':
+def _setup_data(es):
# Read json file and index records into Elasticsearch
for data in DATA_LIST:
json_file_name = data[0]
index_name = data[1]
-# Create connection to Elasticsearch - use defaults1
-es = Elasticsearch()
# Delete index
print("Deleting index:", index_name)
es.indices.delete(index=index_name, ignore=[400, 404])
@ -49,3 +44,15 @@ if __name__ == '__main__':
actions = []
print("Done", index_name)
def _setup_test_mappings(es):
# Create a complex mapping containing many Elasticsearch features
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
if __name__ == '__main__':
# Create connection to Elasticsearch - use defaults
es = Elasticsearch(ELASTICSEARCH_HOST)
_setup_data(es)
_setup_test_mappings(es)
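Since the loading logic is now factored into functions instead of living under __main__, other scripts can reuse it. A sketch; the module path and host are assumptions, not part of this commit:

from elasticsearch import Elasticsearch

# Hypothetical import - the real module path of this setup script may differ
from eland.tests.setup_tests import _setup_data, _setup_test_mappings

es = Elasticsearch('localhost:9200')  # host is an assumption
_setup_data(es)             # bulk-load the flights/ecommerce fixtures
_setup_test_mappings(es)    # recreate the TEST_MAPPING1 index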