From c7236335263cb1930309331068662323f68638c5 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Sat, 22 Jun 2019 06:55:30 +0000 Subject: [PATCH] Resolving merge issue --- eland/dataframe.py | 100 --------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 eland/dataframe.py diff --git a/eland/dataframe.py b/eland/dataframe.py deleted file mode 100644 index 1f49216..0000000 --- a/eland/dataframe.py +++ /dev/null @@ -1,100 +0,0 @@ -import eland - -from elasticsearch import Elasticsearch -from elasticsearch_dsl import Search - -import pandas as pd - -class DataFrame(): - - def __init__(self, client, index_pattern): - self.client = eland.Client(client) - self.index_pattern = index_pattern - - self.client.indices().exists(index_pattern) - - @staticmethod - def _es_results_to_pandas(results): - # TODO - resolve nested fields - rows = [] - for hit in results['hits']['hits']: - row = {} - for k in hit.keys(): - if k == '_source': - row.update(hit['_source']) - rows.append(row) - return pd.DataFrame(data=rows) - - @staticmethod - def _flatten_mapping(prefix, properties, result): - for k, v in properties.items(): - if 'properties' in v: - if(prefix == ''): - prefix = k - else: - prefix = prefix + '.' + k - DataFrame._flatten_mapping(prefix, v['properties'], result) - else: - if(prefix == ''): - key = k - else: - key = prefix + '.' + k - type = v['type'] - result.append((key, type)) - - @staticmethod - def _es_mappings_to_pandas(mappings): - fields = [] - for index in mappings: - if 'properties' in mappings[index]['mappings']: - properties = mappings[index]['mappings']['properties'] - - DataFrame._flatten_mapping('', properties, fields) - - return pd.DataFrame(data=fields, columns=['field', 'datatype']) - - def head(self, n=5): - results = self.client.search(index=self.index_pattern, size=n) - - return DataFrame._es_results_to_pandas(results) - - def describe(self): - # First get all types - #mapping = self.client.indices().get_mapping(index=self.index_pattern) - mapping = self.client.indices().get_mapping(index=self.index_pattern) - - fields = DataFrame._es_mappings_to_pandas(mapping) - - # Get numeric types (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking) - # https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html - # TODO refactor this list out of method - numeric_fields = fields.query('datatype == ["long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float"]') - - # for each field we copute: - # count, mean, std, min, 25%, 50%, 75%, max - search = Search(using=self.client, index=self.index_pattern).extra(size=0) - - for field in numeric_fields.field: - search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field) - search.aggs.metric('percentiles_'+field, 'percentiles', field=field) - - response = search.execute() - - results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) - - for field in numeric_fields.field: - values = [] - values.append(response.aggregations['extended_stats_'+field]['count']) - values.append(response.aggregations['extended_stats_'+field]['avg']) - values.append(response.aggregations['extended_stats_'+field]['std_deviation']) - values.append(response.aggregations['extended_stats_'+field]['min']) - values.append(response.aggregations['percentiles_'+field]['values']['25.0']) - values.append(response.aggregations['percentiles_'+field]['values']['50.0']) - values.append(response.aggregations['percentiles_'+field]['values']['75.0']) - values.append(response.aggregations['extended_stats_'+field]['max']) - - # if not None - if (values.count(None) < len(values)): - results = results.assign(**{field: values}) - - return results