Merge pull request #7 from Winterflower/feature/pep8ify

Lowercase modulenames
2025-07-11 00:02:14 +08:00 · 2019-06-22 08:50:18 +02:00 · 2019-06-22 08:50:18 +02:00 · 989a792a54
commit 989a792a54
parent 956678053b 9c61a71a81
5 changed files with 132 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -29,6 +29,28 @@ min         0.000000       0.000000       0.000000
 max    400140.000000     246.000000       5.000000
 ```

+## Development Setup
+
+1. Create a Python virtual environment.
+
+For example:
+
+```
+python3 -m venv env
+```
+
+2. Activate the virtual environment
+
+```
+source env/bin/activate
+```
+
+3. Install dependencies from the `requirements.txt` file
+
+```
+pip install -r requirements.txt
+```
+
 ## Why eland?

 Naming is difficult, but as we had to call it something:
--- a/eland/__init__.py
+++ b/eland/__init__.py
@@ -1,4 +1,4 @@
 from .utils import *
 from .frame import *
 from .client import *
-from .mappings import *
+from .mappings import *
--- a/eland/client.py
+++ b/eland/client.py
@@ -9,7 +9,7 @@ class Client():
            self.es = es
        else:
            self.es = Elasticsearch(es)
-
+            
    def info(self):
        return self.es.info()
    
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@@ -0,0 +1,100 @@
+import eland
+
+from elasticsearch import Elasticsearch
+from elasticsearch_dsl import Search
+
+import pandas as pd
+
+class DataFrame():
+    """Pandas-style access to the Elasticsearch indices matching an index pattern."""
+
+    # Numeric datatypes: https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
+    _NUMERIC_TYPES = ['long', 'integer', 'short', 'byte', 'double', 'float', 'half_float', 'scaled_float']
+
+    def __init__(self, client, index_pattern):
+        """Wrap ``client`` in an ``eland.Client`` and remember ``index_pattern``.
+
+        :param client: passed unchanged to ``eland.Client``
+        :param index_pattern: index name or pattern that every request targets
+        """
+        self.client = eland.Client(client)
+        self.index_pattern = index_pattern
+
+        # NOTE(review): the answer is discarded, so a missing index is not
+        # reported here; confirm whether this should raise instead.
+        self.client.indices().exists(index_pattern)
+
+    @staticmethod
+    def _es_results_to_pandas(results):
+        """Build a ``pandas.DataFrame`` with one row per hit in ``results``.
+
+        Only the ``_source`` of each hit is used; a hit without it yields an empty row.
+        """
+        # TODO - resolve nested fields
+        rows = [dict(hit['_source']) if '_source' in hit else {}
+                for hit in results['hits']['hits']]
+        return pd.DataFrame(data=rows)
+
+    @staticmethod
+    def _flatten_mapping(prefix, properties, result):
+        """Append one ``(dotted.field.name, datatype)`` tuple per leaf field to ``result``.
+
+        :param prefix: dotted path of the enclosing object, ``''`` at the top level
+        :param properties: ``properties`` dict of an Elasticsearch mapping
+        :param result: list that is extended in place
+        """
+        for name, spec in properties.items():
+            # Build the path per field: reassigning ``prefix`` inside the loop
+            # would leak one object's name into the siblings that follow it.
+            path = name if prefix == '' else prefix + '.' + name
+            if 'properties' in spec:
+                DataFrame._flatten_mapping(path, spec['properties'], result)
+            else:
+                result.append((path, spec['type']))
+
+    @staticmethod
+    def _es_mappings_to_pandas(mappings):
+        """Return a ``field``/``datatype`` frame listing the leaf fields of ``mappings``.
+
+        ``mappings`` is read as ``mappings[index]['mappings']['properties']``;
+        an index without ``properties`` contributes no rows.
+        """
+        fields = []
+        for index in mappings:
+            index_mappings = mappings[index]['mappings']
+            if 'properties' in index_mappings:
+                DataFrame._flatten_mapping('', index_mappings['properties'], fields)
+        return pd.DataFrame(data=fields, columns=['field', 'datatype'])
+
+    def head(self, n=5):
+        """Search the index pattern with ``size=n`` and return the hits as a ``pandas.DataFrame``."""
+        results = self.client.search(index=self.index_pattern, size=n)
+        return DataFrame._es_results_to_pandas(results)
+
+    def describe(self):
+        """Compute summary statistics for the numeric fields of the index pattern.
+
+        :return: ``pandas.DataFrame`` indexed by count, mean, std, min, 25%, 50%,
+            75% and max with one column per numeric field; a field whose
+            statistics are all ``None`` is left out
+        """
+        mapping = self.client.indices().get_mapping(index=self.index_pattern)
+        fields = DataFrame._es_mappings_to_pandas(mapping)
+        numeric_fields = fields[fields['datatype'].isin(DataFrame._NUMERIC_TYPES)]
+
+        # A single request with size=0 carrying two aggregations per numeric field.
+        search = Search(using=self.client, index=self.index_pattern).extra(size=0)
+        for field in numeric_fields.field:
+            search.aggs.metric('extended_stats_' + field, 'extended_stats', field=field)
+            search.aggs.metric('percentiles_' + field, 'percentiles', field=field)
+        response = search.execute()
+
+        results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
+        for field in numeric_fields.field:
+            stats = response.aggregations['extended_stats_' + field]
+            percentiles = response.aggregations['percentiles_' + field]['values']
+            values = [stats['count'], stats['avg'], stats['std_deviation'], stats['min'],
+                      percentiles['25.0'], percentiles['50.0'], percentiles['75.0'], stats['max']]
+            if any(value is not None for value in values):
+                results = results.assign(**{field: values})
+        return results
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+elasticsearch==7.0.2
+elasticsearch-dsl==7.0.0
+numpy==1.16.4
+pandas==0.24.2
+python-dateutil==2.8.0
+pytz==2019.1
+six==1.12.0
+urllib3==1.25.3