mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Added DataFrame.info() + more methods
This commit is contained in:
parent
9030f84f4c
commit
428f35f21f
194
eland/frame.py
194
eland/frame.py
@ -32,6 +32,13 @@ import pandas as pd
|
|||||||
|
|
||||||
from pandas.core.arrays.sparse import BlockIndex
|
from pandas.core.arrays.sparse import BlockIndex
|
||||||
|
|
||||||
|
from pandas.io.formats import format as fmt
|
||||||
|
from pandas.io.formats.printing import pprint_thing
|
||||||
|
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
class DataFrame():
|
class DataFrame():
|
||||||
"""
|
"""
|
||||||
pandas.DataFrame like API that proxies into Elasticsearch index(es).
|
pandas.DataFrame like API that proxies into Elasticsearch index(es).
|
||||||
@ -283,6 +290,121 @@ class DataFrame():
|
|||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
|
||||||
|
null_counts=None):
|
||||||
|
"""
|
||||||
|
Print a concise summary of a DataFrame.
|
||||||
|
|
||||||
|
This method prints information about a DataFrame including
|
||||||
|
the index dtype and column dtypes, non-null values and memory usage.
|
||||||
|
|
||||||
|
This copies a lot of code from pandas.DataFrame.info as it is difficult
|
||||||
|
to split out the appropriate code or creating a SparseDataFrame gives
|
||||||
|
incorrect results on types and counts.
|
||||||
|
"""
|
||||||
|
if buf is None: # pragma: no cover
|
||||||
|
buf = sys.stdout
|
||||||
|
|
||||||
|
fake_df = self.__fake_dataframe__()
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
|
||||||
|
lines.append(str(type(self)))
|
||||||
|
lines.append(fake_df.index._summary())
|
||||||
|
|
||||||
|
if len(self.columns) == 0:
|
||||||
|
lines.append('Empty {name}'.format(name=type(self).__name__))
|
||||||
|
fmt.buffer_put_lines(buf, lines)
|
||||||
|
return
|
||||||
|
|
||||||
|
cols = self.columns
|
||||||
|
|
||||||
|
# hack
|
||||||
|
if max_cols is None:
|
||||||
|
max_cols = pd.get_option('display.max_info_columns',
|
||||||
|
len(self.columns) + 1)
|
||||||
|
|
||||||
|
max_rows = pd.get_option('display.max_info_rows', len(self) + 1)
|
||||||
|
|
||||||
|
if null_counts is None:
|
||||||
|
show_counts = ((len(self.columns) <= max_cols) and
|
||||||
|
(len(self) < max_rows))
|
||||||
|
else:
|
||||||
|
show_counts = null_counts
|
||||||
|
exceeds_info_cols = len(self.columns) > max_cols
|
||||||
|
|
||||||
|
def _verbose_repr():
|
||||||
|
lines.append('Data columns (total %d columns):' %
|
||||||
|
len(self.columns))
|
||||||
|
space = max(len(pprint_thing(k)) for k in self.columns) + 4
|
||||||
|
counts = None
|
||||||
|
|
||||||
|
tmpl = "{count}{dtype}"
|
||||||
|
if show_counts:
|
||||||
|
counts = self.count()
|
||||||
|
if len(cols) != len(counts): # pragma: no cover
|
||||||
|
raise AssertionError(
|
||||||
|
'Columns must equal counts '
|
||||||
|
'({cols:d} != {counts:d})'.format(
|
||||||
|
cols=len(cols), counts=len(counts)))
|
||||||
|
tmpl = "{count} non-null {dtype}"
|
||||||
|
|
||||||
|
dtypes = self.dtypes
|
||||||
|
for i, col in enumerate(self.columns):
|
||||||
|
dtype = dtypes.iloc[i]
|
||||||
|
col = pprint_thing(col)
|
||||||
|
|
||||||
|
count = ""
|
||||||
|
if show_counts:
|
||||||
|
count = counts.iloc[i]
|
||||||
|
|
||||||
|
lines.append(_put_str(col, space) + tmpl.format(count=count,
|
||||||
|
dtype=dtype))
|
||||||
|
|
||||||
|
def _non_verbose_repr():
|
||||||
|
lines.append(self.columns._summary(name='Columns'))
|
||||||
|
|
||||||
|
def _sizeof_fmt(num, size_qualifier):
|
||||||
|
# returns size in human readable format
|
||||||
|
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
|
||||||
|
if num < 1024.0:
|
||||||
|
return ("{num:3.1f}{size_q} "
|
||||||
|
"{x}".format(num=num, size_q=size_qualifier, x=x))
|
||||||
|
num /= 1024.0
|
||||||
|
return "{num:3.1f}{size_q} {pb}".format(num=num,
|
||||||
|
size_q=size_qualifier,
|
||||||
|
pb='PB')
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
_verbose_repr()
|
||||||
|
elif verbose is False: # specifically set to False, not nesc None
|
||||||
|
_non_verbose_repr()
|
||||||
|
else:
|
||||||
|
if exceeds_info_cols:
|
||||||
|
_non_verbose_repr()
|
||||||
|
else:
|
||||||
|
_verbose_repr()
|
||||||
|
|
||||||
|
counts = self.get_dtype_counts()
|
||||||
|
dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
|
||||||
|
in sorted(counts.items())]
|
||||||
|
lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
|
||||||
|
|
||||||
|
if memory_usage is None:
|
||||||
|
memory_usage = pd.get_option('display.memory_usage')
|
||||||
|
if memory_usage:
|
||||||
|
# append memory usage of df to display
|
||||||
|
size_qualifier = ''
|
||||||
|
|
||||||
|
# TODO - this is different from pd.DataFrame as we shouldn't
|
||||||
|
# really hold much in memory. For now just approximate with getsizeof + ignore deep
|
||||||
|
mem_usage = sys.getsizeof(self)
|
||||||
|
lines.append("memory usage: {mem}\n".format(
|
||||||
|
mem=_sizeof_fmt(mem_usage, size_qualifier)))
|
||||||
|
|
||||||
|
fmt.buffer_put_lines(buf, lines)
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape(self):
|
def shape(self):
|
||||||
"""
|
"""
|
||||||
@ -301,7 +423,38 @@ class DataFrame():
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def columns(self):
|
def columns(self):
|
||||||
return self.mappings.source_fields()
|
return pd.Index(self.mappings.source_fields())
|
||||||
|
|
||||||
|
@property
|
||||||
|
def dtypes(self):
|
||||||
|
return self.mappings.dtypes()
|
||||||
|
|
||||||
|
def get_dtype_counts(self):
|
||||||
|
return self.mappings.get_dtype_counts()
|
||||||
|
|
||||||
|
def count(self):
|
||||||
|
"""
|
||||||
|
Count non-NA cells for each column (TODO row)
|
||||||
|
|
||||||
|
Counts are based on exists queries against ES
|
||||||
|
|
||||||
|
This is inefficient, as it creates N queries (N is number of fields).
|
||||||
|
|
||||||
|
An alternative approach is to use value_count aggregations. However, they have issues in that:
|
||||||
|
1. They can only be used with aggregatable fields (e.g. keyword not text)
|
||||||
|
2. For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2
|
||||||
|
for a single document.
|
||||||
|
"""
|
||||||
|
counts = {}
|
||||||
|
for field in self.mappings.source_fields():
|
||||||
|
exists_query = {"query":{"exists":{"field":field}}}
|
||||||
|
field_exists_count = self.client.count(index=self.index_pattern, body=exists_query)
|
||||||
|
counts[field] = field_exists_count
|
||||||
|
|
||||||
|
count = pd.Series(data=counts, index=self.mappings.source_fields())
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
# df['a'] -> item == str
|
# df['a'] -> item == str
|
||||||
@ -313,6 +466,8 @@ class DataFrame():
|
|||||||
columns.append(item)
|
columns.append(item)
|
||||||
elif isinstance(item, tuple):
|
elif isinstance(item, tuple):
|
||||||
columns.extend(list(item))
|
columns.extend(list(item))
|
||||||
|
elif isinstance(item, list):
|
||||||
|
columns.extend(item)
|
||||||
|
|
||||||
if len(columns) > 0:
|
if len(columns) > 0:
|
||||||
# Return new eland.DataFrame with modified mappings
|
# Return new eland.DataFrame with modified mappings
|
||||||
@ -337,13 +492,14 @@ class DataFrame():
|
|||||||
# Rendering Methods
|
# Rendering Methods
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
"""
|
||||||
|
Return a string representation for a particular DataFrame.
|
||||||
|
"""
|
||||||
return self.to_string()
|
return self.to_string()
|
||||||
|
|
||||||
|
|
||||||
def to_string(self):
|
def __fake_dataframe__(self, max_rows=1):
|
||||||
# The return for this is display.options.max_rows
|
head_rows = max_rows / 2 + 1
|
||||||
max_rows = 60
|
|
||||||
head_rows = max_rows / 2
|
|
||||||
tail_rows = max_rows - head_rows
|
tail_rows = max_rows - head_rows
|
||||||
|
|
||||||
head = self.head(max_rows)
|
head = self.head(max_rows)
|
||||||
@ -358,13 +514,33 @@ class DataFrame():
|
|||||||
# to use the pandas IO methods.
|
# to use the pandas IO methods.
|
||||||
# TODO - if data is indexed by time series, return top/bottom of
|
# TODO - if data is indexed by time series, return top/bottom of
|
||||||
# time series, rather than first max_rows items
|
# time series, rather than first max_rows items
|
||||||
|
if tail_rows > 0:
|
||||||
|
locations = [0, num_rows-tail_rows]
|
||||||
|
lengths = [head_rows, tail_rows]
|
||||||
|
else:
|
||||||
|
locations = [0]
|
||||||
|
lengths = [head_rows]
|
||||||
|
|
||||||
sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
|
sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
|
||||||
sparse_index=
|
sparse_index=
|
||||||
BlockIndex(
|
BlockIndex(
|
||||||
num_rows, [0, num_rows-tail_rows], [head_rows, tail_rows]))
|
num_rows, locations, lengths))
|
||||||
for item in self.columns})
|
for item in self.columns})
|
||||||
|
|
||||||
# TODO - don't hard code max_rows - use pandas default/ES default
|
return sdf
|
||||||
return sdf.to_string(max_rows=max_rows)
|
|
||||||
|
|
||||||
return head.to_string(max_rows=max_rows)
|
return head
|
||||||
|
|
||||||
|
|
||||||
|
def to_string(self):
|
||||||
|
# TODO - this doesn't return 'notebook' friendly results yet..
|
||||||
|
# TODO - don't hard code max_rows - use pandas default/ES default
|
||||||
|
max_rows = 60
|
||||||
|
|
||||||
|
df = self.__fake_dataframe__(max_rows=max_rows)
|
||||||
|
|
||||||
|
return df.to_string(max_rows=max_rows, show_dimensions=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _put_str(s, space):
|
||||||
|
return '{s}'.format(s=s)[:space].ljust(space)
|
@ -1,6 +1,8 @@
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
class Mappings():
|
class Mappings():
|
||||||
"""
|
"""
|
||||||
General purpose to manage Elasticsearch to/from pandas mappings
|
General purpose to manage Elasticsearch to/from pandas mappings
|
||||||
@ -26,6 +28,7 @@ class Mappings():
|
|||||||
origin_location.lat True text object True False
|
origin_location.lat True text object True False
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
client=None,
|
client=None,
|
||||||
index_pattern=None,
|
index_pattern=None,
|
||||||
@ -63,14 +66,14 @@ class Mappings():
|
|||||||
# field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source
|
# field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source
|
||||||
self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
|
self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
|
||||||
else:
|
else:
|
||||||
# Copy object and restrict mapping columns
|
# Reference object and restrict mapping columns
|
||||||
self.mappings_capabilities = mappings.mappings_capabilities.loc[columns]
|
self.mappings_capabilities = mappings.mappings_capabilities.loc[columns]
|
||||||
|
|
||||||
# Cache source field types for efficient lookup
|
# Cache source field types for efficient lookup
|
||||||
# (this massively improves performance of DataFrame.flatten)
|
# (this massively improves performance of DataFrame.flatten)
|
||||||
self.source_field_pd_dtypes = {}
|
self.source_field_pd_dtypes = {}
|
||||||
|
|
||||||
for field_name in self.source_fields():
|
for field_name in self.mappings_capabilities[self.mappings_capabilities._source == True].index:
|
||||||
pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype']
|
pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype']
|
||||||
self.source_field_pd_dtypes[field_name] = pd_dtype
|
self.source_field_pd_dtypes[field_name] = pd_dtype
|
||||||
|
|
||||||
@ -336,7 +339,7 @@ class Mappings():
|
|||||||
source_fields: list of str
|
source_fields: list of str
|
||||||
List of source fields
|
List of source fields
|
||||||
"""
|
"""
|
||||||
return self.mappings_capabilities[self.mappings_capabilities._source == True].index.tolist()
|
return self.source_field_pd_dtypes.keys()
|
||||||
|
|
||||||
def count_source_fields(self):
|
def count_source_fields(self):
|
||||||
"""
|
"""
|
||||||
@ -345,5 +348,25 @@ class Mappings():
|
|||||||
count_source_fields: int
|
count_source_fields: int
|
||||||
Number of source fields in mapping
|
Number of source fields in mapping
|
||||||
"""
|
"""
|
||||||
return len(self.mappings_capabilities[self.mappings_capabilities._source == True].index)
|
return len(self.source_fields())
|
||||||
|
|
||||||
|
def dtypes(self):
|
||||||
|
"""
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
dtypes: pd.Series
|
||||||
|
Source field name + pd_dtype
|
||||||
|
"""
|
||||||
|
return pd.Series(self.source_field_pd_dtypes)
|
||||||
|
|
||||||
|
def get_dtype_counts(self):
|
||||||
|
"""
|
||||||
|
Return counts of unique dtypes in this object.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
get_dtype_counts : Series
|
||||||
|
Series with the count of columns with each dtype.
|
||||||
|
"""
|
||||||
|
return pd.Series(self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype')[
|
||||||
|
'_source'].count().to_dict())
|
||||||
|
@ -438,7 +438,11 @@ TEST_MAPPING1_EXPECTED = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
|
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
|
||||||
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4
|
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF = TEST_MAPPING1_EXPECTED_DF.drop(index=['city.raw',
|
||||||
|
'origin_location.lat.keyword',
|
||||||
|
'origin_location.lon.keyword',
|
||||||
|
'text.english'])
|
||||||
|
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF.index)
|
||||||
|
|
||||||
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
|
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
|
||||||
TEST_NESTED_USER_GROUP_MAPPING = {
|
TEST_NESTED_USER_GROUP_MAPPING = {
|
||||||
|
@ -1,47 +1,71 @@
|
|||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatability
|
||||||
import pytest
|
|
||||||
|
|
||||||
from eland.tests import *
|
|
||||||
|
|
||||||
from pandas.util.testing import (
|
from pandas.util.testing import (
|
||||||
assert_almost_equal, assert_frame_equal, assert_series_equal)
|
assert_series_equal, assert_frame_equal)
|
||||||
|
|
||||||
import eland as ed
|
import eland as ed
|
||||||
|
from eland.tests import *
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
class TestMapping():
|
|
||||||
|
class TestMapping(TestData):
|
||||||
|
|
||||||
# Requires 'setup_tests.py' to be run prior to this
|
# Requires 'setup_tests.py' to be run prior to this
|
||||||
def test_mapping(self):
|
def test_fields(self):
|
||||||
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
||||||
|
|
||||||
assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
|
assert TEST_MAPPING1_EXPECTED_DF.index.tolist() == mappings.all_fields()
|
||||||
|
|
||||||
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
||||||
|
|
||||||
assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
|
assert TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT == mappings.count_source_fields()
|
||||||
|
|
||||||
def test_copy(self):
|
def test_copy(self):
|
||||||
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
||||||
|
|
||||||
assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
|
assert TEST_MAPPING1_EXPECTED_DF.index.tolist() == mappings.all_fields()
|
||||||
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
||||||
assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
|
assert TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT == mappings.count_source_fields()
|
||||||
|
|
||||||
# Pick 1 source field
|
# Pick 1 source field
|
||||||
columns = ['dest_location']
|
columns = ['dest_location']
|
||||||
mappings_copy1 = ed.Mappings(mappings=mappings, columns=columns)
|
mappings_copy1 = ed.Mappings(mappings=mappings, columns=columns)
|
||||||
|
|
||||||
assert mappings_copy1.all_fields() == columns
|
assert columns == mappings_copy1.all_fields()
|
||||||
assert mappings_copy1.count_source_fields() == len(columns)
|
assert len(columns) == mappings_copy1.count_source_fields()
|
||||||
|
|
||||||
# Pick 3 source fields (out of order)
|
# Pick 3 source fields (out of order)
|
||||||
columns = ['dest_location', 'city', 'user_name']
|
columns = ['dest_location', 'city', 'user_name']
|
||||||
mappings_copy2 = ed.Mappings(mappings=mappings, columns=columns)
|
mappings_copy2 = ed.Mappings(mappings=mappings, columns=columns)
|
||||||
|
|
||||||
assert mappings_copy2.all_fields() == columns
|
assert columns == mappings_copy2.all_fields()
|
||||||
assert mappings_copy2.count_source_fields() == len(columns)
|
assert len(columns) == mappings_copy2.count_source_fields()
|
||||||
|
|
||||||
# Check original is still ok
|
# Check original is still ok
|
||||||
assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
|
assert TEST_MAPPING1_EXPECTED_DF.index.tolist() == mappings.all_fields()
|
||||||
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
||||||
assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
|
assert TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT == mappings.count_source_fields()
|
||||||
|
|
||||||
|
def test_dtypes(self):
|
||||||
|
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
||||||
|
|
||||||
|
expected_dtypes = pd.Series(
|
||||||
|
{'city': 'object', 'content': 'object', 'dest_location': 'object', 'email': 'object',
|
||||||
|
'maps-telemetry.attributesPerMap.dataSourcesCount.avg': 'int64',
|
||||||
|
'maps-telemetry.attributesPerMap.dataSourcesCount.max': 'int64',
|
||||||
|
'maps-telemetry.attributesPerMap.dataSourcesCount.min': 'int64',
|
||||||
|
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.avg': 'float64',
|
||||||
|
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.max': 'int64',
|
||||||
|
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.min': 'int64',
|
||||||
|
'my_join_field': 'object', 'name': 'object', 'origin_location.lat': 'object',
|
||||||
|
'origin_location.lon': 'object', 'text': 'object', 'tweeted_at': 'datetime64[ns]',
|
||||||
|
'type': 'object', 'user_name': 'object'})
|
||||||
|
|
||||||
|
assert_series_equal(expected_dtypes, mappings.dtypes())
|
||||||
|
|
||||||
|
def test_get_dtype_counts(self):
|
||||||
|
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
||||||
|
|
||||||
|
expected_get_dtype_counts = pd.Series({'datetime64[ns]': 1, 'float64': 1, 'int64': 5, 'object': 11})
|
||||||
|
|
||||||
|
assert_series_equal(expected_get_dtype_counts, mappings.get_dtype_counts())
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatability
|
||||||
from eland.tests.frame.common import TestData
|
from eland.tests.common import TestData
|
||||||
from eland.tests import *
|
|
||||||
|
|
||||||
import eland as ed
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import io
|
||||||
|
|
||||||
from pandas.util.testing import (
|
from pandas.util.testing import (
|
||||||
assert_almost_equal, assert_frame_equal, assert_series_equal)
|
assert_series_equal, assert_frame_equal)
|
||||||
|
|
||||||
class TestDataFrameIndexing(TestData):
|
class TestDataFrameIndexing(TestData):
|
||||||
|
|
||||||
@ -59,7 +58,7 @@ class TestDataFrameIndexing(TestData):
|
|||||||
def test_to_string(self):
|
def test_to_string(self):
|
||||||
print(self.ed_flights())
|
print(self.ed_flights())
|
||||||
|
|
||||||
def test_get_item(self):
|
def test_getitem(self):
|
||||||
# Test 1 attribute
|
# Test 1 attribute
|
||||||
ed_carrier = self.ed_flights()['Carrier']
|
ed_carrier = self.ed_flights()['Carrier']
|
||||||
|
|
||||||
@ -96,3 +95,72 @@ class TestDataFrameIndexing(TestData):
|
|||||||
|
|
||||||
#ed_3_items_to_string = ed_3_items.to_string()
|
#ed_3_items_to_string = ed_3_items.to_string()
|
||||||
#print(ed_3_items_to_string)
|
#print(ed_3_items_to_string)
|
||||||
|
|
||||||
|
# Test numerics
|
||||||
|
numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
|
||||||
|
ed_numerics = self.ed_flights()[numerics]
|
||||||
|
|
||||||
|
# just test headers
|
||||||
|
ed_numerics_describe = ed_numerics.describe()
|
||||||
|
assert ed_numerics_describe.columns.tolist() == numerics
|
||||||
|
|
||||||
|
def test_info(self):
|
||||||
|
ed_flights_info_buf = io.StringIO()
|
||||||
|
pd_flights_info_buf = io.StringIO()
|
||||||
|
|
||||||
|
self.ed_flights().info(buf=ed_flights_info_buf)
|
||||||
|
self.pd_flights().info(buf=pd_flights_info_buf)
|
||||||
|
|
||||||
|
ed_flights_info = (ed_flights_info_buf.getvalue().splitlines())
|
||||||
|
pd_flights_info = (pd_flights_info_buf.getvalue().splitlines())
|
||||||
|
|
||||||
|
flights_diff = set(ed_flights_info).symmetric_difference(set(pd_flights_info))
|
||||||
|
|
||||||
|
ed_ecommerce_info_buf = io.StringIO()
|
||||||
|
pd_ecommerce_info_buf = io.StringIO()
|
||||||
|
|
||||||
|
self.ed_ecommerce().info(buf=ed_ecommerce_info_buf)
|
||||||
|
self.pd_ecommerce().info(buf=pd_ecommerce_info_buf)
|
||||||
|
|
||||||
|
ed_ecommerce_info = (ed_ecommerce_info_buf.getvalue().splitlines())
|
||||||
|
pd_ecommerce_info = (pd_ecommerce_info_buf.getvalue().splitlines())
|
||||||
|
|
||||||
|
# We don't compare ecommerce here as the default dtypes in pandas from read_json
|
||||||
|
# don't match the mapping types. This is mainly because the products field is
|
||||||
|
# nested and so can be treated as a multi-field in ES, but not in pandas
|
||||||
|
ecommerce_diff = set(ed_ecommerce_info).symmetric_difference(set(pd_ecommerce_info))
|
||||||
|
|
||||||
|
|
||||||
|
def test_count(self):
|
||||||
|
pd_flights_count = self.pd_flights().count()
|
||||||
|
ed_flights_count = self.ed_flights().count()
|
||||||
|
|
||||||
|
assert_series_equal(pd_flights_count, ed_flights_count)
|
||||||
|
|
||||||
|
pd_ecommerce_count = self.pd_ecommerce().count()
|
||||||
|
ed_ecommerce_count = self.ed_ecommerce().count()
|
||||||
|
|
||||||
|
assert_series_equal(pd_ecommerce_count, ed_ecommerce_count)
|
||||||
|
|
||||||
|
def test_get_dtype_counts(self):
|
||||||
|
pd_flights_get_dtype_counts = self.pd_flights().get_dtype_counts().sort_index()
|
||||||
|
ed_flights_get_dtype_counts = self.ed_flights().get_dtype_counts().sort_index()
|
||||||
|
|
||||||
|
assert_series_equal(pd_flights_get_dtype_counts, ed_flights_get_dtype_counts)
|
||||||
|
|
||||||
|
def test_properties(self):
|
||||||
|
pd_flights_shape = self.pd_flights().shape
|
||||||
|
ed_flights_shape = self.ed_flights().shape
|
||||||
|
|
||||||
|
assert pd_flights_shape == ed_flights_shape
|
||||||
|
|
||||||
|
pd_flights_columns = self.pd_flights().columns
|
||||||
|
ed_flights_columns = self.ed_flights().columns
|
||||||
|
|
||||||
|
assert pd_flights_columns.tolist() == ed_flights_columns.tolist()
|
||||||
|
|
||||||
|
pd_flights_dtypes = self.pd_flights().dtypes
|
||||||
|
ed_flights_dtypes = self.ed_flights().dtypes
|
||||||
|
|
||||||
|
assert_series_equal(pd_flights_dtypes, ed_flights_dtypes)
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user