diff --git a/eland/dataframe.py b/eland/dataframe.py index c0995c7..c9a35c9 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1,11 +1,18 @@ import warnings +import sys import pandas as pd +import numpy as np + from pandas.compat import StringIO +from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.io.common import _expand_user, _stringify_path from pandas.io.formats import console +from pandas.io.formats import format as fmt +from pandas.io.formats.printing import pprint_thing from eland import NDFrame +from eland import Series class DataFrame(NDFrame): @@ -67,13 +74,22 @@ class DataFrame(NDFrame): return buf.getvalue() + def info_es(self): + buf = StringIO() + + super().info_es(buf) + + return buf.getvalue() + + + def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, max_cols=None, show_dimensions=False, decimal='.', line_width=None): """ - From pandas - except we set max_rows default to avoid careless + From pandas - except we set max_rows default to avoid careless extraction of entire index """ if max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " @@ -111,3 +127,82 @@ class DataFrame(NDFrame): if buf is None: result = _buf.getvalue() return result + + def _getitem(self, key): + """Get the column specified by key for this DataFrame. + + Args: + key : The column name. + + Returns: + A Pandas Series representing the value for the column. + """ + key = apply_if_callable(key, self) + # Shortcut if key is an actual column + try: + if key in self.columns: + return self._getitem_column(key) + except (KeyError, ValueError, TypeError): + pass + if isinstance(key, (Series, np.ndarray, pd.Index, list)): + return self._getitem_array(key) + elif isinstance(key, DataFrame): + return self.where(key) + else: + return self._getitem_column(key) + + def _getitem_column(self, key): + if key not in self.columns: + raise KeyError("{}".format(key)) + s = self._reduce_dimension(self._query_compiler.getitem_column_array([key])) + s._parent = self + return s + + def _getitem_array(self, key): + if isinstance(key, Series): + key = key._to_pandas() + if is_bool_indexer(key): + if isinstance(key, pd.Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + PendingDeprecationWarning, + stacklevel=3, + ) + elif len(key) != len(self.index): + raise ValueError( + "Item wrong length {} instead of {}.".format( + len(key), len(self.index) + ) + ) + key = check_bool_indexer(self.index, key) + # We convert to a RangeIndex because getitem_row_array is expecting a list + # of indices, and RangeIndex will give us the exact indices of each boolean + # requested. + key = pd.RangeIndex(len(self.index))[key] + if len(key): + return DataFrame( + query_compiler=self._query_compiler.getitem_row_array(key) + ) + else: + return DataFrame(columns=self.columns) + else: + if any(k not in self.columns for k in key): + raise KeyError( + "{} not index".format( + str([k for k in key if k not in self.columns]).replace(",", "") + ) + ) + return DataFrame( + query_compiler=self._query_compiler.getitem_column_array(key) + ) + + def _reduce_dimension(self, query_compiler): + return Series(query_compiler=query_compiler) + + def _to_pandas(self): + return self._query_compiler.to_pandas() + + def squeeze(self, axis=None): + return DataFrame( + query_compiler=self._query_compiler.squeeze(axis) + ) diff --git a/eland/index.py b/eland/index.py index eb13ce6..c175916 100644 --- a/eland/index.py +++ b/eland/index.py @@ -49,3 +49,16 @@ class Index: def __len__(self): return self._query_compiler._index_count() + + # Make iterable + def __next__(self): + # TODO resolve this hack to make this 'iterable' + raise StopIteration() + + def __iter__(self): + return self + + def info_es(self, buf): + buf.write("Index:\n") + buf.write("\tindex_field: {0}\n".format(self.index_field)) + buf.write("\tis_source_field: {0}\n".format(self.is_source_field)) diff --git a/eland/mappings.py b/eland/mappings.py index 76d7286..79d60ef 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -456,13 +456,7 @@ class Mappings: return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[ '_source'].count().to_dict()) - def to_pandas(self): - """ - - Returns - ------- - df : pd.DataFrame - pandas DaraFrame representing this index - """ - + def info_es(self, buf): + buf.write("Mappings:\n") + buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities)) diff --git a/eland/ndframe.py b/eland/ndframe.py index 383e298..30b2897 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -24,6 +24,7 @@ only Elasticsearch aggregatable fields can be aggregated or grouped. """ from modin.pandas.base import BasePandasDataset +from modin.pandas.indexing import _iLocIndexer from eland import ElandQueryCompiler @@ -51,6 +52,7 @@ class NDFrame(BasePandasDataset): index_field=index_field) self._query_compiler = query_compiler + def _get_index(self): return self._query_compiler.index @@ -72,6 +74,30 @@ class NDFrame(BasePandasDataset): return head.append(tail) - def _to_pandas(self): - return self._query_compiler.to_pandas() + def __getattr__(self, key): + """After regular attribute access, looks up the name in the columns + + Args: + key (str): Attribute name. + + Returns: + The value of the attribute. + """ + print(key) + try: + return object.__getattribute__(self, key) + except AttributeError as e: + if key in self.columns: + return self[key] + raise e + + @property + def iloc(self): + """Purely integer-location based indexing for selection by position. + + """ + return _iLocIndexer(self) + + def info_es(self, buf): + self._query_compiler.info_es(buf) diff --git a/eland/operations.py b/eland/operations.py index ec18a14..7c6c863 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -1,5 +1,10 @@ from enum import Enum +from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.range import RangeIndex + +import copy + class Operations: """ @@ -39,9 +44,16 @@ class Operations: @staticmethod def to_string(order): if order == Operations.SortOrder.ASC: - return ":asc" + return "asc" + + return "desc" + + def from_string(order): + if order == "asc": + return Operations.SortOrder.ASC + + return Operations.SortOrder.DESC - return ":desc" def __init__(self, tasks=None): if tasks == None: @@ -53,7 +65,7 @@ class Operations: return type(self)(*args, **kwargs) def copy(self): - return self.__constructor__(tasks=self._tasks.copy()) + return self.__constructor__(tasks=copy.deepcopy(self._tasks)) def head(self, index, n): # Add a task that is an ascending sort with size=n @@ -61,13 +73,24 @@ class Operations: self._tasks.append(task) def tail(self, index, n): - # Add a task that is descending sort with size=n task = ('tail', (index.sort_field, n)) self._tasks.append(task) def set_columns(self, columns): - self._tasks['columns'] = columns + # Setting columns at different phases of the task list may result in different + # operations. So instead of setting columns once, set when it happens in call chain + # TODO - column renaming + # TODO - validate we are setting columns to a subset of last columns? + task = ('columns', columns) + self._tasks.append(task) + + def get_columns(self): + # Iterate backwards through task list looking for last 'columns' task + for task in reversed(self._tasks): + if task[0] == 'columns': + return task[1] + return None def __repr__(self): return repr(self._tasks) @@ -77,15 +100,38 @@ class Operations: size, sort_params = Operations._query_to_params(query) - es_results = query_compiler._client.search( - index=query_compiler._index_pattern, - size=size, - sort=sort_params) + # Only return requested columns + columns = self.get_columns() + + # If size=None use scan not search - then post sort results when in df + # If size>10000 use scan + if size is not None and size <= 10000: + es_results = query_compiler._client.search( + index=query_compiler._index_pattern, + size=size, + sort=sort_params, + _source=columns) + else: + es_results = query_compiler._client.scan( + index=query_compiler._index_pattern, + _source=columns) + # create post sort + if sort_params is not None: + post_processing.append(self._sort_params_to_postprocessing(sort_params)) df = query_compiler._es_results_to_pandas(es_results) return self._apply_df_post_processing(df, post_processing) + def iloc(self, index, columns): + # index and columns are indexers + task = ('iloc', (index, columns)) + self._tasks.append(task) + + def squeeze(self, axis): + task = ('squeeze', axis) + self._tasks.append(task) + def to_count(self, query_compiler): query, post_processing = self._to_es_query() @@ -106,11 +152,23 @@ class Operations: return query_compiler._client.count(index=query_compiler._index_pattern, body=exists_query) + @staticmethod + def _sort_params_to_postprocessing(input): + # Split string + sort_params = input.split(":") + + query_sort_field = sort_params[0] + query_sort_order = Operations.SortOrder.from_string(sort_params[1]) + + task = ('sort_field', (query_sort_field, query_sort_order)) + + return task + @staticmethod def _query_to_params(query): sort_params = None if query['query_sort_field'] and query['query_sort_order']: - sort_params = query['query_sort_field'] + Operations.SortOrder.to_string(query['query_sort_order']) + sort_params = query['query_sort_field'] + ":" + Operations.SortOrder.to_string(query['query_sort_order']) size = query['query_size'] @@ -135,6 +193,25 @@ class Operations: df = df.head(action[1][1]) elif action[0] == 'tail': df = df.tail(action[1][1]) + elif action[0] == 'sort_field': + sort_field = action[1][0] + sort_order = action[1][1] + if sort_order == Operations.SortOrder.ASC: + df = df.sort_values(sort_field, True) + else: + df = df.sort_values(sort_field, False) + elif action[0] == 'iloc': + index_indexer = action[1][0] + column_indexer = action[1][1] + if index_indexer is None: + index_indexer = slice(None) + if column_indexer is None: + column_indexer = slice(None) + df = df.iloc[index_indexer, column_indexer] + elif action[0] == 'squeeze': + print(df) + df = df.squeeze(axis=action[1]) + print(df) return df @@ -145,7 +222,8 @@ class Operations: # other operations require in-core post-processing of results query = {"query_sort_field": None, "query_sort_order": None, - "query_size": None} + "query_size": None, + "query_fields": None} post_processing = [] @@ -154,6 +232,10 @@ class Operations: query, post_processing = self._resolve_head(task, query, post_processing) elif task[0] == 'tail': query, post_processing = self._resolve_tail(task, query, post_processing) + elif task[0] == 'iloc': + query, post_processing = self._resolve_iloc(task, query, post_processing) + else: # a lot of operations simply post-process the dataframe - put these straight through + query, post_processing = self._resolve_post_processing_task(task, query, post_processing) return query, post_processing @@ -229,3 +311,44 @@ class Operations: post_processing.append(('sort_index')) return query, post_processing + + def _resolve_iloc(self, item, query, post_processing): + # tail - sort desc, size n, post-process sort asc + # |---4--7-9---------| + + # This is a list of items we return via an integer index + int_index = item[1][0] + if int_index is not None: + last_item = int_index.max() + + # If we have a query_size we do this post processing + if query['query_size'] is not None: + post_processing.append(item) + return query, post_processing + + # size should be > last item + query['query_size'] = last_item + 1 + post_processing.append(item) + + return query, post_processing + + def _resolve_post_processing_task(self, item, query, post_processing): + # Just do this in post-processing + post_processing.append(item) + + return query, post_processing + + def info_es(self, buf): + buf.write("Operations:\n") + buf.write("\ttasks: {0}\n".format(self._tasks)) + + query, post_processing = self._to_es_query() + size, sort_params = Operations._query_to_params(query) + columns = self.get_columns() + + buf.write("\tsize: {0}\n".format(size)) + buf.write("\tsort_params: {0}\n".format(sort_params)) + buf.write("\tcolumns: {0}\n".format(columns)) + buf.write("\tpost_processing: {0}\n".format(post_processing)) + + diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 49b0125..f96e26c 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -6,6 +6,9 @@ from eland import Index from eland import Mappings from eland import Operations +from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.range import RangeIndex + class ElandQueryCompiler(BaseQueryCompiler): @@ -29,13 +32,24 @@ class ElandQueryCompiler(BaseQueryCompiler): else: self._operations = operations + if columns is not None: + self.columns = columns + def _get_index(self): return self._index def _get_columns(self): - return pd.Index(self._mappings.source_fields()) + columns = self._operations.get_columns() + if columns is None: + # default to all + columns = self._mappings.source_fields() - columns = property(_get_columns) + return pd.Index(columns) + + def _set_columns(self, columns): + self._operations.set_columns(columns) + + columns = property(_get_columns, _set_columns) index = property(_get_index) # END Index, columns, and dtypes objects @@ -218,7 +232,7 @@ class ElandQueryCompiler(BaseQueryCompiler): return self.__constructor__( client=self._client, index_pattern=self._index_pattern, - columns=self.columns, + columns=None, # columns are embedded in operations index_field=self._index.index_field, operations=self._operations.copy() ) @@ -245,3 +259,46 @@ class ElandQueryCompiler(BaseQueryCompiler): Pandas DataFrame """ return self._operations.to_pandas(self) + + # __getitem__ methods + def getitem_column_array(self, key, numeric=False): + """Get column data for target labels. + + Args: + key: Target labels by which to retrieve data. + numeric: A boolean representing whether or not the key passed in represents + the numeric index or the named index. + + Returns: + A new QueryCompiler. + """ + result = self.copy() + + if numeric: + raise NotImplementedError("Not implemented yet...") + + result._operations.set_columns(key) + + return result + + def squeeze(self, axis=None): + result = self.copy() + + result._operations.squeeze(axis) + + return result + + def view(self, index=None, columns=None): + result = self.copy() + + result._operations.iloc(index, columns) + + return result + + def info_es(self, buf): + buf.write("index_pattern: {index_pattern}\n".format(index_pattern=self._index_pattern)) + + self._index.info_es(buf) + self._mappings.info_es(buf) + self._operations.info_es(buf) + diff --git a/eland/series.py b/eland/series.py index e69de29..89381af 100644 --- a/eland/series.py +++ b/eland/series.py @@ -0,0 +1,153 @@ +""" +Series +--------- +One-dimensional ndarray with axis labels (including time series). + +The underlying data resides in Elasticsearch and the API aligns as much as +possible with pandas.DataFrame API. + +This allows the eland.Series to access large datasets stored in Elasticsearch, +without storing the dataset in local memory. + +Implementation Details +---------------------- +Based on NDFrame which underpins eland.1DataFrame + +""" + +import pandas as pd +import warnings + +from eland import NDFrame + + +class Series(NDFrame): + """ + pandas.Series like API that proxies into Elasticsearch index(es). + + Parameters + ---------- + client : eland.Client + A reference to a Elasticsearch python client + + index_pattern : str + An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*). + + field_name : str + The field to base the series on + + See Also + -------- + + Examples + -------- + + import eland as ed + client = ed.Client(Elasticsearch()) + s = ed.DataFrame(client, 'reviews', 'date') + df.head() + reviewerId vendorId rating date + 0 0 0 5 2006-04-07 17:08 + 1 1 1 5 2006-05-04 12:16 + 2 2 2 4 2006-04-21 12:26 + 3 3 3 5 2006-04-18 15:48 + 4 3 4 5 2006-04-18 15:49 + + Notice that the types are based on Elasticsearch mappings + + Notes + ----- + If the Elasticsearch index is deleted or index mappings are changed after this + object is created, the object is not rebuilt and so inconsistencies can occur. + + """ + + def __init__(self, + client=None, + index_pattern=None, + name=None, + index_field=None, + query_compiler=None): + # Series has 1 column + if name is None: + columns = None + else: + columns = [name] + + super().__init__( + client=client, + index_pattern=index_pattern, + columns=columns, + index_field=index_field, + query_compiler=query_compiler) + + @property + def empty(self): + """Determines if the Series is empty. + + Returns: + True if the Series is empty. + False otherwise. + """ + # TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337 + # (as Index.__len__ performs an query) we may want to cache self.index.empty() + return len(self.index) == 0 + + def _get_name(self): + return self._query_compiler.columns[0] + + name = property(_get_name) + + def head(self, n=5): + return super().head(n) + + def tail(self, n=5): + return super().tail(n) + + # ---------------------------------------------------------------------- + # Rendering Methods + def __repr__(self): + num_rows = pd.get_option("max_rows") or 60 + + return self.to_string(max_rows=num_rows) + + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None): + + if max_rows is None: + warnings.warn("Series.to_string called without max_rows set " + "- this will return entire index results. " + "Setting max_rows=60, overwrite if different behaviour is required.") + max_rows = 60 + + # Create a slightly bigger dataframe than display + temp_df = self._build_repr_df(max_rows+1, None) + if isinstance(temp_df, pd.DataFrame): + temp_df = temp_df[self.name] + temp_str = repr(temp_df) + if self.name is not None: + name_str = "Name: {}, ".format(str(self.name)) + else: + name_str = "" + if len(self.index) > max_rows: + len_str = "Length: {}, ".format(len(self.index)) + else: + len_str = "" + dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1]) + if len(self) == 0: + return "Series([], {}{}".format(name_str, dtype_str) + return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( + name_str, len_str, dtype_str + ) + + def _to_pandas(self): + return self._query_compiler.to_pandas()[self.name] diff --git a/eland/tests/common.py b/eland/tests/common.py index 6243a4e..a1fe68a 100644 --- a/eland/tests/common.py +++ b/eland/tests/common.py @@ -1,10 +1,9 @@ import pytest import eland as ed - import pandas as pd -from pandas.util.testing import (assert_frame_equal) +from pandas.util.testing import (assert_frame_equal, assert_series_equal) import os @@ -47,14 +46,26 @@ class TestData: def assert_pandas_eland_frame_equal(left, right): if not isinstance(left, pd.DataFrame): - raise AssertionError("Expected type {exp_type}, found {act_type} instead", - exp_type=type(pd.DataFrame), act_type=type(left)) + raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( + exp_type='pd.DataFrame', act_type=type(left))) if not isinstance(right, ed.DataFrame): - raise AssertionError("Expected type {exp_type}, found {act_type} instead", - exp_type=type(ed.DataFrame), act_type=type(right)) + raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( + exp_type='ed.DataFrame', act_type=type(right))) # Use pandas tests to check similarity assert_frame_equal(left, right._to_pandas()) +def assert_pandas_eland_series_equal(left, right): + if not isinstance(left, pd.Series): + raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( + exp_type='pd.Series', act_type=type(left))) + + if not isinstance(right, ed.Series): + raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( + exp_type='ed.Series', act_type=type(right))) + + # Use pandas tests to check similarity + assert_series_equal(left, right._to_pandas()) + diff --git a/eland/tests/dataframe/test_count_pytest.py b/eland/tests/dataframe/test_count_pytest.py new file mode 100644 index 0000000..4d3af5f --- /dev/null +++ b/eland/tests/dataframe/test_count_pytest.py @@ -0,0 +1,13 @@ +# File called _pytest for PyCharm compatability + +from eland.tests.common import TestData + + +class TestDataFrameCount(TestData): + + def test_to_string1(self): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + #ed_count = ed_flights.count() + diff --git a/eland/tests/dataframe/test_getitem_pytest.py b/eland/tests/dataframe/test_getitem_pytest.py new file mode 100644 index 0000000..16c8d75 --- /dev/null +++ b/eland/tests/dataframe/test_getitem_pytest.py @@ -0,0 +1,39 @@ +# File called _pytest for PyCharm compatability +import pandas as pd + +from eland.tests.common import TestData +from eland.tests.common import ( + assert_pandas_eland_frame_equal, + assert_pandas_eland_series_equal +) + + + +class TestDataFrameGetItem(TestData): + + def test_getitem1(self): + ed_flights = self.ed_flights().head(103) + pd_flights = self.pd_flights().head(103) + + ed_flights_OriginAirportID = ed_flights['OriginAirportID'] + pd_flights_OriginAirportID = pd_flights['OriginAirportID'] + + assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID) + + def test_getitem2(self): + ed_flights = self.ed_flights().head(42) + pd_flights = self.pd_flights().head(42) + + ed_flights_slice = ed_flights[['OriginAirportID', 'AvgTicketPrice', 'Carrier']] + pd_flights_slice = pd_flights[['OriginAirportID', 'AvgTicketPrice', 'Carrier']] + + assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice) + + def test_getitem3(self): + ed_flights = self.ed_flights().head(89) + pd_flights = self.pd_flights().head(89) + + ed_flights_OriginAirportID = ed_flights.OriginAirportID + pd_flights_OriginAirportID = pd_flights.OriginAirportID + + assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID) diff --git a/eland/tests/dataframe/test_iloc_pytest.py b/eland/tests/dataframe/test_iloc_pytest.py new file mode 100644 index 0000000..8c49d28 --- /dev/null +++ b/eland/tests/dataframe/test_iloc_pytest.py @@ -0,0 +1,76 @@ +# File called _pytest for PyCharm compatability +import pandas as pd +import eland as ed + +from eland.tests.common import TestData +from eland.tests.common import ( + assert_pandas_eland_frame_equal, + assert_pandas_eland_series_equal +) + +import numpy as np + +class TestDataFrameiLoc(TestData): + + def test_range(self): + columns = ['a', 'b', 'c', 'd', 'e'] + + r = pd.RangeIndex(0, 3, 1) + + i = pd.Int64Index([1, 2]) + + dates = pd.date_range('1/1/2000', periods=8) + + df = pd.DataFrame(np.random.randn(8, 4), index = dates, columns = ['A', 'B', 'C', 'D']) + + print(df) + + print("STEVE ", df.squeeze()) + + ii = slice(None) + rr = slice(None) + + print(df.iloc[:, 0:3]) + print(df.iloc[i, r]) + print(df.iloc[ii, rr]) + + def test_iloc1(self): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc + + #pd_flights.info() + + pd_iloc0 = pd_flights.iloc[0] + pd_iloc1= pd_flights.iloc[[0]] + pd_iloc2= pd_flights.iloc[[0, 1]] + pd_iloc3 = pd_flights.iloc[:3] + pd_iloc4 = pd_flights.iloc[[True, False, True]] + pd_iloc5 = pd_flights.iloc[0, 1] + pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]] + pd_iloc7 = pd_flights.iloc[1:3, 0:3] + pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]] + pd_iloc9 = pd_flights.iloc[[True, False, True, False]] + + ed_iloc0 = ed_flights.iloc[0] + ed_iloc1 = ed_flights.iloc[[0]] + ed_iloc2 = ed_flights.iloc[[0, 1]] + ed_iloc3 = ed_flights.iloc[:3] + ed_iloc4 = ed_flights.iloc[[True, False, True]] + ed_iloc5 = ed_flights.iloc[0, 1] + ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]] + ed_iloc7 = ed_flights.iloc[1:3, 0:3] + ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]] + ed_iloc9 = ed_flights.iloc[[True, False, True, False]] + + #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series + assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1) + assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2) + assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3) + assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4) + #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool + assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6) + assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7) + assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8) + assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9) diff --git a/eland/tests/dataframe/test_info_es_pytest.py b/eland/tests/dataframe/test_info_es_pytest.py new file mode 100644 index 0000000..41f84c6 --- /dev/null +++ b/eland/tests/dataframe/test_info_es_pytest.py @@ -0,0 +1,15 @@ +# File called _pytest for PyCharm compatability + +from eland.tests.common import TestData + + +class TestDataFrameInfo(TestData): + + def test_to_info1(self): + ed_flights = self.ed_flights() + + head = ed_flights.head(103) + slice = head[['timestamp', 'OriginRegion', 'Carrier']] + iloc = slice.iloc[10:92, [0,2]] + print(iloc.info_es()) + print(iloc) diff --git a/eland/tests/dataframe/test_info_pytest.py b/eland/tests/dataframe/test_info_pytest.py new file mode 100644 index 0000000..2fd2436 --- /dev/null +++ b/eland/tests/dataframe/test_info_pytest.py @@ -0,0 +1,12 @@ +# File called _pytest for PyCharm compatability + +from eland.tests.common import TestData + + +class TestDataFrameInfo(TestData): + + def test_to_info1(self): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_flights.info() diff --git a/eland/tests/series/__init__.py b/eland/tests/series/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eland/tests/series/test_head_tail_pytest.py b/eland/tests/series/test_head_tail_pytest.py new file mode 100644 index 0000000..803c8a9 --- /dev/null +++ b/eland/tests/series/test_head_tail_pytest.py @@ -0,0 +1,29 @@ +# File called _pytest for PyCharm compatability +import pandas as pd +import eland as ed + +from eland.tests.common import TestData +from eland.tests.common import assert_pandas_eland_series_equal + +from eland.tests import ELASTICSEARCH_HOST +from eland.tests import FLIGHTS_INDEX_NAME + +from pandas.util.testing import assert_series_equal + + + +class TestSeriesHeadTail(TestData): + + def test_head_tail(self): + pd_s = self.pd_flights()['Carrier'] + ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier') + + pd_s_head = pd_s.head(10) + ed_s_head = ed_s.head(10) + + assert_pandas_eland_series_equal(pd_s_head, ed_s_head) + + pd_s_tail = pd_s.tail(10) + ed_s_tail = ed_s.tail(10) + + assert_pandas_eland_series_equal(pd_s_tail, ed_s_tail) diff --git a/eland/tests/series/test_repr_pytest.py b/eland/tests/series/test_repr_pytest.py new file mode 100644 index 0000000..b3b6481 --- /dev/null +++ b/eland/tests/series/test_repr_pytest.py @@ -0,0 +1,24 @@ +# File called _pytest for PyCharm compatability +import pandas as pd +import eland as ed + +from eland.tests.common import TestData +from eland.tests.common import assert_pandas_eland_frame_equal + +from eland.tests import ELASTICSEARCH_HOST +from eland.tests import FLIGHTS_INDEX_NAME + +from pandas.util.testing import assert_series_equal + + + +class TestSeriesRepr(TestData): + + def test_repr(self): + pd_s = self.pd_flights()['Carrier'] + ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier') + + pd_repr = repr(pd_s) + ed_repr = repr(ed_s) + + assert pd_repr == ed_repr