mirror of https://github.com/elastic/eland.git

commit c6e0c5b92b (parent 67b7aee9c9)
Adding smaller test and first effort to implement aggs
@@ -469,4 +469,53 @@ class DataFrame(NDFrame):
     def keys(self):
         return self.columns
 
+    def to_csv(
+            self,
+            path_or_buf=None,
+            sep=",",
+            na_rep="",
+            float_format=None,
+            columns=None,
+            header=True,
+            index=True,
+            index_label=None,
+            mode="w",
+            encoding=None,
+            compression="infer",
+            quoting=None,
+            quotechar='"',
+            line_terminator=None,
+            chunksize=None,
+            tupleize_cols=None,
+            date_format=None,
+            doublequote=True,
+            escapechar=None,
+            decimal=".",
+            *args,
+            **kwargs
+    ):
+        kwargs = {
+            "path_or_buf": path_or_buf,
+            "sep": sep,
+            "na_rep": na_rep,
+            "float_format": float_format,
+            "columns": columns,
+            "header": header,
+            "index": index,
+            "index_label": index_label,
+            "mode": mode,
+            "encoding": encoding,
+            "compression": compression,
+            "quoting": quoting,
+            "quotechar": quotechar,
+            "line_terminator": line_terminator,
+            "chunksize": chunksize,
+            "tupleize_cols": tupleize_cols,
+            "date_format": date_format,
+            "doublequote": doublequote,
+            "escapechar": escapechar,
+            "decimal": decimal,
+        }
 
     hist = gfx.ed_hist_frame
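Note that the hunk ends after collecting the keyword arguments into `kwargs`; the diff does not show them being consumed. A minimal sketch of how they might be forwarded, assuming the query compiler grows a matching `to_csv` entry point (that call is hypothetical, not part of this commit):

    # Hypothetical continuation of DataFrame.to_csv - not part of this commit.
    # Assumes the query compiler exposes to_csv(**kwargs), which scans the
    # index in chunks and appends partial frames to the file (see _to_csv below).
    def to_csv_sketch(self, **kwargs):
        return self._query_compiler.to_csv(**kwargs)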
@@ -27,7 +27,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
 +-------------------------+-------+------------------------------------------------+
 | df.head                 | 783   | y                                              |
 +-------------------------+-------+------------------------------------------------+
-| df.drop                 | 761   |                                                |
+| df.drop                 | 761   | y                                              |
 +-------------------------+-------+------------------------------------------------+
 | df.sum                  | 755   | y                                              |
 +-------------------------+-------+------------------------------------------------+
@@ -70,6 +70,9 @@ class Operations:
     def set_columns(self, columns):
         # Setting columns at different phases of the task list may result in different
        # operations. So instead of setting columns once, set them when it happens in the call chain
+        if type(columns) is not list:
+            columns = list(columns)
+
         # TODO - column renaming
         # TODO - validate we are setting columns to a subset of last columns?
         task = ('columns', columns)
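The `('columns', columns)` tuple is appended to a task list and replayed later rather than applied immediately. A standalone sketch of this deferred-operation pattern (the names are illustrative, not eland's actual internals):

    # Illustrative sketch of the deferred task-list pattern.
    tasks = []

    def set_columns(columns):
        # Record the operation instead of applying it now.
        tasks.append(('columns', list(columns)))

    def apply_tasks(df):
        # Replay recorded operations in call-chain order.
        for action, payload in tasks:
            if action == 'columns':
                df = df[payload]
            elif action == 'squeeze':
                df = df.squeeze(axis=payload)
        return df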
@@ -483,6 +486,7 @@ class Operations:
             df = df.iloc[index_indexer, column_indexer]
         elif action[0] == 'squeeze':
             df = df.squeeze(axis=action[1])
+        # columns could be in here (and we ignore it)
 
         return df
@@ -11,6 +11,36 @@ from pandas.core.indexes.range import RangeIndex
 
 
 class ElandQueryCompiler(BaseQueryCompiler):
+    """
+    Some notes on what can and can not be mapped:
+
+    1. df.head(10)
+
+        /_search?size=10
+
+    2. df.tail(10)
+
+        /_search?size=10&sort=_doc:desc
+        + post_process results (sort_index)
+
+    3. df[['OriginAirportID', 'AvgTicketPrice', 'Carrier']]
+
+        /_search
+        { '_source': ['OriginAirportID', 'AvgTicketPrice', 'Carrier']}
+
+    4. df.drop(['1', '2'])
+
+        /_search
+        {'query': {'bool': {'must': [], 'must_not': [{'ids': {'values': ['1', '2']}}]}}, 'aggs': {}}
+
+        This doesn't work if size is set (e.g. head/tail) as we don't know in Elasticsearch if values '1' or '2' are
+        in the first/last n rows.
+
+        A way to mitigate this would be to post process this drop - TODO
+    """
 
     def __init__(self,
                  client=None,
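The docstring's mappings can be exercised directly against Elasticsearch. A sketch using the elasticsearch-py client (the host and index name are assumptions for illustration):

    # Sketch of the docstring's query mappings via elasticsearch-py.
    from elasticsearch import Elasticsearch

    es = Elasticsearch('localhost:9200')  # host is an assumption

    # 1. df.head(10)  ->  /_search?size=10
    head = es.search(index='flights', body={'size': 10})

    # 2. df.tail(10)  ->  /_search?size=10&sort=_doc:desc, then sort_index()
    tail = es.search(index='flights',
                     body={'size': 10, 'sort': [{'_doc': 'desc'}]})

    # 3. column projection via _source filtering
    cols = es.search(index='flights',
                     body={'_source': ['OriginAirportID', 'AvgTicketPrice', 'Carrier']})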
@@ -155,45 +185,6 @@ class ElandQueryCompiler(BaseQueryCompiler):
         if results is None:
             return self._empty_pd_ef()
 
-        def flatten_dict(y):
-            out = {}
-
-            def flatten(x, name=''):
-                # We flatten into source fields e.g. if type=geo_point
-                # location: {lat=52.38, lon=4.90}
-                if name == '':
-                    is_source_field = False
-                    pd_dtype = 'object'
-                else:
-                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
-
-                if not is_source_field and type(x) is dict:
-                    for a in x:
-                        flatten(x[a], name + a + '.')
-                elif not is_source_field and type(x) is list:
-                    for a in x:
-                        flatten(a, name)
-                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
-                    field_name = name[:-1]
-
-                    # Coerce types - for now just datetime
-                    if pd_dtype == 'datetime64[ns]':
-                        x = pd.to_datetime(x)
-
-                    # Elasticsearch can have multiple values for a field. These are represented as lists, so
-                    # create lists for this pivot (see notes above)
-                    if field_name in out:
-                        if type(out[field_name]) is not list:
-                            l = [out[field_name]]
-                            out[field_name] = l
-                        out[field_name].append(x)
-                    else:
-                        out[field_name] = x
-
-            flatten(y)
-
-            return out
-
         rows = []
         index = []
         if isinstance(results, dict):
@@ -212,7 +203,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
             index.append(index_field)
 
             # flatten row to map correctly to 2D DataFrame
-            rows.append(flatten_dict(row))
+            rows.append(self._flatten_dict(row))
 
         # Create pandas DataFrame
         df = pd.DataFrame(data=rows, index=index)
@@ -232,6 +223,100 @@ class ElandQueryCompiler(BaseQueryCompiler):
 
         return df
 
+    def _to_csv(self, results, **kwargs):
+        # Very similar to _es_results_to_pandas except we create partial pandas.DataFrames
+        # and write these to csv
+
+        # Use chunksize in kwargs to determine the size of each partial data frame
+        if 'chunksize' in kwargs:
+            chunksize = kwargs['chunksize']
+        else:
+            # If no chunksize is given, default to 1000
+            chunksize = 1000
+
+        if results is None:
+            return self._empty_pd_ef()
+
+        rows = []
+        index = []
+        if isinstance(results, dict):
+            iterator = results['hits']['hits']
+        else:
+            iterator = results
+
+        i = 0
+        for hit in iterator:
+            row = hit['_source']
+
+            # get index value - can be _id or can be field value in source
+            if self._index.is_source_field:
+                index_field = row[self._index.index_field]
+            else:
+                index_field = hit[self._index.index_field]
+            index.append(index_field)
+
+            # flatten row to map correctly to 2D DataFrame
+            rows.append(self._flatten_dict(row))
+
+            i = i + 1
+            if i % chunksize == 0:
+                # Create partial pandas DataFrame
+                df = pd.DataFrame(data=rows, index=index)
+
+                # _source may not contain all columns in the mapping
+                # therefore, fill in missing columns
+                # (note this returns self.columns NOT IN df.columns)
+                missing_columns = list(set(self.columns) - set(df.columns))
+
+                for missing in missing_columns:
+                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
+                    df[missing] = None
+                    df[missing] = df[missing].astype(pd_dtype)  # astype returns a copy; reassign
+
+                # Sort columns in mapping order
+                df = df[self.columns]
+
+        return df
+
+    def _flatten_dict(self, y):
+        out = {}
+
+        def flatten(x, name=''):
+            # We flatten into source fields e.g. if type=geo_point
+            # location: {lat=52.38, lon=4.90}
+            if name == '':
+                is_source_field = False
+                pd_dtype = 'object'
+            else:
+                is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
+
+            if not is_source_field and type(x) is dict:
+                for a in x:
+                    flatten(x[a], name + a + '.')
+            elif not is_source_field and type(x) is list:
+                for a in x:
+                    flatten(a, name)
+            elif is_source_field:  # only map source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
+                field_name = name[:-1]
+
+                # Coerce types - for now just datetime
+                if pd_dtype == 'datetime64[ns]':
+                    x = pd.to_datetime(x)
+
+                # Elasticsearch can have multiple values for a field. These are represented as lists, so
+                # create lists for this pivot (see notes above)
+                if field_name in out:
+                    if type(out[field_name]) is not list:
+                        out[field_name] = [out[field_name]]
+                    out[field_name].append(x)
+                else:
+                    out[field_name] = x
+
+        flatten(y)
+
+        return out
+
     def _index_count(self):
         """
         Returns
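`_flatten_dict` collapses nested `_source` documents into dotted column names and collects multi-valued fields into lists. A standalone sketch of the same idea, without the mappings dependency that gates descent into source fields like geo_point (the sample document is made up):

    # Standalone sketch of the flattening behaviour; always descends into
    # dicts, unlike the real method which stops at mapped source fields.
    def flatten_dict(y):
        out = {}

        def flatten(x, name=''):
            if isinstance(x, dict):
                for key in x:
                    flatten(x[key], name + key + '.')
            elif isinstance(x, list):
                for item in x:
                    flatten(item, name)  # multi-valued field: keep same name
            else:
                field_name = name[:-1]
                if field_name in out:
                    # Second value for the same field -> promote to a list
                    if not isinstance(out[field_name], list):
                        out[field_name] = [out[field_name]]
                    out[field_name].append(x)
                else:
                    out[field_name] = x

        flatten(y)
        return out

    # {'location.lat': 52.38, 'location.lon': 4.9, 'tags': ['a', 'b']}
    print(flatten_dict({'location': {'lat': 52.38, 'lon': 4.9}, 'tags': ['a', 'b']}))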
@@ -99,6 +99,11 @@ FLIGHTS_MAPPING = { "mappings" : {
 FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
 FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
 
+FLIGHTS_SMALL_INDEX_NAME = 'flights_small'
+FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
+FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
+FLIGHTS_SMALL_DF_FILE_NAME = ROOT_DIR + '/flights_small_df.json.gz'
+
 ECOMMERCE_INDEX_NAME = 'ecommerce'
 ECOMMERCE_MAPPING = { "mappings" : {
     "properties" : {
@@ -12,6 +12,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 # Create pandas and eland data frames
 from eland.tests import ELASTICSEARCH_HOST
 from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
+    FLIGHTS_SMALL_INDEX_NAME,\
     ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME
 
 _pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
@@ -20,6 +21,9 @@ _pd_flights['timestamp'] = \
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
 _ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
 
+_pd_flights_small = _pd_flights.head(48)
+_ed_flights_small = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_SMALL_INDEX_NAME)
+
 _pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
 _pd_ecommerce['order_date'] = \
     pd.to_datetime(_pd_ecommerce['order_date'])
@@ -38,6 +42,13 @@ class TestData:
     def ed_flights(self):
         return _ed_flights
 
+    def pd_flights_small(self):
+        return _pd_flights_small
+
+    def ed_flights_small(self):
+        return _ed_flights_small
+
 
     def pd_ecommerce(self):
         return _pd_ecommerce
@@ -31,14 +31,5 @@ class TestDataFrameDescribe(TestData):
     # don't match the mapping types. This is mainly because the products field is
     # nested and so can be treated as a multi-field in ES, but not in pandas
-
-    def test_to_describe2(self):
-        pd_flights = self.pd_flights().head()
-        ed_flights = self.ed_flights().head()
-
-        pd_describe = pd_flights.describe()
-        # This fails as we can not run 'describe' on a truncated ed DataFrame
-        ed_describe = ed_flights.describe()
-
-        print(pd_describe)
-        print(ed_describe)
+    # We also can not run 'describe' on a truncated ed DataFrame
@@ -14,8 +14,8 @@ import numpy as np
 class TestDataFrameDrop(TestData):
 
     def test_drop1(self):
-        ed_flights = self.ed_flights()
-        pd_flights = self.pd_flights()
+        ed_flights_small = self.ed_flights_small()
+        pd_flights_small = self.pd_flights_small()
 
         # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
         # 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
@@ -24,33 +24,17 @@ class TestDataFrameDrop(TestData):
         # 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
         # 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
         # 'dayOfWeek', 'timestamp']
-        pd_col0 = pd_flights.drop(['Carrier', 'DestCityName'], axis=1)
-        pd_col1 = pd_flights.drop(columns=['Carrier', 'DestCityName'])
+        pd_col0 = pd_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
+        pd_col1 = pd_flights_small.drop(columns=['Carrier', 'DestCityName'])
 
-        ed_col0 = ed_flights.drop(['Carrier', 'DestCityName'], axis=1)
-        ed_col1 = ed_flights.drop(columns=['Carrier', 'DestCityName'])
+        ed_col0 = ed_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
+        ed_col1 = ed_flights_small.drop(columns=['Carrier', 'DestCityName'])
 
-        #assert_pandas_eland_frame_equal(pd_col0, ed_col0)
-        #assert_pandas_eland_frame_equal(pd_col1, ed_col1)
+        assert_pandas_eland_frame_equal(pd_col0, ed_col0)
+        assert_pandas_eland_frame_equal(pd_col1, ed_col1)
 
         # Drop rows by index
-        pd_idx0 = pd_flights.drop(['1', '2'])
-        ed_idx0 = ed_flights.drop(['1', '2'])
-
-        print(pd_idx0.info())
-        print(ed_idx0.info())
+        pd_idx0 = pd_flights_small.drop(['1', '2'])
+        ed_idx0 = ed_flights_small.drop(['1', '2'])
 
         assert_pandas_eland_frame_equal(pd_idx0, ed_idx0)
 
         """
         #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
         assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
         assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
         assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
         assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
         #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
         assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
         assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
         assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
         assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
         """
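Switching to the 48-row flights_small index makes drop-by-index testable: per the query-compiler notes above, df.drop(['1', '2']) maps to a must_not/ids clause, which only composes safely when no size limit is in play. A sketch of the raw query behind it (the client call is illustrative):

    from elasticsearch import Elasticsearch

    es = Elasticsearch('localhost:9200')  # host is an assumption

    # Raw query behind df.drop(['1', '2']), per the query-compiler notes.
    query = {'query': {'bool': {'must': [],
                                'must_not': [{'ids': {'values': ['1', '2']}}]}}}
    result = es.search(index='flights_small', body=query)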
@@ -30,3 +30,15 @@ class TestDataFrameHist(TestData):
         # Numbers are slightly different
         assert_almost_equal(pd_bins, ed_bins)
         assert_almost_equal(pd_weights, ed_weights)
+
+    def test_hist2(self):
+        pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+        ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+
+        num_bins = 10
+
+        ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
+
+        print(ed_bins)
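For comparison, the pandas-side bins and weights the earlier test checks against can be derived with numpy (a sketch; `_hist` is eland's internal helper and its exact return shape is not shown in this diff):

    import numpy as np
    import pandas as pd

    # Sketch: compute bins and weights for one column the numpy way,
    # to compare against eland's _hist output.
    s = pd.Series([10.0, 20.0, 20.0, 35.0, 50.0])  # made-up sample data
    weights, bin_edges = np.histogram(s, bins=10)
    print(bin_edges)   # 11 edges for 10 bins
    print(weights)     # counts per bin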
@@ -1,8 +1,11 @@
 # File called _pytest for PyCharm compatibility
 
-from eland.tests.common import TestData
+import gzip
+
+import pandas as pd
+
+import eland as ed
+from eland.tests.common import TestData
 
 
 class TestDataFrameReviews(TestData):
@@ -13,5 +16,16 @@ class TestDataFrameReviews(TestData):
         print(ed_reviews.head())
         print(ed_reviews.describe())
         print(ed_reviews.info())
-        print(ed_reviews.hist(column="rating", bins = 5))
-        #print(ed_reviews.head().info_es())
+        print(ed_reviews.hist(column="rating", bins=5))
+        # print(ed_reviews.head().info_es())
+
+    def test_review(self):
+        csv_handle = gzip.open('../anonreviews.csv.gz')
+
+        reviews = pd.read_csv(csv_handle)
+
+        reviews['date'] = pd.to_datetime(reviews['date'])
+
+        g = reviews.groupby('reviewerId')
+
+        print(g.describe())
eland/tests/dataframe/test_to_csv_pytest.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# File called _pytest for PyCharm compatibility
+
+import numpy as np
+import pandas as pd
+
+import eland as ed
+from eland.tests.common import ELASTICSEARCH_HOST
+from eland.tests.common import TestData
+
+
+class TestDataFrameToCSV(TestData):
+
+    def test_to_csv(self):
+        print("TODO")
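The test body is still a placeholder. A hedged sketch of the round-trip check it might eventually perform; to_csv support is exactly what this commit starts to build, so this is aspirational, not part of the commit:

    # Aspirational round-trip sketch - not in this commit.
    def test_to_csv_roundtrip(self):
        ed_flights_small = self.ed_flights_small()
        pd_flights_small = self.pd_flights_small()

        # Write the eland frame out in chunks, then read it back with pandas
        ed_flights_small.to_csv('flights_small.csv', chunksize=10)
        roundtrip = pd.read_csv('flights_small.csv', index_col=0)

        assert roundtrip.shape == pd_flights_small.shape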
Binary file not shown.

eland/tests/flights_small.json.gz (new binary file, not shown)

File diff suppressed because one or more lines are too long.
@@ -5,7 +5,7 @@ from eland.tests.common import TestData
 from matplotlib.testing.decorators import check_figures_equal
 
 @check_figures_equal(extensions=['png'])
-def test_plot(fig_test, fig_ref):
+def test_plot_hist(fig_test, fig_ref):
     test_data = TestData()
 
     pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
@@ -16,4 +16,3 @@ def test_plot_hist(fig_test, fig_ref):
 
     ed_ax = fig_test.subplots()
     ed_flights.hist(ax=ed_ax)
-
@@ -6,6 +6,7 @@ from eland.tests import *
 
 DATA_LIST = [
     (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
+    (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
 ]
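DATA_LIST pairs each source file with an index name and mapping; the setup script presumably iterates over it to recreate and bulk-load each index. A sketch of such a loader (the helper shape and one-JSON-document-per-line file format are assumptions):

    # Sketch of a loader driven by DATA_LIST - helper shape is an assumption.
    import gzip
    import json

    from elasticsearch import Elasticsearch, helpers

    def load_index(es, file_name, index_name, mapping):
        # Recreate the index with the supplied mapping, then bulk-load docs.
        es.indices.delete(index=index_name, ignore=[404])
        es.indices.create(index=index_name, body=mapping)
        with gzip.open(file_name) as f:
            # Assumes one JSON document per line in the gzipped file.
            docs = ({'_index': index_name, '_source': json.loads(line)} for line in f)
            helpers.bulk(es, docs)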