import copy
from enum import Enum

import pandas as pd

from eland import Index
from eland import Query


class Operations:
    """
    A collector of the queries and selectors we apply to queries to return the appropriate results.

    For example,
    - a list of the columns in the DataFrame (a subset of columns in the index)
    - a size limit on the results (e.g. for head(n=5))
    - a query to filter the results (e.g. df.A > 10)

    This is maintained as a 'task graph' (inspired by dask)
    (see https://docs.dask.org/en/latest/spec.html)
    """

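    # A hedged illustration (not from the original source) of what the task list can
    # look like after chained calls such as df[['A', 'B']].head(5); '_doc' stands in
    # for whatever Index.sort_field happens to be:
    #
    #   [('columns', ['A', 'B']), ('head', ('_doc', 5))]
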
    class SortOrder(Enum):
        ASC = 0
        DESC = 1

        @staticmethod
        def reverse(order):
            if order == Operations.SortOrder.ASC:
                return Operations.SortOrder.DESC

            return Operations.SortOrder.ASC

        @staticmethod
        def to_string(order):
            if order == Operations.SortOrder.ASC:
                return "asc"

            return "desc"

        @staticmethod
        def from_string(order):
            if order == "asc":
                return Operations.SortOrder.ASC

            return Operations.SortOrder.DESC

    def __init__(self, tasks=None):
        if tasks is None:
            self._tasks = []
        else:
            self._tasks = tasks

    def __constructor__(self, *args, **kwargs):
        return type(self)(*args, **kwargs)

    def copy(self):
        return self.__constructor__(tasks=copy.deepcopy(self._tasks))

    def head(self, index, n):
        # Add a task that is an ascending sort with size=n
        task = ('head', (index.sort_field, n))
        self._tasks.append(task)

    def tail(self, index, n):
        # Add a task that is a descending sort with size=n
        task = ('tail', (index.sort_field, n))
        self._tasks.append(task)

    def set_columns(self, columns):
        # Setting columns at different phases of the task list may result in different
        # operations. So instead of setting columns once, set when it happens in call chain
        if type(columns) is not list:
            columns = list(columns)

        # TODO - column renaming
        # TODO - validate we are setting columns to a subset of last columns?
        task = ('columns', columns)
        self._tasks.append(task)
        # Iterate backwards through task list looking for last 'columns' task
        for task in reversed(self._tasks):
            if task[0] == 'columns':
                return task[1]
        return None

    def get_columns(self):
        # Iterate backwards through task list looking for last 'columns' task
        for task in reversed(self._tasks):
            if task[0] == 'columns':
                return task[1]
        return None

    def __repr__(self):
        return repr(self._tasks)

    def count(self, query_compiler):
        query_params, post_processing = self._resolve_tasks()

        # Elasticsearch _count is very efficient and so is used to return results here. This means that
        # data frames that have restricted size or sort params will not return valid results
        # (_count doesn't support size).
        # Longer term we may fall back to pandas, but this may result in loading the entire index into memory.
        if self._size(query_params, post_processing) is not None:
            raise NotImplementedError("Requesting count with additional query and processing parameters "
                                      "not supported {0} {1}"
                                      .format(query_params, post_processing))

        # Only return requested columns
        fields = query_compiler.columns

        counts = {}
        for field in fields:
            body = Query(query_params['query'])
            body.exists(field, must=True)

            field_exists_count = query_compiler._client.count(index=query_compiler._index_pattern,
                                                              body=body.to_count_body())
            counts[field] = field_exists_count

        return pd.Series(data=counts, index=fields)

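    # A rough sketch of the per-field count body built above. The exact nesting comes
    # from Query.exists()/to_count_body(), so treat this as an assumption; the field
    # name is illustrative only:
    #
    #   POST <index_pattern>/_count
    #   {"query": {"bool": {"must": [{"exists": {"field": "AvgTicketPrice"}}]}}}
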
    def mean(self, query_compiler):
        return self._metric_aggs(query_compiler, 'avg')

    def sum(self, query_compiler):
        return self._metric_aggs(query_compiler, 'sum')

    def max(self, query_compiler):
        return self._metric_aggs(query_compiler, 'max')

    def min(self, query_compiler):
        return self._metric_aggs(query_compiler, 'min')

    def nunique(self, query_compiler):
        return self._terms_aggs(query_compiler, 'cardinality')

    def hist(self, query_compiler, bins):
        return self._hist_aggs(query_compiler, bins)

    def _metric_aggs(self, query_compiler, func):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()

        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)

        body = Query(query_params['query'])

        for field in numeric_source_fields:
            body.metric_aggs(field, func, field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        # Results are of the form
        # "aggregations" : {
        #   "AvgTicketPrice" : {
        #     "value" : 628.2536888148849
        #   }
        # }
        results = {}

        for field in numeric_source_fields:
            results[field] = response['aggregations'][field]['value']

        # Return single value if this is a series
        # if len(numeric_source_fields) == 1:
        #     return np.float64(results[numeric_source_fields[0]])

        s = pd.Series(data=results, index=numeric_source_fields)

        return s

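    # A minimal sketch of the aggregation request built above for func='avg'. The
    # literal structure depends on Query.metric_aggs()/to_search_body(), so this is an
    # assumption; note the agg is named after the field, which is why the response is
    # read back as response['aggregations'][field]['value']:
    #
    #   {"size": 0, "aggs": {"AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}}}}
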
    def _terms_aggs(self, query_compiler, func):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()
        if columns is None:
            columns = query_compiler._mappings.source_fields()

        body = Query(query_params['query'])

        for field in columns:
            body.metric_aggs(field, func, field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in columns:
            results[field] = response['aggregations'][field]['value']

        s = pd.Series(data=results, index=columns)

        return s

    def _hist_aggs(self, query_compiler, num_bins):
        # Get histogram bins and weights for numeric columns
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()

        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)

        body = Query(query_params['query'])

        min_aggs = self._metric_aggs(query_compiler, 'min')
        max_aggs = self._metric_aggs(query_compiler, 'max')

        for field in numeric_source_fields:
            body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        # results are like
        # "aggregations" : {
        #   "DistanceKilometers" : {
        #     "buckets" : [
        #       {
        #         "key" : 0.0,
        #         "doc_count" : 2956
        #       },
        #       {
        #         "key" : 1988.1482421875,
        #         "doc_count" : 768
        #       },
        #       ...

        bins = {}
        weights = {}

        # There is one more bin than weights
        # len(bins) = len(weights) + 1

        # bins = [ 0. 36. 72. 108. 144. 180. 216. 252. 288. 324. 360.]
        # len(bins) == 11
        # weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 394.]
        # len(weights) == 10

        # ES returns
        # weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
        # So sum last 2 buckets
        for field in numeric_source_fields:
            buckets = response['aggregations'][field]['buckets']

            bins[field] = []
            weights[field] = []

            for bucket in buckets:
                bins[field].append(bucket['key'])

                if bucket == buckets[-1]:
                    weights[field][-1] += bucket['doc_count']
                else:
                    weights[field].append(bucket['doc_count'])

        df_bins = pd.DataFrame(data=bins)
        df_weights = pd.DataFrame(data=weights)

        return df_bins, df_weights

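    # A hedged usage sketch: the bins/weights pair above is shaped for weighted
    # histograms (one more bin edge than weight). Assuming matplotlib, an Operations
    # instance 'ops' and a column named 'DistanceKilometers':
    #
    #   df_bins, df_weights = ops.hist(query_compiler, 10)
    #   plt.hist(df_bins['DistanceKilometers'][:-1],
    #            bins=df_bins['DistanceKilometers'],
    #            weights=df_weights['DistanceKilometers'])
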
    @staticmethod
    def _map_pd_aggs_to_es_aggs(pd_aggs):
        """
        Args:
            pd_aggs - list of pandas aggs (e.g. ['mad', 'min', 'std'] etc.)

        Returns:
            ed_aggs - list of corresponding es_aggs
            (e.g. ['median_absolute_deviation', 'min', ('extended_stats', 'std_deviation')])

        Pandas supports a lot of options here, and these options generally work on text and numerics in pandas.
        Elasticsearch has metric aggs and terms aggs so will have different behaviour.

        Pandas aggs that return columns (as opposed to transformed rows):

        all
        any
        count
        mad
        max
        mean
        median
        min
        mode
        quantile
        rank
        sem
        skew
        sum
        std
        var
        nunique
        """
        ed_aggs = []
        for pd_agg in pd_aggs:
            if pd_agg == 'count':
                ed_aggs.append('count')
            elif pd_agg == 'mad':
                ed_aggs.append('median_absolute_deviation')
            elif pd_agg == 'max':
                ed_aggs.append('max')
            elif pd_agg == 'mean':
                ed_aggs.append('avg')
            elif pd_agg == 'median':
                ed_aggs.append(('percentiles', '50.0'))
            elif pd_agg == 'min':
                ed_aggs.append('min')
            elif pd_agg == 'mode':
                # We could do this via top term
                raise NotImplementedError("{} not currently implemented".format(pd_agg))
            elif pd_agg == 'quantile':
                # TODO
                raise NotImplementedError("{} not currently implemented".format(pd_agg))
            elif pd_agg == 'rank':
                # TODO
                raise NotImplementedError("{} not currently implemented".format(pd_agg))
            elif pd_agg == 'sem':
                # TODO
                raise NotImplementedError("{} not currently implemented".format(pd_agg))
            elif pd_agg == 'sum':
                ed_aggs.append('sum')
            elif pd_agg == 'std':
                ed_aggs.append(('extended_stats', 'std_deviation'))
            elif pd_agg == 'var':
                ed_aggs.append(('extended_stats', 'variance'))
            else:
                raise NotImplementedError("{} not currently implemented".format(pd_agg))

        # TODO - we could optimise extended_stats here: if we have both 'count' and 'std',
        # a single extended_stats call would return both in one request
        return ed_aggs

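    # For example, following the branches above:
    #
    #   Operations._map_pd_aggs_to_es_aggs(['mean', 'std', 'median'])
    #   -> ['avg', ('extended_stats', 'std_deviation'), ('percentiles', '50.0')]
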
    def aggs(self, query_compiler, pd_aggs):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()

        body = Query(query_params['query'])

        # convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        for field in columns:
            for es_agg in es_aggs:
                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(es_agg[0] + '_' + field, es_agg[0], field)
                else:
                    body.metric_aggs(es_agg + '_' + field, es_agg, field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        """
        Results are like (for 'sum', 'min')

             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
        min    1.000205e+02        0.000000e+00   0.000000e+00               0
        """
        results = {}

        for field in columns:
            values = list()
            for es_agg in es_aggs:
                if isinstance(es_agg, tuple):
                    values.append(response['aggregations'][es_agg[0] + '_' + field][es_agg[1]])
                else:
                    values.append(response['aggregations'][es_agg + '_' + field]['value'])

            results[field] = values

        df = pd.DataFrame(data=results, index=pd_aggs)

        return df

    def describe(self, query_compiler):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()

        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        body = Query(query_params['query'])

        for field in numeric_source_fields:
            body.metric_aggs('extended_stats_' + field, 'extended_stats', field)
            body.metric_aggs('percentiles_' + field, 'percentiles', field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in numeric_source_fields:
            values = list()
            values.append(response['aggregations']['extended_stats_' + field]['count'])
            values.append(response['aggregations']['extended_stats_' + field]['avg'])
            values.append(response['aggregations']['extended_stats_' + field]['std_deviation'])
            values.append(response['aggregations']['extended_stats_' + field]['min'])
            values.append(response['aggregations']['percentiles_' + field]['values']['25.0'])
            values.append(response['aggregations']['percentiles_' + field]['values']['50.0'])
            values.append(response['aggregations']['percentiles_' + field]['values']['75.0'])
            values.append(response['aggregations']['extended_stats_' + field]['max'])

            # if not None
            if values.count(None) < len(values):
                results[field] = values

        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

        return df

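    # The aggregations requested above come back keyed by the names we gave them,
    # which is how the values list is assembled. Illustrative (not real) numbers:
    #
    #   "extended_stats_AvgTicketPrice": {"count": 13059, "avg": 628.25,
    #                                     "std_deviation": 266.39, "min": 100.02, "max": 1199.72, ...}
    #   "percentiles_AvgTicketPrice": {"values": {"25.0": 410.0, "50.0": 640.4, "75.0": 842.3, ...}}
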
    def to_pandas(self, query_compiler):
        class PandasDataFrameCollector:
            def collect(self, df):
                self.df = df

            def batch_size(self):
                return None

        collector = PandasDataFrameCollector()

        self._es_results(query_compiler, collector)

        return collector.df

    def to_csv(self, query_compiler, **kwargs):
        class PandasToCSVCollector:
            def __init__(self, **kwargs):
                self.kwargs = kwargs
                self.ret = None
                self.first_time = True

            def collect(self, df):
                # The first time we collect results we write the header;
                # afterwards we append the results without a header
                if self.first_time:
                    self.first_time = False
                    df.to_csv(**self.kwargs)
                else:
                    # Don't write header, and change mode to append
                    self.kwargs['header'] = False
                    self.kwargs['mode'] = 'a'
                    df.to_csv(**self.kwargs)

            def batch_size(self):
                # By default read 10000 docs to csv
                batch_size = 10000
                return batch_size

        collector = PandasToCSVCollector(**kwargs)

        self._es_results(query_compiler, collector)

        return collector.ret

    def _es_results(self, query_compiler, collector):
        query_params, post_processing = self._resolve_tasks()

        size, sort_params = Operations._query_params_to_size_and_sort(query_params)

        body = Query(query_params['query'])

        # Only return requested columns
        columns = self.get_columns()

        es_results = None

        # If size=None use scan not search - then post sort results when in df
        # If size>10000 use scan
        is_scan = False
        if size is not None and size <= 10000:
            if size > 0:
                es_results = query_compiler._client.search(
                    index=query_compiler._index_pattern,
                    size=size,
                    sort=sort_params,
                    body=body.to_search_body(),
                    _source=columns)
        else:
            is_scan = True
            es_results = query_compiler._client.scan(
                index=query_compiler._index_pattern,
                query=body.to_search_body(),
                _source=columns)
            # create post sort
            if sort_params is not None:
                post_processing.append(self._sort_params_to_postprocessing(sort_params))

        if is_scan:
            while True:
                partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size())
                df = self._apply_df_post_processing(df, post_processing)
                collector.collect(df)
                if not partial_result:
                    break
        else:
            partial_result, df = query_compiler._es_results_to_pandas(es_results)
            df = self._apply_df_post_processing(df, post_processing)
            collector.collect(df)

    def iloc(self, index, columns):
        # index and columns are indexers
        task = ('iloc', (index, columns))
        self._tasks.append(task)

    def squeeze(self, axis):
        task = ('squeeze', axis)
        self._tasks.append(task)

    def index_count(self, query_compiler, field):
        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)

        # Size is dictated by operations
        if size is not None:
            # TODO - this is not necessarily valid as the field may not exist in ALL these docs
            return size

        body = Query(query_params['query'])
        body.exists(field, must=True)

        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())

    def _validate_index_operation(self, items):
        if not isinstance(items, list):
            raise TypeError("list item required - not {}".format(type(items)))

        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)

        # Size is dictated by operations
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        return query_params, post_processing

    def index_matches_count(self, query_compiler, field, items):
        query_params, post_processing = self._validate_index_operation(items)

        body = Query(query_params['query'])

        if field == Index.ID_INDEX_FIELD:
            body.ids(items, must=True)
        else:
            body.terms(field, items, must=True)

        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())

    def drop_index_values(self, query_compiler, field, items):
        self._validate_index_operation(items)

        # Putting boolean queries together
        # i = 10
        # not i = 20
        # _id in [1,2,3]
        # _id not in [1,2,3]
        # a in ['a','b','c']
        # b not in ['a','b','c']
        # For now use term queries
        if field == Index.ID_INDEX_FIELD:
            task = ('query_ids', ('must_not', items))
        else:
            task = ('query_terms', ('must_not', (field, items)))
        self._tasks.append(task)

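    # A hedged sketch of the Elasticsearch clause the 'query_terms'/'must_not' task
    # above ultimately contributes (the exact nesting is built by the Query class, so
    # treat this as an assumption):
    #
    #   {"bool": {"must_not": [{"terms": {"<field>": [<items>]}}]}}
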
    @staticmethod
    def _sort_params_to_postprocessing(input):
        # Split string
        sort_params = input.split(":")

        query_sort_field = sort_params[0]
        query_sort_order = Operations.SortOrder.from_string(sort_params[1])

        task = ('sort_field', (query_sort_field, query_sort_order))

        return task

    @staticmethod
    def _query_params_to_size_and_sort(query_params):
        sort_params = None
        if query_params['query_sort_field'] and query_params['query_sort_order']:
            sort_params = query_params['query_sort_field'] + ":" + Operations.SortOrder.to_string(
                query_params['query_sort_order'])

        size = query_params['query_size']

        return size, sort_params

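    # For example, query_sort_field='timestamp' with SortOrder.DESC yields the sort
    # string "timestamp:desc", which is exactly what _sort_params_to_postprocessing
    # splits back apart.
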
    @staticmethod
    def _count_post_processing(post_processing):
        size = None
        for action in post_processing:
            if action[0] == 'head' or action[0] == 'tail':
                if size is None or action[1][1] < size:
                    size = action[1][1]

        return size

    @staticmethod
    def _apply_df_post_processing(df, post_processing):
        for action in post_processing:
            if action == 'sort_index':
                df = df.sort_index()
            elif action[0] == 'head':
                df = df.head(action[1][1])
            elif action[0] == 'tail':
                df = df.tail(action[1][1])
            elif action[0] == 'sort_field':
                sort_field = action[1][0]
                sort_order = action[1][1]
                if sort_order == Operations.SortOrder.ASC:
                    df = df.sort_values(sort_field, ascending=True)
                else:
                    df = df.sort_values(sort_field, ascending=False)
            elif action[0] == 'iloc':
                index_indexer = action[1][0]
                column_indexer = action[1][1]
                if index_indexer is None:
                    index_indexer = slice(None)
                if column_indexer is None:
                    column_indexer = slice(None)
                df = df.iloc[index_indexer, column_indexer]
            elif action[0] == 'squeeze':
                df = df.squeeze(axis=action[1])
            # columns could be in here (and we ignore it)

        return df

    def _resolve_tasks(self):
        # We now try and combine all tasks into an Elasticsearch query.
        # Some operations can simply be combined into a single query;
        # others require pre-queries and then combinations;
        # others require in-core post-processing of results.
        query_params = {"query_sort_field": None,
                        "query_sort_order": None,
                        "query_size": None,
                        "query_fields": None,
                        "query": Query()}

        post_processing = []

        for task in self._tasks:
            if task[0] == 'head':
                query_params, post_processing = self._resolve_head(task, query_params, post_processing)
            elif task[0] == 'tail':
                query_params, post_processing = self._resolve_tail(task, query_params, post_processing)
            elif task[0] == 'iloc':
                query_params, post_processing = self._resolve_iloc(task, query_params, post_processing)
            elif task[0] == 'query_ids':
                query_params, post_processing = self._resolve_query_ids(task, query_params, post_processing)
            elif task[0] == 'query_terms':
                query_params, post_processing = self._resolve_query_terms(task, query_params, post_processing)
            elif task[0] == 'boolean_filter':
                query_params, post_processing = self._resolve_boolean_filter(task, query_params, post_processing)
            else:  # a lot of operations simply post-process the dataframe - put these straight through
                query_params, post_processing = self._resolve_post_processing_task(task, query_params, post_processing)

        return query_params, post_processing

    def _resolve_head(self, item, query_params, post_processing):
        # head - sort asc, size n
        # |12345-------------|
        query_sort_field = item[1][0]
        query_sort_order = Operations.SortOrder.ASC
        query_size = item[1][1]

        # If we are already postprocessing the query results, we just get 'head' of these
        # (note, currently we just append another head, we don't optimise by
        # overwriting previous head)
        if len(post_processing) > 0:
            post_processing.append(item)
            return query_params, post_processing

        if query_params['query_sort_field'] is None:
            query_params['query_sort_field'] = query_sort_field
        # if it is already sorted we use existing field

        if query_params['query_sort_order'] is None:
            query_params['query_sort_order'] = query_sort_order
        # if it is already sorted we get head of existing order

        if query_params['query_size'] is None:
            query_params['query_size'] = query_size
        else:
            # truncate if head is smaller
            if query_size < query_params['query_size']:
                query_params['query_size'] = query_size

        return query_params, post_processing

    def _resolve_tail(self, item, query_params, post_processing):
        # tail - sort desc, size n, post-process sort asc
        # |-------------12345|
        query_sort_field = item[1][0]
        query_sort_order = Operations.SortOrder.DESC
        query_size = item[1][1]

        # If this is a tail of a tail adjust settings and return
        if query_params['query_size'] is not None and \
                query_params['query_sort_order'] == query_sort_order and \
                post_processing == ['sort_index']:
            if query_size < query_params['query_size']:
                query_params['query_size'] = query_size
            return query_params, post_processing

        # If we are already postprocessing the query results, just get 'tail' of these
        # (note, currently we just append another tail, we don't optimise by
        # overwriting previous tail)
        if len(post_processing) > 0:
            post_processing.append(item)
            return query_params, post_processing

        # If results are already constrained, just get 'tail' of these
        # (note, currently we just append another tail, we don't optimise by
        # overwriting previous tail)
        if query_params['query_size'] is not None:
            post_processing.append(item)
            return query_params, post_processing
        else:
            query_params['query_size'] = query_size
            if query_params['query_sort_field'] is None:
                query_params['query_sort_field'] = query_sort_field
            if query_params['query_sort_order'] is None:
                query_params['query_sort_order'] = query_sort_order
            else:
                # reverse sort order
                query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)

        post_processing.append('sort_index')

        return query_params, post_processing

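    # A sketch of how a chained head/tail resolves (assuming a sort field of '_doc'):
    #
    #   tasks: [('head', ('_doc', 10)), ('tail', ('_doc', 3))]
    #
    # _resolve_head sets query_size=10 with an ascending sort on '_doc'; _resolve_tail
    # then sees the size is already constrained, leaves the query untouched and appends
    # the ('tail', ...) task to post_processing, i.e. df.tail(3) runs client-side.
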
    def _resolve_iloc(self, item, query_params, post_processing):
        # iloc - select rows/columns by integer position, post-process the rest
        # |---4--7-9---------|

        # This is a list of items we return via an integer index
        int_index = item[1][0]
        if int_index is not None:
            last_item = int_index.max()

            # If we have a query_size we do this post processing
            if query_params['query_size'] is not None:
                post_processing.append(item)
                return query_params, post_processing

            # size should be > last item
            query_params['query_size'] = last_item + 1
        post_processing.append(item)

        return query_params, post_processing

    def _resolve_query_ids(self, item, query_params, post_processing):
        # task = ('query_ids', ('must_not', items))
        must_clause = item[1][0]
        ids = item[1][1]

        if must_clause == 'must':
            query_params['query'].ids(ids, must=True)
        else:
            query_params['query'].ids(ids, must=False)

        return query_params, post_processing

    def _resolve_query_terms(self, item, query_params, post_processing):
        # task = ('query_terms', ('must_not', (field, terms)))
        must_clause = item[1][0]
        field = item[1][1][0]
        terms = item[1][1][1]

        if must_clause == 'must':
            query_params['query'].terms(field, terms, must=True)
        else:
            query_params['query'].terms(field, terms, must=False)

        return query_params, post_processing

    def _resolve_boolean_filter(self, item, query_params, post_processing):
        # task = ('boolean_filter', object)
        boolean_filter = item[1]

        query_params['query'].update_boolean_filter(boolean_filter)

        return query_params, post_processing

    def _resolve_post_processing_task(self, item, query_params, post_processing):
        # Just do this in post-processing
        if item[0] != 'columns':
            post_processing.append(item)

        return query_params, post_processing

    def _size(self, query_params, post_processing):
        # Determine the effective size (limit), taking into account both the query
        # parameters and any head/tail post-processing actions
        size = query_params['query_size']  # can be None

        pp_size = self._count_post_processing(post_processing)
        if pp_size is not None:
            if size is not None:
                size = min(size, pp_size)
            else:
                size = pp_size

        # This can return None
        return size

    def info_es(self, buf):
        buf.write("Operations:\n")
        buf.write("\ttasks: {0}\n".format(self._tasks))

        query_params, post_processing = self._resolve_tasks()
        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
        columns = self.get_columns()

        buf.write("\tsize: {0}\n".format(size))
        buf.write("\tsort_params: {0}\n".format(sort_params))
        buf.write("\tcolumns: {0}\n".format(columns))
        buf.write("\tpost_processing: {0}\n".format(post_processing))

    def update_query(self, boolean_filter):
        task = ('boolean_filter', boolean_filter)
        self._tasks.append(task)

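    # A hedged sketch of a boolean_filter argument; the real object is produced by the
    # filtering layer (e.g. by an expression like df[df.AvgTicketPrice > 500]), so the
    # exact shape here is an assumption, not the library's guaranteed format:
    #
    #   ops.update_query({"range": {"AvgTicketPrice": {"gt": 500}}})
    #
    # _resolve_boolean_filter later merges this into the query via
    # Query.update_boolean_filter().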