mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Fixing rename and truediv issues
tests pass TODO - implement additional orithmetic ops
This commit is contained in:
parent
c12bf9357b
commit
5d119215f8
@ -389,10 +389,10 @@ class DataFrame(NDFrame):
|
||||
<BLANKLINE>
|
||||
[27 rows x 5 columns]
|
||||
Operations:
|
||||
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
|
||||
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('field_names', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
|
||||
size: 5
|
||||
sort_params: _doc:desc
|
||||
columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
|
||||
field_names: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
|
||||
post_processing: ['sort_index']
|
||||
<BLANKLINE>
|
||||
"""
|
||||
|
@ -881,31 +881,58 @@ class Operations:
|
||||
left_field = item[1][1][1][0]
|
||||
right_field = item[1][1][1][1]
|
||||
|
||||
"""
|
||||
(if op_name = 'truediv')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / doc[right_field].value"
|
||||
}
|
||||
if isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = 'truediv')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / doc[right_field].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == 'truediv':
|
||||
op = '/'
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
|
||||
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
query_params['query_script_fields'][field_name] = {
|
||||
'script': {
|
||||
'source': source
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == 'truediv':
|
||||
op = '/'
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
"""
|
||||
(if op_name = 'truediv')
|
||||
|
||||
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
|
||||
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
query_params['query_script_fields'][field_name] = {
|
||||
'script': {
|
||||
'source': source
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / right_field"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == 'truediv':
|
||||
op = '/'
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field)
|
||||
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
query_params['query_script_fields'][field_name] = {
|
||||
'script': {
|
||||
'source': source
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return query_params, post_processing
|
||||
|
||||
|
@ -96,6 +96,7 @@ class ElandQueryCompiler:
|
||||
self._operations.set_field_names(columns)
|
||||
|
||||
columns = property(_get_columns, _set_columns)
|
||||
|
||||
index = property(_get_index)
|
||||
|
||||
@property
|
||||
@ -241,9 +242,9 @@ class ElandQueryCompiler:
|
||||
# _source may not contain all columns in the mapping
|
||||
# therefore, fill in missing columns
|
||||
# (note this returns self.columns NOT IN df.columns)
|
||||
missing_columns = list(set(self.columns) - set(df.columns))
|
||||
missing_field_names = list(set(self.field_names) - set(df.columns))
|
||||
|
||||
for missing in missing_columns:
|
||||
for missing in missing_field_names:
|
||||
is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
|
||||
df[missing] = pd.Series(dtype=pd_dtype)
|
||||
|
||||
@ -252,7 +253,8 @@ class ElandQueryCompiler:
|
||||
df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True)
|
||||
|
||||
# Sort columns in mapping order
|
||||
df = df[self.columns]
|
||||
if len(self.columns) > 1:
|
||||
df = df[self.columns]
|
||||
|
||||
return partial_result, df
|
||||
|
||||
@ -343,12 +345,14 @@ class ElandQueryCompiler:
|
||||
index_field=self._index.index_field, operations=self._operations.copy(),
|
||||
name_mapper=self._name_mapper.copy())
|
||||
|
||||
def rename(self, renames):
|
||||
result = self.copy()
|
||||
|
||||
result._name_mapper.rename_display_name(renames)
|
||||
|
||||
return result
|
||||
def rename(self, renames, inplace=False):
|
||||
if inplace:
|
||||
self._name_mapper.rename_display_name(renames)
|
||||
return self
|
||||
else:
|
||||
result = self.copy()
|
||||
result._name_mapper.rename_display_name(renames)
|
||||
return result
|
||||
|
||||
def head(self, n):
|
||||
result = self.copy()
|
||||
@ -503,10 +507,10 @@ class ElandQueryCompiler:
|
||||
"{0} != {1}".format(self._index_pattern, right._index_pattern)
|
||||
)
|
||||
|
||||
def arithmetic_op_fields(self, field_name, op, left_field, right_field):
|
||||
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field):
|
||||
result = self.copy()
|
||||
|
||||
result._operations.arithmetic_op_fields(field_name, op, left_field, right_field)
|
||||
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field)
|
||||
|
||||
return result
|
||||
|
||||
@ -547,10 +551,10 @@ class ElandQueryCompiler:
|
||||
self._field_to_display_names[field_name] = new_display_name
|
||||
|
||||
def field_names_to_list(self):
|
||||
return self._field_to_display_names.keys()
|
||||
return sorted(list(self._field_to_display_names.keys()))
|
||||
|
||||
def display_names_to_list(self):
|
||||
return self._display_to_field_names.keys()
|
||||
return sorted(list(self._display_to_field_names.keys()))
|
||||
|
||||
# Return mapper values as dict
|
||||
def display_names_mapper(self):
|
||||
@ -595,7 +599,7 @@ class ElandQueryCompiler:
|
||||
|
||||
def copy(self):
|
||||
return self.__constructor__(
|
||||
field_to_display_names=self._field_to_display_names,
|
||||
display_to_field_names = self._display_to_field_names
|
||||
field_to_display_names=self._field_to_display_names.copy(),
|
||||
display_to_field_names = self._display_to_field_names.copy()
|
||||
)
|
||||
|
||||
|
@ -18,6 +18,7 @@ Based on NDFrame which underpins eland.1DataFrame
|
||||
from io import StringIO
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from eland import NDFrame
|
||||
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
|
||||
@ -96,19 +97,58 @@ class Series(NDFrame):
|
||||
def _get_name(self):
|
||||
return self._query_compiler.columns[0]
|
||||
|
||||
name = property(_get_name)
|
||||
def _set_name(self, name):
|
||||
self._query_compiler.rename({self.name: name}, inplace=True)
|
||||
|
||||
name = property(_get_name, _set_name)
|
||||
|
||||
def rename(self, new_name):
|
||||
"""
|
||||
ONLY COLUMN rename supported
|
||||
Rename name of series. Only column rename is supported. This does not change the underlying
|
||||
Elasticsearch index, but adds a soft link from the new name (column) to the Elasticsearch field name
|
||||
|
||||
Parameters
|
||||
----------
|
||||
new_name
|
||||
new_name: str
|
||||
|
||||
Returns
|
||||
-------
|
||||
eland.Series
|
||||
eland.Series with new name.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:pandas_api_docs:pandas.Series.rename
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df.Carrier
|
||||
0 Kibana Airlines
|
||||
1 Logstash Airways
|
||||
2 Logstash Airways
|
||||
3 Kibana Airlines
|
||||
4 Kibana Airlines
|
||||
...
|
||||
13054 Logstash Airways
|
||||
13055 Logstash Airways
|
||||
13056 Logstash Airways
|
||||
13057 JetBeats
|
||||
13058 JetBeats
|
||||
Name: Carrier, Length: 13059, dtype: object
|
||||
>>> df.Carrier.rename('Airline')
|
||||
0 Kibana Airlines
|
||||
1 Logstash Airways
|
||||
2 Logstash Airways
|
||||
3 Kibana Airlines
|
||||
4 Kibana Airlines
|
||||
...
|
||||
13054 Logstash Airways
|
||||
13055 Logstash Airways
|
||||
13056 Logstash Airways
|
||||
13057 JetBeats
|
||||
13058 JetBeats
|
||||
Name: Airline, Length: 13059, dtype: object
|
||||
"""
|
||||
return Series(query_compiler=self._query_compiler.rename({self.name: new_name}))
|
||||
|
||||
@ -312,11 +352,25 @@ class Series(NDFrame):
|
||||
# Check compatibility
|
||||
self._query_compiler.check_arithmetics(right._query_compiler)
|
||||
|
||||
field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
|
||||
|
||||
# Compatible, so create new Series
|
||||
return Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
field_name, 'truediv', self.name, right.name))
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, 'truediv', self.name, right.name))
|
||||
series.name = None
|
||||
|
||||
return series
|
||||
elif isinstance(right, (int, float)): # TODO extend to numpy types
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_'))
|
||||
|
||||
# Compatible, so create new Series
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, 'truediv', self.name, float(right))) # force rhs to float
|
||||
|
||||
# name of Series remains original name
|
||||
series.name = self.name
|
||||
|
||||
return series
|
||||
else:
|
||||
raise TypeError(
|
||||
"Can only perform arithmetic operation on selected types "
|
||||
|
@ -53,23 +53,23 @@ class TestQueryCompilerRename(TestData):
|
||||
update_A = {'a' : 'A'}
|
||||
mapper.rename_display_name(update_A)
|
||||
|
||||
assert display_names == mapper.display_names(columns)
|
||||
assert display_names == mapper.field_to_display_names(columns)
|
||||
|
||||
# Invalid update
|
||||
display_names = ['A', 'b', 'c', 'd']
|
||||
update_ZZ = {'a' : 'ZZ'}
|
||||
mapper.rename_display_name(update_ZZ)
|
||||
|
||||
assert display_names == mapper.display_names(columns)
|
||||
assert display_names == mapper.field_to_display_names(columns)
|
||||
|
||||
display_names = ['AA', 'b', 'c', 'd']
|
||||
update_AA = {'A' : 'AA'} # already renamed to 'A'
|
||||
mapper.rename_display_name(update_AA)
|
||||
|
||||
assert display_names == mapper.display_names(columns)
|
||||
assert display_names == mapper.field_to_display_names(columns)
|
||||
|
||||
display_names = ['AA', 'b', 'C', 'd']
|
||||
update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored
|
||||
mapper.rename_display_name(update_AA_C)
|
||||
|
||||
assert display_names == mapper.display_names(columns)
|
||||
assert display_names == mapper.field_to_display_names(columns)
|
||||
|
@ -20,10 +20,7 @@ class TestSeriesArithmetics(TestData):
|
||||
ed_df = self.ed_ecommerce()
|
||||
|
||||
pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price']
|
||||
print(pd_avg_price) # this has None as name
|
||||
|
||||
ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price']
|
||||
print(ed_avg_price)
|
||||
|
||||
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
|
||||
|
||||
@ -32,19 +29,15 @@ class TestSeriesArithmetics(TestData):
|
||||
ed_df = self.ed_ecommerce()
|
||||
|
||||
pd_avg_price = pd_df['total_quantity'] / 10.0
|
||||
print(pd_avg_price)
|
||||
|
||||
ed_avg_price = ed_df['total_quantity'] / 10.0
|
||||
print(ed_avg_price)
|
||||
|
||||
def test_ecommerce_series_div_other(self):
|
||||
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
|
||||
|
||||
def test_ecommerce_series_div_int(self):
|
||||
pd_df = self.pd_ecommerce()
|
||||
ed_df = self.ed_ecommerce()
|
||||
|
||||
ed_s1 = ed_df.total_quantity
|
||||
ed_s2 = ed_df.taxful_total_price
|
||||
pd_avg_price = pd_df['total_quantity'] / int(10)
|
||||
ed_avg_price = ed_df['total_quantity'] / int(10)
|
||||
|
||||
print(ed_s1)
|
||||
print(ed_s2)
|
||||
|
||||
print(ed_s1)
|
||||
print(ed_s2)
|
||||
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
|
||||
|
32
eland/tests/series/test_name_pytest.py
Normal file
32
eland/tests/series/test_name_pytest.py
Normal file
@ -0,0 +1,32 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import eland as ed
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_INDEX_NAME
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_series_equal
|
||||
|
||||
|
||||
class TestSeriesName(TestData):
|
||||
|
||||
def test_name(self):
|
||||
# deep copy pandas DataFrame as .name alters this reference frame
|
||||
pd_series = self.pd_flights()['Carrier'].copy(deep=True)
|
||||
ed_series = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier')
|
||||
|
||||
assert_pandas_eland_series_equal(pd_series, ed_series)
|
||||
assert ed_series.name == pd_series.name
|
||||
|
||||
pd_series.name = "renamed1"
|
||||
ed_series.name = "renamed1"
|
||||
|
||||
assert_pandas_eland_series_equal(pd_series, ed_series)
|
||||
assert ed_series.name == pd_series.name
|
||||
|
||||
pd_series.name = "renamed2"
|
||||
ed_series.name = "renamed2"
|
||||
|
||||
assert_pandas_eland_series_equal(pd_series, ed_series)
|
||||
assert ed_series.name == pd_series.name
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user