mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Add NDFrame.median() aggregation
This commit is contained in:
parent
7a1c636e56
commit
e8f307d2e0
@ -252,11 +252,95 @@ class NDFrame(ABC):
|
||||
return self._query_compiler.min(numeric_only=numeric_only)
|
||||
|
||||
def var(self, numeric_only=True):
    """
    Return variance for each numeric column

    Parameters
    ----------
    numeric_only: bool, default True
        Passed through unchanged to the underlying query compiler's ``var``.

    Returns
    -------
    pandas.Series
        The value of the variance for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.var`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.var()  # doctest: +SKIP
    AvgTicketPrice        7.096185e+04
    Cancelled             1.119831e-01
    DistanceKilometers    2.096049e+07
    DistanceMiles         8.092892e+06
    FlightDelay           1.880825e-01
    FlightDelayMin        9.359209e+03
    FlightTimeHour        3.112545e+01
    FlightTimeMin         1.120516e+05
    dayOfWeek             3.761135e+00
    dtype: float64
    """
    # All the work is delegated; this method only forwards the flag.
    return self._query_compiler.var(numeric_only=numeric_only)
|
||||
|
||||
def std(self, numeric_only=True):
    """
    Return standard deviation for each numeric column

    Parameters
    ----------
    numeric_only: bool, default True
        Passed through unchanged to the underlying query compiler's ``std``.

    Returns
    -------
    pandas.Series
        The value of the standard deviation for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.std`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.std()  # doctest: +SKIP
    AvgTicketPrice         266.386661
    Cancelled                0.334639
    DistanceKilometers    4578.263193
    DistanceMiles         2844.800855
    FlightDelay              0.433685
    FlightDelayMin          96.743006
    FlightTimeHour           5.579019
    FlightTimeMin          334.741135
    dayOfWeek                1.939365
    dtype: float64
    """
    # All the work is delegated; this method only forwards the flag.
    return self._query_compiler.std(numeric_only=numeric_only)
|
||||
|
||||
def median(self, numeric_only=True):
    """
    Return the median value for each numeric column

    Parameters
    ----------
    numeric_only: bool, default True
        Passed through unchanged to the underlying query compiler's
        ``median``.

    Returns
    -------
    pandas.Series
        median value for each numeric column

    Notes
    -----
    The operations layer requests the value through a ``percentiles``
    aggregation at ``50.0``. Elasticsearch documents percentile
    aggregations as approximate, so results may differ slightly from
    ``pandas.DataFrame.median``.

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.median`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.median()  # doctest: +SKIP
    AvgTicketPrice         640.387285
    Cancelled                0.000000
    DistanceKilometers    7612.072403
    DistanceMiles         4729.922470
    FlightDelay              0.000000
    FlightDelayMin           0.000000
    FlightTimeHour           8.383113
    FlightTimeMin          503.148975
    dayOfWeek                3.000000
    dtype: float64
    """
    # All the work is delegated; this method only forwards the flag.
    return self._query_compiler.median(numeric_only=numeric_only)
|
||||
|
||||
def max(self, numeric_only=True):
|
||||
"""
|
||||
Return the maximum value for each numeric column
|
||||
|
@ -135,6 +135,11 @@ class Operations:
|
||||
numeric_only=numeric_only,
|
||||
)
|
||||
|
||||
def median(self, query_compiler, numeric_only=True):
    """
    Compute the median through the shared metric-aggregation helper.

    The aggregation is given as the tuple ``("percentiles", "50.0")``:
    the first element names the aggregation and the second is the key
    under which the 50th-percentile value is read from the response.

    Parameters
    ----------
    query_compiler:
        Forwarded unchanged to ``_metric_aggs``.
    numeric_only: bool, default True
        Forwarded unchanged to ``_metric_aggs``.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for the percentiles request.
    """
    percentile_spec = ("percentiles", "50.0")
    return self._metric_aggs(
        query_compiler, percentile_spec, numeric_only=numeric_only
    )
|
||||
|
||||
def sum(self, query_compiler, numeric_only=True):
    """
    Compute the ``"sum"`` aggregation through the shared metric helper.

    Parameters
    ----------
    query_compiler:
        Forwarded unchanged to ``_metric_aggs``.
    numeric_only: bool, default True
        Forwarded unchanged to ``_metric_aggs``.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for the ``"sum"`` request.
    """
    return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
|
||||
|
||||
@ -275,9 +280,14 @@ class Operations:
|
||||
)
|
||||
else:
|
||||
if isinstance(func, tuple):
|
||||
results[field] = response["aggregations"][
|
||||
func[0] + "_" + field
|
||||
][func[1]]
|
||||
if func[0] == "percentiles":
|
||||
results[field] = response["aggregations"][
|
||||
"percentiles_" + field
|
||||
]["values"]["50.0"]
|
||||
else:
|
||||
results[field] = response["aggregations"][
|
||||
func[0] + "_" + field
|
||||
][func[1]]
|
||||
else:
|
||||
results[field] = response["aggregations"][field]["value"]
|
||||
|
||||
|
@ -469,6 +469,9 @@ class QueryCompiler:
|
||||
def std(self, numeric_only=None):
    """
    Delegate the standard-deviation aggregation to the operations object.

    This compiler passes itself as the first argument, and
    ``numeric_only`` (default ``None``) is forwarded unchanged.
    """
    return self._operations.std(self, numeric_only=numeric_only)
|
||||
|
||||
def median(self, numeric_only=None):
    """
    Delegate the median aggregation to the operations object.

    This compiler passes itself as the first argument, and
    ``numeric_only`` (default ``None``) is forwarded unchanged.
    """
    return self._operations.median(self, numeric_only=numeric_only)
|
||||
|
||||
def sum(self, numeric_only=None):
    """
    Delegate the sum aggregation to the operations object.

    This compiler passes itself as the first argument, and
    ``numeric_only`` (default ``None``) is forwarded unchanged.
    """
    return self._operations.sum(self, numeric_only=numeric_only)
|
||||
|
||||
|
@ -12,16 +12,16 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# File called _pytest for PyCharm compatability
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_almost_equal
|
||||
from pandas.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
funcs = ["max", "min", "mean", "sum"]
|
||||
extended_funcs = ["var", "std"]
|
||||
extended_funcs = ["var", "std", "median"]
|
||||
|
||||
def test_flights_metrics(self):
|
||||
pd_flights = self.pd_flights()
|
||||
@ -41,7 +41,7 @@ class TestDataFrameMetrics(TestData):
|
||||
pd_metric = getattr(pd_flights, func)(numeric_only=True)
|
||||
ed_metric = getattr(ed_flights, func)(numeric_only=True)
|
||||
|
||||
assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
|
||||
assert_series_equal(pd_metric, ed_metric, check_less_precise=True)
|
||||
|
||||
def test_ecommerce_selected_non_numeric_source_fields(self):
|
||||
# None of these are numeric
|
||||
|
Loading…
x
Reference in New Issue
Block a user