Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)
Optimize df.describe() to use aggregations instead of own query
This commit is contained in:
parent 5fe32a24df
commit ac2efb5863
@ -628,8 +628,8 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
|
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
|
||||||
>>> df.describe() # ignoring percentiles as they don't generate consistent results
|
>>> df.describe() # doctest: +SKIP
|
||||||
AvgTicketPrice FlightDelayMin
|
AvgTicketPrice FlightDelayMin
|
||||||
count 13059.000000 13059.000000
|
count 13059.000000 13059.000000
|
||||||
mean 628.253689 47.335171
|
mean 628.253689 47.335171
|
||||||
|
@ -1081,52 +1081,26 @@ class Operations:
|
|||||||
f"Can not count field matches if size is set {size}"
|
f"Can not count field matches if size is set {size}"
|
||||||
)
|
)
|
||||||
|
|
||||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields()
|
df1 = self.aggs(
|
||||||
|
query_compiler=query_compiler,
|
||||||
# for each field we compute:
|
pd_aggs=["count", "mean", "std", "min", "max"],
|
||||||
# count, mean, std, min, 25%, 50%, 75%, max
|
numeric_only=True,
|
||||||
body = Query(query_params.query)
|
)
|
||||||
|
df2 = self.quantile(
|
||||||
for field in numeric_source_fields:
|
query_compiler=query_compiler,
|
||||||
body.metric_aggs("extended_stats_" + field, "extended_stats", field)
|
pd_aggs=["quantile"],
|
||||||
body.metric_aggs("percentiles_" + field, "percentiles", field)
|
quantiles=[0.25, 0.5, 0.75],
|
||||||
|
is_dataframe=True,
|
||||||
response = query_compiler._client.search(
|
numeric_only=True,
|
||||||
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
|
|
||||||
)
|
)
|
||||||
|
|
||||||
results = {}
|
# Convert [.25,.5,.75] to ["25%", "50%", "75%"]
|
||||||
|
df2 = df2.set_index([["25%", "50%", "75%"]])
|
||||||
|
|
||||||
for field in numeric_source_fields:
|
return pd.concat([df1, df2]).reindex(
|
||||||
values = list()
|
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
|
||||||
values.append(response["aggregations"]["extended_stats_" + field]["count"])
|
|
||||||
values.append(response["aggregations"]["extended_stats_" + field]["avg"])
|
|
||||||
values.append(
|
|
||||||
response["aggregations"]["extended_stats_" + field]["std_deviation"]
|
|
||||||
)
|
|
||||||
values.append(response["aggregations"]["extended_stats_" + field]["min"])
|
|
||||||
values.append(
|
|
||||||
response["aggregations"]["percentiles_" + field]["values"]["25.0"]
|
|
||||||
)
|
|
||||||
values.append(
|
|
||||||
response["aggregations"]["percentiles_" + field]["values"]["50.0"]
|
|
||||||
)
|
|
||||||
values.append(
|
|
||||||
response["aggregations"]["percentiles_" + field]["values"]["75.0"]
|
|
||||||
)
|
|
||||||
values.append(response["aggregations"]["extended_stats_" + field]["max"])
|
|
||||||
|
|
||||||
# if not None
|
|
||||||
if values.count(None) < len(values):
|
|
||||||
results[field] = values
|
|
||||||
|
|
||||||
df = pd.DataFrame(
|
|
||||||
data=results,
|
|
||||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
def to_pandas(self, query_compiler, show_progress=False):
|
def to_pandas(self, query_compiler, show_progress=False):
|
||||||
class PandasDataFrameCollector:
|
class PandasDataFrameCollector:
|
||||||
def __init__(self, show_progress):
|
def __init__(self, show_progress):
|
||||||
|
@ -1269,7 +1269,7 @@ class Series(NDFrame):
|
|||||||
3 2
|
3 2
|
||||||
4 2
|
4 2
|
||||||
Name: total_quantity, dtype: int64
|
Name: total_quantity, dtype: int64
|
||||||
>>> np.int(2) ** df.total_quantity
|
>>> np.int_(2) ** df.total_quantity
|
||||||
0 4.0
|
0 4.0
|
||||||
1 4.0
|
1 4.0
|
||||||
2 4.0
|
2 4.0
|
||||||
@ -1627,8 +1627,8 @@ class Series(NDFrame):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
|
||||||
>>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
|
>>> df.AvgTicketPrice.describe() # doctest: +SKIP
|
||||||
count 13059.000000
|
count 13059.000000
|
||||||
mean 628.253689
|
mean 628.253689
|
||||||
std 266.386661
|
std 266.386661
|
||||||
|
@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData):
|
|||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
pd_describe = pd_flights.describe()
|
pd_describe = pd_flights.describe()
|
||||||
ed_describe = ed_flights.describe()
|
# We remove bool columns to match pandas output
|
||||||
|
ed_describe = ed_flights.describe().drop(
|
||||||
|
["Cancelled", "FlightDelay"], axis="columns"
|
||||||
|
)
|
||||||
|
|
||||||
assert_frame_equal(
|
assert_frame_equal(
|
||||||
pd_describe.drop(["25%", "50%", "75%"], axis="index"),
|
pd_describe.drop(["25%", "50%", "75%"], axis="index"),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user