Optimize df.describe() to use aggregations instead of its own query

This commit is contained in:
P. Sai Vinay 2021-06-22 21:59:54 +05:30 committed by GitHub
parent 5fe32a24df
commit ac2efb5863
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 24 additions and 47 deletions

View File

@ -628,8 +628,8 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
>>> df.describe() # ignoring percentiles as they don't generate consistent results
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
>>> df.describe() # doctest: +SKIP
AvgTicketPrice FlightDelayMin
count 13059.000000 13059.000000
mean 628.253689 47.335171

View File

@ -1081,51 +1081,25 @@ class Operations:
f"Can not count field matches if size is set {size}"
)
numeric_source_fields = query_compiler._mappings.numeric_source_fields()
# for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max
body = Query(query_params.query)
for field in numeric_source_fields:
body.metric_aggs("extended_stats_" + field, "extended_stats", field)
body.metric_aggs("percentiles_" + field, "percentiles", field)
response = query_compiler._client.search(
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
df1 = self.aggs(
query_compiler=query_compiler,
pd_aggs=["count", "mean", "std", "min", "max"],
numeric_only=True,
)
df2 = self.quantile(
query_compiler=query_compiler,
pd_aggs=["quantile"],
quantiles=[0.25, 0.5, 0.75],
is_dataframe=True,
numeric_only=True,
)
results = {}
# Convert [.25,.5,.75] to ["25%", "50%", "75%"]
df2 = df2.set_index([["25%", "50%", "75%"]])
for field in numeric_source_fields:
values = list()
values.append(response["aggregations"]["extended_stats_" + field]["count"])
values.append(response["aggregations"]["extended_stats_" + field]["avg"])
values.append(
response["aggregations"]["extended_stats_" + field]["std_deviation"]
return pd.concat([df1, df2]).reindex(
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
)
values.append(response["aggregations"]["extended_stats_" + field]["min"])
values.append(
response["aggregations"]["percentiles_" + field]["values"]["25.0"]
)
values.append(
response["aggregations"]["percentiles_" + field]["values"]["50.0"]
)
values.append(
response["aggregations"]["percentiles_" + field]["values"]["75.0"]
)
values.append(response["aggregations"]["extended_stats_" + field]["max"])
# if not None
if values.count(None) < len(values):
results[field] = values
df = pd.DataFrame(
data=results,
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
return df
def to_pandas(self, query_compiler, show_progress=False):
class PandasDataFrameCollector:

View File

@ -1269,7 +1269,7 @@ class Series(NDFrame):
3 2
4 2
Name: total_quantity, dtype: int64
>>> np.int(2) ** df.total_quantity
>>> np.int_(2) ** df.total_quantity
0 4.0
1 4.0
2 4.0
@ -1627,8 +1627,8 @@ class Series(NDFrame):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
>>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
>>> df.AvgTicketPrice.describe() # doctest: +SKIP
count 13059.000000
mean 628.253689
std 266.386661

View File

@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData):
ed_flights = self.ed_flights()
pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe()
# We remove bool columns to match pandas output
ed_describe = ed_flights.describe().drop(
["Cancelled", "FlightDelay"], axis="columns"
)
assert_frame_equal(
pd_describe.drop(["25%", "50%", "75%"], axis="index"),