Optimize df.describe() to use aggregations instead of its own query

This commit is contained in:
P. Sai Vinay 2021-06-22 21:59:54 +05:30 committed by GitHub
parent 5fe32a24df
commit ac2efb5863
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 24 additions and 47 deletions

View File

@ -628,8 +628,8 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
>>> df.describe() # ignoring percentiles as they don't generate consistent results >>> df.describe() # doctest: +SKIP
AvgTicketPrice FlightDelayMin AvgTicketPrice FlightDelayMin
count 13059.000000 13059.000000 count 13059.000000 13059.000000
mean 628.253689 47.335171 mean 628.253689 47.335171

View File

@ -1081,52 +1081,26 @@ class Operations:
f"Can not count field matches if size is set {size}" f"Can not count field matches if size is set {size}"
) )
numeric_source_fields = query_compiler._mappings.numeric_source_fields() df1 = self.aggs(
query_compiler=query_compiler,
# for each field we compute: pd_aggs=["count", "mean", "std", "min", "max"],
# count, mean, std, min, 25%, 50%, 75%, max numeric_only=True,
body = Query(query_params.query) )
df2 = self.quantile(
for field in numeric_source_fields: query_compiler=query_compiler,
body.metric_aggs("extended_stats_" + field, "extended_stats", field) pd_aggs=["quantile"],
body.metric_aggs("percentiles_" + field, "percentiles", field) quantiles=[0.25, 0.5, 0.75],
is_dataframe=True,
response = query_compiler._client.search( numeric_only=True,
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
) )
results = {} # Convert [.25,.5,.75] to ["25%", "50%", "75%"]
df2 = df2.set_index([["25%", "50%", "75%"]])
for field in numeric_source_fields: return pd.concat([df1, df2]).reindex(
values = list() ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
values.append(response["aggregations"]["extended_stats_" + field]["count"])
values.append(response["aggregations"]["extended_stats_" + field]["avg"])
values.append(
response["aggregations"]["extended_stats_" + field]["std_deviation"]
)
values.append(response["aggregations"]["extended_stats_" + field]["min"])
values.append(
response["aggregations"]["percentiles_" + field]["values"]["25.0"]
)
values.append(
response["aggregations"]["percentiles_" + field]["values"]["50.0"]
)
values.append(
response["aggregations"]["percentiles_" + field]["values"]["75.0"]
)
values.append(response["aggregations"]["extended_stats_" + field]["max"])
# if not None
if values.count(None) < len(values):
results[field] = values
df = pd.DataFrame(
data=results,
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
) )
return df
def to_pandas(self, query_compiler, show_progress=False): def to_pandas(self, query_compiler, show_progress=False):
class PandasDataFrameCollector: class PandasDataFrameCollector:
def __init__(self, show_progress): def __init__(self, show_progress):

View File

@ -1269,7 +1269,7 @@ class Series(NDFrame):
3 2 3 2
4 2 4 2
Name: total_quantity, dtype: int64 Name: total_quantity, dtype: int64
>>> np.int(2) ** df.total_quantity >>> np.int_(2) ** df.total_quantity
0 4.0 0 4.0
1 4.0 1 4.0
2 4.0 2 4.0
@ -1627,8 +1627,8 @@ class Series(NDFrame):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
>>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results >>> df.AvgTicketPrice.describe() # doctest: +SKIP
count 13059.000000 count 13059.000000
mean 628.253689 mean 628.253689
std 266.386661 std 266.386661

View File

@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_describe = pd_flights.describe() pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe() # We remove bool columns to match pandas output
ed_describe = ed_flights.describe().drop(
["Cancelled", "FlightDelay"], axis="columns"
)
assert_frame_equal( assert_frame_equal(
pd_describe.drop(["25%", "50%", "75%"], axis="index"), pd_describe.drop(["25%", "50%", "75%"], axis="index"),