Skip to content
Merged
63 changes: 50 additions & 13 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
)
return DataFrame(self._block.select_columns(selected_columns))

def _select_exact_dtypes(
    self, dtypes: Sequence[bigframes.dtypes.Dtype]
) -> DataFrame:
    """Return a DataFrame holding only the columns whose dtype is in ``dtypes``.

    A column is kept when its dtype is a member of ``dtypes``; inheritance
    relationships between dtypes are not considered.
    """
    block = self._block
    matching_ids = []
    # value_columns and dtypes are walked in lockstep, one dtype per column id.
    for column_id, column_dtype in zip(block.value_columns, block.dtypes):
        if column_dtype in dtypes:
            matching_ids.append(column_id)
    return DataFrame(block.select_columns(matching_ids))

def _set_internal_query_job(
    self, query_job: Optional[bigquery.QueryJob]
):
    """Record ``query_job`` (which may be ``None``) on this object as ``_query_job``."""
    self._query_job = query_job

Expand Down Expand Up @@ -2437,13 +2448,9 @@ def agg(
aggregations = [agg_ops.lookup_agg_func(f) for f in func]

for dtype, agg in itertools.product(self.dtypes, aggregations):
if not bigframes.operations.aggregations.is_agg_op_supported(
dtype, agg
):
raise NotImplementedError(
f"Type {dtype} does not support aggregation {agg}. "
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
)
agg.output_type(
dtype
) # Raises exception if the agg does not support the dtype.

return DataFrame(
self._block.summarize(
Expand Down Expand Up @@ -2512,7 +2519,10 @@ def melt(

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
numeric_df = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
)
if len(numeric_df.columns) == 0:
# Describe eligible non-numeric columns
return self._describe_non_numeric()
Expand Down Expand Up @@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
raise ValueError(f"Unsupported include type: {include}")

def _describe_numeric(self) -> DataFrame:
return typing.cast(
number_df_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
).agg(
[
"count",
"mean",
Expand All @@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
]
),
)
temporal_df_result = typing.cast(
DataFrame,
self._select_exact_dtypes(
bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).agg(["count"]),
)

if len(number_df_result.columns) == 0:
return temporal_df_result
elif len(temporal_df_result.columns) == 0:
return number_df_result
else:
import bigframes.core.reshape.api as rs

original_columns = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).columns

# Use reindex after join to preserve the original column order.
return rs.concat(
[number_df_result, temporal_df_result],
axis=1,
)._reindex_columns(original_columns)

def _describe_non_numeric(self) -> DataFrame:
    """Summarize the string, boolean, bytes and time columns.

    Columns are picked with ``_select_exact_dtypes`` and then aggregated with
    ``count`` and ``nunique``.

    Returns:
        DataFrame: the result of ``agg(["count", "nunique"])`` over the
        selected columns.
    """
    describable_dtypes = [
        bigframes.dtypes.STRING_DTYPE,
        bigframes.dtypes.BOOL_DTYPE,
        bigframes.dtypes.BYTES_DTYPE,
        bigframes.dtypes.TIME_DTYPE,
    ]
    return typing.cast(
        DataFrame,
        self._select_exact_dtypes(describable_dtypes).agg(["count", "nunique"]),
    )

Expand Down
17 changes: 13 additions & 4 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import datetime
import decimal
import typing
from typing import Dict, Literal, Union
from typing import Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import geopandas as gpd # type: ignore
Expand Down Expand Up @@ -211,7 +211,7 @@ class SimpleDtypeInfo:

# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
# Pandas is inconsistent, so two definitions are provided, each used in different contexts
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
FLOAT_DTYPE,
INT_DTYPE,
]
Expand All @@ -222,7 +222,16 @@ class SimpleDtypeInfo:
]


## dtype predicates - use these to maintain consistency
# Temporal types that pandas treats as "numeric"
TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
DATE_DTYPE,
TIMESTAMP_DTYPE,
DATETIME_DTYPE,
]
TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]


# dtype predicates - use these to maintain consistency
def is_datetime_like(type_: ExpressionType) -> bool:
    """Report whether ``type_`` is ``DATETIME_DTYPE`` or ``TIMESTAMP_DTYPE``."""
    datetime_like_types = (DATETIME_DTYPE, TIMESTAMP_DTYPE)
    return type_ in datetime_like_types

Expand Down Expand Up @@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
return True # None can be coerced to any supported type
else:
return (source_type == STRING_DTYPE) and (
target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
target_type in TEMPORAL_BIGFRAMES_TYPES
)


Expand Down
11 changes: 0 additions & 11 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
return _AGGREGATIONS_LOOKUP[key]
else:
raise ValueError(f"Unrecognize aggregate function: {key}")


def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Report whether aggregation ``op`` may be applied to a column of ``dtype``.

    Dtypes in ``NUMERIC_BIGFRAMES_TYPES_PERMISSIVE`` accept every aggregation.
    String, bool and bytes dtypes accept only ``CountOp`` and ``NuniqueOp``.
    Every other dtype accepts no aggregation.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        return True

    count_only_dtypes = (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE)
    # Anything outside count_only_dtypes falls through to False.
    return dtype in count_only_dtypes and isinstance(op, (CountOp, NuniqueOp))
45 changes: 37 additions & 8 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
def test_df_describe_non_temporal(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
# pyarrows time columns fail in pandas
# Exclude temporal columns here because BigFrames cannot perform percentile operations on them.
unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
bf_result = scalars_df.describe().to_pandas()
bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()

modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
pd_result = modified_pd_df.describe()
Expand Down Expand Up @@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs):
def test_df_describe_non_numeric(scalars_dfs, include):
scalars_df, scalars_pandas_df = scalars_dfs

non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
# Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
# considered numerical in Pandas
target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]

modified_bf = scalars_df[non_numeric_columns]
modified_bf = scalars_df[target_columns]
bf_result = modified_bf.describe(include=include).to_pandas()

modified_pd_df = scalars_pandas_df[non_numeric_columns]
modified_pd_df = scalars_pandas_df[target_columns]
pd_result = modified_pd_df.describe(include=include)

# Reindex results with the specified keys and their order, because
Expand All @@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
pd_result.astype("Int64"),
bf_result,
check_index_type=False,
)


@skip_legacy_pandas
def test_df_describe_temporal(scalars_dfs):
    """Compare describe(include="all") on temporal columns with pandas."""
    scalars_df, scalars_pandas_df = scalars_dfs
    temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]

    # Both results are reindexed to the same rows in the same order, because
    # the relative order of the rows is not important.
    bf_result = (
        scalars_df[temporal_columns]
        .describe(include="all")
        .to_pandas()
        .reindex(["count", "nunique"])
    )
    # The BigFrames counterpart of the pandas "unique" row is called "nunique".
    pd_result = (
        scalars_pandas_df[temporal_columns]
        .describe(include="all")
        .reindex(["count", "unique"])
        .rename(index={"unique": "nunique"})
    )

    pd.testing.assert_frame_equal(
        pd_result.astype("Float64"),
        bf_result.astype("Float64"),
        check_index_type=False,
    )

Expand Down
83 changes: 0 additions & 83 deletions tests/unit/operations/test_aggregations.py

This file was deleted.

Loading