Source code for dimcat.steps.analyzers.proportions
import logging
import marshmallow as mm
from dimcat.data.resources.base import DR, D, FeatureName, SomeSeries
from dimcat.data.resources.dc import DimcatResource, Feature
from dimcat.data.resources.results import Durations
from dimcat.dc_exceptions import FeatureWithUndefinedValueColumnError
from dimcat.steps.analyzers.base import Analyzer
# Module-level logger, named after this module via ``__name__``.
module_logger = logging.getLogger(__name__)
class Proportions(Analyzer):
    """Analyzer that sums a dimension column for every distinct feature value.

    The values of a feature's ``value_column`` (plus its ``formatted_column``
    when one is defined) are used as grouping keys; the dimension column
    (``"duration_qb"`` by default) is summed per group and cast to float.
    """

    # Column that is summed by :meth:`compute`.
    _default_dimension_column = "duration_qb"
    # Resource type associated with this analyzer's results
    # (NOTE(review): consumed by the ``Analyzer`` base class -- confirm there).
    _new_resource_type = Durations

    @staticmethod
    def compute(feature: Feature, **kwargs) -> D:
        """Sum the default dimension column for each value of ``feature``.

        Args:
            feature: The feature to summarize. Its ``value_column`` and, if
                defined, its ``formatted_column`` serve as grouping keys.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The per-group sums, cast to float and converted with ``to_frame()``.
        """
        groupby = [feature.value_column]
        if (
            feature.formatted_column is not None
            and feature.formatted_column not in groupby
        ):
            groupby.append(feature.formatted_column)
        result = (
            feature.groupby(groupby)[Proportions._default_dimension_column]
            .sum()
            .astype(float)
        )
        return result.to_frame()

    class Schema(Analyzer.Schema):
        """Serialization schema adding the ``dimension_column`` setting."""

        dimension_column = mm.fields.Str(
            load_default="duration_qb", allow_none=True, metadata=dict(expose=False)
        )

    def check_resource(self, resource: DimcatResource) -> None:
        """Check if the resource has a value column.

        Raises:
            FeatureWithUndefinedValueColumnError: If ``resource.value_column``
                is None.
        """
        super().check_resource(resource)
        if resource.value_column is None:
            raise FeatureWithUndefinedValueColumnError(
                resource.resource_name, resource.name
            )

    def groupby_apply(self, feature: Feature, groupby: SomeSeries = None, **kwargs):
        """Performs the computation on a groupby.

        Args:
            feature: The feature to summarize.
            groupby: A Series of the same length as ``feature``, or anything
                else that works as positional argument to ``feature.groupby()``
                (a single key or a list/tuple of keys). Defaults to
                ``feature.get_grouping_levels(self.smallest_unit)``.
                The object passed in is never modified.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The per-group sums of ``self.dimension_column``, cast to float and
            converted with ``to_frame()``.
        """
        if groupby is None:
            groupby = feature.get_grouping_levels(self.smallest_unit)
            self.logger.debug(
                f"Using the {feature.resource_name}'s default groupby {groupby!r}"
            )
        # Work on a private list: appending to the argument itself would mutate
        # the caller's object (or whatever list get_grouping_levels() handed
        # out) and fails outright for a Series or a single key.
        if isinstance(groupby, (list, tuple)):
            keys = list(groupby)
        else:
            keys = [groupby]
        # Only string keys can collide with a column name. Restricting the
        # membership test to them avoids the ambiguous-truth-value error that
        # ``in`` raises when the list contains a Series.
        present_labels = {key for key in keys if isinstance(key, str)}
        keys.append(feature.value_column)
        formatted_column = feature.formatted_column
        if (
            formatted_column is not None
            and formatted_column != feature.value_column
            and formatted_column not in present_labels
        ):
            keys.append(formatted_column)
        result = (
            feature.groupby(keys, group_keys=False)[self.dimension_column]
            .sum()
            .astype(float)
        )
        return result.to_frame()

    def resource_name_factory(self, resource: DR) -> str:
        """Returns a name for the resource based on its name and the name of the pipeline step."""
        return f"{resource.resource_name}.proportions"
class PitchClassVectors(Proportions):
    """Variant of :class:`Proportions` whose allowed features are limited to Notes.

    Result resources are named with the suffix ``.pitch_class_vectors``.
    """

    # NOTE(review): presumably evaluated by the Analyzer base class to reject
    # other feature types -- confirm there.
    _allowed_features = (FeatureName.Notes,)

    def resource_name_factory(self, resource: DR) -> str:
        """Build the result's name from the given resource's name plus this step's suffix."""
        base_name = resource.resource_name
        return f"{base_name}.pitch_class_vectors"