Source code for dimcat.steps.analyzers.phrases

from __future__ import annotations

from typing import Iterable, List, Literal, Optional

import marshmallow as mm
from dimcat.base import FriendlyEnumField, ListOfStringsField
from dimcat.data.resources import Feature, FeatureName
from dimcat.data.resources.base import DR, SomeSeries
from dimcat.data.resources.dc import FeatureSpecs, UnitOfAnalysis
from dimcat.data.resources.features import PhraseComponentName
from dimcat.data.resources.results import PhraseData, PhraseDataFormat
from dimcat.data.resources.utils import (
    drop_duplicated_ultima_rows,
    subselect_multiindex_from_df,
    transform_phrase_data,
)
from dimcat.steps.analyzers.base import Analyzer, DispatchStrategy


[docs]class PhraseDataAnalyzer(Analyzer):
    _allowed_features = (
        FeatureName.PhraseAnnotations,
        FeatureName.PhraseComponents,
        FeatureName.PhraseLabels,
    )
    _default_dimension_column = "duration_qb"
    _new_resource_type = PhraseData
    _output_package_name = "results"
    _requires_at_least_one_feature = True

[docs]    class Schema(Analyzer.Schema):
        columns = ListOfStringsField(metadata=dict(expose=False))
        components = ListOfStringsField(metadata=dict(expose=False))
        query = mm.fields.Str(allow_none=True, metadata=dict(expose=False))
        reverse = mm.fields.Bool(metadata=dict(expose=False))
        level_name = mm.fields.Str(metadata=dict(expose=False))
        format = FriendlyEnumField(PhraseDataFormat, metadata=dict(expose=False))
        drop_levels = mm.fields.Raw(metadata=dict(expose=False))
        drop_duplicated_ultima_rows = mm.fields.Bool(
            allow_none=True, metadata=dict(expose=False)
        )

    def __init__(
        self,
        features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None,
        columns: str | List[str] = "label",
        components: PhraseComponentName
        | Literal["phrase"]
        | Iterable[PhraseComponentName] = "body",
        query: Optional[str] = None,
        reverse: bool = False,
        level_name: str = "i",
        format: PhraseDataFormat = PhraseDataFormat.LONG,
        drop_levels: bool | int | str | Iterable[int | str] = False,
        drop_duplicated_ultima_rows: bool = False,
        strategy: DispatchStrategy = DispatchStrategy.GROUPBY_APPLY,
        smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE,
        dimension_column: str = None,
    ):
        """

        Args:
            features:
                The Feature objects you want this Analyzer to process. If not specified, it will try to process all
                features present in a given Dataset's Outputs catalog.
            columns:
                Column(s) to include in the result.
            components:
                Which of the four phrase components to include, ∈ {'ante', 'body', 'codetta', 'post'}.
                For convenience, the string 'phrase' is also accepted, which is equivalent to ["body", "codetta"] and
                ``drop_duplicated_ultima_rows=True``.
            query:
                A convenient way to include only those phrases in the result that match the criteria
                formulated in the string query. A query is a string and generally takes the form
                "<column_name> <operator> <value>". Several criteria can be combined using boolean
                operators, e.g. "localkey_mode == 'major' & label.str.contains('/')". This option
                is particularly interesting when used on :class:`PhraseLabels` because it enables
                queries based on the properties of phrases such as
                "body_n_modulations == 0 & end_label.str.contains('IAC')".  For the columns
                containing tuples, you can used a special function to filter those rows that
                contain any of the specified values:
                "@tuple_contains(body_chords, 'V(94)', 'V(9)', 'V(4)')".
            reverse:
                Pass True to reverse the order of harmonies so that each phrase's last label comes
                first.
            level_name:
                Name of the index level representing the individual integer range for each phrase, starting at 0.
                This level replaces the original 'i' level which allows for tracing back each chord, because it allows
                for displaying the phrases in WIDE format.
            format: Can be LONG (default) or WIDE.
            drop_levels:
                Can be a boolean or any level specifier accepted by :meth:`pandas.MultiIndex.droplevel()`.
                If False (default), all levels are retained. If True, only the phrase_id level and
                the ``level_name`` are retained. In all other cases, the indicated (string or
                integer) value(s) must be valid and cause one of the index levels to be dropped.
                ``level_name`` cannot be dropped. Dropping 'phrase_id' will likely lead to an
                exception if a :class:`PhraseData` object will be displayed in WIDE format.
            drop_duplicated_ultima_rows:
                The default behaviour (when None), depends on the value of ``components``: If you set
                ``components='phrase'``, this setting defaults to True, otherwise to False; where
                False corresponds to the default where  each phrase body ends on a duplicate of the
                phrase's ultima label, with zero-duration, enabling the creation of PhraseData
                containing only phrase bodies (i.e., ``components='body'``), without losing information
                about the ultima label. When analyzing entire phrases, however, these duplicate
                rows may be unwanted and can be dropped by setting this option to True.
            strategy: Currently, only the default strategy GROUPBY_APPLY is implemented.
            smallest_unit:
                The smallest unit to consider for analysis. Defaults to SLICE, meaning that slice segments are analyzed
                if a slicer has been previously applied, piece units otherwise. The results for larger units can always
                be retrospectively retrieved by using :meth:`Result.combine_results()`, but not the other way around.
                Use this setting to reduce compute time by setting it to PIECE, CORPUS_GROUP, or GROUP where the latter
                uses the default groupby if a grouper has been previously applied, or the entire dataset, otherwise.
            dimension_column:
                Name of the column containing some dimension, e.g. to be interpreted as quantity (durations, counts,
                etc.) or as color.
        """
        super().__init__(
            features=features,
            strategy=strategy,
            smallest_unit=smallest_unit,
            dimension_column=dimension_column,
        )
        self._columns = None
        self._components = None
        self._drop_levels = None
        self._format = None
        self.drop_levels: bool | int | str | Iterable[int | str] = drop_levels
        self.drop_duplicated_ultima_rows: bool = drop_duplicated_ultima_rows
        self.query: str = query
        self.reverse: bool = reverse
        self.level_name: str = level_name
        self.columns = columns
        self.components = components
        self.format = format

    @property
    def columns(self) -> List[str]:
        return list(self._columns)

    @columns.setter
    def columns(self, columns: str | List[str]):
        if columns is None:
            raise ValueError("columns cannot be None")
        if isinstance(columns, str):
            columns = [columns]
        else:
            columns = list(columns)
        self._columns = columns

    @property
    def components(self) -> List[PhraseComponentName]:
        return list(self._components)

    @components.setter
    def components(
        self,
        components: PhraseComponentName
        | Literal["phrase"]
        | Iterable[PhraseComponentName],
    ):
        if components is None:
            raise ValueError("components cannot be None")
        if isinstance(components, str):
            components = [components]
        else:
            components = list(components)
        if any(c.lower() == "phrase" for c in components):
            assert len(components) == 1, (
                "If you use the convenience value 'phrase', it must be the "
                "only component and will be converted to ['body', 'codetta']"
            )
            components = ["body", "codetta"]
            if self.drop_duplicated_ultima_rows is None:
                self.drop_duplicated_ultima_rows = True
        else:
            components = [PhraseComponentName(c).value for c in components]
        self._components = components

    @property
    def format(self) -> PhraseDataFormat:
        return self._format

    @format.setter
    def format(self, format: PhraseDataFormat):
        self._format = PhraseDataFormat(format)

[docs]    def groupby_apply(self, feature: Feature, groupby: SomeSeries = None, **kwargs):
        phrase_df = feature.phrase_df
        if self.drop_duplicated_ultima_rows:
            phrase_df = drop_duplicated_ultima_rows(phrase_df)
        if self.query:
            if feature.name == "PhraseAnnotations":
                phrase_df = phrase_df.query(self.query)
            else:
                # for PhraseComponents and PhraseLabels, the filtering is performed on their respective feature df,
                # then the phrase_df (which corresponds to a PhraseAnnotations dataframe) is filtered based on the
                # result
                filtered_df = feature.df.query(self.query)
                # idx = filtered_df.index
                # mask = make_boolean_mask_from_set_of_tuples(
                #     phrase_df.index, set(idx), idx.names
                # )
                # phrase_df = phrase_df[mask]
                phrase_df = subselect_multiindex_from_df(phrase_df, filtered_df.index)
        phrase_data = transform_phrase_data(
            phrase_df=phrase_df,
            columns=self.columns,
            components=self.components,
            drop_levels=self.drop_levels,
            reverse=self.reverse,
            level_name=self.level_name,
        )
        # if isinstance(columns, str):
        #     value_column = columns
        #     formatted_column = None
        # else:
        #     value_column = columns[0]
        #     formatted_column = columns[1:]
        # default_groupby = self.default_groupby + ["phrase_id"]
        # df_format = PhraseDataFormat.WIDE if wide_format else PhraseDataFormat.LONG
        return phrase_data

[docs]    def resource_name_factory(self, resource: DR) -> str:
        """Returns a name for the resource based on its name and the name of the pipeline step."""
        return f"{resource.resource_name}.phrase_data"