# Source code for dimcat.steps.analyzers.prevalence

from typing import ClassVar, Iterable, List, Optional

import pandas as pd
from dimcat.base import ListOfStringsField
from dimcat.data.resources import Feature
from dimcat.data.resources.dc import DimcatResource, FeatureSpecs, UnitOfAnalysis
from dimcat.data.resources.results import PrevalenceMatrix, Result
from dimcat.steps.analyzers.base import Analyzer, DispatchStrategy
from dimcat.steps.base import D


class PrevalenceAnalyzer(Analyzer):
    """Creates what is the equivalent to NLP's "frequency matrix" except that in the case of music,
    the coefficients are not restricted to represent count frequencies (when created from a
    :class:`~.data.resources.results.Counts` object) but can also represent durations (when created
    from a :class:`~.data.resources.results.Durations` object). When the analyzer is applied to
    a :class:`Feature`, its default analysis will be used.
    """

    # Class-level default for the dimension column.
    # NOTE(review): presumably consumed by the Analyzer base class -- confirm.
    _default_dimension_column: ClassVar[Optional[str]] = "duration_qb"
    # NOTE(review): presumably the resource type the base class wraps results in -- confirm.
    _new_resource_type = PrevalenceMatrix

    @staticmethod
    def compute(
        resource: D | DimcatResource,
        index: Optional[str | Iterable[str]] = None,
        columns: Optional[str | Iterable[str]] = None,
        smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE,
        dimension_column: Optional[str] = None,
        **kwargs,
    ) -> D:
        """Computes the prevalence matrix from the given resource. This is basically a wrapper
        around :meth:`pandas.DataFrame.pivot_table` with ``aggfunc="sum"``.

        Args:
            resource:
                A dataframe, :class:`Feature` or :class:`Result` which will be pivoted to produce
                a prevalence with ``index`` index level(s) and ``columns`` column level(s),
                summing up the respective values contained in ``dimension_column``.
            index:
                Column(s) and/or index level name(s) that will make up the index values of the
                :class:`~.data.resources.results.PrevalenceMatrix` (akin to a groupby). By default,
                all but the last level will be used.
            columns:
                Column(s) and/or index level name(s) that will make up the column names of the
                :class:`~.data.resources.results.PrevalenceMatrix`. By default, the
                :attr:`~.data.resources.Resource.value_column` will be used. When ``resource`` is
                a dataframe and no columns are given, the pivot is performed without column levels.
            smallest_unit:
                The smallest unit to consider for analysis. Relevant only when ``index`` is not
                specified and ``resource`` is a :class:`~.data.resources.DimcatResource`.
            dimension_column:
                Name of the column that represents absolute prevalence values, typically "duration_qb"
                or "count". Required only when ``resource`` is a dataframe.
            **kwargs:
                Accepted and ignored. This allows :meth:`groupby_apply` to pass the analyzer's
                complete config without filtering it first.

        Returns:
            A pivot table with summed (=absolute) prevalence coefficients. For the analogy with
            NLP's frequency matrix, the ``index`` will correspond to documents and the ``columns``
            to the vocabulary (words/tokens).

        Raises:
            ValueError: If ``resource`` is a dataframe and ``dimension_column`` is not specified.
        """
        is_dataframe = isinstance(resource, pd.DataFrame)
        # Explicit raise rather than ``assert`` so the check survives ``python -O``.
        if is_dataframe and dimension_column is None:
            raise ValueError(
                "When passing a dataframe as resource, you need to specify the dimension column containing the "
                "absolute prevalence values."
            )

        if not index:
            if is_dataframe:
                # default: group by all index levels except the last one
                index = list(resource.index.names[:-1])
            else:
                index = resource.get_grouping_levels(smallest_unit)
        elif isinstance(index, str):
            index = [index]
        else:
            index = list(index)

        if not columns:
            # A plain dataframe has no ``value_column``; ``None`` lets pandas pivot without
            # column levels (equivalent to the empty list).
            columns = None if is_dataframe else [resource.value_column]
        elif isinstance(columns, str):
            columns = [columns]
        else:
            columns = list(columns)

        if dimension_column is None:
            # only reachable for non-dataframe resources (see the check above)
            dimension_column = resource.dimension_column

        result = resource.pivot_table(
            index=index,
            columns=columns,
            values=dimension_column,
            aggfunc="sum",
            sort=False,
        )
        # sort columns by their overall prevalence
        result = PrevalenceMatrix._sort_combined_result(result, sort_order="descending")
        return result

    class Schema(Analyzer.Schema):
        """Extends the Analyzer schema with the ``index`` and ``columns`` settings (both may be None)."""

        index = ListOfStringsField(allow_none=True)
        columns = ListOfStringsField(allow_none=True)

    def __init__(
        self,
        features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None,
        columns: Optional[str | Iterable[str]] = None,
        index: Optional[str | Iterable[str]] = None,
        strategy: DispatchStrategy = DispatchStrategy.GROUPBY_APPLY,
        smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE,
        dimension_column: Optional[str] = None,
    ):
        """

        Args:
            features:
                The Feature objects you want this Analyzer to process. If not specified, it will try to process all
                features present in a given Dataset's Outputs catalog.
            columns:
                Column(s) and/or index level name(s) that will make up the column names of the prevalence matrix
                (see :meth:`compute`). A single string is treated as a one-element list. Falsy values leave the
                setting unset.
            index:
                Column(s) and/or index level name(s) that will make up the index of the prevalence matrix
                (see :meth:`compute`). A single string is treated as a one-element list. Falsy values leave the
                setting unset.
            strategy: Currently, only the default strategy GROUPBY_APPLY is implemented.
            smallest_unit:
                The smallest unit to consider for analysis. Defaults to SLICE, meaning that slice segments are analyzed
                if a slicer has been previously applied, piece units otherwise. The results for larger units can always
                be retrospectively retrieved by using :meth:`Result.combine_results()`, but not the other way around.
                Use this setting to reduce compute time by setting it to PIECE, CORPUS_GROUP, or GROUP where the latter
                uses the default groupby if a grouper has been previously applied, or the entire dataset, otherwise.
            dimension_column:
                Name of the column containing some dimension, e.g. to be interpreted as quantity (durations, counts,
                etc.) or as color.
        """
        super().__init__(
            features=features,
            strategy=strategy,
            smallest_unit=smallest_unit,
            dimension_column=dimension_column,
        )
        self._columns = None
        self._index = None
        if columns:
            self.columns = columns
        if index:
            self.index = index

    @property
    def columns(self) -> List[str]:
        """A copy of the configured column names; an empty list if none have been set."""
        if self._columns is None:
            return []
        return list(self._columns)

    @columns.setter
    def columns(self, value: Optional[str | Iterable[str]]):
        # ``None`` resets the setting; previously it raised a TypeError from ``list(None)``.
        if value is None:
            self._columns = None
        elif isinstance(value, str):
            self._columns = [value]
        else:
            self._columns = list(value)

    @property
    def index(self) -> List[str]:
        """A copy of the configured index names; an empty list if none have been set."""
        if self._index is None:
            return []
        return list(self._index)

    @index.setter
    def index(self, value: Optional[str | Iterable[str]]):
        # ``None`` resets the setting; previously it raised a TypeError from ``list(None)``.
        if value is None:
            self._index = None
        elif isinstance(value, str):
            self._index = [value]
        else:
            self._index = list(value)

    def groupby_apply(
        self,
        feature: Result | Feature,
        groupby: Optional[str | Iterable[str]] = None,
        **kwargs,
    ) -> D:
        """Calls :meth:`compute` on ``feature`` using this analyzer's config as keyword arguments.

        Args:
            feature: The Feature or Result to be pivoted.
            groupby: If not None, overrides the configured ``index`` setting.
            **kwargs: Accepted but not used.

        Returns:
            The result of :meth:`compute`.
        """
        settings = self.to_config()
        if groupby is not None:
            settings["index"] = groupby
        return self.compute(feature, **settings)