from __future__ import annotations
from typing import Iterable, List, Literal, Optional
import marshmallow as mm
from dimcat.base import FriendlyEnumField, ListOfStringsField
from dimcat.data.resources import Feature, FeatureName
from dimcat.data.resources.base import DR, SomeSeries
from dimcat.data.resources.dc import FeatureSpecs, UnitOfAnalysis
from dimcat.data.resources.features import PhraseComponentName
from dimcat.data.resources.results import PhraseData, PhraseDataFormat
from dimcat.data.resources.utils import (
drop_duplicated_ultima_rows,
subselect_multiindex_from_df,
transform_phrase_data,
)
from dimcat.steps.analyzers.base import Analyzer, DispatchStrategy
class PhraseDataAnalyzer(Analyzer):
    """Analyzer that turns phrase features into phrase data.

    For each processed feature, the feature's ``phrase_df`` is optionally stripped of duplicated
    ultima rows, optionally filtered via a query string, and then passed to
    :func:`transform_phrase_data` together with the configured columns, components, and index options.
    """

    _allowed_features = (
        FeatureName.PhraseAnnotations,
        FeatureName.PhraseComponents,
        FeatureName.PhraseLabels,
    )
    _default_dimension_column = "duration_qb"
    _new_resource_type = PhraseData
    _output_package_name = "results"
    _requires_at_least_one_feature = True

    class Schema(Analyzer.Schema):
        columns = ListOfStringsField(metadata=dict(expose=False))
        components = ListOfStringsField(metadata=dict(expose=False))
        query = mm.fields.Str(allow_none=True, metadata=dict(expose=False))
        reverse = mm.fields.Bool(metadata=dict(expose=False))
        level_name = mm.fields.Str(metadata=dict(expose=False))
        format = FriendlyEnumField(PhraseDataFormat, metadata=dict(expose=False))
        drop_levels = mm.fields.Raw(metadata=dict(expose=False))
        # None is a legitimate value: it means "decide based on ``components``".
        drop_duplicated_ultima_rows = mm.fields.Bool(
            allow_none=True, metadata=dict(expose=False)
        )

    def __init__(
        self,
        features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None,
        columns: str | List[str] = "label",
        components: PhraseComponentName
        | Literal["phrase"]
        | Iterable[PhraseComponentName] = "body",
        query: Optional[str] = None,
        reverse: bool = False,
        level_name: str = "i",
        format: PhraseDataFormat = PhraseDataFormat.LONG,
        drop_levels: bool | int | str | Iterable[int | str] = False,
        drop_duplicated_ultima_rows: Optional[bool] = None,
        strategy: DispatchStrategy = DispatchStrategy.GROUPBY_APPLY,
        smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE,
        dimension_column: Optional[str] = None,
    ):
        """Configure which phrase information is extracted and how it is shaped.

        Args:
            features:
                The Feature objects you want this Analyzer to process. If not specified, it will try to process all
                features present in a given Dataset's Outputs catalog.
            columns:
                Column(s) to include in the result.
            components:
                Which of the four phrase components to include, ∈ {'ante', 'body', 'codetta', 'post'}.
                For convenience, the string 'phrase' is also accepted, which is equivalent to ["body", "codetta"] and
                (unless ``drop_duplicated_ultima_rows`` has been set explicitly) ``drop_duplicated_ultima_rows=True``.
            query:
                A convenient way to include only those phrases in the result that match the criteria
                formulated in the string query. A query is a string and generally takes the form
                "<column_name> <operator> <value>". Several criteria can be combined using boolean
                operators, e.g. "localkey_mode == 'major' & label.str.contains('/')". This option
                is particularly interesting when used on :class:`PhraseLabels` because it enables
                queries based on the properties of phrases such as
                "body_n_modulations == 0 & end_label.str.contains('IAC')". For the columns
                containing tuples, you can use a special function to filter those rows that
                contain any of the specified values:
                "@tuple_contains(body_chords, 'V(94)', 'V(9)', 'V(4)')".
            reverse:
                Pass True to reverse the order of harmonies so that each phrase's last label comes
                first.
            level_name:
                Name of the index level representing the individual integer range for each phrase, starting at 0.
                This level replaces the original 'i' level which allows for tracing back each chord, because it allows
                for displaying the phrases in WIDE format.
            format: Can be LONG (default) or WIDE.
            drop_levels:
                Can be a boolean or any level specifier accepted by :meth:`pandas.MultiIndex.droplevel()`.
                If False (default), all levels are retained. If True, only the phrase_id level and
                the ``level_name`` are retained. In all other cases, the indicated (string or
                integer) value(s) must be valid and cause one of the index levels to be dropped.
                ``level_name`` cannot be dropped. Dropping 'phrase_id' will likely lead to an
                exception if a :class:`PhraseData` object will be displayed in WIDE format.
            drop_duplicated_ultima_rows:
                If None (default), the behaviour depends on the value of ``components``: If you set
                ``components='phrase'``, this setting is switched to True, otherwise it is left
                unset and treated as False; where False corresponds to the default where each
                phrase body ends on a duplicate of the phrase's ultima label, with zero-duration,
                enabling the creation of PhraseData containing only phrase bodies
                (i.e., ``components='body'``), without losing information about the ultima label.
                When analyzing entire phrases, however, these duplicate rows may be unwanted and
                can be dropped by setting this option to True. An explicit True or False is
                always respected.
            strategy: Currently, only the default strategy GROUPBY_APPLY is implemented.
            smallest_unit:
                The smallest unit to consider for analysis. Defaults to SLICE, meaning that slice segments are analyzed
                if a slicer has been previously applied, piece units otherwise. The results for larger units can always
                be retrospectively retrieved by using :meth:`Result.combine_results()`, but not the other way around.
                Use this setting to reduce compute time by setting it to PIECE, CORPUS_GROUP, or GROUP where the latter
                uses the default groupby if a grouper has been previously applied, or the entire dataset, otherwise.
            dimension_column:
                Name of the column containing some dimension, e.g. to be interpreted as quantity (durations, counts,
                etc.) or as color.

        Raises:
            ValueError:
                If ``columns`` or ``components`` is None, or if 'phrase' is combined with other
                component names.
        """
        super().__init__(
            features=features,
            strategy=strategy,
            smallest_unit=smallest_unit,
            dimension_column=dimension_column,
        )
        self._columns = None
        self._components = None
        self._drop_levels = None
        self._format = None
        self.drop_levels: bool | int | str | Iterable[int | str] = drop_levels
        # Must be assigned before ``self.components`` because the components setter inspects
        # this attribute in order to resolve the None default.
        self.drop_duplicated_ultima_rows: Optional[bool] = drop_duplicated_ultima_rows
        self.query: Optional[str] = query
        self.reverse: bool = reverse
        self.level_name: str = level_name
        self.columns = columns
        self.components = components
        self.format = format

    @property
    def columns(self) -> List[str]:
        """A copy of the list of column names to include in the result."""
        return list(self._columns)

    @columns.setter
    def columns(self, columns: str | List[str]):
        if columns is None:
            raise ValueError("columns cannot be None")
        if isinstance(columns, str):
            # a single column name is wrapped rather than split into characters
            columns = [columns]
        else:
            columns = list(columns)
        self._columns = columns

    @property
    def components(self) -> List[PhraseComponentName]:
        """A copy of the list of phrase component names to include in the result."""
        return list(self._components)

    @components.setter
    def components(
        self,
        components: PhraseComponentName
        | Literal["phrase"]
        | Iterable[PhraseComponentName],
    ):
        if components is None:
            raise ValueError("components cannot be None")
        if isinstance(components, str):
            components = [components]
        else:
            components = list(components)
        if any(c.lower() == "phrase" for c in components):
            # Explicit exception instead of ``assert`` so that the validation is not stripped
            # when Python runs with -O.
            if len(components) != 1:
                raise ValueError(
                    "If you use the convenience value 'phrase', it must be the "
                    "only component and will be converted to ['body', 'codetta']"
                )
            components = ["body", "codetta"]
            # Only fill in the default; an explicit True/False from the user wins.
            if self.drop_duplicated_ultima_rows is None:
                self.drop_duplicated_ultima_rows = True
        else:
            # Normalize through the enum, which also validates the given names.
            components = [PhraseComponentName(c).value for c in components]
        self._components = components

    @property
    def format(self) -> PhraseDataFormat:
        """The configured :class:`PhraseDataFormat`."""
        return self._format

    @format.setter
    def format(self, format: PhraseDataFormat):
        self._format = PhraseDataFormat(format)

    def groupby_apply(
        self, feature: Feature, groupby: Optional[SomeSeries] = None, **kwargs
    ):
        """Filter the feature's phrase dataframe and transform it into phrase data.

        Args:
            feature: The feature whose ``phrase_df`` is to be processed.
            groupby: Accepted for interface compatibility; not used by this implementation.
            **kwargs: Accepted for interface compatibility; not used by this implementation.

        Returns:
            The return value of :func:`transform_phrase_data` for the (filtered) phrase dataframe.
        """
        phrase_df = feature.phrase_df
        if self.drop_duplicated_ultima_rows:
            phrase_df = drop_duplicated_ultima_rows(phrase_df)
        if self.query:
            if feature.name == "PhraseAnnotations":
                phrase_df = phrase_df.query(self.query)
            else:
                # for PhraseComponents and PhraseLabels, the filtering is performed on their respective feature df,
                # then the phrase_df (which corresponds to a PhraseAnnotations dataframe) is filtered based on the
                # result
                filtered_df = feature.df.query(self.query)
                phrase_df = subselect_multiindex_from_df(phrase_df, filtered_df.index)
        phrase_data = transform_phrase_data(
            phrase_df=phrase_df,
            columns=self.columns,
            components=self.components,
            drop_levels=self.drop_levels,
            reverse=self.reverse,
            level_name=self.level_name,
        )
        return phrase_data

    def resource_name_factory(self, resource: DR) -> str:
        """Returns a name for the resource based on its name and the name of the pipeline step."""
        return f"{resource.resource_name}.phrase_data"