dimcat.data.resources package#

Submodules#

dimcat.data.resources.base module#

class dimcat.data.resources.base.FeatureName(value)[source]#

Bases: ObjectEnum

An enumeration.

Annotations = 'Annotations'#
Articulation = 'Articulation'#
BassNotes = 'BassNotes'#
CadenceLabels = 'CadenceLabels'#
DcmlAnnotations = 'DcmlAnnotations'#
Feature = 'Feature'#
HarmonyLabels = 'HarmonyLabels'#
KeyAnnotations = 'KeyAnnotations'#
Measures = 'Measures'#
Metadata = 'Metadata'#
Notes = 'Notes'#
PhraseAnnotations = 'PhraseAnnotations'#
PhraseComponents = 'PhraseComponents'#
PhraseLabels = 'PhraseLabels'#
class dimcat.data.resources.base.PathResource(resource: Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Bases: Resource

A resource that does not load frictionless descriptors or warn about them as Resource would.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, treating it just as a path even if it’s a JSON/YAML resource descriptor.

class dimcat.data.resources.base.Resource(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Bases: Data

A Resource is essentially a wrapper around a frictionless.Resource object. Initializing a Resource object from a descriptor dispatches to the appropriate subclass, depending on the specified dtype or, if absent, to a DimcatResource for tabular data and to a PathResource for any other.

property ID: Tuple[str, str]#

The resource’s unique ID.

class PickleSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: ResourceSchema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
squash_data_for_frictionless(data, **kwargs)[source]#
unknown: types.UnknownOption#
class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: ResourceSchema, Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property basepath: str#
copy() Self[source]#

Returns a copy of the resource.

copy_to_new_location(basepath: str, overwrite: bool = False, filepath: Optional[str] = None, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None) Self[source]#
property corpus_name: Optional[str]#

The name of the corpus this resource belongs to.

property descriptor_exists: bool#
property descriptor_filename: Optional[str]#

The path to the descriptor file on disk, relative to the basepath. If you need to fall back to a default value, use get_descriptor_filename() instead.

detach_from_basepath()[source]#
detach_from_descriptor()[source]#
detach_from_filepath()[source]#
property filepath: str#
classmethod from_descriptor(descriptor: dict | frictionless.resource.resource.Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor dictionary.

Parameters:
  • descriptor – Descriptor corresponding to a frictionless resource descriptor.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Where the file would be serialized and, important for an existing resource, the path against which the descriptor’s ‘filepath’ property can be resolved.

  • **kwargs – Subclasses can use this method.

Raises:
  • TypeError – If the descriptor is a string or a Path, not a dictionary or a frictionless Resource.

  • ResourceDescriptorHasWrongTypeError – If the descriptor belongs to a type that is not a subclass of the Resource class to be initialized.

Returns:

classmethod from_descriptor_path(descriptor_path: str, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor file on disk.

Parameters:
  • descriptor_path – Absolute path where the JSON/YAML descriptor is located.

  • basepath – If you do not want the folder where the descriptor is located to be treated as basepath, you may specify an absolute path higher up within the descriptor_path to serve as base. The resource’s filepath will be adapted accordingly, whereas the resource names specified in the descriptor will remain the same.

  • **kwargs – Subclasses can use this method.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

classmethod from_resource(resource: Resource, descriptor_filename: Optional[str] = None, resource_name: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Create a Resource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Lets you change the basepath of the existing resource.

  • **kwargs – Subclasses can use this method.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, treating it just as a path even if it’s a JSON/YAML resource descriptor.

get_corpus_name() str[source]#

Returns the value of corpus_name or, if not set, a name derived from the resource’s filepath.

Raises:

ValueError – If neither corpus_name nor filepath are set.

get_descriptor_filename(set_default_if_missing: bool = False) str[source]#

Like descriptor_filename but returning a default value if None. If set_default_if_missing is set to True and no basepath has been set (e.g. during initialization), the basepath is permanently set to the default basepath.

get_descriptor_path(set_default_if_missing=False) Optional[str][source]#

Returns the path to the descriptor file. If basepath or descriptor_filename are not set, they are set permanently to their defaults. If create_if_missing is set to True, the descriptor file is created if it does not exist yet.

get_filepath(set_default_if_missing=False) str[source]#

Returns the relative path to the data (filepath) if specified, innerpath otherwise.

get_innerpath(set_default_if_missing: bool = False) Optional[str][source]#

Returns the path to the resource file within a ZIP file.

get_path_dict() Dict[str, str][source]#

Returns a dictionary with the paths to the resource’s data and descriptor.

get_resource_name(set_default_if_missing=False) str[source]#
property innerpath: Optional[str]#

The innerpath is the resource’s filepath within a zip file.

property is_empty: bool#
property is_frozen: bool#

Whether the resource is frozen (i.e. it’s pointing to data on the disk) or not.

property is_loaded: bool#
property is_packaged: bool#

Returns True if the resource is packaged, i.e. its descriptor_filename is the one of the Package that the resource is a part of. Also means that the resource is passive.

property is_serialized: bool#

Returns True if the resource is serialized, i.e. it points to a file on disk and, if it is a ZIP file, the innerpath is present in that ZIP file.

property is_valid: bool#

Returns the result of a previous validation or, if the resource has not been validated before, validates it now.

property is_zipped: bool#

Returns True if the filepath points to a .zip file.

make_descriptor() dict[source]#

Returns a frictionless descriptor for the resource.

property normpath: str#

Absolute path to the serialized or future tabular file. Raises if basepath is not set.

property resource: Resource#
property resource_exists: bool#

Returns True if the resource’s normpath exists on disk. If the resource is_zipped and you want to check if the innerpath actually exists within the ZIP file, use is_serialized instead.

property resource_name: str#
set_basepath(basepath: str, reconcile: bool = False) None[source]#
set_descriptor_filename(descriptor_filename: str) None[source]#
Parameters:

descriptor_filename

Raises:

ValueError – If the descriptor_filename is not a simple filename.

property status: ResourceStatus#
store_descriptor(descriptor_path: Optional[str] = None, overwrite=True) str[source]#

Stores the frictionless descriptor to disk based on the resource’s configuration and returns its path. Does not modify the resource’s status.

Returns:

The path to the descriptor file on disk. If None, the default is used.

Raises:
  • ResourceIsPackagedError – If the resource is packaged, this method refuses to store a resource descriptor because that would potentially update path information managed by the package.

  • InvalidResourcePathError – If the resource’s path does not point to an existing file on disk.

to_dict(pickle: bool = False) Dict[str, Any][source]#

Returns a dictionary representation of the resource and stores its descriptor to disk.

validate(raise_exception: bool = False, only_if_necessary: bool = False) Optional[Report][source]#

Validate the resource against its descriptor.

Parameters:
  • raise_exception – (default False) Pass True to raise if the resource is not valid.

  • only_if_necessary – (default False) Pass True to skip validation if the resource has already been validated or is assumed to be valid because it exists on disk.

Returns:

None if no validation took place (e.g. because resource is empty or only_if_necessary was True). Otherwise, frictionless report resulting from validating the data against the column_schema.

Raises:

FrictionlessException – If the resource is not valid and raise_exception is True.

class dimcat.data.resources.base.ResourceSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

Since Resource objects function partially as a wrapper around a frictionless.Resource object, many properties are serialized by the means of the frictionless descriptor corresponding to it, which is provided by the frictionless library. For example, resource_name uses .resource.name under the hood.

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

get_frictionless_descriptor(obj: Resource) dict[source]#
init_object(data, **kwargs)[source]#

Once the data has been loaded, create the corresponding object.

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
raw(data)[source]#

Functions as ‘deserialize’ method for the Schema field ‘resource’.

unknown: types.UnknownOption#
unsquash_data_if_necessary(data, **kwargs)[source]#

Data serialized with this schema usually has ‘resource’ field that contains the frictionless descriptor. However, if it has been serialized with the PickleSchema variant, this descriptor has become the top level and all other fields have been squashed into it, effectively flattening the dictionary. This method reverses this flattening, if necessary.

class dimcat.data.resources.base.ResourceStatus(value)[source]#

Bases: IntEnum

Expresses the status of a Resource with respect to it being described, valid, and serialized to disk, with or without its descriptor file. The enum members have increasing integer values starting with EMPTY == 0. Statuses > PATH_ONLY (1) are currently only relevant for DimcatResources. The current status is determined by the boolean state of the first three attributes in the table below:

  • is_serialized: True if the resource can be located physically on disk.

  • descriptor_exists: True if a descriptor file (JSON/YAML) is physically present on disk.

  • is_loaded: True if the resource is currently loaded into memory.

The remaining attributes are derived from the first three and are not used to determine the current status:

  • assumed valid: True if the resource is assumed to be valid, which is the case for all serialized resources.

  • standalone: True if the resource is not part of a package. For “free” (not serialized) resources, it depends on the value of Resource.descriptor_filename (whether it corresponds to a package or resource descriptor).

  • empty: True if the resource is empty, i.e. it does not contain data. A DimcatResource that is PATH_ONLY is considered empty, whereas a Resource/PathResource is not (they only have status 0 or 1).

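| ResourceStatus        | is_serialized | descriptor_exists | is_loaded | assumed valid | standalone | empty |
|-----------------------|---------------|-------------------|-----------|---------------|------------|-------|
| EMPTY                 | False         | ?                 | False     | no            | ?          | yes   |
| PATH_ONLY             | True          | ?                 | False     | no            | ?          | yes   |
| SCHEMA_ONLY           | False         | ?                 | False     | no            | ?          | yes   |
| DATAFRAME             | False         | False             | True      | no            | ?          | no    |
| VALIDATED             | False         | False             | True      | guaranteed    | ?          | no    |
| SERIALIZED            | True          | False             | True      | yes           | yes        | no    |
| STANDALONE_LOADED     | True          | True              | True      | yes           | yes        | no    |
| PACKAGED_LOADED       | True          | True              | True      | yes           | no         | no    |
| STANDALONE_NOT_LOADED | True          | True              | False     | yes           | yes        | no    |
| PACKAGED_NOT_LOADED   | True          | True              | False     | yes           | no         | no    |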

The status of a resource is set at the end of Resource.__init__() by calling Resource._update_status() which, in turn, calls Resource._get_status().

DATAFRAME = 3#
EMPTY = 0#
PACKAGED_LOADED = 7#
PACKAGED_NOT_LOADED = 9#
PATH_ONLY = 1#
SCHEMA_ONLY = 2#
SERIALIZED = 5#
STANDALONE_LOADED = 6#
STANDALONE_NOT_LOADED = 8#
VALIDATED = 4#
dimcat.data.resources.base.reconcile_base_and_file(basepath: Optional[str], filepath: str) Tuple[str, str][source]#
Parameters:
  • basepath

  • filepath

Returns:

The result is a tuple of an absolute basepath and a relative filepath.

dimcat.data.resources.base.resource_specs2resource(resource: Union[Resource, str, Path]) R[source]#

Converts a resource specification to a resource.

Parameters:

resource – A resource specification.

Returns:

A resource.

dimcat.data.resources.dc module#

class dimcat.data.resources.dc.DimcatIndex(index: Optional[IX] = None, basepath: Optional[str] = None)[source]#

Bases: Generic[IX], Data

A wrapper around a pandas.MultiIndex that provides additional functionality such as keeping track of index levels and default groupings.

A MultiIndex essentially is a Sequence of tuples where each tuple identifies a dataframe row and includes one value per index level. Each index level has a name and can be seen as an individual pandas.Index. One important type of DimcatIndex is the PieceIndex which is a unique MultiIndex (that is, each tuple is unique) and where the last (i.e. right-most) level is named piece.

NB: If you want to use the index in a dataframe constructor, use the actual, wrapped index object as in pd.DataFrame(index=dc_index.index).

class PickleSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

init_object(data, **kwargs) DimcatIndex[source]#

Once the data has been loaded, create the corresponding object.

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: PickleSchema, Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
copy() Self[source]#
filter(keep_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, drop_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, level: int | str = 0, drop_level: Optional[bool] = None) Self[source]#

Returns a copy of the index with only those items where the given level has wanted values.

Parameters:
  • keep_values – One or several values to keep (dropping the rest). If a value is specified both for keeping and dropping, it is dropped.

  • drop_values – One or several values to drop.

  • level – Which index level to filter on.

  • drop_level – Boolean specifies whether to keep the filtered level or to drop it. The default (None) corresponds to automatic behaviour, where the level is dropped if only one value remains, otherwise kept.

Returns:

A copy of the index with only those items where the given level has wanted values. Depending on drop_level, the filtered level itself may have been removed.

classmethod from_dataframe(df: DataFrame) Self[source]#

Create a DimcatIndex from a dataframe’s index.

classmethod from_grouping(grouping: Dict[Hashable, List[tuple]], level_names: Sequence[str] = ('piece_group', 'corpus', 'piece'), sort: bool = False, raise_if_multiple_membership: bool = False) Self[source]#

Creates a DimcatIndex from a dictionary of piece groups.

Parameters:
  • grouping – A dictionary where keys are group names and values are lists of index tuples.

  • level_names – Names for the levels of the MultiIndex, i.e. one for the group level and one per level in the tuples.

  • sort – By default the returned MultiIndex is not sorted. Set True to enable sorting.

  • raise_if_multiple_membership – If True, raises a ValueError if a member is in multiple groups.

classmethod from_index(index: MultiIndex, **kwargs) Self[source]#

Create a DimcatIndex from a dataframe index.

classmethod from_resource(resource: dimcat.data.resources.dc.DimcatResource | frictionless.resource.resource.Resource, index_col: Optional[Union[int, str, List[int | str]]] = None) Self[source]#

Create a DimcatIndex from a frictionless Resource.

classmethod from_tuples(tuples: Iterable[tuple], level_names: Sequence[str]) Self[source]#
get_level_values_to_drop(drop_values: Union[str, Number, bool, Iterable[Union[str, Number, bool]]], keep_values: Union[str, Number, bool, Iterable[Union[str, Number, bool]]], level: int | str) Tuple[Set[Hashable], Set[Hashable]][source]#
property index: IX#
property names: List[str]#
property piece_level_position: Optional[int]#

The position of the piece level in the index, or None if the index has no piece level.

sample(n: int) Self[source]#

Return a random sample of n elements.

to_resource(**kwargs) DimcatResource[source]#

Create a DimcatResource from this index.

class dimcat.data.resources.dc.DimcatResource(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Resource, Generic[D]

Data object wrapping a dataframe. The dataframe’s metadata are stored as a frictionless.Resource, that can be used for serialization and (lazy) deserialization.

Every serialization of a DimcatResource (e.g. to store it as a config) requires that the dataframe was either originally read from disk or, otherwise, that it be stored to disk. The behaviour depends on whether the resource is part of a package or not.

Standalone resource (rare case)#

If the resource is not part of a package, serializing it results in two files on disk:

  • the dataframe stored as <basepath>/<name>.tsv

  • the frictionless descriptor <basepath>/<name>.resource.json

where <name> defaults to resource_name unless filepath is specified. The serialization has the shape

{
    "dtype": "DimcatResource",
    "resource": "<name>.resource.json",
    "basepath": "<basepath>"
}

A standalone resource can be instantiated in the following ways:

  • DimcatResource(): Creates an empty DimcatResource for setting the .df attribute later. If no basepath is specified, the current working directory is used if the resource is to be serialized.

  • DimcatResource.from_descriptor(descriptor_path): The frictionless descriptor is loaded from disk. Its directory is used as basepath. descriptor_path is expected to end in “resource.[json|yaml]”.

  • DimcatResource.from_dataframe(df=df, resource_name, basepath): Creates a new DimcatResource from a dataframe. If basepath is not specified, the current working directory is used if the resource is to be serialized.

  • DimcatResource.from_resource(resource=DimcatResource): Creates a DimcatResource from an existing one by copying the fields it specifies.

Resource in a package (common case)#

A DimcatResource “knows” that it is part of a package if its filepath ends on .zip. In that case, the DimcatPackage will take care of the serialization and not store an individual resource descriptor.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
align_with_grouping(grouping: dimcat.data.resources.dc.DimcatIndex | pandas.core.indexes.multi.MultiIndex, sort_index=True) D[source]#

Aligns the resource with a grouping index. In the typical case, the grouping index will come with the levels [“<grouping_name>”, “corpus”, “piece”] and the result will be aligned such that every group contains the resource’s sub-dataframes for the included pieces. This is like join_on_index() with the difference that align_with_grouping() is sensitive to the presence of “piece” index levels and returns a dataframe, whereas join_on_index() returns a new Resource and makes no assumptions on particular levels.

apply_slice_intervals(slice_intervals: dimcat.data.resources.dc.SliceIntervals | pandas.core.indexes.multi.MultiIndex) DataFrame[source]#
apply_step(step: StepSpecs | List | Tuple) DO[source]#
apply_step(*step: StepSpecs) DO

Applies one or several pipeline steps to this resource. For backward compatibility, when only a single argument is passed, the method accepts it to be a list or tuple of step specs, too.

property column_schema: Schema#
property dataframe: D#

Returns the dataframe underlying this resource, without applying any formatting.

property default_groupby: List[str]#
property df: D#

Returns the dataframe underlying this resource, applying the current format, if set.

extract_feature(feature: Union[Feature, Type[Feature], DimcatConfig, MutableMapping, FeatureName, str], new_name: Optional[str] = None) F[source]#
property extractable_features: Tuple[FeatureName, ...]#
property field_names: List[str]#

The names of the fields in the resource’s schema.

filter_index_level(keep_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, drop_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, level: int | str = 0, drop_level: Optional[bool] = None) Self[source]#

Returns a copy of the resource with only those rows where the given level has desired values.

Parameters:
  • keep_values – One or several values to keep (dropping the rest). If a value is specified both for keeping and dropping, it is dropped.

  • drop_values – One or several values to drop.

  • level – Which index level to filter on.

  • drop_level – Boolean specifies whether to keep the filtered level or to drop it. The default (None) corresponds to automatic behaviour, where the level is dropped if only one value remains, otherwise kept.

Returns:

A copy of the resource with only those rows where the given level has desired values.

property format: None#
format_dataframe(format=None)[source]#

Format the resource dataframe according to the current format or the one specified. This method is called by the df property, but not by the dataframe property.

property formatted_column: Optional[str]#

A secondary value column that represents the value_column in a different format. If it hasn’t been set, it defaults to _default_formatted_column, falling back to value_column.

classmethod from_dataframe(df: D, resource_name: str, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource from a dataframe, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • df – Dataframe to create the resource from.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Where to store serialization data and its descriptor by default. If resource is a filepath, its directory is used.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_descriptor(descriptor: dict | frictionless.resource.resource.Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource by loading its frictionless descriptor from disk. The descriptor’s directory is used as basepath. descriptor_path is expected to end in .resource.json.

Parameters:
  • descriptor – Descriptor corresponding to a frictionless resource descriptor.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Where to store serialization data and its descriptor by default.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_descriptor_path(descriptor_path: str, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor file on disk.

Parameters:
  • descriptor_path – Absolute path where the JSON/YAML descriptor is located.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs: Optional[bool]) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_index(index: dimcat.data.resources.dc.DimcatIndex | pandas.core.indexes.multi.MultiIndex, resource_name: str, basepath: Optional[str] = None, descriptor_filename: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None) Self[source]#
classmethod from_resource(resource: Resource, descriptor_filename: Optional[str] = None, resource_name: Optional[str] = None, basepath: Optional[str] = None, auto_validate: Optional[bool] = None, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Where to store serialization data and its descriptor by default. If resource is a filepath, its directory is used.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_resource_and_dataframe(resource: Resource, df: D, descriptor_filename: Optional[str] = None, **kwargs) Self[source]#

Create a DimcatResource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • **kwargs – Init arguments to override.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, **kwargs) Self[source]#

Create a DimcatResource from the path to a (tabular) resource file. Currently, only TSV files are supported and they are expected to contain at least the columns “corpus” and “piece”, which are used as index.

get_dataframe(index_col: Optional[Union[int, str, Tuple[int | str]]] = None, usecols: Optional[Union[int, str, Tuple[int | str]]] = None) D[source]#

Load the dataframe from disk based on the descriptor’s normpath. This does not change the resource’s status.

Parameters:
  • index_col – Can be used to override the primary_key(s) specified in the resource’s schema. Value(s) can be column name(s) or column position(s), or both.

  • usecols – If only a subset of the fields specified in the resource’s schema is to be loaded, the names or positions of the subset.

Returns:

The dataframe or DimcatResource.

get_default_analysis() Rs[source]#

Returns the default analysis of the resource.

classmethod get_default_column_names(include_context_columns: bool = True) List[str][source]#

Returns the default column names for a DimcatResource.

get_grouping_levels(smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE) List[str][source]#

Returns the levels of the grouping index, i.e., all levels until and including ‘piece’ or ‘slice’.

get_index() DimcatIndex[source]#

Returns the index of the resource based on the primaryKey of the frictionless.Schema.

get_interval_index(round: Optional[int] = None, level_name: Optional[str] = None) IntervalIndex[source]#

Returns a pandas.IntervalIndex object based on the result of get_time_spans().

Parameters:
  • round – Pass an integer if you want to round the interval positions to so many decimals.

  • level_name – Name of the new level containing intervals. Automatically created if not specified.

get_level_names() List[str][source]#

Returns the level names of the resource’s index.

get_normpath(set_default_if_missing=False) str[source]#
get_piece_index(max_levels: int = 2) PieceIndex[source]#

Returns the PieceIndex of the resource based on get_index. That is, an index whose right-most level is unique and called piece, preceded by the index levels to its left, up to max_levels levels in total.

Parameters:

max_levels – By default, the number of levels is limited to the default 2, (‘corpus’, ‘piece’).

Returns:

An index of the pieces described by the resource.

get_slice_intervals(round: Optional[int] = None, level_name: Optional[str] = None, drop_levels: Optional[Literal[False], str | int | Iterable[str | int]] = -1) SliceIntervals[source]#

Returns a SliceIntervals object based on the result of get_time_spans(). Effectively, this is this resource’s DimcatIndex with an additional level containing the time spans of the events represented by the resource’s rows. This object can be used to slice any other resource that has pieces in common.

Parameters:
  • round – Pass an integer if you want to round the interval positions to so many decimals.

  • level_name – Name of the new level containing intervals. Automatically created if not specified.

  • drop_levels – Defaults to -1, meaning that the last level of the original index (usually called ‘i’) is dropped before appending the new interval level (i.e., level ‘i’ is replaced).

Returns:

get_time_spans(round: Optional[int] = None, to_float: bool = True, dropna: bool = False) D[source]#

Returns a dataframe with start (‘left’) and end (‘end’) positions of the events represented by this resource’s rows.

Parameters:
  • round – To how many decimal places to round the intervals’ boundary values. Setting a value automatically sets to_float=True.

  • to_float – Set to True to turn the time span values into floats.

Returns:

property has_distinct_formatted_column: bool#

Returns False if no formatted_column is specified or it is identical with value_column.

property innerpath: str#

The innerpath is the resource_name plus the extension .tsv and is used as filename within a .zip archive.

property is_empty: bool#

Whether or not this resource has data available (yet).

property is_loaded: bool#
property is_valid: bool#

Returns the result of a previous validation or, if the resource has not been validated before, validates it now. Importantly, this property assumes serialized resources to be valid. If you want to actively validate the resource, use validate() instead.

join_on_index(index: Union[DimcatIndex, IX], how: Literal['left', 'right', 'inner', 'outer', 'cross'] = 'inner') Self[source]#

A convenient way to align a resource with the index of another one through a join operation.

Parameters:
  • index – The index that this resource will be aligned with.

  • how

    The type of join to perform.

    • ’inner’ (default): index of the new resource will contain only keys present in index, and each will be repeated as many times as it appears in index.

Returns:

A new resource.

load(force_reload: bool = False) None[source]#

Tries to load the data from disk into RAM. If successful, the .is_loaded property will be True. If the resource hadn’t been loaded before, its .status property will be updated.

make_bar_plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_bar_plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_bar_plot().

Returns:

The figure generated by calling .make_bar_plot() on the last step’s result.

make_bubble_plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_bubble_plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_bubble_plot().

Returns:

The figure generated by calling .make_bubble_plot() on the last step’s result.

make_pie_chart(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_pie_chart() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_pie_chart().

Returns:

The figure generated by calling .make_pie_chart() on the last step’s result.

property metadata: Metadata#
plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

set_basepath(basepath: str, reconcile: bool = False) None[source]#
set_dataframe(df)[source]#

Tries setting the dataframe of this feature. This method should be called exactly once after instantiating the feature. The method checks for potential problems first, then calls _adapt_newly_set_df(), assuming that the dataframe can be mutated safely, i.e. it is a copy. If auto_validate is True, the newly set dataframe will be validated.

store_dataframe(overwrite=False, validate: bool = True) None[source]#

Stores the dataframe and its descriptor to disk based on the resource’s configuration.

Parameters:
  • overwrite

  • validate

Raises:

RuntimeError – If the resource is frozen or does not contain a dataframe or if the file exists already.

store_resource(basepath: Optional[str] = None, name: Optional[str] = None, overwrite=True) Optional[str][source]#

Stores the resource as a frictionless resource consisting of a TSV file containing the data and an accompanying descriptor file (default: JSON).

Parameters:
  • basepath – The basepath to write the resource to. Defaults to the resource’s basepath.

  • name – The name of the resource. Defaults to the resource’s name.

  • overwrite – Whether to overwrite existing files. Defaults to True.

Returns:

The filepath of the stored descriptor.

subselect(tuples: Union[DimcatIndex, Iterable[tuple]], levels: Optional[Union[int, str, List[int | str]]] = None) DataFrame[source]#

Returns a copy of a subselection of the dataframe based on the union of its index tuples (or subtuples) and the given tuples.

summary_dict() dict[source]#

Returns a summary of the object.

update_default_groupby(new_level_name: str) None[source]#

Updates the value of default_groupby by prepending the new level name to it.

validate(raise_exception: bool = False, only_if_necessary: bool = False) Optional[Report][source]#

Validate the resource’s data against its descriptor.

Parameters:
  • raise_exception – (default False) Pass True to raise if the resource is not valid.

  • only_if_necessary – (default False) Pass True to skip validation if the resource has already been validated or is assumed to be valid because it exists on disk.

Returns:

None if no validation took place (e.g. because resource is empty or only_if_necessary was True). Otherwise, frictionless report resulting from validating the data against the column_schema.

Raises:

FrictionlessException – If the resource is not valid and raise_exception is True.

property value_column: Optional[str]#

Name of the column containing representative values for this resource. If not set, it defaults to _default_value_column, falling back to the last element of _feature_columns, if defined.

class dimcat.data.resources.dc.Feature(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: DimcatResource

A feature is a DimcatResource that represents a single feature of a piece of music, generally some subset and/or transformation of a Facet. A feature resource usually represents one object per row and has a defined temporality (‘quarterbeats’, at the very least) relative to the scores in question.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
get_available_column_names(index_levels: bool = False, context_columns: bool = False, auxiliary_columns: bool = False, convenience_columns: bool = False, feature_columns: bool = False)[source]#

Returns the column names that are available on the resource.

property playthrough: Playthrough#
class dimcat.data.resources.dc.IndexField(*, load_default: ~typing.Any = <marshmallow.missing>, dump_default: ~typing.Any = <marshmallow.missing>, data_key: ~typing.Optional[str] = None, attribute: ~typing.Optional[str] = None, validate: ~typing.Optional[~typing.Union[~typing.Callable[[~typing.Any], ~typing.Any], ~typing.Iterable[~typing.Callable[[~typing.Any], ~typing.Any]]]] = None, required: bool = False, allow_none: ~typing.Optional[bool] = None, load_only: bool = False, dump_only: bool = False, error_messages: ~typing.Optional[dict[str, str]] = None, metadata: ~typing.Optional[~typing.Mapping[str, ~typing.Any]] = None)[source]#

Bases: Field

A marshmallow field for DimcatIndex objects.

class dimcat.data.resources.dc.PieceIndex(index: Optional[IX] = None)[source]#

Bases: DimcatIndex[IX]

A unique DimcatIndex where the last (i.e. right-most) level is named piece.

classmethod from_index(index: Union[DimcatIndex[IX], IX], recognized_piece_columns: Optional[Iterable[str]] = None, max_levels: int = 2) Self[source]#

Create a PieceIndex from another index.

classmethod from_resource(resource: dimcat.data.resources.dc.DimcatResource | frictionless.resource.resource.Resource, index_col: Optional[Union[int, str, List[int | str]]] = None, recognized_piece_columns: Optional[Iterable[str]] = None, max_levels: int = 2) Self[source]#

Create a PieceIndex from a frictionless Resource.

classmethod from_tuples(tuples: Iterable[tuple], level_names: Sequence[str] = ('corpus', 'piece')) Self[source]#
class dimcat.data.resources.dc.Playthrough(value)[source]#

Bases: FriendlyEnum

Different types of behaviour regarding repeat structures encoded in score-related data.

SINGLE:

(default) Represent data for a “single playthrough”. If first and second endings are present, the first (third, etc.) are dropped to exclude incorrect transitions and adjacencies between the first- and second-ending bars.

RAW: Leave data as-is.

RAW = 'RAW'#
SINGLE = 'SINGLE'#
class dimcat.data.resources.dc.SliceIntervals(index: Optional[IX] = None, basepath: Optional[str] = None)[source]#

Bases: DimcatIndex

class dimcat.data.resources.dc.UnitOfAnalysis(value)[source]#

Bases: LowercaseEnum

Serves to specify a grouping of index levels that may depend on the object type and history.

SLICE: Stands for all levels down to the last slice level. If no Slicer has been applied it corresponds to PIECE.

PIECE: All levels down to the piece level.

GROUP: Current default_groupby based on previously applied Groupers.

CORPUS_GROUP: Like GROUP, except the first grouping level is guaranteed to be ‘corpus’.

CORPUS_GROUP = 'CORPUS_GROUP'#
GROUP = 'GROUP'#
PIECE = 'PIECE'#
SLICE = 'SLICE'#

dimcat.data.resources.facets module#

class dimcat.data.resources.facets.AnnotationsFacet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Facet

A facet that represents one or several annotation layers.

class dimcat.data.resources.facets.ControlsFacet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Facet

A facet that represents ‘control events’ in MEI parlance; i.e. elements that depend on events to exist, such as dynamics, ties, phrase marks, pedal marks, etc. Controls define ‘how’ something is to be performed.

class dimcat.data.resources.facets.EventsFacet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Facet

A facet that represents sounding events and/or rests. Events specify ‘what’ is to be performed.

class dimcat.data.resources.facets.Facet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: DimcatResource

A facet is one aspect of a score that can sensibly be ordered and conceived of along the score’s timeline. The format of a facet depends on the score format and tries to stay as close to the original as possible, using only the necessary minimum of standardization. Content and format of a facet define which features can be extracted, based on which configuration options.

class dimcat.data.resources.facets.MuseScoreChords(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: MuseScoreFacet, ControlsFacet

class dimcat.data.resources.facets.MuseScoreFacet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Facet

A single facet of a MuseScore package as created by the ms3 MuseScore parsing library. Contains a single TSV facet of one or several corpora. Naming format <name>.<facet>[.tsv].

classmethod from_descriptor(descriptor: dict | dimcat.data.resources.base.Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None) Self[source]#

Create a DimcatResource by loading its frictionless descriptor from disk. The descriptor’s directory is used as basepath. descriptor_path is expected to end in .resource.json.

Parameters:
  • descriptor – Descriptor corresponding to a frictionless resource descriptor.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Where to store serialization data and its descriptor by default.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

class dimcat.data.resources.facets.MuseScoreFacetName(value)[source]#

Bases: ObjectEnum

An enumeration.

MuseScoreChords = 'MuseScoreChords'#
MuseScoreFacet = 'MuseScoreFacet'#
MuseScoreHarmonies = 'MuseScoreHarmonies'#
MuseScoreMeasures = 'MuseScoreMeasures'#
MuseScoreNotes = 'MuseScoreNotes'#
class dimcat.data.resources.facets.MuseScoreHarmonies(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: MuseScoreFacet, AnnotationsFacet

class dimcat.data.resources.facets.MuseScoreMeasures(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: MuseScoreFacet, StructureFacet

class dimcat.data.resources.facets.MuseScoreNotes(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: MuseScoreFacet, EventsFacet

class dimcat.data.resources.facets.StructureFacet(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Facet

A facet that describes structural elements of a score, pertaining to its timeline, segmentations, or its repeat structure.

dimcat.data.resources.facets.add_chord_tone_intervals(feature_df)[source]#

Turns ‘chord_tones’ column into one or two additional columns, depending on whether a ‘root’ column is present, where the chord_tones (which come as fifths) are represented as strings representing intervals over the bass_note and above the root, if present.

dimcat.data.resources.facets.add_chord_tone_scale_degrees(feature_df)[source]#

Turns ‘chord_tones’ column into multiple scale-degree columns.

dimcat.data.resources.facets.chord_tones2interval_structure(fifths: Iterable[int], reference: Optional[int] = None) Tuple[str][source]#

The fifths are interpreted as intervals expressing distances from the local tonic (“neutral degrees”). The result will be a tuple of strings that express the same intervals with respect to the given reference (neutral degree), removing unisons. If no reference is specified, the first degree (usually, the bass note) is used as such.

dimcat.data.resources.facets.extend_cadence_feature(feature_df)[source]#
dimcat.data.resources.facets.extend_harmony_feature(feature_df)[source]#

Requires previous application of transform_keys_feature().

dimcat.data.resources.facets.extend_keys_feature(feature_df)[source]#
dimcat.data.resources.facets.get_index_intervals_for_phrases(harmony_labels: D, group_cols: List[str], n_ante: int = 0, n_post: int = 0, logger: Optional[Logger] = None) Dict[Any, List[Tuple[int, int]]][source]#

Returns a list of slice intervals for selecting the rows belonging to a phrase.

dimcat.data.resources.facets.make_chord_col(df: D, cols: Optional[List[str]] = None, name: str = 'chord')[source]#

The ‘chord’ column contains the chord part of a DCML label, i.e. without indications of key, pedal, cadence, or phrase. This function can re-create this column, e.g. if the feature columns were changed. To that aim, the function takes a DataFrame and the column names that it adds together, creating new strings.

dimcat.data.resources.facets.make_raw_phrase_df(feature_df: D, ix_intervals: List[Tuple[int, int, Optional[int], int, int]], logger: Optional[Logger] = None)[source]#

Takes the intervals generated by get_index_intervals_for_phrases() and returns a dataframe with two additional index levels, one expressing a running count of phrases used as IDs, and one exhibiting for each phrase between one and four of the phrase_component names (ante, body, codetta, post), where ‘body’ is guaranteed to be present.

dimcat.data.resources.facets.make_take_mask_and_index(ix_intervals: List[Tuple[int, int, Optional[int], int, int]], logger: Logger) Tuple[ndarray[Any, dtype[_ScalarType_co]], ndarray[Any, dtype[_ScalarType_co]], ndarray[Any, dtype[_ScalarType_co]]][source]#

Takes a list of (first_i, start_i, end_i, subsequent_i, stop_i) index positions and turns them into

  • an array of corresponding index positions that can be used as argument for pandas.DataFrame.take()

  • an array of equal length that specifies the corresponding phrase IDs (which come from an integer range)

  • an array of equal length that specifies the corresponding phrase components (ante, body, codetta, post)

dimcat.data.resources.features module#

dimcat.data.resources.features.AUXILIARY_DCML_ANNOTATIONS_COLUMNS = ['label', 'globalkey', 'localkey', 'pedal', 'chord', 'special', 'numeral', 'form', 'figbass', 'changes', 'relativeroot', 'cadence', 'phraseend', 'chord_type', 'globalkey_is_minor', 'localkey_is_minor', 'chord_tones', 'added_tones', 'root', 'bass_note', 'alt_label', 'pedalend', 'placement', 'color', 'color_a', 'color_b', 'color_g', 'color_r']#

These columns are included in sub-features of HarmonyLabels to enable more means of investigation, such as groupers.

class dimcat.data.resources.features.Annotations(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Feature

class dimcat.data.resources.features.Articulation(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Feature

class dimcat.data.resources.features.BassNotes(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format: NotesFormat = BassNotesFormat.INTERVAL, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: HarmonyLabels

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property format: BassNotesFormat#
property formatted_column: str#

A secondary value column that represents the value_column in a different format. If it hasn’t been set, it defaults to _default_formatted_column, falling back to value_column.

class dimcat.data.resources.features.BassNotesFormat(value)[source]#

Bases: FriendlyEnum

Format to display the bass notes in. INTERVAL stands for the interval between the bass note and the local tonic, FIFTHS expresses that same interval as a number of fifths, SCALE_DEGREE expresses the bass note as a scale degree depending on the local key (i.e. scale degrees 3, 6, 7 are minor intervals in minor and major intervals in major), whereas SCALE_DEGREE_MAJOR and SCALE_DEGREE_MINOR express the bass note as a scale degree independent of the local key.

FIFTHS = 'FIFTHS'#
INTERVAL = 'INTERVAL'#
SCALE_DEGREE = 'SCALE_DEGREE'#
SCALE_DEGREE_MAJOR = 'SCALE_DEGREE_MAJOR'#
SCALE_DEGREE_MINOR = 'SCALE_DEGREE_MINOR'#
class dimcat.data.resources.features.CadenceLabelFormat(value)[source]#

Bases: FriendlyEnum

Format to display the cadence labels in. RAW stands for ‘as-is’. TYPE omits the subtype, reducing more specific labels, whereas SUBTYPE displays subtypes only, omitting all labels that do not specify one.

RAW = 'RAW'#
SUBTYPE = 'SUBTYPE'#
TYPE = 'TYPE'#
class dimcat.data.resources.features.CadenceLabels(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format: NotesFormat = CadenceLabelFormat.RAW, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: DcmlAnnotations

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property format: CadenceLabelFormat#
class dimcat.data.resources.features.DcmlAnnotations(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Annotations

dimcat.data.resources.features.HARMONY_CONVENIENCE_COLUMNS = ['root_roman', 'relativeroot_resolved', 'effective_localkey', 'effective_localkey_resolved', 'effective_localkey_is_minor', 'pedal_resolved', 'chord_and_mode', 'chord_reduced', 'chord_reduced_and_mode', 'applied_to_numeral', 'numeral_or_applied_to_numeral', 'intervals_over_bass', 'intervals_over_root', 'scale_degrees', 'scale_degrees_and_mode', 'scale_degrees_major', 'scale_degrees_minor']#

These columns are included in all Annotations features that grant full access to DCML harmony labels. First and foremost, this includes HarmonyLabels, but also PhraseAnnotations and derivatives.

class dimcat.data.resources.features.HarmonyLabels(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format: HarmonyLabelsFormat = HarmonyLabelsFormat.ROMAN, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: DcmlAnnotations

A sub-feature of DcmlAnnotations which does not include any non-chord rows.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property format: HarmonyLabelsFormat#
property formatted_column: str#

A secondary value column that represents the value_column in a different format. If it hasn’t been set, it defaults to _default_formatted_column, falling back to value_column.

class dimcat.data.resources.features.HarmonyLabelsFormat(value)[source]#

Bases: FriendlyEnum

Format to display the chord labels in. ROMAN stands for Roman numerals, ROMAN_REDUCED for the same numerals without any suspensions, alterations, additions, etc.

ROMAN = 'ROMAN'#
ROMAN_REDUCED = 'ROMAN_REDUCED'#
SCALE_DEGREE = 'SCALE_DEGREE'#
SCALE_DEGREE_MAJOR = 'SCALE_DEGREE_MAJOR'#
SCALE_DEGREE_MINOR = 'SCALE_DEGREE_MINOR'#
dimcat.data.resources.features.KEY_CONVENIENCE_COLUMNS = ['globalkey_is_minor', 'localkey_is_minor', 'globalkey_mode', 'localkey_mode', 'localkey_resolved', 'localkey_and_mode']#

These columns are computed by default for all Annotations that include keys, where global keys are given as note names, and local keys are given as Roman numerals. In both cases, lowercase strings are interpreted as minor keys.

class dimcat.data.resources.features.KeyAnnotations(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: DcmlAnnotations

class dimcat.data.resources.features.Measures(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Feature

class dimcat.data.resources.features.Metadata(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Feature

apply_slice_intervals(slice_intervals: dimcat.data.resources.dc.SliceIntervals | pandas.core.indexes.multi.MultiIndex) DataFrame[source]#
get_composition_years(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, name: str = 'mean_composition_year')[source]#
get_corpus_names(func: ~typing.Callable[[str], str] = <functools._lru_cache_wrapper object>)[source]#

Returns the corpus names in chronological order, based on their pieces’ mean composition years. If func is specified, the function will be applied to each corpus name. This is useful for prettifying the names, e.g. by removing underscores.

property metadata: Self#
class dimcat.data.resources.features.Notes(merge_ties: bool = False, weight_grace_notes: float = 0.0, resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format: NotesFormat = NotesFormat.NAME, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: Feature

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property format: NotesFormat#
property merge_ties: bool#
property weight_grace_notes: float#
class dimcat.data.resources.features.NotesFormat(value)[source]#

Bases: FriendlyEnum

Format to display the notes in. NAME stands for note names, FIFTHS for the number of fifths from C, and MIDI for MIDI numbers.

FIFTHS = 'FIFTHS'#
MIDI = 'MIDI'#
NAME = 'NAME'#
class dimcat.data.resources.features.PhraseAnnotations(n_ante: int = 0, n_post: int = 0, resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: HarmonyLabels

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
get_phrase_data(columns: Union[str, List[str]] = 'label', components: Union[PhraseComponentName, Literal['phrase'], Iterable[PhraseComponentName]] = 'body', query: Optional[str] = None, reverse: bool = False, level_name: str = 'i', wide_format: bool = False, drop_levels: Union[bool, int, str, Iterable[str | int]] = False, drop_duplicated_ultima_rows: Optional[bool] = None) PhraseData[source]#
Parameters:
  • columns – Column(s) to include in the result.

  • components – Which of the four phrase components to include, ∈ {‘ante’, ‘body’, ‘codetta’, ‘post’}. For convenience, the string ‘phrase’ is also accepted, which is equivalent to [“body”, “codetta”] and drop_duplicated_ultima_rows=True.

  • query – A convenient way to include only those phrases in the result that match the criteria formulated in the string query. A query is a string and generally takes the form “<column_name> <operator> <value>”. Several criteria can be combined using boolean operators, e.g. “localkey_mode == ‘major’ & label.str.contains(‘/’)”. This option is particularly interesting when used on PhraseLabels because it enables queries based on the properties of phrases such as “body_n_modulations == 0 & end_label.str.contains(‘IAC’)”. For the columns containing tuples, you can use a special function to filter those rows that contain any of the specified values: “@tuple_contains(body_chords, ‘V(94)’, ‘V(9)’, ‘V(4)’)”.

  • reverse – Pass True to reverse the order of harmonies so that each phrase’s last label comes first.

  • level_name – Defaults to ‘i’, which is the name of the original level that will be replaced by this new one. The new one represents the individual integer range for each phrase, starting at 0.

  • wide_format – Pass True to unstack the result so that the columns for each phrase are concatenated side by side.

  • drop_levels – Can be a boolean or any level specifier accepted by pandas.MultiIndex.droplevel(). If False (default), all levels are retained. If True, only the phrase_id level and the level_name are retained. In all other cases, the indicated (string or integer) value(s) must be valid and cause one of the index levels to be dropped. level_name cannot be dropped. Dropping ‘phrase_id’ will likely lead to an exception if a PhraseData object will be displayed in WIDE format.

  • drop_duplicated_ultima_rows – The default behaviour (when None) depends on the value of components: If you set components='phrase', this setting defaults to True, otherwise to False; where False corresponds to the default where each phrase body ends on a duplicate of the phrase’s ultima label, with zero duration, enabling the creation of PhraseData containing only phrase bodies (i.e., components='body'), without losing information about the ultima label. When analyzing entire phrases, however, these duplicate rows may be unwanted and can be dropped by setting this option to True.

Returns:

Dataframe representing partial information on the selected phrases in long or wide format.

property phrase_df: D#

Alias for df().

class dimcat.data.resources.features.PhraseComponentName(value)[source]#

Bases: FriendlyEnum

An enumeration.

ANTE = 'ante'#
BODY = 'body'#
CODETTA = 'codetta'#
POST = 'post'#
class dimcat.data.resources.features.PhraseComponents(n_ante: int = 0, n_post: int = 0, resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: PhraseAnnotations

property phrase_df: D#

Returns the df that corresponds to the PhraseAnnotations feature from which the PhraseComponents were derived.

class dimcat.data.resources.features.PhraseLabels(n_ante: int = 0, n_post: int = 0, resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: PhraseAnnotations

property phrase_df: D#

Returns the df that corresponds to the PhraseAnnotations feature from which the PhraseLabels were derived.

dimcat.data.resources.features.condense_components(raw_phrase_df: D) D[source]#
dimcat.data.resources.features.condense_phrases(raw_phrase_df: D) D[source]#
dimcat.data.resources.features.extend_bass_notes_feature(feature_df)[source]#

Requires previous application of transform_keys_feature().

dimcat.data.resources.features.extend_notes_feature(feature_df)[source]#
dimcat.data.resources.features.make_sequence_non_repeating(sequence: S) tuple[source]#

Returns values in the given sequence without immediate repetitions. Fails if the sequence contains NA.

dimcat.data.resources.features.merge_tied_notes(feature_df, groupby=None)[source]#
dimcat.data.resources.features.tuple_contains(series_with_tuples: S, *values: Hashable)[source]#

Function that can be used in queries passed to PhraseLabels.filter_phrase_data() to select rows in which the column’s tuples contain any of the given values.

Example

dimcat.data.resources.results module#

class dimcat.data.resources.results.CadenceCounts(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Counts

plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

property x_column: str#

Name of the result column from which to create one marker per distinct value to show over the x-axis.

class dimcat.data.resources.results.Counts(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Result

class dimcat.data.resources.results.CulledPrevalenceMatrix(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: _CulledMatrixMixin, PrevalenceMatrix

class dimcat.data.resources.results.CulledRelativePrevalenceMatrix(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: _CulledMatrixMixin, RelativePrevalenceMatrix

class dimcat.data.resources.results.Durations(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Result

class dimcat.data.resources.results.GroupwisePrevalenceMatrix(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: RelativePrevalenceMatrix

class dimcat.data.resources.results.InverseDocumentFrequencyFlavor(value)[source]#

Bases: FriendlyEnum

Selectors for the formulas listed under https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency.

MAX = 'max'#
PROBABILISTIC = 'probabilistic'#
SMOOTH = 'smooth'#
VANILLA = 'vanilla'#
class dimcat.data.resources.results.NgramTable(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Result

A side-by-side concatenation of a feature with one or several shifted versions of itself, so that each row contains both the original values and those of the n-1 following rows, concatenated on the right. This table keeps full flexibility in terms of how you want to create NgramTuples from it.

compute_information_gain(*ngram_component_columns: Optional[str | Tuple[str, ...]], split: int | Tuple[str_or_sequence, str_or_sequence] = -1, join_str: Optional[str | bool] = None, fillna: Optional[Hashable] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, group_cols: Optional[UnitOfAnalysis | str | Iterable[str]] = UnitOfAnalysis.GROUP, reverse: bool = False) S | float[source]#

Computes the gain in information about (reduction in entropy of) the consequent from knowing the antecedent. This can be interpreted as a measure of how much we know on average about the consequent given an antecedent. This method provides a shortcut to calling TransitionTable.compute_information_gain on the result of get_transitions().

Parameters:
  • ngram_component_columns – One or several column specifications. If zero or one are passed, the same specification will be used for each n-gram component. The number of specifications can be at most the number of components (‘a’, ‘b’, etc.) that this NgramTable contains. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • split – Relevant only for NgramAnalyzer with n > 2: Then the value can be modified to decide how many components are to be part of the antecedent (context, left) and the consequent (target, right). Defaults to -1, i.e. the last component is used as consequent. This is a useful default for evaluations where the (n-1) previous components are the context for predicting the next one. If you pass an integer within ±[1, n-1], the split will be performed after the indicated component and any side (left or right) that includes only a single component will contain single values (tuples or strings). To override this automatic behaviour, you may instead pass a tuple that indicates the split in terms of column names (‘a’, ‘b’, etc.) in the way that tuples become tuples, individual strings not. Example: ((‘a’, ‘b’), ‘c’) corresponds to the default behaviour, where the left side has tuples, the right side not. ((‘a’, ‘b’), (‘c’,)), on the other hand, would turn the right-hand side into 1-element tuples, too.

  • join_str – Parameter passed to make_ngram_table(). It determines whether the antecedent and consequent columns contain [tuples of] tuples (the default) or [tuples of] strings. If n == 2, each cell is of type (tuple|str), if n > 2, it’s Tuple[(tuple|str)].

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last bigram in a sequence ends on (a tuple or string concatenation of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. In the case of bigrams, there is only one consequent component. However, when dealing with bigrams constructed by splitting higher-level grams, you can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • group_cols – Defines the groups for which to compute the information gain.

  • reverse – Reverse the argument: How much more do we know about the antecedent when we know the consequent?

Returns:

If group_cols is None or empty or resolves to empty (the default when no groupers have been applied), the resulting value is a float expressing the difference in entropy. Otherwise, when a grouping is performed, the result is a Series of floats.

property format: NgramTableFormat#
get_default_analysis() Transitions[source]#

Returns the default analysis of the resource.

get_grouping_levels(smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE) List[str][source]#

Returns the levels of the grouping index, i.e., all levels up to and including ‘piece’.

get_transitions(*ngram_component_columns: Optional[str | Tuple[str, ...]], split: int | Tuple[str_or_sequence, str_or_sequence] = -1, join_str: Optional[str | bool] = None, fillna: Optional[Hashable] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, group_cols: Optional[UnitOfAnalysis | str | Iterable[str]] = UnitOfAnalysis.GROUP, feature_columns: Optional[Tuple[str, str]] = None) Transitions[source]#

Get a Series that counts for each context the number of transitions to each possible following element.

Parameters:
  • ngram_component_columns – One or several column specifications. If zero or one are passed, the same specification will be used for each n-gram component. The number of specifications can be at most the number of components (‘a’, ‘b’, etc.) that this NgramTable contains. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • split – Relevant only for NgramAnalyzer with n > 2: Then the value can be modified to decide how many components are to be part of the antecedent (context, left) and the consequent (target, right). Defaults to -1, i.e. the last component is used as consequent. This is a useful default for evaluations where the (n-1) previous components are the context for predicting the next one. If you pass an integer within ±[1, n-1], the split will be performed after the indicated component and any side (left or right) that includes only a single component will contain single values (tuples or strings). To override this automatic behaviour, you may instead pass a tuple that indicates the split in terms of column names (‘a’, ‘b’, etc.) in the way that tuples become tuples, individual strings not. Example: ((‘a’, ‘b’), ‘c’) corresponds to the default behaviour, where the left side has tuples, the right side not. ((‘a’, ‘b’), (‘c’,)), on the other hand, would turn the right-hand side into 1-element tuples, too.

  • join_str – Parameter passed to make_ngram_table(). It determines whether the antecedent and consequent columns contain [tuples of] tuples (the default) or [tuples of] strings. If n == 2, each cell is of type (tuple|str), if n > 2, it’s Tuple[(tuple|str)].

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last bigram in a sequence ends on (a tuple or string concatenation of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. In the case of bigrams, there is only one consequent component. However, when dealing with bigrams constructed by splitting higher-level grams, you can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • group_cols – Determines for which grouping the transitions are counted and normalized.

  • feature_columns – Defaults to [“antecedent”, “consequent”]. Pass a List with two strings to change.

Returns:

Dataframe with columns ‘count’ and ‘proportion’, showing, for each sequence of (n-1) previous elements (index level 0), the count and proportion of transitions to each possible following element (index level 1).

make_bigram_df(*ngram_component_columns: Optional[str | Tuple[str, ...]], split: int | Tuple[str_or_sequence, str_or_sequence] = -1, join_str: Optional[bool | str | Tuple[str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) D[source]#

Reduce the specified n-gram components to two columns, called ‘antecedent’ and ‘consequent’. For NgramTables produced by a BigramAnalyzer or by an NgramAnalyzer(n=2), the result is equivalent to make_ngram_table, just with renamed columns. For higher n, the components are split into an antecedent and a consequent part based on the split parameter, each expressed as tuples (default, where join_str is None) or strings. If the result corresponds to n=2 (i.e., neither antecedent nor consequent combines several n-gram components), the columns contain strings or tuples (depending on whether join_str is specified or not); otherwise, both columns contain tuples thereof.

Parameters:
  • ngram_component_columns – One or several column specifications. If zero or one are passed, the same specification will be used for each n-gram component. The number of specifications can be at most the number of components (‘a’, ‘b’, etc.) that this NgramTable contains. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • split – Relevant only for NgramAnalyzer with n > 2: Then the value can be modified to decide how many components are to be part of the antecedent (context, left) and the consequent (target, right). Defaults to -1, i.e. the last component is used as consequent. This is a useful default for evaluations where the (n-1) previous components are the context for predicting the next one. If you pass an integer within ±[1, n-1], the split will be performed after the indicated component and any side (left or right) that includes only a single component will contain single values (tuples or strings). To override this automatic behaviour, you may instead pass a tuple that indicates the split in terms of column names (‘a’, ‘b’, etc.) in the way that tuples become tuples, individual strings not. Example: ((‘a’, ‘b’), ‘c’) corresponds to the default behaviour, where the left side has tuples, the right side not. ((‘a’, ‘b’), (‘c’,)), on the other hand, would turn the right-hand side into 1-element tuples, too.

  • join_str – Parameter passed to make_ngram_table(). It determines whether the antecedent and consequent columns contain [tuples of] tuples (the default) or [tuples of] strings. If n == 2, each cell is of type (tuple|str), if n > 2, it’s Tuple[(tuple|str)].

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last bigram in a sequence ends on (a tuple or string concatenation of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. In the case of bigrams, there is only one consequent component. However, when dealing with bigrams constructed by splitting higher-level grams, you can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • context_columns – Columns preceding the bigram columns for context, such as measure numbers etc. Pass True to use the default context columns or one or several column names to subselect.

Returns:

Like make_ngram_tuples(), but condensed to two columns.

make_bigram_table(*ngram_component_columns: Optional[str | Tuple[str, ...]], split: int | Tuple[str_or_sequence, str_or_sequence] = -1, join_str: Optional[bool | str | Tuple[str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) Self[source]#

Returns the result of make_bigram_df() as a new NgramTable object.

make_bigram_tuples(*ngram_component_columns: Optional[str | Tuple[str, ...]], split: int | Tuple[str_or_sequence, str_or_sequence] = -1, join_str: Optional[bool | str | Tuple[str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, drop_identical: bool = False, n_gram_column_name: str = 'n_gram', context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) NgramTuples[source]#

Get a Resource with a single column that contains bigram tuples, where each element is a tuple or string based on the specified (or default) columns. If this object represents trigrams or higher, it is always tuples of tuples (never of strings). See make_bigram_table() for details.

Parameters:
  • ngram_component_columns – One or several column specifications. If zero or one are passed, the same specification will be used for each n-gram component. The number of specifications can be at most the number of components (‘a’, ‘b’, etc.) that this NgramTable contains. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • split – Relevant only for NgramAnalyzer with n > 2: Then the value can be modified to decide how many components are to be part of the antecedent (context, left) and the consequent (target, right). Defaults to -1, i.e. the last component is used as consequent. This is a useful default for evaluations where the (n-1) previous components are the context for predicting the next one. If you pass an integer within ±[1, n-1], the split will be performed after the indicated component and any side (left or right) that includes only a single component will contain single values (tuples or strings). To override this automatic behaviour, you may instead pass a pair that indicates the split in terms of column names (‘a’, ‘b’, etc.) in the way that tuples become tuples, individual strings not. Example: ((‘a’, ‘b’), ‘c’) corresponds to the default behaviour, where the left side has tuples, the right side not. ((‘a’, ‘b’), (‘c’,)), on the other hand, would turn the right-hand side into 1-element tuples, too.

  • join_str – Parameter passed to make_ngram_table(). It determines whether the antecedent and consequent columns contain [tuples of] tuples (the default) or [tuples of] strings. If n == 2, each cell is of type (tuple|str), if n > 2, it’s Tuple[(tuple|str)].

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last bigram in a sequence ends on (a tuple or string concatenation of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. In the case of bigrams, there is only one consequent component. However, when dealing with bigrams constructed by splitting higher-level grams, you can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • drop_identical – Pass True to drop all tuples where left and right gram are identical.

  • n_gram_column_name – Name of the value_column in the resulting NgramTuples object.

  • context_columns – Columns preceding the bigram columns for context, such as measure numbers etc. Pass True to use the default context columns or one or several column names to subselect.

Returns:

make_ngram_df(*ngram_component_columns: Optional[str | Tuple[str, ...]], n: Optional[int] = None, join_str: Optional[bool | str | Tuple[bool | str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) D[source]#

Reduce the selected columns for the n first n-gram levels a, b, … so that the resulting dataframe contains n columns, each of which contains tuples or strings. You may pass several column specifications to create n-gram components from differing columns, e.g. to evaluate how well one feature predicts another.

Parameters:
  • ngram_component_columns – One or several column specifications. If one (or only the default, None) is passed, the same specification will be used for each n-gram component, otherwise the number of specifications must match n. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • n – Only make columns for the first n n-gram components. If None, use all n-gram levels. Minimum is 2, maximum is the number of n-gram levels determined by the NgramAnalyzer used to create the n-gram table.

  • join_str – By default (None), the resulting columns contain tuples. If you want them to contain strings, you may pass a single specification (bool or string) to use for all n-gram components, or a tuple thereof to use different specifications for each component. True stands for concatenating the tuple values for a given n-gram component separated by “, “ – yielding strings that look like tuples without parentheses. False stands for concatenating without any value in-between the values. If a string is passed, it will be used as the separator between the tuple values.

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last n-1 n-grams in a sequence end on (tuples or string concatenations of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. You can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. In the case of bigrams, there is only one consequent component. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • context_columns – Columns preceding the bigram columns for context, such as measure numbers etc. Pass True to use the default context columns or one or several column names to subselect.

Returns:

make_ngram_table(*ngram_component_columns: Optional[str | Tuple[str, ...]], n: Optional[int] = None, join_str: Optional[bool | str | Tuple[bool | str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) Self[source]#

Returns the result of make_ngram_df as a new NgramTable object.

make_ngram_tuples(*ngram_component_columns: Optional[str | Tuple[str, ...]], n: Optional[int] = None, join_str: Optional[bool | str | Tuple[str, ...]] = None, fillna: Optional[Hashable | Tuple[Hashable, ...]] = None, terminal_symbols: Optional[TerminalSymbol | Hashable, Tuple[TerminalSymbol | Hashable, ...]] = None, drop_identical: bool = False, n_gram_column_name: str = 'n_gram', context_columns: Optional[Literal[True], str, Tuple[str, ...]] = None) NgramTuples[source]#

Get a Resource with a single column that contains n-gram tuples, where each element is a tuple or string based on the specified (or default) columns.

Parameters:
  • ngram_component_columns – One or several column specifications. If one (or only the default, None) is passed, the same specification will be used for each n-gram component, otherwise the number of specifications must match n. Each specification can be None (default feature columns), a single column name, or a tuple of column names.

  • n – Make tuples from the first n n-gram components only. If None, use all n-gram levels. Minimum is 2, maximum is the number of n-gram levels determined by the NgramAnalyzer used to create the n-gram table.

  • join_str – By default (None), the resulting columns contain tuples. If you want them to contain strings, you may pass a single specification (bool or string) to use for all n-gram components, or a tuple thereof to use different specifications for each component. True stands for concatenating the tuple values for a given n-gram component separated by “, “ – yielding strings that look like tuples without parentheses. False stands for concatenating without any value in-between the values. If a string is passed, it will be used as the separator between the tuple values.

  • fillna – Pass a value to replace all missing values with it. Pass a tuple of n values to fill missing values differently for the n components (e.g. (None, ‘’) to fill missing values with empty strings only for the second n-gram component). “” is often a good choice for components for which join_str is specified to avoid strings looking like "value<NA>".

  • terminal_symbols – By default, the last n-1 n-grams in a sequence end on (tuples or string concatenations of) missing values. These rows can either be dropped, or the missing components replaced with a terminal symbol. You can either specify a single value to be used for all consequent components (b, c, …) or a tuple of (n-1) values to obtain different behaviours. In the case of bigrams, there is only one consequent component. For each component to be left untouched, pass None (the default). To drop terminal rows for a component, pass “DROP”. To replace all terminal cells with pd.NA (independent of whether they would be tuples or strings), pass “NA”. To replace them with the default_terminal_symbol, pass “DEFAULT”. Or, pass a string or other Hashable to replace terminals with that string. In the latter two cases, the terminal cells will be tuples of terminal strings if join_str is None, or strings otherwise.

  • drop_identical – Pass True to drop all tuples where all elements are identical.

  • n_gram_column_name – Name of the value_column in the resulting NgramTuples object.

  • context_columns – Columns preceding the bigram columns for context, such as measure numbers etc. Pass True to use the default context columns or one or several column names to subselect.

Returns:

make_ranking_table(group_cols: ~typing.Optional[~typing.Union[~dimcat.data.resources.dc.UnitOfAnalysis, str, ~typing.Iterable[str]]] = UnitOfAnalysis.GROUP, sort_column: ~typing.Optional[~typing.Union[str, ~typing.Tuple[str, ...]]] = None, sort_order: ~typing.Literal[<SortOrder.DESCENDING: 'DESCENDING'>, <SortOrder.ASCENDING: 'ASCENDING'>] = SortOrder.DESCENDING, top_k: ~typing.Optional[int] = None, drop_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None)[source]#

Shortcut for creating the default NgramTuples object and calling make_ranking_table() on it. For more fine-grained control on the n-gram tuples, use make_ngram_tuples() or make_bigram_tuples().

property ngram_levels: List[str]#
plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

class dimcat.data.resources.results.NgramTableFormat(value)[source]#

Bases: FriendlyEnum

The format of the ngram table determining how many columns are copied for each of the n-1 shifts. The original columns are always copied. This setting may have a significant effect on the performance when creating the NgramTable.

AUXILIARY = 'AUXILIARY'#
AUXILIARY_CONTEXT = 'AUXILIARY_CONTEXT'#
CONVENIENCE = 'CONVENIENCE'#
CONVENIENCE_CONTEXT = 'CONVENIENCE_CONTEXT'#
FEATURES = 'FEATURES'#
FEATURES_CONTEXT = 'FEATURES_CONTEXT'#
FULL = 'FULL'#
FULL_WITHOUT_CONTEXT = 'FULL_WITHOUT_CONTEXT'#
class dimcat.data.resources.results.NgramTuples(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Result

Result that has a value_column containing tuples and no dimension_column.

combine_results(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, sort_order: Optional[SortOrder] = SortOrder.DESCENDING) Self[source]#

Convenience method for calling .get_default_analysis().combine_results().

make_ranking_table(group_cols: ~typing.Optional[~typing.Union[~dimcat.data.resources.dc.UnitOfAnalysis, str, ~typing.Iterable[str]]] = UnitOfAnalysis.GROUP, sort_column: ~typing.Optional[~typing.Union[str, ~typing.Tuple[str, ...]]] = None, sort_order: ~typing.Literal[<SortOrder.DESCENDING: 'DESCENDING'>, <SortOrder.ASCENDING: 'ASCENDING'>] = SortOrder.DESCENDING, top_k: ~typing.Optional[int] = None, drop_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None)[source]#

Sorts the values

Parameters:
  • group_cols – Ranking tables for groups will be concatenated side-by-side. Defaults to the default groupby. To fully prevent grouping, pass False or a falsy value except None.

  • sort_column – By which column to rank. Defaults to the dimension_column.

  • sort_order – Defaults to “descending”, i.e., the highest values will be ranked first.

  • top_k – The number of top ranks to retain. Defaults to None, which retains all ranks.

Returns:

plot()[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped()[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

class dimcat.data.resources.results.PhraseData(analyzed_resource: DimcatResource, value_column: Optional[str], dimension_column: Optional[str], formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format: PhraseDataFormat = PhraseDataFormat.LONG, **kwargs)[source]#

Bases: Result

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property format: PhraseDataFormat#
make_bar_plot(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_bubble_plot(df: ~typing.Optional[~dimcat.data.resources.base.D] = None, x_col: ~typing.Optional[str] = None, y_col: ~typing.Optional[str] = None, group_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None, group_modes: ~typing.Optional[~typing.Union[~dimcat.plotting.GroupMode, ~typing.Iterable[~dimcat.plotting.GroupMode]]] = (<GroupMode.ROWS: 'ROWS'>, <GroupMode.COLUMNS: 'COLUMNS'>), normalize: bool = True, dimension_column: ~typing.Optional[str] = None, title: ~typing.Optional[str] = None, labels: ~typing.Optional[dict] = None, hover_data: ~typing.Optional[~typing.List[str]] = None, width: ~typing.Optional[int] = None, height: ~typing.Optional[int] = None, layout: ~typing.Optional[dict] = None, font_size: ~typing.Optional[int] = None, x_axis: ~typing.Optional[dict] = None, y_axis: ~typing.Optional[dict] = None, color_axis: ~typing.Optional[dict] = None, traces_settings: ~typing.Optional[dict] = None, output: ~typing.Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_pie_chart(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_ranking_table(group_cols: ~typing.Optional[~typing.Union[~dimcat.data.resources.dc.UnitOfAnalysis, str, ~typing.Iterable[str]]] = UnitOfAnalysis.GROUP, sort_column: ~typing.Optional[~typing.Union[str, ~typing.Tuple[str, ...]]] = None, sort_order: ~typing.Literal[<SortOrder.DESCENDING: 'DESCENDING'>, <SortOrder.ASCENDING: 'ASCENDING'>] = SortOrder.DESCENDING, top_k: ~typing.Optional[int] = None, drop_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None) D[source]#

Sorts the values

Parameters:
  • group_cols – Ranking tables for groups will be concatenated side-by-side. Defaults to the default groupby. To fully prevent grouping, pass False or a falsy value except None.

  • sort_column – By which column to rank. Defaults to the dimension_column.

  • sort_order – Defaults to “descending”, i.e., the highest values will be ranked first.

  • top_k – The number of top ranks to retain. Defaults to None, which retains all ranks.

Returns:

plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

regroup_phrases(grouping: S, level_names: Tuple[str, str] = ('stage', 'substage')) Self[source]#

Insert a grouping column and replace the last index level with a new primary and secondary index accordingly. The primary level increments at the beginning of each group, the secondary level increments at every row, restarting at the beginning of each group. For example, a grouping [“a”, “a”, “a”, “b”, “c”, “c”] results in the index [(0, 0), (0, 1), (0, 2), (1, 0), (2, 0), (2, 1)].

Parameters:
  • grouping – A Series with the same index as the (raw) phrase_df, containing the grouping criterion. Adjacent equal values are grouped together.

  • level_names – Names of the two index levels.

Returns:

A reindexed copy of the phrase data.

class dimcat.data.resources.results.PhraseDataFormat(value)[source]#

Bases: FriendlyEnum

An enumeration.

LONG = 'LONG'#
WIDE = 'WIDE'#
class dimcat.data.resources.results.PrevalenceMatrix(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: Result

The equivalent to NLP’s “frequency matrix” except that in the case of music, the coefficients are not restricted to represent count frequencies (when created from a Counts object) but can also represent durations (when created from a Durations object).

For naming consistency with the NLP terminology, method names and documentation will refer to rows as documents (which could be segments, pieces, or groups of either), and to the columns as tokens (which could be any feature values such as chords, chord features, pitch classes, etc.).

property absolute: D#

Returns the prevalence matrix as dataframe with missing values filled with zeros.

document_frequencies(relative: bool = False, sort_order: Optional[SortOrder] = SortOrder.DESCENDING, name: str = 'document_frequency') S[source]#

Returns a series containing for each token the number of documents it occurs in. “Documents”, here, means rows of the matrix, whether they correspond to slices, pieces, or groups.

Parameters:
  • relative – By default (False), absolute counts are returned. Pass True to normalize by the number of documents n_documents (number of rows).

  • sort_order – By default (“descending”), the tokens will appear in descending order of their document frequency. Pass “ascending” to reverse the order or None to leave them in the column order of the matrix.

  • name – Name of the returned series. Defaults to “document_frequency”.

Returns:

document_frequency(token: str, relative: bool = False) bool | float[source]#
document_prevalence(name: str = 'document_prevalence') S[source]#
get_culled_matrix(ratio: Optional[float] = None, threshold: Optional[int] = None) CulledPrevalenceMatrix[source]#

Removes all features that do not appear in a minimum number of documents.

Parameters:
  • ratio – Minimum ratio of documents a token must occur in to be retained. The number of documents ratio * D is always rounded up. Ratios > 1 are rounded and interpreted as threshold.

  • threshold – Minimum number of documents a token must occur in to be retained.

get_groupwise_prevalence(column_levels: Union[str, int, Iterable[str | int]] = 0) GroupwisePrevalenceMatrix[source]#

Returns a new PrevalenceMatrix in which each row sums up to 1 for each group of columns (i.e., each row sums up to the number of non-empty groups). Groups are given in the first column level(s).

get_relative_prevalence(fillna: Optional[Number] = 0.0, as_resource: bool = True) RelativePrevalenceMatrix[source]#

Returns a new PrevalenceMatrix in which each row sums up to 1.

inverse_document_frequencies(flavor: InverseDocumentFrequencyFlavor = 'vanilla', log_base: Literal[10, 2, 2.718281828459045, 'e'] = 2, sort_order: Optional[SortOrder] = SortOrder.DESCENDING)[source]#
inverse_document_frequency(token: str, flavor: InverseDocumentFrequencyFlavor.VANILLA, log_base: log_base_ = 2)[source]#
property is_absolute: bool#

Whether the matrix represents absolute prevalences, in contrast to a RelativePrevalenceMatrix, in which each row sums up to 1. An absolute matrix can be converted into a relative matrix but not the other way around.

property is_complete: bool#

Whether the matrix still contains columns for all tokens, i.e., it has not been culled and can be used for computing relative frequencies.

make_bar_plot(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_bubble_plot(df: ~typing.Optional[~dimcat.data.resources.base.D] = None, x_col: ~typing.Optional[str] = None, y_col: ~typing.Optional[str] = None, group_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None, group_modes: ~typing.Optional[~typing.Union[~dimcat.plotting.GroupMode, ~typing.Iterable[~dimcat.plotting.GroupMode]]] = (<GroupMode.ROWS: 'ROWS'>, <GroupMode.COLUMNS: 'COLUMNS'>), normalize: bool = True, dimension_column: ~typing.Optional[str] = None, title: ~typing.Optional[str] = None, labels: ~typing.Optional[dict] = None, hover_data: ~typing.Optional[~typing.List[str]] = None, width: ~typing.Optional[int] = None, height: ~typing.Optional[int] = None, layout: ~typing.Optional[dict] = None, font_size: ~typing.Optional[int] = None, x_axis: ~typing.Optional[dict] = None, y_axis: ~typing.Optional[dict] = None, color_axis: ~typing.Optional[dict] = None, traces_settings: ~typing.Optional[dict] = None, output: ~typing.Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_pie_chart(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_ranking_table(group_cols: ~typing.Optional[~typing.Union[~dimcat.data.resources.dc.UnitOfAnalysis, str, ~typing.Iterable[str]]] = UnitOfAnalysis.GROUP, sort_column: ~typing.Optional[~typing.Union[str, ~typing.Tuple[str, ...]]] = None, sort_order: ~typing.Literal[<SortOrder.DESCENDING: 'DESCENDING'>, <SortOrder.ASCENDING: 'ASCENDING'>] = SortOrder.DESCENDING, top_k: ~typing.Optional[int] = None, drop_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None) D[source]#

Sorts the values

Parameters:
  • group_cols – Ranking tables for groups will be concatenated side-by-side. Defaults to the default groupby. To fully prevent grouping, pass False or a falsy value except None.

  • sort_column – By which column to rank. Defaults to the dimension_column.

  • sort_order – Defaults to “descending”, i.e., the highest values will be ranked first.

  • top_k – The number of top ranks to retain. Defaults to None, which retains all ranks.

Returns:

property n_documents: int#

The number of rows.

property n_types: int#

Overall number of types present in this matrix.

property overall_prevalence: int#

Sums up the prevalence of all tokens in all documents. If prevalence were always measured by counts, this would be called n_tokens.

plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

property relative: D#

Returns the values corresponding to the RelativePrevalenceMatrix as a dataframe. Syntactic sugar for calling get_relative_prevalence() with as_resource=False.

tf_idf(flavor: InverseDocumentFrequencyFlavor = 'vanilla', log_base: Literal[10, 2, 2.718281828459045, 'e'] = 2, sort_order: Optional[SortOrder] = None) D[source]#
property type_count: S#

Returns a series containing for each document the number of distinct tokens it contains.

type_prevalence(name: str = 'type_prevalence') Series[source]#
property z_scores: D#

Standardizes the type prevalences by subtracting the mean and dividing by the standard deviation. As a result, each column has a mean of 0 and a standard deviation of 1. The standardization operates on relative frequencies so that the prevalences are normalized by the length of each document.

class dimcat.data.resources.results.RelativePrevalenceMatrix(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: PrevalenceMatrix

property absolute#

Raises a TypeError for relative matrices.

document_prevalence() S[source]#

Raises a TypeError for relative matrices.

get_culled_matrix(ratio: Optional[float] = None, threshold: Optional[int] = None) CulledRelativePrevalenceMatrix[source]#

Removes all features that do not appear in a minimum number of documents.

Parameters:
  • ratio – Minimum ratio of documents a token must occur in to be retained. The number of documents ratio * D is always rounded up. Ratios > 1 are rounded and interpreted as threshold.

  • threshold – Minimum number of documents a token must occur in to be retained.

property is_absolute: bool#

Whether the matrix represents absolute prevalences, in contrast to a RelativePrevalenceMatrix, in which each row sums up to 1. An absolute matrix can be converted into a relative matrix but not the other way around.

property overall_prevalence: int#

Raises a TypeError for relative matrices.

property relative: D#

Returns the values corresponding to the RelativePrevalenceMatrix as a dataframe. Syntactic sugar for .fillna(0.0).

type_prevalence() S[source]#

Raises a TypeError for relative matrices.

class dimcat.data.resources.results.Result(analyzed_resource: DimcatResource, dimension_column: Optional[str], value_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs)[source]#

Bases: DimcatResource

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property analyzed_resource: DimcatResource#
combine_results(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, sort_order: Optional[SortOrder] = SortOrder.DESCENDING) Self[source]#

Aggregate results for each group, typically by summing up and normalizing the values. By default, the groups correspond to those that had been applied to the analyzed resource. If no Groupers had been applied, the entire dataset is treated as a single group.

compute_entropy(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, weighted: bool = False) S[source]#

Compute the Shannon entropies of the probability distributions for the default or specified grouping.

Parameters:
  • group_cols – For which groups to compute entropy values.

  • weighted – If True, the entropy values will be weighted by the relative prevalence of the respective group. If no grouping is specified, this argument has no effect.

Returns:

A Series of entropy values, indexed by the group names.

dimension_column: Optional[str]#

Name of the column containing some dimension, e.g. to be interpreted as quantity (durations, counts, etc.) or as color.

property feature_columns: List[str]#

The column and, if distinct, the formatted_column, as a list.

property formatted_column: str#

Name of the column containing the formatted values, typically for display on the x_axis.

get_grouping_levels(smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE) List[str][source]#

Returns the levels of the grouping index, i.e., all levels until and including ‘piece’.

is_combination#

Is True if this Result has been created by Result.combine_results(), in which case the method will return df as is (without combining anything).

make_bar_plot(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_bubble_plot(df: ~typing.Optional[~dimcat.data.resources.base.D] = None, x_col: ~typing.Optional[str] = None, y_col: ~typing.Optional[str] = None, group_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None, group_modes: ~typing.Optional[~typing.Union[~dimcat.plotting.GroupMode, ~typing.Iterable[~dimcat.plotting.GroupMode]]] = (<GroupMode.ROWS: 'ROWS'>, <GroupMode.COLUMNS: 'COLUMNS'>), normalize: bool = True, dimension_column: ~typing.Optional[str] = None, title: ~typing.Optional[str] = None, labels: ~typing.Optional[dict] = None, hover_data: ~typing.Optional[~typing.List[str]] = None, width: ~typing.Optional[int] = None, height: ~typing.Optional[int] = None, layout: ~typing.Optional[dict] = None, font_size: ~typing.Optional[int] = None, x_axis: ~typing.Optional[dict] = None, y_axis: ~typing.Optional[dict] = None, color_axis: ~typing.Optional[dict] = None, traces_settings: ~typing.Optional[dict] = None, output: ~typing.Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_pie_chart(df: Optional[D] = None, x_col: Optional[str] = None, y_col: Optional[str] = None, group_cols: Optional[Union[str, Iterable[str]]] = None, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#
Parameters:
  • layout – Keyword arguments passed to fig.update_layout()

  • **kwargs – Keyword arguments passed to the Plotly plotting function.

Returns:

A Plotly Figure object.

make_ranking_table(group_cols: ~typing.Optional[~typing.Union[~dimcat.data.resources.dc.UnitOfAnalysis, str, ~typing.Iterable[str]]] = UnitOfAnalysis.GROUP, sort_column: ~typing.Optional[~typing.Union[str, ~typing.Tuple[str, ...]]] = None, sort_order: ~typing.Literal[<SortOrder.DESCENDING: 'DESCENDING'>, <SortOrder.ASCENDING: 'ASCENDING'>] = SortOrder.DESCENDING, top_k: ~typing.Optional[int] = None, drop_cols: ~typing.Optional[~typing.Union[str, ~typing.Iterable[str]]] = None) D[source]#

Sorts the values

Parameters:
  • group_cols – Ranking tables for groups will be concatenated side-by-side. Defaults to the default groupby. To fully prevent grouping, pass False or a falsy value except None.

  • sort_column – By which column to rank. Defaults to the dimension_column.

  • sort_order – Defaults to “descending”, i.e., the highest values will be ranked first.

  • top_k – The number of top ranks to retain. Defaults to None, which retains all ranks.

Returns:

property metadata: Metadata#

The metadata of the analyzed resource.

plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

property uses_line_of_fifths_colors: bool#

Whether or not the plots produced by this Result exhibit a color gradient along the line of fifths. This is typically the case for results based on intervals, note names, or scale degrees. In these cases, the color dimension is lost for discerning between different groups, which are then typically shown in different rows or columns.

property value_column: str#

Name of the column containing the values, typically to arrange markers along the x_axis.

property x_column: str#

Name of the result column from which to create one marker per distinct value to show over the x-axis.

property y_column: str#

Name of the numerical result column used for determining each marker’s dimension along the y-axis.

class dimcat.data.resources.results.ResultName(value)[source]#

Bases: ObjectEnum

Identifies the available result types.

CadenceCounts = 'CadenceCounts'#
Counts = 'Counts'#
CulledPrevalenceMatrix = 'CulledPrevalenceMatrix'#
CulledRelativePrevalenceMatrix = 'CulledRelativePrevalenceMatrix'#
Durations = 'Durations'#
GroupwisePrevalenceMatrix = 'GroupwisePrevalenceMatrix'#
NgramTable = 'NgramTable'#
NgramTuples = 'NgramTuples'#
PhraseData = 'PhraseData'#
PrevalenceMatrix = 'PrevalenceMatrix'#
RelativePrevalenceMatrix = 'RelativePrevalenceMatrix'#
Result = 'Result'#
Transitions = 'Transitions'#
class dimcat.data.resources.results.TerminalSymbol(value)[source]#

Bases: LowercaseEnum

Used to control arguments for n-gram creation. DEFAULT defines the default terminal symbol. NA replaces each terminal value with pd.NA values (rather than, say, with a tuple of null values). DROP results in terminal n-grams being dropped entirely, that is, those starting with one of the n-1 last n-grams of a sequence.

DEFAULT = '⋉'#
DROP = 'DROP'#
NA = '<NA>'#
class dimcat.data.resources.results.Transitions(analyzed_resource: DimcatResource, feature_columns: List[str, str], value_column: Optional[str] = None, dimension_column: Optional[str] = None, formatted_column: Optional[str] = None, resource: fl.Resource = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[str | list[str]] = None, format=None)[source]#

Bases: Result

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
compute_information_gain(group_cols: Optional[Literal[False]], reverse: bool) float[source]#
compute_information_gain(group_cols: Union[UnitOfAnalysis, str, Iterable[str]], reverse: bool) S

Computes the gain in information about (reduction in entropy of) the consequent from knowing the antecedent. This can be interpreted as a measure of how much we know on average about the consequent given an antecedent.

It is typically explained as the difference between the entropy of the consequents’ frequency distribution and the frequency-weighted sum of entropies of each antecedent’s consequent distribution (which is considered as a ‘split’ in the context of decision trees).

Parameters:
  • group_cols – Defines the groups for which to compute the information gain.

  • reverse – Reverse the argument: How much more do we know about the antecedent when we know the consequent?

Returns:

If group_cols is None or empty or resolves to empty (the default when no groupers have been applied), the resulting value is a float expressing the difference in entropy. Otherwise, when a grouping is performed, the result is a Series of floats.

property feature_columns: List[str]#

The column and, if distinct, the formatted_column, as a list.

get_grouping_levels(smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE) List[str][source]#

Returns the levels of the grouping index, i.e., all levels until and including ‘piece’ or ‘slice’.

make_heatmap(df: Optional[D] = None, max_x: Optional[int] = None, max_y: Optional[int] = None, x_title: Optional[str] = 'consequent', y_title: Optional[str] = 'antecedent', facet_row: Optional[str] = None, facet_col: Optional[str] = None, column_colorscales: Optional[Union[List[str], Dict[str, str]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs)[source]#
plot(title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(group_cols: Optional[Union[UnitOfAnalysis, str, Iterable[str]]] = UnitOfAnalysis.GROUP, group_modes: Optional[Union[GroupMode, Iterable[GroupMode]]] = None, title: Optional[str] = None, labels: Optional[dict] = None, hover_data: Optional[List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None, **kwargs) Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

property x_column: str#

Name of the result column from which to create one marker per distinct value to show over the x-axis.

dimcat.data.resources.results.compute_entropy_of_observations(observations: Iterable[Any], base: Literal[10, 2, 2.718281828459045, 'e'] = 2) float[source]#

Compute the Shannon entropy of an array of observations by counting the values.

dimcat.data.resources.results.compute_entropy_of_occurrences(occurrences: Iterable[int], base: Literal[10, 2, 2.718281828459045, 'e'] = 2) float[source]#

Compute the Shannon entropy of the given absolute frequencies where each integer represents the number of observed occurrences of a category.

dimcat.data.resources.results.compute_entropy_of_probabilities(probabilities: Union[Iterable[float], Iterable[int]], base: Literal[10, 2, 2.718281828459045, 'e'] = 2, skip_check: bool = False) float[source]#

Compute the Shannon entropy of the given probability distribution, which is expected to be normalized.

Parameters:
  • probabilities

  • base – Logarithmic base for computing the entropy.

  • skip_check – If False (default) the probabilities are asserted to sum to 1. Pass True when you have normalized the data yourself or when you’re passing occurrences rather than probabilities.

Returns:

The absolute or normalized Shannon entropy of the given probability distribution.

dimcat.data.resources.results.logarithm_function(base: Literal[10, 2, 2.718281828459045, 'e'] = 2, numpy=False) Callable[source]#
dimcat.data.resources.results.make_heatmaps_from_transitions(transitions_df: D, max_x: Optional[int] = None, max_y: Optional[int] = None, x_title: Optional[str] = 'consequent', y_title: Optional[str] = 'antecedent', facet_col: Optional[str] = None, facet_row: Optional[str] = None, column_colorscales: Optional[Union[List[str], Dict[str, str]]] = None, title: Optional[str] = None, height: Optional[int] = None, width: Optional[int] = None, layout: Optional[dict] = None, font_size: Optional[int] = None, x_axis: Optional[dict] = None, y_axis: Optional[dict] = None, color_axis: Optional[dict] = None, traces_settings: Optional[dict] = None, output: Optional[str] = None) Figure[source]#
dimcat.data.resources.results.prepare_transitions(df: D, max_x: Optional[int] = None, max_y: Optional[int] = None) Tuple[D, D, D][source]#

Turns transitions that come in long format into wide format (transition matrix), optionally subselecting the first n columns (max_x) and rows (max_y). Transitions are expected to be sorted, have the consequents (the new columns) in the last (right-most) index level, and come with the columns “count”, “proportion” and “proportion_%”.

dimcat.data.resources.results.turn_proportions_into_percentage_strings(df: pandas.core.frame.DataFrame | pandas.core.series.Series, column_name: str = 'proportion_%') pandas.core.frame.DataFrame | pandas.core.series.Series[source]#

Interprets the Series or all columns of the DataFrame as proportions, multiplies them by 100 and turns them into strings with a % sign.

dimcat.data.resources.utils module#

dimcat.data.resources.utils.align_with_grouping(df: pd.DataFrame, grouping: DimcatIndex | pd.MultiIndex, sort_index: bool = True) pd.DataFrame[source]#

Aligns a dataframe with a grouping index that has n levels such that the index levels of the new dataframe start with the n levels of the grouping index and are followed by the remaining levels of the original dataframe. This is typically used to align a dataframe with feature information for many pieces with an index grouping piece names.

dimcat.data.resources.utils.append_index_levels(old_index: IX, *new_level: IX | S | D, drop_levels: Optional[Literal[False], str | int | Iterable[str | int]] = None) IX[source]#

Replace index levels by optionally dropping an arbitrary number and concatenating the new level(s) to the right.

dimcat.data.resources.utils.apply_playthrough(feature_df: D, playthrough: Playthrough, logger: Optional[logging.Logger] = None) D[source]#

Transform a dataframe based on the resource’s playthrough setting.

dimcat.data.resources.utils.apply_slice_intervals_to_resource_df(df: DataFrame, slice_intervals: MultiIndex, qstamp_column_name: str = 'quarterbeats', duration_column_name: str = 'duration_qb', logger: Optional[Logger] = None) DataFrame[source]#
dimcat.data.resources.utils.boolean_is_minor_column_to_mode(S: Series) Series[source]#
dimcat.data.resources.utils.check_configs_against_allowed_configs(configs: DimcatConfig | Iterable[DimcatConfig], allowed_configs: Optional[FeatureSpecs | Iterable[FeatureSpecs]], allow_subclasses: bool = True) None[source]#

Matches configs against allowed configs and raises as soon as any pair does not match. Two configs match if they have the same dtype and any overlapping key has the same value.

Parameters:
  • configs – Config(s) to be checked.

  • allowed_configs – The function raises if any of the configs does not match with any of these.

  • allow_subclasses – If True (default), configs dtypes are allowed to be subclasses of the allowed_configs dtypes.

Raises:

ResourceNotProcessableError when any of the configs doesn't match with any of the allowed configs.

dimcat.data.resources.utils.check_qstamp_columns(df: D, qstamp_column_name: str, duration_column_name: str, logger: Optional[Logger] = None) None[source]#
dimcat.data.resources.utils.condense_dataframe_by_groups(df: DataFrame, group_keys_series: Series, logger: Optional[Logger] = None)[source]#

Based on the given group_keys_series, drop all rows but the first of each group and adapt the column ‘duration_qb’ accordingly.

Parameters:
  • df – DataFrame to be reduced, expected to contain the column duration_qb. In order to use the result as a segmentation, it should have a pandas.IntervalIndex.

  • group_keys_series – Series with the same index as df that contains the group keys. If it contains NA values, the

Returns:

Reduced DataFrame with updated ‘duration_qb’ column and pandas.IntervalIndex on the first level (if present).

dimcat.data.resources.utils.condense_pedal_points(df)[source]#

Condenses pedal points into single rows. The duration of the pedal point is summed up and the chord is replaced by the pedal

dimcat.data.resources.utils.drop_duplicated_ultima_rows(phrase_annotations_df: D) D[source]#

Used by the PhraseDataAnalyzer to drop the last row of each phrase’s body component when drop_duplicated_ultima_rows is True.

dimcat.data.resources.utils.drop_rows_with_missing_values(df: D, column_names: List[str], how: Literal['any', 'all'] = 'any', logger: Optional[Logger] = None) D[source]#

Drop rows with missing values in the specified columns. If nothing is to be dropped, the identical dataframe is returned, not a copy.

dimcat.data.resources.utils.ensure_level_named_piece(index: MultiIndex, recognized_piece_columns: Optional[Iterable[str]] = None) Tuple[MultiIndex, int][source]#
Ensures that the index has a level named “piece” by detecting alternative level names and renaming it in case it doesn’t have one. Returns the index and the position of the piece level.

Parameters:
  • index – MultiIndex.

  • recognized_piece_columns – Defaults to (“pieces”, “fname”, “fnames”). If other names are to be recognized as “piece” level, pass those.

Returns:

The same index or a copy with a renamed level. The position of the piece level.

dimcat.data.resources.utils.feature_specs2config(feature: FeatureSpecs) DimcatConfig[source]#

Converts a feature specification to a DimcatConfig.

Raises:

TypeError – If the specs cannot be resolved to a DimcatConfig that describes a Feature.

dimcat.data.resources.utils.features_argument2config_list(features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None, allowed_configs: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None) List[DimcatConfig][source]#
dimcat.data.resources.utils.fl_fields2pandas_params(fields: List[Field]) Tuple[dict, dict, list][source]#

Convert frictionless Fields to pd.read_csv() parameters ‘dtype’, ‘converters’ and ‘parse_dates’.

dimcat.data.resources.utils.get_corpus_display_name(repo_name: str) str[source]#

Looks up a repository name in the CORPUS_NAMES constant. If not present, the repo name is returned as title case.

dimcat.data.resources.utils.get_existing_normpath(fl_resource) str[source]#

Get the normpath of a frictionless resource, raising an exception if it does not exist.

Parameters:

fl_resource – The frictionless resource. If its basepath is not specified, the filepath is tried relative to the current working directory.

Returns:

The absolute path of the frictionless resource.

Raises:
dimcat.data.resources.utils.get_time_spans_from_resource_df(df: DataFrame, qstamp_column_name: str, duration_column_name: str, round: Optional[int], to_float: bool, dropna: bool, return_df: Literal[False], logger: Optional[Logger]) DataFrame[source]#
dimcat.data.resources.utils.get_time_spans_from_resource_df(df: DataFrame, qstamp_column_name: str, duration_column_name: str, round: Optional[int], to_float: bool, dropna: bool, return_df: Literal[True], logger: Optional[Logger]) Tuple[DataFrame, DataFrame]

Returns a dataframe with start (‘left’) and end (‘right’) positions of the events represented by this resource’s rows.

Parameters:
  • df

  • qstamp_column_name – Column from which to retrieve start positions.

  • duration_column_name – Column from which to retrieve durations to be added to the start positions.

  • round – To how many decimal places to round the intervals’ boundary values. Setting a value automatically sets to_float=True.

  • to_float – By default (True), the returned time span values are floats. Set False to leave values as they are after adding the columns, e.g. as fractions. If round is specified, however, this has no effect since the values are rounded to floats anyway.

  • dropna – By default (False), rows with missing values are ignored and the result will include missing values for them. Pass True to drop rows with missing values. In this case you may also want to set return_df=True.

  • return_df – Pass True if you want to return the original dataframe as well, especially when dropna=True.

  • logger

Returns:

A dataframe with columns start and end. If return_df=True, the input dataframe is returned as used for computing the time spans.

dimcat.data.resources.utils.infer_piece_col_position(column_name: List[str], recognized_piece_columns: Optional[Iterable[str]] = None) Optional[int][source]#

Infer the position of the piece column in a list of column names.

dimcat.data.resources.utils.infer_schema_from_df(df: SomeDataframe, include_index_levels: bool = True, allow_integer_names: bool = True, **kwargs) fl.Schema[source]#

Infer a frictionless.Schema from a dataframe.

This function partially copies ms3.utils.frictionless_helpers.get_schema().

Parameters:
  • df

  • include_index_levels – If False, the index levels are not described, assuming that they will not be written to disk (otherwise, validation error). By default (True), all index levels are added to the described columns and, in addition, made the primaryKey (which, in frictionless, implies the constraints “required” & “unique”).

  • **kwargs – Arbitrary key-value pairs that will be added to the frictionless schema descriptor as “custom” metadata.

Returns:

dimcat.data.resources.utils.insert_index_level(old_index: IX, new_level: Union[IX, S, D], position: int) IX[source]#

Inserts the new level(s) into the given index at the specified position.

dimcat.data.resources.utils.join_df_on_index(df: pd.DataFrame, index: DimcatIndex | pd.MultiIndex, how: Literal['left', 'right', 'inner', 'outer', 'cross'] = 'inner') pd.DataFrame[source]#
dimcat.data.resources.utils.load_fl_resource(fl_resource: fl.Resource, normpath: Optional[str] = None, index_col: Optional[int | str | Iterable[int | str]] = None, usecols: Optional[int | str | Iterable[int | str]] = None) SomeDataframe[source]#

Load a dataframe from a frictionless.Resource.

Parameters:
  • fl_resource – The resource whose normpath points to a file on the local file system.

  • normpath – If not specified, the normpath of the resource is used, which is not always reliable because its own basepath property is half-heartedly maintained.

  • index_col – Column(s) to be used as index levels, overriding the primary key specified in the resource’s schema.

  • usecols – If only a subset of the specified fields is to be loaded, the names or positions of the subset.

Returns:

The dataframe, loaded with the dtypes resulting from converting the schema fields via fl_fields2pandas_params().

dimcat.data.resources.utils.load_index_from_fl_resource(fl_resource: fl.Resource, index_col: Optional[int | str | List[int | str]] = None, recognized_piece_columns: Iterable[str] = ('piece', 'pieces', 'fname', 'fnames')) SomeIndex[source]#

Load the index columns from a frictionless Resource.

Parameters:
  • fl_resource – The frictionless Resource to load the index columns from.

  • index_col – The column(s) to use as index. If None, the primary key of the schema is used if it exists.

  • recognized_piece_columns – If the loaded columns do not include ‘piece’ but one of the names specified here, the first column name of the iterable that is detected in the loaded columns will be renamed to ‘piece’. Likewise, such a column would be used (and renamed) if index_col is not specified and the schema does not specify a primary key: in that case, the detected column and all columns left of it will be used as index_col argument.

Returns:

The specified or inferred index column(s) as a (Multi)Index object.

Raises:
  • FileNotFoundError – If the normpath of the resource does not exist.

  • ValueError – If the resource doesn’t yield a normpath or the index columns cannot be inferred from it based on the schema.

dimcat.data.resources.utils.make_adjacency_groups(S: Series, groupby=None, logger: Optional[Logger] = None) Tuple[Series, Dict[int, Any]][source]#

Turns a Series into a Series of ascending integers starting from 1 that reflect groups of successive equal values.

This is a simplified variant of ms3.adjacency_groups()

Parameters:
  • S – Series in which to group identical adjacent values with each other.

  • groupby – If not None, the resulting grouper will start new adjacency groups according to this groupby. This is a way, for example, to ensure no group overlaps piece boundaries even if there are adjacent identical values.

Returns:

A series with increasing integers that can be used for grouping. A dictionary mapping the integers to the grouped values.

dimcat.data.resources.utils.make_adjacency_mask(S: Series) Series[source]#

Turns a Series into a Boolean Series that is True for the first value of each group of successive equal values.

dimcat.data.resources.utils.make_boolean_mask_from_set_of_tuples(index: DimcatIndex | pd.MultiIndex, tuples: Set[tuple], levels: Optional[Iterable[int]] = None) pd.Index[bool][source]#

Returns a boolean mask for the given tuples based on index tuples formed from integer positions of the index levels to subselect.

Parameters:
  • index – Index (of the dataframe) you want to subselect from using the returned boolean mask.

  • tuples

  • levels

    • If None, the first n levels of the index are used, where n is the length of the selection tuples.

    • If an iterable of level name strings or level position integers, they are used to create for each row a tuple to compare against the selected tuples.

Returns:

A boolean mask of the same length as the index, where True indicates that the corresponding index tuple is contained in the selection tuples.

Raises:
  • TypeError – If tuples is not a set.

  • ValueError – If tuples is empty.

  • ValueError – If the index has fewer levels than the selection tuples.

  • ValueError – If levels is not None and has a different length than the selection tuples.

dimcat.data.resources.utils.make_frictionless_schema_descriptor(column_names: Iterable[str], primary_key: Optional[Iterable[str]] = None, **custom_data) dict[source]#

Creates a frictionless schema descriptor from a list of column names and a primary key.

This function is a duplicate of ms3.utils.frictionless_helpers.make_frictionless_schema_descriptor() and the translation of column names into frictionless fields (with type and description) falls back to ms3.utils.frictionless_helpers.column_name2frictionless_field().

Parameters:
  • column_names

  • primary_key

  • **custom_data

Returns:

dimcat.data.resources.utils.make_group_start_mask(df: D, groupby) ndarray[Any, dtype[bool]][source]#

Returns a boolean mask where the beginning of each group is marked with True. This is useful only when the groups already came in groups within the dataframe in the first place.

dimcat.data.resources.utils.make_groups_lasts_mask(feature_df: Union[D, S], groupby=None) ndarray[Any, dtype[bool]][source]#

Returns a boolean mask where each row that comes last in one of the groups is marked as True. This is useful only when the groups already came in groups within the dataframe in the first place. Instead of a dataframe with groupby columns you may also pass a Series with None.

dimcat.data.resources.utils.make_groupwise_range_index_from_groups(idx: Index) ndarray[Any, dtype[int]][source]#

Turns adjacency groups into integer ranges starting from 0.

dimcat.data.resources.utils.make_index_from_grouping_dict(grouping: Dict[str, Iterable[tuple]], level_names=('group_name', 'corpus', 'piece'), sort=False, raise_if_multiple_membership: bool = False) MultiIndex[source]#

Creates a MultiIndex from a dictionary with grouped tuples.

Parameters:
  • grouping – A dictionary where keys are group names and values are lists of index tuples.

  • level_names – Names for the levels of the MultiIndex, i.e. one for the group level and one per level in the tuples.

  • sort – By default (False), the returned MultiIndex is not sorted. Pass True to sort it.

  • raise_if_multiple_membership – If True, raises a ValueError if a member is in multiple groups.

Returns:

A MultiIndex with the given names and the tuples from the grouping dictionary.

dimcat.data.resources.utils.make_multiindex_for_unstack(idx: Index, level_name: str = 'i') MultiIndex[source]#

Turns an index that contains adjacency groups (adjacent entries having the same value) into a 2-level MultiIndex where the new level represents an individual integer range for each group, starting at 0.

dimcat.data.resources.utils.make_phrase_start_mask(df) ndarray[Any, dtype[bool]][source]#

Based on the “phrase_id” index level, make a mask that is True for the first row of each phrase.

dimcat.data.resources.utils.make_range_index_from_boolean_mask(inner_start_mask: ndarray[Any, dtype[bool]], outer_start_mask: Optional[ndarray[Any, dtype[bool]]] = None) ndarray[Any, dtype[int]][source]#

Creates an index with the same length as the given boolean mask, that restarts counting from every True entry. The behaviour changes depending on whether outer_start_mask is given or not. That’s how the function is used by PhraseData._regroup_phrases() to create both the inner and the outer index level. The function is indifferent to the value of the first entry in the mask(s).

The algorithm builds on Warren Weckesser’s approach via https://stackoverflow.com/a/20033438

Parameters:
  • inner_start_mask

  • outer_start_mask

Returns:

dimcat.data.resources.utils.make_regrouped_stage_index(df: D, grouping: S, level_names: Tuple[str, str] = ('stage', 'substage')) D[source]#

Returns a dataframe that corresponds to the two new (stage) index levels that regroup_phrase_stages() incorporates.

dimcat.data.resources.utils.make_tsv_resource(name: Optional[str] = None) Resource[source]#

Returns a frictionless.Resource with the default properties of a TSV file stored to disk.

dimcat.data.resources.utils.merge_columns_into_one(df: D, join_str: Optional[Union[str, bool]] = None, fillna: Optional[Hashable] = None) S[source]#

Merge all columns of a dataframe into a single column.

Parameters:
  • df – Dataframe to reduce.

  • join_str

    By default (None), the resulting columns contain tuples. If you want them to contain strings, you may pass

    • True to concatenate the tuple values for a given n-gram component separated by “, “ – yielding strings that look like tuples without parentheses

    • False to concatenate without any string in-between the values

    • a string to be used as the separator between the tuple values.

  • fillna – Pass a value to replace all missing values with it.

Returns:

A series containing tuples or strings.

dimcat.data.resources.utils.merge_ties(df: D, return_dropped: bool = False, perform_checks: bool = True, logger: Optional[Logger] = None)[source]#

In a note list, merge tied notes to single events with accumulated durations. Input dataframe needs columns [‘duration’, ‘tied’, ‘midi’, ‘staff’]. This function does not correctly handle overlapping ties on the same pitch since it doesn’t take into account the notational layers (‘voice’).

Copied from ms3, to be developed further.

Parameters:
  • df

  • return_dropped

  • perform_checks

  • logger

Returns:

dimcat.data.resources.utils.nan_eq(a, b)[source]#

Returns True if a and b are equal or both null. Works on two Series or two elements.

dimcat.data.resources.utils.overlapping_chunk_per_interval_cutoff_direct(df: DataFrame, lefts: ndarray[Any, dtype[_ScalarType_co]], rights: ndarray[Any, dtype[_ScalarType_co]], intervals: IntervalIndex, qstamp_column_name: str = 'quarterbeats', duration_column_name: str = 'duration_qb', logger: Optional[Logger] = None) DataFrame[source]#

The heart of a slicing operation, which returns a dataframe that corresponds to the input dataframe sliced by the intervals present in the intervals pandas.IntervalIndex, which will be included as the first index level of the result dataframe.

Parameters:
  • df – DataFrame to be sliced.

  • lefts – Same-length array expressing the start point of every row.

  • rights – Same-length array expressing the end point (exclusive) of every row.

  • qstamp_column_name – Name of the column in which qstamp (offset from the timeline’s origin) is to be found.

  • duration_column_name – Name of the column in the chunk dfs where the new event durations will be stored as floats. Defaults to “duration_qb”, resulting in the existing values being updated.

  • intervals – The pairs are interpreted as left-closed, right-open intervals that demarcate the boundaries of the returned DataFrame chunks. These intervals are assumed to be non-overlapping and monotonically increasing, which allows us to speed up this expensive operation.

Returns:

Concatenation of the dataframe chunks corresponding to each of the given intervals. The first index level of the resulting dataframe is a pandas.IntervalIndex which corresponds to the intervals.

dimcat.data.resources.utils.regroup_phrase_stages(df: D, grouping: S, level_names: Tuple[str, str] = ('stage', 'substage'))[source]#

Insert a grouping column and replace the last index level with a new primary and secondary index accordingly. The primary level increments at the beginning of each group, the secondary level increments at every row, restarting at the beginning of each group. For example, a grouping [“a”, “a”, “a”, “b”, “c”, “c”] results in the index [(0, 0), (0, 1), (0, 2), (1, 0), (2, 0), (2, 1)].

Parameters:
  • grouping – A Series with the same index as the (raw) phrase_df, containing the grouping criterion. Adjacent equal values are grouped together.

  • level_names – Names of the two index levels.

Returns:

A reindexed copy of the phrase data.

dimcat.data.resources.utils.resolve_columns_argument(columns: Optional[Union[str, int, Iterable[str | int]]], column_names: List[str]) Optional[List[str]][source]#

Resolve the columns argument of a load function to a list of column names.

Parameters:
  • columns – A list of integer positions and/or column names. Can be mixed but integers will always be interpreted as positions.

  • column_names – List of column names to choose from.

Returns:

The resolved list of column names. None if columns is None.

Raises:

ValueError – If columns contains duplicate column names.

dimcat.data.resources.utils.resolve_join_str_argument(join_str: Optional[Union[bool, str, Tuple[bool | str, ...]]]) Optional[str][source]#

Helper function that resolves a join_str argument to a string or None by replacing boolean values with the defaults “, “ for True and “” for False.

dimcat.data.resources.utils.resolve_levels_argument(levels: Optional[Union[str, int, Iterable[str | int]]], level_names: List[str], inverse: bool = False) Optional[Tuple[int]][source]#

Turns a selection of index levels into a list of positive level positions.

dimcat.data.resources.utils.resolve_recognized_piece_columns_argument(recognized_piece_columns: Optional[Iterable[str]] = None) List[str][source]#

Resolve the recognized_piece_columns argument by replacing None with the default value.

dimcat.data.resources.utils.safe_row_tuple(row: Iterable[str]) Union[str, Literal[<NA>]][source]#

Join the given strings together separated by ‘, ‘ but catch TypeErrors by returning pd.NA instead.

dimcat.data.resources.utils.store_json(data: dict, filepath: str, indent: int = 2, make_dirs: bool = True, **kwargs)[source]#

Serialize object to file.

Parameters:
  • data – Nested structure of dicts and lists.

  • filepath – Path to the text file to (over)write.

  • indent – Prettify the JSON layout. Default indentation: 2 spaces

  • make_dirs – If True (default), create the directory if it does not exist.

  • **kwargs – Keyword arguments passed to json.dumps().

dimcat.data.resources.utils.str2inttuple(s)[source]#

Non-strict version of ms3.str2inttuple() which does not fail on non-integer values.

dimcat.data.resources.utils.str2pd_interval(s: str) Interval[source]#

Function produces only left-closed, right-open intervals.

dimcat.data.resources.utils.subselect_multiindex_from_df(df: D, tuples: DimcatIndex | Iterable[tuple], levels: Optional[int | str | List[int | str]] = None) pd.DataFrame[source]#

Returns a copy of a subselection of the dataframe based on the intersection of its index tuples (or subtuples) and the given tuples.

Parameters:
  • df – Dataframe of which to return a subset of rows.

  • tuples – Tuples to match against df’s MultiIndex. Can be a MultiIndex because set(tuples) works on that, too.

  • levels

    • If None, the first n levels of the index are used, where n is the length of the selection tuples.

    • If an iterable of level name strings or level position integers, they are used to create for each row a tuple to compare against the selected tuples.

Returns:

dimcat.data.resources.utils.transform_phrase_data(phrase_df, columns: Union[str, List[str]] = 'chord', components: Union[Literal['ante', 'body', 'codetta', 'post'], List[Literal['ante', 'body', 'codetta', 'post']]] = 'body', drop_levels: Union[bool, int, str, Iterable[str | int]] = False, reverse: bool = False, level_name: str = 'i')[source]#

Returns a dataframe containing the requested phrase components and harmony columns.

Parameters:
  • phrase_df – PhraseAnnotations dataframe.

  • columns – Column(s) to include in the result.

  • components – Which of the four phrase components to include, ∈ {‘ante’, ‘body’, ‘codetta’, ‘post’}.

  • drop_levels – Can be a boolean or any level specifier accepted by pandas.MultiIndex.droplevel(). If False (default), all levels are retained. If True, only the phrase_id level and the level_name are retained. In all other cases, the indicated (string or integer) value(s) must be valid and cause one of the index levels to be dropped. level_name cannot be dropped. Dropping ‘phrase_id’ will likely lead to an exception if a PhraseData object is later displayed in WIDE format.

  • reverse – Pass True to reverse the order of harmonies so that each phrase’s last label comes first.

  • level_name – Defaults to ‘i’, which is the name of the original level that will be replaced by this new one. The new one represents the individual integer range for each phrase, starting at 0.

Returns:

Dataframe representing partial information on the selected phrases.

dimcat.data.resources.utils.transpose_notes_to_c(notes: D) D[source]#

Transpose the columns ‘tpc’ and ‘midi’ in a way that they reflect the local key as if it were C major/minor. This operation is typically required for creating pitch class profiles. Uses: ms3.transform(), ms3.name2fifths(), ms3.roman_numeral2fifths()

Parameters:

notes – DataFrame that has at least the columns [‘globalkey’, ‘localkey’, ‘tpc’, ‘midi’].

Returns:

A new dataframe with the columns ‘local_tonic_name’, ‘fifths_over_local_tonic’, and ‘midi_in_c’ where the latter two correspond to the original columns ‘tpc’ and ‘midi’ but transposed in such a way that fifths_over_local_tonic == 0 and midi_in_c % 12 == 0 for all pitches that match the local tonic. E.g. for the local key A major/minor, each pitch A will have fifths_over_local_tonic == 0 and midi_in_c % 12 == 0.

dimcat.data.resources.utils.tuple2str(tup: tuple, join_str: Optional[str] = ', ', recursive: bool = True, keep_parentheses: bool = False) str[source]#

Used for turning n-gram components into strings, e.g. for display on plot axes.

Parameters:
  • tup – Tuple to be returned as string.

  • join_str – String to be interspersed between tuple elements. If None, result is str(tup) and recursive is ignored.

  • recursive – If True (default) tuple elements that are tuples themselves will be joined together recursively, using the same join_str (except when it’s None). Inner tuples always keep their parentheses.

  • keep_parentheses – If False (default), the outer parentheses are removed. Pass True to keep them in the string.

Returns:

A string representing the tuple.

dimcat.data.resources.utils.update_duration_qb(df: D, update_mask: Optional[ndarray[Any, dtype[bool]]] = None, logger: Optional[Logger] = None) None[source]#

Replaces the ‘duration_qb’ column in the given DataFrame with a new one that updates the values by subtracting subsequent ‘quarterbeats’ values. If update_mask is specified, only values for which the mask is True are updated. Otherwise, all values are updated.

dimcat.data.resources.utils.value2bool(value: str | float | int | bool) bool | str | float | int[source]#

Identical to ms3.value2bool().

Module contents#