dimcat.data.resources package#

Submodules#

dimcat.data.resources.base module#

class dimcat.data.resources.base.FeatureName(value)[source]#

Bases: ObjectEnum

An enumeration.

Annotations = 'Annotations'#
Articulation = 'Articulation'#
BassNotes = 'BassNotes'#
CadenceLabels = 'CadenceLabels'#
DcmlAnnotations = 'DcmlAnnotations'#
Feature = 'Feature'#
HarmonyLabels = 'HarmonyLabels'#
KeyAnnotations = 'KeyAnnotations'#
Measures = 'Measures'#
Metadata = 'Metadata'#
Notes = 'Notes'#
PhraseAnnotations = 'PhraseAnnotations'#
PhraseComponents = 'PhraseComponents'#
PhraseLabels = 'PhraseLabels'#
class dimcat.data.resources.base.PathResource(resource: Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Bases: Resource

A resource that does not load frictionless descriptors or warns about them as Resource would.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, treating it just as a path even if it’s a JSON/YAML resource descriptor.

class dimcat.data.resources.base.Resource(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Bases: Data

A Resource is essentially a wrapper around a frictionless.Resource object. Initializing a Resource object from a descriptor dispatches to the appropriate subclass, depending on the specified dtype or, if absent, to a DimcatResource for tabular data and to a PathResource for any other.

property ID: Tuple[str, str]#

The resource’s unique ID.

class PickleSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: ResourceSchema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
squash_data_for_frictionless(data, **kwargs)[source]#
unknown: types.UnknownOption#
class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: ResourceSchema, Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
property basepath: str#
copy() Self[source]#

Returns a copy of the resource.

copy_to_new_location(basepath: str, overwrite: bool = False, filepath: Optional[str] = None, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None) Self[source]#
property corpus_name: Optional[str]#

The name of the corpus this resource belongs to.

property descriptor_exists: bool#
property descriptor_filename: Optional[str]#

The path to the descriptor file on disk, relative to the basepath. If you need to fall back to a default value, use get_descriptor_filename() instead.

detach_from_basepath()[source]#
detach_from_descriptor()[source]#
detach_from_filepath()[source]#
property filepath: str#
classmethod from_descriptor(descriptor: dict | frictionless.resource.resource.Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor dictionary.

Parameters:
  • descriptor – Descriptor corresponding to a frictionless resource descriptor.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Where the file would be serialized and, important for an existing resource, the path against which the descriptor’s ‘filepath’ property can be resolved.

  • **kwargs – Subclasses can use this method.

Raises:
  • TypeError – If the descriptor is a string or a Path, not a dictionary or a frictionless Resource.

  • ResourceDescriptorHasWrongTypeError – If the descriptor belongs to a type that is not a subclass of the Resource class to be initialized.

Returns:

classmethod from_descriptor_path(descriptor_path: str, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor file on disk.

Parameters:
  • descriptor_path – Absolute path where the JSON/YAML descriptor is located.

  • basepath – If you do not want the folder where the descriptor is located to be treated as basepath, you may specify an absolute path higher up within the descriptor_path to serve as base. The resource’s filepath will be adapted accordingly, whereas the resource names specified in the descriptor will remain the same.

  • **kwargs – Subclasses can use this method.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

classmethod from_resource(resource: Resource, descriptor_filename: Optional[str] = None, resource_name: Optional[str] = None, basepath: Optional[str] = None, **kwargs)[source]#

Create a Resource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Lets you change the basepath of the existing resource.

  • **kwargs – Subclasses can use this method.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a Resource from a file on disk, treating it just as a path even if it’s a JSON/YAML resource descriptor.

get_corpus_name() str[source]#

Returns the value of corpus_name or, if not set, a name derived from the resource’s filepath.

Raises:

ValueError – If neither corpus_name nor filepath are set.

get_descriptor_filename(set_default_if_missing: bool = False) str[source]#

Like descriptor_filename but returning a default value if None. If set_default_if_missing is set to True and no basepath has been set (e.g. during initialization), the basepath is permanently set to the default basepath.

get_descriptor_path(set_default_if_missing=False) Optional[str][source]#

Returns the path to the descriptor file. If basepath or descriptor_filename are not set, they are set permanently to their defaults. If create_if_missing is set to True, the descriptor file is created if it does not exist yet.

get_filepath(set_default_if_missing=False) str[source]#

Returns the relative path to the data (filepath) if specified, innerpath otherwise.

get_innerpath(set_default_if_missing: bool = False) Optional[str][source]#

Returns the path to the resource file within a ZIP file.

get_path_dict() Dict[str, str][source]#

Returns a dictionary with the paths to the resource’s data and descriptor.

get_resource_name(set_default_if_missing=False) str[source]#
property innerpath: Optional[str]#

The innerpath is the resource’s filepath within a zip file.

property is_empty: bool#
property is_frozen: bool#

Whether the resource is frozen (i.e. it’s pointing to data on the disk) or not.

property is_loaded: bool#
property is_packaged: bool#

Returns True if the resource is packaged, i.e. its descriptor_filename is the one of the Package that the resource is a part of. Also means that the resource is passive.

property is_serialized: bool#

Returns True if the resource is serialized, i.e. it points to a file on disk and, if it is a ZIP file, the innerpath is present in that ZIP file.

property is_valid: bool#

Returns the result of a previous validation or, if the resource has not been validated before, do it now.

property is_zipped: bool#

Returns True if the filepath points to a .zip file.

make_descriptor() dict[source]#

Returns a frictionless descriptor for the resource.

property normpath: str#

Absolute path to the serialized or future tabular file. Raises if basepath is not set.

property resource: Resource#
property resource_exists: bool#

Returns True if the resource’s normpath exists on disk. If the resource is_zipped and you want to check if the innerpath actually exists within the ZIP file, use is_serialized instead.

property resource_name: str#
set_basepath(basepath: str, reconcile: bool = False) None[source]#
set_descriptor_filename(descriptor_filename: str) None[source]#
Parameters:

descriptor_filename

Raises:

ValueError – If the descriptor_filename is not a simple filename.

property status: ResourceStatus#
store_descriptor(descriptor_path: Optional[str] = None, overwrite=True) str[source]#

Stores the frictionless descriptor to disk based on the resource’s configuration and returns its path. Does not modify the resource’s status.

Returns:

The path to the descriptor file on disk. If None, the default is used.

Raises:
  • ResourceIsPackagedError – If the resource is packaged, this method refuses to store a resource descriptor because that would potentially update path information managed by the package.

  • InvalidResourcePathError – If the resource’s path does not point to an existing file on disk.

to_dict(pickle: bool = False) Dict[str, Any][source]#

Returns a dictionary representation of the resource and stores its descriptor to disk.

validate(raise_exception: bool = False, only_if_necessary: bool = False) Optional[Report][source]#

Validate the resource against its descriptor.

Parameters:
  • raise_exception – (default False) Pass True to raise if the resource is not valid.

  • only_if_necessary – (default False) Pass True to skip validation if the resource has already been validated or is assumed to be valid because it exists on disk.

Returns:

None if no validation took place (e.g. because resource is empty or only_if_necessary was True). Otherwise, frictionless report resulting from validating the data against the column_schema.

Raises:

FrictionlessException – If the resource is not valid and raise_exception is True.

class dimcat.data.resources.base.ResourceSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

Since Resource objects function partially as a wrapper around a frictionless.Resource object, many properties are serialized by means of the corresponding frictionless descriptor, which is provided by the frictionless library. For example, resource_name uses .resource.name under the hood.

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

get_frictionless_descriptor(obj: Resource) dict[source]#
init_object(data, **kwargs)[source]#

Once the data has been loaded, create the corresponding object.

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
raw(data)[source]#

Functions as ‘deserialize’ method for the Schema field ‘resource’.

unknown: types.UnknownOption#
unsquash_data_if_necessary(data, **kwargs)[source]#

Data serialized with this schema usually has a ‘resource’ field that contains the frictionless descriptor. However, if it has been serialized with the PickleSchema variant, this descriptor has become the top level and all other fields have been squashed into it, effectively flattening the dictionary. This method reverses this flattening, if necessary.

class dimcat.data.resources.base.ResourceStatus(value)[source]#

Bases: IntEnum

Expresses the status of a Resource with respect to it being described, valid, and serialized to disk, with or without its descriptor file. The enum members have increasing integer values starting with EMPTY == 0. Statuses > PATH_ONLY (1) are currently only relevant for DimcatResources. The current status is determined by the boolean state of the first three attributes in the table below:

  • is_serialized: True if the resource can be located physically on disk.

  • descriptor_exists: True if a descriptor file (JSON/YAML) is physically present on disk.

  • is_loaded: True if the resource is currently loaded into memory.

The remaining attributes are derived from the first three and are not used to determine the current status:

  • assumed valid: True if the resource is assumed to be valid, which is the case for all serialized resources.

  • standalone: True if the resource is not part of a package. For “free” (not serialized) resources, it depends on the value of Resource.descriptor_filename (whether it corresponds to a package or resource descriptor).

  • empty: True if the resource is empty, i.e. it does not contain data. A DimcatResource that is PATH_ONLY is considered empty, whereas a Resource/PathResource is not (they only have status 0 or 1).

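| ResourceStatus        | is_serialized | descriptor_exists | is_loaded | assumed valid | standalone | empty |
|-----------------------|---------------|-------------------|-----------|---------------|------------|-------|
| EMPTY                 | False         | ?                 | False     | no            | ?          | yes   |
| PATH_ONLY             | True          | ?                 | False     | no            | ?          | yes   |
| SCHEMA_ONLY           | False         | ?                 | False     | no            | ?          | yes   |
| DATAFRAME             | False         | False             | True      | no            | ?          | no    |
| VALIDATED             | False         | False             | True      | guaranteed    | ?          | no    |
| SERIALIZED            | True          | False             | True      | yes           | yes        | no    |
| STANDALONE_LOADED     | True          | True              | True      | yes           | yes        | no    |
| PACKAGED_LOADED       | True          | True              | True      | yes           | no         | no    |
| STANDALONE_NOT_LOADED | True          | True              | False     | yes           | yes        | no    |
| PACKAGED_NOT_LOADED   | True          | True              | False     | yes           | no         | no    |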

The status of a resource is set at the end of Resource.__init__() by calling Resource._update_status() which, in turn, calls Resource._get_status().

DATAFRAME = 3#
EMPTY = 0#
PACKAGED_LOADED = 7#
PACKAGED_NOT_LOADED = 9#
PATH_ONLY = 1#
SCHEMA_ONLY = 2#
SERIALIZED = 5#
STANDALONE_LOADED = 6#
STANDALONE_NOT_LOADED = 8#
VALIDATED = 4#
dimcat.data.resources.base.reconcile_base_and_file(basepath: Optional[str], filepath: str) Tuple[str, str][source]#
Parameters:
  • basepath

  • filepath

Returns:

The result is a tuple of an absolute basepath and a relative filepath.

dimcat.data.resources.base.resource_specs2resource(resource: Union[Resource, str, Path]) R[source]#

Converts a resource specification to a resource.

Parameters:

resource – A resource specification.

Returns:

A resource.

dimcat.data.resources.dc module#

class dimcat.data.resources.dc.DimcatIndex(index: Optional[IX] = None, basepath: Optional[str] = None)[source]#

Bases: Generic[IX], Data

A wrapper around a pandas.MultiIndex that provides additional functionality such as keeping track of index levels and default groupings.

A MultiIndex essentially is a Sequence of tuples where each tuple identifies a dataframe row and includes one value per index level. Each index level has a name and can be seen as an individual pandas.Index. One important type of DimcatIndex is the PieceIndex which is a unique MultiIndex (that is, each tuple is unique) and where the last (i.e. right-most) level is named piece.

NB: If you want to use the index in a dataframe constructor, use the actual, wrapped index object as in pd.DataFrame(index=dc_index.index).

class PickleSchema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

init_object(data, **kwargs) DimcatIndex[source]#

Once the data has been loaded, create the corresponding object.

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: PickleSchema, Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
copy() Self[source]#
filter(keep_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, drop_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, level: int | str = 0, drop_level: Optional[bool] = None) Self[source]#

Returns a copy of the index with only those items where the given level has wanted values.

Parameters:
  • keep_values – One or several values to keep (dropping the rest). If a value is specified both for keeping and dropping, it is dropped.

  • drop_values – One or several values to drop.

  • level – Which index level to filter on.

  • drop_level – Boolean specifies whether to keep the filtered level or to drop it. The default (None) corresponds to automatic behaviour, where the level is dropped if only one value remains, otherwise kept.

Returns:

A copy of the index with only those items where the given level has wanted values. Depending on drop_level, the filtered level itself may have been removed.

classmethod from_dataframe(df: DataFrame) Self[source]#

Create a DimcatIndex from a dataframe’s index.

classmethod from_grouping(grouping: Dict[Hashable, List[tuple]], level_names: Sequence[str] = ('piece_group', 'corpus', 'piece'), sort: bool = False, raise_if_multiple_membership: bool = False) Self[source]#

Creates a DimcatIndex from a dictionary of piece groups.

Parameters:
  • grouping – A dictionary where keys are group names and values are lists of index tuples.

  • level_names – Names for the levels of the MultiIndex, i.e. one for the group level and one per level in the tuples.

  • sort – By default the returned MultiIndex is not sorted. Set True to enable sorting.

  • raise_if_multiple_membership – If True, raises a ValueError if a member is in multiple groups.

classmethod from_index(index: MultiIndex, **kwargs) Self[source]#

Create a DimcatIndex from a dataframe index.

classmethod from_resource(resource: dimcat.data.resources.dc.DimcatResource | frictionless.resource.resource.Resource, index_col: Optional[Union[int, str, List[Union[int, str]]]] = None) Self[source]#

Create a DimcatIndex from a frictionless Resource.

classmethod from_tuples(tuples: Iterable[tuple], level_names: Sequence[str]) Self[source]#
get_level_values_to_drop(drop_values: Union[str, Number, bool, Iterable[Union[str, Number, bool]]], keep_values: Union[str, Number, bool, Iterable[Union[str, Number, bool]]], level: int | str) Tuple[Set[Hashable], Set[Hashable]][source]#
property index: IX#
property names: List[str]#
property piece_level_position: Optional[int]#

The position of the piece level in the index, or None if the index has no piece level.

sample(n: int) Self[source]#

Return a random sample of n elements.

to_resource(**kwargs) DimcatResource[source]#

Create a DimcatResource from this index.

class dimcat.data.resources.dc.DimcatResource(resource: Optional[Resource] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None)[source]#

Bases: Resource, Generic[D]

Data object wrapping a dataframe. The dataframe’s metadata are stored as a frictionless.Resource, that can be used for serialization and (lazy) deserialization.

Every serialization of a DimcatResource (e.g. to store it as a config) requires that the dataframe was either originally read from disk or, otherwise, that it be stored to disk. The behaviour depends on whether the resource is part of a package or not.

Standalone resource (rare case)#

If the resource is not part of a package, serializing it results in two files on disk:

  • the dataframe stored as <basepath>/<name>.tsv

  • the frictionless descriptor <basepath>/<name>.resource.json

where <name> defaults to resource_name unless filepath is specified. The serialization has the shape

{
    "dtype": "DimcatResource",
    "resource": "<name>.resource.json",
    "basepath": "<basepath>"
}

A standalone resource can be instantiated in the following ways:

  • DimcatResource(): Creates an empty DimcatResource for setting the .df attribute later. If no basepath is specified, the current working directory is used if the resource is to be serialized.

  • DimcatResource.from_descriptor(descriptor_path): The frictionless descriptor is loaded from disk. Its directory is used as basepath. descriptor_path is expected to end in “resource.[json|yaml]”.

  • DimcatResource.from_dataframe(df=df, resource_name, basepath): Creates a new DimcatResource from a dataframe. If basepath is not specified, the current working directory is used if the resource is to be serialized.

  • DimcatResource.from_resource(resource=DimcatResource): Creates a DimcatResource from an existing one by copying the fields it specifies.

Resource in a package (common case)#

A DimcatResource “knows” that it is part of a package if its filepath ends on .zip. In that case, the DimcatPackage will take care of the serialization and not store an individual resource descriptor.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
align_with_grouping(grouping: dimcat.data.resources.dc.DimcatIndex | pandas.core.indexes.multi.MultiIndex, sort_index=True) D[source]#

Aligns the resource with a grouping index. In the typical case, the grouping index will come with the levels [“<grouping_name>”, “corpus”, “piece”] and the result will be aligned such that every group contains the resource’s sub-dataframes for the included pieces. This is like join_on_index(), with the difference that align_with_grouping() is sensitive to the presence of “piece” index levels and returns a dataframe, whereas join_on_index() returns a new Resource and makes no assumptions about particular levels.

apply_slice_intervals(slice_intervals: dimcat.data.resources.dc.SliceIntervals | pandas.core.indexes.multi.MultiIndex) DataFrame[source]#
apply_step(step: StepSpecs | List | Tuple) DO[source]#
apply_step(*step: StepSpecs) DO

Applies one or several pipeline steps to this resource. For backward compatibility, when only a single argument is passed, it may also be a list or tuple of step specs.

property column_schema: Schema#
property dataframe: D#

Returns the dataframe underlying this resource, without applying any formatting.

property default_groupby: List[str]#
property df: D#

Returns the dataframe underlying this resource, applying the current format, if set.

extract_feature(feature: Union[Feature, Type[Feature], DimcatConfig, MutableMapping, FeatureName, str], new_name: Optional[str] = None) F[source]#
property extractable_features: Tuple[FeatureName, ...]#
property field_names: List[str]#

The names of the fields in the resource’s schema.

filter_index_level(keep_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, drop_values: Optional[Union[str, Number, bool, Iterable[Union[str, Number, bool]]]] = None, level: int | str = 0, drop_level: Optional[bool] = None) Self[source]#

Returns a copy of the resource with only those rows where the given level has desired values.

Parameters:
  • keep_values – One or several values to keep (dropping the rest). If a value is specified both for keeping and dropping, it is dropped.

  • drop_values – One or several values to drop.

  • level – Which index level to filter on.

  • drop_level – Boolean specifies whether to keep the filtered level or to drop it. The default (None) corresponds to automatic behaviour, where the level is dropped if only one value remains, otherwise kept.

Returns:

A copy of the resource with only those rows where the given level has desired values.

property format: None#
format_dataframe(format=None)[source]#

Formats the resource’s dataframe according to the specified format or, if none is specified, according to the resource’s current format. This method is called by the df property, but not by the dataframe property.

property formatted_column: Optional[str]#

A secondary value column that represents the value_column in a different format. If it hasn’t been set, it defaults to _default_formatted_column, falling back to value_column.

classmethod from_dataframe(df: D, resource_name: str, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource from a dataframe, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • df – Dataframe to create the resource from.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Where to store serialization data and its descriptor by default. If resource is a filepath, its directory is used.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_descriptor(descriptor: dict | frictionless.resource.resource.Resource, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource by loading its frictionless descriptor from disk. The descriptor’s directory is used as basepath. descriptor_path is expected to end in .resource.json.

Parameters:
  • descriptor – Descriptor corresponding to a frictionless resource descriptor.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Where to store serialization data and its descriptor by default.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_descriptor_path(descriptor_path: str, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a Resource from a frictionless descriptor file on disk.

Parameters:
  • descriptor_path – Absolute path where the JSON/YAML descriptor is located.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_filepath(filepath: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs: Optional[bool]) Self[source]#

Create a Resource from a file on disk, be it a JSON/YAML resource descriptor, or a simple path resource.

Parameters:
  • filepath – Path pointing to a resource descriptor or a simple path resource.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • descriptor_filename – Relative filepath for using a different JSON/YAML descriptor filename than the default get_descriptor_filename(). Needs to end on one of the file extensions defined in the setting package_descriptor_endings (by default ‘resource.json’ or ‘resource.yaml’).

  • basepath – Basepath to use for the resource. If None, the folder of the filepath is used.

  • auto_validate – By default, the Resource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_index(index: dimcat.data.resources.dc.DimcatIndex | pandas.core.indexes.multi.MultiIndex, resource_name: str, basepath: Optional[str] = None, descriptor_filename: Optional[str] = None, auto_validate: bool = False, default_groupby: Optional[Union[str, list[str]]] = None, format=None) Self[source]#
classmethod from_resource(resource: Resource, descriptor_filename: Optional[str] = None, resource_name: Optional[str] = None, basepath: Optional[str] = None, auto_validate: Optional[bool] = None, default_groupby: Optional[Union[str, list[str]]] = None, format=None, **kwargs) Self[source]#

Create a DimcatResource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • resource_name – Name of the resource used for retrieving it from a DimcatPackage and as filename when the resource is stored to a ZIP file.

  • basepath – Where to store serialization data and its descriptor by default. If resource is a filepath, its directory is used.

  • auto_validate – By default, the DimcatResource will not be validated upon instantiation or change (but always before writing to disk). Set True to raise an exception during creation or modification of the resource, e.g. replacing the column_schema.

  • default_groupby – Pass a list of column names or index levels to groupby something else than the default (by piece).

  • format – Defines the format.

classmethod from_resource_and_dataframe(resource: Resource, df: D, descriptor_filename: Optional[str] = None, **kwargs) Self[source]#

Create a DimcatResource from an existing Resource, specifying its name and, optionally, at what path it is to be serialized.

Parameters:
  • resource – An existing frictionless.Resource or a filepath.

  • **kwargs – Init arguments to override.

classmethod from_resource_path(resource_path: str, resource_name: Optional[str] = None, descriptor_filename: Optional[str] = None, **kwargs) Self[source]#

Create a DimcatResource from path to a (tabular) resource file. Currently, only TSV files are supported and they are expected to contain at least the columns “corpus” and “piece”, which are used as index.

get_dataframe(index_col: Optional[Union[int, str, Tuple[int | str]]] = None, usecols: Optional[Union[int, str, Tuple[int | str]]] = None) D[source]#

Load the dataframe from disk based on the descriptor’s normpath. This does not change the resource’s status.

Parameters:
  • index_col – Can be used to override the primary_key(s) specified in the resource’s schema. Value(s) can be column name(s) or column position(s), or both.

  • usecols – If only a subset of the fields specified in the resource’s schema is to be loaded, the names or positions of the subset.

Returns:

The dataframe or DimcatResource.

get_default_analysis() Rs[source]#

Returns the default analysis of the resource.

classmethod get_default_column_names(include_context_columns: bool = True) List[str][source]#

Returns the default column names for a DimcatResource.

get_grouping_levels(smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE) List[str][source]#

Returns the levels of the grouping index, i.e., all levels until and including ‘piece’ or ‘slice’.

get_index() DimcatIndex[source]#

Returns the index of the resource based on the primaryKey of the frictionless.Schema.

get_interval_index(round: Optional[int] = None, level_name: Optional[str] = None) IntervalIndex[source]#

Returns a pandas.IntervalIndex object based on the result of get_time_spans().

Parameters:
  • round – Pass an integer if you want to round the interval positions to so many decimals.

  • level_name – Name of the new level containing intervals. Automatically created if not specified.

get_level_names() List[str][source]#

Returns the level names of the resource’s index.

get_normpath(set_default_if_missing=False) str[source]#
get_piece_index(max_levels: int = 2) PieceIndex[source]#

Returns the PieceIndex of the resource based on get_index(). That is, an index whose last (i.e. right-most) level is unique and called piece, preceded by further index levels to its left, up to max_levels levels in total.

Parameters:

max_levels – Maximum number of index levels to include. Defaults to 2, i.e. (‘corpus’, ‘piece’).

Returns:

An index of the pieces described by the resource.

get_slice_intervals(round: Optional[int] = None, level_name: Optional[str] = None, drop_levels: Optional[Literal[False], str | int | Iterable[str | int]] = -1) SliceIntervals[source]#

Returns a SliceIntervals object based on the result of get_time_spans(). Effectively, this is this resource’s DimcatIndex with an additional level containing the time spans of the events represented by the resource’s rows. This object can be used to slice any other resource that has pieces in common.

Parameters:
  • round – Pass an integer if you want to round the interval positions to so many decimals.

  • level_name – Name of the new level containing intervals. Automatically created if not specified.

  • drop_levels – Defaults to -1, meaning that the last level of the original index (usually called ‘i’) is dropped before appending the new interval level (i.e., level ‘i’ is replaced).

Returns:

get_time_spans(round: Optional[int] = None, to_float: bool = True, dropna: bool = False) D[source]#

Returns a dataframe with start (‘left’) and end (‘end’) positions of the events represented by this resource’s rows.

Parameters:
  • round – To how many decimal places to round the intervals’ boundary values. Setting a value automatically sets to_float=True.

  • to_float – Set to True to turn the time span values into floats.

Returns:

property has_distinct_formatted_column: bool#

Returns False if no formatted_column is specified or it is identical with value_column.

property innerpath: str#

The innerpath is the resource_name plus the extension .tsv and is used as filename within a .zip archive.

property is_empty: bool#

Whether this resource is empty, i.e. does not hold any data (yet).

property is_loaded: bool#
property is_valid: bool#

Returns the result of a previous validation or, if the resource has not been validated before, validates it now. Importantly, this property assumes serialized resources to be valid. If you want to actively validate the resource, use validate() instead.

join_on_index(index: Union[DimcatIndex, IX], how: Literal['left', 'right', 'inner', 'outer', 'cross'] = 'inner') Self[source]#

A convenient way to align a resource with the index of another one through a join operation.

Parameters:
  • index – The index that this resource will be aligned with.

  • how

    The type of join to perform.

    • ‘inner’ (default): index of the new resource will contain only keys present in index, and each will be repeated as many times as it appears in index.

Returns:

A new resource.

load(force_reload: bool = False) None[source]#

Tries to load the data from disk into RAM. If successful, the .is_loaded property will be True. If the resource hadn’t been loaded before, its .status property will be updated.

make_bar_plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_bar_plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_bar_plot().

Returns:

The figure generated by calling .make_bar_plot() on the last step’s result.

make_bubble_plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_bubble_plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_bubble_plot().

Returns:

The figure generated by calling .make_bubble_plot() on the last step’s result.

make_pie_chart(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .make_pie_chart() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .make_pie_chart().

Returns:

The figure generated by calling .make_pie_chart() on the last step’s result.

property metadata: Metadata#
plot(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot().

Returns:

The figure generated by calling .plot() on the last step’s result.

plot_grouped(*step: StepSpecs, **kwargs) go.Figure[source]#

Returns a plotly figure based on the default analysis or the analysis resulting from the given steps.

Parameters:
  • step – Zero or more PipelineSteps where the last one needs to return an object that has a .plot_grouped() method, typically an Analyzer returning a Result. Defaults to get_default_analysis() if no step is specified.

  • **kwargs – Keyword arguments passed on to .plot_grouped().

Returns:

The figure generated by calling .plot_grouped() on the last step’s result.

set_basepath(basepath: str, reconcile: bool = False) None[source]#
set_dataframe(df)[source]#

Tries setting the dataframe of this feature. This method should be called exactly once after instantiating the feature. The method checks for potential problems first, then calls _adapt_newly_set_df(), assuming that the dataframe can be mutated safely, i.e. it is a copy. If auto_validate is True, the newly set dataframe will be validated.

store_dataframe(overwrite=False, validate: bool = True) None[source]#

Stores the dataframe and its descriptor to disk based on the resource’s configuration.

Parameters:
  • overwrite

  • validate

Raises:

RuntimeError – If the resource is frozen or does not contain a dataframe or if the file exists already.

store_resource(basepath: Optional[str] = None, name: Optional[str] = None, overwrite=True) Optional[str][source]#

Stores the resource as a frictionless resource consisting of a TSV file containing the data and an accompanying descriptor file (default: JSON).

Parameters:
  • basepath – The basepath to write the resource to. Defaults to the resource’s basepath.

  • name – The name of the resource. Defaults to the resource’s name.

  • overwrite – Whether to overwrite existing files. Defaults to True.

Returns:

The filepath of the stored descriptor.

subselect(tuples: Union[DimcatIndex, Iterable[tuple]], levels: Optional[Union[int, str, List[Union[int, str]]]] = None) DataFrame[source]#

Returns a copy of a subselection of the dataframe based on the union of its index tuples (or subtuples) and the given tuples.

summary_dict() dict[source]#

Returns a summary of the object.

update_default_groupby(new_level_name: str) None[source]#

Updates the value of default_groupby by prepending the new level name to it.

validate(raise_exception: bool = False, only_if_necessary: bool = False) Optional[Report][source]#

Validate the resource’s data against its descriptor.

Parameters:
  • raise_exception – (default False) Pass True to raise if the resource is not valid.

  • only_if_necessary – (default False) Pass True to skip validation if the resource has already been validated or is assumed to be valid because it exists on disk.

Returns:

None if no validation took place (e.g. because resource is empty or only_if_necessary was True). Otherwise, frictionless report resulting from validating the data against the column_schema.

Raises:

FrictionlessException – If the resource is not valid and raise_exception is True.

property value_column: Optional[str]#

Name of the column containing representative values for this resource. If not set, it defaults to _default_value_column, falling back to the last element of _feature_columns, if defined.

class dimcat.data.resources.dc.Feature(resource: Optional[Union[Resource, str]] = None, descriptor_filename: Optional[str] = None, basepath: Optional[str] = None, auto_validate: bool = True, default_groupby: Optional[Union[str, list[str]]] = None, format=None, playthrough: Playthrough = Playthrough.SINGLE)[source]#

Bases: DimcatResource

A feature is a DimcatResource that represents a single feature of a piece of music, generally some subset and/or transformation of a Facet. A feature resource usually represents one object per row and has a defined temporality (‘quarterbeats’, at the very least) relative to the scores in question.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
get_available_column_names(index_levels: bool = False, context_columns: bool = False, auxiliary_columns: bool = False, convenience_columns: bool = False, feature_columns: bool = False)[source]#

Returns the column names that are available on the resource.

property playthrough: Playthrough#
class dimcat.data.resources.dc.IndexField(*, load_default: ~typing.Any = <marshmallow.missing>, dump_default: ~typing.Any = <marshmallow.missing>, data_key: ~typing.Optional[str] = None, attribute: ~typing.Optional[str] = None, validate: ~typing.Optional[~typing.Union[~typing.Callable[[~typing.Any], ~typing.Any], ~typing.Iterable[~typing.Callable[[~typing.Any], ~typing.Any]]]] = None, required: bool = False, allow_none: ~typing.Optional[bool] = None, load_only: bool = False, dump_only: bool = False, error_messages: ~typing.Optional[dict[str, str]] = None, metadata: ~typing.Optional[~typing.Mapping[str, ~typing.Any]] = None)[source]#

Bases: Field

A marshmallow field for DimcatIndex objects.

class dimcat.data.resources.dc.PieceIndex(index: Optional[IX] = None)[source]#

Bases: DimcatIndex[IX]

A unique DimcatIndex where the last (i.e. right-most) level is named piece.

classmethod from_index(index: Union[DimcatIndex[IX], IX], recognized_piece_columns: Optional[Iterable[str]] = None, max_levels: int = 2) Self[source]#

Create a PieceIndex from another index.

classmethod from_resource(resource: dimcat.data.resources.dc.DimcatResource | frictionless.resource.resource.Resource, index_col: Optional[Union[int, str, List[Union[int, str]]]] = None, recognized_piece_columns: Optional[Iterable[str]] = None, max_levels: int = 2) Self[source]#

Create a PieceIndex from a frictionless Resource.

classmethod from_tuples(tuples: Iterable[tuple], level_names: Sequence[str] = ('corpus', 'piece')) Self[source]#
class dimcat.data.resources.dc.Playthrough(value)[source]#

Bases: FriendlyEnum

Different types of behaviour regarding repeat structures encoded in score-related data.

SINGLE:

(default) Represent data for a “single playthrough”. If first and second endings are present, the first (third, etc.) are dropped to exclude incorrect transitions and adjacencies between the first- and second-ending bars.

RAW: Leave data as-is.

RAW = 'RAW'#
SINGLE = 'SINGLE'#
class dimcat.data.resources.dc.SliceIntervals(index: Optional[IX] = None, basepath: Optional[str] = None)[source]#

Bases: DimcatIndex

class dimcat.data.resources.dc.UnitOfAnalysis(value)[source]#

Bases: LowercaseEnum

Serves to specify a grouping of index levels that may depend on the object type and history.

SLICE: Stands for all levels down to the last slice level. If no Slicer has been applied it corresponds to PIECE. PIECE: All levels down to the piece level. GROUP: Current default_groupby based on previously applied Groupers. CORPUS_GROUP: Like GROUP, except the first grouping level is guaranteed to be ‘corpus’.

CORPUS_GROUP = 'CORPUS_GROUP'#
GROUP = 'GROUP'#
PIECE = 'PIECE'#
SLICE = 'SLICE'#

dimcat.data.resources.facets module#

dimcat.data.resources.features module#

dimcat.data.resources.results module#

dimcat.data.resources.utils module#

dimcat.data.resources.utils.align_with_grouping(df: pd.DataFrame, grouping: DimcatIndex | pd.MultiIndex, sort_index: bool = True) pd.DataFrame[source]#

Aligns a dataframe with a grouping index that has n levels such that the index levels of the new dataframe start with the n levels of the grouping index and are followed by the remaining levels of the original dataframe. This is typically used to align a dataframe with feature information for many pieces with an index grouping piece names.

dimcat.data.resources.utils.append_index_levels(old_index: IX, *new_level: IX | S | D, drop_levels: Optional[Literal[False], str | int | Iterable[str | int]] = None) IX[source]#

Replace index levels by optionally dropping an arbitrary number and concatenating the new level(s) to the right.

dimcat.data.resources.utils.apply_playthrough(feature_df: D, playthrough: Playthrough, logger: Optional[logging.Logger] = None) D[source]#

Transform a dataframe based on the resource’s playthrough setting.

dimcat.data.resources.utils.apply_slice_intervals_to_resource_df(df: DataFrame, slice_intervals: MultiIndex, qstamp_column_name: str = 'quarterbeats', duration_column_name: str = 'duration_qb', logger: Optional[Logger] = None) DataFrame[source]#
dimcat.data.resources.utils.boolean_is_minor_column_to_mode(S: Series) Series[source]#
dimcat.data.resources.utils.check_configs_against_allowed_configs(configs: DimcatConfig | Iterable[DimcatConfig], allowed_configs: Optional[FeatureSpecs | Iterable[FeatureSpecs]], allow_subclasses: bool = True) None[source]#

Matches configs against allowed configs and raises as soon as any pair does not match. Two configs match if they have the same dtype and any overlapping key has the same value.

Parameters:
  • configs – Config(s) to be checked.

  • allowed_configs – The function raises if any of the configs does not match with any of these.

  • allow_subclasses – If True (default), configs dtypes are allowed to be subclasses of the allowed_configs dtypes.

Raises:

ResourceNotProcessableError when any of the configs doesn't match with any of the allowed configs.

dimcat.data.resources.utils.check_qstamp_columns(df: D, qstamp_column_name: str, duration_column_name: str, logger: Optional[Logger] = None) None[source]#
dimcat.data.resources.utils.condense_dataframe_by_groups(df: DataFrame, group_keys_series: Series, logger: Optional[Logger] = None)[source]#

Based on the given group_keys_series, drop all rows but the first of each group and adapt the column ‘duration_qb’ accordingly.

Parameters:
  • df – DataFrame to be reduced, expected to contain the column duration_qb. In order to use the result as a segmentation, it should have a pandas.IntervalIndex.

  • group_keys_series – Series with the same index as df that contains the group keys. If it contains NA values, the

Returns:

Reduced DataFrame with updated ‘duration_qb’ column and pandas.IntervalIndex on the first level (if present).

dimcat.data.resources.utils.condense_pedal_points(df)[source]#

Condenses pedal points into single rows. The duration of the pedal point is summed up and the chord is replaced by the pedal

dimcat.data.resources.utils.drop_duplicated_ultima_rows(phrase_annotations_df: D) D[source]#

Used by the PhraseDataAnalyzer to drop the last row of each phrase’s body component when drop_duplicated_ultima_rows is True.

dimcat.data.resources.utils.drop_rows_with_missing_values(df: D, column_names: List[str], how: Literal['any', 'all'] = 'any', logger: Optional[Logger] = None) D[source]#

Drop rows with missing values in the specified columns. If nothing is to be dropped, the identical dataframe is returned, not a copy.

dimcat.data.resources.utils.ensure_level_named_piece(index: MultiIndex, recognized_piece_columns: Optional[Iterable[str]] = None) Tuple[MultiIndex, int][source]#
Ensures that the index has a level named “piece” by detecting alternative level names and renaming it in case it doesn’t have one. Returns the index and the position of the piece level.

Parameters:
  • index – MultiIndex.

  • recognized_piece_columns – Defaults to (“pieces”, “fname”, “fnames”). If other names are to be recognized as “piece” level, pass those.

Returns:

The same index or a copy with a renamed level. The position of the piece level.

dimcat.data.resources.utils.feature_specs2config(feature: FeatureSpecs) DimcatConfig[source]#

Converts a feature specification to a DimcatConfig.

Raises:

TypeError – If the specs cannot be resolved to a DimcatConfig that describes a Feature.

dimcat.data.resources.utils.features_argument2config_list(features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None, allowed_configs: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None) List[DimcatConfig][source]#
dimcat.data.resources.utils.fl_fields2pandas_params(fields: List[Field]) Tuple[dict, dict, list][source]#

Convert frictionless Fields to pd.read_csv() parameters ‘dtype’, ‘converters’ and ‘parse_dates’.

dimcat.data.resources.utils.get_corpus_display_name(repo_name: str) str[source]#

Looks up a repository name in the CORPUS_NAMES constant. If not present, the repo name is returned as title case.

dimcat.data.resources.utils.get_existing_normpath(fl_resource) str[source]#

Get the normpath of a frictionless resource, raising an exception if it does not exist.

Parameters:

fl_resource – The frictionless resource. If its basepath is not specified, the filepath is tried relative to the current working directory.

Returns:

The absolute path of the frictionless resource.

Raises:
dimcat.data.resources.utils.get_time_spans_from_resource_df(df: DataFrame, qstamp_column_name: str, duration_column_name: str, round: Optional[int], to_float: bool, dropna: bool, return_df: Literal[False], logger: Optional[Logger]) DataFrame[source]#
dimcat.data.resources.utils.get_time_spans_from_resource_df(df: DataFrame, qstamp_column_name: str, duration_column_name: str, round: Optional[int], to_float: bool, dropna: bool, return_df: Literal[True], logger: Optional[Logger]) Tuple[DataFrame, DataFrame]

Returns a dataframe with start (‘left’) and end (‘right’) positions of the events represented by this resource’s rows.

Parameters:
  • df

  • qstamp_column_name – Column from which to retrieve start positions.

  • duration_column_name – Column from which to retrieve durations to be added to the start positions.

  • round – To how many decimal places to round the intervals’ boundary values. Setting a value automatically sets to_float=True.

  • to_float – By default (True), the returned time span values are floats. Set False to leave values as they are after adding the columns, e.g. as fractions. If round is specified, however, this has no effect since the values are rounded to floats anyway.

  • dropna – By default (False), rows with missing values are ignored and the result will include missing values for them. Pass True to drop rows with missing values. In this case you may also want to set return_df=True.

  • return_df – Pass True if you want to return the original dataframe as well, especially when dropna=True.

  • logger

Returns:

A dataframe with columns start and end. If return_df=True, the input dataframe is returned as used for computing the time spans.

dimcat.data.resources.utils.infer_piece_col_position(column_name: List[str], recognized_piece_columns: Optional[Iterable[str]] = None) Optional[int][source]#

Infer the position of the piece column in a list of column names.

dimcat.data.resources.utils.infer_schema_from_df(df: SomeDataframe, include_index_levels: bool = True, allow_integer_names: bool = True, **kwargs) fl.Schema[source]#

Infer a frictionless.Schema from a dataframe.

This function partially copies ms3.utils.frictionless_helpers.get_schema().

Parameters:
  • df

  • include_index_levels – By default (True), all index levels are added to the described columns and, in addition, made the primaryKey (which, in frictionless, implies the constraints “required” & “unique”). Set to False to leave the index levels undescribed, assuming that they will not be written to disk (otherwise, validation error).

  • **kwargs – Arbitrary key-value pairs that will be added to the frictionless schema descriptor as “custom” metadata.

Returns:

dimcat.data.resources.utils.insert_index_level(old_index: IX, new_level: Union[IX, S, D], position: int) IX[source]#

Insert new_level as an additional index level at the given position of old_index.

dimcat.data.resources.utils.join_df_on_index(df: pd.DataFrame, index: DimcatIndex | pd.MultiIndex, how: Literal['left', 'right', 'inner', 'outer', 'cross'] = 'inner') pd.DataFrame[source]#
dimcat.data.resources.utils.load_fl_resource(fl_resource: fl.Resource, normpath: Optional[str] = None, index_col: Optional[int | str | Iterable[int | str]] = None, usecols: Optional[int | str | Iterable[int | str]] = None) SomeDataframe[source]#

Load a dataframe from a frictionless.Resource.

Parameters:
  • fl_resource – The resource whose normpath points to a file on the local file system.

  • normpath – If not specified, the normpath of the resource is used, which is not always reliable because its own basepath property is half-heartedly maintained.

  • index_col – Column(s) to be used as index levels, overriding the primary key specified in the resource’s schema.

  • usecols – If only a subset of the specified fields is to be loaded, the names or positions of the subset.

Returns:

The dataframe, loaded with the dtypes resulting from converting the schema fields via fl_fields2pandas_params().

dimcat.data.resources.utils.load_index_from_fl_resource(fl_resource: fl.Resource, index_col: Optional[int | str | List[int | str]] = None, recognized_piece_columns: Iterable[str] = ('piece', 'pieces', 'fname', 'fnames')) SomeIndex[source]#

Load the index columns from a frictionless Resource.

Parameters:
  • fl_resource – The frictionless Resource to load the index columns from.

  • index_col – The column(s) to use as index. If None, the primary key of the schema is used if it exists.

  • recognized_piece_columns – If the loaded columns do not include ‘piece’ but one of the names specified here, the first column name of the iterable that is detected in the loaded columns will be renamed to ‘piece’. Likewise, such a column would be used (and renamed) if index_col is not specified and the schema does not specify a primary key: in that case, the detected column and all columns left of it will be used as the index_col argument.

Returns:

The specified or inferred index column(s) as a (Multi)Index object.

Raises:
  • FileNotFoundError – If the normpath of the resource does not exist.

  • ValueError – If the resource doesn’t yield a normpath or the index columns cannot be inferred from it based on the schema.

dimcat.data.resources.utils.make_adjacency_groups(S: Series, groupby=None, logger: Optional[Logger] = None) Tuple[Series, Dict[int, Any]][source]#

Turns a Series into a Series of ascending integers starting from 1 that reflect groups of successive equal values.

This is a simplified variant of ms3.adjacency_groups()

Parameters:
  • S – Series in which to group identical adjacent values with each other.

  • groupby – If not None, the resulting grouper will start new adjacency groups according to this groupby. This is a way, for example, to ensure no group overlaps piece boundaries even if there are adjacent identical values.

Returns:

A series with increasing integers that can be used for grouping. A dictionary mapping the integers to the grouped values.

dimcat.data.resources.utils.make_adjacency_mask(S: Series) Series[source]#

Turns a Series into a Boolean Series that is True for the first value of each group of successive equal values.

dimcat.data.resources.utils.make_boolean_mask_from_set_of_tuples(index: DimcatIndex | pd.MultiIndex, tuples: Set[tuple], levels: Optional[Iterable[int]] = None) pd.Index[bool][source]#

Returns a boolean mask for the given tuples based on index tuples formed from integer positions of the index levels to subselect.

Parameters:
  • index – Index (of the dataframe) you want to subselect from using the returned boolean mask.

  • tuples

  • levels

    • If None, the first n levels of the index are used, where n is the length of the selection tuples.

    • If an iterable of level name strings or level position integers, they are used to create for each row a tuple to compare against the selected tuples.

Returns:

A boolean mask of the same length as the index, where True indicates that the corresponding index tuple is contained in the selection tuples.

Raises:
  • TypeError – If tuples is not a set.

  • ValueError – If tuples is empty.

  • ValueError – If the index has fewer levels than the selection tuples.

  • ValueError – If levels is not None and has a different length than the selection tuples.

dimcat.data.resources.utils.make_frictionless_schema_descriptor(column_names: Iterable[str], primary_key: Optional[Iterable[str]] = None, **custom_data) dict[source]#

Creates a frictionless schema descriptor from a list of column names and a primary key.

This function is a duplicate of ms3.utils.frictionless_helpers.make_frictionless_schema_descriptor() and the translation of column names into frictionless fields (with type and description) falls back to ms3.utils.frictionless_helpers.column_name2frictionless_field().

Parameters:
  • column_names

  • primary_key

  • **custom_data

Returns:

dimcat.data.resources.utils.make_group_start_mask(df: D, groupby) ndarray[Any, dtype[bool]][source]#

Returns a boolean mask where the beginning of each group is marked with True. This is useful only when the groups already came in groups within the dataframe in the first place.

dimcat.data.resources.utils.make_groups_lasts_mask(feature_df: Union[D, S], groupby=None) ndarray[Any, dtype[bool]][source]#

Returns a boolean mask where each row that comes last in one of the groups is marked as True. This is useful only when the groups already came in groups within the dataframe in the first place. Instead of a dataframe with groupby columns you may also pass a Series with None.

dimcat.data.resources.utils.make_groupwise_range_index_from_groups(idx: Index) ndarray[Any, dtype[int]][source]#

Turns adjacency groups into integer ranges starting from 0.

dimcat.data.resources.utils.make_index_from_grouping_dict(grouping: Dict[str, Iterable[tuple]], level_names=('group_name', 'corpus', 'piece'), sort=False, raise_if_multiple_membership: bool = False) MultiIndex[source]#

Creates a MultiIndex from a dictionary with grouped tuples.

Parameters:
  • grouping – A dictionary where keys are group names and values are lists of index tuples.

  • level_names – Names for the levels of the MultiIndex, i.e. one for the group level and one per level in the tuples.

  • sort – By default (False), the returned MultiIndex is not sorted. Pass True to sort it.

  • raise_if_multiple_membership – If True, raises a ValueError if a member is in multiple groups.

Returns:

A MultiIndex with the given names and the tuples from the grouping dictionary.

dimcat.data.resources.utils.make_multiindex_for_unstack(idx: Index, level_name: str = 'i') MultiIndex[source]#

Turns an index that contains adjacency groups (adjacent entries having the same value) into a 2-level MultiIndex where the new level represents an individual integer range for each group, starting at 0.

dimcat.data.resources.utils.make_phrase_start_mask(df) ndarray[Any, dtype[bool]][source]#

Based on the “phrase_id” index level, make a mask that is True for the first row of each phrase.

dimcat.data.resources.utils.make_range_index_from_boolean_mask(inner_start_mask: ndarray[Any, dtype[bool]], outer_start_mask: Optional[ndarray[Any, dtype[bool]]] = None) ndarray[Any, dtype[int]][source]#

Creates an index with the same length as the given boolean mask, that restarts counting from every True entry. The behaviour changes depending on whether outer_start_mask is given or not. That’s how the function is used by PhraseData._regroup_phrases() to create both the inner and the outer index level. The function is indifferent to the value of the first entry in the mask(s).

The algorithm builds on Warren Weckesser’s approach via https://stackoverflow.com/a/20033438

Parameters:
  • inner_start_mask

  • outer_start_mask

Returns:

dimcat.data.resources.utils.make_regrouped_stage_index(df: D, grouping: S, level_names: Tuple[str, str] = ('stage', 'substage')) D[source]#

Returns a dataframe that corresponds to the two new (stage) index levels that regroup_phrase_stages() incorporates.

dimcat.data.resources.utils.make_tsv_resource(name: Optional[str] = None) Resource[source]#

Returns a frictionless.Resource with the default properties of a TSV file stored to disk.

dimcat.data.resources.utils.merge_columns_into_one(df: D, join_str: Optional[Union[str, bool]] = None, fillna: Optional[Hashable] = None) S[source]#

Merge all columns of a dataframe into a single column.

Parameters:
  • df – Dataframe to reduce.

  • join_str

    By default (None), the resulting columns contain tuples. If you want them to contain strings, you may pass

    • True to concatenate the tuple values for a given n-gram component separated by “, “ – yielding strings that look like tuples without parentheses

    • False to concatenate without any string in-between the values

    • a string to be used as the separator between the tuple values.

  • fillna – Pass a value to replace all missing values with it.

Returns:

A series containing tuples or strings.

dimcat.data.resources.utils.merge_ties(df: D, return_dropped: bool = False, perform_checks: bool = True, logger: Optional[Logger] = None)[source]#

In a note list, merge tied notes to single events with accumulated durations. Input dataframe needs columns [‘duration’, ‘tied’, ‘midi’, ‘staff’]. This function does not handle correctly overlapping ties on the same pitch since it doesn’t take into account the notational layers (‘voice’).

Copied from ms3, to be developed further.

Parameters:
  • df

  • return_dropped

  • perform_checks

  • logger

Returns:

dimcat.data.resources.utils.nan_eq(a, b)[source]#

Returns True if a and b are equal or both null. Works on two Series or two elements.

dimcat.data.resources.utils.overlapping_chunk_per_interval_cutoff_direct(df: DataFrame, lefts: ndarray[Any, dtype[_ScalarType_co]], rights: ndarray[Any, dtype[_ScalarType_co]], intervals: IntervalIndex, qstamp_column_name: str = 'quarterbeats', duration_column_name: str = 'duration_qb', logger: Optional[Logger] = None) DataFrame[source]#

The heart of a slicing operation, which returns a dataframe that corresponds to the input dataframe sliced by the intervals present in the intervals pandas.IntervalIndex, which will be included as the first index level of the result dataframe.

Parameters:
  • df – DataFrame to be sliced.

  • lefts – Same-length array expressing the start point of every row.

  • rights – Same-length array expressing the end point (exclusive) of every row.

  • qstamp_column_name – Name of the column in which qstamp (offset from the timeline’s origin) is to be found.

  • duration_column_name – Name of the column in the chunk dfs where the new event durations will be stored as floats. Defaults to “duration_qb”, resulting in the existing values being updated.

  • intervals – The pairs are interpreted as left-closed, right-open intervals that demarcate the boundaries of the returned DataFrame chunks. These intervals are assumed to be non-overlapping and monotonically increasing, which allows us to speed up this expensive operation.

Returns:

Concatenation of the dataframe chunks corresponding to each of the given intervals. The first index level of the resulting dataframe is a pandas.IntervalIndex which corresponds to the intervals.

dimcat.data.resources.utils.regroup_phrase_stages(df: D, grouping: S, level_names: Tuple[str, str] = ('stage', 'substage'))[source]#

Insert a grouping column and replace the last index level with a new primary and secondary index accordingly. The primary level increments at the beginning of each group, the secondary level increments at every row, restarting at the beginning of each group. For example, a grouping [“a”, “a”, “a”, “b”, “c”, “c”] results in the index [(0, 0), (0, 1), (0, 2), (1, 0), (2, 0), (2, 1)].

Parameters:
  • grouping – A Series with the same index as the (raw) phrase_df, containing the grouping criterion. Adjacent equal values are grouped together.

  • level_names – Names of the two index levels.

Returns:

A reindexed copy of the phrase data.

dimcat.data.resources.utils.resolve_columns_argument(columns: Optional[Union[str, int, Iterable[str | int]]], column_names: List[str]) Optional[List[str]][source]#

Resolve the columns argument of a load function to a list of column names.

Parameters:
  • columns – A list of integer positions and/or column names. Can be mixed but integers will always be interpreted as positions.

  • column_names – List of column names to choose from.

Returns:

The resolved list of column names. None if columns is None.

Raises:

ValueError – If columns contains duplicate column names.

dimcat.data.resources.utils.resolve_join_str_argument(join_str: Optional[Union[bool, str, Tuple[bool | str, ...]]]) Optional[str][source]#

Helper function that resolves a join_str argument to a string or None by replacing boolean values with the defaults “, “ for True and “” for False.

dimcat.data.resources.utils.resolve_levels_argument(levels: Optional[Union[str, int, Iterable[str | int]]], level_names: List[str], inverse: bool = False) Optional[Tuple[int]][source]#

Turns a selection of index levels into a list of positive level positions.

dimcat.data.resources.utils.resolve_recognized_piece_columns_argument(recognized_piece_columns: Optional[Iterable[str]] = None) List[str][source]#

Resolve the recognized_piece_columns argument by replacing None with the default value.

dimcat.data.resources.utils.safe_row_tuple(row: Iterable[str]) Literal[<NA>]][source]#

Join the given strings together separated by ‘, ‘ but catch TypeErrors by returning pd.NA instead.

dimcat.data.resources.utils.store_json(data: dict, filepath: str, indent: int = 2, make_dirs: bool = True, **kwargs)[source]#

Serialize object to file.

Parameters:
  • data – Nested structure of dicts and lists.

  • filepath – Path to the text file to (over)write.

  • indent – Prettify the JSON layout. Default indentation: 2 spaces

  • make_dirs – If True (default), create the directory if it does not exist.

  • **kwargs – Keyword arguments passed to json.dumps().

dimcat.data.resources.utils.str2inttuple(s)[source]#

Non-strict version of ms3.str2inttuple() which does not fail on non-integer values.

dimcat.data.resources.utils.str2pd_interval(s: str) Interval[source]#

Function produces only left-closed, right-open intervals.

dimcat.data.resources.utils.subselect_multiindex_from_df(df: D, tuples: DimcatIndex | Iterable[tuple], levels: Optional[int | str | List[int | str]] = None) pd.DataFrame[source]#

Returns a copy of the subset of the dataframe’s rows whose index tuples (or subtuples) are contained in the given tuples.

Parameters:
  • df – Dataframe of which to return a subset of rows.

  • tuples – Tuples to match against df’s MultiIndex. Can be a MultiIndex because set(tuples) works on that, too.

  • levels

    • If None, the first n levels of the index are used, where n is the length of the selection tuples.

    • If an iterable of level name strings or level position integers, they are used to create for each row a tuple to compare against the selected tuples.

Returns:

dimcat.data.resources.utils.transform_phrase_data(phrase_df, columns: Union[str, List[str]] = 'chord', components: Union[Literal['ante', 'body', 'codetta', 'post'], List[Literal['ante', 'body', 'codetta', 'post']]] = 'body', drop_levels: Union[bool, int, str, Iterable[str | int]] = False, reverse: bool = False, level_name: str = 'i')[source]#

Returns a dataframe containing the requested phrase components and harmony columns.

Parameters:
  • phrase_df – PhraseAnnotations dataframe.

  • columns – Column(s) to include in the result.

  • components – Which of the four phrase components to include, ∈ {‘ante’, ‘body’, ‘codetta’, ‘post’}.

  • drop_levels – Can be a boolean or any level specifier accepted by pandas.MultiIndex.droplevel(). If False (default), all levels are retained. If True, only the phrase_id level and the level_name are retained. In all other cases, the indicated (string or integer) value(s) must be valid and cause one of the index levels to be dropped. level_name cannot be dropped. Dropping ‘phrase_id’ will likely lead to an exception if a PhraseData object will be displayed in WIDE format.

  • reverse – Pass True to reverse the order of harmonies so that each phrase’s last label comes first.

  • level_name – Defaults to ‘i’, which is the name of the original level that will be replaced by this new one. The new one represents the individual integer range for each phrase, starting at 0.

Returns:

Dataframe representing partial information on the selected phrases.

dimcat.data.resources.utils.transpose_notes_to_c(notes: D) D[source]#

Transpose the columns ‘tpc’ and ‘midi’ in a way that they reflect the local key as if it was C major/minor. This operation is typically required for creating pitch class profiles. Uses: ms3.transform(), ms3.name2fifths(), ms3.roman_numeral2fifths()

Parameters:

notes – DataFrame that has at least the columns [‘globalkey’, ‘localkey’, ‘tpc’, ‘midi’].

Returns:

A new dataframe with the columns ‘local_tonic_name’, ‘fifths_over_local_tonic’, and ‘midi_in_c’ where the latter two correspond to the original columns ‘tpc’ and ‘midi’ but transposed in such a way that fifths_over_local_tonic == 0 and midi_in_c % 12 == 0 for all pitches that match the local tonic. E.g. for the local key A major/minor, each pitch A will have tpc=0 and midi % 12 = 0.

dimcat.data.resources.utils.tuple2str(tup: tuple, join_str: Optional[str] = ', ', recursive: bool = True, keep_parentheses: bool = False) str[source]#

Used for turning n-gram components into strings, e.g. for display on plot axes.

Parameters:
  • tup – Tuple to be returned as string.

  • join_str – String to be interspersed between tuple elements. If None, result is str(tup) and recursive is ignored.

  • recursive – If True (default) tuple elements that are tuples themselves will be joined together recursively, using the same join_str (except when it’s None). Inner tuples always keep their parentheses.

  • keep_parentheses – If False (default), the outer parentheses are removed. Pass True to keep them in the string.

Returns:

A string representing the tuple.

dimcat.data.resources.utils.update_duration_qb(df: D, update_mask: Optional[ndarray[Any, dtype[bool]]] = None, logger: Optional[Logger] = None) None[source]#

Replaces the ‘duration_qb’ column in the given DataFrame with a new one that updates the values by subtracting subsequent ‘quarterbeats’ values. If update_mask is specified, only values for which the mask is True are updated. Otherwise, all values are updated.

dimcat.data.resources.utils.value2bool(value: str | float | int | bool) bool | str | float | int[source]#

Identical with ms3.value2bool

Module contents#