dimcat.steps.loaders package#

Submodules#

dimcat.steps.loaders.base module#

A loader reads an existing datapackage or creates one by parsing data from a source.

class dimcat.steps.loaders.base.FacetName(value)[source]#

Bases: FriendlyEnum

The names of the facets that can be extracted from scores.

annotations = 'annotations'#
control = 'control'#
events = 'events'#
metadata = 'metadata'#
structure = 'structure'#
class dimcat.steps.loaders.base.LoadedFacets(events: Dict[tuple, pandas.core.frame.DataFrame] = <factory>, control: Dict[tuple, pandas.core.frame.DataFrame] = <factory>, structure: Dict[tuple, pandas.core.frame.DataFrame] = <factory>, annotations: Dict[tuple, pandas.core.frame.DataFrame] = <factory>, metadata: Dict[tuple, pandas.core.series.Series] = <factory>)[source]#

Bases: object

annotations: Dict[tuple, DataFrame]#
control: Dict[tuple, DataFrame]#
events: Dict[tuple, DataFrame]#
get_concatenated_facets() Dict[str, DataFrame][source]#
metadata: Dict[tuple, Series]#
structure: Dict[tuple, DataFrame]#
class dimcat.steps.loaders.base.Loader(basepath: Optional[str] = None, packages: Optional[DimcatCatalog] = None)[source]#

Bases: PipelineStep

Base class for all loaders.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
add_package(package: DimcatPackage) None[source]#

Add a package to the loader that contains resources to be processed.

property basepath: str#
check_resource(resource: Resource) None[source]#

Checks whether the resource at the given path exists.

create_dataset() Dataset[source]#
fit_to_dataset(dataset: Dataset) None[source]#

Fit this PipelineStep to a Dataset.

classmethod from_directory(directory: str, package_name: Optional[str] = None, extensions: Optional[Iterable[str]] = None, file_re: Optional[str] = None, exclude_re: Optional[str] = None, resource_names: Optional[Callable[[str], Optional[str]]] = None, corpus_names: Optional[Callable[[str], Optional[str]]] = None, auto_validate: bool = False, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a loader from a ScorePackage created on the fly from the files discovered in a directory.

Parameters:
  • directory – The directory that is to be scanned for files with particular extensions.

  • package_name – The name of the new package. If None, the base of the directory is used.

  • extensions – The extensions of the files to be discovered under directory and which are to be turned into Resource objects. Defaults to this loader’s _accepted_file_extensions.

  • file_re – Pass a regular expression in order to select only files that (partially) match it.

  • resource_names – Name factory for the resources created from the paths. Names also serve as piece identifiers. By default, the filename is used. To override this behaviour you can pass a callable that takes a filepath and returns a name. When the callable returns None, the default is used (i.e., the filename). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • corpus_names – Names of (or name factory for) the corpus that each resource (=piece) belongs to and that is used in the (‘corpus’, ‘piece’) ID. By default, the name of the package is used. To override this behaviour you can pass a callable that takes a path and returns a name. When the callable returns None, the default is used (i.e., the package_name). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • auto_validate – Set True to validate the new package after copying it.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

classmethod from_filepaths(filepaths: Iterable[str], basepath: Optional[str] = None) Self[source]#

Create a loader from a DimcatPackage created on the fly from an iterable of filepaths.

Parameters:
  • filepaths – The filepaths that are to be turned into Resource objects and packaged.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

classmethod from_package(package: DimcatPackage, basepath: Optional[str] = None) Self[source]#

Create a loader from a DimcatPackage.

get_basepath() str[source]#

Get the basepath of the resource. If not specified, the default basepath is returned.

iter_package_descriptors() Iterator[str][source]#

Create datapackage(s) for the input catalog of a Dataset and iterate over their descriptor paths.

iter_resource_paths() Iterator[str][source]#

Iterate over the paths of the resources in the package(s).

iter_resources() Iterator[Resource][source]#

Iterate over the resources in the package(s).

process_resource(resource: Resource) None[source]#
property sources: List[str]#
class dimcat.steps.loaders.base.PackageLoader(basepath: Optional[str] = None, packages: Optional[DimcatCatalog] = None)[source]#

Bases: Loader

Simple loader that discovers and loads frictionless datapackages through their descriptors.

default_loader_name = 'package_loader'#
classmethod from_directory(directory: str, package_name: Optional[str] = None, extensions: Optional[Iterable[str]] = None, file_re: Optional[str] = None, exclude_re: Optional[str] = None, resource_names: Optional[Callable[[str], Optional[str]]] = None, corpus_names: Optional[Callable[[str], Optional[str]]] = None, auto_validate: bool = False, basepath: Optional[str] = None, **kwargs) Self[source]#

Create a loader from a ScorePackage created on the fly from the files discovered in a directory.

Parameters:
  • directory – The directory that is to be scanned for files with particular extensions.

  • package_name – The name of the new package. If None, the base of the directory is used.

  • extensions – The extensions of the files to be discovered under directory and which are to be turned into Resource objects. Defaults to this loader’s _accepted_file_extensions.

  • file_re – Pass a regular expression in order to select only files that (partially) match it.

  • resource_names – Name factory for the resources created from the paths. Names also serve as piece identifiers. By default, the filename is used. To override this behaviour you can pass a callable that takes a filepath and returns a name. When the callable returns None, the default is used (i.e., the filename). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • corpus_names – Names of (or name factory for) the corpus that each resource (=piece) belongs to and that is used in the (‘corpus’, ‘piece’) ID. By default, the name of the package is used. To override this behaviour you can pass a callable that takes a path and returns a name. When the callable returns None, the default is used (i.e., the package_name). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • auto_validate – Set True to validate the new package after copying it.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

class dimcat.steps.loaders.base.ScoreLoader(basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False)[source]#

Bases: Loader

Base class for all loaders that parse scores and create a datapackage containing the extracted facets.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
add_piece_facet_dataframe(facet_name: FacetName, ID: tuple, df: pandas.core.frame.DataFrame | pandas.core.series.Series) None[source]#
check_resource(resource: PathResource) None[source]#

Checks whether the resource at the given path exists.

property descriptor_exists: bool#
property descriptor_path: Optional[str]#
classmethod from_directory(directory: str, package_name: Optional[str] = None, extensions: Optional[Iterable[str]] = None, file_re: Optional[str] = None, exclude_re: Optional[str] = None, resource_names: Optional[Callable[[str], Optional[str]]] = None, corpus_names: Optional[Callable[[str], Optional[str]]] = None, auto_validate: bool = False, basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False) Self[source]#

Create a loader from a ScorePackage created on the fly from the files discovered in a directory.

Parameters:
  • directory – The directory that is to be scanned for files with particular extensions.

  • package_name – The name of the new package. If None, the base of the directory is used.

  • extensions – The extensions of the files to be discovered under directory and which are to be turned into Resource objects. Defaults to this loader’s _accepted_file_extensions.

  • file_re – Pass a regular expression in order to select only files that (partially) match it.

  • resource_names – Name factory for the resources created from the paths. Names also serve as piece identifiers. By default, the filename is used. To override this behaviour you can pass a callable that takes a filepath and returns a name. When the callable returns None, the default is used (i.e., the filename). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • corpus_names – Names of (or name factory for) the corpus that each resource (=piece) belongs to and that is used in the (‘corpus’, ‘piece’) ID. By default, the name of the package is used. To override this behaviour you can pass a callable that takes a path and returns a name. When the callable returns None, the default is used (i.e., the package_name). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • auto_validate – Set True to validate the new package after copying it.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

classmethod from_filepaths(filepaths: Iterable[str], package_name: str, resource_names: Optional[Union[Iterable[str], Callable[[str], str]]] = None, corpus_names: Optional[Union[Iterable[str], Callable[[str], Optional[str]]]] = None, auto_validate: bool = False, basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False) Self[source]#

Create a loader from a ScorePackage created on the fly from an iterable of filepaths.

Parameters:
  • filepaths – The filepaths that are to be turned into Resource objects and packaged.

  • package_name – The name of the new package.

  • resource_names – Names of (or name factory for) the created resources serving as piece identifiers. By default, the filename is used. To override this behaviour you can pass an iterable of names corresponding to paths, or a callable that takes a path and returns a name. When the callable returns None, the default is used (i.e., the filename). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • corpus_names – Names of (or name factory for) the corpus that each resource (=piece) belongs to and that is used in the (‘corpus’, ‘piece’) ID. By default, the name of the package is used. To override this behaviour you can pass an iterable of names corresponding to paths, or a callable that takes a path and returns a name. When the callable returns None, the default is used (i.e., the package_name). Whatever the name turns out to be, it will always be turned into a valid frictionless name via make_valid_frictionless_name().

  • auto_validate – Set True to validate the new package after copying it.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

classmethod from_package(package: ScorePathPackage, basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False) Self[source]#

Create a loader from a DimcatPackage.

classmethod from_resources(resources: Union[Iterable[PathResource], PathResource], package_name: str, auto_validate: bool = False, basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False) Self[source]#

Create a loader from a ScorePackage created on the fly from an iterable of PathResources.

Parameters:
  • resources – The PathResource objects that will be turned into a package.

  • package_name – The name of the new package.

  • auto_validate – Set True to validate the new package after copying it.

  • basepath – The basepath where the new package will be stored. If None, the basepath of the original package is used.

get_descriptor_filename() str[source]#

Returns the filename of the datapackage descriptor.

get_descriptor_path() str[source]#

Returns the path of the datapackage descriptor.

get_loader_name() str[source]#

Returns loader_name if set, otherwise default_loader_name.

get_zip_filepath() str[source]#

Returns the filename of the ZIP file that the resources of this package are serialized to.

get_zip_path() str[source]#

Returns the path of the ZIP file that the resources of this package are serialized to.

iter_package_descriptors() Iterator[str][source]#

Create datapackage(s) and iterate over their descriptor paths.

property loader_name: Optional[str]#
make_and_store_datapackage(overwrite: Optional[bool] = None) str[source]#
Parameters:

overwrite – Pass a boolean to set the loader's overwrite setting to a new value.

Returns:

Raises:

FileExistsError – If the zip file <basepath>/<package_name>.zip already exists.

parse_and_extract() None[source]#

Iterates over score resources and stores the extracted information in loaded_facets.

property processed_ids: Set[tuple]#
property zip_file_exists: bool#

dimcat.steps.loaders.m21 module#

class dimcat.steps.loaders.m21.CollectedElements(events: List[dict] = <factory>, control: List[dict] = <factory>, structure: List[dict] = <factory>, annotations: List[str] = <factory>, metadata: DefaultDict = <factory>, part_ids: List[str] = <factory>, prelims: List[str] = <factory>)[source]#

Bases: object

annotations: List[str]#
control: List[dict]#
events: List[dict]#
metadata: DefaultDict#
part_ids: List[str]#
prelims: List[str]#
structure: List[dict]#
class dimcat.steps.loaders.m21.Music21Loader(basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False)[source]#

Bases: ScoreLoader

Extracts information from scores using music21.

class dimcat.steps.loaders.m21.Music21Score(source: str)[source]#

Bases: object

Auxiliary class for extracting facets from a score parsed with music21.

add_annotation(annotation: dict)[source]#
add_control_event(event: str, event_info: dict)[source]#
add_event(event: str, event_info: dict)[source]#
add_metadata(key, value)[source]#
add_structure(structure: dict)[source]#
parse()[source]#
parse_element(element: Music21Object, **kwargs) dict[source]#
dimcat.steps.loaders.m21.default_list_dict()[source]#
dimcat.steps.loaders.m21.make_dataframe(records: List[dict], drop_empty_columns: bool = True)[source]#
dimcat.steps.loaders.m21.make_metadata(metadata_dict) Series[source]#
dimcat.steps.loaders.m21.parse_AbstractScale(value)[source]#
dimcat.steps.loaders.m21.parse_Barline(barline: Barline)[source]#
dimcat.steps.loaders.m21.parse_Chord(chord: Chord)[source]#
dimcat.steps.loaders.m21.parse_Clef(clef: Clef) str[source]#
dimcat.steps.loaders.m21.parse_ConcreteScale(concrete_scale: ConcreteScale) Tuple[str, ...][source]#
dimcat.steps.loaders.m21.parse_Duration(duration: Duration) float[source]#
dimcat.steps.loaders.m21.parse_Dynamic(dynamic: Dynamic) str[source]#
dimcat.steps.loaders.m21.parse_Editorial(editorial: Editorial) dict[source]#
dimcat.steps.loaders.m21.parse_Harmony(harmony: Harmony) str[source]#
dimcat.steps.loaders.m21.parse_Interval(interval: Interval) str[source]#
dimcat.steps.loaders.m21.parse_Key(key: Key) int[source]#
dimcat.steps.loaders.m21.parse_Measure(measure: Measure, **higher_level_info)[source]#

Inspired by MarkGotham/bar-measure

dimcat.steps.loaders.m21.parse_Microtone(microtone: Microtone) float[source]#
dimcat.steps.loaders.m21.parse_Pitch(pitch: Pitch)[source]#
dimcat.steps.loaders.m21.parse_TextExpression(text_expression: TextExpression) str[source]#
dimcat.steps.loaders.m21.parse_Tie(tie: Tie) str[source]#
dimcat.steps.loaders.m21.parse_TimeSignature(time_signature: TimeSignature) str[source]#
dimcat.steps.loaders.m21.parse_Volume(volume: Volume) float[source]#
dimcat.steps.loaders.m21.parse_m21_object(obj: ProtoM21Object, exclude_private: bool = True, exclude_default: bool = True, exclude_callables: bool = True, **higher_level_info) dict[source]#

dimcat.steps.loaders.musescore module#

class dimcat.steps.loaders.musescore.MuseScoreLoader(basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False, ms: Optional[str] = None)[source]#

Bases: ScoreLoader

Wrapper around the ms3 MuseScore parsing library.

class Schema(*, only: Optional[Union[Sequence[str], AbstractSet[str]]] = None, exclude: Union[Sequence[str], AbstractSet[str]] = (), many: Optional[bool] = None, load_only: Union[Sequence[str], AbstractSet[str]] = (), dump_only: Union[Sequence[str], AbstractSet[str]] = (), partial: Optional[Union[bool, Sequence[str], AbstractSet[str]]] = None, unknown: Optional[Literal['exclude', 'include', 'raise']] = None)[source]#

Bases: Schema

dump_fields: dict[str, Field]#
exclude: set[Any] | MutableSet[Any]#
fields: dict[str, Field]#

Dictionary mapping field_names -> Field objects

load_fields: dict[str, Field]#
opts: Any = <marshmallow.schema.SchemaOpts object>#
unknown: types.UnknownOption#
check_resource(resource: str | pathlib.Path) None[source]#

Checks whether the resource at the given path exists.

classmethod from_ms3(directory: str, package_name: Optional[str] = None, as_corpus: bool = False, only_metadata_pieces: bool = True, include_convertible: bool = False, include_tsv: bool = True, exclude_review: bool = True, file_re: Optional[Union[Pattern, str]] = None, folder_re: Optional[Union[Pattern, str]] = None, exclude_re: Optional[Union[Pattern, str]] = None, paths: Optional[Collection[str]] = None, choose: Literal['auto', 'all', 'ask'] = 'auto', labels_cfg={}, ms=None, logger_cfg: Optional[dict] = None, basepath: Optional[str] = None, loader_name: Optional[str] = None, overwrite: bool = False, auto_validate: bool = True)[source]#

dimcat.steps.loaders.utils module#

class dimcat.steps.loaders.utils.PathFactory(directory: str, extensions: Optional[Union[str, Iterable[str]]] = None, file_re: Optional[str] = None, folder_re: Optional[str] = None, exclude_re: str = '^(\\.|_)', recursive: bool = True, progress: bool = False, exclude_files_only: bool = False)[source]#

Bases: Iterable[str]

dimcat.steps.loaders.utils.get_m21_input_extensions() Tuple[str, ...][source]#
dimcat.steps.loaders.utils.make_datapackage_descriptor(facet_df_pairs: Iterable[Tuple[str, DataFrame]], package_name: str) dict[source]#
dimcat.steps.loaders.utils.store_datapackage(facet_df_pairs: Iterable[Tuple[str, DataFrame]], name: str, directory: str, overwrite: bool = True) str[source]#
dimcat.steps.loaders.utils.store_facets_as_zip(facet_df_pairs: Iterable[Tuple[str, DataFrame]], zip_path: str, overwrite: bool = True)[source]#

Stores the dataframes as <name>.tsv within the given ZIP file.

Module contents#