import sys

# Silence all warnings for this notebook, but only when no warning options
# were supplied to the interpreter (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")
import os

import frictionless as fl
from dimcat.base import deserialize_json_file

# Location of the test corpus, two directory levels above the current working
# directory; the assert stops the notebook early if it has not been checked out.
CORPUS_PATH = os.path.abspath(os.path.join("..", "..", "unittest_metacorpus"))
assert os.path.isdir(CORPUS_PATH)
# Sub-corpus used for all examples below.
sweelinck_dir = os.path.join(CORPUS_PATH, "sweelinck_keyboard")
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 7
5 import os
6 import frictionless as fl
----> 7 from dimcat.base import deserialize_json_file
8 CORPUS_PATH = os.path.abspath(os.path.join("..", "..", "unittest_metacorpus"))
9 assert os.path.isdir(CORPUS_PATH)
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/__init__.py:28
13 # modules of dimcat.data are not allowed to import from dimcat.steps, so when they do, they use get_class() which
14 # requires that the respective step was already "seen" and is part of the registry. Hence, although the main purpose
15 # of the imports here is syntactic sugar, some are required.
16 from .base import (
17 DimcatConfig,
18 change_setting,
(...)
26 reset_settings,
27 )
---> 28 from .data import catalogs, datasets, packages, resources
29 from .data.datasets.base import Dataset
30 from .data.resources import PieceIndex
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/__init__.py:3
1 import logging
----> 3 from .resources import features
5 module_logger = logging.getLogger(__name__)
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/__init__.py:5
3 from .base import FeatureName, PathResource, Resource, ResourceStatus
4 from .dc import DimcatIndex, DimcatResource, Feature, PieceIndex
----> 5 from .features import (
6 Annotations,
7 HarmonyLabels,
8 KeyAnnotations,
9 Metadata,
10 Notes,
11 PhraseAnnotations,
12 PhraseComponents,
13 PhraseLabels,
14 )
15 from .results import (
16 CadenceCounts,
17 Counts,
(...)
24 Transitions,
25 )
27 module_logger = logging.getLogger(__name__)
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/features.py:23
14 from dimcat.data.resources.base import D, FeatureName, S
15 from dimcat.data.resources.dc import (
16 HARMONY_FEATURE_NAMES,
17 DimcatIndex,
(...)
21 UnitOfAnalysis,
22 )
---> 23 from dimcat.data.resources.results import PhraseData, PhraseDataFormat
24 from dimcat.data.resources.utils import (
25 get_corpus_display_name,
26 join_df_on_index,
27 merge_ties,
28 safe_row_tuple,
29 )
30 from dimcat.dc_exceptions import (
31 DataframeIsMissingExpectedColumnsError,
32 FeatureIsMissingFormatColumnError,
33 ResourceIsMissingPieceIndexError,
34 )
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/results.py:41
32 from dimcat.base import (
33 DimcatObjectField,
34 FriendlyEnum,
(...)
38 get_setting,
39 )
40 from dimcat.dc_exceptions import UnknownFormat
---> 41 from dimcat.plotting import (
42 CADENCE_COLORS,
43 GroupMode,
44 make_bar_plot,
45 make_bubble_plot,
46 make_heatmap,
47 make_lof_bar_plot,
48 make_lof_bubble_plot,
49 make_pie_chart,
50 update_figure_layout,
51 update_plot_grouping_settings,
52 write_image,
53 )
54 from dimcat.utils import SortOrder
55 from plotly import graph_objs as go
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/plotting.py:16
14 from plotly import express as px
15 from plotly import graph_objects as go
---> 16 from plotly.validators.heatmap import ColorscaleValidator
18 AVAILABLE_FIGURE_FORMATS: Tuple[str, ...] = PlotlyScope._all_formats
19 """Possible formats for saving Plotly figures, defined as extensions without leading dot."""
ModuleNotFoundError: No module named 'plotly.validators.heatmap'
Data#
Resource#
A resource is a combination of a file and its descriptor. It allows for interacting with the file without having to “touch” it, by interacting with its descriptor only. The descriptor comes in the form of a dictionary and is typically stored next to the file in JSON or YAML format.
DiMCAT follows the Frictionless specification for describing resources. There are two types of resources:
PathResource: Stands for a resource on local disk or on the web.
A PathResource can be instantiated from a single filepath using one of the constructors
.from_resource_path(), which takes the path to the resource file to be described
.from_descriptor_path(), which takes a filepath pointing to a JSON or YAML file containing a resource descriptor
Let’s exemplify this by looking at the PathResource first.
PathResource#
The sweelinck_keyboard repository contains a single MuseScore file (in the folder “MS3”) and several TSV files extracted from it.
Let’s load it:
from dimcat import resources
score_resource = os.path.join(sweelinck_dir, "MS3", "SwWV258_fantasia_cromatica.mscx")
score_resource = resources.PathResource.from_resource_path(score_resource)
score_resource.get_path_dict()
The dictionary returned by .get_path_dict() tells us everything we need to know to handle the resource physically:
basepath is an absolute directory
filepath is the filepath (which can include subfolders), relative to the basepath
normpath is the full path to the resource and defined as basepath/filepath (both need to be specified)
innerpath: when normpath points to a .zip file, innerpath is the relative filepath of the resource within the ZIP archive
descriptor_filename stores the name of a descriptor when it deviates from the default <resource_name>.resource.json. Cannot include subfolders since it is expected to be stored in basepath (otherwise, the relative filepath stored in the descriptor would resolve incorrectly)
descriptor_path: defined by basepath/descriptor_filename
Here, the descriptor_path corresponds to the default, which does not currently point to an existing file:
score_resource.descriptor_exists
It can be created using .store_descriptor():
score_descriptor_path = score_resource.store_descriptor()
score_resource.descriptor_exists
To underline the functionality of the path resource, even the new descriptor can be treated as a resource:
resources.PathResource.from_resource_path(score_descriptor_path)
Which is different from creating the original PathResource from the created descriptor:
resources.PathResource.from_descriptor_path(score_descriptor_path)
Note that the descriptor_filename is now set to keep track of the existing one the resource originates from.
By the way, the descriptors written to disk qualify as “normal” DimcatConfigs (see ???)…
deserialize_json_file(score_descriptor_path)
… and at the same time as valid Frictionless descriptors that can be validated using its commandline tool or Python library:
fl.validate(score_descriptor_path)
This is also what the property is_valid uses under the hood:
score_resource.is_valid
The status of a PathResource is always and unchangeably PATH_ONLY, with a value one above EMPTY:
score_resource.status
The path components cannot be modified because it would invalidate the relations with other path components:
base_path_level_up = os.path.dirname(score_resource.basepath)
score_resource.basepath = base_path_level_up
DimcatResource#
A DimcatResource is both a Resource in the above sense and a wrapped dataframe. Let’s create one from a TSV resource descriptor:
notes_descriptor_path = os.path.join(sweelinck_dir, "notes", "SwWV258_fantasia_cromatica.notes.resource.json")
notes_resource = resources.DimcatResource.from_descriptor_path(notes_descriptor_path)
notes_resource
As the output shows, the status of the resource is STANDALONE_NOT_LOADED.
The resource is considered standalone, as opposed to packaged, because it has its own resource descriptor file.
And it is considered “not loaded” because the actual tabular data has not been loaded from the described TSV file into memory.
The latter is achieved through the property df (short for dataframe):
notes_resource.df
… which changes the status to STANDALONE_LOADED:
notes_resource.status
type(notes_resource)
Package#
A package, or DataPackage, is a collection of resources. Analogously there are two main types:
PathPackage for collecting PathResources, and
DimcatPackage for collecting DimcatResources.
Just like resources, packages have a basepath and may be stored as a frictionless package descriptor.
For starters, let’s assemble a package from scratch:
from dimcat import packages
path_package = packages.PathPackage(package_name="scratch")
path_package
The fields are mostly familiar from above:
basepath: Absolute path on disk where the descriptor and the ZIP file would be stored.
resources: Currently an empty list. Typically, all resources need to have the same basepath (if not, the package is ‘misaligned’).
name: As per the Frictionless specification, every package needs a name. In DiMCAT, the relevant property is called package_name.
descriptor_filename: The name of the descriptor file if it deviates from the default <package_name>.datapackage.json.
auto_validate: If True, the package is automatically validated after it is stored to disk.
Now let’s add the path resource we have created above:
path_package.add_resource(score_resource)
path_package
path_package.store_descriptor()
We can also create a package directly from a resource:
dimcat_package = packages.DimcatPackage.from_resources([notes_resource], package_name="pack")
dimcat_package
score_resource.is_serialized
score_resource.status
score_resource.to_dict()
score_resource.to_dict(pickle=True)
score_resource.to_config().create()
notes_descriptor_path = os.path.join(sweelinck_dir, "notes", "SwWV258_fantasia_cromatica.notes.resource.json")
notes_path_resource = resources.Resource.from_descriptor_path(notes_descriptor_path)
notes_path_resource = resources.PathResource.from_descriptor_path(notes_descriptor_path)
notes_path_resource
notes_resource = resources.Resource.from_descriptor_path(notes_descriptor_path)
notes_resource