Quick demo#
Import dimcat and load data#
import dimcat as dc
from dimcat.data import resources
from dimcat.steps import analyzers, extractors, groupers
package_path = "dcml_corpora.datapackage.json"
dataset = dc.Dataset.from_package(package_path)
dataset
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import dimcat as dc
2 from dimcat.data import resources
3 from dimcat.steps import analyzers, extractors, groupers
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/__init__.py:28
13 # modules of dimcat.data are not allowed to import from dimcat.steps, so when they do, they use get_class() which
14 # requires that the respective step was already "seen" and is part of the registry. Hence, although the main purpose
15 # of the imports here is syntactic sugar, some are required.
16 from .base import (
17 DimcatConfig,
18 change_setting,
(...)
26 reset_settings,
27 )
---> 28 from .data import catalogs, datasets, packages, resources
29 from .data.datasets.base import Dataset
30 from .data.resources import PieceIndex
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/__init__.py:3
1 import logging
----> 3 from .resources import features
5 module_logger = logging.getLogger(__name__)
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/__init__.py:5
3 from .base import FeatureName, PathResource, Resource, ResourceStatus
4 from .dc import DimcatIndex, DimcatResource, Feature, PieceIndex
----> 5 from .features import (
6 Annotations,
7 HarmonyLabels,
8 KeyAnnotations,
9 Metadata,
10 Notes,
11 PhraseAnnotations,
12 PhraseComponents,
13 PhraseLabels,
14 )
15 from .results import (
16 CadenceCounts,
17 Counts,
(...)
24 Transitions,
25 )
27 module_logger = logging.getLogger(__name__)
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/features.py:23
14 from dimcat.data.resources.base import D, FeatureName, S
15 from dimcat.data.resources.dc import (
16 HARMONY_FEATURE_NAMES,
17 DimcatIndex,
(...)
21 UnitOfAnalysis,
22 )
---> 23 from dimcat.data.resources.results import PhraseData, PhraseDataFormat
24 from dimcat.data.resources.utils import (
25 get_corpus_display_name,
26 join_df_on_index,
27 merge_ties,
28 safe_row_tuple,
29 )
30 from dimcat.dc_exceptions import (
31 DataframeIsMissingExpectedColumnsError,
32 FeatureIsMissingFormatColumnError,
33 ResourceIsMissingPieceIndexError,
34 )
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/data/resources/results.py:41
32 from dimcat.base import (
33 DimcatObjectField,
34 FriendlyEnum,
(...)
38 get_setting,
39 )
40 from dimcat.dc_exceptions import UnknownFormat
---> 41 from dimcat.plotting import (
42 CADENCE_COLORS,
43 GroupMode,
44 make_bar_plot,
45 make_bubble_plot,
46 make_heatmap,
47 make_lof_bar_plot,
48 make_lof_bubble_plot,
49 make_pie_chart,
50 update_figure_layout,
51 update_plot_grouping_settings,
52 write_image,
53 )
54 from dimcat.utils import SortOrder
55 from plotly import graph_objs as go
File ~/checkouts/readthedocs.org/user_builds/dimcat/envs/latest/lib/python3.10/site-packages/dimcat/plotting.py:16
14 from plotly import express as px
15 from plotly import graph_objects as go
---> 16 from plotly.validators.heatmap import ColorscaleValidator
18 AVAILABLE_FIGURE_FORMATS: Tuple[str, ...] = PlotlyScope._all_formats
19 """Possible formats for saving Plotly figures, defined as extensions without leading dot."""
ModuleNotFoundError: No module named 'plotly.validators.heatmap'
Show metadata#
dataset.get_metadata()
Counting notes#
Variant 1: Extract feature, apply Counter#
Here we pass the extracted notes to the counter.
notes = dataset.get_feature("notes")
result = analyzers.Counter().process(notes)
result.plot()
The FeatureExtractor is added to the dataset’s pipeline implicitly, but the Counter is not because it’s applied only to the extracted feature:
dataset
The pitch-class distributions shown by .plot() correspond to the current unit of analysis, which defaults to the piece-level.
Results also come with a second plotting method, .plot_grouped(). Since no groupers have been applied, the entire dataset is treated as a single group:
result.plot_grouped()
Variant 2: Imply feature extraction in the analyzer#
Here we pass the dataset to the counter.
counter = analyzers.Counter(features="notes")
analyzed_dataset = counter.process(dataset)
analyzed_dataset.get_result().plot()
Applying an Analyzer to a Dataset yields an AnalyzedDataset that includes one Result resource per analyzed Feature.
Both are to be found in the respective packages in the outputs catalog:
analyzed_dataset
Variant 3: Define a Pipeline with FeatureExtractor and Counter#
pipeline = dc.Pipeline([
extractors.FeatureExtractor("notes"),
analyzers.Counter()
])
analyzed_dataset = pipeline.process(dataset)
analyzed_dataset.get_result().plot()
Grouped note counts#
Let’s define a CustomPieceGrouper from random piece groups:
We create a PieceIndex, which is essentially a fancy list of piece ID tuples.
From this, we sample n_groups groups of n_members piece tuples each. A grouping is a mapping of group names to piece IDs.
Then, we set up a CustomPieceGrouper from the grouping. Inspecting it, we see that it stores a PieceIndex in which the first level corresponds to the three group names, group_1, group_2, and group_3. Whenever we apply this grouper, it will prepend this level to any processed Resource (provided it contains the grouped pieces). This changes the behaviour of the grouped resource, e.g. when plotting it.
n_groups = 3
n_members = 30
piece_index = resources.PieceIndex.from_resource(notes)
grouping = {f"group_{i}": piece_index.sample(n_members) for i in range(1, n_groups + 1)}
grouper = groupers.CustomPieceGrouper.from_grouping(grouping)
grouper
Applying the grouper to the analysis result#
grouped_result = grouper.process(result)
grouped_result
grouped_result.plot_grouped()
As promised, the grouped result plots differently: instead of showing pitch-class distributions for each of the grouped pieces
(which we can still obtain by calling .plot()), it shows the pitch-class distributions for each of the groups.
However, for closer inspection, the area of a circle is not ideal, so let’s view it as a bar plot:
grouped_result.make_bar_plot()
Step.process(Data) == Data.apply_step(Step)#
Above, we have applied Steps (an analyzer, a grouper, and a pipeline) to Data objects, namely to
resources (the Notes feature and the Counts result) and to a dataset containing these resources.
Another way to achieve the same goal is to call .apply_step() on the data object itself. Let’s start with a fresh dataset and
apply the grouper and the analyzer once more:
D = dc.Dataset.from_package(package_path)
analyzed_dataset = D.apply_step(grouper, counter)
analyzed_dataset
result = analyzed_dataset.get_result()
result
result.default_groupby
analyzed_dataset.get_result().make_bar_plot()
Assembling the Pipeline from DimcatConfig objects#
Serialization of any DimcatObject uses the DimcatConfig object. Each config needs to have at least the key dtype,
specifying the name of a DimcatObject. Any other keys need to correspond to init arguments of that object. Wrong keys
or invalid values are rejected.
Any DimcatObject can be expressed as a config by calling its .to_config() method:
config = counter.to_config()
config
Any config can be used to instantiate a DimcatObject:
counter_copy = config.create()
print(f"""The new object and the old object are
equal: {counter == counter_copy}
identical: {counter is counter_copy}""")
Wherever DiMCAT operates with configs, it also accepts dictionaries:
step_configs = [
dict(dtype="FeatureExtractor", features=[dict(dtype="Notes", format="FIFTHS")]),
dict(dtype='CustomPieceGrouper', grouped_units=grouping),
dict(dtype="Counter")
]
pl = dc.Pipeline.from_step_configs(step_configs)
pl
resulting_dataset = pl.process(dataset)
resulting_dataset.get_result().make_bar_plot()