Source code for dimcat.data.packages.dc

from __future__ import annotations

import logging
from typing import Iterable, Optional

import frictionless as fl
import pandas as pd
from dimcat.base import get_setting
from dimcat.data.packages.base import Package, PackageMode
from dimcat.data.resources.base import D, Resource, SomeDataframe
from dimcat.data.resources.dc import DimcatResource, PieceIndex

# Logger named after this module's import path (``__name__``).
module_logger = logging.getLogger(__name__)


class DimcatPackage(Package):
    """A :class:`Package` that accepts :class:`DimcatResource` objects.

    On top of what :class:`Package` provides, it can create resources directly
    from dataframes and summarize which pieces its resources cover.
    """

    # Configuration read by the Package base class (semantics defined there).
    _accepted_resource_types = (DimcatResource,)
    _default_mode = PackageMode.RECONCILE_SAFELY
    _detects_extensions = get_setting("resource_descriptor_endings")

    @staticmethod
    def _argument_was_passed(value) -> bool:
        """Return True if ``value`` counts as an argument supplied by the caller.

        ``None`` never counts. Other values are judged by their truthiness,
        except for objects that refuse to be truth-tested (pandas dataframes
        raise a ValueError on ``bool(df)``): these count as passed as soon as
        they are not ``None``.
        """
        if value is None:
            return False
        try:
            return bool(value)
        except ValueError:
            # e.g. "The truth value of a DataFrame is ambiguous."
            return True

    def _verify_creationist_arguments(
        self,
        **kwargs,
    ):
        """Check the arguments from which a new resource is to be created.

        Args:
            **kwargs: The candidate arguments, e.g. ``df`` and ``resource``.

        Raises:
            ValueError:
                If none of the arguments was passed, or if both ``resource``
                and ``df`` were passed.
        """
        if not any(self._argument_was_passed(value) for value in kwargs.values()):
            raise ValueError("No arguments were passed to create a resource.")
        resource_passed = self._argument_was_passed(kwargs.get("resource"))
        df_passed = self._argument_was_passed(kwargs.get("df"))
        if resource_passed and df_passed:
            raise ValueError("Pass either a resource or a dataframe, not both.")

    def __init__(
        self,
        package_name: str,
        resources: Optional[Iterable[Resource]] = None,
        basepath: Optional[str] = None,
        descriptor_filename: Optional[str] = None,
        auto_validate: bool = False,
        metadata: Optional[dict] = None,
    ) -> None:
        """

        Args:
            package_name: Name of the package that can be used to retrieve it.
            resources: An iterable of :class:`Resource` objects to add to the package.
            basepath:
                The absolute path on the local file system where the package descriptor and all
                contained resources are stored. The filepaths of all included
                :class:`DimcatResource` objects need to be relative to the basepath and DiMCAT
                does its best to ensure this.
            descriptor_filename:
                Pass a JSON or YAML filename or relative filepath to override the default
                (``<package_name>.json``). Following frictionless specs it should end on
                ".datapackage.[json|yaml]".
            auto_validate:
                Set to True to have the package validated every time a resource is added.
                Defaults to False, i.e. no automatic validation.
            metadata: Custom metadata to be maintained in the package descriptor.
        """
        super().__init__(
            package_name=package_name,
            resources=resources,
            basepath=basepath,
            descriptor_filename=descriptor_filename,
            auto_validate=auto_validate,
            metadata=metadata,
        )

    def create_and_add_resource(
        self,
        df: Optional[D] = None,
        resource: Optional[Resource | fl.Resource | str] = None,
        resource_name: Optional[str] = None,
        basepath: Optional[str] = None,
        auto_validate: bool = False,
    ) -> None:
        """Adds a resource to the package. Parameters are passed to :class:`DimcatResource`.

        Args:
            df:
                A dataframe to be turned into a new :class:`DimcatResource` via
                :meth:`DimcatResource.from_dataframe`. Mutually exclusive with ``resource``.
            resource:
                An existing resource (or resource specification) that is handed on to
                :meth:`Package.create_and_add_resource`. Mutually exclusive with ``df``.
            resource_name: Name for the new resource.
            basepath: Basepath for the new resource.
            auto_validate: Passed on to the resource creation.

        Raises:
            ValueError: If neither or both of ``df`` and ``resource`` are passed.
        """
        self._verify_creationist_arguments(df=df, resource=resource)
        if df is not None:
            # Dataframes are handled here; the parent class only deals with resources.
            new_resource = DimcatResource.from_dataframe(
                df=df,
                resource_name=resource_name,
                auto_validate=auto_validate,
                basepath=basepath,
            )
            self.add_resource(new_resource)
            return
        super().create_and_add_resource(
            resource=resource,
            resource_name=resource_name,
            basepath=basepath,
            auto_validate=auto_validate,
        )

    def get_boolean_resource_table(self) -> SomeDataframe:
        """Returns a table with this package's piece index and one boolean column per resource,
        indicating whether the resource is available for a given piece or not.

        Resources with an empty piece index do not get a column. If no resource contributes a
        column, an empty boolean dataframe indexed by an empty :class:`PieceIndex` is returned.
        """
        bool_masks = []
        for resource in self:
            piece_index = resource.get_piece_index()
            if len(piece_index) == 0:
                continue
            # One all-True column per resource; gaps appear only after concatenation.
            bool_masks.append(
                pd.Series(
                    True,
                    dtype="boolean",
                    index=piece_index.index,
                    name=resource.resource_name,
                )
            )
        if not bool_masks:
            return pd.DataFrame([], dtype="boolean", index=PieceIndex().index)
        # Pieces missing from a resource come out of the concat as <NA> and are set to False.
        table = pd.concat(bool_masks, axis=1).fillna(False).sort_index()
        table.index.names = ("corpus", "piece")
        table.columns.names = ("resource_name",)
        return table

    def get_piece_index(self) -> PieceIndex:
        """Returns the piece index corresponding to a sorted union of all included resources'
        indices."""
        piece_ids = set()
        for resource in self:
            piece_ids.update(resource.get_piece_index())
        return PieceIndex.from_tuples(sorted(piece_ids))