from importlib import resources
import numpy as np
import pandas as pd
import spacebench
from spacebench.log import LOGGER
[docs]class DataMaster:
"""
Class for managing the masterfile and collections metadata
Parameters
----------
masterfile: pd.DataFrame
A dataframe with metadata about available datasets.
collections: pd.DataFrame
A dataframe with information about the collections
where the datasets are generated from.
Examples
--------
>>> from spacebench.datamaster import DataMaster
>>> dm = DataMaster()
>>> print(dm)
Available datasets (total: 11):
<BLANKLINE>
healthd_dmgrcs_mortality_disc
cdcsvi_limteng_hburdic_cont
climate_relhum_wfsmoke_cont
climate_wfsmoke_minrty_disc
healthd_hhinco_mortality_cont
...
county_educatn_election_cont
county_phyactiv_lifexpcy_cont
county_dmgrcs_election_disc
cdcsvi_nohsdp_poverty_cont
cdcsvi_nohsdp_poverty_disc
"""
def __init__(self):
try:
with resources.open_text(spacebench, "masterfile.csv") as io:
self.master = pd.read_csv(io, index_col=0)
except FileNotFoundError:
LOGGER.error("Masterfile not found.")
raise FileNotFoundError(
(
"The masterfile.csv is not present in the "
"expected directory. Please ensure the "
"file is correctly placed."
)
)
[docs] def list_envs(
self, binary: bool | None = None, continuous: bool | None = None
) -> list[str]:
"""
Returns a list of names of available datasets.
Arguments
binary : bool, optional. If True, only binary datasets are returned.
continuous : bool, optional. If True, only continuous datasets are
returned.
Returns
list[str]: Names of all available datasets.
"""
master = self.master
index = np.zeros(master.shape[0], dtype=bool)
if binary is None and continuous is None:
return master.index.to_list()
if binary is not None:
index[master.treatment_type == "binary"] = True
if continuous is not None:
index[master.treatment_type == "continuous"] = True
return master.index[index].to_list()
def __getitem__(self, key: str) -> pd.Series:
"""
Retrieves the row corresponding to the provided dataset key from the
masterfile.
Parameters
----------
key : str
The identifier for the dataset.
Returns
-------
pd.Series or None
The corresponding dataset row if found, else None.
"""
try:
return self.master.loc[key]
except KeyError:
LOGGER.error(f"Dataset {key} not found in masterfile.")
return None
def __str__(self) -> str:
datasets = self.list_envs()
if len(datasets) > 10:
datasets_str = '\n '.join(datasets[:5] + ['...'] + datasets[-5:])
else:
datasets_str = '\n '.join(datasets)
return (f'Available datasets (total: '
f'{len(datasets)}):\n\n {datasets_str}')