"""Functions for formatting morphology feature names in a consistent way.
Formats morphology feature names and saves features as parquet files.
"""
from __future__ import annotations
import dataclasses
import pathlib
import pandas
import pandera.pandas as pa
from beartype import beartype
# Names of the columns that hold the individual components of a feature name.
# _coerce_feature_name_components iterates this tuple and normalizes each of
# these columns that is present in the DataFrame it receives.
FEATURE_NAME_COMPONENT_COLUMNS = (
"compartment",
"channel",
"feature_type",
"measurement",
)
def _coerce_dataframe_column_names_to_strings(
    dataframe: pandas.DataFrame,
) -> pandas.DataFrame:
    """
    Return a copy of a DataFrame whose column labels are all strings.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The DataFrame whose column labels should be converted. It is not
        modified; the relabeling is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which every column label has been passed
        through ``str``.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    relabeled = dataframe.copy()
    relabeled.columns = list(map(str, dataframe.columns))
    return relabeled
[docs]
@beartype
def remove_underscores_from_string(string: object) -> str:
    """
    Replace delimiter characters in a value's text with hyphens.

    Underscores, periods, spaces and forward slashes are each replaced by a
    single hyphen. Values that are not already strings are converted with
    ``str`` first.

    Parameters
    ----------
    string : object
        The value to clean. Anything that is not a ``str`` is converted with
        ``str`` before the delimiters are replaced.

    Returns
    -------
    str
        The text with every ``_``, ``.``, space and ``/`` replaced by ``-``.

    Raises
    ------
    ValueError
        If converting a non-string value with ``str`` raises an exception.
    """
    if isinstance(string, str):
        text = string
    else:
        try:
            text = str(string)
        except Exception as e:
            msg = (
                f"Input string must be a string or convertible to a string. "
                f"Received input: {string!r} of type {type(string)}"
            )
            raise ValueError(msg) from e

    # One translation pass maps each delimiter character to a hyphen.
    hyphen_for_delimiter = str.maketrans("_. /", "----")
    return text.translate(hyphen_for_delimiter)
def _coerce_feature_name_components(
    dataframe: pandas.DataFrame,
) -> pandas.DataFrame:
    """
    Apply the shared delimiter cleanup to the feature-name component columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The DataFrame containing feature name components. Only columns named
        in ``FEATURE_NAME_COMPONENT_COLUMNS`` are touched; any of those that
        are absent are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which every value of each present
        component column has been passed through
        ``remove_underscores_from_string``.

    Notes
    -----
    ``remove_underscores_from_string`` converts non-string values with
    ``str`` before cleaning them, so every value in the affected columns
    comes back as a string.
    """
    normalized = dataframe.copy()
    component_columns = [
        name for name in FEATURE_NAME_COMPONENT_COLUMNS if name in normalized.columns
    ]
    for name in component_columns:
        normalized[name] = normalized[name].map(remove_underscores_from_string)
    return normalized
# ============================================================================
# Constants
# ============================================================================
# FEATURE_NAME_COMPONENT_COLUMNS is assigned once, directly after the imports
# at the top of this module. The identical second assignment that used to sit
# here was redundant and has been dropped.

# Schema applied to feature tables before they are written. It declares no
# columns and is non-strict, so its only effect on a frame is the parser that
# converts the column labels to strings.
FEATURE_OUTPUT_SCHEMA = pa.DataFrameSchema(
    columns={},
    strict=False,
    parsers=[pa.Parser(_coerce_dataframe_column_names_to_strings)],
)

# Schema for a table of feature-name components. The columns are derived from
# FEATURE_NAME_COMPONENT_COLUMNS, the same tuple the parser
# _coerce_feature_name_components iterates, so the schema and the parser
# cannot drift apart. Every component column is coerced to object dtype and
# declared non-nullable.
FEATURE_NAME_COMPONENT_SCHEMA = pa.DataFrameSchema(
    columns={
        column_name: pa.Column(object, nullable=False, coerce=True)
        for column_name in FEATURE_NAME_COMPONENT_COLUMNS
    },
    strict=True,
    parsers=[pa.Parser(_coerce_feature_name_components)],
)
[docs]
@beartype
def save_features_as_parquet(
    parent_path: pathlib.Path,
    df: pandas.DataFrame,
    metadata: FeatureMetadata,
) -> pathlib.Path:
    """Write a feature table to a consistently named parquet file.

    The frame is passed through ``FEATURE_OUTPUT_SCHEMA.validate`` and the
    validated result is what gets written, without its index. The file name
    is the prefix returned by ``format_morphology_feature_name`` for the
    metadata fields, followed by ``_features.parquet``.

    Parameters
    ----------
    parent_path : pathlib.Path
        The parent path under which the parquet file is written.
    df : pandas.DataFrame
        The dataframe containing the features to save.
    metadata : FeatureMetadata
        Metadata for the feature output. Its ``compartment``, ``channel``,
        ``feature_type`` and ``cpu_or_gpu`` attributes, in that order, are
        used to build the file name.

    Returns
    -------
    pathlib.Path
        The path of the parquet file that was written.
    """
    features_to_write = FEATURE_OUTPUT_SCHEMA.validate(df)

    name_fields = (
        metadata.compartment,
        metadata.channel,
        metadata.feature_type,
        metadata.cpu_or_gpu,
    )
    file_prefix = format_morphology_feature_name(*name_fields)
    file_name = f"{file_prefix}_features.parquet"
    destination = parent_path / file_name

    features_to_write.to_parquet(destination, index=False)
    return destination