Source code for rexmex.scorecard

from typing import Collection, List, Mapping, Optional, Tuple

import numpy as np
import pandas as pd

from rexmex.utils import Metric

__all__ = [
    "ScoreCard",
    "CoverageScoreCard",
]


class ScoreCard:
    """
    A scorecard can be used to aggregate metrics, plot those, and generate performance reports.
    """

    metric_set: Mapping[str, Metric]

    def __init__(self, metric_set: Mapping[str, Metric]):
        self.metric_set = metric_set

    def get_performance_metrics(self, y_true: np.array, y_score: np.array) -> pd.DataFrame:
        """
        A method to get the performance metrics for a pair of vectors.

        Args:
            y_true (np.array): A vector of ground truth values.
            y_score (np.array): A vector of model predictions.
        Returns:
            performance_metrics (pd.DataFrame): The performance metrics calculated from the vectors.
        """
        performance_metrics = {name: [metric(y_true, y_score)] for name, metric in self.metric_set.items()}
        performance_metrics_df = pd.DataFrame.from_dict(performance_metrics)
        return performance_metrics_df

    def generate_report(self, scores_to_evaluate: pd.DataFrame, grouping: Optional[List[str]] = None) -> pd.DataFrame:
        """
        A method to calculate (aggregated) performance metrics based on a dataframe of ground truth values and
        predictions. It assumes that the dataframe has the `y_true` and `y_score` keys.

        Args:
            scores_to_evaluate (pd.DataFrame): A dataframe with the scores and ground truth -
                it has the `y_true` and `y_score` keys.
            grouping (list): A list of performance grouping variable names.
        Returns:
            report (pd.DataFrame): The performance report.
        """
        if grouping is not None:
            scores_to_evaluate = scores_to_evaluate.groupby(grouping)
            report = scores_to_evaluate.apply(lambda group: self.get_performance_metrics(group.y_true, group.y_score))
        else:
            report = self.get_performance_metrics(scores_to_evaluate.y_true, scores_to_evaluate.y_score)
        return report

    def filter_scores(
        self,
        scores: pd.DataFrame,
        training_set: pd.DataFrame,
        testing_set: pd.DataFrame,
        validation_set: pd.DataFrame,
        columns: List[str],
    ) -> pd.DataFrame:
        """
        A method to filter out those entries which also appear in either the training, testing or validation sets.
        The original approach is described `here <https://papers.nips.cc/paper/2013/file/1cecc7a77928ca8133fa24680a88d2f9-Paper.pdf>`_.

        Args:
            scores (pd.DataFrame): A dataframe with the scores.
            training_set (pd.DataFrame): A dataframe of training data points.
            testing_set (pd.DataFrame): A dataframe of testing data points.
            validation_set (pd.DataFrame): A dataframe of validation data points.
            columns (list): A list of column names used for cross-referencing.
        Returns:
            scores (pd.DataFrame): The scores for data points which are not in the reference sets.
        """
        scores_columns = scores.columns.tolist()
        # Anti-join: keep only the rows of `scores` that do not match any in-sample example on `columns`.
        in_sample_examples = pd.concat([training_set, testing_set, validation_set])
        scores = scores.merge(in_sample_examples.drop_duplicates(), on=columns, how="left", indicator=True)
        scores = scores[scores["_merge"] == "left_only"].reset_index()[scores_columns]
        return scores

    def __repr__(self):
        """
        A representation of the ScoreCard object.
        """
        return f"ScoreCard(metric_set={self.metric_set!r})"

    def print_metrics(self):
        """
        Printing the names of the metrics in the metric set.
        """
        print({k for k in self.metric_set.keys()})
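

# Illustrative usage sketch, not part of the original module: it assumes that each value in
# `metric_set` is a plain callable accepting (y_true, y_score), which is the signature the
# ScoreCard methods above rely on, and that scikit-learn is available. The metric choices and
# the toy dataframe below are hypothetical and only demonstrate the expected call pattern.
def _example_score_card_usage() -> pd.DataFrame:
    from sklearn.metrics import average_precision_score, roc_auc_score

    # Map metric names to callables; ScoreCard invokes each one as metric(y_true, y_score).
    score_card = ScoreCard(metric_set={"roc_auc": roc_auc_score, "pr_auc": average_precision_score})

    # A toy dataframe with the required `y_true` and `y_score` columns plus a grouping column.
    scores = pd.DataFrame(
        {
            "model": ["a", "a", "a", "b", "b", "b"],
            "y_true": [1, 0, 1, 0, 1, 0],
            "y_score": [0.9, 0.2, 0.7, 0.6, 0.4, 0.1],
        }
    )

    # One row of metrics per model; omit `grouping` to get a single aggregate row instead.
    return score_card.generate_report(scores, grouping=["model"])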


class CoverageScoreCard(ScoreCard):
    """
    A coverage scorecard can be used to aggregate coverage-related metrics, plot those, and generate
    performance reports.
    """

    def __init__(self, metric_set: Mapping[str, Metric], all_users: Collection[str], all_items: Collection[str]):
        """
        `all_users` and `all_items` define the relevant user and item space.
        """
        super().__init__(metric_set)
        self.all_users = all_users
        self.all_items = all_items

    def get_coverage_metrics(self, recommendations: List[Tuple]) -> pd.DataFrame:
        """
        Gets all coverage (performance) values using the defined metric_set.
        It expects a list of tuples of user/item combinations, e.g., [(user_1, item_1), (user_2, item_1)].
        The space of possible users and items to recommend is defined during initialisation of this class.

        Args:
            recommendations (List[Tuple]): Recommendations of items to users, made by the evaluated system.
                The user has to decide which score or confidence levels to use prior to calling this ScoreCard.
        Returns:
            performance_metrics (pd.DataFrame): The coverage (performance) metrics calculated from the recommendations.
        """
        performance_metrics = {
            name: [metric((self.all_users, self.all_items), recommendations)]
            for name, metric in self.metric_set.items()
        }
        performance_metrics_df = pd.DataFrame.from_dict(performance_metrics)
        return performance_metrics_df

    def generate_report(self, recs_to_evaluate: pd.DataFrame, grouping: Optional[List[str]] = None) -> pd.DataFrame:
        """
        A method to calculate (aggregated) coverage/performance metrics based on a dataframe of predictions.
        It assumes that the dataframe has the `user` and `item` columns.

        Args:
            recs_to_evaluate (pd.DataFrame): A dataframe holding the recommendations (users, items).
                Contains the `user` and `item` columns.
            grouping (list): A list of performance grouping variable names (e.g., different recommender settings).
        Returns:
            report (pd.DataFrame): The performance report.
        """
        if "user" not in recs_to_evaluate.columns or "item" not in recs_to_evaluate.columns:
            raise ValueError("recs_to_evaluate has to have user and item columns!")
        if grouping is not None:
            recs_to_evaluate = recs_to_evaluate.groupby(grouping)
            report = recs_to_evaluate.apply(
                lambda group: self.get_coverage_metrics(
                    list(group[["user", "item"]].itertuples(index=False, name=None))
                )
            )
        else:
            report = self.get_coverage_metrics(
                list(recs_to_evaluate[["user", "item"]].itertuples(index=False, name=None))
            )
        return report
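

# Illustrative usage sketch, not part of the original module: it assumes that each coverage metric
# in `metric_set` is a callable taking ((all_users, all_items), recommendations), which is how
# `get_coverage_metrics` above invokes it. The `_user_coverage` helper, the user/item catalogues,
# and the toy recommendations are hypothetical.
def _example_coverage_score_card_usage() -> pd.DataFrame:
    def _user_coverage(possible_users_items, recommendations):
        # Fraction of the user space that received at least one recommendation.
        possible_users, _ = possible_users_items
        covered_users = {user for user, _ in recommendations}
        return len(covered_users) / len(possible_users)

    coverage_card = CoverageScoreCard(
        metric_set={"user_coverage": _user_coverage},
        all_users=["u1", "u2", "u3", "u4"],
        all_items=["i1", "i2", "i3"],
    )

    # Recommendations are passed as a dataframe with `user` and `item` columns; the optional
    # grouping column lets generate_report produce one row of metrics per recommender setting.
    recommendations = pd.DataFrame(
        {
            "setting": ["top_5", "top_5", "top_10", "top_10", "top_10"],
            "user": ["u1", "u2", "u1", "u2", "u3"],
            "item": ["i1", "i1", "i2", "i3", "i1"],
        }
    )

    return coverage_card.generate_report(recommendations, grouping=["setting"])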