
Commit 9471709

Merge pull request #4 from rajammanabrolu/classifier
Adding the ability to train classifiers.
2 parents a8db3c7 + b2373f2 commit 9471709

File tree

12 files changed (+416, -24 lines)

compose_rl/data/preference_data.py

Lines changed: 5 additions & 3 deletions
@@ -300,13 +300,15 @@ def __getitem__(self, idx: int) -> dict[str, Any]:
             idx (int): the index where we fetch the data in the StreamingDataset.
         """
         sample = super().__getitem__(idx)
-        text = self._read_binary_tokenized_sample(sample, 'text')
-        label = self._read_binary_tokenized_sample(sample, 'label')
+        text = self._read_binary_tokenized_sample(sample, 'input')
+        label = torch.from_numpy(np.frombuffer(sample['label'], dtype=np.uint8))
+        # This needs to be a float tensor for BCE
+        label = label.to(torch.float32)
 
         text_len = len(text)
 
         return {
             'text': text,
-            'label': label,
+            'labels': label,
             'text_len': torch.Tensor([text_len]).to(torch.int64),
         }
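The key change here is that classifier labels are stored as raw bytes rather than as a tokenized field, and must come back as float tensors for the BCE loss. A minimal sketch of that round trip (the sample dict is fabricated; the .copy() is added only to avoid the read-only-buffer warning torch emits on np.frombuffer output):

import numpy as np
import torch

# Fabricated sample: the label was serialized with np.asarray(...).tobytes()
# at dataset-creation time (see unified_tokenize_dataset.py below).
sample = {'label': np.asarray([1], dtype=np.uint8).tobytes()}

# Decode the bytes back into a tensor, as the updated __getitem__ does.
label = torch.from_numpy(np.frombuffer(sample['label'], dtype=np.uint8).copy())

# BCE-with-logits expects float targets, hence the cast before batching.
label = label.to(torch.float32)
print(label)  # tensor([1.])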

compose_rl/metrics/reward_model_metrics.py

Lines changed: 61 additions & 0 deletions
@@ -40,3 +40,64 @@ def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
         return self.correct / self.total
+
+
+class BinaryRewardClassificationAccuracy(Metric):
+    """Binary classification accuracy metric.
+
+    Computes the accuracy of a classifier by thresholding sigmoid probabilities
+    derived from the logits and comparing them against ground-truth labels.
+    Multi-class support is still a TODO.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        dist_sync_on_step: bool = False,
+        **kwargs: Any,
+    ):
+        """Initialize the metric.
+
+        Args:
+            threshold: Decision threshold for binary classification.
+            dist_sync_on_step: Synchronize metric state across processes.
+        """
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.threshold = threshold
+
+        self.add_state(
+            'correct',
+            default=torch.tensor(0.),
+            dist_reduce_fx='sum',
+        )
+        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor):
+        """Update state with predictions and targets.
+
+        Args:
+            batch: Dictionary containing 'output_scores' and 'labels'.
+            output_logits: Unused; present for interface compatibility.
+        """
+        del output_logits
+        logits = batch['output_scores']
+        targets = batch['labels'].squeeze(-1)
+        assert logits.shape[0] == targets.shape[0], 'Batch sizes must match'
+
+        # TODO (raj): Handle multi-class classification with logging
+        probs = torch.sigmoid(logits.squeeze())
+        predictions = (probs > self.threshold).long()
+
+        self.correct += (predictions == targets).sum().detach().cpu()
+        self.total += targets.shape[0]
+
+    def compute(self):
+        """Compute the accuracy."""
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct / self.total
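The metric is normally driven by Composer during evaluation, but it can be exercised directly. A minimal sketch with fabricated tensors (the 'output_scores'/'labels' batch keys and the (batch, 1) shapes follow classifier_forward in model_methods.py below):

import torch
from compose_rl.metrics.reward_model_metrics import BinaryRewardClassificationAccuracy

metric = BinaryRewardClassificationAccuracy(threshold=0.5)

# Fabricated eval batch: raw scores (logits) and float labels, both of shape (batch, 1).
batch = {
    'output_scores': torch.tensor([[2.0], [-1.5], [-0.3]]),
    'labels': torch.tensor([[1.0], [0.0], [0.0]]),
}

metric.update(batch, output_logits=None)  # output_logits is unused and deleted
print(metric.compute())  # tensor(1.) -- all three thresholded predictions match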

compose_rl/reward_learning/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@
 )
 from compose_rl.reward_learning.inference_model import InferenceRewardModel
 from compose_rl.reward_learning.model import (
+    ComposerHFClassifierRewardModel,
     ComposerHFPairwiseRewardModel,
     ComposerMPTPairwiseRewardModel,
 )
@@ -32,6 +33,7 @@
     'RewardModel',
     'ComposerMPTPairwiseRewardModel',
     'ComposerHFPairwiseRewardModel',
+    'ComposerHFClassifierRewardModel',
     'InferenceRewardModel',
     'BadGenerationEndReward',
     'IncreasingNumbersReward',
compose_rl/reward_learning/model.py

Lines changed: 73 additions & 14 deletions
@@ -4,15 +4,18 @@
 """Reward Model Composer Implementation."""
 
 import logging
-from typing import Any, Mapping, MutableMapping, Optional, Union
+from typing import Any, Mapping, MutableMapping, Optional
 
 import torch
 from llmfoundry.models import ComposerMPTCausalLM
 
 from compose_rl.reward_learning.base_reward import RewardModel, Tokenizer
 from compose_rl.reward_learning.hf_utils import SequenceClassifierOutput
 from compose_rl.reward_learning.model_methods import (
+    ClassifierRewardEnum,
     PairwiseRewardEnum,
+    classifier_forward,
+    classifier_loss,
     pairwise_forward,
     pairwise_loss,
 )
@@ -62,24 +65,14 @@ def __init__(
             **kwargs,
         )
 
-    def forward(
-        self,
-        batch: MutableMapping,
-    ) -> Union[dict[str, torch.Tensor], torch.Tensor]:
+    def forward(self, batch: MutableMapping) -> dict[str, torch.Tensor]:
         is_inference = batch.get('is_inference', False)
         if is_inference:
-            scores = self.model(
+            return self.model(
                 input_ids=batch['input_ids'],
                 attention_mask=batch['attention_mask'],
                 return_lm_logits=self.return_lm_logits,
             ).scores
-            if self.min_threshold is not None and self.max_threshold is not None:
-                scores: torch.Tensor = torch.clamp(
-                    scores,
-                    min=self.min_threshold,
-                    max=self.max_threshold,
-                )
-            return scores
         else:
             return pairwise_forward(
                 model=self.model,
@@ -93,7 +86,7 @@ def eval_forward(
         self,
         batch: MutableMapping,
         outputs: Optional[SequenceClassifierOutput] = None,
-    ) -> Union[dict[str, torch.Tensor], torch.Tensor]:
+    ) -> dict[str, torch.Tensor]:
         return outputs if outputs is not None else self.forward(batch)
 
     def loss(self, outputs: SequenceClassifierOutput,
@@ -105,6 +98,72 @@ def loss(self, outputs: SequenceClassifierOutput,
         )
 
 
+class ComposerHFClassifierRewardModel(
+    ComposerHFSequenceClassification,
+    RewardModel,
+):
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        use_train_metrics: bool = True,
+        additional_train_metrics: Optional[list] = None,
+        additional_eval_metrics: Optional[list] = None,
+        loss_type: str = 'bce',
+        return_lm_logits: bool = False,
+        return_last: bool = True,
+        **kwargs: Any,
+    ):
+        self.loss_type = ClassifierRewardEnum(loss_type)
+        self.return_lm_logits = return_lm_logits
+        self.return_last = return_last
+
+        config_overrides = {
+            'return_logits': return_lm_logits,
+        }
+
+        if 'config_overrides' in kwargs:
+            config_overrides.update(kwargs.pop('config_overrides'))
+
+        self.min_threshold = kwargs.pop('min_threshold', None)
+        self.max_threshold = kwargs.pop('max_threshold', None)
+
+        super().__init__(
+            tokenizer=tokenizer,
+            use_train_metrics=use_train_metrics,
+            additional_train_metrics=additional_train_metrics,
+            additional_eval_metrics=additional_eval_metrics,
+            config_overrides=config_overrides,
+            **kwargs,
+        )
+
+    def forward(self, batch: MutableMapping) -> dict[str, torch.Tensor]:
+        ret_val = classifier_forward(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            batch=batch,
+            return_last=self.return_last,
+            return_lm_logits=self.return_lm_logits,
+        )
+
+        return ret_val
+
+    def eval_forward(
+        self,
+        batch: MutableMapping,
+        outputs: Optional[SequenceClassifierOutput] = None,
+    ) -> dict[str, torch.Tensor]:
+        return outputs if outputs is not None else self.forward(batch)
+
+    def loss(self, outputs: SequenceClassifierOutput,
+             batch: Mapping) -> dict[str, torch.Tensor]:
+        return classifier_loss(
+            outputs,
+            batch,
+            self.loss_type,
+        )
+
+
 class ComposerMPTPairwiseRewardModel(ComposerMPTCausalLM, RewardModel):
     """MPT model wrapper for Pairwise/BT reward model."""
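For orientation, here is a sketch of the batch this model's forward expects and the call chain through the new helpers. The tensors are fabricated; the key names come from preference_data.py and classifier_forward, while the collator that assembles 'text_attention_mask' is assumed rather than shown in this diff:

import torch

# Fabricated classifier batch (keys per preference_data.py / classifier_forward):
#   text:                (batch, seq_len) token ids
#   text_attention_mask: (batch, seq_len) mask, assumed to be built by the collator
#   text_len:            (batch, 1) true sequence lengths
#   labels:              (batch, 1) float targets for BCE
batch = {
    'text': torch.randint(0, 32_000, (2, 16)),
    'text_attention_mask': torch.ones(2, 16, dtype=torch.long),
    'text_len': torch.tensor([[16], [12]], dtype=torch.int64),
    'labels': torch.tensor([[1.0], [0.0]]),
}

# Hypothetical usage -- construction kwargs beyond those visible in this diff are omitted:
# model = ComposerHFClassifierRewardModel(tokenizer=tokenizer, loss_type='bce', ...)
# outputs = model(batch)                  # -> {'output_scores': (2, 1), 'labels': (2, 1)}
# loss_dict = model.loss(outputs, batch)  # -> {'total': BCE loss}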

compose_rl/reward_learning/model_methods.py

Lines changed: 69 additions & 0 deletions
@@ -34,6 +34,10 @@ class PairwiseRewardEnum(Enum):
     BELLMAN_EURUS = 'bellman_eurus'
 
 
+class ClassifierRewardEnum(Enum):
+    BCE = 'bce'
+
+
 def pairwise_forward(
     model: nn.Module,
     tokenizer: Tokenizer,
@@ -162,6 +166,40 @@ def pairwise_forward(
     return outputs
 
 
+def classifier_forward(
+    model: nn.Module,
+    tokenizer: Tokenizer,
+    batch: MutableMapping,
+    policy_model_config: Optional[PretrainedConfig] = None,
+    use_attention_sequence_id: bool = False,
+    return_last: bool = True,
+    return_lm_logits: bool = False,
+) -> dict[str, torch.Tensor]:
+
+    model_output = model(
+        batch['text'],
+        attention_mask=batch['text_attention_mask'],
+        return_lm_logits=return_lm_logits,
+    )
+
+    output_scores = model_output.scores
+    if return_last:
+        # Expected Shape: (Batch Size, 1)
+        output_scores = torch.gather(
+            output_scores,
+            dim=1,
+            index=batch['text_len'].view(-1, 1) - 1,
+        )
+
+    # We need to add the labels here to compute metrics
+    outputs: dict[str, torch.Tensor] = {
+        'output_scores': output_scores,
+        'labels': batch['labels'],
+    }
+
+    return outputs
+
+
 def pairwise_loss(
     outputs: SequenceClassifierOutput,
     batch: Mapping,
@@ -219,3 +257,34 @@ def pairwise_loss(
     loss_dict['total'] = losses
 
     return loss_dict
+
+
+def classifier_loss(
+    outputs: SequenceClassifierOutput,
+    batch: Mapping,
+    loss_type: ClassifierRewardEnum,
+) -> dict[str, torch.Tensor]:
+    """Computes the classifier loss.
+
+    Given precomputed values this will compute the specified classifier loss.
+
+    Args:
+        outputs (SequenceClassifierOutput): Outputs from forwarding the model over the batch.
+        batch (Mapping): Input batch of data.
+        loss_type (ClassifierRewardEnum): Loss type to compute (e.g. bce).
+    """
+    output_scores = outputs['output_scores']
+
+    if loss_type == ClassifierRewardEnum.BCE:
+        loss = F.binary_cross_entropy_with_logits(
+            output_scores,
+            batch['labels'],
+        )
+    else:
+        raise NotImplementedError(f'Loss type: {loss_type} is not supported.')
+
+    loss_dict = {
+        'total': loss,
+    }
+
+    return loss_dict
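To make the return_last indexing and the BCE loss concrete, here is a minimal sketch with fabricated tensors. It assumes model_output.scores is a (batch, seq_len) per-token score tensor, which is what the torch.gather call above implies:

import torch
import torch.nn.functional as F

# Fabricated per-token reward scores, shape (batch, seq_len).
per_token_scores = torch.tensor([
    [0.1, 0.4, 2.0, 0.0],    # true length 3 -> last real token at index 2
    [-0.2, -1.5, 0.0, 0.0],  # true length 2 -> last real token at index 1
])
text_len = torch.tensor([[3], [2]], dtype=torch.int64)

# return_last=True: take the score at the final non-padding position -> shape (batch, 1).
output_scores = torch.gather(per_token_scores, dim=1, index=text_len.view(-1, 1) - 1)
# tensor([[ 2.0000], [-1.5000]])

# classifier_loss with ClassifierRewardEnum.BCE: float labels of shape (batch, 1).
labels = torch.tensor([[1.0], [0.0]])
loss = F.binary_cross_entropy_with_logits(output_scores, labels)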

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@ mpt_dpo_lm = "compose_rl.dpo:ComposerDPOLM"
 hf_dpo_lm = "compose_rl.dpo:ComposerHFDPOLM"
 mpt_pairwise_rm = "compose_rl.reward_learning:ComposerMPTPairwiseRewardModel"
 hf_pairwise_rm = "compose_rl.reward_learning:ComposerHFPairwiseRewardModel"
+hf_classifier_rm = "compose_rl.reward_learning:ComposerHFClassifierRewardModel"
 mpt_ppo_lm = "compose_rl.ppo:ComposerMosaicPolicy"
 hf_ppo_lm = "compose_rl.ppo:ComposerHFPolicyModel"
 
@@ -52,6 +53,7 @@ ppo = "compose_rl.ppo:PPOCallback"
 
 [project.entry-points."llmfoundry_metrics"]
 pairwise_rm_accuracy = "compose_rl.metrics.reward_model_metrics:PairwiseRewardClassificationAccuracy"
+classifier_accuracy = "compose_rl.metrics.reward_model_metrics:BinaryRewardClassificationAccuracy"
 
 # iSort
 [tool.isort]
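These entry points are how an llm-foundry-based training config can refer to the new model (hf_classifier_rm) and metric (classifier_accuracy) by name. A minimal sketch of inspecting the registered metrics at runtime, using only the standard library; it assumes Python 3.10+ and that compose-rl is installed:

from importlib.metadata import entry_points

# The group name matches the [project.entry-points."llmfoundry_metrics"] table above.
for ep in entry_points(group='llmfoundry_metrics'):
    print(ep.name, '->', ep.value)
# e.g. classifier_accuracy -> compose_rl.metrics.reward_model_metrics:BinaryRewardClassificationAccuracy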

scripts/data/unified_tokenize_dataset.py

Lines changed: 29 additions & 1 deletion
@@ -56,6 +56,8 @@ def __iter__(self) -> Iterator[dict[str, bytes]]:
                 result = self._process_single_prompt_sample(sample)
                 if result is not None:
                     yield result
+            elif self.dataset_type == 'classifier':
+                yield self._process_classifier_sample(sample)
 
     def _process_preference_sample(self, sample: Any):
         """Process a preference sample.
@@ -104,6 +106,28 @@ def _process_single_prompt_sample(self, sample: Any):
 
         return {'prompt': np.asarray(encoded_prompt).tobytes()}
 
+    def _process_classifier_sample(self, sample: Any):
+        """Process a dummy classifier sample.
+
+        Args:
+            sample (Any): a sample from the dataset
+        """
+        messages = [{
+            'role': 'user',
+            'content': 'This is a test',
+        }]
+        encoded_prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+        )
+
+        label = np.random.randint(0, 2, size=(1,))
+
+        return {
+            'input': np.asarray(encoded_prompt).tobytes(),
+            'label': np.asarray(label).tobytes(),
+        }
+
 
 def main(
     dataset_name: str,
@@ -123,6 +147,10 @@ def main(
         'single_prompt': {
            'prompt': 'bytes',
         },
+        'classifier': {
+            'input': 'bytes',
+            'label': 'bytes',
+        },
     }[dataset_type]
 
     tokenizer = AutoTokenizer.from_pretrained(
@@ -185,7 +213,7 @@
     parser.add_argument(
         '--dataset_type',
         type=str,
-        choices=['preference', 'single_prompt'],
+        choices=['preference', 'single_prompt', 'classifier'],
        required=True,
         help='Type of dataset to process',
     )
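A short sketch of the serialize/deserialize round trip for these classifier samples: the writer side mirrors _process_classifier_sample, the reader side mirrors the updated __getitem__ in preference_data.py. The explicit uint8 cast is an addition here so the stored bytes match the dtype=np.uint8 read; the dummy processor above serializes whatever integer dtype np.random.randint returns by default:

import numpy as np
import torch

# Writer side: fabricated token ids plus a 0/1 label, both stored as raw bytes.
encoded_prompt = [101, 2023, 2003, 1037, 3231, 102]
label = np.random.randint(0, 2, size=(1,)).astype(np.uint8)  # cast added for consistency
sample = {
    'input': np.asarray(encoded_prompt).tobytes(),
    'label': label.tobytes(),
}

# Reader side: recover the label as a float tensor, ready for BCE.
decoded = torch.from_numpy(np.frombuffer(sample['label'], dtype=np.uint8).copy())
decoded = decoded.to(torch.float32)
print(decoded.shape)  # torch.Size([1])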
