opentensor · steffencruz · Dec 12, 2023 · Dec 12, 2023
diff --git a/ocr_subnet/validator/reward.py b/ocr_subnet/validator/reward.py
@@ -40,22 +40,9 @@ def loss(label: dict, pred: dict, alpha_p=1.0, alpha_f=1.0, alpha_t=1.0):
     Returns:
     - float: The score for the section. Bounded between 0 and 1.
     """
-    # position loss is IOU of the bounding boxes
-    if pred.get('position'):
-        position_loss = get_iou(label['position'], pred['position'])
-    else:
-        # otherwise set to max loss
-        position_loss = 1.0
-
-    if pred.get('font'):
-        font_loss = get_font_distance(label['font'], pred['font'])
-    else:
-        font_loss = 1.0
-
-    if pred.get('text'):
-        text_loss = get_edit_distance(label['text'], pred['text'])
-    else:
-        text_loss = 1.0
+    position_loss = get_iou(label['position'], pred.get('position'))
+    font_loss = get_font_distance(label['font'], pred.get('font'))
+    text_loss = get_edit_distance(label['text'], pred.get('text'))
 
     total_loss = (alpha_p * position_loss + alpha_f * font_loss + alpha_t * text_loss) / (alpha_p + alpha_f + alpha_t)
 

diff --git a/ocr_subnet/validator/utils.py b/ocr_subnet/validator/utils.py
@@ -1,7 +1,7 @@
 import editdistance
 
 
-def get_iou(bb1, bb2):
+def get_iou(bb1, bb2 = None):
     """
     Calculate the Intersection over Union (IoU) of two bounding boxes.
     NOTE: Thanks to this guy! https://stackoverflow.com/questions/25349178/calculating-percentage-of-bounding-box-overlap-for-image-detector-evaluation
@@ -21,6 +21,10 @@ def get_iou(bb1, bb2):
     -------
     float: Normalized between 0 and 1.
     """
+
+    if not bb2:
+        return 1.0
+
     assert bb1['x1'] < bb1['x2']
     assert bb1['y1'] < bb1['y2']
     assert bb2['x1'] < bb2['x2']
@@ -52,7 +56,7 @@ def get_iou(bb1, bb2):
     return iou
 
 
-def get_edit_distance(text1: str, text2: str):
+def get_edit_distance(text1: str, text2: str = None):
     """Calculate the edit distance between two strings.
 
     Parameters
@@ -67,10 +71,12 @@ def get_edit_distance(text1: str, text2: str):
     float
         The edit distance between the two strings, normalized to be between 0 and 1.
     """
+    if not text2:
+        return 1.0
 
     return editdistance.eval(text1, text2) / max(len(text1), len(text2))
 
-def get_font_distance(font1: dict, font2: dict):
+def get_font_distance(font1: dict, font2: dict = None):
     """Calculate the distance between two fonts.
 
     Parameters
@@ -85,6 +91,9 @@ def get_font_distance(font1: dict, font2: dict):
     float
         The distance between the two fonts. Normalized to be between 0 and 1.
     """
+    if not font2:
+        return 1.0
+
     font_size_loss = abs(font1['size'] - font2['size']) / max(font1['size'], font2['size'])
     font_family_loss = 0.0 if font1['family'] == font2['family'] else 1.0
     return (font_size_loss + font_family_loss) / 2
diff --git a/scripts/demo_validator.ipynb b/scripts/demo_validator.ipynb
@@ -23,16 +23,22 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import datetime\n",
-    "import pdf2image\n",
     "import math\n",
+    "import torch\n",
     "import random\n",
+    "import datetime\n",
+    "import bittensor as bt\n",
     "from IPython.display import display\n",
+    "\n",
+    "import pdf2image\n",
+    "import editdistance\n",
+    "\n",
+    "from faker import Faker\n",
+    "from typing import List\n",
     "from PIL import Image, ImageFilter, ImageDraw\n",
     "from reportlab.lib.pagesizes import letter\n",
     "from reportlab.pdfgen import canvas\n",
-    "from reportlab.lib.units import inch\n",
-    "from faker import Faker\n"
+    "from reportlab.lib.units import inch\n"
    ]
   },
   {
@@ -173,7 +179,7 @@
     "\n",
     "# Use the function and pass the data and the filename you want to save as\n",
     "data = create_invoice(invoice_info, \"sample_invoice.pdf\")\n",
-    "data"
+    "data[:3]"
    ]
   },
   {
@@ -184,7 +190,7 @@
    "source": [
     "\n",
     "\n",
-    "def corrupt_image(input_pdf_path, output_pdf_path, theta=1, border=50, noise=0.1, scale=0.95, blur=1, spot=(100,100)):\n",
+    "def corrupt_image(input_pdf_path, output_pdf_path, theta=0.2, border=50, noise=0.1, scale=0.95, blur=1, spot=(100,100)):\n",
     "    # Convert PDF to images\n",
     "    images = pdf2image.convert_from_path(input_pdf_path)\n",
     "\n",
@@ -362,7 +368,18 @@
     "    return image\n",
     "\n",
     "# scale data so that it matches the image size\n",
-    "def scale_data(data, w, h):\n",
+    "def scale_data(data: List[dict], w: int, h: int):\n",
+    "    \"\"\"\n",
+    "    Rescales the position data so that it matches the image size\n",
+    "\n",
+    "    Args:\n",
+    "        data (List[dict]): List of dictionaries containing the position, font and text of each section\n",
+    "        w (int): Width of the image\n",
+    "        h (int): Height of the image\n",
+    "\n",
+    "    Returns:\n",
+    "        List[dict]: List of dictionaries containing the position, font and text of each section\n",
+    "    \"\"\"\n",
     "    scaled_data = []\n",
     "    for section in data:\n",
     "        entry = section.copy()\n",
@@ -379,13 +396,12 @@
    "outputs": [],
    "source": [
     "# Path to the image you want to extract text from\n",
-    "path = os.path.join(os.getcwd(), 'sample_invoice.pdf')\n",
+    "path = os.path.join(os.getcwd(), 'noisy_invoice.pdf')\n",
     "\n",
     "# Read the pdf into memory\n",
     "image = pdf2image.convert_from_path(path)[0]\n",
     "# convert to PIL image\n",
     "image = image.convert('RGB')\n",
-    "# image\n",
     "\n",
     "\n",
     "scaled_data = scale_data(data, *image.size)\n",
@@ -397,7 +413,136 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "\n",
+    "def get_position_error(boxA: List[float], boxB: List[float] = None):\n",
+    "    \"\"\"\n",
+    "    Calculate the position error between two bounding boxes, defined as 1 - IoU (intersection over union).\n",
+    "\n",
+    "    Args:\n",
+    "    - boxA (list): Bounding box coordinates of box A in the format [x1, y1, x2, y2].\n",
+    "    - boxB (list): Bounding box coordinates of box B in the format [x1, y1, x2, y2]. If missing or empty, the maximum error is returned.\n",
+    "\n",
+    "    Returns:\n",
+    "    - float: The position error (1 - IoU), ranging from 0 to 1, or 1.0 when boxB is missing.\n",
+    "    \"\"\"\n",
+    "    if not boxB:\n",
+    "        return 1.0\n",
+    "\n",
+    "    xA = max(boxA[0], boxB[0])\n",
+    "    yA = max(boxA[1], boxB[1])\n",
+    "    xB = min(boxA[2], boxB[2])\n",
+    "    yB = min(boxA[3], boxB[3])\n",
+    "\n",
+    "    intersection_area = max(0, xB - xA + 1) * max(0, yB - yA + 1)\n",
+    "\n",
+    "    boxA_area = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)\n",
+    "    boxB_area = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)\n",
+    "\n",
+    "    iou = intersection_area / float(boxA_area + boxB_area - intersection_area)\n",
+    "    \n",
+    "    return 1 - iou\n",
+    "\n",
+    "def get_edit_distance(text1: str, text2: str = None):\n",
+    "    \"\"\"\n",
+    "    Calculate the edit distance between two strings.\n",
+    "\n",
+    "    Args:\n",
+    "    - text1 (str): The first string.\n",
+    "    - text2 (str): The second string.\n",
+    "\n",
+    "    Returns:\n",
+    "    - float: The edit distance between the two strings. Normalized to be between 0 and 1.\n",
+    "    \"\"\"\n",
+    "    if not text2:\n",
+    "        return 1.0\n",
+    "\n",
+    "    return editdistance.eval(text1, text2) / max(len(text1), len(text2))\n",
+    "\n",
+    "def get_font_distance(font1: dict, font2: dict = None):\n",
+    "    \"\"\"\n",
+    "    Calculate the distance between two fonts, based on the font size and font family.\n",
+    "\n",
+    "    Args:\n",
+    "    - font1 (dict): The first font.\n",
+    "    - font2 (dict): The second font.\n",
+    "\n",
+    "    Returns:\n",
+    "    - float: The distance between the two fonts. Normalized to be between 0 and 1.\n",
+    "    \"\"\"\n",
+    "    if not font2:\n",
+    "        return 1.0\n",
+    "\n",
+    "    font_size_loss = abs(font1['size'] - font2['size']) / max(font1['size'], font2['size'])\n",
+    "    font_family_loss = 0.0 if font1['family'] == font2['family'] else 1.0\n",
+    "    return (font_size_loss + font_family_loss) / 2\n",
+    "\n",
+    "def loss(label: dict, pred: dict, alpha_p=1.0, alpha_f=1.0, alpha_t=1.0):\n",
+    "    \"\"\"\n",
+    "    Compute the loss for a section of the image based on how far the prediction is from the ground truth.\n",
+    "    The loss is the weighted average (weights alpha_p, alpha_f, alpha_t) of:\n",
+    "    - the position error (1 - intersection over union) of the bounding boxes,\n",
+    "    - the delta between the predicted font and the ground truth font,\n",
+    "    - and the edit distance between the predicted text and the ground truth text.\n",
+    "\n",
+    "    Args:\n",
+    "    - label (dict): The ground truth data for the section.\n",
+    "    - pred (dict): The predicted data for the section.\n",
+    "\n",
+    "    Returns:\n",
+    "    - float: The loss for the section (lower is better; 0 is a perfect match). Bounded between 0 and 1.\n",
+    "    \"\"\"\n",
+    "    position_loss = get_position_error(label['position'], pred.get('position'))\n",
+    "    font_loss = get_font_distance(label['font'], pred.get('font'))\n",
+    "    text_loss = get_edit_distance(label['text'], pred.get('text'))\n",
+    "\n",
+    "    total_loss = (alpha_p * position_loss + alpha_f * font_loss + alpha_t * text_loss) / (alpha_p + alpha_f + alpha_t)\n",
+    "\n",
+    "    bt.logging.info(f\"position_loss: {position_loss:.3f}, font_loss: {font_loss:.3f}, text_loss: {text_loss:.3f}, total_loss: {total_loss:.3f}\")\n",
+    "\n",
+    "    return total_loss\n",
+    "\n",
+    "\n",
+    "def reward(image_data: List[dict], predictions: List[dict]) -> float:\n",
+    "    \"\"\"\n",
+    "    Reward the miner response to the OCR request.\n",
+    "\n",
+    "    Args:\n",
+    "    - image_data (list): The ground truth data for the image.\n",
+    "    - predictions (list): The predicted data for the image.\n",
+    "\n",
+    "    Returns:\n",
+    "    - float: The reward for the miner response. Bounded between 0 and 1.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    if predictions is None:\n",
+    "        return 0.0\n",
+    "\n",
+    "    # Take mean score over all sections in document\n",
+    "    predictions_loss = torch.mean(torch.FloatTensor([loss(label, pred) for label, pred in zip(image_data, predictions)]))\n",
+    "\n",
+    "    # convert loss to reward (invert and scale)\n",
+    "    raw_reward = 1.0 / (predictions_loss + 0.1)\n",
+    "    # NOTE: Tanh will saturate quickly and so two losses of 0.1 and 0.01 would produce raw_rewards of 5.0 and ~9.1 which would both have tanh values of effectively 1.0.\n",
+    "    normalized_reward = torch.tanh(raw_reward)\n",
+    "\n",
+    "    bt.logging.info(f\"predictions_loss: {predictions_loss:.3f}, raw_reward: {raw_reward:.3f}, normalized_reward: {normalized_reward:.3f}\")\n",
+    "    return normalized_reward"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test the reward function with slightly modified image data\n",
+    "w, h = image.size\n",
+    "scaled_data2 = scale_data(data, w+100, h+100)\n",
+    "\n",
+    "\n",
+    "reward(scaled_data, scaled_data2)"
+   ]
   },
   {
    "cell_type": "code",

diff --git a/scripts/noisy_invoice.pdf b/scripts/noisy_invoice.pdf
diff --git a/scripts/sample_invoice.pdf b/scripts/sample_invoice.pdf