Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Second pass #5

Merged
merged 5 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ Before you proceed with the installation of the subnet, note the following:
- `neurons/miner.py`: Use `pytesseract` for OCR, and use `OCRSynapse` to communicate with validator

### Remaining changes to be done
In addition to the above files, we would also update the following files:
In addition to the above files, we have also updated the following files:
- `README.md`: This file contains the documentation for your project. Update this file to reflect your project's documentation.
- `CONTRIBUTING.md`: This file contains the instructions for contributing to your project. Update this file to reflect your project's contribution guidelines.
- `template/__init__.py`: This file contains the version of your project.
Expand Down
23 changes: 14 additions & 9 deletions neurons/miner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# TODO(developer): Set your name
# Copyright © 2023 <your name>

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
Expand All @@ -22,18 +20,16 @@
import bittensor as bt
import pytesseract

# Bittensor Miner Template:
# Bittensor OCR Miner
import ocr_subnet

from ocr_subnet.utils.serialize import deserialize_image

# import base miner class which takes care of most of the boilerplate
from ocr_subnet.base.miner import BaseMinerNeuron


class Miner(BaseMinerNeuron):
"""
Your miner neuron class. You should use this class to define your miner's behavior. In particular, you should replace the forward function with your own logic. You may also want to override the blacklist and priority functions according to your needs.
OCR miner neuron class. You may also want to override the blacklist and priority functions according to your needs.

This class inherits from the BaseMinerNeuron class, which in turn inherits from BaseNeuron. The BaseNeuron class takes care of routine tasks such as setting up wallet, subtensor, metagraph, logging directory, parsing config, etc. You can override any of the methods in BaseNeuron if you need to customize the behavior.

Expand All @@ -45,6 +41,7 @@ def __init__(self, config=None):

# TODO(developer): Anything specific to your use case you can do here


async def forward(
self, synapse: ocr_subnet.protocol.OCRSynapse
) -> ocr_subnet.protocol.OCRSynapse:
Expand All @@ -58,27 +55,35 @@ async def forward(
ocr_subnet.protocol.OCRSynapse: The synapse object with the 'response' field set to the extracted data.

"""
# Get image data
image = ocr_subnet.utils.image.deserialize(base64_string=synapse.base64_image)

image = deserialize_image(base64_string=synapse.base64_image)
# Use pytesseract to get the data
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

response = []
# Loop over each item in the 'text' part of the data
for i in range(len(data['text'])):
if data['text'][i].strip() != '': # This filters out empty text results
x1, y1, width, height = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
if width * height < 10: # This filters out small boxes (likely noise)
continue

x2, y2 = x1 + width, y1 + height

# Here we don't have font information, so we'll omit that.
# Pytesseract does not extract font family or size information.
entry = {
'index': i,
'position': [x1, y1, x2, y2],
'text': data['text'][i]
}
response.append(entry)

# Merge together words into sections, which are on the same line (same y value) and are close together (small distance in x)
response = ocr_subnet.utils.process.group_and_merge_boxes(response)

# Sort sections by y, then sort by x so that they read left to right and top to bottom
response = sorted(response, key=lambda item: (item['position'][1], item['position'][0]))

# Attach response to synapse and return it.
synapse.response = response

Expand Down
66 changes: 50 additions & 16 deletions neurons/validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# TODO(developer): Set your name
# Copyright © 2023 <your name>

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
Expand All @@ -17,21 +15,20 @@
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


import os
import time

# Bittensor
import hashlib
import bittensor as bt

from ocr_subnet.validator import forward
import ocr_subnet

# import base validator class which takes care of most of the boilerplate
from ocr_subnet.base.validator import BaseValidatorNeuron


class Validator(BaseValidatorNeuron):
"""
Your validator neuron class. You should use this class to define your validator's behavior. In particular, you should replace the forward function with your own logic.
OCR validator neuron class.

This class inherits from the BaseValidatorNeuron class, which in turn inherits from BaseNeuron. The BaseNeuron class takes care of routine tasks such as setting up wallet, subtensor, metagraph, logging directory, parsing config, etc. You can override any of the methods in BaseNeuron if you need to customize the behavior.

Expand All @@ -44,19 +41,56 @@ def __init__(self, config=None):
bt.logging.info("load_state()")
self.load_state()

# TODO(developer): Anything specific to your use case you can do here
self.image_dir = './data/images/'
if not os.path.exists(self.image_dir):
os.makedirs(self.image_dir)


async def forward(self):
    """
    The forward function is called by the validator every time step.

    It consists of 3 important steps:
    - Generate a challenge for the miners (in this case it creates a synthetic invoice image)
    - Query the miners with the challenge
    - Score the responses from the miners

    Args:
        self (:obj:`bittensor.neuron.Neuron`): The neuron object which contains all the necessary state for the validator.

    """
    # get_random_uids is an example method, but you can replace it with your own.
    miner_uids = ocr_subnet.utils.uids.get_random_uids(self, k=self.config.neuron.sample_size)

    # Make a unique filename by hashing the current timestamp so repeated
    # challenges do not overwrite each other on disk.
    filename = hashlib.md5(str(time.time()).encode()).hexdigest()

    # Create a random (optionally corrupted) invoice image together with its
    # ground-truth labels, and save the pdf under the hashed filename.
    # NOTE(review): the scraped diff showed a garbled literal here; the unused
    # `filename` variable above strongly implies it belongs in this path.
    image_data = ocr_subnet.validator.generate.invoice(
        path=os.path.join(self.image_dir, f"{filename}.pdf"), corrupt=True
    )

    # Create synapse object to send to the miner and attach the image.
    synapse = ocr_subnet.protocol.OCRSynapse(base64_image=image_data['base64_image'])

    # The dendrite client queries the network.
    responses = self.dendrite.query(
        # Send the query to selected miner axons in the network.
        axons=[self.metagraph.axons[uid] for uid in miner_uids],
        # Pass the synapse to the miner.
        synapse=synapse,
        # Do not deserialize the response so that we have access to the raw response.
        deserialize=False,
    )

    # Log the results for monitoring purposes.
    bt.logging.info(f"Received responses: {responses}")

    # Score the raw responses against the known labels of the generated invoice.
    rewards = ocr_subnet.validator.reward.get_rewards(self, labels=image_data['labels'], responses=responses)

    bt.logging.info(f"Scored responses: {rewards}")

    # Update the scores based on the rewards. You may want to define your own update_scores function for custom behavior.
    self.update_scores(rewards, miner_uids)


# The main function parses the configuration and runs the validator.
Expand Down
3 changes: 1 addition & 2 deletions ocr_subnet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# TODO(developer): Set your name
# Copyright © 2023 <your name>

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
Expand Down Expand Up @@ -29,3 +27,4 @@
from . import protocol
from . import base
from . import validator
from . import utils
2 changes: 0 additions & 2 deletions ocr_subnet/base/validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# TODO(developer): Set your name
# Copyright © 2023 <your name>

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
Expand Down
17 changes: 7 additions & 10 deletions ocr_subnet/protocol.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# TODO(developer): Set your name
# Copyright © 2023 <your name>

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
Expand All @@ -17,32 +15,31 @@
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import typing
import bittensor as bt

import bittensor as bt
from typing import Optional, List

class OCRSynapse(bt.Synapse):
    """
    A simple OCR synapse protocol representation which uses bt.Synapse as its base.
    This protocol enables communication between the miner and the validator.

    Attributes:
    - base64_image: Base64 encoding of pdf image to be processed by the miner.
    - response: List[dict] containing data extracted from the image.
    """

    # Required request input, filled by sending dendrite caller. It is a base64 encoded string.
    base64_image: str

    # Optional request output, filled by receiving axon.
    response: Optional[List[dict]] = None

    def deserialize(self) -> List[dict]:
        """
        Deserialize the miner response.

        Returns:
        - List[dict]: The deserialized response, which is a list of dictionaries containing the extracted data.
        """
        return self.response
1 change: 1 addition & 0 deletions ocr_subnet/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from . import config
from . import misc
from . import uids
from . import process
49 changes: 49 additions & 0 deletions ocr_subnet/utils/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import io
import fitz
import base64

from typing import List
from PIL import Image, ImageDraw


def serialize(image: "Image", format: str="JPEG") -> str:
    """Converts PIL image to base64 string.

    Args:
    - image: PIL image to encode.
    - format: Intermediate encoding format passed to ``Image.save`` (default JPEG).

    Returns:
    - str: Base64 encoding of the image bytes, suitable for `deserialize`.
    """

    buffer = io.BytesIO()
    image.save(buffer, format=format)
    # Bug fix: previously the raw JPEG bytes were returned without base64
    # encoding, so `deserialize` (which calls base64.b64decode) could not
    # round-trip the result and the annotated `str` return type was wrong.
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def deserialize(base64_string: str) -> "Image":
    """Decode a base64 string back into a PIL image.

    Args:
    - base64_string: Base64 encoding of the image bytes.

    Returns:
    - PIL image reconstructed from the decoded bytes.
    """

    raw_bytes = base64.b64decode(base64_string)
    return Image.open(io.BytesIO(raw_bytes))


def load(pdf_path: str, page: int=0, zoom_x: float=1.0, zoom_y: float=1.0) -> "Image":
    """Loads pdf image and converts to PIL image.

    Args:
    - pdf_path: Path of the pdf file to open.
    - page: Zero-based index of the page to render.
    - zoom_x: Horizontal zoom factor (1.0 means 100%).
    - zoom_y: Vertical zoom factor (1.0 means 100%).

    Returns:
    - PIL image of the rendered pdf page.
    """

    # Read the pdf into memory
    pdf = fitz.open(pdf_path)
    try:
        # Set zoom factors for x and y axis (1.0 means 100%)
        mat = fitz.Matrix(zoom_x, zoom_y)
        # Avoid shadowing the `page` parameter with the page object.
        pix = pdf[page].get_pixmap(matrix=mat)
        img_data = io.BytesIO(pix.tobytes('png'))
    finally:
        # Bug fix: the document was never closed, leaking the file handle.
        pdf.close()

    # convert to PIL image
    return Image.open(img_data)

def draw_boxes(image: "Image", response: List[dict], color='red'):
    """Draw a rectangle outline around each text box on the image.

    Args:
    - image: PIL image to annotate (modified in place).
    - response: List of dicts, each with a 'position' box [x1, y1, x2, y2].
    - color: Outline color for the rectangles.

    Returns:
    - The annotated PIL image.
    """

    canvas = ImageDraw.Draw(image)
    for entry in response:
        canvas.rectangle(entry['position'], outline=color)
    return image
64 changes: 64 additions & 0 deletions ocr_subnet/utils/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.

# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

from typing import List

def group_and_merge_boxes(data: List[dict], xtol: int=25, ytol: int=5) -> List[dict]:
    """
    Combines boxes that are close together into a single box so that the text is grouped into sections.

    Args:
    - data (list): List of dictionaries, each containing a 'position' box [x1, y1, x2, y2] and 'text'
    - xtol (int): Maximum distance between boxes in the x direction to be considered part of the same section
    - ytol (int): Maximum distance between boxes in the y direction to be considered part of the same section

    Returns:
    - list: List of dictionaries containing the position and text of each merged section
    """
    # Discard malformed entries (None, or missing a 'position' key).
    boxes = [item for item in data if item is not None and 'position' in item]

    # Step 1: bucket boxes into lines; a box joins the first line whose
    # leading box has a y1 within ytol of its own.
    lines: List[List[dict]] = []
    for box in boxes:
        target = None
        for candidate in lines:
            if candidate and abs(candidate[0]['position'][1] - box['position'][1]) <= ytol:
                target = candidate
                break
        if target is None:
            lines.append([box])
        else:
            target.append(box)

    # Step 2: within each line, sort left-to-right and fuse neighbours
    # whose horizontal gap is within xtol.
    merged: List[dict] = []
    for line in lines:
        line.sort(key=lambda entry: entry['position'][0])  # sort by x1
        idx = 0
        while idx + 1 < len(line):
            left = line[idx]['position']
            right = line[idx + 1]['position']
            if abs(left[2] - right[0]) <= xtol:  # horizontally adjacent
                # Replace the pair with their bounding box and joined text.
                line[idx] = {
                    'position': [min(left[0], right[0]), min(left[1], right[1]),
                                 max(left[2], right[2]), max(left[3], right[3])],
                    'text': line[idx]['text'] + ' ' + line[idx + 1]['text'],
                }
                del line[idx + 1]
            else:
                idx += 1
        merged.extend(line)

    return merged
19 changes: 0 additions & 19 deletions ocr_subnet/utils/serialize.py

This file was deleted.

Loading