
Commit

update
AlexandreSenpai committed Mar 4, 2025
1 parent 7670d35 commit 0e161a3
Showing 34 changed files with 1,803 additions and 136 deletions.
7 changes: 7 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
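
For context, this settings file just points VS Code's test discovery at pytest over the tests directory. A minimal test it would pick up might look like the following sketch (a hypothetical tests/test_vision.py, not part of this commit, exercising the TextNormalizer added later in this diff):

# Hypothetical tests/test_vision.py -- one test this config would discover.
from hime.lib.vision import TextNormalizer

def test_remove_spaces_collapses_runs():
    norm = TextNormalizer()
    assert norm.remove_spaces("HELLO  ,  WORLD .") == "HELLO, WORLD."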
Binary file added CO.jpg
114 changes: 57 additions & 57 deletions __init__.py
@@ -1,57 +1,57 @@
###[BUILT-IN MODULES]###
from typing import Tuple
###[EXTERNAL MODULES]###
from PIL import Image, ImageDraw
###[PERSONAL MODULES]###
from lib.translate import Translate
from lib.vision import Vision
from lib.textfit import TextFit

if __name__ == '__main__':

img_in = './images/sample4.jpg'

fit_tool = TextFit()
    vision = Vision(service_account=r'')  # path to your service account credentials file
translate = Translate()
translations = []

    response: Tuple[Image.Image, list] = vision.detect_text(img_in)  # image to translate
for block in response[1]:
translations.append({
"font_color": block.get('font_color'),
"translated": translate.translate(text=block.get('text', ''), target_language='pt-br'),
"text_area": block.get('area'),
"text_box_size": block.get('text_box_size')
})

    # # Hardcoded sample output so we don't need to send requests to Google when testing
# translations = [
# {'font_color': 255, 'translated': 'SE PAPI E SUU VIREM BRINCAR COMIGO ...', 'text_area': ((1368, 294), (1582, 499)), 'text_box_size': (214, 205)},
# {'font_color': 255, 'translated': 'ENTÃO QUERO FICAR AQUI.', 'text_area': ((182, 648), (395, 757)), 'text_box_size': (213, 109)},
# {'font_color': 255, 'translated': 'MAS VOCÊ NÃO TEM QUE VIR.', 'text_area': ((1005, 1014), (1131, 1243)), 'text_box_size': (126, 229)},
# {'font_color': 255, 'translated': 'O QUE VOCÊ FAZ, QUERIDA?', 'text_area': ((160, 1028), (344, 1142)), 'text_box_size': (184, 114)},
# {'font_color': 255, 'translated': 'HÃ?!', 'text_area': ((435, 1067), (594, 1121)), 'text_box_size': (159, 54)},
# {'font_color': 255, 'translated': '!!', 'text_area': ((1547, 1265), (1560, 1283)), 'text_box_size': (13, 18)},
# {'font_color': 255, 'translated': '35', 'text_area': ((1667, 1521), (1689, 1533)), 'text_box_size': (22, 12)},
# {'font_color': 255, 'translated': 'VOCÊ AMA ESSE HUMANO. MAS POR QUE...?', 'text_area': ((1034, 1655), (1282, 1764)), 'text_box_size': (248, 109)},
# {'font_color': 255, 'translated': 'MAS ... PAPI, SUU ...', 'text_area': ((1546, 1634), (1647, 1743)), 'text_box_size': (101, 109)},
# {'font_color': 255, 'translated': 'Você fez algo estranho com ela?', 'text_area': ((791, 1735), (994, 1786)), 'text_box_size': (203, 51)},
# {'font_color': 255, 'translated': 'Ah bem. isso é; üh.', 'text_area': ((597, 1781), (757, 1845)), 'text_box_size': (160, 64)},
# {'font_color': 255, 'translated': 'BEM...', 'text_area': ((131, 1814), (280, 1847)), 'text_box_size': (149, 33)},
# {'font_color': 255, 'translated': 'SIMPLESMENTE NÃO CONSIGO ENTENDER.', 'text_area': ((494, 2192), (675, 2368)), 'text_box_size': (181, 176)},
# {'font_color': 255, 'translated': 'Adivinhando com base em suas ações ...', 'text_area': ((1347, 2241), (1477, 2341)), 'text_box_size': (130, 100)}
# ]

# print(translations)

img = response[0]
# img = Image.open('./blank.png')
canvas = ImageDraw.Draw(img)

# Trying to fix text position ;-;
for obj in translations:
rows = fit_tool.fit(row_obj=obj, canvas=canvas)

# Output image
img.save('out.png', 'png')
# ###[BUILT-IN MODULES]###
# from typing import Tuple
# ###[EXTERNAL MODULES]###
# from PIL import Image, ImageDraw
# ###[PERSONAL MODULES]###
# from hime.lib.translate import Translate
# from hime.lib.vision import Vision
# from hime.lib.textfit import TextFit

# if __name__ == '__main__':

# img_in = './images/1.png'

# fit_tool = TextFit()
#     vision = Vision(service_account=r'/home/alexandresenpai/.credentials/gcp/hime.json')  # path to your service account credentials file
# translate = Translate()
# translations = []

# response = vision.detect_text(img_in) # Image to translate
# for block in response[1]:
# translations.append({
# "font_color": block.get('font_color'),
# "translated": translate.translate(text=block.get('text', ''), target_language='pt-br'),
# "text_area": block.get('area'),
# "text_box_size": block.get('text_box_size')
# })

#     # # Hardcoded sample output so we don't need to send requests to Google when testing
# # translations = [
# # {'font_color': 255, 'translated': 'SE PAPI E SUU VIREM BRINCAR COMIGO ...', 'text_area': ((1368, 294), (1582, 499)), 'text_box_size': (214, 205)},
# # {'font_color': 255, 'translated': 'ENTÃO QUERO FICAR AQUI.', 'text_area': ((182, 648), (395, 757)), 'text_box_size': (213, 109)},
# # {'font_color': 255, 'translated': 'MAS VOCÊ NÃO TEM QUE VIR.', 'text_area': ((1005, 1014), (1131, 1243)), 'text_box_size': (126, 229)},
# # {'font_color': 255, 'translated': 'O QUE VOCÊ FAZ, QUERIDA?', 'text_area': ((160, 1028), (344, 1142)), 'text_box_size': (184, 114)},
# # {'font_color': 255, 'translated': 'HÃ?!', 'text_area': ((435, 1067), (594, 1121)), 'text_box_size': (159, 54)},
# # {'font_color': 255, 'translated': '!!', 'text_area': ((1547, 1265), (1560, 1283)), 'text_box_size': (13, 18)},
# # {'font_color': 255, 'translated': '35', 'text_area': ((1667, 1521), (1689, 1533)), 'text_box_size': (22, 12)},
# # {'font_color': 255, 'translated': 'VOCÊ AMA ESSE HUMANO. MAS POR QUE...?', 'text_area': ((1034, 1655), (1282, 1764)), 'text_box_size': (248, 109)},
# # {'font_color': 255, 'translated': 'MAS ... PAPI, SUU ...', 'text_area': ((1546, 1634), (1647, 1743)), 'text_box_size': (101, 109)},
# # {'font_color': 255, 'translated': 'Você fez algo estranho com ela?', 'text_area': ((791, 1735), (994, 1786)), 'text_box_size': (203, 51)},
# # {'font_color': 255, 'translated': 'Ah bem. isso é; üh.', 'text_area': ((597, 1781), (757, 1845)), 'text_box_size': (160, 64)},
# # {'font_color': 255, 'translated': 'BEM...', 'text_area': ((131, 1814), (280, 1847)), 'text_box_size': (149, 33)},
# # {'font_color': 255, 'translated': 'SIMPLESMENTE NÃO CONSIGO ENTENDER.', 'text_area': ((494, 2192), (675, 2368)), 'text_box_size': (181, 176)},
# # {'font_color': 255, 'translated': 'Adivinhando com base em suas ações ...', 'text_area': ((1347, 2241), (1477, 2341)), 'text_box_size': (130, 100)}
# # ]

# # print(translations)

# img = response[0]
# # img = Image.open('./blank.png')
# canvas = ImageDraw.Draw(img)

# # Trying to fix text position ;-;
# for obj in translations:
# rows = fit_tool.fit(row_obj=obj, canvas=canvas)

# # Output image
# img.save('out.png', 'png')
29 changes: 29 additions & 0 deletions a.py
@@ -0,0 +1,29 @@
import cv2


# Read the source image
img = cv2.imread("./download.jpg")

# Grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Find Canny edges
edged = cv2.Canny(gray, 30, 200)

# Finding Contours
# Note: on OpenCV >= 4, findContours no longer modifies its input,
# so a copy (e.g. edged.copy()) is only needed on OpenCV 3
contours, hierarchy = cv2.findContours(edged,
                                       cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)

print(contours)

cv2.imshow('Canny Edges After Contouring', edged)
cv2.waitKey(0)

print(f"Number of Contours found = {len(contours)}")


cv2.destroyAllWindows()
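
Since this scratch script appears to probe text-region detection, a small hedged extension (continuing the script above, same ./download.jpg assumption; boxes.png is an illustrative output name) would turn each contour into a bounding box for later masking:

# Hypothetical follow-up: draw a bounding box around each detected contour.
for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
cv2.imwrite("boxes.png", img)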
Binary file added blank.png
Binary file added download.jpg
Binary file added ellipsis.png
Binary file added filter.png
28 changes: 28 additions & 0 deletions hime/lib/canvas/__init__.py
@@ -0,0 +1,28 @@
from PIL.Image import Image
from PIL.ImageDraw import ImageDraw

class Canvas:

    def remove_text_block(self,
                          image: Image,
                          coords: tuple[tuple[int, int],
                                        tuple[int, int]]) -> Image:
        """Cover a detected text block with an opaque white rectangle."""
        draw = ImageDraw(image, 'RGBA')
        draw.rectangle(coords, (255, 255, 255, 255))

        image.save("blank.png")

        return image

    def text_block_aware(self,
                         image: Image,
                         coords: tuple[tuple[int, int],
                                       tuple[int, int]]) -> Image:
        """Outline a detected text block with a red ellipse (transparent fill)."""
        draw = ImageDraw(image, 'RGBA')
        draw.ellipse(coords, (255, 255, 255, 0), (255, 0, 0))

        image.save("ellipsis.png")

        return image
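
A minimal usage sketch for the new Canvas helper, assuming a local sample.png and an illustrative coordinate pair (neither is part of this commit):

from PIL.Image import open as p_open
from hime.lib.canvas import Canvas

canvas = Canvas()
page = p_open("sample.png")  # hypothetical input page
box = ((100, 50), (300, 120))  # illustrative text-block coordinates
cleaned = canvas.remove_text_block(page, box)  # writes blank.png
canvas.text_block_aware(cleaned, box)          # writes ellipsis.png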
File renamed without changes.
File renamed without changes.
208 changes: 208 additions & 0 deletions hime/lib/vision/__init__.py
@@ -0,0 +1,208 @@

from dataclasses import dataclass, field
from io import BytesIO
import math
from typing import List, Tuple, Union

import cv2
import easyocr
import numpy as np
from PIL import ImageOps
from PIL.Image import Image, fromarray as p_fromarray
from PIL.Image import open as p_open

from deskew import determine_skew

class ImagePrep:
def resize(self,
image: Image,
scale: float = 2.5) -> Tuple[float, Image]:
w_size, h_size = image.size
return scale, image.resize(
size=(math.ceil(w_size * scale),
math.ceil(h_size * scale))
)

def set_dpi(self,
image: Image,
dpi: int = 300) -> Image:
temp_img = BytesIO()
image.save(temp_img, format="png", dpi=(dpi, dpi))
return p_open(temp_img)

def rotate(self,
image: Image,
angle: float,
background: Union[int,
Tuple[int, int, int]]) -> np.ndarray:
img_arr = np.asarray(image)
        # NOTE: shape[:2] is (height, width); these names are swapped but
        # used consistently below, following the deskew project's example.
        old_width, old_height = img_arr.shape[:2]
angle_radian = math.radians(angle)
width = abs(np.sin(angle_radian) * old_height) + abs(np.cos(angle_radian) * old_width)
height = abs(np.sin(angle_radian) * old_width) + abs(np.cos(angle_radian) * old_height)

image_center = tuple(np.array(img_arr.shape[1::-1]) / 2)
rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
rot_mat[1, 2] += (width - old_width) / 2
rot_mat[0, 2] += (height - old_height) / 2
return cv2.warpAffine(
img_arr,
rot_mat,
(int(round(height)), int(round(width))),
borderValue=background # type: ignore
) # type: ignore

def deskew(self, image: Image):
grayscale = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2GRAY)
angle = determine_skew(grayscale)

if angle is None:
raise Exception("Could not determine angle.")

        rotated = self.rotate(image, float(angle), (0, 0, 0))
        cv2.imwrite('output.png', rotated)  # saved for visual inspection only
        return p_fromarray(rotated)

def convert_to_vector(self, image: Image):

# Convert the image to grayscale
gray_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2GRAY)

# Apply Canny edge detection to find outlines
edges = cv2.Canny(gray_image, 100, 200)

# Find contours (shapes) in the image
contours, _ = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)

# Create a new blank image to draw the vectorized image with white background
vector_image = np.ones_like(gray_image) * 255

# Draw contours on the blank image to form vectorized shapes in black color
cv2.drawContours(vector_image, contours, -1, (0, 0, 0), thickness=6)

cv2.imwrite("box.png", vector_image)

return p_fromarray(vector_image)

class TextNormalizer:
    def remove_spaces(self, text: str) -> str:
        """Collapse repeated spaces and fix space-before-punctuation.

        e.g. "HELLO  ,  WORLD ." -> "HELLO, WORLD."
        """
        full_txt = ""
        for i, c in enumerate(text):
            if i == 0:
                full_txt += c
                continue

            if c == " " and text[i-1] == " ":
                continue

            full_txt += c

        return full_txt.replace(" ,", ",").replace(" .", ".").strip()

def remove_breaklines(self, text: str) -> str:
return text.replace("\n", " ")

@dataclass
class Text:
content: str = field(default="")
top_left: Tuple[int, int] = field(default_factory=lambda: (0, 0))
bottom_right: Tuple[int, int] = field(default_factory=lambda: (0, 0))
width: int = field(default=0)
height: int = field(default=0)
scale_factor: float = field(default=0)

EasyOCROutput = List[
Tuple[
Tuple[
Tuple[np.int32, np.int32],
Tuple[np.int32, np.int32],
Tuple[np.int32, np.int32],
Tuple[np.int32, np.int32]
],
str,
np.float64
]
]
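
# Shape of a single EasyOCR finding, for reference (values illustrative):
# ([[10, 12], [208, 12], [208, 46], [10, 46]], "SOME TEXT", 0.97)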

class Vision:

def __init__(self):
self.scale_factor = 0.0

def preprocess_image(self, image: Image) -> np.ndarray:
processor = ImagePrep()
scale, image = processor.resize(image=image, scale=2.5)
self.scale_factor = scale
image = processor.set_dpi(image)
# Convert image to grayscale
image = ImageOps.grayscale(image)

# Convert image to NumPy array for OpenCV operations
img_array = np.asarray(image)

# Apply Gaussian blur to remove noise
img_array = cv2.GaussianBlur(img_array, (5, 5), 0)

# Apply adaptive thresholding to create a binary image
img_array = cv2.adaptiveThreshold(img_array, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

        # Use OpenCV to equalize the histogram of the image
        img_array = cv2.equalizeHist(img_array)

        # The blurred/thresholded array is only written out for inspection;
        # the plain grayscale image is what gets returned to the OCR step.
        cv2.imwrite("filter.png", img_array)

        return np.asarray(image)

def translate_ocr_output(self, ocr_response: EasyOCROutput) -> Text:
text = Text()

top_left_y_options = []
top_left_x_options = []
bottom_right_y_options = []
bottom_right_x_options = []

for finding in ocr_response:
top_left, _, \
bottom_right, _ = finding[0]
content, _ = finding[1:]

top_left_x, top_left_y = top_left
bottom_right_x, bottom_right_y = bottom_right

top_left_y_options.append(top_left_y)
top_left_x_options.append(top_left_x)
bottom_right_y_options.append(bottom_right_y)
bottom_right_x_options.append(bottom_right_x)

text.content += content + " "

        # Union box over all findings: smallest top-left, largest bottom-right.
        top_left_y_options.sort()
        top_left_x_options.sort()

        bottom_right_y_options.sort(reverse=True)
        bottom_right_x_options.sort(reverse=True)

        text.top_left = (top_left_x_options[0], top_left_y_options[0])
        text.bottom_right = (bottom_right_x_options[0], bottom_right_y_options[0])

text.content = text.content.strip()

return text

    def detect_text(self, image: Image):
        text_norm = TextNormalizer()
        img = self.preprocess_image(image=image)
        # Note: building the Reader on every call is expensive; it loads the
        # detection/recognition models each time.
        reader = easyocr.Reader(lang_list=["en"], verbose=False)
        text: EasyOCROutput = reader.readtext(img)
        output = self.translate_ocr_output(text)
        output.scale_factor = self.scale_factor
        normalized = output.content
        normalized = text_norm.remove_breaklines(normalized)
        normalized = text_norm.remove_spaces(normalized)
        output.content = normalized
        return output
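
A minimal usage sketch for the new Vision class (the image path is illustrative; easyocr downloads its model weights on first run):

from PIL.Image import open as p_open
from hime.lib.vision import Vision

vision = Vision()
page = p_open("./images/1.png")  # hypothetical input page
text = vision.detect_text(page)
print(text.content, text.top_left, text.bottom_right, text.scale_factor)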