diff --git a/.dependencies_installed b/.dependencies_installed
deleted file mode 100644
index e69de29b..00000000
diff --git a/.env.miner.example b/.env.miner.example
new file mode 100644
index 00000000..c88cde1d
--- /dev/null
+++ b/.env.miner.example
@@ -0,0 +1,8 @@
+WANDB_OFF = False # Turn off wandb.
+WANDB_API_KEY = your_wandb_api_key # Your wandb api key, for example: sk-proj-1234567890
+WANDB_ENTITY_NAME = your_wandb_entity_name # The entity name of the project.
+HF_TOKEN = your_huggingface_token # Your huggingface token, for example: hf_1234567890
+LLM_API_KEY = your_openai_api_key # Your openai api key, for example: sk-proj-1234567890
+LLM_MODEL_ID = your_openai_model_id # Minimum model_id: gpt-4o
+LLM_MODEL_URL = your_openai_model_url # Your openai model url, for example: https://api.openai.com/v1/
+
diff --git a/.env.validator.example b/.env.validator.example
new file mode 100644
index 00000000..b360059e
--- /dev/null
+++ b/.env.validator.example
@@ -0,0 +1,10 @@
+WANDB_OFF = False # Turn off wandb.
+WANDB_API_KEY = your_wandb_api_key # Your wandb api key, for example: sk-proj-1234567890
+WANDB_ENTITY_NAME = your_wandb_entity_name # The entity name of the project.
+LLM_API_KEY = your_openai_api_key # Your openai api key, for example: sk-proj-1234567890
+LLM_MODEL_ID = your_openai_model_id # Minimum model_id: gpt-4o
+LLM_MODEL_URL = your_openai_model_url # Your openai model url, for example: https://api.openai.com/v1/
+LIGHTHOUSE_SERVER_PORT = 5000 # FastAPI server port used to fetch the Lighthouse score.
+NEURON_EPOCH_LENGTH = 25 # The default epoch length (how often you sync with the chain, measured in 12-second blocks).
+AXON_OFF = True # Set this flag to not attempt to serve an axon.
+
diff --git a/.gitignore b/.gitignore
index 72a70c3a..87268a49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
-
+*.deb
 # PyInstaller
 # Usually these files are written by a python script from a template
 # before PyInstaller builds the exe, so as to inject date/other infos into it.
@@ -50,7 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
-
+*.db
 # Translations
 *.mo
 *.pot
@@ -163,3 +163,34 @@ testing/
 
 # Editors
 .vscode/settings.json
+
+.DS_Store
+temp.txt
+
+# env
+.env.miner
+.env.validator
+
+# test
+test.py
+
+# Wandb
+wandb/
+
+# work dir
+work/
+work_save/
+tests/work/
+*.png
+
+# scripts
+run_miner.sh
+run_miner2.sh
+run_validator.sh
+
+# developer doc
+developer_doc.md
+
+debug_images/
+submit_results.py
+*.pem
diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..dd6a2206
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.4
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..986e250b
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,114 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.0] - 2025-01-24
+### Added
+- Initial release of the project.
+- Features include Image2Html.
+
+## [1.0.1] - 2025-01-25
+### Fixed
+- Fixed an issue with sending session results to the stats collector.
+- Fixed an issue with the high-level matching score caused by re-initializing CUDA in forked processes.
+
+## [1.0.2] - 2025-01-25
+### Fixed
+- Fixed an issue that occurred when selecting winners and the total scores were all 0.
+
+## [1.0.3] - 2025-01-26
+### Fixed
+- Fixed an issue with saving the validator state, specifically the session number when the validator last set weights.
+
+## [1.0.4] - 2025-01-27
+### Changed
+- Changed the wandb run name to include the version.
+
+## [1.0.5] - 2025-01-27
+### Fixed
+- Fixed an issue with a dictionary's size changing during iteration.
+
+## [1.0.6] - 2025-01-29
+### Added
+- Added an organic task.
+
+## [1.0.7] - 2025-01-29
+### Changed
+- Set weights based on the total scores.
+
+## [1.0.8] - 2025-01-29
+### Changed
+- Set weights to zero in the new session.
+
+## [1.0.9] - 2025-01-30
+### Changed
+- Set weights based on the winner-take-all strategy.
+
+## [1.0.10] - 2025-01-31
+### Changed
+- Query miners without splitting the query window.
+
+## [1.0.11] - 2025-01-31
+### Changed
+- Kill the process on the port before starting the axon and the lighthouse server.
+
+## [1.0.12] - 2025-02-01
+### Changed
+- Install sudo if it is not installed.
+
+## [1.0.13] - 2025-02-01
+### Added
+- Added a balanced competition type.
+### Changed
+- Do not raise an exception if an error occurs while killing the process on the port.
+
+## [1.0.14] - 2025-02-02
+### Changed
+- Updated logging information.
+
+## [1.1.0] - 2025-02-03
+### Added
+- Added a new column to the total scores table to show the average score.
+### Changed
+- Only reward the winner of the current session.
+- Reward the winners of previous sessions with a tiny weight to prevent deregistration.
+- Changed the scoring mechanism to exclude the previous winner.
+
+## [1.1.1] - 2025-02-06
+### Changed
+- Changed the scoring mechanism to reward all miners.
+
+## [1.1.2] - 2025-02-07
+### Fixed
+- Fixed an issue with clearing the scores.
+
+## [1.1.3] - 2025-02-07
+### Changed
+- Only search for official websites.
+
+## [1.1.4] - 2025-02-10
+### Added
+- Save the results of the previous sessions.
+
+## [1.1.5] - 2025-02-12
+### Changed
+- Switched to the winner-take-all strategy.
+
+## [1.1.6] - 2025-02-13
+### Changed
+- Resolved an issue with DuckDuckGo search returning different results for the same query.
+
+## [1.1.7] - 2025-02-13
+### Fixed
+- Fixed an infinite-loop issue when synthesizing a task.
+
+## [1.1.8] - 2025-02-13
+### Added
+- Added logging to the random website dataset.
+
+## [1.1.9] - 2025-02-13
+### Updated
+- Updated the version of bittensor to 9.0.0 for dynamic TAO.
diff --git a/README.md b/README.md
index ba69bdae..ab9a1cd4 100644
--- a/README.md
+++ b/README.md
@@ -1,213 +1,155 @@
-At Tech Company, we are dedicated to providing the best technology solutions for your needs. Our team of experts is always ready to help you with any questions or problems you may have.
+At Tech Company, we are dedicated to providing the best technology solutions for your needs. Our team of experts is always ready to help you with any questions or problems you may have.
+        Removes excess <p> tags and trims text inside <p> tags if the text length exceeds the max limit.
+
+        :param html_content: The HTML content as a string.
+        :param max_p_count: The maximum number of <p> tags allowed in any parent tag.
+        :param max_text_length: The maximum length of text allowed inside <p> tags.
+        :return: Modified HTML content with excess <p> tags removed and text inside <p> tags shortened.
+        """
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Find all tags that contain <p> as direct children
+            for tag in soup.find_all(True):  # True will find all tags
+                # Find only <p> tags as direct children (not nested <p> tags)
+                p_tags = [child for child in tag.find_all('p', recursive=False)]
+
+                if len(p_tags) > max_p_count:
+                    # Remove excess <p> tags
+                    excess_p_tags = p_tags[max_p_count:]
+                    for p_tag in excess_p_tags:
+                        p_tag.decompose()  # Remove the excess <p> tag
+
+            # Traverse through all <p> tags and handle text nodes inside them
+            for p_tag in soup.find_all('p'):  # Find all <p> tags
+                for child in p_tag.contents:
+                    if isinstance(child, NavigableString):
+                        text_str = str(child)
+                        if len(text_str) > max_text_length:
+                            shortened = text_str[:max_text_length] + "..."  # Shorten the text
+                            child.replace_with(shortened)  # Replace the original text with the shortened version
+
+            return str(soup)
+        except Exception as e:
+            bt.logging.error(f"Error in shorten_html: {e}")
+            raise e
+
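For reference, the pruning rule implemented above can be exercised in isolation. A minimal sketch of the same idea, using hypothetical sample HTML and a direct-child limit of two chosen purely for illustration:

```python
from bs4 import BeautifulSoup

# Hypothetical input: a <div> with three direct <p> children, pruned to two.
html = "<div><p>one</p><p>two</p><p>three</p></div>"
soup = BeautifulSoup(html, "html.parser")

for tag in soup.find_all(True):  # visit every tag
    # recursive=False counts only the direct <p> children of this tag.
    for p_tag in tag.find_all("p", recursive=False)[2:]:
        p_tag.decompose()  # drop everything past the limit

print(soup)  # <div><p>one</p><p>two</p></div>
```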
+    async def generate_context(self) -> DatasetEntry:
+ try:
+ bt.logging.info("Generating Random Website context")
+ website_url = await self.get_random_website_url()
+ if website_url is None:
+ raise Exception("Failed to get a valid website URL")
+ html = await self.get_rendered_html(website_url)
+ html = await self.shorten_html(html)
+ bt.logging.info(f"Generated website URL: {website_url}")
+ return DatasetEntry(
+ src="random_website",
+ url=f"random_website_{website_url}",
+ ground_truth_html=html,
+ prompt="",
+ base64_image="",
+ )
+ except Exception as e:
+ bt.logging.error(f"Error in generate_context: {e}")
+ raise e
+
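Taken together, `generate_context` returns a `DatasetEntry` whose `ground_truth_html` has already been rendered and pruned. A hedged usage sketch — the class name `RandomWebsiteDataset` and its module path are assumptions, since this hunk does not show the enclosing class:

```python
import asyncio

# Assumed import path and class name; neither is shown in this hunk.
from webgenie.datasets.random_website_dataset import RandomWebsiteDataset

async def main():
    dataset = RandomWebsiteDataset()
    entry = await dataset.generate_context()
    print(entry.src)                     # "random_website"
    print(len(entry.ground_truth_html))  # size of the pruned HTML

asyncio.run(main())
```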
diff --git a/webgenie/datasets/synthetic_dataset.py b/webgenie/datasets/synthetic_dataset.py
new file mode 100644
index 00000000..8483d1cc
--- /dev/null
+++ b/webgenie/datasets/synthetic_dataset.py
@@ -0,0 +1,65 @@
+# The paper [Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset]
+# (https://arxiv.org/pdf/2403.09029v1#bib.bib5) is our inspiration.
+# The paper suggests using Mistral-7B-Instruct to generate concepts and Deepseek-Coder-33b-instruct
+# to generate HTML, but we are using OpenAI models here for now. We plan to use those models on the mainnet.
+
+import bittensor as bt
+from typing import List
+from pydantic import BaseModel, Field
+
+from webgenie.datasets.dataset import Dataset, DatasetEntry
+from webgenie.helpers.llms import openai_call
+from webgenie.prompts import PROMPT_GEN_CONCEPT, PROMPT_GEN_HTML
+
+
+class ConceptResponse(BaseModel):
+ concepts: List[str] = Field(description="The concept of the website")
+
+
+class HTMLResponse(BaseModel):
+ html: str = Field(description="The html code of the website")
+
+
+class SyntheticDataset(Dataset):
+ def __init__(self, has_ground_truth_html: bool = True):
+ self.has_ground_truth_html = has_ground_truth_html
+ self.concepts = []
+
+ async def _generate_concepts(self):
+ bt.logging.info("Generating concepts")
+ response = await openai_call(
+ messages = [
+ {"role": "system", "content": PROMPT_GEN_CONCEPT},
+ ],
+ response_format = ConceptResponse,
+ )
+ return response.concepts
+
+ async def _generate_html(self, concept: str):
+ bt.logging.debug(f"Generating HTML from concept: {concept}")
+ response = await openai_call(
+ messages = [
+ {"role": "system", "content": PROMPT_GEN_HTML.format(concept=concept)},
+ ],
+ response_format = HTMLResponse,
+ )
+ return response.html
+
+    async def generate_context(self) -> DatasetEntry:
+ bt.logging.info("Generating Synthetic context")
+ if not self.concepts:
+ self.concepts = await self._generate_concepts()
+
+ concept = self.concepts.pop(0)
+
+        if self.has_ground_truth_html:
+ ground_truth_html = await self._generate_html(concept)
+ else:
+ ground_truth_html = ""
+
+ return DatasetEntry(
+ src="synthetic",
+ url=f"synthetic_{concept}",
+ prompt=concept,
+ ground_truth_html=ground_truth_html,
+ )
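Because `_generate_concepts` produces a batch of concepts in one LLM call and `generate_context` pops a single concept per invocation, repeated calls amortize the concept-generation cost. A minimal usage sketch, assuming the `LLM_*` variables from the env examples above are configured:

```python
import asyncio

from webgenie.datasets.synthetic_dataset import SyntheticDataset

async def main():
    dataset = SyntheticDataset(has_ground_truth_html=True)
    entry = await dataset.generate_context()  # the first call also fills the concept pool
    print(entry.url)     # "synthetic_<concept>"
    print(entry.prompt)  # the concept the HTML was generated from

asyncio.run(main())
```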
diff --git a/webgenie/helpers/htmls.py b/webgenie/helpers/htmls.py
new file mode 100644
index 00000000..5cae30a5
--- /dev/null
+++ b/webgenie/helpers/htmls.py
@@ -0,0 +1,205 @@
+import bittensor as bt
+import asyncio
+import os
+import re
+import uuid
+
+from bs4 import BeautifulSoup
+from lxml import etree
+from lxml.etree import XMLSyntaxError
+from PIL import Image
+from playwright.async_api import async_playwright
+
+from webgenie.constants import (
+ WORK_DIR,
+ CHROME_HTML_LOAD_TIME,
+ JAVASCRIPT_RUNNING_TIME,
+ PLACE_HOLDER_IMAGE_URL,
+)
+from webgenie.helpers.images import image_to_base64
+
+
+def is_valid_resources(html_content: str) -> bool:
+ """
+ Check if the resources in the HTML content are valid.
+ """
+ # List of allowed patterns for CSS and JavaScript resources
+ allowed_patterns = [
+ r"https?://cdn.jsdelivr.net/npm/tailwindcss@[^/]+/dist/tailwind.min.css",
+ r"https?://stackpath.bootstrapcdn.com/bootstrap/[^/]+/css/bootstrap.min.css",
+ r"https?://code.jquery.com/jquery-[^/]+.min.js",
+ r"https?://stackpath.bootstrapcdn.com/bootstrap/[^/]+/js/bootstrap.bundle.min.js",
+ r"https?://cdnjs.cloudflare.com/ajax/libs/font-awesome/[^/]+/css/font-awesome.min.css",
+ r"https?://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap",
+ ]
+
+ soup = BeautifulSoup(html_content, 'html.parser')
+ resources = soup.find_all(['link', 'script'])
+
+ for resource in resources:
+ if resource.name == 'link' and resource.get('rel') == ['stylesheet']:
+ href = resource.get('href')
+ if href and not any(re.match(pattern, href) for pattern in allowed_patterns):
+                return False  # stylesheet source is not on the allowlist
+ elif resource.name == 'script':
+ src = resource.get('src')
+ if src and not any(re.match(pattern, src) for pattern in allowed_patterns):
+                return False  # script source is not on the allowlist
+
+ return True
+
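With the returns corrected, the function acts as an allowlist: any external stylesheet or script whose URL falls outside the pinned CDN patterns fails the check, while allowlisted resources pass. A quick illustration with hypothetical inputs:

```python
from webgenie.helpers.htmls import is_valid_resources

# A pinned Bootstrap CDN stylesheet is on the allowlist.
print(is_valid_resources(
    '<link rel="stylesheet" '
    'href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">'
))  # True

# An arbitrary external script is not.
print(is_valid_resources('<script src="https://example.com/app.js"></script>'))  # False
```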
+
+def is_valid_html(html_content: str) -> bool:
+ """
+ Check if the HTML is valid.
+ """
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ return True
+ except Exception as e:
+ bt.logging.error(f"An error occurred: {e}")
+ return False
+
+
+def seperate_html_css(html_content: str) -> tuple[str, str]:
+ """
+    Separate the HTML and CSS from the HTML content.
+ """
+ soup = BeautifulSoup(html_content, 'html.parser')
+
+ css = ''
+    for style_tag in soup.find_all('style'):
+        css += style_tag.get_text()
+        style_tag.decompose()
+
+ head = soup.head
+ if not head:
+ head = soup.new_tag('head')
+ soup.html.insert(0, head)
+
+ link_tag = soup.new_tag('link', rel='stylesheet', href='styles.css')
+ head.append(link_tag)
+ cleaned_html = str(soup)
+ return cleaned_html, css
+
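The split leaves the returned markup pointing at an external `styles.css`, so the two strings can be written out side by side. A short usage sketch with a hypothetical input document:

```python
from webgenie.helpers.htmls import seperate_html_css

html, css = seperate_html_css(
    "<html><head><style>body { color: red; }</style></head><body>Hi</body></html>"
)
# The returned HTML now carries <link rel="stylesheet" href="styles.css">
# in place of the inline <style> block.
with open("index.html", "w") as f:
    f.write(html)
with open("styles.css", "w") as f:
    f.write(css)
```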
+
+async def html_to_screenshot(html_content: str, page_load_time: int = 1000) -> str:
+ """
+ Take a screenshot of the HTML content.
+ """
+ html_path = f"{WORK_DIR}/screenshot_{uuid.uuid4()}.html"
+ with open(html_path, "w") as f:
+ f.write(html_content)
+ png_path = f"{WORK_DIR}/screenshot_{uuid.uuid4()}.png"
+ url = f"file://{os.path.abspath(html_path)}"
+
+ try:
+ async with async_playwright() as p:
+ # Choose a browser, e.g., Chromium, Firefox, or WebKit
+ browser = await p.chromium.launch()
+ page = await browser.new_page()
+
+ # Navigate to the URL
+ await page.goto(url, timeout=CHROME_HTML_LOAD_TIME)
+
+ await page.wait_for_load_state('networkidle')
+ await page.wait_for_timeout(JAVASCRIPT_RUNNING_TIME)
+
+ # Take the screenshot
+ await page.screenshot(
+ path=png_path,
+ full_page=True,
+ animations="disabled",
+ timeout=CHROME_HTML_LOAD_TIME,
+ )
+ await page.close()
+ await browser.close()
+ except Exception as e:
+ bt.logging.error(f"Failed to take screenshot due to: {e}. Generating a blank image.")
+ # Generate a blank image
+        img = Image.new('RGB', (1280, 960), color='white')
+ img.save(png_path)
+
+ await asyncio.sleep(0.1)
+ base64_image = image_to_base64(png_path)
+
+ await asyncio.sleep(0.1)
+ os.remove(html_path)
+ os.remove(png_path)
+ return base64_image
+
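Since rendering failures fall back to a blank 1280x960 image, callers always receive a base64 string. A hedged usage sketch — it assumes Playwright's Chromium is installed (e.g. via `playwright install chromium`) and that `WORK_DIR` exists:

```python
import asyncio

from webgenie.helpers.htmls import html_to_screenshot

async def main():
    b64 = await html_to_screenshot("<html><body><h1>Hello</h1></body></html>")
    print(b64[:32], "...")  # base64-encoded PNG of the rendered page

asyncio.run(main())
```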
+
+def format_html(html_content: str) -> str:
+ """
+ Format the HTML content.
+ """
+ soup = BeautifulSoup(html_content, 'html.parser')
+ return soup.prettify()
+
+
+def replace_image_sources(html_content: str, new_url: str = PLACE_HOLDER_IMAGE_URL) -> str:
+ """
+ Replace the image sources in the HTML content.
+ """
+ soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Replace 'src' attribute in <img> tags
+ for img_tag in soup.find_all('img'):
+ img_tag['src'] = new_url
+
+ # Replace 'srcset' attribute in