cleaned general prompt

torrmal · torrmal · commit 0fbf28cb76e8 · 2024-10-11T13:16:37.000-05:00
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ from aipdf import ocr
 api_key = 'your_openai_api_key'
 
 file = open('somepdf.pdf', 'rb')
-markdown_pages = ocr(file, api_key, prompt="extract markdown, extract tables and turn charts into tables")
+markdown_pages = ocr(file, api_key)
 
 ```
 
@@ -47,7 +47,7 @@ We chose that you pass a file object, because that way it is flexible for you to
 pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content)
 
 # extract
-pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
+pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json")
 
 ```
 ### From S3
@@ -62,7 +62,7 @@ s3 = boto3.client('s3', config=Config(signature_version='s3v4'),
 
 pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read())
 # extract 
-pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
+pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart")
 ```
 
 
diff --git a/src/aipdf/ocr.py b/src/aipdf/ocr.py
@@ -1,17 +1,23 @@
 import io
-from pdf2image import convert_from_bytes
-from PIL import Image
 import base64
-import requests
-import os
 import logging
 import concurrent.futures
+
+from pdf2image import convert_from_bytes
 from openai import OpenAI
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-DEFAULT_PROMPT = "Please analyze this image and provide a markdown representation of its content. Include headings, lists, and any other relevant markdown formatting."
+DEFAULT_PROMPT = """
+Extract the full markdown text from the given image, following these guidelines:
+- Respond only with markdown, no additional commentary.  
+- Capture all the text, respecting titles, headers, subheaders, equations, etc.
+- If there are tables in this page, convert each one into markdown table format and include it in the response.
+- If there are images, provide a brief description of what is shown in each image, and include it in the response.
+- if there are charts, for each chart include a markdown table with the data represents the chart, a column for each of the variables of the cart and the relevant estimated values
+          
+"""
 
 def process_image_to_markdown(file_object, client, model="gpt-4o",  prompt = DEFAULT_PROMPT):
     """
@@ -26,9 +32,11 @@ def process_image_to_markdown(file_object, client, model="gpt-4o",  prompt = DEF
     Returns:
         str: The markdown representation of the image content, or None if an error occurs.
     """
-    
-    base64_image = base64.b64encode(file_object.read()).decode('utf-8')
+    # Log that we're about to process a page
+    logging.info("About to process a page")
 
+    base64_image = base64.b64encode(file_object.read()).decode('utf-8')
+    
     try:
         response = client.chat.completions.create(
             model=model,
@@ -53,6 +61,7 @@ def process_image_to_markdown(file_object, client, model="gpt-4o",  prompt = DEF
         
         # Extract the markdown content from the response
         markdown_content = response.choices[0].message.content
+        logging.info("Page processed successfully")
         return markdown_content
     
     except Exception as e:
@@ -99,7 +108,7 @@ def pdf_to_image_files(pdf_file):
     return image_files
 
 
-def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT):
+def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT, pages_list = None):
     """
     Convert a PDF file to a list of markdown-formatted pages using OpenAI's API.
 
@@ -109,13 +118,17 @@ def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1'
         model (str, optional): by default is gpt-4o
         base_url (str): You can use this one to point the client whereever you need it like Ollama
         prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT.
-
+        pages_list (list, optional): 1-based page numbers to process. If provided, only these pages are converted. Defaults to None, which processes all pages; an empty list is treated the same way.
     Returns:
         list: A list of strings, each containing the markdown representation of a PDF page.
     """
     client = OpenAI(api_key=api_key, base_url = base_url)  # Create OpenAI client
     # Convert PDF to image files
     image_files = pdf_to_image_files(pdf_file)
+
+    if pages_list:
+        # Keep only the images whose 1-based page number appears in pages_list
+        image_files = [img for i, img in enumerate(image_files) if i + 1 in pages_list]
     
     # List to store markdown content for each page
     markdown_pages = [None] * len(image_files)