Skip to content

Commit 0fbf28c

Browse files
committedOct 11, 2024
cleaned general prompt
1 parent ae9041d commit 0fbf28c

File tree

2 files changed

+25
-12
lines changed

2 files changed

+25
-12
lines changed
 

‎README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ from aipdf import ocr
2626
api_key = 'your_openai_api_key'
2727

2828
file = open('somepdf.pdf', 'rb')
29-
markdown_pages = ocr(file, api_key, prompt="extract markdown, extract tables and turn charts into tables")
29+
markdown_pages = ocr(file, api_key)
3030

3131
```
3232

@@ -47,7 +47,7 @@ We chose that you pass a file object, because that way it is flexible for you to
4747
pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content)
4848

4949
# extract
50-
pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
50+
pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json")
5151

5252
```
5353
### From S3
@@ -62,7 +62,7 @@ s3 = boto3.client('s3', config=Config(signature_version='s3v4'),
6262

6363
pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read())
6464
# extract
65-
pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
65+
pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart")
6666
```
6767

6868

‎src/aipdf/ocr.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
import io
2-
from pdf2image import convert_from_bytes
3-
from PIL import Image
42
import base64
5-
import requests
6-
import os
73
import logging
84
import concurrent.futures
5+
6+
from pdf2image import convert_from_bytes
97
from openai import OpenAI
108

119
# Set up logging
1210
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
1311

14-
DEFAULT_PROMPT = "Please analyze this image and provide a markdown representation of its content. Include headings, lists, and any other relevant markdown formatting."
12+
DEFAULT_PROMPT = """
13+
Extract the full markdown text from the given image, following these guidelines:
14+
- Respond only with markdown, no additional commentary.
15+
- Capture all the text, respecting titles, headers, subheaders, equations, etc.
16+
- If there are tables in this page, convert each one into markdown table format and include it in the response.
17+
- If there are images, provide a brief description of what is shown in each image, and include it in the response.
18+
- if there are charts, for each chart include a markdown table with the data represents the chart, a column for each of the variables of the cart and the relevant estimated values
19+
20+
"""
1521

1622
def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEFAULT_PROMPT):
1723
"""
@@ -26,9 +32,11 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF
2632
Returns:
2733
str: The markdown representation of the image content, or None if an error occurs.
2834
"""
29-
30-
base64_image = base64.b64encode(file_object.read()).decode('utf-8')
35+
# Log that we're about to process a page
36+
logging.info("About to process a page")
3137

38+
base64_image = base64.b64encode(file_object.read()).decode('utf-8')
39+
3240
try:
3341
response = client.chat.completions.create(
3442
model=model,
@@ -53,6 +61,7 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF
5361

5462
# Extract the markdown content from the response
5563
markdown_content = response.choices[0].message.content
64+
logging.info("Page processed successfully")
5665
return markdown_content
5766

5867
except Exception as e:
@@ -99,7 +108,7 @@ def pdf_to_image_files(pdf_file):
99108
return image_files
100109

101110

102-
def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT):
111+
def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT, pages_list = None):
103112
"""
104113
Convert a PDF file to a list of markdown-formatted pages using OpenAI's API.
105114
@@ -109,13 +118,17 @@ def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1'
109118
model (str, optional): by default is gpt-4o
110119
base_url (str): Base URL of the API endpoint; use it to point the client wherever you need, e.g. at an Ollama server.
111120
prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT.
112-
121+
pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages.
113122
Returns:
114123
list: A list of strings, each containing the markdown representation of a PDF page.
115124
"""
116125
client = OpenAI(api_key=api_key, base_url = base_url) # Create OpenAI client
117126
# Convert PDF to image files
118127
image_files = pdf_to_image_files(pdf_file)
128+
129+
if pages_list:
130+
# Filter image_files to only include pages listed in pages_list (1-based page numbers)
131+
image_files = [img for i, img in enumerate(image_files) if i + 1 in pages_list]
119132

120133
# List to store markdown content for each page
121134
markdown_pages = [None] * len(image_files)

0 commit comments

Comments
 (0)