Skip to content

Commit

Permalink
feat: pdf ingestion improved
Browse files (browse the repository at this point in the history)
  • Loading branch information
sammcj committed Oct 28, 2024
1 parent f8b3596 commit a3d15ad
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 41 deletions.
66 changes: 63 additions & 3 deletions filesystem/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,28 @@ func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, p
return "", nil, fmt.Errorf("failed to get file info: %w", err)
}

// Check if rootPath is a single PDF file
if !fileInfo.IsDir() {
isPDF, err := pdf.IsPDF(rootPath)
if err != nil {
return "", nil, fmt.Errorf("failed to check if file is PDF: %w", err)
}

if isPDF {
// Process single PDF file directly
content, err := pdf.ConvertPDFToMarkdown(rootPath, false)
if err != nil {
return "", nil, fmt.Errorf("failed to convert PDF: %w", err)
}

return fmt.Sprintf("File: %s", rootPath), []FileInfo{{
Path: rootPath,
Extension: ".md",
Code: content,
}}, nil
}
}

var treeString string

if !fileInfo.IsDir() {
Expand Down Expand Up @@ -257,6 +279,15 @@ func wrapCodeBlock(code, extension string) string {
}

func isBinaryFile(filePath string) (bool, error) {
// First check if it's a PDF
isPDF, err := pdf.IsPDF(filePath)
if err != nil {
return false, err
}
if isPDF {
return false, nil // Don't treat PDFs as binary files
}

file, err := os.Open(filePath)
if err != nil {
return false, err
Expand All @@ -273,8 +304,8 @@ func isBinaryFile(filePath string) (bool, error) {
// Use http.DetectContentType to determine the content type
contentType := http.DetectContentType(buffer[:n])

// Check if the content type starts with "text/"
return !strings.HasPrefix(contentType, "text/"), nil
// Allow PDFs and text files
return !strings.HasPrefix(contentType, "text/") && contentType != "application/pdf", nil
}

func PrintDefaultExcludes() {
Expand All @@ -287,6 +318,9 @@ func PrintDefaultExcludes() {
}

func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
// Check if it's the root path being processed (explicitly provided file)
isExplicitFile := path == rootPath

// Check if file is a PDF
isPDF, err := pdf.IsPDF(path)
if err != nil {
Expand All @@ -295,6 +329,12 @@ func processFile(path, relPath string, rootPath string, lineNumber, relativePath
}

if isPDF {
if !isExplicitFile {
// Skip PDFs during directory traversal
return
}

utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Converting PDF to markdown: %s", path), color.FgBlue)
content, err := pdf.ConvertPDFToMarkdown(path, false)
if err != nil {
utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed)
Expand Down Expand Up @@ -456,9 +496,29 @@ func isExcluded(path string, patterns []string) bool {
}

func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock bool) (FileInfo, error) {
// Check if it's a PDF first
isPDF, err := pdf.IsPDF(path)
if err != nil {
return FileInfo{}, fmt.Errorf("failed to check if file is PDF: %w", err)
}

if isPDF {
content, err := pdf.ConvertPDFToMarkdown(path, false)
if err != nil {
return FileInfo{}, fmt.Errorf("failed to convert PDF: %w", err)
}

return FileInfo{
Path: path,
Extension: ".md",
Code: content,
}, nil
}

// Handle non-PDF files
content, err := os.ReadFile(path)
if err != nil {
return FileInfo{}, fmt.Errorf("failed to read file %s: %w", path, err)
return FileInfo{}, fmt.Errorf("failed to read file: %w", err)
}

code := string(content)
Expand Down
68 changes: 46 additions & 22 deletions pdf/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
package pdf

import (
"bytes"
"fmt"
"io"
"net/http"
Expand All @@ -15,6 +14,7 @@ import (
)

// ConvertPDFToMarkdown converts a PDF file to markdown format
func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
var reader io.ReadCloser
var err error
Expand All @@ -26,33 +26,31 @@ func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
}
defer reader.Close()

// Create a temporary file to store the PDF
tempFile, err := os.CreateTemp("", "ingest-*.pdf")
if err != nil {
return "", fmt.Errorf("failed to create temp file: %w", err)
}
defer os.Remove(tempFile.Name())
defer tempFile.Close()

// Copy the downloaded PDF to the temp file
if _, err := io.Copy(tempFile, reader); err != nil {
return "", fmt.Errorf("failed to save PDF: %w", err)
}

path = tempFile.Name()
}

// Open the PDF file
// Open and read the PDF
f, r, err := pdf.Open(path)
if err != nil {
return "", fmt.Errorf("failed to open PDF: %w", err)
}
defer f.Close()

var buf bytes.Buffer
var buf strings.Builder
buf.WriteString(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path)))

// Read each page
// Extract text from each page
totalPages := r.NumPage()
for pageNum := 1; pageNum <= totalPages; pageNum++ {
page := r.Page(pageNum)
Expand All @@ -65,13 +63,21 @@ func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
}

// Add page header and content
buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum))
buf.WriteString(cleanText(text))
buf.WriteString("\n\n")
// Clean and process the text
cleanedText := cleanText(text)
if cleanedText != "" {
buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum))
buf.WriteString(cleanedText)
buf.WriteString("\n\n")
}
}

return buf.String(), nil
result := buf.String()
if strings.TrimSpace(result) == strings.TrimSpace(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path))) {
return "", fmt.Errorf("no text content could be extracted from PDF")
}

return result, nil
}

// IsPDF checks if a file is a PDF based on its content type or extension
Expand All @@ -80,7 +86,7 @@ func IsPDF(path string) (bool, error) {
if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
resp, err := http.Head(path)
if err != nil {
return false, err
return false, fmt.Errorf("failed to check URL for PDF: %w", err)
}
defer resp.Body.Close()
return resp.Header.Get("Content-Type") == "application/pdf", nil
Expand All @@ -89,19 +95,19 @@ func IsPDF(path string) (bool, error) {
// Check local file
file, err := os.Open(path)
if err != nil {
return false, err
return false, fmt.Errorf("failed to open file: %w", err)
}
defer file.Close()

// Read first 512 bytes to determine file type
buffer := make([]byte, 512)
_, err = file.Read(buffer)
n, err := file.Read(buffer)
if err != nil && err != io.EOF {
return false, err
return false, fmt.Errorf("failed to read file header: %w", err)
}

// Check file signature
contentType := http.DetectContentType(buffer)
contentType := http.DetectContentType(buffer[:n])
if contentType == "application/pdf" {
return true, nil
}
Expand Down Expand Up @@ -130,19 +136,37 @@ func downloadPDF(url string) (io.ReadCloser, error) {
}

func cleanText(text string) string {
// Remove excessive whitespace
text = strings.ReplaceAll(text, "\r", "")
text = strings.TrimSpace(text)
if strings.Contains(text, "%PDF-") || strings.Contains(text, "endobj") {
// This appears to be raw PDF data rather than extracted text
return ""
}

// Normalize line endings
// Remove control characters except newlines and tabs
text = strings.Map(func(r rune) rune {
if r < 32 && r != '\n' && r != '\t' {
return -1
}
return r
}, text)

// Split into lines and clean each line
lines := strings.Split(text, "\n")
var cleanLines []string

for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
cleanLines = append(cleanLines, line)

// Skip empty lines and lines that look like PDF syntax
if line == "" ||
strings.HasPrefix(line, "%") ||
strings.HasPrefix(line, "/") ||
strings.Contains(line, "obj") ||
strings.Contains(line, "endobj") ||
strings.Contains(line, "stream") {
continue
}

cleanLines = append(cleanLines, line)
}

return strings.Join(cleanLines, "\n\n")
Expand Down
32 changes: 16 additions & 16 deletions web/integration.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,33 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
// Check if URL points to a PDF
isPDF, err := pdf.IsPDF(urlStr)
if err != nil {
return nil, fmt.Errorf("error checking PDF: %w", err)
return nil, fmt.Errorf("error checking PDF: %w", err)
}

if isPDF {
content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
if err != nil {
return nil, fmt.Errorf("error converting PDF: %w", err)
}
content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
if err != nil {
return nil, fmt.Errorf("error converting PDF: %w", err)
}

return &CrawlResult{
TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
Files: []filesystem.FileInfo{{
Path: urlStr,
Extension: ".md",
Code: content,
}},
}, nil
return &CrawlResult{
TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
Files: []filesystem.FileInfo{{
Path: urlStr,
Extension: ".md",
Code: content,
}},
}, nil
}

// Validate URL
parsedURL, err := url.Parse(urlStr)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
return nil, fmt.Errorf("invalid URL: %w", err)
}

if !strings.HasPrefix(parsedURL.Scheme, "http") {
return nil, fmt.Errorf("URL must start with http:// or https://")
return nil, fmt.Errorf("URL must start with http:// or https://")
}

// Initialize crawler with the start URL
Expand All @@ -57,7 +57,7 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
// Perform crawl
pages, err := crawler.Crawl(urlStr)
if err != nil {
return nil, fmt.Errorf("crawl failed: %w", err)
return nil, fmt.Errorf("crawl failed: %w", err)
}

// Convert crawled pages to FileInfo format
Expand Down

0 comments on commit a3d15ad

Please sign in to comment.