feat: pdf ingestion (#38)

sammcj · Oct 28, 2024 · 1b4f38e · 1b4f38e
1 parent f7ede0b
commit 1b4f38e
Show file tree

Hide file tree

Showing 5 changed files with 209 additions and 4 deletions.
diff --git a/filesystem/filesystem.go b/filesystem/filesystem.go
@@ -16,6 +16,7 @@ import (
 	"github.com/fatih/color"
 	"github.com/mitchellh/go-homedir"
 	ignore "github.com/sabhiram/go-gitignore"
+	"github.com/sammcj/ingest/pdf"
 	"github.com/sammcj/ingest/utils"
 )
 
@@ -286,6 +287,35 @@ func PrintDefaultExcludes() {
 }
 
 func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
+	// Check if file is a PDF
+	isPDF, err := pdf.IsPDF(path)
+	if err != nil {
+		utils.PrintColouredMessage("!", fmt.Sprintf("Failed to check if file is PDF %s: %v", path, err), color.FgRed)
+		return
+	}
+
+	if isPDF {
+		content, err := pdf.ConvertPDFToMarkdown(path, false)
+		if err != nil {
+			utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed)
+			return
+		}
+
+		filePath := path
+		if relativePaths {
+			filePath = filepath.Join(filepath.Base(rootPath), relPath)
+		}
+
+		mu.Lock()
+		*files = append(*files, FileInfo{
+			Path:      filePath,
+			Extension: ".md",
+			Code:      content,
+		})
+		mu.Unlock()
+		return
+	}
+
 	// Check if the file is binary
 	isBinary, err := isBinaryFile(path)
 	if err != nil {

diff --git a/go.mod b/go.mod
@@ -11,6 +11,7 @@ require (
 	github.com/bmatcuk/doublestar/v4 v4.7.1
 	github.com/charmbracelet/glamour v0.8.0
 	github.com/fatih/color v1.18.0
+	github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
 	github.com/mitchellh/go-homedir v1.1.0
 	github.com/pkoukk/tiktoken-go v0.1.7
 	github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06

diff --git a/go.sum b/go.sum
@@ -59,6 +59,8 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
+github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
 github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=

diff --git a/pdf/pdf.go b/pdf/pdf.go
@@ -0,0 +1,149 @@
+// pdf/pdf.go
+
+package pdf
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/ledongthuc/pdf"
+)
+
+// ConvertPDFToMarkdown extracts the plain text of a PDF and renders it as a
+// simple markdown document: a "# PDF Content: <name>" heading followed by one
+// "## Page N" section for every page whose value is not null.
+//
+// When isURL is false, path is a local file path. When isURL is true, path is
+// a URL; the document is downloaded into a temporary file that is removed
+// again before the function returns.
+//
+// An error is returned if the download fails, the temporary file cannot be
+// written, the PDF cannot be opened, or text extraction fails for any page.
+func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
+	// Work out the heading name before path is redirected to a temporary
+	// file, so the heading names the source document rather than a random
+	// "ingest-*.pdf" file. For URLs the query string and fragment are not
+	// part of the document name.
+	source := path
+	if isURL {
+		if i := strings.IndexAny(source, "?#"); i >= 0 {
+			source = source[:i]
+		}
+	}
+	title := filepath.Base(source)
+
+	if isURL {
+		tempPath, err := saveRemotePDF(path)
+		if err != nil {
+			return "", err
+		}
+		// Best-effort cleanup; runs after the PDF handle below is closed
+		// because deferred calls execute in reverse order.
+		defer os.Remove(tempPath)
+		path = tempPath
+	}
+
+	// Open the PDF file
+	f, r, err := pdf.Open(path)
+	if err != nil {
+		return "", fmt.Errorf("failed to open PDF: %w", err)
+	}
+	defer f.Close()
+
+	var buf bytes.Buffer
+	fmt.Fprintf(&buf, "# PDF Content: %s\n\n", title)
+
+	// Pages are requested by 1-based number, up to and including NumPage.
+	totalPages := r.NumPage()
+	for pageNum := 1; pageNum <= totalPages; pageNum++ {
+		page := r.Page(pageNum)
+		if page.V.IsNull() {
+			// Nothing to extract for this page number.
+			continue
+		}
+
+		text, err := page.GetPlainText(nil)
+		if err != nil {
+			return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
+		}
+
+		// Add page header and content
+		fmt.Fprintf(&buf, "## Page %d\n\n", pageNum)
+		buf.WriteString(cleanText(text))
+		buf.WriteString("\n\n")
+	}
+
+	return buf.String(), nil
+}
+
+// saveRemotePDF downloads the PDF at url into a new temporary file, closes the
+// file, and returns its path. The caller is responsible for removing the file.
+// On failure no temporary file is left behind.
+func saveRemotePDF(url string) (string, error) {
+	body, err := downloadPDF(url)
+	if err != nil {
+		return "", fmt.Errorf("failed to download PDF: %w", err)
+	}
+	defer body.Close()
+
+	tempFile, err := os.CreateTemp("", "ingest-*.pdf")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp file: %w", err)
+	}
+
+	// Close before handing the path to a second reader, and surface a close
+	// error because it can mean the data never reached the disk.
+	_, saveErr := io.Copy(tempFile, body)
+	if closeErr := tempFile.Close(); saveErr == nil {
+		saveErr = closeErr
+	}
+	if saveErr != nil {
+		os.Remove(tempFile.Name())
+		return "", fmt.Errorf("failed to save PDF: %w", saveErr)
+	}
+
+	return tempFile.Name(), nil
+}
+
+// IsPDF reports whether path refers to a PDF document.
+//
+// If path starts with "http://" or "https://", a HEAD request is sent and the
+// media type of the response's Content-Type header is compared with
+// "application/pdf". Parameters (such as "; charset=binary") and letter case
+// are ignored.
+//
+// Otherwise path is opened as a local file. The file counts as a PDF when its
+// first bytes (at most 512) are detected as application/pdf by
+// http.DetectContentType or, failing that, when its extension is ".pdf" in
+// any letter case.
+//
+// An error is returned when the HEAD request fails or when the file cannot be
+// opened or read.
+func IsPDF(path string) (bool, error) {
+	// Check if it's a URL
+	if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
+		resp, err := http.Head(path)
+		if err != nil {
+			return false, err
+		}
+		defer resp.Body.Close()
+
+		// Keep only the media type: servers commonly append parameters, and
+		// media types compare case-insensitively.
+		mediaType := strings.SplitN(resp.Header.Get("Content-Type"), ";", 2)[0]
+		return strings.EqualFold(strings.TrimSpace(mediaType), "application/pdf"), nil
+	}
+
+	// Check local file
+	file, err := os.Open(path)
+	if err != nil {
+		return false, err
+	}
+	defer file.Close()
+
+	// Read up to 512 bytes to determine the file type. A file shorter than
+	// the buffer ends with io.EOF or io.ErrUnexpectedEOF, which is not a
+	// failure here.
+	buffer := make([]byte, 512)
+	n, err := io.ReadFull(file, buffer)
+	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
+		return false, err
+	}
+
+	// Check file signature, sniffing only the bytes that were actually read.
+	if http.DetectContentType(buffer[:n]) == "application/pdf" {
+		return true, nil
+	}
+
+	// Also check file extension
+	return strings.EqualFold(filepath.Ext(path), ".pdf"), nil
+}
+
+// downloadPDF sends a GET request to url and returns the response body for
+// the caller to read and close.
+//
+// An error is returned, and the body is closed, when the request fails, the
+// status is not 200 OK, or the media type of the Content-Type header is not
+// "application/pdf". Content-Type parameters and letter case are ignored.
+func downloadPDF(url string) (io.ReadCloser, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		resp.Body.Close()
+		return nil, fmt.Errorf("failed to download PDF: status code %d", resp.StatusCode)
+	}
+
+	// Compare only the media type: a header such as
+	// "application/pdf; charset=binary" still describes a PDF.
+	mediaType := strings.TrimSpace(strings.SplitN(resp.Header.Get("Content-Type"), ";", 2)[0])
+	if !strings.EqualFold(mediaType, "application/pdf") {
+		resp.Body.Close()
+		return nil, fmt.Errorf("URL does not point to a PDF file")
+	}
+
+	return resp.Body, nil
+}
+
+// cleanText tidies text extracted from a PDF page. Every carriage return is
+// dropped, each line is stripped of surrounding whitespace, lines that end up
+// empty are discarded, and the surviving lines are joined with one blank line
+// between them. Input without any non-blank line yields the empty string.
+func cleanText(text string) string {
+	stripped := strings.TrimSpace(strings.ReplaceAll(text, "\r", ""))
+
+	var out strings.Builder
+	for _, raw := range strings.Split(stripped, "\n") {
+		trimmed := strings.TrimSpace(raw)
+		if trimmed == "" {
+			continue
+		}
+		// Separate consecutive kept lines with a blank line.
+		if out.Len() > 0 {
+			out.WriteString("\n\n")
+		}
+		out.WriteString(trimmed)
+	}
+
+	return out.String()
+}
diff --git a/web/integration.go b/web/integration.go
@@ -9,6 +9,7 @@ import (
 	"strings"
 
 	"github.com/sammcj/ingest/filesystem"
+	"github.com/sammcj/ingest/pdf"
 )
 
 type CrawlResult struct {
@@ -17,24 +18,46 @@ type CrawlResult struct {
 }
 
 func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string) (*CrawlResult, error) {
+	// Check if URL points to a PDF
+	isPDF, err := pdf.IsPDF(urlStr)
+	if err != nil {
+			return nil, fmt.Errorf("error checking PDF: %w", err)
+	}
+
+	if isPDF {
+			content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
+			if err != nil {
+					return nil, fmt.Errorf("error converting PDF: %w", err)
+			}
+
+			return &CrawlResult{
+					TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
+					Files: []filesystem.FileInfo{{
+							Path:      urlStr,
+							Extension: ".md",
+							Code:      content,
+					}},
+			}, nil
+	}
+
 	// Validate URL
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
-		return nil, fmt.Errorf("invalid URL: %w", err)
+			return nil, fmt.Errorf("invalid URL: %w", err)
 	}
 
 	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return nil, fmt.Errorf("URL must start with http:// or https://")
+			return nil, fmt.Errorf("URL must start with http:// or https://")
 	}
 
-	// Initialise crawler with the start URL
+	// Initialize crawler with the start URL
 	crawler := NewCrawler(options, urlStr)
 	crawler.SetExcludePatterns(excludePatterns)
 
 	// Perform crawl
 	pages, err := crawler.Crawl(urlStr)
 	if err != nil {
-		return nil, fmt.Errorf("crawl failed: %w", err)
+			return nil, fmt.Errorf("crawl failed: %w", err)
 	}
 
 	// Convert crawled pages to FileInfo format