feat: pdf ingestion improved

sammcj · Oct 28, 2024 · a3d15ad
1 parent f8b3596
commit a3d15ad
Show file tree

Hide file tree

Showing 3 changed files with 125 additions and 41 deletions.
diff --git a/filesystem/filesystem.go b/filesystem/filesystem.go
@@ -140,6 +140,28 @@ func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, p
 		return "", nil, fmt.Errorf("failed to get file info: %w", err)
 	}
 
+	// Check if rootPath is a single PDF file
+	if !fileInfo.IsDir() {
+		isPDF, err := pdf.IsPDF(rootPath)
+		if err != nil {
+			return "", nil, fmt.Errorf("failed to check if file is PDF: %w", err)
+		}
+
+		if isPDF {
+			// Process single PDF file directly
+			content, err := pdf.ConvertPDFToMarkdown(rootPath, false)
+			if err != nil {
+				return "", nil, fmt.Errorf("failed to convert PDF: %w", err)
+			}
+
+			return fmt.Sprintf("File: %s", rootPath), []FileInfo{{
+				Path:      rootPath,
+				Extension: ".md",
+				Code:      content,
+			}}, nil
+		}
+	}
+
 	var treeString string
 
 	if !fileInfo.IsDir() {
@@ -257,6 +279,15 @@ func wrapCodeBlock(code, extension string) string {
 }
 
 func isBinaryFile(filePath string) (bool, error) {
+	// First check if it's a PDF
+	isPDF, err := pdf.IsPDF(filePath)
+	if err != nil {
+		return false, err
+	}
+	if isPDF {
+		return false, nil // Don't treat PDFs as binary files
+	}
+
 	file, err := os.Open(filePath)
 	if err != nil {
 		return false, err
@@ -273,8 +304,8 @@ func isBinaryFile(filePath string) (bool, error) {
 	// Use http.DetectContentType to determine the content type
 	contentType := http.DetectContentType(buffer[:n])
 
-	// Check if the content type starts with "text/"
-	return !strings.HasPrefix(contentType, "text/"), nil
+	// Allow PDFs and text files
+	return !strings.HasPrefix(contentType, "text/") && contentType != "application/pdf", nil
 }
 
 func PrintDefaultExcludes() {
@@ -287,6 +318,9 @@ func PrintDefaultExcludes() {
 }
 
 func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
+	// Check if it's the root path being processed (explicitly provided file)
+	isExplicitFile := path == rootPath
+
 	// Check if file is a PDF
 	isPDF, err := pdf.IsPDF(path)
 	if err != nil {
@@ -295,6 +329,12 @@ func processFile(path, relPath string, rootPath string, lineNumber, relativePath
 	}
 
 	if isPDF {
+		if !isExplicitFile {
+			// Skip PDFs during directory traversal
+			return
+		}
+
+		utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Converting PDF to markdown: %s", path), color.FgBlue)
 		content, err := pdf.ConvertPDFToMarkdown(path, false)
 		if err != nil {
 			utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed)
@@ -456,9 +496,29 @@ func isExcluded(path string, patterns []string) bool {
 }
 
 func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock bool) (FileInfo, error) {
+	// Check if it's a PDF first
+	isPDF, err := pdf.IsPDF(path)
+	if err != nil {
+		return FileInfo{}, fmt.Errorf("failed to check if file is PDF: %w", err)
+	}
+
+	if isPDF {
+		content, err := pdf.ConvertPDFToMarkdown(path, false)
+		if err != nil {
+			return FileInfo{}, fmt.Errorf("failed to convert PDF: %w", err)
+		}
+
+		return FileInfo{
+			Path:      path,
+			Extension: ".md",
+			Code:      content,
+		}, nil
+	}
+
+	// Handle non-PDF files
 	content, err := os.ReadFile(path)
 	if err != nil {
-		return FileInfo{}, fmt.Errorf("failed to read file %s: %w", path, err)
+		return FileInfo{}, fmt.Errorf("failed to read file: %w", err)
 	}
 
 	code := string(content)

diff --git a/pdf/pdf.go b/pdf/pdf.go
@@ -3,7 +3,6 @@
 package pdf
 
 import (
-	"bytes"
 	"fmt"
 	"io"
 	"net/http"
@@ -15,6 +14,7 @@ import (
 )
 
 // ConvertPDFToMarkdown converts a PDF file to markdown format
+
 func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
 	var reader io.ReadCloser
 	var err error
@@ -26,33 +26,31 @@ func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
 		}
 		defer reader.Close()
 
-		// Create a temporary file to store the PDF
 		tempFile, err := os.CreateTemp("", "ingest-*.pdf")
 		if err != nil {
 			return "", fmt.Errorf("failed to create temp file: %w", err)
 		}
 		defer os.Remove(tempFile.Name())
 		defer tempFile.Close()
 
-		// Copy the downloaded PDF to the temp file
 		if _, err := io.Copy(tempFile, reader); err != nil {
 			return "", fmt.Errorf("failed to save PDF: %w", err)
 		}
 
 		path = tempFile.Name()
 	}
 
-	// Open the PDF file
+	// Open and read the PDF
 	f, r, err := pdf.Open(path)
 	if err != nil {
 		return "", fmt.Errorf("failed to open PDF: %w", err)
 	}
 	defer f.Close()
 
-	var buf bytes.Buffer
+	var buf strings.Builder
 	buf.WriteString(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path)))
 
-	// Read each page
+	// Extract text from each page
 	totalPages := r.NumPage()
 	for pageNum := 1; pageNum <= totalPages; pageNum++ {
 		page := r.Page(pageNum)
@@ -65,13 +63,21 @@ func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
 			return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
 		}
 
-		// Add page header and content
-		buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum))
-		buf.WriteString(cleanText(text))
-		buf.WriteString("\n\n")
+		// Clean and process the text
+		cleanedText := cleanText(text)
+		if cleanedText != "" {
+			buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum))
+			buf.WriteString(cleanedText)
+			buf.WriteString("\n\n")
+		}
 	}
 
-	return buf.String(), nil
+	result := buf.String()
+	if strings.TrimSpace(result) == strings.TrimSpace(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path))) {
+		return "", fmt.Errorf("no text content could be extracted from PDF")
+	}
+
+	return result, nil
 }
 
 // IsPDF checks if a file is a PDF based on its content type or extension
@@ -80,7 +86,7 @@ func IsPDF(path string) (bool, error) {
 	if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
 		resp, err := http.Head(path)
 		if err != nil {
-			return false, err
+			return false, fmt.Errorf("failed to check URL for PDF: %w", err)
 		}
 		defer resp.Body.Close()
 		return resp.Header.Get("Content-Type") == "application/pdf", nil
@@ -89,19 +95,19 @@ func IsPDF(path string) (bool, error) {
 	// Check local file
 	file, err := os.Open(path)
 	if err != nil {
-		return false, err
+		return false, fmt.Errorf("failed to open file: %w", err)
 	}
 	defer file.Close()
 
 	// Read first 512 bytes to determine file type
 	buffer := make([]byte, 512)
-	_, err = file.Read(buffer)
+	n, err := file.Read(buffer)
 	if err != nil && err != io.EOF {
-		return false, err
+		return false, fmt.Errorf("failed to read file header: %w", err)
 	}
 
 	// Check file signature
-	contentType := http.DetectContentType(buffer)
+	contentType := http.DetectContentType(buffer[:n])
 	if contentType == "application/pdf" {
 		return true, nil
 	}
@@ -130,19 +136,37 @@ func downloadPDF(url string) (io.ReadCloser, error) {
 }
 
 func cleanText(text string) string {
-	// Remove excessive whitespace
-	text = strings.ReplaceAll(text, "\r", "")
-	text = strings.TrimSpace(text)
+	if strings.Contains(text, "%PDF-") || strings.Contains(text, "endobj") {
+		// This appears to be raw PDF data rather than extracted text
+		return ""
+	}
 
-	// Normalize line endings
+	// Remove control characters except newlines and tabs
+	text = strings.Map(func(r rune) rune {
+		if r < 32 && r != '\n' && r != '\t' {
+			return -1
+		}
+		return r
+	}, text)
+
+	// Split into lines and clean each line
 	lines := strings.Split(text, "\n")
 	var cleanLines []string
 
 	for _, line := range lines {
 		line = strings.TrimSpace(line)
-		if line != "" {
-			cleanLines = append(cleanLines, line)
+
+		// Skip empty lines and lines that look like PDF syntax
+		if line == "" ||
+			strings.HasPrefix(line, "%") ||
+			strings.HasPrefix(line, "/") ||
+			strings.Contains(line, "obj") ||
+			strings.Contains(line, "endobj") ||
+			strings.Contains(line, "stream") {
+			continue
 		}
+
+		cleanLines = append(cleanLines, line)
 	}
 
 	return strings.Join(cleanLines, "\n\n")

diff --git a/web/integration.go b/web/integration.go
@@ -21,33 +21,33 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 	// Check if URL points to a PDF
 	isPDF, err := pdf.IsPDF(urlStr)
 	if err != nil {
-			return nil, fmt.Errorf("error checking PDF: %w", err)
+		return nil, fmt.Errorf("error checking PDF: %w", err)
 	}
 
 	if isPDF {
-			content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
-			if err != nil {
-					return nil, fmt.Errorf("error converting PDF: %w", err)
-			}
+		content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
+		if err != nil {
+			return nil, fmt.Errorf("error converting PDF: %w", err)
+		}
 
-			return &CrawlResult{
-					TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
-					Files: []filesystem.FileInfo{{
-							Path:      urlStr,
-							Extension: ".md",
-							Code:      content,
-					}},
-			}, nil
+		return &CrawlResult{
+			TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
+			Files: []filesystem.FileInfo{{
+				Path:      urlStr,
+				Extension: ".md",
+				Code:      content,
+			}},
+		}, nil
 	}
 
 	// Validate URL
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
-			return nil, fmt.Errorf("invalid URL: %w", err)
+		return nil, fmt.Errorf("invalid URL: %w", err)
 	}
 
 	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-			return nil, fmt.Errorf("URL must start with http:// or https://")
+		return nil, fmt.Errorf("URL must start with http:// or https://")
 	}
 
 	// Initialize crawler with the start URL
@@ -57,7 +57,7 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 	// Perform crawl
 	pages, err := crawler.Crawl(urlStr)
 	if err != nil {
-			return nil, fmt.Errorf("crawl failed: %w", err)
+		return nil, fmt.Errorf("crawl failed: %w", err)
 	}
 
 	// Convert crawled pages to FileInfo format