Skip to content

Commit

Permalink
feat: pdf ingestion (#38)
Browse files Browse the repository at this point in the history
  • Loading branch information
sammcj authored Oct 28, 2024
1 parent f7ede0b commit 1b4f38e
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 4 deletions.
30 changes: 30 additions & 0 deletions filesystem/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/fatih/color"
"github.com/mitchellh/go-homedir"
ignore "github.com/sabhiram/go-gitignore"
"github.com/sammcj/ingest/pdf"
"github.com/sammcj/ingest/utils"
)

Expand Down Expand Up @@ -286,6 +287,35 @@ func PrintDefaultExcludes() {
}

func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
// Check if file is a PDF
isPDF, err := pdf.IsPDF(path)
if err != nil {
utils.PrintColouredMessage("!", fmt.Sprintf("Failed to check if file is PDF %s: %v", path, err), color.FgRed)
return
}

if isPDF {
content, err := pdf.ConvertPDFToMarkdown(path, false)
if err != nil {
utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed)
return
}

filePath := path
if relativePaths {
filePath = filepath.Join(filepath.Base(rootPath), relPath)
}

mu.Lock()
*files = append(*files, FileInfo{
Path: filePath,
Extension: ".md",
Code: content,
})
mu.Unlock()
return
}

// Check if the file is binary
isBinary, err := isBinaryFile(path)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/bmatcuk/doublestar/v4 v4.7.1
github.com/charmbracelet/glamour v0.8.0
github.com/fatih/color v1.18.0
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
github.com/mitchellh/go-homedir v1.1.0
github.com/pkoukk/tiktoken-go v0.1.7
github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
Expand Down
149 changes: 149 additions & 0 deletions pdf/pdf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// pdf/pdf.go

package pdf

import (
"bytes"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"

"github.com/ledongthuc/pdf"
)

// ConvertPDFToMarkdown extracts the text of a PDF and renders it as markdown.
//
// path is a local file path or, when isURL is true, a URL that is first
// downloaded into a temporary file (removed again before returning).
//
// The result starts with a "# PDF Content: <name>" heading, where <name> is
// the last element of the path or URL the caller passed in, followed by one
// "## Page N" section per non-null page.
//
// An error is returned when the download, the temporary file handling,
// opening the PDF, or the text extraction of any page fails.
func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
	// Capture the display name up front: for URLs, path is replaced below by
	// a randomly named temp file, which must not leak into the heading.
	title := filepath.Base(path)

	if isURL {
		reader, err := downloadPDF(path)
		if err != nil {
			return "", fmt.Errorf("failed to download PDF: %w", err)
		}
		defer reader.Close()

		// Create a temporary file to store the PDF
		tempFile, err := os.CreateTemp("", "ingest-*.pdf")
		if err != nil {
			return "", fmt.Errorf("failed to create temp file: %w", err)
		}
		defer os.Remove(tempFile.Name())

		// Copy the downloaded PDF to the temp file and close it right away so
		// that a write error surfacing at close time is not silently lost.
		_, copyErr := io.Copy(tempFile, reader)
		closeErr := tempFile.Close()
		if copyErr != nil {
			return "", fmt.Errorf("failed to save PDF: %w", copyErr)
		}
		if closeErr != nil {
			return "", fmt.Errorf("failed to save PDF: %w", closeErr)
		}

		path = tempFile.Name()
	}

	// Open the PDF file
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("failed to open PDF: %w", err)
	}
	defer f.Close()

	var buf bytes.Buffer
	fmt.Fprintf(&buf, "# PDF Content: %s\n\n", title)

	// Read each page (page numbers are 1-based)
	totalPages := r.NumPage()
	for pageNum := 1; pageNum <= totalPages; pageNum++ {
		page := r.Page(pageNum)
		if page.V.IsNull() {
			// Null page object: nothing to extract, skip it.
			continue
		}

		text, err := page.GetPlainText(nil)
		if err != nil {
			return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
		}

		// Add page header and content
		fmt.Fprintf(&buf, "## Page %d\n\n", pageNum)
		buf.WriteString(cleanText(text))
		buf.WriteString("\n\n")
	}

	return buf.String(), nil
}

// isPDFContentType reports whether a Content-Type header value names the
// application/pdf media type. Media type parameters (for example
// "; charset=binary") and letter case are ignored.
func isPDFContentType(header string) bool {
	mediaType := strings.SplitN(header, ";", 2)[0]
	return strings.EqualFold(strings.TrimSpace(mediaType), "application/pdf")
}

// IsPDF reports whether path refers to a PDF.
//
// If path starts with "http://" or "https://", a HEAD request is issued and
// the decision is based on the response's Content-Type header. Otherwise path
// is opened as a local file: it is a PDF when its first bytes (up to 512) are
// detected as application/pdf, or, failing that, when its extension is ".pdf"
// (case-insensitive).
//
// An error is returned when the HEAD request fails or the file cannot be
// opened or read.
func IsPDF(path string) (bool, error) {
	// Check if it's a URL
	if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
		resp, err := http.Head(path)
		if err != nil {
			return false, err
		}
		defer resp.Body.Close()
		return isPDFContentType(resp.Header.Get("Content-Type")), nil
	}

	// Check local file
	file, err := os.Open(path)
	if err != nil {
		return false, err
	}
	defer file.Close()

	// Read up to the first 512 bytes to determine the file type.
	buffer := make([]byte, 512)
	n, err := file.Read(buffer)
	if err != nil && err != io.EOF {
		return false, err
	}

	// Check file signature, sniffing only the bytes actually read so that
	// the zero padding of a short file is not treated as file content.
	if http.DetectContentType(buffer[:n]) == "application/pdf" {
		return true, nil
	}

	// Also check file extension
	return strings.ToLower(filepath.Ext(path)) == ".pdf", nil
}

// downloadPDF fetches url with an HTTP GET and returns the response body.
//
// The caller owns the returned body and must close it. An error is returned
// (and the body closed here) when the request fails, the status is not
// 200 OK, or the Content-Type is not application/pdf.
func downloadPDF(url string) (io.ReadCloser, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("failed to download PDF: status code %d", resp.StatusCode)
	}

	if !isPDFContentType(resp.Header.Get("Content-Type")) {
		resp.Body.Close()
		return nil, fmt.Errorf("URL does not point to a PDF file")
	}

	return resp.Body, nil
}

// cleanText tidies extracted page text for markdown output: carriage returns
// are removed, every line is stripped of surrounding whitespace, blank lines
// are dropped, and the remaining lines are joined with a blank line between
// them. Input containing no non-blank line yields the empty string.
func cleanText(text string) string {
	var out strings.Builder

	for _, raw := range strings.Split(strings.ReplaceAll(text, "\r", ""), "\n") {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		// Separator goes before every kept line except the first.
		if out.Len() > 0 {
			out.WriteString("\n\n")
		}
		out.WriteString(trimmed)
	}

	return out.String()
}
31 changes: 27 additions & 4 deletions web/integration.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strings"

"github.com/sammcj/ingest/filesystem"
"github.com/sammcj/ingest/pdf"
)

type CrawlResult struct {
Expand All @@ -17,24 +18,46 @@ type CrawlResult struct {
}

func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string) (*CrawlResult, error) {
// Check if URL points to a PDF
isPDF, err := pdf.IsPDF(urlStr)
if err != nil {
return nil, fmt.Errorf("error checking PDF: %w", err)
}

if isPDF {
content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
if err != nil {
return nil, fmt.Errorf("error converting PDF: %w", err)
}

return &CrawlResult{
TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
Files: []filesystem.FileInfo{{
Path: urlStr,
Extension: ".md",
Code: content,
}},
}, nil
}

// Validate URL
parsedURL, err := url.Parse(urlStr)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
return nil, fmt.Errorf("invalid URL: %w", err)
}

if !strings.HasPrefix(parsedURL.Scheme, "http") {
return nil, fmt.Errorf("URL must start with http:// or https://")
return nil, fmt.Errorf("URL must start with http:// or https://")
}

// Initialise crawler with the start URL
// Initialize crawler with the start URL
crawler := NewCrawler(options, urlStr)
crawler.SetExcludePatterns(excludePatterns)

// Perform crawl
pages, err := crawler.Crawl(urlStr)
if err != nil {
return nil, fmt.Errorf("crawl failed: %w", err)
return nil, fmt.Errorf("crawl failed: %w", err)
}

// Convert crawled pages to FileInfo format
Expand Down

0 comments on commit 1b4f38e

Please sign in to comment.