diff --git a/filesystem/filesystem.go b/filesystem/filesystem.go
index ddfd5e0..e4c3759 100644
--- a/filesystem/filesystem.go
+++ b/filesystem/filesystem.go
@@ -16,6 +16,7 @@ import (
 	"github.com/fatih/color"
 	"github.com/mitchellh/go-homedir"
 	ignore "github.com/sabhiram/go-gitignore"
+	"github.com/sammcj/ingest/pdf"
 	"github.com/sammcj/ingest/utils"
 )
 
@@ -286,6 +287,35 @@ func PrintDefaultExcludes() {
 }
 
 func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
+	// Check if file is a PDF
+	isPDF, err := pdf.IsPDF(path)
+	if err != nil {
+		utils.PrintColouredMessage("!", fmt.Sprintf("Failed to check if file is PDF %s: %v", path, err), color.FgRed)
+		return
+	}
+
+	if isPDF {
+		content, err := pdf.ConvertPDFToMarkdown(path, false)
+		if err != nil {
+			utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed)
+			return
+		}
+
+		filePath := path
+		if relativePaths {
+			filePath = filepath.Join(filepath.Base(rootPath), relPath)
+		}
+
+		mu.Lock()
+		*files = append(*files, FileInfo{
+			Path:      filePath,
+			Extension: ".md",
+			Code:      content,
+		})
+		mu.Unlock()
+		return
+	}
+
 	// Check if the file is binary
 	isBinary, err := isBinaryFile(path)
 	if err != nil {
diff --git a/go.mod b/go.mod
index 89185e5..fd24364 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
 	github.com/bmatcuk/doublestar/v4 v4.7.1
 	github.com/charmbracelet/glamour v0.8.0
 	github.com/fatih/color v1.18.0
+	github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
 	github.com/mitchellh/go-homedir v1.1.0
 	github.com/pkoukk/tiktoken-go v0.1.7
 	github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06
diff --git a/go.sum b/go.sum
index 0831075..15c429c 100644
--- a/go.sum
+++ b/go.sum
@@ -59,6 +59,8 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
+github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
 github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
diff --git a/pdf/pdf.go b/pdf/pdf.go
new file mode 100644
--- /dev/null
+++ b/pdf/pdf.go
@@ -0,0 +1,199 @@
+// pdf/pdf.go
+
+// Package pdf detects PDF documents and converts their text to markdown.
+package pdf
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"mime"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/ledongthuc/pdf"
+)
+
+// pdfMediaType is the media type that identifies a PDF document.
+const pdfMediaType = "application/pdf"
+
+// ConvertPDFToMarkdown extracts the text of a PDF and renders it as markdown:
+// a "# PDF Content: NAME" heading followed by one "## Page N" section for
+// every non-null page.
+//
+// When isURL is true, path is fetched over HTTP into a temporary file that is
+// removed before returning; otherwise path names a local file. NAME is always
+// derived from the path the caller supplied, never from the temporary file.
+func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
+	// Capture the title before path is redirected to the temporary file.
+	title := documentTitle(path, isURL)
+
+	if isURL {
+		tempPath, err := downloadToTempFile(path)
+		if err != nil {
+			return "", err
+		}
+		defer os.Remove(tempPath)
+		path = tempPath
+	}
+
+	// Open the PDF file
+	f, r, err := pdf.Open(path)
+	if err != nil {
+		return "", fmt.Errorf("failed to open PDF: %w", err)
+	}
+	defer f.Close()
+
+	var buf bytes.Buffer
+	fmt.Fprintf(&buf, "# PDF Content: %s\n\n", title)
+
+	// Read each page
+	totalPages := r.NumPage()
+	for pageNum := 1; pageNum <= totalPages; pageNum++ {
+		page := r.Page(pageNum)
+		if page.V.IsNull() {
+			continue
+		}
+
+		text, err := page.GetPlainText(nil)
+		if err != nil {
+			return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
+		}
+
+		// Add page header and content
+		fmt.Fprintf(&buf, "## Page %d\n\n", pageNum)
+		buf.WriteString(cleanText(text))
+		buf.WriteString("\n\n")
+	}
+
+	return buf.String(), nil
+}
+
+// documentTitle returns the last element of path for use in the heading.
+// For URLs the query string and fragment are dropped first.
+func documentTitle(path string, isURL bool) string {
+	if isURL {
+		if i := strings.IndexAny(path, "?#"); i >= 0 {
+			path = path[:i]
+		}
+	}
+	return filepath.Base(path)
+}
+
+// downloadToTempFile downloads the PDF at url into a new temporary file and
+// returns that file's path. The caller is responsible for removing the file.
+func downloadToTempFile(url string) (string, error) {
+	body, err := downloadPDF(url)
+	if err != nil {
+		return "", fmt.Errorf("failed to download PDF: %w", err)
+	}
+	defer body.Close()
+
+	tempFile, err := os.CreateTemp("", "ingest-*.pdf")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp file: %w", err)
+	}
+
+	// Copy the downloaded PDF to the temp file
+	_, err = io.Copy(tempFile, body)
+	// Close before the file is reopened for parsing, and surface a failed
+	// close instead of handing a possibly truncated file to the parser.
+	if closeErr := tempFile.Close(); err == nil {
+		err = closeErr
+	}
+	if err != nil {
+		os.Remove(tempFile.Name())
+		return "", fmt.Errorf("failed to save PDF: %w", err)
+	}
+
+	return tempFile.Name(), nil
+}
+
+// IsPDF reports whether path refers to a PDF document.
+//
+// For http:// and https:// URLs it sends a HEAD request and inspects the
+// Content-Type header. For local files it sniffs the leading bytes of the
+// file and falls back to a case-insensitive ".pdf" extension check.
+func IsPDF(path string) (bool, error) {
+	// Check if it's a URL
+	if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
+		resp, err := http.Head(path)
+		if err != nil {
+			return false, err
+		}
+		defer resp.Body.Close()
+		return isPDFContentType(resp.Header.Get("Content-Type")), nil
+	}
+
+	// Check local file
+	file, err := os.Open(path)
+	if err != nil {
+		return false, err
+	}
+	defer file.Close()
+
+	// Read first 512 bytes to determine file type
+	buffer := make([]byte, 512)
+	n, err := file.Read(buffer)
+	if err != nil && err != io.EOF {
+		return false, err
+	}
+
+	// Check file signature, sniffing only the bytes that were actually read
+	if http.DetectContentType(buffer[:n]) == pdfMediaType {
+		return true, nil
+	}
+
+	// Also check file extension
+	return strings.ToLower(filepath.Ext(path)) == ".pdf", nil
+}
+
+// isPDFContentType reports whether a Content-Type header value names the PDF
+// media type, ignoring letter case and parameters such as charset.
+func isPDFContentType(contentType string) bool {
+	// The error is deliberately ignored: an unparsable header yields an empty
+	// media type, and malformed parameters still return the media type itself.
+	mediaType, _, _ := mime.ParseMediaType(contentType)
+	return mediaType == pdfMediaType
+}
+
+// downloadPDF fetches url and returns the response body, which the caller
+// must close. It fails unless the server answers 200 OK with a PDF
+// Content-Type.
+func downloadPDF(url string) (io.ReadCloser, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		resp.Body.Close()
+		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+	}
+
+	if !isPDFContentType(resp.Header.Get("Content-Type")) {
+		resp.Body.Close()
+		return nil, fmt.Errorf("URL does not point to a PDF file")
+	}
+
+	return resp.Body, nil
+}
+
+// cleanText tidies text extracted from a PDF page: carriage returns are
+// removed, every line is trimmed, blank lines are dropped, and the remaining
+// lines are joined with a blank line between them.
+func cleanText(text string) string {
+	text = strings.ReplaceAll(text, "\r", "")
+
+	lines := strings.Split(text, "\n")
+	cleanLines := make([]string, 0, len(lines))
+	for _, line := range lines {
+		if line = strings.TrimSpace(line); line != "" {
+			cleanLines = append(cleanLines, line)
+		}
+	}
+
+	return strings.Join(cleanLines, "\n\n")
+}
diff --git a/web/integration.go b/web/integration.go
index 4c7bf65..a300e66 100644
--- a/web/integration.go
+++ b/web/integration.go
@@ -9,6 +9,7 @@ import (
 	"strings"
 
 	"github.com/sammcj/ingest/filesystem"
+	"github.com/sammcj/ingest/pdf"
 )
 
 type CrawlResult struct {
@@ -17,24 +18,46 @@ type CrawlResult struct {
 }
 
 func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string) (*CrawlResult, error) {
+	// Check if URL points to a PDF
+	isPDF, err := pdf.IsPDF(urlStr)
+	if err != nil {
+		return nil, fmt.Errorf("error checking PDF: %w", err)
+	}
+
+	if isPDF {
+		content, err := pdf.ConvertPDFToMarkdown(urlStr, true)
+		if err != nil {
+			return nil, fmt.Errorf("error converting PDF: %w", err)
+		}
+
+		return &CrawlResult{
+			TreeString: fmt.Sprintf("PDF Document: %s", urlStr),
+			Files: []filesystem.FileInfo{{
+				Path:      urlStr,
+				Extension: ".md",
+				Code:      content,
+			}},
+		}, nil
+	}
+
 	// Validate URL
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
-		return nil, fmt.Errorf("invalid URL: %w", err)
+		return nil, fmt.Errorf("invalid URL: %w", err)
 	}
 
 	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return nil, fmt.Errorf("URL must start with http:// or https://")
+		return nil, fmt.Errorf("URL must start with http:// or https://")
 	}
 
-	// Initialise crawler with the start URL
+	// Initialize crawler with the start URL
 	crawler := NewCrawler(options, urlStr)
 	crawler.SetExcludePatterns(excludePatterns)
 
 	// Perform crawl
 	pages, err := crawler.Crawl(urlStr)
 	if err != nil {
-		return nil, fmt.Errorf("crawl failed: %w", err)
+		return nil, fmt.Errorf("crawl failed: %w", err)
 	}
 
 	// Convert crawled pages to FileInfo format