// MiniappGoService/internal/application/utils/fileValidations.go

package utils

import (
	"archive/zip"
	"bytes"
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"strings"

	"gitea.cybertalant.ru/VisionCareerMiniapp/MiniappGoService/internal/application/constants"
	"github.com/pdfcpu/pdfcpu/pkg/api"
)

// maxFileSize is the largest size, in bytes, that the validators in this file
// accept for an uploaded file or for a single part inside a DOCX archive
// (5 * 1024 * 1024 bytes).
const maxFileSize = 5 * 1024 * 1024

// pdfFileValidation checks that fileData looks like a safe PDF document.
//
// The checks run in order: the data must be non-empty, no larger than
// maxFileSize, start with the "%PDF-" signature, be parseable by pdfcpu, and
// contain no object whose PDF-syntax rendering mentions JavaScript, RichMedia
// or Launch markers.
//
// It returns nil when every check passes and a descriptive error for the first
// check that fails.
func pdfFileValidation(fileData []byte) error {
	if len(fileData) == 0 {
		return errors.New("empty pdf file")
	}
	if len(fileData) > maxFileSize {
		return fmt.Errorf("pdf file exceeds max size of %d bytes", maxFileSize)
	}

	// Check file signature
	if !bytes.HasPrefix(fileData, []byte("%PDF-")) {
		return errors.New("file does not start with %PDF- header, not a valid pdf")
	}

	// Validate PDF structure using pdfcpu (checks cross-reference tables, trailer, etc.)
	ctx, err := api.ReadContext(bytes.NewReader(fileData), api.LoadConfiguration())
	if err != nil {
		return fmt.Errorf("pdf parse failed: %w", err)
	}
	if ctx == nil || ctx.XRefTable == nil {
		return errors.New("invalid pdf structure")
	}

	// Scan for potentially malicious content such as JavaScript or RichMedia objects
	for _, entry := range ctx.XRefTable.Table {
		if entry == nil || entry.Free || entry.Object == nil {
			continue
		}

		// PDFString renders the object the way it is written in a PDF file, so
		// names and dictionary keys keep their leading '/'. The generic %v
		// rendering is a debug format that omits the slash, which would keep
		// the markers below from ever matching a name token.
		rendered := entry.Object.PDFString()

		if strings.Contains(rendered, "/JavaScript") || strings.Contains(rendered, "/JS") {
			return errors.New("pdf contains JavaScript, potentially unsafe")
		}
		if strings.Contains(rendered, "/RichMedia") || strings.Contains(rendered, "/Launch") {
			return errors.New("pdf contains embedded media or launch actions")
		}
	}

	return nil
}

// docxFileValidation checks that fileData looks like a safe DOCX document.
//
// The data must be non-empty, no larger than maxFileSize, and a readable ZIP
// archive that contains the part "word/document.xml". The archive is rejected
// when any entry
//   - is named vbaProject.bin (macros),
//   - lives under word/embeddings/ (embedded objects),
//   - declares an uncompressed size above maxFileSize, or
//   - has an .xml name but content that does not start like XML.
//
// Entry names are compared case-insensitively and with '\' treated as '/' for
// the macro, embedding and .xml checks, because package part names are matched
// case-insensitively by consumers and a differently-cased name would otherwise
// slip past the block list. The main document part is still matched exactly.
//
// It returns nil when every check passes and a descriptive error otherwise.
func docxFileValidation(fileData []byte) error {
	if len(fileData) == 0 {
		return errors.New("empty docx file")
	}
	if len(fileData) > maxFileSize {
		return fmt.Errorf("docx file exceeds max size of %d bytes", maxFileSize)
	}

	// DOCX is a ZIP archive containing multiple XML files
	reader, err := zip.NewReader(bytes.NewReader(fileData), int64(len(fileData)))
	if err != nil {
		return errors.New("file is not a valid DOCX (invalid zip structure)")
	}

	hasDocumentXML := false

	for _, f := range reader.File {
		name := f.Name

		// Canonical form used only for the deny-list and suffix checks:
		// forward slashes, lower case.
		canonical := strings.ToLower(strings.ReplaceAll(name, `\`, "/"))

		// Check for the main document part
		if name == "word/document.xml" {
			hasDocumentXML = true
		}

		// Detect and block macro files
		if filepath.Base(canonical) == "vbaproject.bin" {
			return errors.New("docx contains macros (vbaProject.bin) — potentially unsafe")
		}

		// Detect and block embedded objects
		if strings.HasPrefix(canonical, "word/embeddings/") {
			return errors.New("docx contains embedded objects — potentially unsafe")
		}

		// Verify the declared size of the part (safety check)
		if f.UncompressedSize64 > uint64(maxFileSize) {
			return fmt.Errorf("docx part %s is too large", name)
		}

		// Only inspect XML parts
		if strings.HasSuffix(canonical, ".xml") {
			if err := validateDocxXMLPart(f); err != nil {
				return err
			}
		}
	}

	if !hasDocumentXML {
		return errors.New("missing main document.xml part in DOCX archive")
	}

	return nil
}

// validateDocxXMLPart reads at most the first 8 KB of the archive entry f and
// reports an error when that content cannot be the beginning of an XML
// document. An empty part is accepted. A leading UTF-8 byte order mark and
// leading whitespace are skipped before looking for the opening '<'.
func validateDocxXMLPart(f *zip.File) error {
	rc, err := f.Open()
	if err != nil {
		return fmt.Errorf("failed to open %s: %w", f.Name, err)
	}
	defer rc.Close()

	head, err := io.ReadAll(io.LimitReader(rc, 8192)) // read first 8KB for validation
	if err != nil {
		return fmt.Errorf("failed to read %s: %w", f.Name, err)
	}
	if len(head) == 0 {
		return nil
	}

	// A UTF-8 encoded XML file may legitimately begin with a byte order mark.
	head = bytes.TrimPrefix(head, []byte("\xef\xbb\xbf"))

	// Ensure XML files actually start with '<'
	if !bytes.HasPrefix(bytes.TrimSpace(head), []byte("<")) {
		return fmt.Errorf("file %s inside docx is not valid XML", f.Name)
	}
	return nil
}

// FileValidation validates fileData according to the extension of fileName.
//
// The extension is taken from fileName, lower-cased and stripped of its dot,
// then converted to a constants.FileExtension. If that value does not report
// itself as valid, an error is returned. PDF data is handed to
// pdfFileValidation and DOCX data to docxFileValidation; the result of that
// validator is returned unchanged. Any other extension yields an error.
func FileValidation(fileName string, fileData []byte) error {
	rawExt := strings.ToLower(filepath.Ext(fileName))
	// filepath.Ext yields either "" or a suffix whose only dot is the first
	// character, so trimming that prefix removes every dot.
	kind := constants.FileExtension(strings.TrimPrefix(rawExt, "."))

	if !kind.Valid() {
		return errors.New("invalid extension in file name")
	}

	if kind == constants.PdfFileExtension {
		return pdfFileValidation(fileData)
	}
	if kind == constants.DocxFileExtension {
		return docxFileValidation(fileData)
	}
	return errors.New("unsupported file extension")
}