// Origin: MiniappGoService/internal/application/utils/fileValidations.go
// (listed by the repository viewer as 144 lines, 3.8 KiB, Go)

package utils
import (
"archive/zip"
"bytes"
"errors"
"fmt"
"io"
"path/filepath"
"strings"
"gitea.cybertalant.ru/VisionCareerMiniapp/MiniappGoService/internal/application/constants"
"github.com/pdfcpu/pdfcpu/pkg/api"
)
// maxFileSize is the largest byte length the validators in this file accept,
// both for the whole payload and for a single declared DOCX part: 5 MiB.
const maxFileSize = 5 * 1024 * 1024
// pdfFileValidation decides whether fileData is an acceptable PDF payload.
//
// An error is returned when the payload is empty, is longer than maxFileSize,
// does not begin with the "%PDF-" signature, cannot be read by pdfcpu, or
// contains a non-free object whose "%v" rendering mentions one of the
// "/JavaScript", "/JS", "/RichMedia" or "/Launch" markers. A nil result means
// none of these checks tripped.
func pdfFileValidation(fileData []byte) error {
	// Cheap length guards come first so the parser never sees oversized input.
	switch size := len(fileData); {
	case size == 0:
		return errors.New("empty pdf file")
	case size > maxFileSize:
		return fmt.Errorf("pdf file exceeds max size of %d bytes", maxFileSize)
	}

	// Signature check: the payload has to open with the PDF header marker.
	if !bytes.HasPrefix(fileData, []byte("%PDF-")) {
		return errors.New("file does not start with %PDF- header, not a valid pdf")
	}

	// Let pdfcpu read the document; a read failure is treated as an invalid PDF.
	pdfCtx, err := api.ReadContext(bytes.NewReader(fileData), api.LoadConfiguration())
	if err != nil {
		return fmt.Errorf("pdf parse failed: %w", err)
	}
	if pdfCtx == nil {
		return errors.New("invalid pdf structure")
	}

	// Walk every cross-reference entry and look for risky markers in the
	// textual rendering of the object it holds.
	// NOTE(review): this relies on pdfcpu's "%v" output containing names with a
	// leading '/'; confirm against the pdfcpu object String() implementations.
	for _, entry := range pdfCtx.XRefTable.Table {
		if entry.Free || entry.Object == nil {
			continue
		}

		rendered := fmt.Sprintf("%v", entry.Object)
		switch {
		case strings.Contains(rendered, "/JavaScript"), strings.Contains(rendered, "/JS"):
			return errors.New("pdf contains JavaScript, potentially unsafe")
		case strings.Contains(rendered, "/RichMedia"), strings.Contains(rendered, "/Launch"):
			return errors.New("pdf contains embedded media or launch actions")
		}
	}

	return nil
}
// docxFileValidation checks that fileData is an acceptable DOCX payload.
//
// The data must be non-empty, at most maxFileSize bytes long, and a readable
// ZIP archive containing a "word/document.xml" entry. The archive is rejected
// when any entry:
//   - has the base name vbaProject.bin (compared case-insensitively),
//   - is located under "word/embeddings/",
//   - declares an uncompressed size greater than maxFileSize, or
//   - has a name ending in ".xml" whose content does not start with '<'.
//
// It returns nil when every check passes and a descriptive error otherwise.
func docxFileValidation(fileData []byte) error {
	if len(fileData) == 0 {
		return errors.New("empty docx file")
	}
	if len(fileData) > maxFileSize {
		return fmt.Errorf("docx file exceeds max size of %d bytes", maxFileSize)
	}

	// DOCX is a ZIP archive containing multiple XML files.
	reader, err := zip.NewReader(bytes.NewReader(fileData), int64(len(fileData)))
	if err != nil {
		return errors.New("file is not a valid DOCX (invalid zip structure)")
	}

	hasDocumentXML := false
	for _, f := range reader.File {
		name := f.Name

		// Remember whether the main document part is present.
		if name == "word/document.xml" {
			hasDocumentXML = true
		}

		// Detect and block macro files.
		if strings.EqualFold(filepath.Base(name), "vbaProject.bin") {
			return errors.New("docx contains macros (vbaProject.bin) — potentially unsafe")
		}

		// Detect and block embedded objects.
		if strings.HasPrefix(name, "word/embeddings/") {
			return errors.New("docx contains embedded objects — potentially unsafe")
		}

		// Reject parts whose declared uncompressed size exceeds the limit.
		if f.UncompressedSize64 > uint64(maxFileSize) {
			return fmt.Errorf("docx part %s is too large", name)
		}

		// Only XML parts get a content sniff.
		if strings.HasSuffix(name, ".xml") {
			if err := docxXMLPartValidation(f); err != nil {
				return err
			}
		}
	}

	if !hasDocumentXML {
		return errors.New("missing main document.xml part in DOCX archive")
	}
	return nil
}

// docxXMLPartValidation reads at most the first 8 KiB of the archive entry f
// and verifies that, after an optional UTF-8 byte order mark and leading
// whitespace, the content begins with '<'. An entry with no content passes.
func docxXMLPartValidation(f *zip.File) error {
	rc, err := f.Open()
	if err != nil {
		return fmt.Errorf("failed to open %s: %w", f.Name, err)
	}
	// The entry is only read, so a Close error carries no useful information.
	defer rc.Close()

	head, err := io.ReadAll(io.LimitReader(rc, 8192))
	if err != nil {
		return fmt.Errorf("failed to read %s: %w", f.Name, err)
	}
	if len(head) == 0 {
		return nil
	}

	// bytes.TrimSpace does not remove U+FEFF, so strip a UTF-8 BOM explicitly;
	// otherwise a well-formed part that starts with a BOM would be rejected.
	head = bytes.TrimPrefix(head, []byte("\xef\xbb\xbf"))
	if !bytes.HasPrefix(bytes.TrimSpace(head), []byte("<")) {
		return fmt.Errorf("file %s inside docx is not valid XML", f.Name)
	}
	return nil
}
// FileValidation validates fileData using the validator that matches the
// extension of fileName.
//
// The extension is taken from fileName, lower-cased, stripped of its leading
// dot and converted to a constants.FileExtension. If that value's Valid method
// reports false, an "invalid extension" error is returned. PDF and DOCX
// extensions are forwarded to pdfFileValidation and docxFileValidation
// respectively; any other extension yields an "unsupported" error.
func FileValidation(fileName string, fileData []byte) error {
	// filepath.Ext yields either "" or a string whose only dot is the first
	// character, so trimming that prefix leaves the bare extension.
	rawExt := strings.TrimPrefix(strings.ToLower(filepath.Ext(fileName)), ".")
	kind := constants.FileExtension(rawExt)

	if !kind.Valid() {
		return errors.New("invalid extension in file name")
	}

	if kind == constants.PdfFileExtension {
		return pdfFileValidation(fileData)
	}
	if kind == constants.DocxFileExtension {
		return docxFileValidation(fileData)
	}
	return errors.New("unsupported file extension")
}