144 lines
3.8 KiB
Go
144 lines
3.8 KiB
Go
package utils
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"gitea.cybertalant.ru/VisionCareerMiniapp/MiniappGoService/internal/application/constants"
|
|
"github.com/pdfcpu/pdfcpu/pkg/api"
|
|
)
|
|
|
|
// maxFileSize is the upper bound, in bytes, applied to an uploaded file as a
// whole and to the declared uncompressed size of any single part inside a
// DOCX archive.
const maxFileSize = 5 << 20 // 5 MiB
|
|
|
|
func pdfFileValidation(fileData []byte) error {
|
|
if len(fileData) == 0 {
|
|
return errors.New("empty pdf file")
|
|
}
|
|
if len(fileData) > maxFileSize {
|
|
return fmt.Errorf("pdf file exceeds max size of %d bytes", maxFileSize)
|
|
}
|
|
|
|
// Check file signature
|
|
if !bytes.HasPrefix(fileData, []byte("%PDF-")) {
|
|
return errors.New("file does not start with %PDF- header, not a valid pdf")
|
|
}
|
|
|
|
// Validate PDF structure using pdfcpu (checks cross-reference tables, trailer, etc.)
|
|
ctx, err := api.ReadContext(bytes.NewReader(fileData), api.LoadConfiguration())
|
|
if err != nil {
|
|
return fmt.Errorf("pdf parse failed: %w", err)
|
|
}
|
|
if ctx == nil {
|
|
return errors.New("invalid pdf structure")
|
|
}
|
|
|
|
// Scan for potentially malicious content such as JavaScript or RichMedia objects
|
|
for _, obj := range ctx.XRefTable.Table {
|
|
if obj.Free {
|
|
continue
|
|
}
|
|
if obj.Object != nil {
|
|
s := fmt.Sprintf("%v", obj.Object)
|
|
if strings.Contains(s, "/JavaScript") || strings.Contains(s, "/JS") {
|
|
return errors.New("pdf contains JavaScript, potentially unsafe")
|
|
}
|
|
if strings.Contains(s, "/RichMedia") || strings.Contains(s, "/Launch") {
|
|
return errors.New("pdf contains embedded media or launch actions")
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func docxFileValidation(fileData []byte) error {
|
|
if len(fileData) == 0 {
|
|
return errors.New("empty docx file")
|
|
}
|
|
if len(fileData) > maxFileSize {
|
|
return fmt.Errorf("docx file exceeds max size of %d bytes", maxFileSize)
|
|
}
|
|
|
|
// DOCX is a ZIP archive containing multiple XML files
|
|
reader, err := zip.NewReader(bytes.NewReader(fileData), int64(len(fileData)))
|
|
if err != nil {
|
|
return errors.New("file is not a valid DOCX (invalid zip structure)")
|
|
}
|
|
|
|
hasDocumentXML := false
|
|
|
|
for _, f := range reader.File {
|
|
name := f.Name
|
|
|
|
// Check for the main document part
|
|
if name == "word/document.xml" {
|
|
hasDocumentXML = true
|
|
}
|
|
|
|
// Detect and block macro files
|
|
if strings.EqualFold(filepath.Base(name), "vbaProject.bin") {
|
|
return errors.New("docx contains macros (vbaProject.bin) — potentially unsafe")
|
|
}
|
|
|
|
// Detect and block embedded objects
|
|
if strings.HasPrefix(name, "word/embeddings/") {
|
|
return errors.New("docx contains embedded objects — potentially unsafe")
|
|
}
|
|
|
|
// Verify file part size (safety check)
|
|
if f.UncompressedSize64 > 0 && f.UncompressedSize64 > uint64(maxFileSize) {
|
|
return fmt.Errorf("docx part %s is too large", name)
|
|
}
|
|
|
|
// Only inspect XML parts
|
|
if strings.HasSuffix(name, ".xml") {
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open %s: %w", name, err)
|
|
}
|
|
data, err := io.ReadAll(io.LimitReader(rc, 8192)) // read first 8KB for validation
|
|
rc.Close()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read %s: %w", name, err)
|
|
}
|
|
|
|
// Ensure XML files actually start with '<'
|
|
if len(data) > 0 && !bytes.HasPrefix(bytes.TrimSpace(data), []byte("<")) {
|
|
return fmt.Errorf("file %s inside docx is not valid XML", name)
|
|
}
|
|
}
|
|
}
|
|
|
|
if !hasDocumentXML {
|
|
return errors.New("missing main document.xml part in DOCX archive")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func FileValidation(fileName string, fileData []byte) error {
|
|
fileExtension := constants.FileExtension(strings.ReplaceAll(
|
|
strings.ToLower(filepath.Ext(fileName)),
|
|
".", "",
|
|
))
|
|
if !fileExtension.Valid() {
|
|
return errors.New("invalid extension in file name")
|
|
}
|
|
|
|
switch fileExtension {
|
|
case constants.PdfFileExtension:
|
|
return pdfFileValidation(fileData)
|
|
case constants.DocxFileExtension:
|
|
return docxFileValidation(fileData)
|
|
default:
|
|
return errors.New("unsupported file extension")
|
|
}
|
|
}
|