package utils

import (
	"archive/zip"
	"bytes"
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"strings"

	"gitea.cybertalant.ru/VisionCareerMiniapp/MiniappGoService/internal/application/constants"
	"github.com/pdfcpu/pdfcpu/pkg/api"
)

const (
	// maxFileSize is the largest accepted upload, and also the largest
	// accepted declared uncompressed size of a single DOCX part: 5 MB.
	maxFileSize = 5 << 20
)

// pdfFileValidation checks whether fileData is an acceptable PDF upload.
//
// It returns an error when the data is empty, is larger than maxFileSize,
// does not begin with the "%PDF-" signature, cannot be read by pdfcpu, or
// when the textual rendering of any in-use cross-reference entry contains
// one of the markers "/JavaScript", "/JS", "/RichMedia" or "/Launch".
// It returns nil otherwise.
func pdfFileValidation(fileData []byte) error {
	if len(fileData) == 0 {
		return errors.New("empty pdf file")
	}
	if len(fileData) > maxFileSize {
		return fmt.Errorf("pdf file exceeds max size of %d bytes", maxFileSize)
	}

	// Cheap signature check before handing the data to the parser.
	if !bytes.HasPrefix(fileData, []byte("%PDF-")) {
		return errors.New("file does not start with %PDF- header, not a valid pdf")
	}

	// Let pdfcpu read the document; any read failure rejects the file.
	ctx, err := api.ReadContext(bytes.NewReader(fileData), api.LoadConfiguration())
	if err != nil {
		return fmt.Errorf("pdf parse failed: %w", err)
	}
	// Guard the table as well as the context so the scan below cannot
	// dereference a nil pointer.
	if ctx == nil || ctx.XRefTable == nil {
		return errors.New("invalid pdf structure")
	}

	// Scan every in-use object for markers of active content.
	// NOTE(review): this is a substring match on the "%v" rendering of each
	// object. It assumes that rendering includes the leading "/" of PDF
	// names, and "/JS" also matches any longer name starting with "/JS" —
	// confirm both against pdfcpu's String implementations.
	for _, entry := range ctx.XRefTable.Table {
		if entry == nil || entry.Free || entry.Object == nil {
			continue
		}

		rendered := fmt.Sprintf("%v", entry.Object)
		if strings.Contains(rendered, "/JavaScript") || strings.Contains(rendered, "/JS") {
			return errors.New("pdf contains JavaScript, potentially unsafe")
		}
		if strings.Contains(rendered, "/RichMedia") || strings.Contains(rendered, "/Launch") {
			return errors.New("pdf contains embedded media or launch actions")
		}
	}

	return nil
}

// validateDocxXMLPart opens one archive entry and checks that the start of
// its content looks like XML: after an optional UTF-8 byte-order mark and
// leading whitespace, the first byte must be '<'. An entry that yields no
// data at all is accepted. Only the first 8 KB of the entry are read.
func validateDocxXMLPart(f *zip.File) error {
	// sniffSize bounds how much of each part is decompressed for the check.
	const sniffSize = 8192

	rc, err := f.Open()
	if err != nil {
		return fmt.Errorf("failed to open %s: %w", f.Name, err)
	}
	defer rc.Close()

	data, err := io.ReadAll(io.LimitReader(rc, sniffSize))
	if err != nil {
		return fmt.Errorf("failed to read %s: %w", f.Name, err)
	}
	if len(data) == 0 {
		return nil
	}

	// A UTF-8 BOM may legitimately precede the XML declaration; TrimSpace
	// does not remove it, so strip it explicitly before looking for '<'.
	data = bytes.TrimPrefix(data, []byte("\xEF\xBB\xBF"))
	if !bytes.HasPrefix(bytes.TrimSpace(data), []byte("<")) {
		return fmt.Errorf("file %s inside docx is not valid XML", f.Name)
	}
	return nil
}

// docxFileValidation checks whether fileData is an acceptable DOCX upload.
//
// It returns an error when the data is empty, is larger than maxFileSize,
// is not a readable ZIP archive, contains an entry whose base name is
// "vbaProject.bin" (case-insensitive), contains an entry under
// "word/embeddings/" (case-insensitive), declares an entry larger than
// maxFileSize, contains a ".xml" entry that does not look like XML, or
// lacks a "word/document.xml" entry. It returns nil otherwise.
func docxFileValidation(fileData []byte) error {
	if len(fileData) == 0 {
		return errors.New("empty docx file")
	}
	if len(fileData) > maxFileSize {
		return fmt.Errorf("docx file exceeds max size of %d bytes", maxFileSize)
	}

	// A DOCX file is a ZIP archive of parts, most of them XML.
	reader, err := zip.NewReader(bytes.NewReader(fileData), int64(len(fileData)))
	if err != nil {
		return errors.New("file is not a valid DOCX (invalid zip structure)")
	}

	hasDocumentXML := false
	for _, f := range reader.File {
		name := f.Name

		// The main document part must be present.
		if name == "word/document.xml" {
			hasDocumentXML = true
		}

		// Block macro storage.
		if strings.EqualFold(filepath.Base(name), "vbaProject.bin") {
			return errors.New("docx contains macros (vbaProject.bin) — potentially unsafe")
		}

		// Block embedded objects. The comparison is case-insensitive so that
		// a differently-cased directory name cannot slip past the check.
		if strings.HasPrefix(strings.ToLower(name), "word/embeddings/") {
			return errors.New("docx contains embedded objects — potentially unsafe")
		}

		// Reject parts whose declared uncompressed size is over the limit.
		if f.UncompressedSize64 > uint64(maxFileSize) {
			return fmt.Errorf("docx part %s is too large", name)
		}

		// Only parts named *.xml are content-sniffed.
		if strings.HasSuffix(name, ".xml") {
			if err := validateDocxXMLPart(f); err != nil {
				return err
			}
		}
	}

	if !hasDocumentXML {
		return errors.New("missing main document.xml part in DOCX archive")
	}
	return nil
}

// FileValidation validates an uploaded file based on the extension of
// fileName.
//
// The extension is taken from fileName, lower-cased, and stripped of its
// leading dot. An extension for which constants.FileExtension.Valid reports
// false yields an error. constants.PdfFileExtension is dispatched to
// pdfFileValidation and constants.DocxFileExtension to docxFileValidation;
// any other valid extension yields an "unsupported file extension" error.
func FileValidation(fileName string, fileData []byte) error {
	// filepath.Ext returns at most one dot, at the start, so trimming the
	// prefix is enough to get the bare extension.
	ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(fileName)), ".")
	fileExtension := constants.FileExtension(ext)

	if !fileExtension.Valid() {
		return errors.New("invalid extension in file name")
	}

	switch fileExtension {
	case constants.PdfFileExtension:
		return pdfFileValidation(fileData)
	case constants.DocxFileExtension:
		return docxFileValidation(fileData)
	default:
		return errors.New("unsupported file extension")
	}
}