// insertr/lib/src/utils/html-preservation.js

/**
 * HTMLPreservationEngine - Direct HTML manipulation that keeps developer markup intact
 *
 * Handles the storage and application of HTML content while maintaining:
 * - Allow-listed element attributes (class, id, data-*, aria-*, etc.)
 * - Nested styled element structure
 * - The container element's own attributes (content editing never touches them)
 *
 * Editor-supplied HTML is treated as untrusted: it is parsed in an inert
 * document (so nothing loads or executes while it is inspected), tags outside
 * `allowedTags` are flattened to their text, and attributes outside
 * `allowedAttributes` (including every `on*` handler and `style`) are removed.
 */
export class HTMLPreservationEngine {
    constructor() {
        // Tags that survive sanitization; anything else is replaced by its text content.
        this.allowedTags = new Set([
            // Text formatting ('i' is also commonly used for icons)
            'strong', 'b', 'em', 'i', 'span', 'code', 'kbd', 'samp', 'var',
            // Line breaks (contenteditable surfaces emit these constantly)
            'br',
            // Links and interactive
            'a', 'button',
            // Structure
            'p', 'div', 'section', 'article', 'header', 'footer', 'nav',
            // Lists
            'ul', 'ol', 'li', 'dl', 'dt', 'dd',
            // Headings
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            // Media
            'img', 'figure', 'figcaption',
            // Quotes and citations
            'blockquote', 'cite', 'q',
            // Tables
            'table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td',
            // Inline elements
            'small', 'sub', 'sup', 'mark', 'del', 'ins'
        ]);

        // Attribute names that survive sanitization.
        // data-* and aria-* are always allowed and are handled by prefix in sanitizeAttributes().
        this.allowedAttributes = new Set([
            // Universal attributes
            'class', 'id', 'title', 'lang', 'dir',
            // Link attributes
            'href', 'rel', 'target', 'download',
            // Media attributes
            'src', 'alt', 'width', 'height',
            // Form attributes
            'type', 'value', 'placeholder', 'disabled', 'readonly',
            // Table attributes
            'colspan', 'rowspan', 'scope',
            // Other semantic attributes
            'datetime', 'cite'
        ]);

        // URL schemes accepted in href values; scheme-less (relative) URLs are always accepted.
        this.allowedUrlSchemes = new Set(['http', 'https', 'mailto', 'tel']);
    }

    /**
     * Extract content while preserving structure for editing
     *
     * @param {HTMLElement} element - The .insertr element to extract content from
     * @returns {Object} - Extracted content with preservation metadata
     */
    extractForEditing(element) {
        const currentHTML = element.innerHTML;

        return {
            // Complete HTML content for rich editing
            html: currentHTML,
            // Plain text for simple editing fallback
            text: this.extractPlainTextWithStructure(element),
            // Element's own attributes (never modified by content editing)
            containerAttributes: this.extractElementAttributes(element),
            // Original state for restoration if needed
            originalHTML: currentHTML,
            // Metadata for validation
            elementTag: element.tagName.toLowerCase(),
            hasNestedElements: element.children.length > 0
        };
    }

    /**
     * Apply edited content while preserving structure and validating safety
     *
     * Only the element's innerHTML is replaced; attributes on the element itself
     * (classes, IDs on the .insertr element) are left untouched.
     *
     * @param {HTMLElement} element - Target element to update
     * @param {string} newHTML - New HTML content from editor
     * @returns {boolean} - True on success, false if sanitizing or applying threw
     */
    applyFromEditing(element, newHTML) {
        try {
            // Sanitize first so the live element never receives unvetted markup
            element.innerHTML = this.validateAndSanitizeHTML(newHTML);
            return true;
        } catch (error) {
            console.error('Failed to apply HTML content:', error);
            return false;
        }
    }

    /**
     * Parse HTML into a <div> that belongs to an inert document.
     *
     * Assigning untrusted markup to innerHTML of an element owned by the live
     * document can fetch resources and fire inline handlers (e.g. `<img onerror>`)
     * even while the element is detached. A document created through
     * `document.implementation.createHTMLDocument()` has no browsing context, so
     * markup parsed there can be inspected and cleaned before anything runs.
     *
     * @param {string} html - HTML to parse
     * @returns {HTMLElement} - Container <div> holding the parsed nodes
     */
    createInertContainer(html) {
        const inertDocument = document.implementation.createHTMLDocument('');
        const container = inertDocument.createElement('div');
        container.innerHTML = html;
        return container;
    }

    /**
     * Validate and sanitize HTML to ensure safety and structure preservation
     *
     * @param {string} html - HTML to validate
     * @returns {string} - Sanitized HTML
     */
    validateAndSanitizeHTML(html) {
        const container = this.createInertContainer(html);

        // Recursively validate and clean
        this.sanitizeElement(container);

        return container.innerHTML;
    }

    /**
     * Recursively sanitize the descendants of an element (the element itself is not checked)
     *
     * @param {HTMLElement} element - Element whose children are sanitized in place
     */
    sanitizeElement(element) {
        // Snapshot the children: the loop replaces nodes while iterating
        for (const child of Array.from(element.children)) {
            if (!this.allowedTags.has(child.tagName.toLowerCase())) {
                // Disallowed tag: drop the element but keep its text.
                // replaceWith() turns the string into a text node, so it is never re-parsed as HTML.
                child.replaceWith(child.textContent);
                continue;
            }

            this.sanitizeAttributes(child);
            this.sanitizeElement(child);
        }
    }

    /**
     * Decide whether a URL is safe to keep in an href attribute.
     *
     * Scheme-less URLs (relative paths, queries, fragments, protocol-relative)
     * are accepted; URLs with a scheme are accepted only if the scheme is in
     * `allowedUrlSchemes`. The value is normalized the way URL parsers do before
     * scheme detection so that `java\tscript:` or leading control characters
     * cannot hide a dangerous scheme.
     *
     * @param {string} url - Raw attribute value
     * @returns {boolean} - True if the URL may be kept
     */
    isSafeUrl(url) {
        const normalized = String(url)
            // URL parsers strip leading/trailing C0 control characters and spaces...
            .replace(/^[\u0000-\u0020]+|[\u0000-\u0020]+$/g, '')
            // ...and ignore tabs and newlines anywhere in the input
            .replace(/[\t\n\r]/g, '')
            .toLowerCase();

        const schemeMatch = /^([a-z][a-z0-9+.-]*):/.exec(normalized);
        if (schemeMatch === null) {
            // No scheme: resolved relative to the current page
            return true;
        }

        return this.allowedUrlSchemes.has(schemeMatch[1]);
    }

    /**
     * Sanitize element attributes, removing dangerous ones
     *
     * Keeps data-* and aria-* attributes and anything in `allowedAttributes`;
     * href values must additionally pass isSafeUrl().
     *
     * @param {HTMLElement} element - Element to sanitize attributes for
     */
    sanitizeAttributes(element) {
        const attributesToRemove = [];

        for (const attr of Array.from(element.attributes)) {
            const attrName = attr.name.toLowerCase();

            // Always allow data-* and aria-* attributes
            if (attrName.startsWith('data-') || attrName.startsWith('aria-')) {
                continue;
            }

            const isAllowed = this.allowedAttributes.has(attrName);
            const hasUnsafeUrl = attrName === 'href' && !this.isSafeUrl(attr.value);

            if (!isAllowed || hasUnsafeUrl) {
                // Remove by the attribute's own name: on foreign (SVG/MathML) elements
                // removeAttribute() is case-sensitive, so the lower-cased copy could miss.
                attributesToRemove.push(attr.name);
            }
        }

        // Remove after the scan so the attribute list is not mutated mid-iteration
        attributesToRemove.forEach(name => {
            element.removeAttribute(name);
        });
    }

    /**
     * Extract plain text for simple editing interfaces
     *
     * For elements without child elements the raw textContent is returned
     * (untrimmed). Otherwise the text of direct text and element children is
     * concatenated (comments and other node types are skipped) and trimmed.
     *
     * @param {HTMLElement} element - Element to extract text from
     * @returns {string} - Plain text
     */
    extractPlainTextWithStructure(element) {
        // For simple elements, just return textContent
        if (element.children.length === 0) {
            return element.textContent;
        }

        let text = '';
        for (const node of element.childNodes) {
            const isTextOrElement =
                node.nodeType === Node.TEXT_NODE || node.nodeType === Node.ELEMENT_NODE;
            if (isTextOrElement) {
                text += node.textContent;
            }
        }

        return text.trim();
    }

    /**
     * Extract all attributes from element for preservation
     *
     * @param {HTMLElement} element - Element to extract attributes from
     * @returns {Object} - Map of attribute name to attribute value
     */
    extractElementAttributes(element) {
        const attributes = {};

        for (const attr of Array.from(element.attributes)) {
            attributes[attr.name] = attr.value;
        }

        return attributes;
    }

    /**
     * Restore element attributes (used for element-level preservation)
     *
     * Every current attribute except `contenteditable` is removed, then the
     * saved attributes are set (a saved `contenteditable` overwrites the current one).
     *
     * @param {HTMLElement} element - Element to restore attributes to
     * @param {Object} attributes - Attributes to restore (name -> value)
     */
    restoreElementAttributes(element, attributes) {
        // Snapshot first: removing attributes mutates element.attributes
        Array.from(element.attributes).forEach(attr => {
            if (attr.name !== 'contenteditable') { // Preserve editing state
                element.removeAttribute(attr.name);
            }
        });

        // Restore saved attributes
        Object.entries(attributes).forEach(([name, value]) => {
            element.setAttribute(name, value);
        });
    }

    /**
     * Check whether HTML is free of script/object/embed/iframe elements
     *
     * This is a coarse pre-check only; it does not inspect attributes.
     * Use validateAndSanitizeHTML() to obtain markup that is safe to insert.
     *
     * @param {string} html - HTML to validate
     * @returns {boolean} - False if a blocked element is present or parsing threw
     */
    isValidHTML(html) {
        try {
            const container = this.createInertContainer(html);

            // Check for script tags or other dangerous elements
            return container.querySelector('script, object, embed, iframe') === null;
        } catch (error) {
            return false;
        }
    }

    /**
     * Create a copy of HTML content for editing with unsafe attributes removed
     *
     * Only attributes are sanitized here; elements are kept regardless of tag.
     *
     * @param {string} html - Original HTML
     * @returns {string} - Copy for editing
     */
    createEditableCopy(html) {
        const container = this.createInertContainer(html);

        // Remove any potentially dangerous attributes
        container.querySelectorAll('*').forEach(element => {
            this.sanitizeAttributes(element);
        });

        return container.innerHTML;
    }

    /**
     * Merge edited content back while preserving specific styled elements
     * Used for complex editing scenarios where certain elements must be preserved
     *
     * For each selector, the n-th match in the edited HTML is replaced by a deep
     * clone of the n-th match in the original HTML. Matches are paired purely by
     * position; original matches without an edited counterpart are ignored.
     * The result is not sanitized by this method.
     *
     * @param {string} originalHTML - Original HTML content
     * @param {string} editedHTML - Edited HTML content
     * @param {Array} preserveSelectors - CSS selectors for elements to preserve
     * @returns {string} - Merged HTML with preserved elements
     */
    mergeWithPreservation(originalHTML, editedHTML, preserveSelectors = []) {
        if (preserveSelectors.length === 0) {
            return editedHTML;
        }

        const originalContainer = this.createInertContainer(originalHTML);
        const editedContainer = this.createInertContainer(editedHTML);

        preserveSelectors.forEach(selector => {
            const originalElements = originalContainer.querySelectorAll(selector);
            const editedElements = editedContainer.querySelectorAll(selector);

            // Replace edited elements with original preserved ones
            originalElements.forEach((originalEl, index) => {
                const editedEl = editedElements[index];
                if (editedEl) {
                    editedEl.replaceWith(originalEl.cloneNode(true));
                }
            });
        });

        return editedContainer.innerHTML;
    }

    /**
     * Convert HTML content to safe editing format
     *
     * @param {HTMLElement} element - Element containing content to prepare
     * @returns {Object} - Result of extractForEditing() plus `editableHTML`
     *                     (attribute-sanitized copy) and `isComplex`
     */
    prepareForEditing(element) {
        const extracted = this.extractForEditing(element);

        return {
            ...extracted,
            // Attribute-sanitized copy handed to the editor
            editableHTML: this.createEditableCopy(extracted.html),
            isComplex: extracted.hasNestedElements
        };
    }

    /**
     * Finalize edited content and apply to element
     * Handles validation, sanitization, and safe application
     *
     * Accepts either an HTML string or an object with a string `html` property.
     * An empty string is valid input and clears the element's content.
     *
     * @param {HTMLElement} element - Target element
     * @param {string|Object} editedContent - Content from editor
     * @returns {boolean} - True if content was applied, false for unsupported input or failure
     */
    finalizeEditing(element, editedContent) {
        try {
            if (typeof editedContent === 'string') {
                // Simple text or HTML string
                return this.applyFromEditing(element, editedContent);
            }

            const isRichContent =
                editedContent !== null &&
                typeof editedContent === 'object' &&
                typeof editedContent.html === 'string';
            if (isRichContent) {
                // Rich content object; '' is allowed so editors can clear content
                return this.applyFromEditing(element, editedContent.html);
            }

            return false;
        } catch (error) {
            console.error('Failed to finalize editing:', error);
            return false;
        }
    }
}

// Shared module-level instance, created once when this module is first imported.
// Construction only builds the tag/attribute allow-list sets, so no DOM access happens at import time.
export const htmlPreservationEngine = new HTMLPreservationEngine();