package content import ( "fmt" "io/fs" "os" "path/filepath" "strings" "github.com/insertr/insertr/internal/engine" "golang.org/x/net/html" ) // Discoverer handles automatic discovery of editable elements in HTML type Discoverer struct { // Element discovery is now self-contained and configurable } // NewDiscoverer creates a new Discoverer instance func NewDiscoverer() *Discoverer { return &Discoverer{} } // DiscoveryResult contains statistics about element discovery type DiscoveryResult struct { FilesProcessed int ElementsEnhanced int ContainersAdded int IndividualsAdded int SkippedFiles []string EnhancedFiles []string } // DiscoverDirectory discovers editable elements in all HTML files in a directory func (disc *Discoverer) DiscoverDirectory(inputDir, outputDir string, aggressive bool) (*DiscoveryResult, error) { result := &DiscoveryResult{ SkippedFiles: []string{}, EnhancedFiles: []string{}, } // Create output directory if it doesn't exist if err := os.MkdirAll(outputDir, 0755); err != nil { return nil, fmt.Errorf("failed to create output directory: %w", err) } err := filepath.WalkDir(inputDir, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } // Skip directories if d.IsDir() { return nil } // Only process HTML files if !strings.HasSuffix(strings.ToLower(path), ".html") { // Copy non-HTML files as-is return disc.copyFile(path, inputDir, outputDir) } // Discover elements in HTML file enhanced, err := disc.discoverFile(path, aggressive) if err != nil { result.SkippedFiles = append(result.SkippedFiles, path) // Copy original file on error return disc.copyFile(path, inputDir, outputDir) } // Write enhanced file outputPath := disc.getOutputPath(path, inputDir, outputDir) if err := disc.writeEnhancedFile(outputPath, enhanced); err != nil { return fmt.Errorf("failed to write enhanced file %s: %w", outputPath, err) } result.FilesProcessed++ result.ElementsEnhanced += enhanced.ElementsEnhanced result.ContainersAdded += enhanced.ContainersAdded result.IndividualsAdded += enhanced.IndividualsAdded result.EnhancedFiles = append(result.EnhancedFiles, outputPath) return nil }) return result, err } // FileDiscoveryResult contains details about a single file discovery type FileDiscoveryResult struct { ElementsEnhanced int ContainersAdded int IndividualsAdded int Document *html.Node } // discoverFile processes a single HTML file and adds insertr classes func (disc *Discoverer) discoverFile(filePath string, aggressive bool) (*FileDiscoveryResult, error) { file, err := os.Open(filePath) if err != nil { return nil, fmt.Errorf("error opening file: %w", err) } defer file.Close() doc, err := html.Parse(file) if err != nil { return nil, fmt.Errorf("error parsing HTML: %w", err) } result := &FileDiscoveryResult{Document: doc} // Find candidates for enhancement disc.discoverNode(doc, result, aggressive) return result, nil } // discoverNode recursively discovers editable nodes in the document func (disc *Discoverer) discoverNode(node *html.Node, result *FileDiscoveryResult, aggressive bool) { if node.Type != html.ElementNode { // Recursively check children for child := node.FirstChild; child != nil; child = child.NextSibling { disc.discoverNode(child, result, aggressive) } return } // Skip if already has insertr class if disc.hasInsertrClass(node) { return } // Check if this is a container that should use expansion if disc.isGoodContainer(node) { viableChildren := engine.FindViableChildren(node) if len(viableChildren) >= 2 || (aggressive && len(viableChildren) >= 1) { // Container expansion: add insertr class to each viable child, not the container for _, child := range viableChildren { if !disc.hasInsertrClass(child) { disc.addInsertrClass(child) result.IndividualsAdded++ result.ElementsEnhanced++ } } result.ContainersAdded++ // Don't process children since we just processed them return } } // Check if this individual element should be enhanced if disc.isGoodIndividualElement(node) { disc.addInsertrClass(node) result.IndividualsAdded++ result.ElementsEnhanced++ // Don't process children of enhanced individual elements return } // Recursively check children for child := node.FirstChild; child != nil; child = child.NextSibling { disc.discoverNode(child, result, aggressive) } } // isGoodContainer checks if an element is a good candidate for container expansion func (disc *Discoverer) isGoodContainer(node *html.Node) bool { containerTags := map[string]bool{ "div": true, "section": true, "article": true, "header": true, "footer": true, "main": true, "aside": true, "nav": true, } tag := strings.ToLower(node.Data) if !containerTags[tag] { return false } // Skip containers that are clearly non-content if disc.isNonContentElement(node) { return false } // Skip containers in the head section if disc.isInHead(node) { return false } // Skip containers with technical/framework-specific classes that suggest they're not content classes := disc.getClasses(node) for _, class := range classes { lowerClass := strings.ToLower(class) // Skip Next.js internal classes and other framework artifacts if strings.Contains(lowerClass, "__next") || strings.Contains(lowerClass, "webpack") || strings.Contains(lowerClass, "hydration") || strings.Contains(lowerClass, "react") || strings.Contains(lowerClass, "gatsby") { return false } } return true } // isGoodIndividualElement checks if an element is a good candidate for individual enhancement func (disc *Discoverer) isGoodIndividualElement(node *html.Node) bool { // Skip self-closing elements if disc.isSelfClosing(node) { return false } // Skip non-content elements that should never be editable if disc.isNonContentElement(node) { return false } // Skip elements inside head section if disc.isInHead(node) { return false } // Skip elements with no meaningful content if disc.hasNoMeaningfulContent(node) { return false } // Check if element has editable content return disc.hasEditableContent(node) } // hasEditableContent uses the engine's enhanced detection logic func (disc *Discoverer) hasEditableContent(node *html.Node) bool { return engine.HasEditableContent(node) } // hasInsertrClass checks if a node already has the insertr class func (disc *Discoverer) hasInsertrClass(node *html.Node) bool { classes := disc.getClasses(node) for _, class := range classes { if class == "insertr" { return true } } return false } // addInsertrClass adds the insertr class to a node func (disc *Discoverer) addInsertrClass(node *html.Node) { classes := disc.getClasses(node) classes = append(classes, "insertr") disc.setClasses(node, classes) } // getClasses extracts CSS classes from a node func (disc *Discoverer) getClasses(node *html.Node) []string { for i, attr := range node.Attr { if attr.Key == "class" { if attr.Val == "" { return []string{} } return strings.Fields(attr.Val) } // Update existing class attribute if attr.Key == "class" { node.Attr[i] = attr return strings.Fields(attr.Val) } } return []string{} } // setClasses sets CSS classes on a node func (disc *Discoverer) setClasses(node *html.Node, classes []string) { classValue := strings.Join(classes, " ") // Update existing class attribute or add new one for i, attr := range node.Attr { if attr.Key == "class" { node.Attr[i].Val = classValue return } } // Add new class attribute node.Attr = append(node.Attr, html.Attribute{ Key: "class", Val: classValue, }) } // isSelfClosing checks if an element is self-closing func (disc *Discoverer) isSelfClosing(node *html.Node) bool { selfClosingTags := map[string]bool{ "img": true, "input": true, "br": true, "hr": true, "meta": true, "link": true, "area": true, "base": true, "col": true, "embed": true, "source": true, "track": true, "wbr": true, } return selfClosingTags[strings.ToLower(node.Data)] } // isNonContentElement checks if an element should never be editable func (disc *Discoverer) isNonContentElement(node *html.Node) bool { nonContentTags := map[string]bool{ "script": true, // JavaScript code "style": true, // CSS styles "meta": true, // Metadata "link": true, // Links to resources "title": true, // Document title (handled separately) "head": true, // Document head "html": true, // Root element "body": true, // Body element (too broad) "noscript": true, // Fallback content "template": true, // HTML templates "svg": true, // SVG graphics (complex) "canvas": true, // Canvas graphics "iframe": true, // Embedded content "object": true, // Embedded objects "embed": true, // Embedded content "video": true, // Video elements (complex) "audio": true, // Audio elements (complex) "map": true, // Image maps "area": true, // Image map areas "base": true, // Base URL "col": true, // Table columns "colgroup": true, // Table column groups "track": true, // Video/audio tracks "source": true, // Media sources "param": true, // Object parameters "wbr": true, // Word break opportunities } return nonContentTags[strings.ToLower(node.Data)] } // isInHead checks if a node is inside the document head func (disc *Discoverer) isInHead(node *html.Node) bool { current := node.Parent for current != nil { if current.Type == html.ElementNode && strings.ToLower(current.Data) == "head" { return true } current = current.Parent } return false } // hasNoMeaningfulContent checks if an element has no meaningful text content func (disc *Discoverer) hasNoMeaningfulContent(node *html.Node) bool { if node.Type != html.ElementNode { return true } // Extract text content var text strings.Builder disc.extractTextRecursive(node, &text) content := strings.TrimSpace(text.String()) // Empty or whitespace-only content if content == "" { return true } // Very short content that's likely not meaningful if len(content) < 2 { return true } // Content that looks like technical artifacts technicalPatterns := []string{ "$", "", "{", "}", "[", "]", "function", "var ", "const ", "let ", "return", "import", "export", "require", "module.exports", "/*", "*/", "//", "", "<%", "%>", } for _, pattern := range technicalPatterns { if strings.Contains(content, pattern) { return true } } return false } // extractTextRecursive extracts text content from a node and its children func (disc *Discoverer) extractTextRecursive(node *html.Node, text *strings.Builder) { if node.Type == html.TextNode { text.WriteString(node.Data) return } for child := node.FirstChild; child != nil; child = child.NextSibling { // Skip script and style content if child.Type == html.ElementNode { tag := strings.ToLower(child.Data) if tag == "script" || tag == "style" { continue } } disc.extractTextRecursive(child, text) } } // copyFile copies a file from input to output directory func (disc *Discoverer) copyFile(filePath, inputDir, outputDir string) error { outputPath := disc.getOutputPath(filePath, inputDir, outputDir) // Create output directory for the file if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil { return err } input, err := os.ReadFile(filePath) if err != nil { return err } return os.WriteFile(outputPath, input, 0644) } // getOutputPath converts input path to output path func (disc *Discoverer) getOutputPath(filePath, inputDir, outputDir string) string { relPath, _ := filepath.Rel(inputDir, filePath) return filepath.Join(outputDir, relPath) } // writeEnhancedFile writes the enhanced HTML document to a file func (disc *Discoverer) writeEnhancedFile(outputPath string, enhanced *FileDiscoveryResult) error { // Create output directory if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil { return err } file, err := os.Create(outputPath) if err != nil { return err } defer file.Close() return html.Render(file, enhanced.Document) }