Files
insertr/internal/content/auto_enhancer.go
Joakim 27179dc943 refactor: remove legacy parser system and migrate to unified engine
- Remove internal/parser package and all legacy ID generation logic
- Update enhancer and auto_enhancer to use unified engine functions
- Migrate utility functions (FindViableChildren, HasEditableContent) to engine
- Create stub enhancer implementation that uses unified engine architecture
- Ensure all enhancement workflows now go through single unified system
- Remove parser dependencies and consolidate content processing logic

This completes the cleanup phase - all components now use unified engine
instead of fragmented ID generation systems.
2025-09-16 15:18:40 +02:00

443 lines
12 KiB
Go

package content
import (
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/insertr/insertr/internal/engine"
"golang.org/x/net/html"
)
// AutoEnhancer automatically enhances HTML files by adding the "insertr"
// class to elements it selects as containers or individual editable elements.
// Its behavior lives entirely in its methods.
type AutoEnhancer struct {
// Intentionally empty: the former parser dependency was removed, so the
// enhancer carries no fields.
}
// NewAutoEnhancer returns a pointer to a freshly allocated, zero-valued
// AutoEnhancer.
func NewAutoEnhancer() *AutoEnhancer {
	return new(AutoEnhancer)
}
// AutoEnhanceResult aggregates the outcome of enhancing a whole directory.
//
// FilesProcessed counts HTML files that were enhanced and written.
// ElementsEnhanced, ContainersAdded and IndividualsAdded are the sums of the
// corresponding per-file counters. SkippedFiles holds the input paths of HTML
// files whose enhancement failed; EnhancedFiles holds the output paths of the
// files that were written.
type AutoEnhanceResult struct {
	FilesProcessed, ElementsEnhanced, ContainersAdded, IndividualsAdded int

	SkippedFiles, EnhancedFiles []string
}
// EnhanceDirectory walks inputDir and mirrors it into outputDir. Files whose
// lower-cased path ends in ".html" are enhanced and written; every other file
// is copied unchanged. An HTML file that fails to enhance is recorded in
// SkippedFiles and its original is copied instead.
//
// It returns a nil result if outputDir cannot be created. Otherwise it returns
// the accumulated result together with any error that stopped the walk, so the
// result may be partial when the error is non-nil.
func (ae *AutoEnhancer) EnhanceDirectory(inputDir, outputDir string, aggressive bool) (*AutoEnhanceResult, error) {
	if mkErr := os.MkdirAll(outputDir, 0755); mkErr != nil {
		return nil, fmt.Errorf("failed to create output directory: %w", mkErr)
	}

	summary := &AutoEnhanceResult{
		SkippedFiles:  []string{},
		EnhancedFiles: []string{},
	}

	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			// Checked first: entry may not be usable when the walk reports an error.
			return walkErr
		case entry.IsDir():
			return nil
		case !strings.HasSuffix(strings.ToLower(path), ".html"):
			return ae.copyFile(path, inputDir, outputDir)
		}

		fileResult, enhanceErr := ae.enhanceFile(path, aggressive)
		if enhanceErr != nil {
			// Best effort: keep the site complete by shipping the original file.
			summary.SkippedFiles = append(summary.SkippedFiles, path)
			return ae.copyFile(path, inputDir, outputDir)
		}

		dest := ae.getOutputPath(path, inputDir, outputDir)
		if writeErr := ae.writeEnhancedFile(dest, fileResult); writeErr != nil {
			return fmt.Errorf("failed to write enhanced file %s: %w", dest, writeErr)
		}

		summary.FilesProcessed++
		summary.ElementsEnhanced += fileResult.ElementsEnhanced
		summary.ContainersAdded += fileResult.ContainersAdded
		summary.IndividualsAdded += fileResult.IndividualsAdded
		summary.EnhancedFiles = append(summary.EnhancedFiles, dest)
		return nil
	}

	return summary, filepath.WalkDir(inputDir, visit)
}
// EnhancementResult describes the enhancement of a single HTML file: the
// counters updated while walking the tree, and the parsed document that the
// walk modified in place.
type EnhancementResult struct {
	ElementsEnhanced, ContainersAdded, IndividualsAdded int

	Document *html.Node
}
// enhanceFile opens filePath, parses it as HTML and runs enhanceNode over the
// resulting tree. It returns the modified document with its counters, or a
// wrapped error if the file cannot be opened or parsed.
func (ae *AutoEnhancer) enhanceFile(filePath string, aggressive bool) (*EnhancementResult, error) {
	src, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, fmt.Errorf("error opening file: %w", openErr)
	}
	defer src.Close()

	root, parseErr := html.Parse(src)
	if parseErr != nil {
		return nil, fmt.Errorf("error parsing HTML: %w", parseErr)
	}

	outcome := &EnhancementResult{Document: root}
	ae.enhanceNode(root, outcome, aggressive)
	return outcome, nil
}
// enhanceNode walks the tree rooted at node and adds the insertr class where
// appropriate, updating the counters in result.
//
// For an element node the first matching rule wins and stops the descent:
//   - it already carries the insertr class: left untouched;
//   - it is a good container with enough viable children (at least two, or at
//     least one when aggressive is set): marked, counted as a container, and
//     ElementsEnhanced grows by the number of viable children;
//   - it is a good individual element: marked and counted as an individual.
//
// Non-element nodes and elements matching no rule have their children visited.
func (ae *AutoEnhancer) enhanceNode(node *html.Node, result *EnhancementResult, aggressive bool) {
	if node.Type == html.ElementNode {
		if ae.hasInsertrClass(node) {
			return
		}

		if ae.isGoodContainer(node) {
			minViable := 2
			if aggressive {
				minViable = 1
			}
			if viable := engine.FindViableChildren(node); len(viable) >= minViable {
				// The container is marked for expansion, so its children are
				// not visited individually.
				ae.addInsertrClass(node)
				result.ContainersAdded++
				result.ElementsEnhanced += len(viable)
				return
			}
		}

		if ae.isGoodIndividualElement(node) {
			ae.addInsertrClass(node)
			result.IndividualsAdded++
			result.ElementsEnhanced++
			return
		}
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		ae.enhanceNode(c, result, aggressive)
	}
}
// isGoodContainer reports whether node is a candidate for container
// expansion. The tag must be one of the block-level container tags listed
// below, the node must not be a non-content element or sit inside <head>, and
// none of its classes may contain a framework marker substring.
func (ae *AutoEnhancer) isGoodContainer(node *html.Node) bool {
	switch strings.ToLower(node.Data) {
	case "div", "section", "article", "header", "footer", "main", "aside", "nav":
		// Accepted container tag; keep checking.
	default:
		return false
	}

	if ae.isNonContentElement(node) || ae.isInHead(node) {
		return false
	}

	// Class-name fragments that indicate framework artifacts rather than content.
	frameworkMarkers := []string{"__next", "webpack", "hydration", "react", "gatsby"}
	for _, name := range ae.getClasses(node) {
		lowered := strings.ToLower(name)
		for _, marker := range frameworkMarkers {
			if strings.Contains(lowered, marker) {
				return false
			}
		}
	}
	return true
}
// isGoodIndividualElement reports whether node should be marked on its own.
// A node is rejected if it is self-closing, a non-content element, inside
// <head>, or has no meaningful text. Otherwise the decision is delegated to
// hasEditableContent.
func (ae *AutoEnhancer) isGoodIndividualElement(node *html.Node) bool {
	// Evaluated left to right with short-circuiting, cheapest checks first.
	rejected := ae.isSelfClosing(node) ||
		ae.isNonContentElement(node) ||
		ae.isInHead(node) ||
		ae.hasNoMeaningfulContent(node)

	return !rejected && ae.hasEditableContent(node)
}
// hasEditableContent reports whether node has editable content. It is a thin
// wrapper that returns the result of engine.HasEditableContent unchanged.
func (ae *AutoEnhancer) hasEditableContent(node *html.Node) bool {
return engine.HasEditableContent(node)
}
// hasInsertrClass reports whether one of node's classes is exactly "insertr"
// (case-sensitive comparison).
func (ae *AutoEnhancer) hasInsertrClass(node *html.Node) bool {
	names := ae.getClasses(node)
	for i := range names {
		if names[i] == "insertr" {
			return true
		}
	}
	return false
}
// addInsertrClass appends "insertr" to node's current class list and writes
// the list back. It does not check whether the class is already present.
func (ae *AutoEnhancer) addInsertrClass(node *html.Node) {
	ae.setClasses(node, append(ae.getClasses(node), "insertr"))
}
// getClasses returns the whitespace-separated class names from node's first
// "class" attribute. It returns an empty slice when the attribute is absent,
// empty, or contains only white space. The node is not modified.
func (ae *AutoEnhancer) getClasses(node *html.Node) []string {
	for _, attr := range node.Attr {
		if attr.Key == "class" {
			// strings.Fields yields no fields for an empty or all-space
			// value, so no special case is needed.
			return strings.Fields(attr.Val)
		}
	}
	return []string{}
}
// setClasses stores classes, joined with single spaces, as node's class
// attribute. The first existing "class" attribute is overwritten in place; if
// there is none, a new attribute is appended.
func (ae *AutoEnhancer) setClasses(node *html.Node, classes []string) {
	joined := strings.Join(classes, " ")

	for idx := range node.Attr {
		if node.Attr[idx].Key != "class" {
			continue
		}
		node.Attr[idx].Val = joined
		return
	}

	node.Attr = append(node.Attr, html.Attribute{Key: "class", Val: joined})
}
// isSelfClosing reports whether node's tag name, compared case-insensitively,
// is one of the self-closing tags listed below.
func (ae *AutoEnhancer) isSelfClosing(node *html.Node) bool {
	switch strings.ToLower(node.Data) {
	case "img", "input", "br", "hr",
		"meta", "link", "area", "base",
		"col", "embed", "source", "track", "wbr":
		return true
	}
	return false
}
// isNonContentElement reports whether node's tag name, compared
// case-insensitively, belongs to the set of elements that are never treated
// as editable content.
func (ae *AutoEnhancer) isNonContentElement(node *html.Node) bool {
	switch strings.ToLower(node.Data) {
	case "html", "head", "body", "title", "meta", "link", "base":
		// Document structure and metadata; body is excluded as too broad.
		return true
	case "script", "style", "noscript", "template":
		// Code, styles, fallback content and templates.
		return true
	case "svg", "canvas", "iframe", "object", "embed", "param",
		"video", "audio", "track", "source", "map", "area":
		// Graphics, embedded content, media and image maps.
		return true
	case "col", "colgroup", "wbr":
		// Table column definitions and word-break opportunities.
		return true
	}
	return false
}
// isInHead reports whether any ancestor of node is a <head> element. The node
// itself is not examined, only its parent chain.
func (ae *AutoEnhancer) isInHead(node *html.Node) bool {
	for ancestor := node.Parent; ancestor != nil; ancestor = ancestor.Parent {
		if ancestor.Type != html.ElementNode {
			continue
		}
		if strings.ToLower(ancestor.Data) == "head" {
			return true
		}
	}
	return false
}
// hasNoMeaningfulContent reports whether node lacks text worth editing. It
// returns true when node is not an element, when its trimmed text (gathered
// by extractTextRecursive) is shorter than two bytes, or when that text
// contains any of the technical-artifact markers below.
//
// NOTE(review): markers are matched as plain substrings, so ordinary prose
// such as "important" (contains "import") or a price like "$5" is also
// rejected. Confirm this is the intended strictness.
func (ae *AutoEnhancer) hasNoMeaningfulContent(node *html.Node) bool {
	if node.Type != html.ElementNode {
		return true
	}

	var buf strings.Builder
	ae.extractTextRecursive(node, &buf)
	trimmed := strings.TrimSpace(buf.String())

	// Covers both the empty string and single-byte content. The length is
	// measured in bytes, not runes.
	if len(trimmed) < 2 {
		return true
	}

	artifactMarkers := []string{
		"$", "<!--", "-->", "{", "}", "[", "]",
		"function", "var ", "const ", "let ", "return",
		"import", "export", "require", "module.exports",
		"/*", "*/", "//", "<?", "?>", "<%", "%>",
	}
	for _, marker := range artifactMarkers {
		if strings.Contains(trimmed, marker) {
			return true
		}
	}
	return false
}
// extractTextRecursive appends the text of node and its descendants to text
// in document order. <script> and <style> child elements are skipped
// entirely, so their contents never reach the builder.
func (ae *AutoEnhancer) extractTextRecursive(node *html.Node, text *strings.Builder) {
	if node.Type == html.TextNode {
		text.WriteString(node.Data)
		return
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode {
			switch strings.ToLower(c.Data) {
			case "script", "style":
				// continue targets the enclosing for loop, not the switch.
				continue
			}
		}
		ae.extractTextRecursive(c, text)
	}
}
// copyFile copies filePath to its mirrored location under outputDir, creating
// the destination's parent directories (mode 0755) as needed. The whole file
// is read into memory and written with mode 0644. Errors are returned
// unwrapped.
func (ae *AutoEnhancer) copyFile(filePath, inputDir, outputDir string) error {
	dest := ae.getOutputPath(filePath, inputDir, outputDir)

	if mkErr := os.MkdirAll(filepath.Dir(dest), 0755); mkErr != nil {
		return mkErr
	}

	data, readErr := os.ReadFile(filePath)
	if readErr != nil {
		return readErr
	}
	return os.WriteFile(dest, data, 0644)
}
// getOutputPath maps filePath, expected to live under inputDir, to the
// corresponding path under outputDir by re-rooting its relative path.
//
// If filePath cannot be expressed relative to inputDir (for example one path
// is absolute and the other relative), the file's base name is used, so the
// result is always a file path inside outputDir and never outputDir itself.
func (ae *AutoEnhancer) getOutputPath(filePath, inputDir, outputDir string) string {
	relPath, err := filepath.Rel(inputDir, filePath)
	if err != nil {
		// The signature cannot report an error; fall back to a flat layout
		// rather than silently returning the directory path.
		relPath = filepath.Base(filePath)
	}
	return filepath.Join(outputDir, relPath)
}
// writeEnhancedFile renders enhanced.Document as HTML into outputPath,
// creating parent directories (mode 0755) as needed and truncating any
// existing file.
//
// It returns the first error from creating the directory, creating the file,
// rendering, or closing the file. The close error is reported because a
// failed close on a written file can mean the data was not fully persisted.
func (ae *AutoEnhancer) writeEnhancedFile(outputPath string, enhanced *EnhancementResult) (err error) {
	if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil {
		return err
	}

	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer func() {
		// Always close, but let a render error take precedence over a close error.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()

	return html.Render(file, enhanced.Document)
}