// insertr/internal/engine/id_generator.go

package engine

import (
	"crypto/sha256"
	"fmt"
	"path/filepath"
	"sort"
	"strings"

	"golang.org/x/net/html"
)

// IDGenerator generates unique content IDs for HTML elements using a
// lightweight hierarchical approach: a readable prefix followed by a short
// hash suffix. It keeps per-instance bookkeeping in two maps, so IDs are only
// guaranteed unique among those produced by the same generator.
type IDGenerator struct {
	usedIDs       map[string]bool // IDs already returned by Generate on this generator
	elementCounts map[string]int  // Running count per element key, advanced by getElementIndex
}

// NewIDGenerator returns an IDGenerator whose bookkeeping maps are allocated
// and empty, so it can be used immediately.
func NewIDGenerator() *IDGenerator {
	gen := new(IDGenerator)
	gen.usedIDs = map[string]bool{}
	gen.elementCounts = map[string]int{}
	return gen
}

// Generate returns a content ID for node, an element found in the file at
// filePath.
//
// The ID has the form "<prefix>-<signature>". The prefix is the file name
// (without extension) joined with the element's primary class, or with its
// lower-cased tag when it has no such class. The signature is the short hash
// produced by createDeterministicSignature.
//
// If this generator has already returned the same ID, "-1", "-2", ... is
// appended until an unused ID is found. The returned ID is recorded as used.
func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
	// Readable part: file context plus a lightweight element identity.
	prefix := g.buildDeterministicPrefix(
		g.getFileName(filePath),
		strings.ToLower(node.Data),
		g.getPrimaryClass(node),
	)

	// Hashed part: derived from the element's structural characteristics.
	baseID := prefix + "-" + g.createDeterministicSignature(node, filePath)

	// Disambiguate against IDs already handed out by this generator.
	id := baseID
	for attempt := 1; g.usedIDs[id]; attempt++ {
		id = fmt.Sprintf("%s-%d", baseID, attempt)
	}
	g.usedIDs[id] = true

	return id
}

// getFileName returns the last path element of filePath with its final
// extension removed; the result is used as the leading part of an ID prefix.
func (g *IDGenerator) getFileName(filePath string) string {
	name := filepath.Base(filePath)
	if ext := filepath.Ext(name); ext != "" {
		name = strings.TrimSuffix(name, ext)
	}
	return name
}

// getPrimaryClass returns the first class reported by GetClasses for node
// that is neither empty nor the "insertr" marker class. It returns "" when
// no such class exists.
func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
	for _, name := range GetClasses(node) {
		switch name {
		case "", "insertr":
			// Not meaningful for identification; keep looking.
			continue
		}
		return name
	}
	return ""
}

// getElementKey builds the key under which element counts are tracked:
// "<fileName>-<primaryClass>" when a primary class is present, otherwise
// "<fileName>-<tag>".
func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
	qualifier := tag
	if primaryClass != "" {
		qualifier = primaryClass
	}
	return fmt.Sprintf("%s-%s", fileName, qualifier)
}

// getElementIndex advances the counter stored for elementKey and returns the
// new value, so the first call for a given key yields 1.
func (g *IDGenerator) getElementIndex(elementKey string) int {
	next := g.elementCounts[elementKey] + 1
	g.elementCounts[elementKey] = next
	return next
}

// buildDeterministicPrefix returns the human-readable part of an ID:
// "<fileName>-<primaryClass>", or "<fileName>-<tag>" when primaryClass is
// empty. No running index is included; uniqueness is left to the hash suffix
// added by the caller.
func (g *IDGenerator) buildDeterministicPrefix(fileName, tag, primaryClass string) string {
	label := primaryClass
	if label == "" {
		label = tag
	}
	return strings.Join([]string{fileName, label}, "-")
}

// buildPrefix is the legacy, index-based prefix builder. It returns
// "<fileName>-<primaryClass>" (or "<fileName>-<tag>" when primaryClass is
// empty) and appends "-<index>" only when index is greater than 1, so the
// first element of a kind carries no numeric suffix.
func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
	label := primaryClass
	if label == "" {
		label = tag
	}

	var b strings.Builder
	b.WriteString(fileName)
	b.WriteByte('-')
	b.WriteString(label)
	if index > 1 {
		fmt.Fprintf(&b, "-%d", index)
	}
	return b.String()
}

// createDeterministicSignature returns a 6-character lowercase hex string
// derived from structural properties of node only (no text content), so the
// value does not change when the element's content is edited.
//
// The hashed input is the following fields joined with "|", in this order:
// file path, detailed DOM path, tag, space-joined class list, semantic
// context, and the index among siblings with the same tag and classes.
func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath string) string {
	fields := []string{
		filePath,                            // file context, separates identical structures in different files
		g.getDetailedDOMPath(node),          // structural position in the DOM
		node.Data,                           // element type
		strings.Join(GetClasses(node), " "), // every CSS class on the element
		g.getSemanticContext(node),          // header/main/footer/nav/aside/content
		fmt.Sprintf("%d", g.getPreciseSiblingIndex(node)), // position among matching siblings
	}

	digest := sha256.Sum256([]byte(strings.Join(fields, "|")))

	// Three bytes of the digest render as six hex characters.
	return fmt.Sprintf("%x", digest[:3])
}

// createSignature is a placeholder retained for compatibility. It ignores
// both arguments and always returns the empty string.
//
// Deprecated: Generate uses createDeterministicSignature instead.
func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
	// This method is kept for compatibility but not used in deterministic generation
	return ""
}

// getSimpleDOMPath describes node's position as up to three ">"-separated
// segments ordered from the outermost ancestor down to node itself. Each
// segment is the element's tag, followed by "." and its first class that is
// neither empty nor "insertr" when it has one. The walk stops early at the
// first ancestor that is not an element node.
func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
	const maxLevels = 3

	// Gather segments starting at node and moving outwards.
	segments := make([]string, 0, maxLevels)
	for el := node; el != nil && el.Type == html.ElementNode && len(segments) < maxLevels; el = el.Parent {
		segment := el.Data
		for _, name := range GetClasses(el) {
			if name == "" || name == "insertr" {
				continue
			}
			segment += "." + name
			break // only the first meaningful class is used
		}
		segments = append(segments, segment)
	}

	// Flip to outermost-first order.
	for lo, hi := 0, len(segments)-1; lo < hi; lo, hi = lo+1, hi-1 {
		segments[lo], segments[hi] = segments[hi], segments[lo]
	}

	return strings.Join(segments, ">")
}

// getDetailedDOMPath describes node's position as up to five ">"-separated
// segments ordered from the outermost ancestor down to node itself. Each
// segment is the element's tag followed by ".<class>" for every class that is
// neither empty nor "insertr", in the order GetClasses reports them. The walk
// stops early at the first ancestor that is not an element node.
func (g *IDGenerator) getDetailedDOMPath(node *html.Node) string {
	const maxLevels = 5

	// Gather segments starting at node and moving outwards.
	segments := make([]string, 0, maxLevels)
	for el := node; el != nil && el.Type == html.ElementNode && len(segments) < maxLevels; el = el.Parent {
		segment := el.Data
		for _, name := range GetClasses(el) {
			if name == "" || name == "insertr" {
				continue
			}
			segment += "." + name
		}
		segments = append(segments, segment)
	}

	// Flip to outermost-first order.
	for lo, hi := 0, len(segments)-1; lo < hi; lo, hi = lo+1, hi-1 {
		segments[lo], segments[hi] = segments[hi], segments[lo]
	}

	return strings.Join(segments, ">")
}

// getSemanticContext names the nearest semantic container around node.
//
// Ancestors are examined from the parent outwards. An ancestor matches when
// its lower-cased tag is header, main, footer, nav or aside, or when one of
// its lower-cased classes contains "header", "footer", "nav", "sidebar" or
// "aside" (the last two both map to "aside"). The tag is checked before the
// classes, and there is no class-based match for "main". The result is
// "content" when no ancestor matches. node itself is never examined.
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	for ancestor := node.Parent; ancestor != nil && ancestor.Type == html.ElementNode; ancestor = ancestor.Parent {
		// Semantic tags name themselves.
		switch name := strings.ToLower(ancestor.Data); name {
		case "header", "main", "footer", "nav", "aside":
			return name
		}

		// Otherwise look for a class hinting at a semantic role. The case
		// order below decides which role wins when a class hints at several.
		for _, raw := range GetClasses(ancestor) {
			lowered := strings.ToLower(raw)
			switch {
			case strings.Contains(lowered, "header"):
				return "header"
			case strings.Contains(lowered, "footer"):
				return "footer"
			case strings.Contains(lowered, "nav"):
				return "nav"
			case strings.Contains(lowered, "sidebar"), strings.Contains(lowered, "aside"):
				return "aside"
			}
		}
	}

	return "content"
}

// getPreciseSiblingIndex returns node's zero-based position among the element
// children of its parent that have the same tag and the same set of classes
// (compared regardless of order). It returns 0 when node has no parent.
func (g *IDGenerator) getPreciseSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	// sortedCopy returns the classes in ascending order. It sorts a copy so
	// the slice returned by GetClasses is never modified.
	sortedCopy := func(classes []string) []string {
		out := make([]string, len(classes))
		copy(out, classes)
		sort.Strings(out)
		return out
	}

	want := sortedCopy(GetClasses(node))
	index := 0

	for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type != html.ElementNode || sibling.Data != node.Data {
			continue
		}
		// Sorting both sides makes the comparison order-insensitive.
		if !g.classSlicesEqual(want, sortedCopy(GetClasses(sibling))) {
			continue
		}
		if sibling == node {
			return index
		}
		index++
	}

	// Only reached if node was not encountered as a matching element child;
	// the count of matching siblings is returned in that case.
	return index
}

// classSlicesEqual reports whether a and b have the same length and hold the
// same strings at every position. Callers wanting an order-insensitive
// comparison must sort both slices first.
func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
	mismatch := len(a) != len(b)
	for i := 0; !mismatch && i < len(a); i++ {
		mismatch = a[i] != b[i]
	}
	return !mismatch
}

// getContentPreview returns a short, single-line preview of node's text
// content.
//
// The text gathered by extractTextContent is trimmed of surrounding
// whitespace and limited to at most 50 bytes. The cut never falls inside a
// multi-byte UTF-8 character: a character that would straddle the limit is
// dropped entirely, so the preview can be slightly shorter than 50 bytes.
// Newlines and tabs are then replaced by spaces and runs of spaces are
// collapsed to one.
func (g *IDGenerator) getContentPreview(node *html.Node) string {
	const maxPreviewBytes = 50

	var text strings.Builder
	g.extractTextContent(node, &text)
	content := strings.TrimSpace(text.String())

	if len(content) > maxPreviewBytes {
		// Ranging over a string yields the byte offset at which each
		// character starts; keep the last such offset within the limit.
		cut := 0
		for i := range content {
			if i > maxPreviewBytes {
				break
			}
			cut = i
		}
		content = content[:cut]
	}

	// Remove newlines and normalize whitespace
	content = strings.ReplaceAll(content, "\n", " ")
	content = strings.ReplaceAll(content, "\t", " ")
	for strings.Contains(content, "  ") {
		content = strings.ReplaceAll(content, "  ", " ")
	}
	return content
}

// extractTextContent appends to text the data of every text node in the
// subtree rooted at node, visiting node first and then its children from
// first to last, recursively.
func (g *IDGenerator) extractTextContent(node *html.Node, text *strings.Builder) {
	if isText := node.Type == html.TextNode; isText {
		text.WriteString(node.Data)
	}

	child := node.FirstChild
	for child != nil {
		g.extractTextContent(child, text)
		child = child.NextSibling
	}
}

// getSiblingIndex returns node's zero-based position among comparable element
// children of its parent, or 0 when node has no parent.
//
// A sibling is comparable when it has the same tag and either
//   - node carries the "insertr" class and the sibling does too, or
//   - node does not carry it and the sibling's class list equals node's
//     exactly, including order.
func (g *IDGenerator) getSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	// hasInsertr reports whether the class list contains the marker class.
	hasInsertr := func(list []string) bool {
		for _, name := range list {
			if name == "insertr" {
				return true
			}
		}
		return false
	}

	ownClasses := GetClasses(node)
	nodeIsInsertr := hasInsertr(ownClasses)

	position := 0
	for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type != html.ElementNode || sibling.Data != node.Data {
			continue
		}

		siblingClasses := GetClasses(sibling)

		var comparable bool
		if nodeIsInsertr {
			// insertr elements are grouped by tag + marker class only.
			comparable = hasInsertr(siblingClasses)
		} else {
			// Everything else must match the class list exactly.
			comparable = g.classesMatch(ownClasses, siblingClasses)
		}
		if !comparable {
			continue
		}

		if sibling == node {
			return position
		}
		position++
	}
	return position
}

// classesMatch reports whether the two class lists have the same length and
// the same entries in the same order.
func (g *IDGenerator) classesMatch(classes1, classes2 []string) bool {
	if len(classes1) != len(classes2) {
		return false
	}
	for i := range classes2 {
		if classes1[i] != classes2[i] {
			return false
		}
	}
	return true
}