// insertr/internal/engine/id_generator.go

package engine

import (
	"crypto/sha256"
	"fmt"
	"path/filepath"
	"strings"

	"golang.org/x/net/html"
)

// IDGenerator generates content IDs for HTML elements. It remembers every ID
// it has handed out so that Generate never returns the same ID twice from one
// generator value. Create instances with NewIDGenerator, which allocates both
// maps.
type IDGenerator struct {
	usedIDs       map[string]bool // IDs already returned by Generate
	elementCounts map[string]int  // running counter per element key, incremented by getElementIndex
}

// NewIDGenerator returns a ready-to-use IDGenerator whose bookkeeping maps
// are allocated and empty.
func NewIDGenerator() *IDGenerator {
	gen := new(IDGenerator)
	gen.usedIDs = map[string]bool{}
	gen.elementCounts = map[string]int{}
	return gen
}

// Generate returns a content ID for node, which was found in the file at
// filePath. The ID has the form "<file>-<class or tag>-<hash>", where the hash
// comes from createDeterministicSignature. If that ID was already returned by
// this generator, "-1", "-2", ... is appended until an unused ID is found.
// The returned ID is recorded as used.
func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
	// Human-readable part: file name plus the element's primary class
	// (or its lower-cased tag name when it has no usable class).
	prefix := g.buildDeterministicPrefix(
		g.getFileName(filePath),
		strings.ToLower(node.Data),
		g.getPrimaryClass(node),
	)

	// Hash part derived from the element's position and content.
	base := fmt.Sprintf("%s-%s", prefix, g.createDeterministicSignature(node, filePath))

	// Disambiguate against IDs issued earlier by this generator.
	candidate := base
	for n := 1; g.usedIDs[candidate]; n++ {
		candidate = fmt.Sprintf("%s-%d", base, n)
	}
	g.usedIDs[candidate] = true

	return candidate
}

// getFileName returns the last element of filePath with its extension (the
// part starting at the final dot) removed. A name that consists only of an
// extension, such as ".htaccess", yields the empty string.
func (g *IDGenerator) getFileName(filePath string) string {
	name := filepath.Base(filePath)
	extension := filepath.Ext(name)
	return strings.TrimSuffix(name, extension)
}

// getPrimaryClass returns the first class reported by GetClasses for node
// that is neither empty nor the "insertr" marker class. It returns "" when no
// such class exists.
func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
	for _, candidate := range GetClasses(node) {
		switch candidate {
		case "", "insertr":
			// Not meaningful for identification; keep looking.
			continue
		}
		return candidate
	}
	return ""
}

// getElementKey builds the key used to count elements: "<fileName>-<primaryClass>"
// when a primary class is present, otherwise "<fileName>-<tag>".
func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
	discriminator := tag
	if primaryClass != "" {
		discriminator = primaryClass
	}
	return fmt.Sprintf("%s-%s", fileName, discriminator)
}

// getElementIndex increments the counter stored for elementKey and returns
// the new value, so the first call for a given key returns 1.
func (g *IDGenerator) getElementIndex(elementKey string) int {
	next := g.elementCounts[elementKey] + 1
	g.elementCounts[elementKey] = next
	return next
}

// buildDeterministicPrefix returns the human-readable ID prefix
// "<fileName>-<label>", where label is primaryClass when it is non-empty and
// tag otherwise. No counter is involved; callers rely on the hash suffix for
// uniqueness.
func (g *IDGenerator) buildDeterministicPrefix(fileName, tag, primaryClass string) string {
	label := tag
	if primaryClass != "" {
		label = primaryClass
	}
	return strings.Join([]string{fileName, label}, "-")
}

// buildPrefix returns the legacy, counter-based ID prefix. The result is
// "<fileName>-<label>" with label being primaryClass when non-empty and tag
// otherwise; when index is greater than 1, "-<index>" is appended.
func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
	label := tag
	if primaryClass != "" {
		label = primaryClass
	}

	segments := []string{fileName, label}
	// The first element of a kind carries no numeric suffix.
	if index > 1 {
		segments = append(segments, fmt.Sprintf("%d", index))
	}

	return strings.Join(segments, "-")
}

// createDeterministicSignature returns a 6-character lowercase hex string
// derived from node. The hashed input is the "|"-joined list of:
//
//   - the DOM path from getSimpleDOMPath (omitted when empty),
//   - "pos<N>" with N from getSiblingIndex,
//   - the first 20 bytes of getContentPreview (omitted when empty).
//
// The signature is the first three bytes of the SHA-256 digest of that
// string. filePath is accepted but not used.
func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath string) string {
	// Number of preview bytes that take part in the hash.
	const previewLimit = 20

	components := make([]string, 0, 3)

	if domPath := g.getSimpleDOMPath(node); domPath != "" {
		components = append(components, domPath)
	}

	components = append(components, fmt.Sprintf("pos%d", g.getSiblingIndex(node)))

	if preview := g.getContentPreview(node); preview != "" {
		if len(preview) > previewLimit {
			preview = preview[:previewLimit]
		}
		components = append(components, preview)
	}

	digest := sha256.Sum256([]byte(strings.Join(components, "|")))

	// Three digest bytes render as six hex characters.
	return fmt.Sprintf("%x", digest[:3])
}

// createSignature is a retained stub that ignores both arguments and always
// returns the empty string.
//
// Deprecated: ID generation uses createDeterministicSignature instead.
func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
	// This method is kept for compatibility but not used in deterministic generation
	return ""
}

// getSimpleDOMPath describes where node sits in the tree as a ">"-separated
// list running from the outermost ancestor considered down to node itself.
// It walks up through Parent links while the current node is an element,
// covering at most 5 levels. Each segment is the node's Data, followed by
// ".<class>" when the first class from GetClasses exists and is not
// "insertr". A non-element node yields the empty string.
func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
	const maxDepth = 5

	// Collected leaf-first, then reversed into root-first order.
	segments := make([]string, 0, maxDepth)
	for n := node; n != nil && n.Type == html.ElementNode && len(segments) < maxDepth; n = n.Parent {
		segment := n.Data
		if cls := GetClasses(n); len(cls) > 0 && cls[0] != "insertr" {
			segment += "." + cls[0]
		}
		segments = append(segments, segment)
	}

	for i, j := 0, len(segments)-1; i < j; i, j = i+1, j-1 {
		segments[i], segments[j] = segments[j], segments[i]
	}

	return strings.Join(segments, ">")
}

// getContentPreview returns a short, whitespace-normalised excerpt of the text
// inside node. The concatenated text is trimmed, cut to at most 50 bytes (not
// runes), and then every newline and tab becomes a space and runs of spaces
// collapse to a single space. Truncation happens before normalisation, so the
// result may be shorter than 50 bytes and may end in a space.
func (g *IDGenerator) getContentPreview(node *html.Node) string {
	const maxPreviewBytes = 50

	var raw strings.Builder
	g.extractTextContent(node, &raw)

	excerpt := strings.TrimSpace(raw.String())
	if len(excerpt) > maxPreviewBytes {
		excerpt = excerpt[:maxPreviewBytes]
	}

	// Single byte-wise pass: map '\n' and '\t' to ' ' and drop any space that
	// directly follows another space. Working on bytes leaves every other
	// byte, including a rune split by the truncation above, untouched.
	var normalised strings.Builder
	normalised.Grow(len(excerpt))
	previousWasSpace := false
	for i := 0; i < len(excerpt); i++ {
		b := excerpt[i]
		if b == '\n' || b == '\t' {
			b = ' '
		}
		if b == ' ' && previousWasSpace {
			continue
		}
		previousWasSpace = b == ' '
		normalised.WriteByte(b)
	}
	return normalised.String()
}

// extractTextContent appends the Data of every text node in the subtree rooted
// at node (node included) to text, in document order. The traversal uses an
// explicit stack and follows only FirstChild and NextSibling links.
func (g *IDGenerator) extractTextContent(node *html.Node, text *strings.Builder) {
	pending := []*html.Node{node}
	for len(pending) > 0 {
		top := len(pending) - 1
		current := pending[top]
		pending = pending[:top]

		if current.Type == html.TextNode {
			text.WriteString(current.Data)
		}

		// Push the children, then reverse just that run so the first child is
		// popped next and document order is preserved.
		firstPushed := len(pending)
		for child := current.FirstChild; child != nil; child = child.NextSibling {
			pending = append(pending, child)
		}
		for i, j := firstPushed, len(pending)-1; i < j; i, j = i+1, j-1 {
			pending[i], pending[j] = pending[j], pending[i]
		}
	}
}

// getSiblingIndex returns the zero-based position of node among those children
// of its parent that are elements with the same Data and a class list equal
// to node's (as judged by classesMatch). A node without a parent has index 0.
// If node itself is never reached in the scan, the number of matching
// siblings is returned.
func (g *IDGenerator) getSiblingIndex(node *html.Node) int {
	parent := node.Parent
	if parent == nil {
		return 0
	}

	ownClasses := GetClasses(node)
	position := 0

	for sibling := parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type != html.ElementNode || sibling.Data != node.Data {
			continue
		}
		if !g.classesMatch(ownClasses, GetClasses(sibling)) {
			continue
		}
		if sibling == node {
			break
		}
		// A matching sibling that precedes node.
		position++
	}

	return position
}

// classesMatch reports whether the two class lists have the same length and
// the same entries in the same order. A nil list and an empty list match.
func (g *IDGenerator) classesMatch(classes1, classes2 []string) bool {
	if len(classes1) != len(classes2) {
		return false
	}
	// Lengths are equal here, so indexing classes2 with i is always in range.
	for i := range classes1 {
		if classes1[i] != classes2[i] {
			return false
		}
	}
	return true
}