feat: implement collision-free lightweight hierarchical ID generation

- Replace content-hash based ID generation with position-based algorithm - Use file + element identity + position index + hash for unique IDs - Generate human-readable prefixes (e.g. index-lead-, index-p-2-) - Add collision-resistant hash suffixes for guaranteed uniqueness - Update Generate() to accept filePath parameter for context - Fix ID collisions where hero and footer elements shared same ID - Clean demo site files removing all data-content-id attributes - Preserve insertr-gate elements for authentication functionality Results: Hero gets 'index-lead-2-fc31f2', footer gets 'index-p-13-99fd13' No more content cross-contamination between different elements.
2025-09-11 17:38:15 +02:00
parent cfb744f091
commit 72bd31b626
5 changed files with 132 additions and 167 deletions
--- a/internal/parser/id_generator.go
+++ b/internal/parser/id_generator.go
@@ -1,169 +1,133 @@
 package parser

 import (
-	"crypto/sha1"
+	"crypto/sha256"
+	"encoding/hex"
 	"fmt"
-	"regexp"
+	"path/filepath"
 	"strings"

 	"golang.org/x/net/html"
 )

-// IDGenerator generates unique content IDs for elements
+// IDGenerator generates unique content IDs for elements using lightweight hierarchical approach
 type IDGenerator struct {
-	usedIDs map[string]bool
+	usedIDs       map[string]bool
+	elementCounts map[string]int // Track counts per file+type for indexing
 }

 // NewIDGenerator creates a new ID generator
 func NewIDGenerator() *IDGenerator {
 	return &IDGenerator{
-		usedIDs: make(map[string]bool),
+		usedIDs:       make(map[string]bool),
+		elementCounts: make(map[string]int),
 	}
 }

-// Generate creates a content ID for an HTML element
-func (g *IDGenerator) Generate(node *html.Node) string {
-	context := g.getSemanticContext(node)
-	purpose := g.getPurpose(node)
-	contentHash := g.getContentHash(node)
+// Generate creates a content ID for an HTML element using lightweight hierarchical approach
+func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
+	// 1. File context (minimal)
+	fileName := g.getFileName(filePath)

-	baseID := g.createBaseID(context, purpose, contentHash)
-	finalID := g.ensureUnique(baseID)
+	// 2. Element identity (lightweight)
+	tag := strings.ToLower(node.Data)
+	primaryClass := g.getPrimaryClass(node)
+
+	// 3. Position context (simple)
+	elementKey := g.getElementKey(fileName, tag, primaryClass)
+	index := g.getElementIndex(elementKey)
+
+	// 4. Build readable prefix
+	prefix := g.buildPrefix(fileName, tag, primaryClass, index)
+
+	// 5. Add collision-resistant suffix
+	signature := g.createSignature(node, filePath)
+	hash := sha256.Sum256([]byte(signature))
+	suffix := hex.EncodeToString(hash[:3])
+
+	finalID := fmt.Sprintf("%s-%s", prefix, suffix)
+
+	// Ensure uniqueness (should be guaranteed by hash, but safety check)
+	g.usedIDs[finalID] = true

 	return finalID
 }

-// getSemanticContext determines the semantic context from parent elements
-func (g *IDGenerator) getSemanticContext(node *html.Node) string {
-	// Walk up the tree to find semantic containers
-	parent := node.Parent
-	for parent != nil && parent.Type == html.ElementNode {
-		classes := GetClasses(parent)
-
-		// Check for common semantic section classes
-		for _, class := range []string{"hero", "services", "nav", "navbar", "footer", "about", "contact", "testimonial"} {
-			if ContainsClass(classes, class) {
-				return class
-			}
-		}
-
-		// Check for semantic HTML elements
-		switch parent.Data {
-		case "nav":
-			return "nav"
-		case "header":
-			return "header"
-		case "footer":
-			return "footer"
-		case "main":
-			return "main"
-		case "aside":
-			return "aside"
-		}
-
-		parent = parent.Parent
-	}
-
-	return "content"
+// getFileName extracts filename without extension for ID prefix
+func (g *IDGenerator) getFileName(filePath string) string {
+	base := filepath.Base(filePath)
+	return strings.TrimSuffix(base, filepath.Ext(base))
 }

-// getPurpose determines the purpose/role of the element
-func (g *IDGenerator) getPurpose(node *html.Node) string {
-	tag := strings.ToLower(node.Data)
+// getPrimaryClass returns the first meaningful (non-insertr) CSS class
+func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
 	classes := GetClasses(node)
-
-	// Check for specific CSS classes that indicate purpose
 	for _, class := range classes {
-		switch {
-		case strings.Contains(class, "title"):
-			return "title"
-		case strings.Contains(class, "headline"):
-			return "headline"
-		case strings.Contains(class, "description"):
-			return "description"
-		case strings.Contains(class, "subtitle"):
-			return "subtitle"
-		case strings.Contains(class, "cta"):
-			return "cta"
-		case strings.Contains(class, "button"):
-			return "button"
-		case strings.Contains(class, "logo"):
-			return "logo"
-		case strings.Contains(class, "lead"):
-			return "lead"
+		if class != "insertr" && class != "" {
+			return class
 		}
 	}
-
-	// Infer purpose from HTML tag
-	switch tag {
-	case "h1":
-		return "title"
-	case "h2":
-		return "subtitle"
-	case "h3", "h4", "h5", "h6":
-		return "heading"
-	case "p":
-		return "text"
-	case "a":
-		return "link"
-	case "button":
-		return "button"
-	default:
-		return "content"
-	}
+	return ""
 }

-// getContentHash creates a short hash of the content for ID generation
-func (g *IDGenerator) getContentHash(node *html.Node) string {
-	text := extractTextContent(node)
-
-	// Create hash of the text content
-	hash := fmt.Sprintf("%x", sha1.Sum([]byte(text)))
-
-	// Return first 6 characters for brevity
-	return hash[:6]
+// getElementKey creates a key for tracking element counts
+func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
+	if primaryClass != "" {
+		return fmt.Sprintf("%s-%s", fileName, primaryClass)
+	}
+	return fmt.Sprintf("%s-%s", fileName, tag)
 }

-// createBaseID creates the base ID from components
-func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
-	parts := []string{}
-
-	// Add context if meaningful
-	if context != "content" {
-		parts = append(parts, context)
-	}
-
-	// Add purpose
-	parts = append(parts, purpose)
-
-	// Always add content hash for uniqueness
-	parts = append(parts, contentHash)
-
-	baseID := strings.Join(parts, "-")
-
-	// Clean up the ID
-	baseID = regexp.MustCompile(`-+`).ReplaceAllString(baseID, "-")
-	baseID = strings.Trim(baseID, "-")
-
-	// Ensure it's not empty
-	if baseID == "" {
-		baseID = fmt.Sprintf("content-%s", contentHash)
-	}
-
-	return baseID
+// getElementIndex returns the position index for this element type in the file
+func (g *IDGenerator) getElementIndex(elementKey string) int {
+	g.elementCounts[elementKey]++
+	return g.elementCounts[elementKey]
 }

-// ensureUnique makes sure the ID is unique by adding a suffix if needed
-func (g *IDGenerator) ensureUnique(baseID string) string {
-	if !g.usedIDs[baseID] {
-		g.usedIDs[baseID] = true
-		return baseID
+// buildPrefix creates human-readable prefix for the ID
+func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
+	var parts []string
+	parts = append(parts, fileName)
+
+	if primaryClass != "" {
+		parts = append(parts, primaryClass)
+	} else {
+		parts = append(parts, tag)
 	}

-	// If base ID is taken, add a hash suffix
-	hash := fmt.Sprintf("%x", sha1.Sum([]byte(baseID)))[:6]
-	uniqueID := fmt.Sprintf("%s-%s", baseID, hash)
+	// Only add index if it's not the first element of this type
+	if index > 1 {
+		parts = append(parts, fmt.Sprintf("%d", index))
+	}

-	g.usedIDs[uniqueID] = true
-	return uniqueID
+	return strings.Join(parts, "-")
+}
+
+// createSignature creates a unique signature for collision resistance
+func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
+	// Minimal signature for uniqueness
+	tag := node.Data
+	classes := strings.Join(GetClasses(node), " ")
+	domPath := g.getSimpleDOMPath(node)
+
+	return fmt.Sprintf("%s|%s|%s|%s", filePath, domPath, tag, classes)
+}
+
+// getSimpleDOMPath creates a simple DOM path for uniqueness
+func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
+	var pathParts []string
+	current := node
+	depth := 0
+
+	for current != nil && current.Type == html.ElementNode && depth < 5 {
+		part := current.Data
+		if classes := GetClasses(current); len(classes) > 0 && classes[0] != "insertr" {
+			part += "." + classes[0]
+		}
+		pathParts = append([]string{part}, pathParts...)
+		current = current.Parent
+		depth++
+	}
+
+	return strings.Join(pathParts, ">")
 }