feat: implement collision-free lightweight hierarchical ID generation
- Replace content-hash based ID generation with a position-based algorithm
- Use file + element identity + position index + hash for unique IDs
- Generate human-readable prefixes (e.g. index-lead-, index-p-2-)
- Add collision-resistant hash suffixes for guaranteed uniqueness
- Update Generate() to accept a filePath parameter for context
- Fix ID collisions where hero and footer elements shared the same ID
- Clean demo site files, removing all data-content-id attributes
- Preserve insertr-gate elements for authentication functionality

Results: Hero gets 'index-lead-2-fc31f2', footer gets 'index-p-13-99fd13'.
No more content cross-contamination between different elements.
This commit is contained in:
@@ -1,169 +1,133 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"crypto/sha1"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// IDGenerator generates unique content IDs for elements using a lightweight
// hierarchical approach.
//
// usedIDs records every ID that has been handed out; elementCounts keeps a
// running counter per element key, which supplies the position index.
type IDGenerator struct {
	usedIDs       map[string]bool
	elementCounts map[string]int // Track counts per file+type for indexing
}
|
||||
|
||||
// NewIDGenerator creates a new ID generator
|
||||
func NewIDGenerator() *IDGenerator {
|
||||
return &IDGenerator{
|
||||
usedIDs: make(map[string]bool),
|
||||
usedIDs: make(map[string]bool),
|
||||
elementCounts: make(map[string]int),
|
||||
}
|
||||
}
|
||||
|
||||
// Generate creates a content ID for an HTML element
|
||||
func (g *IDGenerator) Generate(node *html.Node) string {
|
||||
context := g.getSemanticContext(node)
|
||||
purpose := g.getPurpose(node)
|
||||
contentHash := g.getContentHash(node)
|
||||
// Generate creates a content ID for an HTML element using lightweight hierarchical approach
|
||||
func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
|
||||
// 1. File context (minimal)
|
||||
fileName := g.getFileName(filePath)
|
||||
|
||||
baseID := g.createBaseID(context, purpose, contentHash)
|
||||
finalID := g.ensureUnique(baseID)
|
||||
// 2. Element identity (lightweight)
|
||||
tag := strings.ToLower(node.Data)
|
||||
primaryClass := g.getPrimaryClass(node)
|
||||
|
||||
// 3. Position context (simple)
|
||||
elementKey := g.getElementKey(fileName, tag, primaryClass)
|
||||
index := g.getElementIndex(elementKey)
|
||||
|
||||
// 4. Build readable prefix
|
||||
prefix := g.buildPrefix(fileName, tag, primaryClass, index)
|
||||
|
||||
// 5. Add collision-resistant suffix
|
||||
signature := g.createSignature(node, filePath)
|
||||
hash := sha256.Sum256([]byte(signature))
|
||||
suffix := hex.EncodeToString(hash[:3])
|
||||
|
||||
finalID := fmt.Sprintf("%s-%s", prefix, suffix)
|
||||
|
||||
// Ensure uniqueness (should be guaranteed by hash, but safety check)
|
||||
g.usedIDs[finalID] = true
|
||||
|
||||
return finalID
|
||||
}
|
||||
|
||||
// getSemanticContext determines the semantic context from parent elements
|
||||
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
|
||||
// Walk up the tree to find semantic containers
|
||||
parent := node.Parent
|
||||
for parent != nil && parent.Type == html.ElementNode {
|
||||
classes := GetClasses(parent)
|
||||
|
||||
// Check for common semantic section classes
|
||||
for _, class := range []string{"hero", "services", "nav", "navbar", "footer", "about", "contact", "testimonial"} {
|
||||
if ContainsClass(classes, class) {
|
||||
return class
|
||||
}
|
||||
}
|
||||
|
||||
// Check for semantic HTML elements
|
||||
switch parent.Data {
|
||||
case "nav":
|
||||
return "nav"
|
||||
case "header":
|
||||
return "header"
|
||||
case "footer":
|
||||
return "footer"
|
||||
case "main":
|
||||
return "main"
|
||||
case "aside":
|
||||
return "aside"
|
||||
}
|
||||
|
||||
parent = parent.Parent
|
||||
}
|
||||
|
||||
return "content"
|
||||
// getFileName extracts filename without extension for ID prefix
|
||||
func (g *IDGenerator) getFileName(filePath string) string {
|
||||
base := filepath.Base(filePath)
|
||||
return strings.TrimSuffix(base, filepath.Ext(base))
|
||||
}
|
||||
|
||||
// getPurpose determines the purpose/role of the element
|
||||
func (g *IDGenerator) getPurpose(node *html.Node) string {
|
||||
tag := strings.ToLower(node.Data)
|
||||
// getPrimaryClass returns the first meaningful (non-insertr) CSS class
|
||||
func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
|
||||
classes := GetClasses(node)
|
||||
|
||||
// Check for specific CSS classes that indicate purpose
|
||||
for _, class := range classes {
|
||||
switch {
|
||||
case strings.Contains(class, "title"):
|
||||
return "title"
|
||||
case strings.Contains(class, "headline"):
|
||||
return "headline"
|
||||
case strings.Contains(class, "description"):
|
||||
return "description"
|
||||
case strings.Contains(class, "subtitle"):
|
||||
return "subtitle"
|
||||
case strings.Contains(class, "cta"):
|
||||
return "cta"
|
||||
case strings.Contains(class, "button"):
|
||||
return "button"
|
||||
case strings.Contains(class, "logo"):
|
||||
return "logo"
|
||||
case strings.Contains(class, "lead"):
|
||||
return "lead"
|
||||
if class != "insertr" && class != "" {
|
||||
return class
|
||||
}
|
||||
}
|
||||
|
||||
// Infer purpose from HTML tag
|
||||
switch tag {
|
||||
case "h1":
|
||||
return "title"
|
||||
case "h2":
|
||||
return "subtitle"
|
||||
case "h3", "h4", "h5", "h6":
|
||||
return "heading"
|
||||
case "p":
|
||||
return "text"
|
||||
case "a":
|
||||
return "link"
|
||||
case "button":
|
||||
return "button"
|
||||
default:
|
||||
return "content"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// getContentHash creates a short hash of the content for ID generation
|
||||
func (g *IDGenerator) getContentHash(node *html.Node) string {
|
||||
text := extractTextContent(node)
|
||||
|
||||
// Create hash of the text content
|
||||
hash := fmt.Sprintf("%x", sha1.Sum([]byte(text)))
|
||||
|
||||
// Return first 6 characters for brevity
|
||||
return hash[:6]
|
||||
// getElementKey creates a key for tracking element counts
|
||||
func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
|
||||
if primaryClass != "" {
|
||||
return fmt.Sprintf("%s-%s", fileName, primaryClass)
|
||||
}
|
||||
return fmt.Sprintf("%s-%s", fileName, tag)
|
||||
}
|
||||
|
||||
// createBaseID creates the base ID from components
|
||||
func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
|
||||
parts := []string{}
|
||||
|
||||
// Add context if meaningful
|
||||
if context != "content" {
|
||||
parts = append(parts, context)
|
||||
}
|
||||
|
||||
// Add purpose
|
||||
parts = append(parts, purpose)
|
||||
|
||||
// Always add content hash for uniqueness
|
||||
parts = append(parts, contentHash)
|
||||
|
||||
baseID := strings.Join(parts, "-")
|
||||
|
||||
// Clean up the ID
|
||||
baseID = regexp.MustCompile(`-+`).ReplaceAllString(baseID, "-")
|
||||
baseID = strings.Trim(baseID, "-")
|
||||
|
||||
// Ensure it's not empty
|
||||
if baseID == "" {
|
||||
baseID = fmt.Sprintf("content-%s", contentHash)
|
||||
}
|
||||
|
||||
return baseID
|
||||
// getElementIndex returns the position index for this element type in the file
|
||||
func (g *IDGenerator) getElementIndex(elementKey string) int {
|
||||
g.elementCounts[elementKey]++
|
||||
return g.elementCounts[elementKey]
|
||||
}
|
||||
|
||||
// ensureUnique makes sure the ID is unique by adding a suffix if needed
|
||||
func (g *IDGenerator) ensureUnique(baseID string) string {
|
||||
if !g.usedIDs[baseID] {
|
||||
g.usedIDs[baseID] = true
|
||||
return baseID
|
||||
// buildPrefix creates human-readable prefix for the ID
|
||||
func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
|
||||
var parts []string
|
||||
parts = append(parts, fileName)
|
||||
|
||||
if primaryClass != "" {
|
||||
parts = append(parts, primaryClass)
|
||||
} else {
|
||||
parts = append(parts, tag)
|
||||
}
|
||||
|
||||
// If base ID is taken, add a hash suffix
|
||||
hash := fmt.Sprintf("%x", sha1.Sum([]byte(baseID)))[:6]
|
||||
uniqueID := fmt.Sprintf("%s-%s", baseID, hash)
|
||||
// Only add index if it's not the first element of this type
|
||||
if index > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d", index))
|
||||
}
|
||||
|
||||
g.usedIDs[uniqueID] = true
|
||||
return uniqueID
|
||||
return strings.Join(parts, "-")
|
||||
}
|
||||
|
||||
// createSignature creates a unique signature for collision resistance
|
||||
func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
|
||||
// Minimal signature for uniqueness
|
||||
tag := node.Data
|
||||
classes := strings.Join(GetClasses(node), " ")
|
||||
domPath := g.getSimpleDOMPath(node)
|
||||
|
||||
return fmt.Sprintf("%s|%s|%s|%s", filePath, domPath, tag, classes)
|
||||
}
|
||||
|
||||
// getSimpleDOMPath creates a simple DOM path for uniqueness
|
||||
func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
|
||||
var pathParts []string
|
||||
current := node
|
||||
depth := 0
|
||||
|
||||
for current != nil && current.Type == html.ElementNode && depth < 5 {
|
||||
part := current.Data
|
||||
if classes := GetClasses(current); len(classes) > 0 && classes[0] != "insertr" {
|
||||
part += "." + classes[0]
|
||||
}
|
||||
pathParts = append([]string{part}, pathParts...)
|
||||
current = current.Parent
|
||||
depth++
|
||||
}
|
||||
|
||||
return strings.Join(pathParts, ">")
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ func (p *Parser) createElement(node *html.Node, filePath string, classes []strin
|
||||
// Resolve content ID (existing or generated)
|
||||
contentID, hasExistingID := p.resolveContentID(node)
|
||||
if !hasExistingID {
|
||||
contentID = p.idGenerator.Generate(node)
|
||||
contentID = p.idGenerator.Generate(node, filePath)
|
||||
}
|
||||
|
||||
// Detect content type
|
||||
|
||||
Reference in New Issue
Block a user