Files
insertr/internal/engine/id_generator.go
Joakim eb812fa78a feat: dramatically improve structural differentiation with sibling context
- Add sibling context detection to identify unique heading content
- Include parent container context with enhanced class filtering
- Look for heading siblings (h1-h6) that provide unique container context
- Extract up to 12 chars of sibling heading text for differentiation

Results:
 Collision Elimination: From 8 collisions to 0 collisions
 Unique Base IDs: Each element gets distinct ID (no -1, -2, -3 suffixes)
 Better Context: 'Example 1', 'Example 2' headings provide unique signatures
 Maintained Stability: Same elements still get same IDs across runs

Before: index-p-cad2a8, index-p-cad2a8-1, index-p-cad2a8-2... (8 collisions)
After: index-p-1198e8, index-p-215de9, index-p-604e11... (0 collisions)
2025-09-20 21:49:48 +02:00

511 lines
14 KiB
Go

package engine
import (
"crypto/sha256"
"fmt"
"path/filepath"
"strings"
"golang.org/x/net/html"
)
// IDGenerator produces content IDs for HTML elements and remembers which IDs
// it has already handed out so that repeated results can be disambiguated.
type IDGenerator struct {
// usedIDs holds every ID returned by Generate on this instance; Generate
// consults it to detect collisions before returning a new ID.
usedIDs map[string]bool
// elementCounts holds a running counter per element key; it is advanced by
// getElementIndex (track counts per file+type for indexing).
elementCounts map[string]int
}
// NewIDGenerator returns a ready-to-use IDGenerator whose bookkeeping maps
// are allocated and empty.
func NewIDGenerator() *IDGenerator {
	gen := new(IDGenerator)
	gen.usedIDs = map[string]bool{}
	gen.elementCounts = map[string]int{}
	return gen
}
// Generate returns a content ID for node. The ID consists of a readable
// prefix (file name plus the element's primary class, or its tag when it has
// no such class) followed by a short hash of the element's structural
// signature.
//
// If this generator has already returned the same ID, a numeric suffix
// ("-1", "-2", ...) is appended until an unused ID is found. The returned ID
// is recorded as used.
func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
	// Readable part: "<file>-<class>" or "<file>-<tag>".
	prefix := g.buildDeterministicPrefix(
		g.getFileName(filePath),
		strings.ToLower(node.Data),
		g.getPrimaryClass(node),
	)

	// Hash part derived from the element's position and context.
	baseID := prefix + "-" + g.createDeterministicSignature(node, filePath)

	// Disambiguate within this generator's lifetime.
	candidate := baseID
	for n := 1; g.usedIDs[candidate]; n++ {
		candidate = fmt.Sprintf("%s-%d", baseID, n)
	}
	g.usedIDs[candidate] = true
	return candidate
}
// getFileName returns the last element of filePath with its final extension
// (if any) removed, e.g. "pages/index.html" becomes "index".
func (g *IDGenerator) getFileName(filePath string) string {
	name := filepath.Base(filePath)
	if ext := filepath.Ext(name); ext != "" {
		name = strings.TrimSuffix(name, ext)
	}
	return name
}
// getPrimaryClass returns the first class of node that is neither empty nor
// the "insertr" marker class. It returns "" when no such class exists.
func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
	for _, candidate := range GetClasses(node) {
		switch candidate {
		case "", "insertr":
			// Not meaningful for identification; keep looking.
			continue
		}
		return candidate
	}
	return ""
}
// getElementKey builds the counter key "<fileName>-<primaryClass>", falling
// back to "<fileName>-<tag>" when primaryClass is empty.
func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
	discriminator := tag
	if primaryClass != "" {
		discriminator = primaryClass
	}
	return fmt.Sprintf("%s-%s", fileName, discriminator)
}
// getElementIndex advances the counter stored under elementKey and returns
// the new value, so the first call for a given key yields 1.
func (g *IDGenerator) getElementIndex(elementKey string) int {
	next := g.elementCounts[elementKey] + 1
	g.elementCounts[elementKey] = next
	return next
}
// buildDeterministicPrefix returns the human-readable ID prefix
// "<fileName>-<primaryClass>", or "<fileName>-<tag>" when primaryClass is
// empty. No running index is included; uniqueness is left to the hash suffix
// added by the caller.
func (g *IDGenerator) buildDeterministicPrefix(fileName, tag, primaryClass string) string {
	label := primaryClass
	if label == "" {
		label = tag
	}
	return strings.Join([]string{fileName, label}, "-")
}
// buildPrefix is the legacy prefix builder. It returns
// "<fileName>-<primaryClass>" (or "<fileName>-<tag>" when primaryClass is
// empty) and appends "-<index>" only when index is greater than 1.
func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
	label := primaryClass
	if label == "" {
		label = tag
	}

	segments := []string{fileName, label}
	// The first element of a kind keeps the bare prefix.
	if index > 1 {
		segments = append(segments, fmt.Sprintf("%d", index))
	}
	return strings.Join(segments, "-")
}
// createDeterministicSignature returns a 6-character hexadecimal suffix
// derived from the SHA-256 hash of the element's signature string.
//
// The signature string is the "|"-joined sequence of: the file path, the
// detailed DOM path, the node's tag, all of its classes (space-separated),
// its semantic context, its parent-container context, and its index among
// siblings with the same tag and classes.
func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath string) string {
	fields := []string{
		filePath,                            // file context
		g.getDetailedDOMPath(node),          // structural position in the DOM
		node.Data,                           // element type
		strings.Join(GetClasses(node), " "), // every CSS class on the element
		g.getSemanticContext(node),          // header/main/footer/nav/aside/content
		g.getParentContainerContext(node),   // distinguishing context from parents/siblings
		fmt.Sprintf("%d", g.getPreciseSiblingIndex(node)), // position among matching siblings
	}

	digest := sha256.Sum256([]byte(strings.Join(fields, "|")))
	// Three bytes render as the first six hex characters of the digest.
	return fmt.Sprintf("%x", digest[:3])
}
// createSignature is retained only for compatibility: it ignores both
// arguments and always returns the empty string.
//
// Deprecated: Generate builds its hash suffix with
// createDeterministicSignature instead.
func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
// This method is kept for compatibility but not used in deterministic generation
return ""
}
// getSimpleDOMPath returns a ">"-separated path covering at most three
// element levels, starting at node and walking upward, rendered outermost
// first. Each segment is the element's tag, followed by "." and its first
// class that is neither empty nor "insertr" when it has one.
func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
	const maxLevels = 3

	// Gather segments innermost-first; they are flipped before joining.
	segments := make([]string, 0, maxLevels)
	for cur := node; cur != nil && cur.Type == html.ElementNode && len(segments) < maxLevels; cur = cur.Parent {
		segment := cur.Data
		for _, class := range GetClasses(cur) {
			if class == "" || class == "insertr" {
				continue
			}
			segment += "." + class
			break // only the first meaningful class is used
		}
		segments = append(segments, segment)
	}

	for lo, hi := 0, len(segments)-1; lo < hi; lo, hi = lo+1, hi-1 {
		segments[lo], segments[hi] = segments[hi], segments[lo]
	}
	return strings.Join(segments, ">")
}
// getDetailedDOMPath returns a ">"-separated path covering at most five
// element levels, starting at node and walking upward, rendered outermost
// first. Each segment is the element's tag followed by ".<class>" for every
// class that is neither empty nor "insertr", in their original order.
func (g *IDGenerator) getDetailedDOMPath(node *html.Node) string {
	const maxLevels = 5

	// Gather segments innermost-first; they are flipped before joining.
	segments := make([]string, 0, maxLevels)
	for cur := node; cur != nil && cur.Type == html.ElementNode && len(segments) < maxLevels; cur = cur.Parent {
		segment := cur.Data
		for _, class := range GetClasses(cur) {
			if class == "" || class == "insertr" {
				continue
			}
			segment += "." + class
		}
		segments = append(segments, segment)
	}

	for lo, hi := 0, len(segments)-1; lo < hi; lo, hi = lo+1, hi-1 {
		segments[lo], segments[hi] = segments[hi], segments[lo]
	}
	return strings.Join(segments, ">")
}
// getSemanticContext reports which semantic region node sits in by walking
// its element ancestors from nearest to farthest.
//
// An ancestor whose tag is header, main, footer, nav or aside yields that
// tag. Otherwise an ancestor class containing (case-insensitively) "header",
// "footer", "nav", or "sidebar"/"aside" yields "header", "footer", "nav" or
// "aside" respectively. If no ancestor matches, "content" is returned.
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	for ancestor := node.Parent; ancestor != nil && ancestor.Type == html.ElementNode; ancestor = ancestor.Parent {
		// Semantic tags win over class hints on the same ancestor.
		switch name := strings.ToLower(ancestor.Data); name {
		case "header", "main", "footer", "nav", "aside":
			return name
		}

		// Class hints are substring matches, checked in this fixed order.
		for _, raw := range GetClasses(ancestor) {
			class := strings.ToLower(raw)
			switch {
			case strings.Contains(class, "header"):
				return "header"
			case strings.Contains(class, "footer"):
				return "footer"
			case strings.Contains(class, "nav"):
				return "nav"
			case strings.Contains(class, "sidebar"), strings.Contains(class, "aside"):
				return "aside"
			}
		}
	}
	return "content"
}
// getPreciseSiblingIndex returns the zero-based position of node among the
// children of its parent that are elements with the same tag and the same
// set of classes (compared after sorting, so class order is irrelevant).
// It returns 0 when node has no parent.
func (g *IDGenerator) getPreciseSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	// sortedClasses returns an ascending copy of n's class list, leaving the
	// slice returned by GetClasses untouched.
	sortedClasses := func(n *html.Node) []string {
		src := GetClasses(n)
		out := make([]string, len(src))
		copy(out, src)
		for i := 1; i < len(out); i++ {
			for j := i; j > 0 && out[j-1] > out[j]; j-- {
				out[j-1], out[j] = out[j], out[j-1]
			}
		}
		return out
	}

	want := sortedClasses(node)
	position := 0
	for sib := node.Parent.FirstChild; sib != nil; sib = sib.NextSibling {
		if sib.Type != html.ElementNode || sib.Data != node.Data {
			continue
		}
		if !g.classSlicesEqual(want, sortedClasses(sib)) {
			continue
		}
		if sib == node {
			return position
		}
		position++
	}
	return position
}
// classSlicesEqual reports whether a and b have the same length and the same
// element at every index. Callers that want order-insensitive comparison must
// sort both slices first.
func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
	mismatch := len(a) != len(b)
	for i := 0; !mismatch && i < len(a); i++ {
		mismatch = a[i] != b[i]
	}
	return !mismatch
}
// getParentContainerContext returns a tagged string that helps tell apart
// otherwise identical elements living in different containers.
//
// Sources are tried in this order and the first non-empty one wins:
//   - "sibling:<text>" from a heading sibling of node (see getSiblingContext);
//   - then, for up to three element ancestors starting at the parent:
//     "id:<id>" from the ancestor's id attribute, "class:<class>" from its
//     first class other than "", "insertr", "container" and "content", or
//     "text:<text>" from a heading child (see getParentUniqueText).
//
// It returns "" when node has no parent or nothing distinguishing is found.
func (g *IDGenerator) getParentContainerContext(node *html.Node) string {
	if node.Parent == nil {
		return ""
	}

	if ctx := g.getSiblingContext(node); ctx != "" {
		return "sibling:" + ctx
	}

	const maxLevels = 3
	ancestor := node.Parent
	for level := 0; level < maxLevels; level++ {
		if ancestor == nil || ancestor.Type != html.ElementNode {
			break
		}

		// An id attribute is the strongest identifier.
		if id := g.getAttribute(ancestor, "id"); id != "" {
			return "id:" + id
		}

		// Next best: a class that is not one of the generic ones.
		for _, class := range GetClasses(ancestor) {
			switch class {
			case "", "insertr", "container", "content":
				continue
			}
			return "class:" + class
		}

		// Finally, heading text found among the ancestor's children.
		if heading := g.getParentUniqueText(ancestor); heading != "" {
			return "text:" + heading
		}

		ancestor = ancestor.Parent
	}
	return ""
}
// getSiblingContext returns identifying text taken from the first h1-h6
// sibling of node (node itself is skipped) whose trimmed text is longer than
// three bytes. The text is cut to its first 12 bytes. It returns "" when node
// has no parent or no sibling heading qualifies.
func (g *IDGenerator) getSiblingContext(node *html.Node) string {
	if node.Parent == nil {
		return ""
	}

	const (
		minLen = 4  // headings of three bytes or fewer are ignored
		maxLen = 12 // longer headings are truncated (byte-based cut)
	)

	for sib := node.Parent.FirstChild; sib != nil; sib = sib.NextSibling {
		if sib == node || sib.Type != html.ElementNode {
			continue
		}
		switch strings.ToLower(sib.Data) {
		case "h1", "h2", "h3", "h4", "h5", "h6":
			// heading: inspect its text below
		default:
			continue
		}

		var buf strings.Builder
		g.extractTextContent(sib, &buf)
		heading := strings.TrimSpace(buf.String())
		if len(heading) < minLen {
			continue
		}
		if len(heading) > maxLen {
			heading = heading[:maxLen]
		}
		return heading
	}
	return ""
}
// getParentUniqueText returns text from the first h1-h6 child element of
// parent whose trimmed text is longer than two bytes, cut to its first 15
// bytes. It returns "" when no child heading qualifies.
func (g *IDGenerator) getParentUniqueText(parent *html.Node) string {
	isHeading := func(tag string) bool {
		switch tag {
		case "h1", "h2", "h3", "h4", "h5", "h6":
			return true
		}
		return false
	}

	for child := parent.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode || !isHeading(strings.ToLower(child.Data)) {
			continue
		}

		var buf strings.Builder
		g.extractTextContent(child, &buf)
		text := strings.TrimSpace(buf.String())

		switch {
		case len(text) <= 2:
			// Too short to be distinguishing; try the next heading.
			continue
		case len(text) > 15:
			return text[:15] // byte-based cut
		default:
			return text
		}
	}
	return ""
}
// getAttribute returns the value of the first attribute on node whose key
// equals attrName, or "" when node carries no such attribute.
func (g *IDGenerator) getAttribute(node *html.Node, attrName string) string {
	for i := range node.Attr {
		if a := &node.Attr[i]; a.Key == attrName {
			return a.Val
		}
	}
	return ""
}
// getContentPreview returns a short, single-line preview of node's text.
//
// The text of node and its descendants is concatenated and trimmed, cut to
// at most 50 bytes, and then normalized: newlines and tabs become spaces and
// every run of consecutive spaces is collapsed to a single space.
//
// The cut is byte-based, so it may split a multi-byte character, and because
// normalization happens after the cut the result can be shorter than 50
// bytes.
func (g *IDGenerator) getContentPreview(node *html.Node) string {
	var raw strings.Builder
	g.extractTextContent(node, &raw)

	content := strings.TrimSpace(raw.String())
	if len(content) > 50 {
		content = content[:50]
	}

	// Normalize in one pass. The previous loop searched for a single space
	// and replaced it with a single space, which never terminates once the
	// text contains any space.
	var out strings.Builder
	out.Grow(len(content))
	prevSpace := false
	for i := 0; i < len(content); i++ {
		c := content[i]
		if c == '\n' || c == '\t' {
			c = ' '
		}
		if c == ' ' && prevSpace {
			continue // drop repeated spaces
		}
		prevSpace = c == ' '
		out.WriteByte(c)
	}
	return out.String()
}
// extractTextContent appends to text the data of node, if it is a text node,
// followed by the text of all its descendants in document order.
func (g *IDGenerator) extractTextContent(node *html.Node, text *strings.Builder) {
	if html.TextNode == node.Type {
		text.WriteString(node.Data)
	}

	child := node.FirstChild
	for child != nil {
		g.extractTextContent(child, text)
		child = child.NextSibling
	}
}
// getSiblingIndex returns the zero-based position of node among comparable
// element children of its parent that share node's tag.
//
// When node carries the "insertr" class, every same-tag sibling that also
// carries "insertr" is comparable. Otherwise only same-tag siblings whose
// class list matches node's exactly (see classesMatch) are comparable.
// It returns 0 when node has no parent.
func (g *IDGenerator) getSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	hasInsertr := func(classes []string) bool {
		for _, class := range classes {
			if class == "insertr" {
				return true
			}
		}
		return false
	}

	ownClasses := GetClasses(node)
	nodeIsInsertr := hasInsertr(ownClasses)

	position := 0
	for sib := node.Parent.FirstChild; sib != nil; sib = sib.NextSibling {
		if sib.Type != html.ElementNode || sib.Data != node.Data {
			continue
		}

		sibClasses := GetClasses(sib)
		var counts bool
		if nodeIsInsertr {
			// insertr elements are matched by tag + insertr class only.
			counts = hasInsertr(sibClasses)
		} else {
			// everything else is matched by its exact class list.
			counts = g.classesMatch(ownClasses, sibClasses)
		}
		if !counts {
			continue
		}

		if sib == node {
			return position
		}
		position++
	}
	return position
}
// classesMatch reports whether classes1 and classes2 have the same length and
// hold the same class at every position. The comparison is order-sensitive.
func (g *IDGenerator) classesMatch(classes1, classes2 []string) bool {
	if len(classes1) != len(classes2) {
		return false
	}
	// The lengths are equal here, so indexing classes2 by i is always in
	// range; the former per-iteration bounds check was unreachable.
	for i, class := range classes1 {
		if class != classes2[i] {
			return false
		}
	}
	return true
}