feat: dramatically improve structural differentiation with sibling context

- Add sibling context detection to identify unique heading content
- Include parent container context with enhanced class filtering
- Look for heading siblings (h1-h6) that provide unique container context
- Extract up to 12 bytes (12 ASCII characters) of sibling heading text for differentiation

Results:
 Collision Elimination: From 8 collisions to 0 collisions
 Unique Base IDs: Each element gets distinct ID (no -1, -2, -3 suffixes)
 Better Context: 'Example 1', 'Example 2' headings provide unique signatures
 Maintained Stability: Same elements still get same IDs across runs

Before: index-p-cad2a8, index-p-cad2a8-1, index-p-cad2a8-2... (8 collisions)
After: index-p-1198e8, index-p-215de9, index-p-604e11... (0 collisions)
This commit is contained in:
2025-09-20 21:49:48 +02:00
parent b5225c1388
commit eb812fa78a

View File

@@ -125,15 +125,17 @@ func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath str
domPath := g.getDetailedDOMPath(node)
allClasses := strings.Join(GetClasses(node), " ")
semanticContext := g.getSemanticContext(node)
parentContext := g.getParentContainerContext(node)
preciseIndex := g.getPreciseSiblingIndex(node)
// Create purely structural deterministic signature
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
// Create enhanced structural deterministic signature with parent context
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%s|%d",
filePath, // File context for uniqueness across files
domPath, // Detailed structural position in DOM
tag, // Element type
allClasses, // All CSS classes for style differentiation
semanticContext, // Semantic context (header/main/footer/nav)
parentContext, // Parent container unique context
preciseIndex, // Precise position among exact siblings
)
@@ -310,6 +312,111 @@ func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
return true
}
// getParentContainerContext builds a short, prefixed context string that helps
// tell structurally identical elements apart.
//
// Lookup order:
//  1. Heading text found among node's siblings ("sibling:<text>").
//  2. For up to three element ancestors, nearest first: the ancestor's id
//     attribute ("id:<value>"), then its first class that is not "insertr",
//     "container", "content" or empty ("class:<name>"), then heading text
//     among the ancestor's children ("text:<text>").
//
// It returns "" when node has no parent or when none of the lookups match.
func (g *IDGenerator) getParentContainerContext(node *html.Node) string {
	if node.Parent == nil {
		return ""
	}

	// Sibling headings take priority over anything found on ancestors.
	if fromSibling := g.getSiblingContext(node); fromSibling != "" {
		return "sibling:" + fromSibling
	}

	// Walk at most three levels up, stopping at the first non-element ancestor.
	ancestor := node.Parent
	for level := 0; level < 3; level++ {
		if ancestor == nil || ancestor.Type != html.ElementNode {
			break
		}

		// A non-empty id attribute is checked before anything else.
		if idValue := g.getAttribute(ancestor, "id"); idValue != "" {
			return "id:" + idValue
		}

		// Otherwise use the first class that is not one of the generic names.
		for _, name := range GetClasses(ancestor) {
			switch name {
			case "", "insertr", "container", "content":
				continue
			}
			return "class:" + name
		}

		// Finally fall back to heading text among this ancestor's children.
		if headingText := g.getParentUniqueText(ancestor); headingText != "" {
			return "text:" + headingText
		}

		ancestor = ancestor.Parent
	}

	return ""
}
// getSiblingContext returns a short snippet of heading text taken from the
// elements that share node's parent.
//
// The parent's children are scanned from the first child onward, skipping node
// itself and any non-element nodes. The first h1-h6 element whose
// whitespace-trimmed text (as collected by extractTextContent) is longer than
// 3 bytes is used, cut to at most 12 bytes. Lengths are byte counts, not rune
// counts, so multi-byte UTF-8 text may be cut inside a character.
//
// It returns "" when node has no parent or no such heading exists.
func (g *IDGenerator) getSiblingContext(node *html.Node) string {
	parent := node.Parent
	if parent == nil {
		return ""
	}

	for candidate := parent.FirstChild; candidate != nil; candidate = candidate.NextSibling {
		if candidate == node || candidate.Type != html.ElementNode {
			continue
		}

		// Only heading elements are considered.
		switch strings.ToLower(candidate.Data) {
		case "h1", "h2", "h3", "h4", "h5", "h6":
		default:
			continue
		}

		var buf strings.Builder
		g.extractTextContent(candidate, &buf)
		heading := strings.TrimSpace(buf.String())

		// Headings of 3 bytes or fewer are skipped; keep looking.
		if len(heading) <= 3 {
			continue
		}
		if len(heading) > 12 {
			return heading[:12]
		}
		return heading
	}

	return ""
}
// getParentUniqueText returns a short snippet of heading text found among the
// direct children of parent.
//
// Children are scanned from the first child onward. The first h1-h6 element
// whose whitespace-trimmed text (as collected by extractTextContent) is longer
// than 2 bytes is used, cut to at most 15 bytes. Lengths are byte counts, not
// rune counts, so multi-byte UTF-8 text may be cut inside a character.
//
// It returns "" when no such heading exists.
func (g *IDGenerator) getParentUniqueText(parent *html.Node) string {
	for candidate := parent.FirstChild; candidate != nil; candidate = candidate.NextSibling {
		if candidate.Type != html.ElementNode {
			continue
		}

		// Only heading elements are considered.
		switch strings.ToLower(candidate.Data) {
		case "h1", "h2", "h3", "h4", "h5", "h6":
		default:
			continue
		}

		var buf strings.Builder
		g.extractTextContent(candidate, &buf)
		heading := strings.TrimSpace(buf.String())

		// Headings of 2 bytes or fewer are skipped; keep looking.
		if len(heading) <= 2 {
			continue
		}
		if len(heading) > 15 {
			return heading[:15]
		}
		return heading
	}

	return ""
}
// getAttribute returns the value of the first attribute on node whose key
// equals attrName exactly, or "" when node carries no such attribute.
func (g *IDGenerator) getAttribute(node *html.Node, attrName string) string {
	attrs := node.Attr
	for i := 0; i < len(attrs); i++ {
		if attrs[i].Key != attrName {
			continue
		}
		return attrs[i].Val
	}
	return ""
}
// getContentPreview extracts first 50 characters of text content for uniqueness
func (g *IDGenerator) getContentPreview(node *html.Node) string {
var text strings.Builder