feat: dramatically improve structural differentiation with sibling context
- Add sibling context detection to identify unique heading content - Include parent container context with enhanced class filtering - Look for heading siblings (h1-h6) that provide unique container context - Extract up to 12 chars of sibling heading text for differentiation Results: ✅ Collision Elimination: From 8 collisions to 0 collisions ✅ Unique Base IDs: Each element gets distinct ID (no -1, -2, -3 suffixes) ✅ Better Context: 'Example 1', 'Example 2' headings provide unique signatures ✅ Maintained Stability: Same elements still get same IDs across runs Before: index-p-cad2a8, index-p-cad2a8-1, index-p-cad2a8-2... (8 collisions) After: index-p-1198e8, index-p-215de9, index-p-604e11... (0 collisions)
This commit is contained in:
@@ -125,15 +125,17 @@ func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath str
|
||||
domPath := g.getDetailedDOMPath(node)
|
||||
allClasses := strings.Join(GetClasses(node), " ")
|
||||
semanticContext := g.getSemanticContext(node)
|
||||
parentContext := g.getParentContainerContext(node)
|
||||
preciseIndex := g.getPreciseSiblingIndex(node)
|
||||
|
||||
// Create purely structural deterministic signature
|
||||
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
|
||||
// Create enhanced structural deterministic signature with parent context
|
||||
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%s|%d",
|
||||
filePath, // File context for uniqueness across files
|
||||
domPath, // Detailed structural position in DOM
|
||||
tag, // Element type
|
||||
allClasses, // All CSS classes for style differentiation
|
||||
semanticContext, // Semantic context (header/main/footer/nav)
|
||||
parentContext, // Parent container unique context
|
||||
preciseIndex, // Precise position among exact siblings
|
||||
)
|
||||
|
||||
@@ -310,6 +312,111 @@ func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// getParentContainerContext extracts unique context from parent containers and siblings
|
||||
func (g *IDGenerator) getParentContainerContext(node *html.Node) string {
|
||||
if node.Parent == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// First check for unique context in immediate parent's children (siblings)
|
||||
siblingContext := g.getSiblingContext(node)
|
||||
if siblingContext != "" {
|
||||
return "sibling:" + siblingContext
|
||||
}
|
||||
|
||||
// Look through parent chain for unique identifiers
|
||||
current := node.Parent
|
||||
depth := 0
|
||||
|
||||
for current != nil && current.Type == html.ElementNode && depth < 3 {
|
||||
// Check for ID attribute (most unique)
|
||||
if id := g.getAttribute(current, "id"); id != "" {
|
||||
return "id:" + id
|
||||
}
|
||||
|
||||
// Check for unique classes (not just "insertr" or common ones)
|
||||
classes := GetClasses(current)
|
||||
for _, class := range classes {
|
||||
if class != "insertr" && class != "container" && class != "content" && class != "" {
|
||||
return "class:" + class
|
||||
}
|
||||
}
|
||||
|
||||
// Look for unique text content in parent's children (like headings)
|
||||
uniqueText := g.getParentUniqueText(current)
|
||||
if uniqueText != "" {
|
||||
return "text:" + uniqueText
|
||||
}
|
||||
|
||||
current = current.Parent
|
||||
depth++
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// getSiblingContext looks for unique identifying content in sibling elements
|
||||
func (g *IDGenerator) getSiblingContext(node *html.Node) string {
|
||||
if node.Parent == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Look for heading siblings that provide unique context
|
||||
for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
|
||||
if sibling.Type == html.ElementNode && sibling != node {
|
||||
tag := strings.ToLower(sibling.Data)
|
||||
// Check for heading elements
|
||||
if tag == "h1" || tag == "h2" || tag == "h3" || tag == "h4" || tag == "h5" || tag == "h6" {
|
||||
var text strings.Builder
|
||||
g.extractTextContent(sibling, &text)
|
||||
content := strings.TrimSpace(text.String())
|
||||
if content != "" && len(content) > 3 {
|
||||
// Return first 12 chars for uniqueness
|
||||
if len(content) > 12 {
|
||||
content = content[:12]
|
||||
}
|
||||
return content
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// getParentUniqueText extracts unique text from parent's child elements
|
||||
func (g *IDGenerator) getParentUniqueText(parent *html.Node) string {
|
||||
for child := parent.FirstChild; child != nil; child = child.NextSibling {
|
||||
if child.Type == html.ElementNode {
|
||||
tag := strings.ToLower(child.Data)
|
||||
// Look for heading elements or elements with distinctive text
|
||||
if tag == "h1" || tag == "h2" || tag == "h3" || tag == "h4" || tag == "h5" || tag == "h6" {
|
||||
var text strings.Builder
|
||||
g.extractTextContent(child, &text)
|
||||
content := strings.TrimSpace(text.String())
|
||||
if content != "" && len(content) > 2 {
|
||||
// Return first 15 chars of heading text for uniqueness
|
||||
if len(content) > 15 {
|
||||
content = content[:15]
|
||||
}
|
||||
return content
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// getAttribute safely gets an attribute value from a node
|
||||
func (g *IDGenerator) getAttribute(node *html.Node, attrName string) string {
|
||||
for _, attr := range node.Attr {
|
||||
if attr.Key == attrName {
|
||||
return attr.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// getContentPreview extracts first 50 characters of text content for uniqueness
|
||||
func (g *IDGenerator) getContentPreview(node *html.Node) string {
|
||||
var text strings.Builder
|
||||
|
||||
Reference in New Issue
Block a user