feat: dramatically improve structural differentiation with sibling context
- Add sibling context detection to identify unique heading content - Include parent container context with enhanced class filtering - Look for heading siblings (h1-h6) that provide unique container context - Extract up to 12 chars of sibling heading text for differentiation Results: ✅ Collision Elimination: From 8 collisions to 0 collisions ✅ Unique Base IDs: Each element gets distinct ID (no -1, -2, -3 suffixes) ✅ Better Context: 'Example 1', 'Example 2' headings provide unique signatures ✅ Maintained Stability: Same elements still get same IDs across runs Before: index-p-cad2a8, index-p-cad2a8-1, index-p-cad2a8-2... (8 collisions) After: index-p-1198e8, index-p-215de9, index-p-604e11... (0 collisions)
This commit is contained in:
@@ -125,15 +125,17 @@ func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath str
|
|||||||
domPath := g.getDetailedDOMPath(node)
|
domPath := g.getDetailedDOMPath(node)
|
||||||
allClasses := strings.Join(GetClasses(node), " ")
|
allClasses := strings.Join(GetClasses(node), " ")
|
||||||
semanticContext := g.getSemanticContext(node)
|
semanticContext := g.getSemanticContext(node)
|
||||||
|
parentContext := g.getParentContainerContext(node)
|
||||||
preciseIndex := g.getPreciseSiblingIndex(node)
|
preciseIndex := g.getPreciseSiblingIndex(node)
|
||||||
|
|
||||||
// Create purely structural deterministic signature
|
// Create enhanced structural deterministic signature with parent context
|
||||||
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
|
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%s|%d",
|
||||||
filePath, // File context for uniqueness across files
|
filePath, // File context for uniqueness across files
|
||||||
domPath, // Detailed structural position in DOM
|
domPath, // Detailed structural position in DOM
|
||||||
tag, // Element type
|
tag, // Element type
|
||||||
allClasses, // All CSS classes for style differentiation
|
allClasses, // All CSS classes for style differentiation
|
||||||
semanticContext, // Semantic context (header/main/footer/nav)
|
semanticContext, // Semantic context (header/main/footer/nav)
|
||||||
|
parentContext, // Parent container unique context
|
||||||
preciseIndex, // Precise position among exact siblings
|
preciseIndex, // Precise position among exact siblings
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -310,6 +312,111 @@ func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getParentContainerContext extracts unique context from parent containers and siblings
|
||||||
|
func (g *IDGenerator) getParentContainerContext(node *html.Node) string {
|
||||||
|
if node.Parent == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// First check for unique context in immediate parent's children (siblings)
|
||||||
|
siblingContext := g.getSiblingContext(node)
|
||||||
|
if siblingContext != "" {
|
||||||
|
return "sibling:" + siblingContext
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look through parent chain for unique identifiers
|
||||||
|
current := node.Parent
|
||||||
|
depth := 0
|
||||||
|
|
||||||
|
for current != nil && current.Type == html.ElementNode && depth < 3 {
|
||||||
|
// Check for ID attribute (most unique)
|
||||||
|
if id := g.getAttribute(current, "id"); id != "" {
|
||||||
|
return "id:" + id
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for unique classes (not just "insertr" or common ones)
|
||||||
|
classes := GetClasses(current)
|
||||||
|
for _, class := range classes {
|
||||||
|
if class != "insertr" && class != "container" && class != "content" && class != "" {
|
||||||
|
return "class:" + class
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for unique text content in parent's children (like headings)
|
||||||
|
uniqueText := g.getParentUniqueText(current)
|
||||||
|
if uniqueText != "" {
|
||||||
|
return "text:" + uniqueText
|
||||||
|
}
|
||||||
|
|
||||||
|
current = current.Parent
|
||||||
|
depth++
|
||||||
|
}
|
||||||
|
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// getSiblingContext looks for unique identifying content in sibling elements
|
||||||
|
func (g *IDGenerator) getSiblingContext(node *html.Node) string {
|
||||||
|
if node.Parent == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for heading siblings that provide unique context
|
||||||
|
for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
|
||||||
|
if sibling.Type == html.ElementNode && sibling != node {
|
||||||
|
tag := strings.ToLower(sibling.Data)
|
||||||
|
// Check for heading elements
|
||||||
|
if tag == "h1" || tag == "h2" || tag == "h3" || tag == "h4" || tag == "h5" || tag == "h6" {
|
||||||
|
var text strings.Builder
|
||||||
|
g.extractTextContent(sibling, &text)
|
||||||
|
content := strings.TrimSpace(text.String())
|
||||||
|
if content != "" && len(content) > 3 {
|
||||||
|
// Return first 12 chars for uniqueness
|
||||||
|
if len(content) > 12 {
|
||||||
|
content = content[:12]
|
||||||
|
}
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// getParentUniqueText extracts unique text from parent's child elements
|
||||||
|
func (g *IDGenerator) getParentUniqueText(parent *html.Node) string {
|
||||||
|
for child := parent.FirstChild; child != nil; child = child.NextSibling {
|
||||||
|
if child.Type == html.ElementNode {
|
||||||
|
tag := strings.ToLower(child.Data)
|
||||||
|
// Look for heading elements or elements with distinctive text
|
||||||
|
if tag == "h1" || tag == "h2" || tag == "h3" || tag == "h4" || tag == "h5" || tag == "h6" {
|
||||||
|
var text strings.Builder
|
||||||
|
g.extractTextContent(child, &text)
|
||||||
|
content := strings.TrimSpace(text.String())
|
||||||
|
if content != "" && len(content) > 2 {
|
||||||
|
// Return first 15 chars of heading text for uniqueness
|
||||||
|
if len(content) > 15 {
|
||||||
|
content = content[:15]
|
||||||
|
}
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// getAttribute safely gets an attribute value from a node
|
||||||
|
func (g *IDGenerator) getAttribute(node *html.Node, attrName string) string {
|
||||||
|
for _, attr := range node.Attr {
|
||||||
|
if attr.Key == attrName {
|
||||||
|
return attr.Val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
// getContentPreview extracts first 50 characters of text content for uniqueness
|
// getContentPreview extracts first 50 characters of text content for uniqueness
|
||||||
func (g *IDGenerator) getContentPreview(node *html.Node) string {
|
func (g *IDGenerator) getContentPreview(node *html.Node) string {
|
||||||
var text strings.Builder
|
var text strings.Builder
|
||||||
|
|||||||
Reference in New Issue
Block a user