diff --git a/internal/engine/id_generator.go b/internal/engine/id_generator.go
index badd7eb..3c504d2 100644
--- a/internal/engine/id_generator.go
+++ b/internal/engine/id_generator.go
@@ -125,15 +125,17 @@ func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath str
 	domPath := g.getDetailedDOMPath(node)
 	allClasses := strings.Join(GetClasses(node), " ")
 	semanticContext := g.getSemanticContext(node)
+	parentContext := g.getParentContainerContext(node)
 	preciseIndex := g.getPreciseSiblingIndex(node)
 
-	// Create purely structural deterministic signature
-	signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
+	// Create enhanced structural deterministic signature with parent context
+	signature := fmt.Sprintf("%s|%s|%s|%s|%s|%s|%d",
 		filePath,        // File context for uniqueness across files
 		domPath,         // Detailed structural position in DOM
 		tag,             // Element type
 		allClasses,      // All CSS classes for style differentiation
 		semanticContext, // Semantic context (header/main/footer/nav)
+		parentContext,   // Parent container unique context
 		preciseIndex,    // Precise position among exact siblings
 	)
 
@@ -310,6 +312,131 @@ func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
 	return true
 }
 
+// getParentContainerContext returns a short, prefixed string describing the
+// surroundings of node, used as one field of the deterministic signature.
+//
+// Sources are tried in order and the first non-empty one wins:
+//   - "sibling:" + text of a heading that shares node's parent
+//   - "id:" + the id attribute of one of up to three element ancestors
+//   - "class:" + the first class on such an ancestor that is not empty,
+//     "insertr", "container" or "content"
+//   - "text:" + text of a heading found among such an ancestor's children
+//
+// It returns "" when node has no parent or nothing above matches.
+//
+// NOTE(review): the "sibling:" and "text:" forms put heading text into the
+// signature, so editing that heading changes the result for nearby nodes.
+func (g *IDGenerator) getParentContainerContext(node *html.Node) string {
+	if node.Parent == nil {
+		return ""
+	}
+
+	// A heading next to the node is the preferred discriminator.
+	if siblingContext := g.getSiblingContext(node); siblingContext != "" {
+		return "sibling:" + siblingContext
+	}
+
+	// Otherwise climb at most three element ancestors; for each one try id,
+	// then class, then heading text before moving further up.
+	current := node.Parent
+	for depth := 0; depth < 3 && current != nil && current.Type == html.ElementNode; depth++ {
+		if id := g.getAttribute(current, "id"); id != "" {
+			return "id:" + id
+		}
+
+		// Skip class names too generic to tell containers apart.
+		for _, class := range GetClasses(current) {
+			if class != "insertr" && class != "container" && class != "content" && class != "" {
+				return "class:" + class
+			}
+		}
+
+		if uniqueText := g.getParentUniqueText(current); uniqueText != "" {
+			return "text:" + uniqueText
+		}
+
+		current = current.Parent
+	}
+
+	return ""
+}
+
+// getSiblingContext returns the text of the first heading (h1-h6) that shares
+// node's parent, is not node itself, and has more than 3 bytes of trimmed
+// text. The result is cut to at most 12 bytes. It returns "" when node has no
+// parent or no such heading exists.
+func (g *IDGenerator) getSiblingContext(node *html.Node) string {
+	if node.Parent == nil {
+		return ""
+	}
+	return g.headingContext(node.Parent, node, 3, 12)
+}
+
+// getParentUniqueText returns the text of the first heading (h1-h6) child of
+// parent that has more than 2 bytes of trimmed text, cut to at most 15 bytes.
+// It returns "" when no such heading exists.
+func (g *IDGenerator) getParentUniqueText(parent *html.Node) string {
+	return g.headingContext(parent, nil, 2, 15)
+}
+
+// headingContext scans the direct children of parent, ignoring skip, for the
+// first h1-h6 element whose trimmed text is longer than minLen bytes and
+// returns that text limited to maxLen bytes. Headings with shorter text are
+// passed over in favour of later ones.
+func (g *IDGenerator) headingContext(parent, skip *html.Node, minLen, maxLen int) string {
+	for child := parent.FirstChild; child != nil; child = child.NextSibling {
+		if child == skip || child.Type != html.ElementNode {
+			continue
+		}
+
+		switch strings.ToLower(child.Data) {
+		case "h1", "h2", "h3", "h4", "h5", "h6":
+		default:
+			continue
+		}
+
+		var text strings.Builder
+		g.extractTextContent(child, &text)
+		content := strings.TrimSpace(text.String())
+		if len(content) > minLen {
+			return truncateAtRuneBoundary(content, maxLen)
+		}
+	}
+
+	return ""
+}
+
+// truncateAtRuneBoundary shortens s to at most limit bytes. When byte limit
+// falls inside a multi-byte UTF-8 sequence the cut moves back to the start of
+// that sequence, so the result never ends in a partial character. Pure ASCII
+// input is cut at exactly limit bytes.
+func truncateAtRuneBoundary(s string, limit int) string {
+	if len(s) <= limit {
+		return s
+	}
+
+	cut := 0
+	// Ranging over a string yields the byte offset at which each rune starts.
+	for i := range s {
+		if i > limit {
+			break
+		}
+		cut = i
+	}
+	return s[:cut]
+}
+
+// getAttribute returns the value of the first attribute on node whose key
+// equals attrName, or "" when node has no such attribute.
+func (g *IDGenerator) getAttribute(node *html.Node, attrName string) string {
+	for _, attr := range node.Attr {
+		if attr.Key == attrName {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
 // getContentPreview extracts first 50 characters of text content for uniqueness
 func (g *IDGenerator) getContentPreview(node *html.Node) string {
 	var text strings.Builder