Fix template deduplication by separating structure comparison from content storage

- Replace content-aware extractCleanTemplate with structure-only extractStructureSignature for template comparison
- Add extractTemplateForStorage to preserve actual content for meaningful template display
- Update generateTemplateSignature to use purely structural comparison ignoring text content
- Remove redundant extractClassSignature function (functionality moved to extractStructureSignature)
- Resolves issue where identical DOM structures created multiple templates due to content differences
- Knowledge cards and other collections now correctly deduplicate to single templates while preserving content for previews
This commit is contained in:
2025-11-01 23:09:46 +01:00
parent 163cbf7eea
commit 16ad759880
2 changed files with 38 additions and 56 deletions

View File

@@ -2,7 +2,9 @@ package engine
import (
"context"
"fmt"
"slices"
"sort"
"strings"
"golang.org/x/net/html"
@@ -62,31 +64,44 @@ func (e *ContentEngine) extractOriginalTemplate(node *html.Node) string {
return buf.String()
}
// extractCleanTemplate extracts a clean template without data-content-id attributes and with placeholder content. Used for collection template variants.
func (e *ContentEngine) extractCleanTemplate(node *html.Node) string {
// extractStructureSignature builds a content-agnostic signature used to compare templates.
//
// For every element node that walkNodes visits, one entry of the form
// "tag[class1,class2,...];" is appended. Classes are sorted first so that the
// order in which they were written does not influence the result. An element
// without classes contributes "tag[];". Nodes of any other type (text,
// comments, ...) are skipped, so subtrees that differ only in their text
// content produce the same signature.
//
// NOTE(review): entries are appended as a flat sequence with no depth marker;
// presumably differently nested trees with the same visiting order would
// collide — confirm against walkNodes before relying on this for nesting.
func (e *ContentEngine) extractStructureSignature(node *html.Node) string {
	var signature strings.Builder

	e.walkNodes(node, func(n *html.Node) {
		if n.Type != html.ElementNode {
			return
		}

		// Sort a copy so the slice returned by GetClasses is never reordered.
		sorted := append([]string(nil), GetClasses(n)...)
		sort.Strings(sorted)

		// strings.Join of an empty slice is "", so class-less elements
		// naturally render as "tag[];" without a separate branch.
		fmt.Fprintf(&signature, "%s[%s];", n.Data, strings.Join(sorted, ","))
	})

	return signature.String()
}
// extractTemplateForStorage extracts template HTML while preserving content but removing data-content-id attributes
func (e *ContentEngine) extractTemplateForStorage(node *html.Node) string {
// Clone the node to avoid modifying the original
clonedNode := e.cloneNode(node)
// Remove all data-content-id attributes and replace content with placeholders
// Remove all data-content-id attributes but preserve all content
e.walkNodes(clonedNode, func(n *html.Node) {
if n.Type == html.ElementNode {
// Remove data-content-id attribute
e.removeAttribute(n, "data-content-id")
// If this is an .insertr element, replace content with placeholder
if e.hasClass(n, "insertr") {
placeholderText := e.getPlaceholderForElement(n.Data)
// Clear existing children and add placeholder text
for child := n.FirstChild; child != nil; {
next := child.NextSibling
n.RemoveChild(child)
child = next
}
n.AppendChild(&html.Node{
Type: html.TextNode,
Data: placeholderText,
})
}
}
})