Fix template deduplication by separating structure comparison from content storage
- Replace the content-aware extractCleanTemplate with a structure-only extractStructureSignature for template comparison.
- Add extractTemplateForStorage to preserve the actual content, so stored templates display something meaningful.
- Update generateTemplateSignature to compare purely by structure, ignoring text content.
- Remove the redundant extractClassSignature function (its functionality moved into extractStructureSignature).
- Resolves the issue where identical DOM structures produced multiple templates because their content differed.
- Knowledge cards and other collections now deduplicate correctly to a single template while preserving content for previews.
This commit is contained in:
@@ -3,7 +3,6 @@ package engine
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -86,7 +85,7 @@ func (e *ContentEngine) extractAndStoreTemplatesAndItems(collectionNode *html.No
|
||||
templateIndex := 0
|
||||
for child := collectionNode.FirstChild; child != nil; child = child.NextSibling {
|
||||
if child.Type == html.ElementNode {
|
||||
templateHTML := e.extractCleanTemplate(child)
|
||||
templateHTML := e.extractTemplateForStorage(child)
|
||||
templateSignature := e.generateTemplateSignature(child)
|
||||
|
||||
// Check if we've already seen this exact template structure + styling
|
||||
@@ -233,7 +232,7 @@ func (e *ContentEngine) processChildElementsAsContent(childElement *html.Node, s
|
||||
|
||||
// Extract the content
|
||||
htmlContent := e.extractHTMLContent(n)
|
||||
template := e.extractCleanTemplate(n)
|
||||
template := e.extractTemplateForStorage(n)
|
||||
|
||||
// Store content entry
|
||||
contentEntries = append(contentEntries, ContentEntry{
|
||||
@@ -476,42 +475,10 @@ func (e *ContentEngine) cloneNode(node *html.Node) *html.Node {
|
||||
}
|
||||
|
||||
// generateTemplateSignature creates a unique signature for template comparison
|
||||
// This combines structural HTML + class-based styling differences
|
||||
// This is purely structure + class based, completely ignoring content
|
||||
func (e *ContentEngine) generateTemplateSignature(element *html.Node) string {
|
||||
// Get the clean template HTML (structure)
|
||||
structuralHTML := e.extractCleanTemplate(element)
|
||||
|
||||
// Extract class-based styling signature
|
||||
stylingSignature := e.extractClassSignature(element)
|
||||
|
||||
// Combine both for a unique signature
|
||||
return fmt.Sprintf("%s|%s", structuralHTML, stylingSignature)
|
||||
}
|
||||
|
||||
// extractClassSignature recursively extracts and normalizes class attributes
|
||||
func (e *ContentEngine) extractClassSignature(element *html.Node) string {
|
||||
var signature strings.Builder
|
||||
|
||||
e.walkNodes(element, func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
// Get classes for this element
|
||||
classes := GetClasses(n)
|
||||
if len(classes) > 0 {
|
||||
// Sort classes for consistent comparison
|
||||
sortedClasses := make([]string, len(classes))
|
||||
copy(sortedClasses, classes)
|
||||
sort.Strings(sortedClasses)
|
||||
|
||||
// Add to signature: element[class1,class2,...]
|
||||
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
|
||||
} else {
|
||||
// Element with no classes
|
||||
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return signature.String()
|
||||
// Get content-agnostic structure signature
|
||||
return e.extractStructureSignature(element)
|
||||
}
|
||||
|
||||
// generateTemplateNameFromSignature creates human-readable template names
|
||||
|
||||
@@ -2,7 +2,9 @@ package engine
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"slices"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
@@ -62,31 +64,44 @@ func (e *ContentEngine) extractOriginalTemplate(node *html.Node) string {
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// extractCleanTemplate extracts a clean template without data-content-id attributes and with placeholder content. Used for collection template variants.
|
||||
func (e *ContentEngine) extractCleanTemplate(node *html.Node) string {
|
||||
// extractStructureSignature creates a content-agnostic signature for template comparison
|
||||
// This only considers DOM structure and class attributes, completely ignoring text content
|
||||
func (e *ContentEngine) extractStructureSignature(node *html.Node) string {
|
||||
var signature strings.Builder
|
||||
|
||||
e.walkNodes(node, func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
// Get classes for this element
|
||||
classes := GetClasses(n)
|
||||
if len(classes) > 0 {
|
||||
// Sort classes for consistent comparison
|
||||
sortedClasses := make([]string, len(classes))
|
||||
copy(sortedClasses, classes)
|
||||
sort.Strings(sortedClasses)
|
||||
|
||||
// Add to signature: element[class1,class2,...]
|
||||
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
|
||||
} else {
|
||||
// Element with no classes
|
||||
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
|
||||
}
|
||||
}
|
||||
// Completely ignore text nodes and their content
|
||||
})
|
||||
|
||||
return signature.String()
|
||||
}
|
||||
|
||||
// extractTemplateForStorage extracts template HTML while preserving content but removing data-content-id attributes
|
||||
func (e *ContentEngine) extractTemplateForStorage(node *html.Node) string {
|
||||
// Clone the node to avoid modifying the original
|
||||
clonedNode := e.cloneNode(node)
|
||||
|
||||
// Remove all data-content-id attributes and replace content with placeholders
|
||||
// Remove all data-content-id attributes but preserve all content
|
||||
e.walkNodes(clonedNode, func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
// Remove data-content-id attribute
|
||||
e.removeAttribute(n, "data-content-id")
|
||||
|
||||
// If this is an .insertr element, replace content with placeholder
|
||||
if e.hasClass(n, "insertr") {
|
||||
placeholderText := e.getPlaceholderForElement(n.Data)
|
||||
// Clear existing children and add placeholder text
|
||||
for child := n.FirstChild; child != nil; {
|
||||
next := child.NextSibling
|
||||
n.RemoveChild(child)
|
||||
child = next
|
||||
}
|
||||
n.AppendChild(&html.Node{
|
||||
Type: html.TextNode,
|
||||
Data: placeholderText,
|
||||
})
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user