Fix template deduplication by separating structure comparison from content storage
- Replace content-aware extractCleanTemplate with structure-only extractStructureSignature for template comparison
- Add extractTemplateForStorage to preserve actual content for meaningful template display
- Update generateTemplateSignature to use purely structural comparison, ignoring text content
- Remove redundant extractClassSignature function (functionality moved to extractStructureSignature)
- Resolves issue where identical DOM structures created multiple templates due to content differences
- Knowledge cards and other collections now correctly deduplicate to single templates while preserving content for previews
This commit is contained in:
@@ -3,7 +3,6 @@ package engine
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sort"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -86,7 +85,7 @@ func (e *ContentEngine) extractAndStoreTemplatesAndItems(collectionNode *html.No
|
|||||||
templateIndex := 0
|
templateIndex := 0
|
||||||
for child := collectionNode.FirstChild; child != nil; child = child.NextSibling {
|
for child := collectionNode.FirstChild; child != nil; child = child.NextSibling {
|
||||||
if child.Type == html.ElementNode {
|
if child.Type == html.ElementNode {
|
||||||
templateHTML := e.extractCleanTemplate(child)
|
templateHTML := e.extractTemplateForStorage(child)
|
||||||
templateSignature := e.generateTemplateSignature(child)
|
templateSignature := e.generateTemplateSignature(child)
|
||||||
|
|
||||||
// Check if we've already seen this exact template structure + styling
|
// Check if we've already seen this exact template structure + styling
|
||||||
@@ -233,7 +232,7 @@ func (e *ContentEngine) processChildElementsAsContent(childElement *html.Node, s
|
|||||||
|
|
||||||
// Extract the content
|
// Extract the content
|
||||||
htmlContent := e.extractHTMLContent(n)
|
htmlContent := e.extractHTMLContent(n)
|
||||||
template := e.extractCleanTemplate(n)
|
template := e.extractTemplateForStorage(n)
|
||||||
|
|
||||||
// Store content entry
|
// Store content entry
|
||||||
contentEntries = append(contentEntries, ContentEntry{
|
contentEntries = append(contentEntries, ContentEntry{
|
||||||
@@ -476,42 +475,10 @@ func (e *ContentEngine) cloneNode(node *html.Node) *html.Node {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// generateTemplateSignature creates a unique signature for template comparison
|
// generateTemplateSignature creates a unique signature for template comparison
|
||||||
// This combines structural HTML + class-based styling differences
|
// This is purely structure + class based, completely ignoring content
|
||||||
func (e *ContentEngine) generateTemplateSignature(element *html.Node) string {
|
func (e *ContentEngine) generateTemplateSignature(element *html.Node) string {
|
||||||
// Get the clean template HTML (structure)
|
// Get content-agnostic structure signature
|
||||||
structuralHTML := e.extractCleanTemplate(element)
|
return e.extractStructureSignature(element)
|
||||||
|
|
||||||
// Extract class-based styling signature
|
|
||||||
stylingSignature := e.extractClassSignature(element)
|
|
||||||
|
|
||||||
// Combine both for a unique signature
|
|
||||||
return fmt.Sprintf("%s|%s", structuralHTML, stylingSignature)
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractClassSignature recursively extracts and normalizes class attributes
|
|
||||||
func (e *ContentEngine) extractClassSignature(element *html.Node) string {
|
|
||||||
var signature strings.Builder
|
|
||||||
|
|
||||||
e.walkNodes(element, func(n *html.Node) {
|
|
||||||
if n.Type == html.ElementNode {
|
|
||||||
// Get classes for this element
|
|
||||||
classes := GetClasses(n)
|
|
||||||
if len(classes) > 0 {
|
|
||||||
// Sort classes for consistent comparison
|
|
||||||
sortedClasses := make([]string, len(classes))
|
|
||||||
copy(sortedClasses, classes)
|
|
||||||
sort.Strings(sortedClasses)
|
|
||||||
|
|
||||||
// Add to signature: element[class1,class2,...]
|
|
||||||
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
|
|
||||||
} else {
|
|
||||||
// Element with no classes
|
|
||||||
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
return signature.String()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// generateTemplateNameFromSignature creates human-readable template names
|
// generateTemplateNameFromSignature creates human-readable template names
|
||||||
|
|||||||
@@ -2,7 +2,9 @@ package engine
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
@@ -62,31 +64,44 @@ func (e *ContentEngine) extractOriginalTemplate(node *html.Node) string {
|
|||||||
return buf.String()
|
return buf.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractCleanTemplate extracts a clean template without data-content-id attributes and with placeholder content. Used for collection template variants.
|
// extractStructureSignature creates a content-agnostic signature for template comparison
|
||||||
func (e *ContentEngine) extractCleanTemplate(node *html.Node) string {
|
// This only considers DOM structure and class attributes, completely ignoring text content
|
||||||
|
func (e *ContentEngine) extractStructureSignature(node *html.Node) string {
|
||||||
|
var signature strings.Builder
|
||||||
|
|
||||||
|
e.walkNodes(node, func(n *html.Node) {
|
||||||
|
if n.Type == html.ElementNode {
|
||||||
|
// Get classes for this element
|
||||||
|
classes := GetClasses(n)
|
||||||
|
if len(classes) > 0 {
|
||||||
|
// Sort classes for consistent comparison
|
||||||
|
sortedClasses := make([]string, len(classes))
|
||||||
|
copy(sortedClasses, classes)
|
||||||
|
sort.Strings(sortedClasses)
|
||||||
|
|
||||||
|
// Add to signature: element[class1,class2,...]
|
||||||
|
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
|
||||||
|
} else {
|
||||||
|
// Element with no classes
|
||||||
|
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Completely ignore text nodes and their content
|
||||||
|
})
|
||||||
|
|
||||||
|
return signature.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractTemplateForStorage extracts template HTML while preserving content but removing data-content-id attributes
|
||||||
|
func (e *ContentEngine) extractTemplateForStorage(node *html.Node) string {
|
||||||
// Clone the node to avoid modifying the original
|
// Clone the node to avoid modifying the original
|
||||||
clonedNode := e.cloneNode(node)
|
clonedNode := e.cloneNode(node)
|
||||||
|
|
||||||
// Remove all data-content-id attributes and replace content with placeholders
|
// Remove all data-content-id attributes but preserve all content
|
||||||
e.walkNodes(clonedNode, func(n *html.Node) {
|
e.walkNodes(clonedNode, func(n *html.Node) {
|
||||||
if n.Type == html.ElementNode {
|
if n.Type == html.ElementNode {
|
||||||
// Remove data-content-id attribute
|
// Remove data-content-id attribute
|
||||||
e.removeAttribute(n, "data-content-id")
|
e.removeAttribute(n, "data-content-id")
|
||||||
|
|
||||||
// If this is an .insertr element, replace content with placeholder
|
|
||||||
if e.hasClass(n, "insertr") {
|
|
||||||
placeholderText := e.getPlaceholderForElement(n.Data)
|
|
||||||
// Clear existing children and add placeholder text
|
|
||||||
for child := n.FirstChild; child != nil; {
|
|
||||||
next := child.NextSibling
|
|
||||||
n.RemoveChild(child)
|
|
||||||
child = next
|
|
||||||
}
|
|
||||||
n.AppendChild(&html.Node{
|
|
||||||
Type: html.TextNode,
|
|
||||||
Data: placeholderText,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user