Fix template deduplication by separating structure comparison from content storage
- Replace content-aware extractCleanTemplate with structure-only extractStructureSignature for template comparison
- Add extractTemplateForStorage to preserve actual content for meaningful template display
- Update generateTemplateSignature to use purely structural comparison ignoring text content
- Remove redundant extractClassSignature function (functionality moved to extractStructureSignature)
- Resolves issue where identical DOM structures created multiple templates due to content differences
- Knowledge cards and other collections now correctly deduplicate to single templates while preserving content for previews
This commit is contained in:
@@ -3,7 +3,6 @@ package engine
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -86,7 +85,7 @@ func (e *ContentEngine) extractAndStoreTemplatesAndItems(collectionNode *html.No
|
||||
templateIndex := 0
|
||||
for child := collectionNode.FirstChild; child != nil; child = child.NextSibling {
|
||||
if child.Type == html.ElementNode {
|
||||
templateHTML := e.extractCleanTemplate(child)
|
||||
templateHTML := e.extractTemplateForStorage(child)
|
||||
templateSignature := e.generateTemplateSignature(child)
|
||||
|
||||
// Check if we've already seen this exact template structure + styling
|
||||
@@ -233,7 +232,7 @@ func (e *ContentEngine) processChildElementsAsContent(childElement *html.Node, s
|
||||
|
||||
// Extract the content
|
||||
htmlContent := e.extractHTMLContent(n)
|
||||
template := e.extractCleanTemplate(n)
|
||||
template := e.extractTemplateForStorage(n)
|
||||
|
||||
// Store content entry
|
||||
contentEntries = append(contentEntries, ContentEntry{
|
||||
@@ -476,42 +475,10 @@ func (e *ContentEngine) cloneNode(node *html.Node) *html.Node {
|
||||
}
|
||||
|
||||
// generateTemplateSignature creates a unique signature for template comparison
|
||||
// This combines structural HTML + class-based styling differences
|
||||
// This is purely structure + class based, completely ignoring content
|
||||
func (e *ContentEngine) generateTemplateSignature(element *html.Node) string {
|
||||
// Get the clean template HTML (structure)
|
||||
structuralHTML := e.extractCleanTemplate(element)
|
||||
|
||||
// Extract class-based styling signature
|
||||
stylingSignature := e.extractClassSignature(element)
|
||||
|
||||
// Combine both for a unique signature
|
||||
return fmt.Sprintf("%s|%s", structuralHTML, stylingSignature)
|
||||
}
|
||||
|
||||
// extractClassSignature recursively extracts and normalizes class attributes
|
||||
func (e *ContentEngine) extractClassSignature(element *html.Node) string {
|
||||
var signature strings.Builder
|
||||
|
||||
e.walkNodes(element, func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
// Get classes for this element
|
||||
classes := GetClasses(n)
|
||||
if len(classes) > 0 {
|
||||
// Sort classes for consistent comparison
|
||||
sortedClasses := make([]string, len(classes))
|
||||
copy(sortedClasses, classes)
|
||||
sort.Strings(sortedClasses)
|
||||
|
||||
// Add to signature: element[class1,class2,...]
|
||||
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
|
||||
} else {
|
||||
// Element with no classes
|
||||
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return signature.String()
|
||||
// Get content-agnostic structure signature
|
||||
return e.extractStructureSignature(element)
|
||||
}
|
||||
|
||||
// generateTemplateNameFromSignature creates human-readable template names
|
||||
|
||||
Reference in New Issue
Block a user