Fix template deduplication by separating structure comparison from content storage

- Replace content-aware extractCleanTemplate with structure-only extractStructureSignature for template comparison
- Add extractTemplateForStorage to preserve actual content for meaningful template display
- Update generateTemplateSignature to compare element structure and classes only, ignoring text content
- Remove redundant extractClassSignature function (functionality moved to extractStructureSignature)
- Resolves an issue where identical DOM structures created multiple templates due to content differences
- Knowledge cards and other collections now deduplicate correctly to a single template each, while preserving content for previews
This commit is contained in:
2025-11-01 23:09:46 +01:00
parent 163cbf7eea
commit 16ad759880
2 changed files with 38 additions and 56 deletions

View File

@@ -3,7 +3,6 @@ package engine
import (
"context"
"fmt"
"sort"
"strings"
"time"
@@ -86,7 +85,7 @@ func (e *ContentEngine) extractAndStoreTemplatesAndItems(collectionNode *html.No
templateIndex := 0
for child := collectionNode.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
templateHTML := e.extractCleanTemplate(child)
templateHTML := e.extractTemplateForStorage(child)
templateSignature := e.generateTemplateSignature(child)
// Check if we've already seen this exact template structure + styling
@@ -233,7 +232,7 @@ func (e *ContentEngine) processChildElementsAsContent(childElement *html.Node, s
// Extract the content
htmlContent := e.extractHTMLContent(n)
template := e.extractCleanTemplate(n)
template := e.extractTemplateForStorage(n)
// Store content entry
contentEntries = append(contentEntries, ContentEntry{
@@ -476,42 +475,10 @@ func (e *ContentEngine) cloneNode(node *html.Node) *html.Node {
}
// generateTemplateSignature creates a unique signature for template comparison
// This combines structural HTML + class-based styling differences
// This is purely structure + class based, completely ignoring content
func (e *ContentEngine) generateTemplateSignature(element *html.Node) string {
// Get the clean template HTML (structure)
structuralHTML := e.extractCleanTemplate(element)
// Extract class-based styling signature
stylingSignature := e.extractClassSignature(element)
// Combine both for a unique signature
return fmt.Sprintf("%s|%s", structuralHTML, stylingSignature)
}
// extractClassSignature recursively extracts and normalizes class attributes
func (e *ContentEngine) extractClassSignature(element *html.Node) string {
var signature strings.Builder
e.walkNodes(element, func(n *html.Node) {
if n.Type == html.ElementNode {
// Get classes for this element
classes := GetClasses(n)
if len(classes) > 0 {
// Sort classes for consistent comparison
sortedClasses := make([]string, len(classes))
copy(sortedClasses, classes)
sort.Strings(sortedClasses)
// Add to signature: element[class1,class2,...]
signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ",")))
} else {
// Element with no classes
signature.WriteString(fmt.Sprintf("%s[];", n.Data))
}
}
})
return signature.String()
// Get content-agnostic structure signature
return e.extractStructureSignature(element)
}
// generateTemplateNameFromSignature creates human-readable template names