From 16ad759880f9c6e08c7ca3de7ba9ca518b748908 Mon Sep 17 00:00:00 2001 From: Joakim Date: Sat, 1 Nov 2025 23:09:46 +0100 Subject: [PATCH] Fix template deduplication by separating structure comparison from content storage - Replace content-aware extractCleanTemplate with structure-only extractStructureSignature for template comparison - Add extractTemplateForStorage to preserve actual content for meaningful template display - Update generateTemplateSignature to use purely structural comparison ignoring text content - Remove redundant extractClassSignature function (functionality moved to extractStructureSignature) - Resolves issue where identical DOM structures created multiple templates due to content differences - Knowledge cards and other collections now correctly deduplicate to single templates while preserving content for previews --- internal/engine/collection.go | 43 ++++------------------------- internal/engine/content.go | 51 ++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 56 deletions(-) diff --git a/internal/engine/collection.go b/internal/engine/collection.go index 6b08f7e..7ff52d2 100644 --- a/internal/engine/collection.go +++ b/internal/engine/collection.go @@ -3,7 +3,6 @@ package engine import ( "context" "fmt" - "sort" "strings" "time" @@ -86,7 +85,7 @@ func (e *ContentEngine) extractAndStoreTemplatesAndItems(collectionNode *html.No templateIndex := 0 for child := collectionNode.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.ElementNode { - templateHTML := e.extractCleanTemplate(child) + templateHTML := e.extractTemplateForStorage(child) templateSignature := e.generateTemplateSignature(child) // Check if we've already seen this exact template structure + styling @@ -233,7 +232,7 @@ func (e *ContentEngine) processChildElementsAsContent(childElement *html.Node, s // Extract the content htmlContent := e.extractHTMLContent(n) - template := e.extractCleanTemplate(n) + template := e.extractTemplateForStorage(n) // Store content entry contentEntries = append(contentEntries, ContentEntry{ @@ -476,42 +475,10 @@ func (e *ContentEngine) cloneNode(node *html.Node) *html.Node { } // generateTemplateSignature creates a unique signature for template comparison -// This combines structural HTML + class-based styling differences +// This is purely structure + class based, completely ignoring content func (e *ContentEngine) generateTemplateSignature(element *html.Node) string { - // Get the clean template HTML (structure) - structuralHTML := e.extractCleanTemplate(element) - - // Extract class-based styling signature - stylingSignature := e.extractClassSignature(element) - - // Combine both for a unique signature - return fmt.Sprintf("%s|%s", structuralHTML, stylingSignature) -} - -// extractClassSignature recursively extracts and normalizes class attributes -func (e *ContentEngine) extractClassSignature(element *html.Node) string { - var signature strings.Builder - - e.walkNodes(element, func(n *html.Node) { - if n.Type == html.ElementNode { - // Get classes for this element - classes := GetClasses(n) - if len(classes) > 0 { - // Sort classes for consistent comparison - sortedClasses := make([]string, len(classes)) - copy(sortedClasses, classes) - sort.Strings(sortedClasses) - - // Add to signature: element[class1,class2,...] - signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ","))) - } else { - // Element with no classes - signature.WriteString(fmt.Sprintf("%s[];", n.Data)) - } - } - }) - - return signature.String() + // Get content-agnostic structure signature + return e.extractStructureSignature(element) } // generateTemplateNameFromSignature creates human-readable template names diff --git a/internal/engine/content.go b/internal/engine/content.go index 28454f7..6037029 100644 --- a/internal/engine/content.go +++ b/internal/engine/content.go @@ -2,7 +2,9 @@ package engine import ( "context" + "fmt" "slices" + "sort" "strings" "golang.org/x/net/html" @@ -62,31 +64,44 @@ func (e *ContentEngine) extractOriginalTemplate(node *html.Node) string { return buf.String() } -// extractCleanTemplate extracts a clean template without data-content-id attributes and with placeholder content. Used for collection template variants. -func (e *ContentEngine) extractCleanTemplate(node *html.Node) string { +// extractStructureSignature creates a content-agnostic signature for template comparison +// This only considers DOM structure and class attributes, completely ignoring text content +func (e *ContentEngine) extractStructureSignature(node *html.Node) string { + var signature strings.Builder + + e.walkNodes(node, func(n *html.Node) { + if n.Type == html.ElementNode { + // Get classes for this element + classes := GetClasses(n) + if len(classes) > 0 { + // Sort classes for consistent comparison + sortedClasses := make([]string, len(classes)) + copy(sortedClasses, classes) + sort.Strings(sortedClasses) + + // Add to signature: element[class1,class2,...] + signature.WriteString(fmt.Sprintf("%s[%s];", n.Data, strings.Join(sortedClasses, ","))) + } else { + // Element with no classes + signature.WriteString(fmt.Sprintf("%s[];", n.Data)) + } + } + // Completely ignore text nodes and their content + }) + + return signature.String() +} + +// extractTemplateForStorage extracts template HTML while preserving content but removing data-content-id attributes +func (e *ContentEngine) extractTemplateForStorage(node *html.Node) string { // Clone the node to avoid modifying the original clonedNode := e.cloneNode(node) - // Remove all data-content-id attributes and replace content with placeholders + // Remove all data-content-id attributes but preserve all content e.walkNodes(clonedNode, func(n *html.Node) { if n.Type == html.ElementNode { // Remove data-content-id attribute e.removeAttribute(n, "data-content-id") - - // If this is an .insertr element, replace content with placeholder - if e.hasClass(n, "insertr") { - placeholderText := e.getPlaceholderForElement(n.Data) - // Clear existing children and add placeholder text - for child := n.FirstChild; child != nil; { - next := child.NextSibling - n.RemoveChild(child) - child = next - } - n.AppendChild(&html.Node{ - Type: html.TextNode, - Data: placeholderText, - }) - } } })