feat: implement structural IDs with database-first existence checking

- Remove content preview from ID generation for stable structural IDs
- Implement database-first approach to check content existence before creation
- Add enhanced DOM path, semantic context, and precise sibling indexing
- Replace HTML attribute checking with reliable database lookups
- Add collision handling with increment counters for similar elements

Fixes:
- UNIQUE constraint errors eliminated (multiple enhancement runs now work)
- Structural stability (the same element keeps the same ID regardless of content changes)
- Database-driven workflow (single source of truth for content existence)
- Enhanced collision resistance with detailed structural differentiation

Results: No more 'Failed to store content' errors, stable enhance workflow.
This commit is contained in:
2025-09-20 21:39:40 +02:00
parent 369d516381
commit b5225c1388
2 changed files with 158 additions and 33 deletions

View File

@@ -60,20 +60,12 @@ func (e *ContentEngine) ProcessContent(input ContentInput) (*ContentResult, erro
processedElements := make([]ProcessedElement, len(elements))
for i, elem := range elements {
// Check if element already has a data-content-id
existingID := e.getAttribute(elem.Node, "data-content-id")
var id string
var wasGenerated bool
// Generate structural ID (always deterministic)
id := e.idGenerator.Generate(elem.Node, input.FilePath)
if existingID != "" {
// Use existing ID from enhanced element
id = existingID
wasGenerated = false
} else {
// Generate new ID for unprocessed element
id = e.idGenerator.Generate(elem.Node, input.FilePath)
wasGenerated = true
}
// Database-first approach: Check if content already exists
existingContent, err := e.client.GetContent(input.SiteID, id)
contentExists := (err == nil && existingContent != nil)
generatedIDs[fmt.Sprintf("element_%d", i)] = id
@@ -81,7 +73,7 @@ func (e *ContentEngine) ProcessContent(input ContentInput) (*ContentResult, erro
Node: elem.Node,
ID: id,
Type: elem.Type,
Generated: wasGenerated,
Generated: !contentExists, // Mark as generated only if new to database
Tag: elem.Node.Data,
Classes: GetClasses(elem.Node),
}
@@ -89,8 +81,8 @@ func (e *ContentEngine) ProcessContent(input ContentInput) (*ContentResult, erro
// Add/update content attributes to the node
e.addContentAttributes(elem.Node, id, elem.Type)
// Store content and template for newly discovered elements (first-pass)
if wasGenerated && (input.Mode == Enhancement || input.Mode == ContentInjection) {
// Store content only for truly new elements (database-first check)
if !contentExists && (input.Mode == Enhancement || input.Mode == ContentInjection) {
// Extract content and template from the unprocessed element
htmlContent := e.extractHTMLContent(elem.Node)
originalTemplate := e.extractOriginalTemplate(elem.Node)
@@ -100,6 +92,8 @@ func (e *ContentEngine) ProcessContent(input ContentInput) (*ContentResult, erro
if err != nil {
// Log error but don't fail the enhancement - content just won't be stored
fmt.Printf("⚠️ Failed to store content for %s: %v\n", id, err)
} else {
fmt.Printf("✅ Created new content: %s (%s)\n", id, elem.Type)
}
}
}

View File

@@ -120,26 +120,21 @@ func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int)
// createDeterministicSignature creates a deterministic signature for element identification
func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath string) string {
// Build enhanced signature with 6 components for maximum differentiation
// Build structural signature for stable IDs across content changes
tag := node.Data
domPath := g.getSimpleDOMPath(node)
classes := strings.Join(GetClasses(node), " ")
contentPreview := g.getContentPreview(node)
siblingIndex := g.getSiblingIndex(node)
domPath := g.getDetailedDOMPath(node)
allClasses := strings.Join(GetClasses(node), " ")
semanticContext := g.getSemanticContext(node)
preciseIndex := g.getPreciseSiblingIndex(node)
// Normalize content preview to first 20 chars
if len(contentPreview) > 20 {
contentPreview = contentPreview[:20]
}
// Create comprehensive deterministic signature
// Create purely structural deterministic signature
signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
filePath, // File context for uniqueness across files
domPath, // Structural position in DOM
domPath, // Detailed structural position in DOM
tag, // Element type
classes, // CSS classes for style differentiation
contentPreview, // Content for similar-structure differentiation
siblingIndex, // Position among similar siblings
allClasses, // All CSS classes for style differentiation
semanticContext, // Semantic context (header/main/footer/nav)
preciseIndex, // Precise position among exact siblings
)
// Create deterministic hash suffix (6 chars)
@@ -179,6 +174,142 @@ func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
return strings.Join(pathParts, ">")
}
// getDetailedDOMPath renders the chain of elements ending at node as a
// ">"-separated path, outermost ancestor first. Each segment is the element's
// tag followed by ".class" for every class except "insertr" and the empty
// string. At most five levels (node plus four ancestors) are included, and the
// walk stops early at the first nil or non-element node.
func (g *IDGenerator) getDetailedDOMPath(node *html.Node) string {
	const maxLevels = 5

	// Fill the buffer from the back so the segments come out root-first
	// without having to prepend to a slice on every step.
	var segments [maxLevels]string
	first := maxLevels

	for el := node; el != nil && el.Type == html.ElementNode && first > 0; el = el.Parent {
		var segment strings.Builder
		segment.WriteString(el.Data)

		for _, name := range GetClasses(el) {
			// "insertr" and empty entries are excluded from the path.
			if name == "" || name == "insertr" {
				continue
			}
			segment.WriteString(".")
			segment.WriteString(name)
		}

		first--
		segments[first] = segment.String()
	}

	return strings.Join(segments[first:], ">")
}
// getSemanticContext reports which semantic region encloses node by walking
// its element ancestors from the nearest outward (node itself is not
// inspected). For each ancestor the lower-cased tag is checked first: header,
// main, footer, nav and aside are returned as-is. Otherwise each class is
// lower-cased and matched by substring, in this order: "header", "footer",
// "nav", then "sidebar" or "aside" (both reported as "aside"). If no ancestor
// matches, "content" is returned.
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	for ancestor := node.Parent; ancestor != nil && ancestor.Type == html.ElementNode; ancestor = ancestor.Parent {
		// A semantic tag on the ancestor wins over any of its classes.
		switch name := strings.ToLower(ancestor.Data); name {
		case "header", "main", "footer", "nav", "aside":
			return name
		}

		// Fall back to class-name hints; the first class that matches decides.
		for _, raw := range GetClasses(ancestor) {
			hint := strings.ToLower(raw)
			switch {
			case strings.Contains(hint, "header"):
				return "header"
			case strings.Contains(hint, "footer"):
				return "footer"
			case strings.Contains(hint, "nav"):
				return "nav"
			case strings.Contains(hint, "sidebar"), strings.Contains(hint, "aside"):
				return "aside"
			}
		}
	}

	return "content"
}
// getPreciseSiblingIndex returns the zero-based position of node among the
// element children of its parent that share node's tag and carry exactly the
// same set of classes (compared after sorting, so class order is irrelevant).
// A node without a parent yields 0. If node is never encountered while
// scanning the parent's children, the number of matching siblings is returned.
func (g *IDGenerator) getPreciseSiblingIndex(node *html.Node) int {
	parent := node.Parent
	if parent == nil {
		return 0
	}

	// ordered returns an ascending copy of names, leaving the input untouched.
	ordered := func(names []string) []string {
		out := make([]string, len(names))
		copy(out, names)
		for i := 1; i < len(out); i++ {
			for j := i; j > 0 && out[j-1] > out[j]; j-- {
				out[j-1], out[j] = out[j], out[j-1]
			}
		}
		return out
	}

	wanted := ordered(GetClasses(node))
	position := 0

	for peer := parent.FirstChild; peer != nil; peer = peer.NextSibling {
		// Only element siblings with the same tag can count.
		if peer.Type != html.ElementNode || peer.Data != node.Data {
			continue
		}
		// ...and only when their class sets are identical.
		if !g.classSlicesEqual(wanted, ordered(GetClasses(peer))) {
			continue
		}
		if peer == node {
			return position
		}
		position++
	}

	return position
}
// classSlicesEqual reports whether a and b have the same length and hold equal
// strings at every index. The comparison is positional, so callers that want
// order-insensitive equality must sort both slices first.
func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
	differs := len(a) != len(b)
	// Scan until the first mismatch; skipped entirely when the lengths differ.
	for i := 0; !differs && i < len(a); i++ {
		differs = a[i] != b[i]
	}
	return !differs
}
// getContentPreview extracts first 50 characters of text content for uniqueness
func (g *IDGenerator) getContentPreview(node *html.Node) string {
var text strings.Builder