package engine

import (
	"crypto/sha256"
	"fmt"
	"path/filepath"
	"sort"
	"strings"

	"golang.org/x/net/html"
)

// IDGenerator generates unique content IDs for elements using a lightweight
// hierarchical approach.
//
// Generate and getElementIndex write to the generator's maps without any
// synchronization, so callers sharing one generator across goroutines must
// provide their own locking.
type IDGenerator struct {
	usedIDs       map[string]bool // IDs already returned by Generate
	elementCounts map[string]int  // Track counts per file+type for indexing
}

// NewIDGenerator creates a new ID generator with empty tracking maps.
func NewIDGenerator() *IDGenerator {
	return &IDGenerator{
		usedIDs:       make(map[string]bool),
		elementCounts: make(map[string]int),
	}
}

// Generate creates a content ID for an HTML element.
//
// The ID has the form "<file>-<class or tag>-<6 hex chars>": a readable prefix
// built from the file name and the element's primary class (or its tag when it
// has none), followed by a hash of the element's structural position. If that
// ID was already returned by this generator, "-1", "-2", ... is appended until
// an unused ID is found. The returned ID is recorded as used.
func (g *IDGenerator) Generate(node *html.Node, filePath string) string {
	// Readable prefix: file context plus element identity.
	fileName := g.getFileName(filePath)
	tag := strings.ToLower(node.Data)
	primaryClass := g.getPrimaryClass(node)
	prefix := g.buildDeterministicPrefix(fileName, tag, primaryClass)

	// Deterministic suffix derived from the element's structural signature.
	signature := g.createDeterministicSignature(node, filePath)
	baseID := fmt.Sprintf("%s-%s", prefix, signature)

	// Ensure uniqueness within this generator's lifetime.
	finalID := baseID
	for counter := 1; g.usedIDs[finalID]; counter++ {
		finalID = fmt.Sprintf("%s-%d", baseID, counter)
	}

	g.usedIDs[finalID] = true
	return finalID
}

// getFileName returns the last path element of filePath with its final
// extension removed, for use as an ID prefix.
func (g *IDGenerator) getFileName(filePath string) string {
	base := filepath.Base(filePath)
	return strings.TrimSuffix(base, filepath.Ext(base))
}

// getPrimaryClass returns the first class reported by GetClasses that is
// neither empty nor "insertr", or "" when there is none.
func (g *IDGenerator) getPrimaryClass(node *html.Node) string {
	for _, class := range GetClasses(node) {
		if class != "insertr" && class != "" {
			return class
		}
	}
	return ""
}

// getElementKey creates a key for tracking element counts: the file name
// joined with the primary class, or with the tag when there is no class.
func (g *IDGenerator) getElementKey(fileName, tag, primaryClass string) string {
	if primaryClass != "" {
		return fmt.Sprintf("%s-%s", fileName, primaryClass)
	}
	return fmt.Sprintf("%s-%s", fileName, tag)
}

// getElementIndex increments the counter stored under elementKey and returns
// the new value, so the first call for a given key returns 1.
func (g *IDGenerator) getElementIndex(elementKey string) int {
	g.elementCounts[elementKey]++
	return g.elementCounts[elementKey]
}

// buildDeterministicPrefix creates the human-readable ID prefix
// "<fileName>-<primaryClass>", falling back to "<fileName>-<tag>" when
// primaryClass is empty. No runtime index is included; uniqueness is left to
// the hash suffix.
func (g *IDGenerator) buildDeterministicPrefix(fileName, tag, primaryClass string) string {
	identity := primaryClass
	if identity == "" {
		identity = tag
	}
	return strings.Join([]string{fileName, identity}, "-")
}

// buildPrefix creates a human-readable prefix for the ID (legacy method).
//
// It produces the same prefix as buildDeterministicPrefix and appends
// "-<index>" only when index is greater than 1, i.e. for every element of a
// kind except the first.
func (g *IDGenerator) buildPrefix(fileName, tag, primaryClass string, index int) string {
	prefix := g.buildDeterministicPrefix(fileName, tag, primaryClass)
	if index > 1 {
		return fmt.Sprintf("%s-%d", prefix, index)
	}
	return prefix
}

// createDeterministicSignature returns a 6-character lowercase hex string
// identifying the element by structure alone.
//
// The hashed input combines the file path, the detailed DOM path, the tag, all
// classes, the semantic context, and the precise sibling index. Text content
// is not part of the input, so editing an element's text does not change its
// signature.
func (g *IDGenerator) createDeterministicSignature(node *html.Node, filePath string) string {
	signature := fmt.Sprintf("%s|%s|%s|%s|%s|%d",
		filePath,                              // File context for uniqueness across files
		g.getDetailedDOMPath(node),            // Detailed structural position in DOM
		node.Data,                             // Element type
		strings.Join(GetClasses(node), " "),   // All CSS classes for style differentiation
		g.getSemanticContext(node),            // Semantic context (header/main/footer/nav)
		g.getPreciseSiblingIndex(node),        // Precise position among exact siblings
	)

	// Three bytes of the digest render as the six hex characters of the suffix.
	hash := sha256.Sum256([]byte(signature))
	return fmt.Sprintf("%x", hash[:3])
}

// createSignature always returns the empty string.
//
// Deprecated: ID generation uses createDeterministicSignature; this method is
// kept only for compatibility.
func (g *IDGenerator) createSignature(node *html.Node, filePath string) string {
	return ""
}

// getSimpleDOMPath creates a simple DOM path for uniqueness, covering the node
// and up to two element ancestors (max 3 levels), outermost first.
//
// Each level is rendered as "tag" or "tag.class" using the level's primary
// class, and levels are joined with ">".
func (g *IDGenerator) getSimpleDOMPath(node *html.Node) string {
	const maxDepth = 3

	var pathParts []string
	current := node
	for depth := 0; current != nil && current.Type == html.ElementNode && depth < maxDepth; depth++ {
		part := current.Data
		if class := g.getPrimaryClass(current); class != "" {
			part += "." + class
		}

		// Prepend so the path reads from ancestor to node.
		pathParts = append([]string{part}, pathParts...)
		current = current.Parent
	}

	return strings.Join(pathParts, ">")
}

// getDetailedDOMPath creates a more detailed DOM path for enhanced structural
// differentiation, covering the node and up to four element ancestors (max 5
// levels), outermost first.
//
// Each level is rendered as the tag followed by ".class" for every class that
// is neither empty nor "insertr", and levels are joined with ">".
func (g *IDGenerator) getDetailedDOMPath(node *html.Node) string {
	const maxDepth = 5

	var pathParts []string
	current := node
	for depth := 0; current != nil && current.Type == html.ElementNode && depth < maxDepth; depth++ {
		var meaningfulClasses []string
		for _, class := range GetClasses(current) {
			if class != "insertr" && class != "" {
				meaningfulClasses = append(meaningfulClasses, class)
			}
		}

		part := current.Data
		if len(meaningfulClasses) > 0 {
			part += "." + strings.Join(meaningfulClasses, ".")
		}

		// Prepend so the path reads from ancestor to node.
		pathParts = append([]string{part}, pathParts...)
		current = current.Parent
	}

	return strings.Join(pathParts, ">")
}

// getSemanticContext identifies the semantic container of node by walking its
// element ancestors from nearest to farthest.
//
// An ancestor matches when its tag is header, main, footer, nav, or aside, or
// when one of its classes contains (case-insensitively) "header", "footer",
// "nav", or "sidebar"/"aside". The first match wins; "content" is returned
// when no ancestor matches.
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	for current := node.Parent; current != nil && current.Type == html.ElementNode; current = current.Parent {
		// Direct semantic tags.
		switch tag := strings.ToLower(current.Data); tag {
		case "header", "main", "footer", "nav", "aside":
			return tag
		}

		// Semantic classes, checked by substring.
		for _, class := range GetClasses(current) {
			lower := strings.ToLower(class)
			switch {
			case strings.Contains(lower, "header"):
				return "header"
			case strings.Contains(lower, "footer"):
				return "footer"
			case strings.Contains(lower, "nav"):
				return "nav"
			case strings.Contains(lower, "sidebar"), strings.Contains(lower, "aside"):
				return "aside"
			}
		}
	}

	return "content"
}

// getPreciseSiblingIndex returns the zero-based position of node among the
// siblings that share its tag and exactly the same set of classes (compared
// after sorting, so class order does not matter).
//
// It returns 0 when node has no parent.
func (g *IDGenerator) getPreciseSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	// Sort a copy so the slice returned by GetClasses is left untouched.
	wantClasses := append([]string(nil), GetClasses(node)...)
	sort.Strings(wantClasses)

	index := 0
	for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type != html.ElementNode || sibling.Data != node.Data {
			continue
		}

		siblingClasses := append([]string(nil), GetClasses(sibling)...)
		sort.Strings(siblingClasses)
		if !g.classSlicesEqual(wantClasses, siblingClasses) {
			continue
		}

		if sibling == node {
			return index
		}
		index++
	}

	return index
}

// classSlicesEqual reports whether a and b have the same length and the same
// element at every position. Callers pass sorted slices.
func (g *IDGenerator) classSlicesEqual(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// getContentPreview extracts the first 50 characters of the node's text
// content, with every run of whitespace collapsed to a single space.
func (g *IDGenerator) getContentPreview(node *html.Node) string {
	const maxPreviewRunes = 50

	var text strings.Builder
	g.extractTextContent(node, &text)
	content := strings.TrimSpace(text.String())

	// Truncate by runes, not bytes, so a multi-byte character is never split.
	if runes := []rune(content); len(runes) > maxPreviewRunes {
		content = string(runes[:maxPreviewRunes])
	}

	// strings.Fields splits on any whitespace run, so a single pass normalizes
	// newlines, tabs and repeated spaces and is guaranteed to terminate.
	return strings.Join(strings.Fields(content), " ")
}

// extractTextContent recursively appends the data of every text node in the
// subtree rooted at node to text, in document order.
func (g *IDGenerator) extractTextContent(node *html.Node, text *strings.Builder) {
	if node.Type == html.TextNode {
		text.WriteString(node.Data)
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		g.extractTextContent(child, text)
	}
}

// getSiblingIndex returns the zero-based position of node among comparable
// siblings with the same tag.
//
// When node carries the "insertr" class, every same-tag sibling that also
// carries "insertr" is comparable. Otherwise only same-tag siblings whose
// class list matches node's exactly (see classesMatch) are comparable. It
// returns 0 when node has no parent.
func (g *IDGenerator) getSiblingIndex(node *html.Node) int {
	if node.Parent == nil {
		return 0
	}

	hasInsertr := func(classes []string) bool {
		for _, class := range classes {
			if class == "insertr" {
				return true
			}
		}
		return false
	}

	classes := GetClasses(node)
	nodeIsInsertr := hasInsertr(classes)

	index := 0
	for sibling := node.Parent.FirstChild; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type != html.ElementNode || sibling.Data != node.Data {
			continue
		}

		siblingClasses := GetClasses(sibling)
		var comparable bool
		if nodeIsInsertr {
			// For insertr elements, match by tag + insertr class.
			comparable = hasInsertr(siblingClasses)
		} else {
			// For non-insertr elements, match by exact class list.
			comparable = g.classesMatch(classes, siblingClasses)
		}
		if !comparable {
			continue
		}

		if sibling == node {
			return index
		}
		index++
	}

	return index
}

// classesMatch reports whether two class lists contain the same classes in
// the same order. The comparison is order-sensitive: the same classes in a
// different order do not match.
func (g *IDGenerator) classesMatch(classes1, classes2 []string) bool {
	if len(classes1) != len(classes2) {
		return false
	}
	for i, class := range classes1 {
		if class != classes2[i] {
			return false
		}
	}
	return true
}