package parser

import (
	"crypto/sha1"
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// idDashRunPattern matches runs of one or more consecutive hyphens. It is
// compiled once at package scope so createBaseID does not recompile it on
// every call.
var idDashRunPattern = regexp.MustCompile(`-+`)

// idHashPrefix returns the first six hexadecimal characters of the SHA-1
// digest of s.
func idHashPrefix(s string) string {
	return fmt.Sprintf("%x", sha1.Sum([]byte(s)))[:6]
}

// IDGenerator generates unique content IDs for elements.
//
// It remembers every ID it has handed out so that later calls never return
// the same ID twice. No synchronization is performed in this type, so an
// instance shared between goroutines must be guarded by the caller.
type IDGenerator struct {
	// usedIDs is the set of IDs already returned by this generator.
	usedIDs map[string]bool
}

// NewIDGenerator creates a new ID generator with an empty set of used IDs.
func NewIDGenerator() *IDGenerator {
	return &IDGenerator{
		usedIDs: make(map[string]bool),
	}
}

// Generate creates a content ID for an HTML element.
//
// The ID has the form "[context-]purpose-hash", where context comes from the
// element's ancestors, purpose from its classes or tag, and hash from its
// text content. If that ID was already issued by this generator, a suffix is
// appended to make it unique.
func (g *IDGenerator) Generate(node *html.Node) string {
	context := g.getSemanticContext(node)
	purpose := g.getPurpose(node)
	contentHash := g.getContentHash(node)

	baseID := g.createBaseID(context, purpose, contentHash)
	return g.ensureUnique(baseID)
}

// getSemanticContext determines the semantic context from parent elements.
//
// It walks up the ancestor chain while the ancestors are element nodes. At
// each level a known section class wins over the ancestor's tag name. It
// returns "content" when no ancestor identifies a section.
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	// Common class names that mark a semantic section of the page.
	sectionClasses := []string{"hero", "services", "nav", "navbar", "footer", "about", "contact", "testimonial"}

	for parent := node.Parent; parent != nil && parent.Type == html.ElementNode; parent = parent.Parent {
		// GetClasses and ContainsClass are defined elsewhere in the package;
		// presumably they return the class list and test membership in it.
		classes := GetClasses(parent)
		for _, class := range sectionClasses {
			if ContainsClass(classes, class) {
				return class
			}
		}

		// Fall back to semantic HTML container elements.
		switch parent.Data {
		case "nav", "header", "footer", "main", "aside":
			return parent.Data
		}
	}

	return "content"
}

// getPurpose determines the purpose/role of the element.
//
// Classes are inspected first, in order; the first class containing a known
// keyword decides the purpose. If no class matches, the purpose is inferred
// from the tag name, defaulting to "content".
func (g *IDGenerator) getPurpose(node *html.Node) string {
	tag := strings.ToLower(node.Data)
	classes := GetClasses(node)

	for _, class := range classes {
		switch {
		// "subtitle" must be tested before "title": every class containing
		// "subtitle" also contains "title", so the reverse order would make
		// this case unreachable.
		case strings.Contains(class, "subtitle"):
			return "subtitle"
		case strings.Contains(class, "title"):
			return "title"
		case strings.Contains(class, "headline"):
			return "headline"
		case strings.Contains(class, "description"):
			return "description"
		case strings.Contains(class, "cta"):
			return "cta"
		case strings.Contains(class, "button"):
			return "button"
		case strings.Contains(class, "logo"):
			return "logo"
		case strings.Contains(class, "lead"):
			return "lead"
		}
	}

	// Infer purpose from the HTML tag.
	switch tag {
	case "h1":
		return "title"
	case "h2":
		return "subtitle"
	case "h3", "h4", "h5", "h6":
		return "heading"
	case "p":
		return "text"
	case "a":
		return "link"
	case "button":
		return "button"
	default:
		return "content"
	}
}

// getContentHash creates a short hash of the content for ID generation.
//
// It returns the first six hex characters of the SHA-1 digest of the text
// produced by extractTextContent (defined elsewhere in the package).
func (g *IDGenerator) getContentHash(node *html.Node) string {
	return idHashPrefix(extractTextContent(node))
}

// createBaseID creates the base ID from its components.
//
// The context is omitted when it is the default "content". The parts are
// joined with "-", runs of hyphens are collapsed to one, and leading and
// trailing hyphens are trimmed. If nothing remains, "content-<hash>" is used.
func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
	parts := make([]string, 0, 3)

	// Add context only if it is meaningful.
	if context != "content" {
		parts = append(parts, context)
	}
	// Purpose and content hash are always included.
	parts = append(parts, purpose, contentHash)

	baseID := strings.Join(parts, "-")

	// Clean up the ID: empty components would otherwise leave doubled or
	// dangling hyphens.
	baseID = idDashRunPattern.ReplaceAllString(baseID, "-")
	baseID = strings.Trim(baseID, "-")

	// Ensure it is not empty.
	if baseID == "" {
		baseID = fmt.Sprintf("content-%s", contentHash)
	}

	return baseID
}

// ensureUnique makes sure the ID is unique by adding a suffix if needed.
//
// The first time baseID is seen it is returned unchanged. On the first
// collision the suffix is the short hash of baseID. On further collisions the
// hash input is salted with an increasing counter until an unused ID is
// found, so repeated identical elements never share an ID.
func (g *IDGenerator) ensureUnique(baseID string) string {
	// Make the zero-value IDGenerator usable instead of panicking on a
	// write to a nil map.
	if g.usedIDs == nil {
		g.usedIDs = make(map[string]bool)
	}

	if !g.usedIDs[baseID] {
		g.usedIDs[baseID] = true
		return baseID
	}

	// First collision keeps the plain hash-of-baseID suffix.
	candidate := fmt.Sprintf("%s-%s", baseID, idHashPrefix(baseID))
	for n := 2; g.usedIDs[candidate]; n++ {
		// The suffix alone is deterministic, so a third occurrence would
		// collide again; salt the hash input with the attempt number.
		salted := fmt.Sprintf("%s#%d", baseID, n)
		candidate = fmt.Sprintf("%s-%s", baseID, idHashPrefix(salted))
	}

	g.usedIDs[candidate] = true
	return candidate
}