// insertr/internal/parser/id_generator.go

package parser

import (
	"crypto/sha1"
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// IDGenerator generates unique content IDs for elements.
//
// It keeps a record of the IDs it has already issued so that a repeated
// base ID can be detected when the next ID is requested.
type IDGenerator struct {
	// usedIDs holds every ID issued so far; an ID is considered taken when
	// its entry is true. The map must be non-nil before it is written to.
	usedIDs map[string]bool
}

// NewIDGenerator returns a ready-to-use IDGenerator whose record of issued
// IDs starts out empty.
func NewIDGenerator() *IDGenerator {
	gen := new(IDGenerator)
	gen.usedIDs = map[string]bool{}
	return gen
}

// Generate builds a content ID for node.
//
// The ID is assembled from three pieces derived from the node (its semantic
// context, its purpose, and a short hash of its content) and is then passed
// through ensureUnique before being returned.
func (g *IDGenerator) Generate(node *html.Node) string {
	candidate := g.createBaseID(
		g.getSemanticContext(node),
		g.getPurpose(node),
		g.getContentHash(node),
	)
	return g.ensureUnique(candidate)
}

// getSemanticContext names the nearest enclosing section of node.
//
// It climbs from node's parent through consecutive element-node ancestors.
// At each ancestor it first looks for one of a fixed list of section class
// names (in list order) and returns the first one found; failing that, it
// returns the ancestor's tag name if it is nav, header, footer, main or
// aside. When the walk ends without a match the result is "content".
func (g *IDGenerator) getSemanticContext(node *html.Node) string {
	sectionClasses := []string{"hero", "services", "nav", "navbar", "footer", "about", "contact", "testimonial"}

	for ancestor := node.Parent; ancestor != nil && ancestor.Type == html.ElementNode; ancestor = ancestor.Parent {
		ancestorClasses := GetClasses(ancestor)

		// Class-based sections take priority over the ancestor's tag.
		for _, name := range sectionClasses {
			if ContainsClass(ancestorClasses, name) {
				return name
			}
		}

		// Fall back to semantic HTML container elements.
		switch tag := ancestor.Data; tag {
		case "nav", "header", "footer", "main", "aside":
			return tag
		}
	}

	return "content"
}

// getPurpose determines the purpose/role of the element.
//
// The element's classes are examined first, in the order GetClasses returns
// them: the first class that contains one of the known keywords decides the
// result. If no class matches, the purpose is inferred from the lower-cased
// tag name, and "content" is returned for any tag not listed below.
func (g *IDGenerator) getPurpose(node *html.Node) string {
	// Check for specific CSS classes that indicate purpose
	for _, class := range GetClasses(node) {
		switch {
		// "subtitle" must be tested before "title": every string that
		// contains "subtitle" also contains "title", so the reverse order
		// would make this case unreachable.
		case strings.Contains(class, "subtitle"):
			return "subtitle"
		case strings.Contains(class, "title"):
			return "title"
		case strings.Contains(class, "headline"):
			return "headline"
		case strings.Contains(class, "description"):
			return "description"
		case strings.Contains(class, "cta"):
			return "cta"
		case strings.Contains(class, "button"):
			return "button"
		case strings.Contains(class, "logo"):
			return "logo"
		case strings.Contains(class, "lead"):
			return "lead"
		}
	}

	// Infer purpose from HTML tag
	switch strings.ToLower(node.Data) {
	case "h1":
		return "title"
	case "h2":
		return "subtitle"
	case "h3", "h4", "h5", "h6":
		return "heading"
	case "p":
		return "text"
	case "a":
		return "link"
	case "button":
		return "button"
	default:
		return "content"
	}
}

// getContentHash returns a short fingerprint of node's text: the first six
// hexadecimal characters of the SHA-1 digest of whatever extractTextContent
// yields for the node.
func (g *IDGenerator) getContentHash(node *html.Node) string {
	digest := sha1.Sum([]byte(extractTextContent(node)))

	// Three digest bytes render as exactly six hex characters.
	return fmt.Sprintf("%x", digest[:3])
}

// baseIDDashRun matches a run of one or more hyphens. It is compiled once at
// package initialisation rather than on every createBaseID call.
var baseIDDashRun = regexp.MustCompile(`-+`)

// createBaseID creates the base ID from components.
//
// The parts are joined with "-" in the order context, purpose, contentHash.
// The context is left out when it is the generic value "content". Runs of
// hyphens (for example from an empty part) are collapsed to a single hyphen
// and leading/trailing hyphens are stripped. If nothing is left after that
// clean-up, "content-" followed by contentHash is returned instead.
func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
	parts := make([]string, 0, 3)

	// Add context if meaningful
	if context != "content" {
		parts = append(parts, context)
	}

	// Purpose always comes next; the content hash is always last.
	parts = append(parts, purpose, contentHash)

	// Clean up the ID
	baseID := baseIDDashRun.ReplaceAllString(strings.Join(parts, "-"), "-")
	baseID = strings.Trim(baseID, "-")

	// Ensure it's not empty
	if baseID == "" {
		baseID = fmt.Sprintf("content-%s", contentHash)
	}

	return baseID
}

// ensureUnique returns an ID derived from baseID that this generator has not
// issued before, and records it as used.
//
// The first request for a given baseID receives baseID itself. The first
// collision receives "<baseID>-<h>", where <h> is the first six hex
// characters of the SHA-1 digest of baseID. That suffix depends only on
// baseID, so on its own it would repeat for every later collision; further
// collisions therefore get an additional counter: "<baseID>-<h>-2",
// "<baseID>-<h>-3", and so on until an unused ID is found.
func (g *IDGenerator) ensureUnique(baseID string) string {
	// Writing to a nil map panics, so tolerate a zero-value generator.
	if g.usedIDs == nil {
		g.usedIDs = make(map[string]bool)
	}

	if !g.usedIDs[baseID] {
		g.usedIDs[baseID] = true
		return baseID
	}

	// Three digest bytes render as exactly six hex characters.
	digest := sha1.Sum([]byte(baseID))
	suffixed := fmt.Sprintf("%s-%x", baseID, digest[:3])

	candidate := suffixed
	for n := 2; g.usedIDs[candidate]; n++ {
		candidate = fmt.Sprintf("%s-%d", suffixed, n)
	}

	g.usedIDs[candidate] = true
	return candidate
}