// insertr/internal/parser/utils.go

package parser

import (
	"strings"

	"golang.org/x/net/html"
)

// GetClasses returns the CSS class tokens listed in node's "class" attribute.
//
// The attribute value is split on ASCII whitespace only (space, tab, line
// feed, form feed, carriage return), which is how HTML defines a set of
// space-separated tokens. Splitting on every Unicode space character, as
// strings.Fields does, would wrongly break up a class name that contains,
// for example, a no-break space (U+00A0).
//
// A missing or empty class attribute yields an empty, non-nil slice.
func GetClasses(node *html.Node) []string {
	classAttr := getAttribute(node, "class")
	if classAttr == "" {
		return []string{}
	}

	return strings.FieldsFunc(classAttr, func(r rune) bool {
		switch r {
		case ' ', '\t', '\n', '\f', '\r':
			return true
		}
		return false
	})
}

// ContainsClass reports whether target appears in classes.
// The comparison is an exact, case-sensitive string match; a nil or empty
// list never matches.
func ContainsClass(classes []string, target string) bool {
	for i := 0; i < len(classes); i++ {
		if classes[i] != target {
			continue
		}
		return true
	}
	return false
}

// getAttribute returns the value of the first attribute on node whose Key
// equals key. It returns "" when no such attribute exists, so a missing
// attribute and one with an empty value look the same to the caller.
func getAttribute(node *html.Node, key string) string {
	attrs := node.Attr
	for i := range attrs {
		if attrs[i].Key != key {
			continue
		}
		return attrs[i].Val
	}
	return ""
}

// extractTextContent returns the text gathered from node and its descendants
// by extractTextRecursive, with leading and trailing whitespace trimmed.
func extractTextContent(node *html.Node) string {
	collected := new(strings.Builder)
	extractTextRecursive(node, collected)
	raw := collected.String()
	return strings.TrimSpace(raw)
}

// extractTextRecursive appends the data of every text node in the subtree
// rooted at node to text, visiting nodes depth-first in document order.
// Child <script> and <style> elements are skipped together with everything
// beneath them; node itself is not subject to that filter.
func extractTextRecursive(node *html.Node, text *strings.Builder) {
	if node.Type == html.TextNode {
		text.WriteString(node.Data)
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode {
			switch c.Data {
			case "script", "style":
				// Not human-readable content: skip the whole subtree.
				continue
			}
		}
		extractTextRecursive(c, text)
	}
}

// hasOnlyTextContent reports whether node is an element none of whose direct
// children is itself an element. Text, comments and other non-element
// children are allowed, and an element with no children at all qualifies.
// Non-element nodes always yield false.
func hasOnlyTextContent(node *html.Node) bool {
	if node.Type != html.ElementNode {
		return false
	}

	child := node.FirstChild
	for child != nil {
		if child.Type == html.ElementNode {
			// A nested element means the content is not text-only.
			return false
		}
		child = child.NextSibling
	}
	return true
}

// isContainer reports whether node is an element whose tag is one of the
// structural container tags: div, section, article, header, footer, main,
// aside or nav. Non-element nodes always yield false.
func isContainer(node *html.Node) bool {
	if node.Type != html.ElementNode {
		return false
	}

	switch node.Data {
	case "div", "section", "article", "header",
		"footer", "main", "aside", "nav":
		return true
	default:
		return false
	}
}

// findViableChildren returns the direct children of node that are viable for
// editing: element nodes that are not self-closing and that contain no
// nested elements. All non-element children (text, including whitespace-only
// text, comments, ...) are ignored. The result is nil when nothing qualifies.
func findViableChildren(node *html.Node) []*html.Node {
	var viable []*html.Node

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		// Self-closing elements are skipped for now; the rest must be text-only.
		if c.Type == html.ElementNode && !isSelfClosing(c) && hasOnlyTextContent(c) {
			viable = append(viable, c)
		}
	}

	return viable
}

// isSelfClosing reports whether node is an element with one of the
// self-closing tags: img, input, br, hr, meta, link, area, base, col, embed,
// source, track or wbr. Non-element nodes always yield false.
func isSelfClosing(node *html.Node) bool {
	if node.Type != html.ElementNode {
		return false
	}

	switch node.Data {
	case "img", "input", "br", "hr", "meta", "link", "area",
		"base", "col", "embed", "source", "track", "wbr":
		return true
	default:
		return false
	}
}

// FindElementInDocument searches the tree rooted at doc for the node that
// corresponds to element and returns it, or nil when no node matches.
// The matching itself is delegated to findElementWithContext.
func FindElementInDocument(doc *html.Node, element Element) *html.Node {
	match := findElementWithContext(doc, element)
	return match
}

// findElementWithContext walks the subtree rooted at node depth-first, in
// document order, and returns the first node that matches target.
//
// A node matches when it is an element whose tag equals target.Tag, whose
// class list contains "insertr", and whose whitespace-trimmed text content
// equals the whitespace-trimmed target.Content. It returns nil when the
// subtree holds no such node.
func findElementWithContext(node *html.Node, target Element) *html.Node {
	isCandidate := node.Type == html.ElementNode &&
		node.Data == target.Tag &&
		ContainsClass(GetClasses(node), "insertr")

	if isCandidate {
		// Comparing text content tells apart same-tag insertr elements.
		have := strings.TrimSpace(extractTextContent(node))
		want := strings.TrimSpace(target.Content)
		if have == want {
			return node
		}
	}

	// No match here: descend into the children, left to right.
	child := node.FirstChild
	for child != nil {
		if found := findElementWithContext(child, target); found != nil {
			return found
		}
		child = child.NextSibling
	}

	return nil
}

// GetAttribute is the exported counterpart of getAttribute: it returns the
// value of the attribute named key on node, or "" when node has no such
// attribute.
func GetAttribute(node *html.Node, key string) string {
	value := getAttribute(node, key)
	return value
}