// insertr/internal/parser/parser.go

package parser

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"golang.org/x/net/html"
)

// Parser handles HTML parsing and element detection.
//
// It scans parsed HTML documents for elements carrying the "insertr" class
// and turns them into Element values.
type Parser struct {
	// idGenerator supplies a content ID for elements that have neither an
	// "id" nor a "data-content-id" attribute.
	idGenerator *IDGenerator
}

// New returns a Parser wired up with a fresh IDGenerator obtained from
// NewIDGenerator.
func New() *Parser {
	p := new(Parser)
	p.idGenerator = NewIDGenerator()
	return p
}

// ParseDirectory walks dir recursively and parses every non-directory entry
// whose path ends in ".html" (compared case-insensitively).
//
// A file that fails to open or parse does not stop the walk: the failure is
// appended to the result's Warnings and the file is not counted in
// Stats.FilesProcessed. An error reported by the directory walk itself aborts
// the operation and is returned wrapped, together with a nil result.
// Statistics are computed once, after the walk has finished.
func (p *Parser) ParseDirectory(dir string) (*ParseResult, error) {
	result := new(ParseResult)
	result.Elements = []Element{}
	result.Warnings = []string{}
	result.Stats.TypeBreakdown = make(map[ContentType]int)

	// visit is invoked by WalkDir for every entry below dir.
	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			// Checked first: entry must not be touched before the error.
			return walkErr
		case entry.IsDir():
			return nil
		case !strings.HasSuffix(strings.ToLower(path), ".html"):
			// Only process HTML files.
			return nil
		}

		found, notes, parseErr := p.parseFile(path)
		if parseErr != nil {
			msg := fmt.Sprintf("Error parsing %s: %v", path, parseErr)
			result.Warnings = append(result.Warnings, msg)
			// Swallow the error so the remaining files are still processed.
			return nil
		}

		result.Stats.FilesProcessed++
		result.Elements = append(result.Elements, found...)
		result.Warnings = append(result.Warnings, notes...)
		return nil
	}

	if walkErr := filepath.WalkDir(dir, visit); walkErr != nil {
		return nil, fmt.Errorf("error walking directory: %w", walkErr)
	}

	p.calculateStats(result)
	return result, nil
}

// parseFile opens the file at filePath, parses it as HTML and collects every
// insertr element found in the resulting node tree.
//
// It returns the collected elements and the warnings raised while building
// them. If the file cannot be opened or parsed, both slices are nil and a
// wrapped error is returned.
func (p *Parser) parseFile(filePath string) ([]Element, []string, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, nil, fmt.Errorf("error opening file: %w", openErr)
	}
	defer f.Close()

	root, parseErr := html.Parse(f)
	if parseErr != nil {
		return nil, nil, fmt.Errorf("error parsing HTML: %w", parseErr)
	}

	var (
		found []Element
		notes []string
	)
	p.findInsertrElements(root, filePath, &found, &notes)
	return found, notes, nil
}

// findInsertrElements walks the tree rooted at node depth-first and appends an
// Element to *elements for every element node that carries the "insertr"
// class. Non-empty warnings produced while building an Element are appended
// to *warnings.
//
// When isContainer reports the matching node as a container, the node itself
// is not recorded: every node returned by findViableChildren is recorded in
// its place and the container's subtree is not descended into any further.
func (p *Parser) findInsertrElements(node *html.Node, filePath string, elements *[]Element, warnings *[]string) {
	// record builds the Element for n and stores it together with its warning,
	// if there is one.
	record := func(n *html.Node, classList []string) {
		el, note := p.createElement(n, filePath, classList)
		*elements = append(*elements, el)
		if note != "" {
			*warnings = append(*warnings, note)
		}
	}

	if node.Type == html.ElementNode {
		if classes := GetClasses(node); ContainsClass(classes, "insertr") {
			if isContainer(node) {
				for _, child := range findViableChildren(node) {
					record(child, GetClasses(child))
				}
				// The container's children are handled; do not recurse.
				return
			}
			record(node, classes)
		}
	}

	// Keep searching below this node.
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		p.findInsertrElements(c, filePath, elements, warnings)
	}
}

// createElement builds the Element describing node, which was found in the
// file at filePath and carries the given classes.
//
// The content ID comes from resolveContentID when the node already has one;
// otherwise it is produced by the ID generator and the element is flagged as
// Generated. The second return value is a warning message when the extracted
// text content is empty, and "" otherwise.
func (p *Parser) createElement(node *html.Node, filePath string, classes []string) (Element, string) {
	// Prefer an ID already present in the markup; fall back to a generated one.
	id, existing := p.resolveContentID(node)
	if !existing {
		id = p.idGenerator.Generate(node, filePath)
	}

	kind := p.detectContentType(node, classes)
	content := extractTextContent(node)

	element := Element{
		FilePath:  filePath,
		Node:      node,
		ContentID: id,
		Type:      kind,
		Tag:       strings.ToLower(node.Data),
		Classes:   classes,
		Content:   content,
		HasID:     existing,
		Generated: !existing,
	}

	if content == "" {
		// An element without text is still returned, but flagged.
		return element, fmt.Sprintf("Element <%s> with id '%s' has no text content",
			element.Tag, element.ContentID)
	}
	return element, ""
}

// resolveContentID looks for a content ID already present on node.
//
// The "id" attribute takes precedence; "data-content-id" is consulted only
// when "id" is missing or empty. The boolean reports whether a non-empty ID
// was found; when it is false the returned string is "".
func (p *Parser) resolveContentID(node *html.Node) (string, bool) {
	id := getAttribute(node, "id")
	if id == "" {
		// No usable HTML id: fall back to the dedicated data attribute.
		id = getAttribute(node, "data-content-id")
	}
	return id, id != ""
}

// detectContentType chooses the ContentType for node.
//
// An explicit class wins, checked in this order: "insertr-markdown",
// "insertr-link", "insertr-text". Without one, the type is inferred from the
// lower-cased tag name: p, div, section and span are markdown, a and button
// are links, and everything else (headings included) is plain text.
func (p *Parser) detectContentType(node *html.Node, classes []string) ContentType {
	// Explicit type classes override any tag-based inference.
	switch {
	case ContainsClass(classes, "insertr-markdown"):
		return ContentMarkdown
	case ContainsClass(classes, "insertr-link"):
		return ContentLink
	case ContainsClass(classes, "insertr-text"):
		return ContentText
	}

	switch strings.ToLower(node.Data) {
	case "p", "div", "section", "span":
		// Block and inline containers default to markdown for rich content.
		return ContentMarkdown
	case "a", "button":
		return ContentLink
	default:
		// h1-h6 and all remaining tags are treated as plain text.
		return ContentText
	}
}

// calculateStats fills result.Stats from result.Elements.
//
// TotalElements is set to the number of elements. ExistingIDs, GeneratedIDs
// and the per-type counters in TypeBreakdown are incremented once per element,
// so they accumulate on top of whatever values they already hold.
//
// TypeBreakdown is allocated here when it is nil, so a ParseResult that was
// not pre-initialised does not cause a nil-map write panic.
func (p *Parser) calculateStats(result *ParseResult) {
	stats := &result.Stats
	stats.TotalElements = len(result.Elements)

	// Incrementing an entry of a nil map panics; make sure the map exists.
	if stats.TypeBreakdown == nil {
		stats.TypeBreakdown = make(map[ContentType]int)
	}

	// Iterate by index so each Element is not copied just to read two fields.
	for i := range result.Elements {
		el := &result.Elements[i]

		// Count existing vs generated IDs.
		if el.HasID {
			stats.ExistingIDs++
		} else {
			stats.GeneratedIDs++
		}

		// Count content types.
		stats.TypeBreakdown[el.Type]++
	}
}