package parser

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"golang.org/x/net/html"
)

// Parser handles HTML parsing and element detection.
type Parser struct {
	// idGenerator produces content IDs for elements that carry none.
	idGenerator *IDGenerator
}

// New creates a new Parser instance with a fresh ID generator.
func New() *Parser {
	return &Parser{
		idGenerator: NewIDGenerator(),
	}
}

// ParseDirectory walks dir recursively and parses every non-directory entry
// whose path ends in ".html" (compared case-insensitively).
//
// A file that cannot be opened or parsed does not abort the walk: the failure
// is appended to the result's Warnings and the file is not counted in
// Stats.FilesProcessed. An error reported by the directory walk itself is
// returned wrapped, together with a nil result.
func (p *Parser) ParseDirectory(dir string) (*ParseResult, error) {
	// Elements and Warnings start as empty, non-nil slices, as in the
	// original construction of the result.
	result := &ParseResult{
		Elements: []Element{},
		Warnings: []string{},
		Stats: ParseStats{
			TypeBreakdown: make(map[ContentType]int),
		},
	}

	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		// Check err before touching d: WalkDir may pass a nil entry when it
		// reports a failure.
		if err != nil {
			return err
		}

		// Only process HTML files. Comparing the extension avoids building a
		// lowercased copy of every visited path.
		if d.IsDir() || !strings.EqualFold(filepath.Ext(path), ".html") {
			return nil
		}

		elements, warnings, err := p.parseFile(path)
		if err != nil {
			result.Warnings = append(result.Warnings, fmt.Sprintf("Error parsing %s: %v", path, err))
			return nil // Continue processing other files.
		}

		result.Elements = append(result.Elements, elements...)
		result.Warnings = append(result.Warnings, warnings...)
		result.Stats.FilesProcessed++
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("error walking directory: %w", err)
	}

	// Derive the aggregate counters from the collected elements.
	p.calculateStats(result)

	return result, nil
}

// parseFile parses a single HTML file and returns the elements found in it
// along with any warnings raised while creating them. It returns an error if
// the file cannot be opened or the HTML cannot be parsed.
func (p *Parser) parseFile(filePath string) ([]Element, []string, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, nil, fmt.Errorf("error opening file: %w", err)
	}
	defer file.Close()

	doc, err := html.Parse(file)
	if err != nil {
		return nil, nil, fmt.Errorf("error parsing HTML: %w", err)
	}

	var (
		elements []Element
		warnings []string
	)
	p.findInsertrElements(doc, filePath, &elements, &warnings)

	return elements, warnings, nil
}

// findInsertrElements walks the tree rooted at node depth-first and appends an
// Element to *elements for every element node carrying the "insertr" class.
//
// When such a node is reported as a container by isContainer, the nodes
// returned by findViableChildren are recorded instead of the node itself and
// the container's subtree is not descended into any further. A nil node is
// ignored.
func (p *Parser) findInsertrElements(node *html.Node, filePath string, elements *[]Element, warnings *[]string) {
	if node == nil {
		return
	}

	if node.Type == html.ElementNode {
		classes := GetClasses(node)

		if ContainsClass(classes, "insertr") {
			if isContainer(node) {
				// Container element - expand to viable children.
				for _, child := range findViableChildren(node) {
					p.collectElement(child, filePath, GetClasses(child), elements, warnings)
				}
				// Don't recurse: the container's children are already handled.
				return
			}

			// Regular element - record the node itself, then keep descending.
			p.collectElement(node, filePath, classes, elements, warnings)
		}
	}

	// Recursively check children.
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		p.findInsertrElements(child, filePath, elements, warnings)
	}
}

// collectElement creates the Element for node and appends it to *elements,
// appending the accompanying warning to *warnings when one was produced.
func (p *Parser) collectElement(node *html.Node, filePath string, classes []string, elements *[]Element, warnings *[]string) {
	element, warning := p.createElement(node, filePath, classes)
	*elements = append(*elements, element)
	if warning != "" {
		*warnings = append(*warnings, warning)
	}
}

// createElement creates an Element from an HTML node.
//
// The content ID is taken from the node's existing attributes when available
// and generated otherwise. The second return value is a warning message, or
// the empty string when there is nothing to report; currently the only warning
// is for an element whose extracted text content is empty.
func (p *Parser) createElement(node *html.Node, filePath string, classes []string) (Element, string) {
	// Resolve content ID (existing or generated).
	contentID, hasExistingID := p.resolveContentID(node)
	if !hasExistingID {
		contentID = p.idGenerator.Generate(node)
	}

	element := Element{
		FilePath:  filePath,
		Node:      node,
		ContentID: contentID,
		Type:      p.detectContentType(node, classes),
		Tag:       strings.ToLower(node.Data),
		Classes:   classes,
		Content:   extractTextContent(node),
		HasID:     hasExistingID,
		Generated: !hasExistingID,
	}

	// Generate warnings for edge cases.
	var warning string
	if element.Content == "" {
		warning = fmt.Sprintf("Element <%s> with id '%s' has no text content", element.Tag, element.ContentID)
	}

	return element, warning
}

// resolveContentID gets the content ID from the node's existing attributes.
// It prefers a non-empty "id" attribute, then a non-empty "data-content-id"
// attribute, and reports false when neither is present.
func (p *Parser) resolveContentID(node *html.Node) (string, bool) {
	// 1. Check for existing HTML id attribute.
	if id := getAttribute(node, "id"); id != "" {
		return id, true
	}

	// 2. Check for data-content-id attribute.
	if contentID := getAttribute(node, "data-content-id"); contentID != "" {
		return contentID, true
	}

	// 3. No existing ID found.
	return "", false
}

// detectContentType determines the content type based on element and classes.
//
// An explicit type class wins and is checked in the order "insertr-markdown",
// "insertr-link", "insertr-text". Without one, the type is inferred from the
// lowercased tag name.
func (p *Parser) detectContentType(node *html.Node, classes []string) ContentType {
	// Check for explicit type classes first.
	switch {
	case ContainsClass(classes, "insertr-markdown"):
		return ContentMarkdown
	case ContainsClass(classes, "insertr-link"):
		return ContentLink
	case ContainsClass(classes, "insertr-text"):
		return ContentText
	}

	// Infer from HTML tag.
	switch strings.ToLower(node.Data) {
	case "p", "div", "section":
		// Paragraphs, divs and sections default to markdown for rich content.
		return ContentMarkdown
	case "a", "button":
		return ContentLink
	default:
		// Headings (h1-h6), spans and every other tag are plain text.
		return ContentText
	}
}

// calculateStats computes statistics for the parse result: the total number of
// elements, how many had an existing ID versus a generated one, and a count
// per content type. The ID and per-type counters are incremented rather than
// reset, so it is meant to be called once per result.
func (p *Parser) calculateStats(result *ParseResult) {
	// Writing to a nil map panics; make sure the breakdown map exists.
	if result.Stats.TypeBreakdown == nil {
		result.Stats.TypeBreakdown = make(map[ContentType]int)
	}

	result.Stats.TotalElements = len(result.Elements)

	// Index instead of ranging by value to avoid copying each Element.
	for i := range result.Elements {
		element := &result.Elements[i]

		// Count existing vs generated IDs.
		if element.HasID {
			result.Stats.ExistingIDs++
		} else {
			result.Stats.GeneratedIDs++
		}

		// Count content types.
		result.Stats.TypeBreakdown[element.Type]++
	}
}