Files
insertr/internal/parser/parser.go
Joakim 72bd31b626 feat: implement collision-free lightweight hierarchical ID generation
- Replace content-hash based ID generation with position-based algorithm
- Use file + element identity + position index + hash for unique IDs
- Generate human-readable prefixes (e.g. index-lead-, index-p-2-)
- Add collision-resistant hash suffixes for guaranteed uniqueness
- Update Generate() to accept filePath parameter for context
- Fix ID collisions where hero and footer elements shared same ID
- Clean demo site files removing all data-content-id attributes
- Preserve insertr-gate elements for authentication functionality

Results: Hero gets 'index-lead-2-fc31f2', footer gets 'index-p-13-99fd13'
No more content cross-contamination between different elements.
2025-09-11 17:38:15 +02:00

231 lines
5.8 KiB
Go

package parser
import (
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"golang.org/x/net/html"
)
// Parser handles HTML parsing and element detection.
//
// It parses HTML documents and collects an Element for every node that
// carries the "insertr" class (or, for container nodes, for the children
// selected in its place).
type Parser struct {
// idGenerator supplies a content ID for nodes that have neither an "id"
// nor a "data-content-id" attribute.
idGenerator *IDGenerator
}
// New returns a ready-to-use Parser equipped with a fresh ID generator.
func New() *Parser {
	p := new(Parser)
	p.idGenerator = NewIDGenerator()
	return p
}
// ParseDirectory walks the tree rooted at dir and parses every file whose
// path ends in ".html" (compared case-insensitively).
//
// A file that cannot be parsed does not abort the run: the failure is
// appended to the result's Warnings and the walk moves on. An error reported
// by the directory walk itself stops processing and is returned wrapped.
// On success the returned ParseResult holds all collected elements, all
// warnings, and the computed statistics.
func (p *Parser) ParseDirectory(dir string) (*ParseResult, error) {
	result := &ParseResult{
		Elements: make([]Element, 0),
		Warnings: make([]string, 0),
		Stats: ParseStats{
			TypeBreakdown: make(map[ContentType]int),
		},
	}

	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			// Propagate traversal failures; this ends the walk.
			return walkErr
		case entry.IsDir():
			return nil
		case !strings.HasSuffix(strings.ToLower(path), ".html"):
			// Not an HTML file.
			return nil
		}

		found, notes, parseErr := p.parseFile(path)
		if parseErr != nil {
			// Downgrade per-file failures to warnings so the remaining
			// files are still processed.
			msg := fmt.Sprintf("Error parsing %s: %v", path, parseErr)
			result.Warnings = append(result.Warnings, msg)
			return nil
		}

		result.Elements = append(result.Elements, found...)
		result.Warnings = append(result.Warnings, notes...)
		result.Stats.FilesProcessed++
		return nil
	}

	if err := filepath.WalkDir(dir, visit); err != nil {
		return nil, fmt.Errorf("error walking directory: %w", err)
	}

	p.calculateStats(result)
	return result, nil
}
// parseFile opens the HTML file at filePath, parses it, and returns the
// elements and warnings collected from its node tree.
//
// A non-nil error is returned (with nil slices) when the file cannot be
// opened or its contents cannot be parsed as HTML.
func (p *Parser) parseFile(filePath string) ([]Element, []string, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, nil, fmt.Errorf("error opening file: %w", openErr)
	}
	defer f.Close()

	root, parseErr := html.Parse(f)
	if parseErr != nil {
		return nil, nil, fmt.Errorf("error parsing HTML: %w", parseErr)
	}

	var (
		found []Element
		notes []string
	)
	p.findInsertrElements(root, filePath, &found, &notes)
	return found, notes, nil
}
// findInsertrElements performs a depth-first search below node for elements
// carrying the "insertr" class, appending one Element per match to *elements
// and any non-empty warning to *warnings.
//
// When isContainer reports a matching node as a container, the nodes returned
// by findViableChildren are recorded in its place and the container's subtree
// is not searched any further. For any other matching node the node itself is
// recorded and the search continues into its children.
func (p *Parser) findInsertrElements(node *html.Node, filePath string, elements *[]Element, warnings *[]string) {
	// record builds the Element for n and stores it together with its warning.
	record := func(n *html.Node, classList []string) {
		el, warn := p.createElement(n, filePath, classList)
		*elements = append(*elements, el)
		if warn != "" {
			*warnings = append(*warnings, warn)
		}
	}

	if node.Type == html.ElementNode {
		if classes := GetClasses(node); ContainsClass(classes, "insertr") {
			if isContainer(node) {
				for _, kid := range findViableChildren(node) {
					record(kid, GetClasses(kid))
				}
				// The container's children have been handled; do not descend.
				return
			}
			record(node, classes)
		}
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		p.findInsertrElements(c, filePath, elements, warnings)
	}
}
// createElement builds the Element describing node, which was found in the
// file at filePath and carries the given classes.
//
// The content ID comes from the node's existing attributes when available and
// is otherwise produced by the parser's ID generator; HasID and Generated
// record which of the two happened. The second return value is a warning
// message when the node has no text content, or "" otherwise.
func (p *Parser) createElement(node *html.Node, filePath string, classes []string) (Element, string) {
	id, found := p.resolveContentID(node)
	if !found {
		id = p.idGenerator.Generate(node, filePath)
	}

	kind := p.detectContentType(node, classes)
	text := extractTextContent(node)

	el := Element{
		FilePath:  filePath,
		Node:      node,
		ContentID: id,
		Type:      kind,
		Tag:       strings.ToLower(node.Data),
		Classes:   classes,
		Content:   text,
		HasID:     found,
		Generated: !found,
	}

	if text != "" {
		return el, ""
	}

	// Empty text is allowed but worth flagging to the caller.
	return el, fmt.Sprintf("Element <%s> with id '%s' has no text content", el.Tag, el.ContentID)
}
// resolveContentID looks for a content ID already present on node.
//
// The "id" attribute is consulted first, then "data-content-id"; the first
// non-empty value wins and is returned with true. If neither attribute has a
// value, it returns "" and false.
func (p *Parser) resolveContentID(node *html.Node) (string, bool) {
	// Listed in priority order.
	candidates := [...]string{"id", "data-content-id"}

	for _, name := range candidates {
		if value := getAttribute(node, name); value != "" {
			return value, true
		}
	}
	return "", false
}
// detectContentType decides which ContentType applies to node.
//
// An explicit type class takes precedence, checked in this order:
// "insertr-markdown", "insertr-link", "insertr-text". Without one, the type
// is inferred from the lower-cased tag name: p, div, section and span map to
// markdown; a and button map to link; everything else, headings included,
// maps to plain text.
func (p *Parser) detectContentType(node *html.Node, classes []string) ContentType {
	// Explicit type classes override any tag-based inference.
	switch {
	case ContainsClass(classes, "insertr-markdown"):
		return ContentMarkdown
	case ContainsClass(classes, "insertr-link"):
		return ContentLink
	case ContainsClass(classes, "insertr-text"):
		return ContentText
	}

	switch strings.ToLower(node.Data) {
	case "p", "div", "section", "span":
		// Block and inline containers default to markdown for rich content.
		return ContentMarkdown
	case "a", "button":
		return ContentLink
	}

	// h1-h6 and any unrecognised tag are treated as plain text.
	return ContentText
}
// calculateStats fills in result.Stats from result.Elements: the total number
// of elements, how many carried an existing ID versus a generated one, and a
// per-content-type count in TypeBreakdown.
func (p *Parser) calculateStats(result *ParseResult) {
	stats := &result.Stats
	stats.TotalElements = len(result.Elements)

	for i := range result.Elements {
		el := &result.Elements[i]

		if el.HasID {
			stats.ExistingIDs++
		} else {
			stats.GeneratedIDs++
		}

		stats.TypeBreakdown[el.Type]++
	}
}