feat: implement unified content engine to eliminate ID generation inconsistencies
- Create internal/engine module as single source of truth for content processing
- Consolidate 4 separate ID generation systems into one unified engine
- Update API handlers to use engine for consistent server-side ID generation
- Remove frontend client-side ID generation, delegate to server engine
- Ensure identical HTML markup + file path produces identical content IDs
- Resolve content persistence failures caused by ID fragmentation between manual editing and enhancement processes
This commit is contained in:
285
internal/engine/utils.go
Normal file
285
internal/engine/utils.go
Normal file
@@ -0,0 +1,285 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// GetClasses extracts CSS classes from an HTML node
|
||||
func GetClasses(node *html.Node) []string {
|
||||
classAttr := getAttribute(node, "class")
|
||||
if classAttr == "" {
|
||||
return []string{}
|
||||
}
|
||||
|
||||
classes := strings.Fields(classAttr)
|
||||
return classes
|
||||
}
|
||||
|
||||
// ContainsClass reports whether target appears in classes. The comparison is
// an exact, case-sensitive string match.
func ContainsClass(classes []string, target string) bool {
	for i := 0; i < len(classes); i++ {
		if classes[i] == target {
			return true
		}
	}
	return false
}
|
||||
|
||||
// getAttribute gets an attribute value from an HTML node
|
||||
func getAttribute(node *html.Node, key string) string {
|
||||
for _, attr := range node.Attr {
|
||||
if attr.Key == key {
|
||||
return attr.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractTextContent gets the text content from an HTML node
|
||||
func extractTextContent(node *html.Node) string {
|
||||
var text strings.Builder
|
||||
extractTextRecursive(node, &text)
|
||||
return strings.TrimSpace(text.String())
|
||||
}
|
||||
|
||||
// extractTextRecursive recursively extracts text from node and children
|
||||
func extractTextRecursive(node *html.Node, text *strings.Builder) {
|
||||
if node.Type == html.TextNode {
|
||||
text.WriteString(node.Data)
|
||||
}
|
||||
|
||||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||
// Skip script and style elements
|
||||
if child.Type == html.ElementNode &&
|
||||
(child.Data == "script" || child.Data == "style") {
|
||||
continue
|
||||
}
|
||||
extractTextRecursive(child, text)
|
||||
}
|
||||
}
|
||||
|
||||
// hasOnlyTextContent checks if a node contains only text content (no nested HTML elements)
|
||||
// DEPRECATED: Use hasEditableContent for more sophisticated detection
|
||||
func hasOnlyTextContent(node *html.Node) bool {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
|
||||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||
switch child.Type {
|
||||
case html.ElementNode:
|
||||
// Found a nested HTML element - not text-only
|
||||
return false
|
||||
case html.TextNode:
|
||||
// Text nodes are fine, continue checking
|
||||
continue
|
||||
default:
|
||||
// Comments, etc. - continue checking
|
||||
continue
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// inlineFormattingTags is the set of inline elements that may appear inside
// editable content without making it unsafe to edit.
var inlineFormattingTags = map[string]bool{
	"a":      true, // links within content are fine
	"b":      true,
	"code":   true,
	"em":     true,
	"i":      true,
	"small":  true,
	"span":   true,
	"strong": true,
	"sub":    true,
	"sup":    true,
}
|
||||
|
||||
// blockingElements is the set of elements whose presence inside a node marks
// that node as not safely editable.
var blockingElements = map[string]bool{
	// Form controls.
	"button":   true, // buttons shouldn't be nested in paragraphs
	"form":     true,
	"input":    true,
	"select":   true,
	"textarea": true,

	// Media and embedded content.
	"audio":  true,
	"canvas": true,
	"embed":  true,
	"iframe": true,
	"img":    true,
	"object": true,
	"svg":    true,
	"video":  true,

	// Block-level structural and semantic elements.
	"article": true,
	"aside":   true,
	"div":     true, // nested divs usually indicate complex structure
	"footer":  true,
	"header":  true,
	"main":    true,
	"nav":     true,
	"section": true,

	// Lists and tables.
	"dl":    true,
	"ol":    true,
	"table": true,
	"ul":    true,
}
|
||||
|
||||
// hasEditableContent checks if a node contains content that can be safely edited
|
||||
// This includes text and safe inline formatting elements
|
||||
func hasEditableContent(node *html.Node) bool {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
|
||||
return hasOnlyTextAndSafeFormatting(node)
|
||||
}
|
||||
|
||||
// hasOnlyTextAndSafeFormatting recursively checks if content is safe for editing
|
||||
func hasOnlyTextAndSafeFormatting(node *html.Node) bool {
|
||||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||
switch child.Type {
|
||||
case html.TextNode:
|
||||
continue // Text is always safe
|
||||
case html.ElementNode:
|
||||
// Check if it's a blocking element
|
||||
if blockingElements[child.Data] {
|
||||
return false
|
||||
}
|
||||
// Allow safe inline formatting
|
||||
if inlineFormattingTags[child.Data] {
|
||||
// Recursively validate the formatting element
|
||||
if !hasOnlyTextAndSafeFormatting(child) {
|
||||
return false
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Unknown/unsafe element
|
||||
return false
|
||||
default:
|
||||
continue // Comments, whitespace, etc.
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// isContainer checks if a tag is typically used as a container element
|
||||
func isContainer(node *html.Node) bool {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
|
||||
containerTags := map[string]bool{
|
||||
"div": true,
|
||||
"section": true,
|
||||
"article": true,
|
||||
"header": true,
|
||||
"footer": true,
|
||||
"main": true,
|
||||
"aside": true,
|
||||
"nav": true,
|
||||
}
|
||||
|
||||
return containerTags[node.Data]
|
||||
}
|
||||
|
||||
// findViableChildren finds all child elements that are viable for editing
|
||||
func findViableChildren(node *html.Node) []*html.Node {
|
||||
var viable []*html.Node
|
||||
|
||||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||
// Skip whitespace-only text nodes
|
||||
if child.Type == html.TextNode {
|
||||
if strings.TrimSpace(child.Data) == "" {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Only consider element nodes
|
||||
if child.Type != html.ElementNode {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip self-closing elements for now
|
||||
if isSelfClosing(child) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if element has editable content (improved logic)
|
||||
if hasEditableContent(child) {
|
||||
viable = append(viable, child)
|
||||
}
|
||||
}
|
||||
|
||||
return viable
|
||||
}
|
||||
|
||||
// findViableChildrenLegacy uses the old text-only logic for backwards compatibility
|
||||
func findViableChildrenLegacy(node *html.Node) []*html.Node {
|
||||
var viable []*html.Node
|
||||
|
||||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||
if child.Type == html.TextNode {
|
||||
if strings.TrimSpace(child.Data) == "" {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if child.Type != html.ElementNode {
|
||||
continue
|
||||
}
|
||||
|
||||
if isSelfClosing(child) {
|
||||
continue
|
||||
}
|
||||
|
||||
if hasOnlyTextContent(child) {
|
||||
viable = append(viable, child)
|
||||
}
|
||||
}
|
||||
|
||||
return viable
|
||||
}
|
||||
|
||||
// isSelfClosing checks if an element is typically self-closing
|
||||
func isSelfClosing(node *html.Node) bool {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
|
||||
selfClosingTags := map[string]bool{
|
||||
"img": true,
|
||||
"input": true,
|
||||
"br": true,
|
||||
"hr": true,
|
||||
"meta": true,
|
||||
"link": true,
|
||||
"area": true,
|
||||
"base": true,
|
||||
"col": true,
|
||||
"embed": true,
|
||||
"source": true,
|
||||
"track": true,
|
||||
"wbr": true,
|
||||
}
|
||||
|
||||
return selfClosingTags[node.Data]
|
||||
}
|
||||
|
||||
// Note: FindElementInDocument functions removed - will be reimplemented in engine if needed
|
||||
|
||||
// GetAttribute gets an attribute value from an HTML node (exported version)
|
||||
func GetAttribute(node *html.Node, key string) string {
|
||||
return getAttribute(node, key)
|
||||
}
|
||||
|
||||
// HasEditableContent checks if a node has editable content (exported version)
|
||||
func HasEditableContent(node *html.Node) bool {
|
||||
return hasEditableContent(node)
|
||||
}
|
||||
|
||||
// FindViableChildren finds viable children for editing (exported version)
|
||||
func FindViableChildren(node *html.Node) []*html.Node {
|
||||
return findViableChildren(node)
|
||||
}
|
||||
Reference in New Issue
Block a user