From 1f97acc1bfe604d46629328a5398ca7c2b1c91fc Mon Sep 17 00:00:00 2001 From: Joakim Date: Wed, 3 Sep 2025 12:23:20 +0200 Subject: [PATCH] Implement hash-based content ID generation - Replace content text sampling with SHA1 content hashing - Generate IDs with format: context-purpose-hash (e.g., services-text-0d96da) - Decouple IDs from readable text: the content component is now a 6-character SHA1 prefix of the element text instead of its first words (note: the hash, and therefore the ID, still changes whenever the text changes) - Maintain semantic meaning through context and purpose components - Support disposable development IDs with production stability --- insertr-cli/pkg/parser/id_generator.go | 34 +++++++++----------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/insertr-cli/pkg/parser/id_generator.go b/insertr-cli/pkg/parser/id_generator.go index 73632ca..932ed22 100644 --- a/insertr-cli/pkg/parser/id_generator.go +++ b/insertr-cli/pkg/parser/id_generator.go @@ -25,9 +25,9 @@ func NewIDGenerator() *IDGenerator { func (g *IDGenerator) Generate(node *html.Node) string { context := g.getSemanticContext(node) purpose := g.getPurpose(node) - content := g.getContentSample(node) + contentHash := g.getContentHash(node) - baseID := g.createBaseID(context, purpose, content) + baseID := g.createBaseID(context, purpose, contentHash) return g.ensureUnique(baseID) } @@ -111,27 +111,19 @@ func (g *IDGenerator) getPurpose(node *html.Node) string { } } -// getContentSample gets a sample of content for ID generation -func (g *IDGenerator) getContentSample(node *html.Node) string { +// getContentHash creates a short hash of the content for ID generation +func (g *IDGenerator) getContentHash(node *html.Node) string { text := extractTextContent(node) - // Clean and normalize text - text = strings.ToLower(text) - text = regexp.MustCompile(`[^a-z0-9\s]+`).ReplaceAllString(text, "") - text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ") - text = strings.TrimSpace(text) + // Create hash of the text content + hash := fmt.Sprintf("%x", sha1.Sum([]byte(text))) - // Take first few words - words := 
strings.Fields(text) - if len(words) > 3 { - words = words[:3] - } - - return strings.Join(words, "-") + // Return first 6 characters for brevity + return hash[:6] } // createBaseID creates the base ID from components -func (g *IDGenerator) createBaseID(context, purpose, content string) string { +func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string { parts := []string{} // Add context if meaningful @@ -142,10 +134,8 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string { // Add purpose parts = append(parts, purpose) - // Add content sample if available and meaningful - if content != "" && content != purpose { - parts = append(parts, content) - } + // Always add content hash for uniqueness + parts = append(parts, contentHash) baseID := strings.Join(parts, "-") @@ -155,7 +145,7 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string { // Ensure it's not empty if baseID == "" { - baseID = "content" + baseID = fmt.Sprintf("content-%s", contentHash) } return baseID