Implement hash-based content ID generation

- Replace content-text sampling with SHA-1 content hashing (first 6 hex characters of the digest)
- Generate IDs with format: context-purpose-hash (e.g., services-text-0d96da)
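- Achieve content independence in the ID's readable parts: IDs no longer embed words from the text (note: the hash suffix is still derived from the text, so it changes when the text changes)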
- Maintain semantic meaning through context and purpose components
- Support disposable development IDs with production stability
This commit is contained in:
2025-09-03 12:23:20 +02:00
parent 81d8ef2bf5
commit 1f97acc1bf

View File

@@ -25,9 +25,9 @@ func NewIDGenerator() *IDGenerator {
func (g *IDGenerator) Generate(node *html.Node) string { func (g *IDGenerator) Generate(node *html.Node) string {
context := g.getSemanticContext(node) context := g.getSemanticContext(node)
purpose := g.getPurpose(node) purpose := g.getPurpose(node)
content := g.getContentSample(node) contentHash := g.getContentHash(node)
baseID := g.createBaseID(context, purpose, content) baseID := g.createBaseID(context, purpose, contentHash)
return g.ensureUnique(baseID) return g.ensureUnique(baseID)
} }
@@ -111,27 +111,19 @@ func (g *IDGenerator) getPurpose(node *html.Node) string {
} }
} }
// getContentSample gets a sample of content for ID generation // getContentHash creates a short hash of the content for ID generation
func (g *IDGenerator) getContentSample(node *html.Node) string { func (g *IDGenerator) getContentHash(node *html.Node) string {
text := extractTextContent(node) text := extractTextContent(node)
// Clean and normalize text // Create hash of the text content
text = strings.ToLower(text) hash := fmt.Sprintf("%x", sha1.Sum([]byte(text)))
text = regexp.MustCompile(`[^a-z0-9\s]+`).ReplaceAllString(text, "")
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
text = strings.TrimSpace(text)
// Take first few words // Return first 6 characters for brevity
words := strings.Fields(text) return hash[:6]
if len(words) > 3 {
words = words[:3]
}
return strings.Join(words, "-")
} }
// createBaseID creates the base ID from components // createBaseID creates the base ID from components
func (g *IDGenerator) createBaseID(context, purpose, content string) string { func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
parts := []string{} parts := []string{}
// Add context if meaningful // Add context if meaningful
@@ -142,10 +134,8 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string {
// Add purpose // Add purpose
parts = append(parts, purpose) parts = append(parts, purpose)
// Add content sample if available and meaningful // Always add content hash for uniqueness
if content != "" && content != purpose { parts = append(parts, contentHash)
parts = append(parts, content)
}
baseID := strings.Join(parts, "-") baseID := strings.Join(parts, "-")
@@ -155,7 +145,7 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string {
// Ensure it's not empty // Ensure it's not empty
if baseID == "" { if baseID == "" {
baseID = "content" baseID = fmt.Sprintf("content-%s", contentHash)
} }
return baseID return baseID