Implement hash-based content ID generation
- Replace content text sampling with SHA1 content hashing - Generate IDs with format: context-purpose-hash (e.g., services-text-0d96da) - Achieve content independence: IDs stable when text changes - Maintain semantic meaning through context and purpose components - Support disposable development IDs with production stability
This commit is contained in:
@@ -25,9 +25,9 @@ func NewIDGenerator() *IDGenerator {
|
|||||||
func (g *IDGenerator) Generate(node *html.Node) string {
|
func (g *IDGenerator) Generate(node *html.Node) string {
|
||||||
context := g.getSemanticContext(node)
|
context := g.getSemanticContext(node)
|
||||||
purpose := g.getPurpose(node)
|
purpose := g.getPurpose(node)
|
||||||
content := g.getContentSample(node)
|
contentHash := g.getContentHash(node)
|
||||||
|
|
||||||
baseID := g.createBaseID(context, purpose, content)
|
baseID := g.createBaseID(context, purpose, contentHash)
|
||||||
return g.ensureUnique(baseID)
|
return g.ensureUnique(baseID)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,27 +111,19 @@ func (g *IDGenerator) getPurpose(node *html.Node) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// getContentSample gets a sample of content for ID generation
|
// getContentHash creates a short hash of the content for ID generation
|
||||||
func (g *IDGenerator) getContentSample(node *html.Node) string {
|
func (g *IDGenerator) getContentHash(node *html.Node) string {
|
||||||
text := extractTextContent(node)
|
text := extractTextContent(node)
|
||||||
|
|
||||||
// Clean and normalize text
|
// Create hash of the text content
|
||||||
text = strings.ToLower(text)
|
hash := fmt.Sprintf("%x", sha1.Sum([]byte(text)))
|
||||||
text = regexp.MustCompile(`[^a-z0-9\s]+`).ReplaceAllString(text, "")
|
|
||||||
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
|
|
||||||
text = strings.TrimSpace(text)
|
|
||||||
|
|
||||||
// Take first few words
|
// Return first 6 characters for brevity
|
||||||
words := strings.Fields(text)
|
return hash[:6]
|
||||||
if len(words) > 3 {
|
|
||||||
words = words[:3]
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.Join(words, "-")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// createBaseID creates the base ID from components
|
// createBaseID creates the base ID from components
|
||||||
func (g *IDGenerator) createBaseID(context, purpose, content string) string {
|
func (g *IDGenerator) createBaseID(context, purpose, contentHash string) string {
|
||||||
parts := []string{}
|
parts := []string{}
|
||||||
|
|
||||||
// Add context if meaningful
|
// Add context if meaningful
|
||||||
@@ -142,10 +134,8 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string {
|
|||||||
// Add purpose
|
// Add purpose
|
||||||
parts = append(parts, purpose)
|
parts = append(parts, purpose)
|
||||||
|
|
||||||
// Add content sample if available and meaningful
|
// Always add content hash for uniqueness
|
||||||
if content != "" && content != purpose {
|
parts = append(parts, contentHash)
|
||||||
parts = append(parts, content)
|
|
||||||
}
|
|
||||||
|
|
||||||
baseID := strings.Join(parts, "-")
|
baseID := strings.Join(parts, "-")
|
||||||
|
|
||||||
@@ -155,7 +145,7 @@ func (g *IDGenerator) createBaseID(context, purpose, content string) string {
|
|||||||
|
|
||||||
// Ensure it's not empty
|
// Ensure it's not empty
|
||||||
if baseID == "" {
|
if baseID == "" {
|
||||||
baseID = "content"
|
baseID = fmt.Sprintf("content-%s", contentHash)
|
||||||
}
|
}
|
||||||
|
|
||||||
return baseID
|
return baseID
|
||||||
|
|||||||
Reference in New Issue
Block a user