feat: implement unified content engine to eliminate ID generation inconsistencies

- Create internal/engine module as single source of truth for content processing
- Consolidate 4 separate ID generation systems into one unified engine
- Update API handlers to use engine for consistent server-side ID generation
- Remove frontend client-side ID generation, delegate to server engine
- Ensure identical HTML markup + file path produces identical content IDs
- Resolve content persistence failures caused by ID fragmentation between manual editing and enhancement processes
This commit is contained in:
2025-09-16 15:04:27 +02:00
parent c1bc28d107
commit 84c90f428d
12 changed files with 1426 additions and 267 deletions

View File

@@ -106,220 +106,23 @@ export class InsertrCore {
getElementMetadata(element) {
const existingId = element.getAttribute('data-content-id');
// Always provide both existing ID (if any) and element context
// Backend will use existing ID if provided, or generate new one from context
// Send HTML markup to server for unified ID generation
return {
contentId: existingId, // null if new content, existing ID if updating
contentType: element.getAttribute('data-content-type') || this.detectContentType(element),
element: element,
elementContext: this.extractElementContext(element)
htmlMarkup: element.outerHTML // Server will generate ID from this
};
}
// Extract element context for backend ID generation
extractElementContext(element) {
return {
tag: element.tagName.toLowerCase(),
classes: Array.from(element.classList),
original_content: element.textContent.trim(),
parent_context: this.getSemanticContext(element),
purpose: this.getPurpose(element)
};
}
// Generate deterministic ID using same algorithm as CLI parser
generateTempId(element) {
return this.generateDeterministicId(element);
}
// Generate deterministic content ID (matches CLI parser algorithm)
generateDeterministicId(element) {
const context = this.getSemanticContext(element);
const purpose = this.getPurpose(element);
const contentHash = this.getContentHash(element);
return this.createBaseId(context, purpose, contentHash);
}
// Get semantic context from parent elements (matches CLI algorithm)
getSemanticContext(element) {
let parent = element.parentElement;
while (parent && parent.nodeType === Node.ELEMENT_NODE) {
const classList = Array.from(parent.classList);
// Check for common semantic section classes
const semanticClasses = ['hero', 'services', 'nav', 'navbar', 'footer', 'about', 'contact', 'testimonial'];
for (const semanticClass of semanticClasses) {
if (classList.includes(semanticClass)) {
return semanticClass;
}
}
// Check for semantic HTML elements
const tag = parent.tagName.toLowerCase();
if (['nav', 'header', 'footer', 'main', 'aside'].includes(tag)) {
return tag;
}
parent = parent.parentElement;
// Get current file path from URL for consistent ID generation
getCurrentFilePath() {
const path = window.location.pathname;
if (path === '/' || path === '') {
return 'index.html';
}
return 'content';
}
// Get purpose/role of the element (matches CLI algorithm)
getPurpose(element) {
const tag = element.tagName.toLowerCase();
const classList = Array.from(element.classList);
// Check for specific CSS classes that indicate purpose
for (const className of classList) {
if (className.includes('title')) return 'title';
if (className.includes('headline')) return 'headline';
if (className.includes('description')) return 'description';
if (className.includes('subtitle')) return 'subtitle';
if (className.includes('cta')) return 'cta';
if (className.includes('button')) return 'button';
if (className.includes('logo')) return 'logo';
if (className.includes('lead')) return 'lead';
}
// Infer purpose from HTML tag
switch (tag) {
case 'h1':
return 'title';
case 'h2':
return 'subtitle';
case 'h3':
case 'h4':
case 'h5':
case 'h6':
return 'heading';
case 'p':
return 'text';
case 'a':
return 'link';
case 'button':
return 'button';
default:
return 'content';
}
}
// Generate content hash (matches CLI algorithm)
getContentHash(element) {
const text = element.textContent.trim();
// Simple SHA-1 implementation for consistent hashing
return this.sha1(text).substring(0, 6);
}
// Simple SHA-1 implementation (matches Go crypto/sha1)
sha1(str) {
// Convert string to UTF-8 bytes
const utf8Bytes = new TextEncoder().encode(str);
// SHA-1 implementation
const h = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0];
const messageLength = utf8Bytes.length;
// Pre-processing: adding padding bits
const paddedMessage = new Uint8Array(Math.ceil((messageLength + 9) / 64) * 64);
paddedMessage.set(utf8Bytes);
paddedMessage[messageLength] = 0x80;
// Append original length in bits as 64-bit big-endian integer
const bitLength = messageLength * 8;
const view = new DataView(paddedMessage.buffer);
view.setUint32(paddedMessage.length - 4, bitLength, false); // big-endian
// Process message in 512-bit chunks
for (let chunk = 0; chunk < paddedMessage.length; chunk += 64) {
const w = new Array(80);
// Break chunk into sixteen 32-bit words
for (let i = 0; i < 16; i++) {
w[i] = view.getUint32(chunk + i * 4, false); // big-endian
}
// Extend the words
for (let i = 16; i < 80; i++) {
w[i] = this.leftRotate(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
}
// Initialize hash value for this chunk
let [a, b, c, d, e] = h;
// Main loop
for (let i = 0; i < 80; i++) {
let f, k;
if (i < 20) {
f = (b & c) | ((~b) & d);
k = 0x5A827999;
} else if (i < 40) {
f = b ^ c ^ d;
k = 0x6ED9EBA1;
} else if (i < 60) {
f = (b & c) | (b & d) | (c & d);
k = 0x8F1BBCDC;
} else {
f = b ^ c ^ d;
k = 0xCA62C1D6;
}
const temp = (this.leftRotate(a, 5) + f + e + k + w[i]) >>> 0;
e = d;
d = c;
c = this.leftRotate(b, 30);
b = a;
a = temp;
}
// Add this chunk's hash to result
h[0] = (h[0] + a) >>> 0;
h[1] = (h[1] + b) >>> 0;
h[2] = (h[2] + c) >>> 0;
h[3] = (h[3] + d) >>> 0;
h[4] = (h[4] + e) >>> 0;
}
// Produce the final hash value as a 160-bit hex string
return h.map(x => x.toString(16).padStart(8, '0')).join('');
}
// Left rotate function for SHA-1
leftRotate(value, amount) {
return ((value << amount) | (value >>> (32 - amount))) >>> 0;
}
// Create base ID from components (matches CLI algorithm)
createBaseId(context, purpose, contentHash) {
const parts = [];
// Add context if meaningful
if (context !== 'content') {
parts.push(context);
}
// Add purpose
parts.push(purpose);
// Always add content hash for uniqueness
parts.push(contentHash);
let baseId = parts.join('-');
// Clean up the ID
baseId = baseId.replace(/-+/g, '-');
baseId = baseId.replace(/^-+|-+$/g, '');
// Ensure it's not empty
if (!baseId) {
baseId = `content-${contentHash}`;
}
return baseId;
// Remove leading slash: "/about.html" → "about.html"
return path.replace(/^\//, '');
}
// Detect content type for elements without data-content-type