Lazy audio loading for interactive and public readers

2026-05-23 17:48:03 +06:00
parent e0e3b65c75
commit 965470853e
3 changed files with 181 additions and 141 deletions
--- a/routes/public_routes.py
+++ b/routes/public_routes.py
@@ -1,5 +1,6 @@
 # routes/public_routes.py - Public (No Auth) Routes for Published Audiobooks

+import re
 import json
 from flask import Blueprint, jsonify, send_from_directory, abort

@@ -8,6 +9,33 @@ from db import get_db
 public_bp = Blueprint('public', __name__)


+# ============================================
+# Helpers
+# ============================================
+
+# Matches ASCII control characters (0x00-0x1F plus DEL 0x7F), except tab (0x09),
+# line feed (0x0A) and carriage return (0x0D), which the character class omits.
+_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
+
+
+def clean_str(s):
+    """Return ``s`` as text with disallowed control characters removed.
+
+    ``None`` becomes the empty string. Any other non-string value is
+    converted with ``str()`` first; every match of ``_CONTROL_CHAR_RE``
+    is then deleted from the result.
+    """
+    if s is None:
+        return ''
+    text = s if isinstance(s, str) else str(s)
+    return _CONTROL_CHAR_RE.sub('', text)
+
+
+def clean_transcription(transcription):
+    """Sanitize the ``'word'`` value of each transcription entry in place.
+
+    Only list inputs are processed; anything else is returned untouched.
+    Within a list, only dict entries that contain a ``'word'`` key are
+    modified (their value is passed through ``clean_str``). The same
+    object that was passed in is returned.
+    """
+    if not isinstance(transcription, list):
+        return transcription
+    for entry in transcription:
+        if isinstance(entry, dict) and 'word' in entry:
+            entry['word'] = clean_str(entry['word'])
+    return transcription
+
+
+# ============================================
+# Routes
+# ============================================
+
@public_bp.route('/home')
 def public_home():
    """Public homepage - Bookcase view of published audiobooks."""
@@ -68,7 +96,11 @@ def list_published_books():

@public_bp.route('/api/public/books/<int:project_id>', methods=['GET'])
 def get_published_book(project_id):
-    """Get full published book content for the reader."""
+    """
+    Get book metadata WITHOUT audio_data.
+    Audio is loaded lazily via /api/public/books/<id>/audio/<block_id>.
+    This keeps the response small (<1 MB) and avoids proxy truncation issues.
+    """
    db = get_db()
    cursor = db.cursor()
    
@@ -88,7 +120,9 @@ def get_published_book(project_id):
    chapters_data = []
    for chapter in chapters:
        cursor.execute('''
-            SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
+            SELECT id, block_order, block_type, content, audio_format, transcription,
+                   (audio_data IS NOT NULL AND audio_data != '') as has_audio
+            FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
        ''', (chapter['id'],))
        blocks = cursor.fetchall()
        
@@ -99,35 +133,79 @@ def get_published_book(project_id):
            ''', (block['id'],))
            images = cursor.fetchall()
            
+            transcription = []
+            if block['transcription']:
+                try:
+                    transcription = json.loads(block['transcription'])
+                    transcription = clean_transcription(transcription)
+                except (json.JSONDecodeError, TypeError):
+                    transcription = []
+            
            blocks_data.append({
                'id': block['id'],
                'block_order': block['block_order'],
-                'block_type': block['block_type'],
-                'content': block['content'],
-                'audio_data': block['audio_data'],
-                'audio_format': block['audio_format'],
-                'transcription': json.loads(block['transcription']) if block['transcription'] else [],
+                'block_type': clean_str(block['block_type']),
+                'content': clean_str(block['content']),
+                'audio_data': '',  # Empty here; loaded lazily by frontend
+                'audio_format': clean_str(block['audio_format']) or 'mp3',
+                'has_audio': bool(block['has_audio']),
+                'transcription': transcription,
                'images': [{
-                    'data': img['image_data'],
-                    'format': img['image_format'],
-                    'alt_text': img['alt_text'],
-                    'position': img['position']
+                    'data': clean_str(img['image_data']),
+                    'format': clean_str(img['image_format']) or 'png',
+                    'alt_text': clean_str(img['alt_text']),
+                    'position': clean_str(img['position']) or 'before'
                } for img in images]
            })
        
        chapters_data.append({
            'id': chapter['id'],
            'chapter_number': chapter['chapter_number'],
-            'title': chapter['title'],
+            'title': clean_str(chapter['title']),
            'blocks': blocks_data
        })
    
    return jsonify({
        'id': project['id'],
-        'name': project['name'],
-        'description': project['description'] or '',
-        'author': project['author'] or '',
+        'name': clean_str(project['name']),
+        'description': clean_str(project['description']) if project['description'] else '',
+        'author': clean_str(project['author']) if project['author'] else '',
        'thumbnail_data': project['thumbnail_data'],
        'thumbnail_format': project['thumbnail_format'] or 'png',
        'chapters': chapters_data
    })
+
+
+@public_bp.route('/api/public/books/<int:project_id>/audio/<int:block_id>', methods=['GET'])
+def get_public_block_audio(project_id, block_id):
+    """
+    Return audio_data (base64) for a single block in a published book.
+    No auth required since the book is published publicly.
+
+    Responds with a 404 JSON error when the project is missing or not
+    published, or when no block with ``block_id`` belongs to a chapter of
+    that project. A block that has no audio yields an empty ``audio_data``
+    string instead of an error. ``audio_format`` falls back to ``'mp3'``.
+    """
+    db = get_db()
+    cursor = db.cursor()
+
+    # Verify project is published
+    cursor.execute('SELECT is_published FROM projects WHERE id = ?', (project_id,))
+    project = cursor.fetchone()
+    if not project or not project['is_published']:
+        return jsonify({'error': 'Book not found or not published'}), 404
+
+    # Joining through chapters ensures the block really belongs to this
+    # project, so audio of unpublished books cannot be fetched by block id.
+    cursor.execute('''
+        SELECT mb.audio_data, mb.audio_format
+        FROM markdown_blocks mb
+        JOIN chapters c ON mb.chapter_id = c.id
+        WHERE mb.id = ? AND c.project_id = ?
+    ''', (block_id, project_id))
+    row = cursor.fetchone()
+
+    if not row:
+        return jsonify({'error': 'Block not found'}), 404
+
+    # Sanitize the format once so both responses below agree; previously the
+    # empty-audio response returned the raw, uncleaned column value.
+    audio_format = clean_str(row['audio_format']) or 'mp3'
+
+    if not row['audio_data']:
+        return jsonify({'audio_data': '', 'audio_format': audio_format})
+
+    return jsonify({
+        'audio_data': clean_str(row['audio_data']),
+        'audio_format': audio_format
+    })