Implement lazy audio loading to fix large response truncation

2026-05-23 07:45:02 +06:00
parent 36a842dc60
commit e0e3b65c75
3 changed files with 139 additions and 164 deletions
--- a/routes/project_routes.py
+++ b/routes/project_routes.py
@@ -3,7 +3,7 @@
 import re
 import json
 import base64
-from flask import Blueprint, request, jsonify, Response, stream_with_context
+from flask import Blueprint, request, jsonify

 from db import get_db, vacuum_db
 from auth import login_required
@@ -15,12 +15,10 @@ project_bp = Blueprint('project', __name__)
 # Helpers
 # ============================================

-# C0/C1 control characters except \t \n \r — these corrupt JSON streams.
 _CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')


 def clean_str(s):
-    """Strip raw control characters from a string. Returns '' for None."""
    if s is None:
        return ''
    if not isinstance(s, str):
@@ -29,7 +27,6 @@ def clean_str(s):


 def clean_transcription(transcription):
-    """Sanitize 'word' fields inside a transcription list."""
    if isinstance(transcription, list):
        for t in transcription:
            if isinstance(t, dict) and 'word' in t:
@@ -118,11 +115,9 @@ def create_project():
@login_required
 def get_project(project_id):
    """
-    Get a project with all its chapters and blocks.
-    
-    Streamed response: large projects (with many audio blocks) can produce
-    10-50 MB of JSON. We stream it in chunks and sanitize every string field
-    to prevent control characters from breaking JSON parsing on the client.
+    Get project metadata WITHOUT audio_data.
+    Audio is loaded lazily via /api/projects/<id>/audio/<block_id>.
+    This keeps the response small (<1 MB) and avoids proxy truncation issues.
    """
    db = get_db()
    cursor = db.cursor()
@@ -141,7 +136,10 @@ def get_project(project_id):
    chapters_data = []
    for chapter in chapters:
        cursor.execute('''
-            SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
+            SELECT id, block_order, block_type, content, tts_text,
+                   audio_format, transcription,
+                   (audio_data IS NOT NULL AND audio_data != '') as has_audio
+            FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
        ''', (chapter['id'],))
        blocks = cursor.fetchall()
        
@@ -152,7 +150,6 @@ def get_project(project_id):
            ''', (block['id'],))
            images = cursor.fetchall()
            
-            # Safely parse transcription (might be NULL, empty, or malformed)
            transcription = []
            if block['transcription']:
                try:
@@ -167,8 +164,9 @@ def get_project(project_id):
                'block_type': clean_str(block['block_type']),
                'content': clean_str(block['content']),
                'tts_text': clean_str(block['tts_text']),
-                'audio_data': clean_str(block['audio_data']),
+                'audio_data': '',  # Empty here; loaded lazily by frontend
                'audio_format': clean_str(block['audio_format']) or 'mp3',
+                'has_audio': bool(block['has_audio']),
                'transcription': transcription,
                'images': [{
                    'id': img['id'],
@@ -187,32 +185,43 @@ def get_project(project_id):
            'blocks': blocks_data
        })
    
-    response_data = {
+    return jsonify({
        'id': project['id'],
        'name': clean_str(project['name']),
        'created_at': clean_str(project['created_at']),
        'updated_at': clean_str(project['updated_at']),
        'chapters': chapters_data
-    }
+    })
+
+
+@project_bp.route('/api/projects/<int:project_id>/audio/<int:block_id>', methods=['GET'])
+@login_required
+def get_block_audio(project_id, block_id):
+    """
+    Return audio_data (base64) for a single block.
+    Used by the frontend to lazy-load audio after metadata is loaded.
+    """
+    db = get_db()
+    cursor = db.cursor()
    
-    # Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars
-    # to be escaped (\uXXXX) — slightly larger payload but guarantees the
-    # stream is pure ASCII, so no proxy can mis-handle multi-byte chars
-    # at chunk boundaries.
-    def generate():
-        json_str = json.dumps(response_data, ensure_ascii=True)
-        chunk_size = 64 * 1024  # 64 KB per chunk
-        for i in range(0, len(json_str), chunk_size):
-            yield json_str[i:i + chunk_size]
+    cursor.execute('''
+        SELECT mb.audio_data, mb.audio_format
+        FROM markdown_blocks mb
+        JOIN chapters c ON mb.chapter_id = c.id
+        WHERE mb.id = ? AND c.project_id = ?
+    ''', (block_id, project_id))
+    row = cursor.fetchone()
    
-    return Response(
-        stream_with_context(generate()),
-        mimetype='application/json; charset=utf-8',
-        headers={
-            'Cache-Control': 'no-cache',
-            'X-Accel-Buffering': 'no'  # Tell Nginx/Traefik: don't buffer this response
-        }
-    )
+    if not row:
+        return jsonify({'error': 'Block not found'}), 404
+    
+    if not row['audio_data']:
+        return jsonify({'audio_data': '', 'audio_format': row['audio_format'] or 'mp3'})
+    
+    return jsonify({
+        'audio_data': clean_str(row['audio_data']),
+        'audio_format': clean_str(row['audio_format']) or 'mp3'
+    })


@project_bp.route('/api/projects/<int:project_id>', methods=['PUT'])
@@ -281,13 +290,7 @@ def delete_project(project_id):
@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
@login_required
 def save_project_content(project_id):
-    """
-    Save all chapters and blocks for a project.
-    
-    Every string field is sanitized before insertion so that invalid
-    control characters never enter the database. This protects future
-    reads from the JSON corruption bug we saw on /api/projects/<id> GET.
-    """
+    """Save all chapters and blocks for a project."""
    data = request.json
    chapters = data.get('chapters', [])
    
@@ -328,7 +331,6 @@ def save_project_content(project_id):
        chapter_id = cursor.lastrowid
        
        for block in chapter.get('blocks', []):
-            # Clean transcription word fields before storing
            transcription = clean_transcription(block.get('transcription', []))
            
            cursor.execute('''
@@ -376,7 +378,7 @@ def save_project_content(project_id):
@project_bp.route('/api/projects/<int:project_id>/publish', methods=['POST'])
@login_required
 def publish_project(project_id):
-    """Publish a project to make it visible on public homepage."""
+    """Publish a project to the public homepage."""
    data = request.json or {}
    
    db = get_db()
@@ -387,7 +389,6 @@ def publish_project(project_id):
    if not project:
        return jsonify({'error': 'Project not found'}), 404
    
-    # Verify project has at least one chapter with audio
    cursor.execute('''
        SELECT COUNT(*) as cnt FROM markdown_blocks mb
        JOIN chapters c ON mb.chapter_id = c.id
@@ -422,7 +423,7 @@ def publish_project(project_id):
@project_bp.route('/api/projects/<int:project_id>/unpublish', methods=['POST'])
@login_required
 def unpublish_project(project_id):
-    """Unpublish a project (but keep author/description/category for easy republish)."""
+    """Unpublish a project."""
    db = get_db()
    cursor = db.cursor()
    
@@ -439,7 +440,7 @@ def unpublish_project(project_id):
@project_bp.route('/api/projects/<int:project_id>/thumbnail', methods=['POST'])
@login_required
 def upload_thumbnail(project_id):
-    """Upload a thumbnail image for the project."""
+    """Upload a thumbnail image."""
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    
@@ -488,76 +489,3 @@ def delete_thumbnail(project_id):
    cursor.execute('UPDATE projects SET thumbnail_data = NULL WHERE id = ?', (project_id,))
    db.commit()
    return jsonify({'success': True})
-
-# ============================================
-# DEBUG: Identify corrupt data
-# ============================================
-
-@project_bp.route('/api/projects/<int:project_id>/debug', methods=['GET'])
-@login_required
-def debug_project(project_id):
-    """
-    Scan a project for control characters and report which fields are dirty.
-    Visit: /api/projects/<id>/debug after logging in.
-    """
-    db = get_db()
-    cursor = db.cursor()
-    
-    cursor.execute('SELECT id, name FROM projects WHERE id = ?', (project_id,))
-    project = cursor.fetchone()
-    if not project:
-        return jsonify({'error': 'Project not found'}), 404
-    
-    def find_bad_chars(s):
-        """Return list of (position, char_code) for any control char found."""
-        if not s or not isinstance(s, str):
-            return []
-        bad = []
-        for i, ch in enumerate(s):
-            code = ord(ch)
-            # Allow \t (9), \n (10), \r (13). Anything else <32 or 127 is bad.
-            if (code < 32 and code not in (9, 10, 13)) or code == 127:
-                bad.append({'pos': i, 'code': code, 'hex': f'0x{code:02x}'})
-                if len(bad) >= 5:  # cap at 5 per field
-                    break
-        return bad
-    
-    report = {
-        'project_id': project['id'],
-        'project_name': project['name'],
-        'issues': []
-    }
-    
-    cursor.execute('SELECT * FROM chapters WHERE project_id = ? ORDER BY chapter_number', (project_id,))
-    chapters = cursor.fetchall()
-    
-    for chapter in chapters:
-        ch_num = chapter['chapter_number']
-        
-        for field in ('title', 'voice'):
-            bad = find_bad_chars(chapter[field])
-            if bad:
-                report['issues'].append({
-                    'where': f'chapter {ch_num} -> {field}',
-                    'bad_chars': bad,
-                    'sample': repr((chapter[field] or '')[:80])
-                })
-        
-        cursor.execute('SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order', (chapter['id'],))
-        blocks = cursor.fetchall()
-        
-        for block in blocks:
-            b_order = block['block_order']
-            for field in ('block_type', 'content', 'tts_text', 'audio_data', 'audio_format', 'transcription'):
-                bad = find_bad_chars(block[field])
-                if bad:
-                    val = block[field] or ''
-                    report['issues'].append({
-                        'where': f'chapter {ch_num}, block {b_order} -> {field}',
-                        'field_length': len(val),
-                        'bad_chars': bad,
-                        'sample_around_first_bad': repr(val[max(0, bad[0]['pos']-20):bad[0]['pos']+20])
-                    })
-    
-    report['total_issues'] = len(report['issues'])
-    return jsonify(report)