Implement lazy audio loading to fix large response truncation
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
import re
|
||||
import json
|
||||
import base64
|
||||
from flask import Blueprint, request, jsonify, Response, stream_with_context
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from db import get_db, vacuum_db
|
||||
from auth import login_required
|
||||
@@ -15,12 +15,10 @@ project_bp = Blueprint('project', __name__)
|
||||
# Helpers
|
||||
# ============================================
|
||||
|
||||
# C0 control characters (except \t \n \r) plus DEL (0x7F) — these corrupt JSON streams.
|
||||
_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
|
||||
|
||||
|
||||
def clean_str(s):
|
||||
"""Strip raw control characters from a string. Returns '' for None."""
|
||||
if s is None:
|
||||
return ''
|
||||
if not isinstance(s, str):
|
||||
@@ -29,7 +27,6 @@ def clean_str(s):
|
||||
|
||||
|
||||
def clean_transcription(transcription):
|
||||
"""Sanitize 'word' fields inside a transcription list."""
|
||||
if isinstance(transcription, list):
|
||||
for t in transcription:
|
||||
if isinstance(t, dict) and 'word' in t:
|
||||
@@ -118,11 +115,9 @@ def create_project():
|
||||
@login_required
|
||||
def get_project(project_id):
|
||||
"""
|
||||
Get a project with all its chapters and blocks.
|
||||
|
||||
Streamed response: large projects (with many audio blocks) can produce
|
||||
10-50 MB of JSON. We stream it in chunks and sanitize every string field
|
||||
to prevent control characters from breaking JSON parsing on the client.
|
||||
Get project metadata WITHOUT audio_data.
|
||||
Audio is loaded lazily via /api/projects/<id>/audio/<block_id>.
|
||||
This keeps the response small (<1 MB) and avoids proxy truncation issues.
|
||||
"""
|
||||
db = get_db()
|
||||
cursor = db.cursor()
|
||||
@@ -141,7 +136,10 @@ def get_project(project_id):
|
||||
chapters_data = []
|
||||
for chapter in chapters:
|
||||
cursor.execute('''
|
||||
SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
|
||||
SELECT id, block_order, block_type, content, tts_text,
|
||||
audio_format, transcription,
|
||||
(audio_data IS NOT NULL AND audio_data != '') as has_audio
|
||||
FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
|
||||
''', (chapter['id'],))
|
||||
blocks = cursor.fetchall()
|
||||
|
||||
@@ -152,7 +150,6 @@ def get_project(project_id):
|
||||
''', (block['id'],))
|
||||
images = cursor.fetchall()
|
||||
|
||||
# Safely parse transcription (might be NULL, empty, or malformed)
|
||||
transcription = []
|
||||
if block['transcription']:
|
||||
try:
|
||||
@@ -167,8 +164,9 @@ def get_project(project_id):
|
||||
'block_type': clean_str(block['block_type']),
|
||||
'content': clean_str(block['content']),
|
||||
'tts_text': clean_str(block['tts_text']),
|
||||
'audio_data': clean_str(block['audio_data']),
|
||||
'audio_data': '', # Empty here; loaded lazily by frontend
|
||||
'audio_format': clean_str(block['audio_format']) or 'mp3',
|
||||
'has_audio': bool(block['has_audio']),
|
||||
'transcription': transcription,
|
||||
'images': [{
|
||||
'id': img['id'],
|
||||
@@ -187,32 +185,43 @@ def get_project(project_id):
|
||||
'blocks': blocks_data
|
||||
})
|
||||
|
||||
response_data = {
|
||||
return jsonify({
|
||||
'id': project['id'],
|
||||
'name': clean_str(project['name']),
|
||||
'created_at': clean_str(project['created_at']),
|
||||
'updated_at': clean_str(project['updated_at']),
|
||||
'chapters': chapters_data
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
@project_bp.route('/api/projects/<int:project_id>/audio/<int:block_id>', methods=['GET'])
|
||||
@login_required
|
||||
def get_block_audio(project_id, block_id):
|
||||
"""
|
||||
Return audio_data (base64) for a single block.
|
||||
Used by the frontend to lazy-load audio after metadata is loaded.
|
||||
"""
|
||||
db = get_db()
|
||||
cursor = db.cursor()
|
||||
|
||||
# Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars
|
||||
# to be escaped (\uXXXX) — slightly larger payload but guarantees the
|
||||
# stream is pure ASCII, so no proxy can mis-handle multi-byte chars
|
||||
# at chunk boundaries.
|
||||
def generate():
|
||||
json_str = json.dumps(response_data, ensure_ascii=True)
|
||||
chunk_size = 64 * 1024 # 64 KB per chunk
|
||||
for i in range(0, len(json_str), chunk_size):
|
||||
yield json_str[i:i + chunk_size]
|
||||
cursor.execute('''
|
||||
SELECT mb.audio_data, mb.audio_format
|
||||
FROM markdown_blocks mb
|
||||
JOIN chapters c ON mb.chapter_id = c.id
|
||||
WHERE mb.id = ? AND c.project_id = ?
|
||||
''', (block_id, project_id))
|
||||
row = cursor.fetchone()
|
||||
|
||||
return Response(
|
||||
stream_with_context(generate()),
|
||||
mimetype='application/json; charset=utf-8',
|
||||
headers={
|
||||
'Cache-Control': 'no-cache',
|
||||
'X-Accel-Buffering': 'no' # Tell Nginx/Traefik: don't buffer this response
|
||||
}
|
||||
)
|
||||
if not row:
|
||||
return jsonify({'error': 'Block not found'}), 404
|
||||
|
||||
if not row['audio_data']:
|
||||
return jsonify({'audio_data': '', 'audio_format': row['audio_format'] or 'mp3'})
|
||||
|
||||
return jsonify({
|
||||
'audio_data': clean_str(row['audio_data']),
|
||||
'audio_format': clean_str(row['audio_format']) or 'mp3'
|
||||
})
|
||||
|
||||
|
||||
@project_bp.route('/api/projects/<int:project_id>', methods=['PUT'])
|
||||
@@ -281,13 +290,7 @@ def delete_project(project_id):
|
||||
@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
|
||||
@login_required
|
||||
def save_project_content(project_id):
|
||||
"""
|
||||
Save all chapters and blocks for a project.
|
||||
|
||||
Every string field is sanitized before insertion so that invalid
|
||||
control characters never enter the database. This protects future
|
||||
reads from the JSON corruption bug we saw on /api/projects/<id> GET.
|
||||
"""
|
||||
"""Save all chapters and blocks for a project."""
|
||||
data = request.json
|
||||
chapters = data.get('chapters', [])
|
||||
|
||||
@@ -328,7 +331,6 @@ def save_project_content(project_id):
|
||||
chapter_id = cursor.lastrowid
|
||||
|
||||
for block in chapter.get('blocks', []):
|
||||
# Clean transcription word fields before storing
|
||||
transcription = clean_transcription(block.get('transcription', []))
|
||||
|
||||
cursor.execute('''
|
||||
@@ -376,7 +378,7 @@ def save_project_content(project_id):
|
||||
@project_bp.route('/api/projects/<int:project_id>/publish', methods=['POST'])
|
||||
@login_required
|
||||
def publish_project(project_id):
|
||||
"""Publish a project to make it visible on public homepage."""
|
||||
"""Publish a project to public homepage."""
|
||||
data = request.json or {}
|
||||
|
||||
db = get_db()
|
||||
@@ -387,7 +389,6 @@ def publish_project(project_id):
|
||||
if not project:
|
||||
return jsonify({'error': 'Project not found'}), 404
|
||||
|
||||
# Verify project has at least one chapter with audio
|
||||
cursor.execute('''
|
||||
SELECT COUNT(*) as cnt FROM markdown_blocks mb
|
||||
JOIN chapters c ON mb.chapter_id = c.id
|
||||
@@ -422,7 +423,7 @@ def publish_project(project_id):
|
||||
@project_bp.route('/api/projects/<int:project_id>/unpublish', methods=['POST'])
|
||||
@login_required
|
||||
def unpublish_project(project_id):
|
||||
"""Unpublish a project (but keep author/description/category for easy republish)."""
|
||||
"""Unpublish a project."""
|
||||
db = get_db()
|
||||
cursor = db.cursor()
|
||||
|
||||
@@ -439,7 +440,7 @@ def unpublish_project(project_id):
|
||||
@project_bp.route('/api/projects/<int:project_id>/thumbnail', methods=['POST'])
|
||||
@login_required
|
||||
def upload_thumbnail(project_id):
|
||||
"""Upload a thumbnail image for the project."""
|
||||
"""Upload a thumbnail image."""
|
||||
if 'file' not in request.files:
|
||||
return jsonify({'error': 'No file provided'}), 400
|
||||
|
||||
@@ -488,76 +489,3 @@ def delete_thumbnail(project_id):
|
||||
cursor.execute('UPDATE projects SET thumbnail_data = NULL WHERE id = ?', (project_id,))
|
||||
db.commit()
|
||||
return jsonify({'success': True})
|
||||
|
||||
# ============================================
|
||||
# DEBUG: Identify corrupt data
|
||||
# ============================================
|
||||
|
||||
@project_bp.route('/api/projects/<int:project_id>/debug', methods=['GET'])
@login_required
def debug_project(project_id):
    """
    Report which stored text fields of a project contain raw control characters.

    Visit /api/projects/<id>/debug after logging in.

    The project is looked up by id; when no row comes back the response is
    ``{'error': 'Project not found'}`` with status 404. Otherwise every
    chapter (ordered by chapter_number) and every markdown block of each
    chapter (ordered by block_order) is scanned, and a JSON report is
    returned with one entry per dirty field plus a ``total_issues`` count.
    """
    conn = get_db()
    cur = conn.cursor()

    cur.execute('SELECT id, name FROM projects WHERE id = ?', (project_id,))
    project_row = cur.fetchone()
    if not project_row:
        return jsonify({'error': 'Project not found'}), 404

    def scan(value):
        """Collect up to five {'pos', 'code', 'hex'} entries for control chars in value."""
        hits = []
        # Only non-empty str values are inspected; anything else yields no hits.
        if value and isinstance(value, str):
            for index, char in enumerate(value):
                codepoint = ord(char)
                # Tab (9), LF (10) and CR (13) are tolerated; every other
                # code point below 32, and DEL (127), counts as dirty.
                is_control = codepoint == 127 or (
                    codepoint < 32 and codepoint not in (9, 10, 13)
                )
                if not is_control:
                    continue
                hits.append({'pos': index, 'code': codepoint, 'hex': f'0x{codepoint:02x}'})
                if len(hits) >= 5:  # cap at 5 per field
                    break
        return hits

    issues = []

    cur.execute('SELECT * FROM chapters WHERE project_id = ? ORDER BY chapter_number', (project_id,))
    # fetchall() materializes the chapter rows up front, so the cursor can be
    # reused for the per-chapter block query inside the loop.
    for chapter_row in cur.fetchall():
        number = chapter_row['chapter_number']

        for column in ('title', 'voice'):
            hits = scan(chapter_row[column])
            if not hits:
                continue
            issues.append({
                'where': f'chapter {number} -> {column}',
                'bad_chars': hits,
                'sample': repr((chapter_row[column] or '')[:80])
            })

        cur.execute('SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order', (chapter_row['id'],))
        for block_row in cur.fetchall():
            order = block_row['block_order']
            for column in ('block_type', 'content', 'tts_text', 'audio_data', 'audio_format', 'transcription'):
                hits = scan(block_row[column])
                if not hits:
                    continue
                text = block_row[column] or ''
                first_pos = hits[0]['pos']
                issues.append({
                    'where': f'chapter {number}, block {order} -> {column}',
                    'field_length': len(text),
                    'bad_chars': hits,
                    # Show a window of 20 characters on each side of the first hit.
                    'sample_around_first_bad': repr(text[max(0, first_pos - 20):first_pos + 20])
                })

    return jsonify({
        'project_id': project_row['id'],
        'project_name': project_row['name'],
        'issues': issues,
        'total_issues': len(issues)
    })
|
||||
|
||||
Reference in New Issue
Block a user