Fix: sanitize control characters in project data, use ASCII-safe streaming

This commit is contained in:
Ashim Kumar
2026-05-23 06:53:16 +06:00
parent d5cbadbdc4
commit 4044c6aeaa

View File

@@ -1,5 +1,6 @@
# routes/project_routes.py - Project Management Routes (v4.2) # routes/project_routes.py - Project Management Routes (v4.2)
import re
import json import json
import base64 import base64
from flask import Blueprint, request, jsonify, Response, stream_with_context from flask import Blueprint, request, jsonify, Response, stream_with_context
@@ -10,6 +11,36 @@ from auth import login_required
project_bp = Blueprint('project', __name__) project_bp = Blueprint('project', __name__)
# ============================================
# Helpers
# ============================================
# Matches the C0 control characters (U+0000-U+001F) other than \t, \n and \r,
# plus DEL (U+007F). C1 controls (U+0080-U+009F) are NOT matched by this
# pattern. Matches are stripped from strings before they are stored or
# serialized into the JSON response.
_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')


def clean_str(s):
    """Return *s* as text with raw control characters removed.

    ``None`` maps to the empty string. Any other non-string value is
    converted with ``str()`` before filtering. Tab, newline and carriage
    return are left untouched.
    """
    if s is None:
        return ''
    text = s if isinstance(s, str) else str(s)
    return _CONTROL_CHAR_RE.sub('', text)
def clean_transcription(transcription):
    """Sanitize the 'word' value of each dict entry in a transcription list.

    The list is modified in place and the same object is returned. Entries
    that are not dicts, or that have no 'word' key, are left alone. A
    non-list argument is returned unchanged.
    """
    if not isinstance(transcription, list):
        return transcription

    for entry in transcription:
        if not isinstance(entry, dict) or 'word' not in entry:
            continue
        entry['word'] = clean_str(entry.get('word', ''))

    return transcription
# ============================================
# Routes
# ============================================
@project_bp.route('/api/projects', methods=['GET']) @project_bp.route('/api/projects', methods=['GET'])
@login_required @login_required
def list_projects(): def list_projects():
@@ -90,8 +121,8 @@ def get_project(project_id):
Get a project with all its chapters and blocks. Get a project with all its chapters and blocks.
Streamed response: large projects (with many audio blocks) can produce Streamed response: large projects (with many audio blocks) can produce
10-50 MB of JSON. We stream it in chunks so that the reverse proxy 10-50 MB of JSON. We stream it in chunks and sanitize every string field
(Traefik in Coolify) doesn't buffer the entire payload and truncate it. to prevent control characters from breaking JSON parsing on the client.
""" """
db = get_db() db = get_db()
cursor = db.cursor() cursor = db.cursor()
@@ -126,47 +157,50 @@ def get_project(project_id):
if block['transcription']: if block['transcription']:
try: try:
transcription = json.loads(block['transcription']) transcription = json.loads(block['transcription'])
transcription = clean_transcription(transcription)
except (json.JSONDecodeError, TypeError): except (json.JSONDecodeError, TypeError):
transcription = [] transcription = []
blocks_data.append({ blocks_data.append({
'id': block['id'], 'id': block['id'],
'block_order': block['block_order'], 'block_order': block['block_order'],
'block_type': block['block_type'], 'block_type': clean_str(block['block_type']),
'content': block['content'], 'content': clean_str(block['content']),
'tts_text': block['tts_text'], 'tts_text': clean_str(block['tts_text']),
'audio_data': block['audio_data'], 'audio_data': clean_str(block['audio_data']),
'audio_format': block['audio_format'], 'audio_format': clean_str(block['audio_format']) or 'mp3',
'transcription': transcription, 'transcription': transcription,
'images': [{ 'images': [{
'id': img['id'], 'id': img['id'],
'data': img['image_data'], 'data': clean_str(img['image_data']),
'format': img['image_format'], 'format': clean_str(img['image_format']) or 'png',
'alt_text': img['alt_text'], 'alt_text': clean_str(img['alt_text']),
'position': img['position'] 'position': clean_str(img['position']) or 'before'
} for img in images] } for img in images]
}) })
chapters_data.append({ chapters_data.append({
'id': chapter['id'], 'id': chapter['id'],
'chapter_number': chapter['chapter_number'], 'chapter_number': chapter['chapter_number'],
'title': chapter['title'], 'title': clean_str(chapter['title']),
'voice': chapter['voice'], 'voice': clean_str(chapter['voice']),
'blocks': blocks_data 'blocks': blocks_data
}) })
response_data = { response_data = {
'id': project['id'], 'id': project['id'],
'name': project['name'], 'name': clean_str(project['name']),
'created_at': project['created_at'], 'created_at': clean_str(project['created_at']),
'updated_at': project['updated_at'], 'updated_at': clean_str(project['updated_at']),
'chapters': chapters_data 'chapters': chapters_data
} }
# Stream the JSON in chunks. ensure_ascii=False keeps Unicode (e.g. Bangla) # Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars
# compact and avoids the JSON ballooning to 2-3x its size. # to be escaped (\uXXXX) — slightly larger payload but guarantees the
# stream is pure ASCII, so no proxy can mis-handle multi-byte chars
# at chunk boundaries.
def generate(): def generate():
json_str = json.dumps(response_data, ensure_ascii=False) json_str = json.dumps(response_data, ensure_ascii=True)
chunk_size = 64 * 1024 # 64 KB per chunk chunk_size = 64 * 1024 # 64 KB per chunk
for i in range(0, len(json_str), chunk_size): for i in range(0, len(json_str), chunk_size):
yield json_str[i:i + chunk_size] yield json_str[i:i + chunk_size]
@@ -247,7 +281,13 @@ def delete_project(project_id):
@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST']) @project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
@login_required @login_required
def save_project_content(project_id): def save_project_content(project_id):
"""Save all chapters and blocks for a project.""" """
Save all chapters and blocks for a project.
Every string field is sanitized before insertion so that invalid
control characters never enter the database. This protects future
reads from the JSON corruption bug we saw on /api/projects/<id> GET.
"""
data = request.json data = request.json
chapters = data.get('chapters', []) chapters = data.get('chapters', [])
@@ -281,13 +321,16 @@ def save_project_content(project_id):
''', ( ''', (
project_id, project_id,
chapter['chapter_number'], chapter['chapter_number'],
chapter.get('title', 'Section'), clean_str(chapter.get('title', 'Section')),
chapter.get('voice', 'af_heart') clean_str(chapter.get('voice', 'af_heart'))
)) ))
chapter_id = cursor.lastrowid chapter_id = cursor.lastrowid
for block in chapter.get('blocks', []): for block in chapter.get('blocks', []):
# Clean transcription word fields before storing
transcription = clean_transcription(block.get('transcription', []))
cursor.execute(''' cursor.execute('''
INSERT INTO markdown_blocks INSERT INTO markdown_blocks
(chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription) (chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription)
@@ -295,12 +338,12 @@ def save_project_content(project_id):
''', ( ''', (
chapter_id, chapter_id,
block['block_order'], block['block_order'],
block.get('block_type', 'paragraph'), clean_str(block.get('block_type', 'paragraph')),
block['content'], clean_str(block.get('content', '')),
block.get('tts_text'), clean_str(block.get('tts_text', '')),
block.get('audio_data'), clean_str(block.get('audio_data', '')),
block.get('audio_format', 'mp3'), clean_str(block.get('audio_format', 'mp3')),
json.dumps(block.get('transcription', [])) json.dumps(transcription)
)) ))
block_id = cursor.lastrowid block_id = cursor.lastrowid
@@ -311,10 +354,10 @@ def save_project_content(project_id):
VALUES (?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?)
''', ( ''', (
block_id, block_id,
img['data'], clean_str(img.get('data', '')),
img.get('format', 'png'), clean_str(img.get('format', 'png')),
img.get('alt_text', ''), clean_str(img.get('alt_text', '')),
img.get('position', 'before') clean_str(img.get('position', 'before'))
)) ))
cursor.execute(''' cursor.execute('''
@@ -387,7 +430,6 @@ def unpublish_project(project_id):
if not cursor.fetchone(): if not cursor.fetchone():
return jsonify({'error': 'Project not found'}), 404 return jsonify({'error': 'Project not found'}), 404
# Only flip is_published flag — keep author/description/category for republish
cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,)) cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,))
db.commit() db.commit()