From 4044c6aeaacc239eed71bc066f1b85f6a3e410d9 Mon Sep 17 00:00:00 2001 From: Ashim Kumar Date: Sat, 23 May 2026 06:53:16 +0600 Subject: [PATCH] Fix: sanitize control characters in project data, use ASCII-safe streaming --- routes/project_routes.py | 112 +++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 35 deletions(-) diff --git a/routes/project_routes.py b/routes/project_routes.py index 7fc669d..824d840 100644 --- a/routes/project_routes.py +++ b/routes/project_routes.py @@ -1,5 +1,6 @@ # routes/project_routes.py - Project Management Routes (v4.2) +import re import json import base64 from flask import Blueprint, request, jsonify, Response, stream_with_context @@ -10,6 +11,36 @@ from auth import login_required project_bp = Blueprint('project', __name__) +# ============================================ +# Helpers +# ============================================ + +# C0 control characters except \t \n \r, plus DEL — these corrupt JSON streams (C1, \x80-\x9F, is not matched). +_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]') + + +def clean_str(s): + """Strip raw control characters from a string. Returns '' for None; non-str values are passed through str() first.""" + if s is None: + return '' + if not isinstance(s, str): + s = str(s) + return _CONTROL_CHAR_RE.sub('', s) + + +def clean_transcription(transcription): + """Sanitize 'word' fields inside a transcription list, in place; returns the same object.""" + if isinstance(transcription, list): + for t in transcription: + if isinstance(t, dict) and 'word' in t: + t['word'] = clean_str(t.get('word', '')) + return transcription + + +# ============================================ +# Routes +# ============================================ + @project_bp.route('/api/projects', methods=['GET']) @login_required def list_projects(): @@ -90,8 +121,8 @@ def get_project(project_id): Get a project with all its chapters and blocks. Streamed response: large projects (with many audio blocks) can produce - 10-50 MB of JSON. 
We stream it in chunks so that the reverse proxy - (Traefik in Coolify) doesn't buffer the entire payload and truncate it. + 10-50 MB of JSON. We stream it in chunks and sanitize every string field + to prevent control characters from breaking JSON parsing on the client. """ db = get_db() cursor = db.cursor() @@ -126,47 +157,50 @@ def get_project(project_id): if block['transcription']: try: transcription = json.loads(block['transcription']) + transcription = clean_transcription(transcription) except (json.JSONDecodeError, TypeError): transcription = [] blocks_data.append({ 'id': block['id'], 'block_order': block['block_order'], - 'block_type': block['block_type'], - 'content': block['content'], - 'tts_text': block['tts_text'], - 'audio_data': block['audio_data'], - 'audio_format': block['audio_format'], + 'block_type': clean_str(block['block_type']), + 'content': clean_str(block['content']), + 'tts_text': clean_str(block['tts_text']), + 'audio_data': clean_str(block['audio_data']), + 'audio_format': clean_str(block['audio_format']) or 'mp3', 'transcription': transcription, 'images': [{ 'id': img['id'], - 'data': img['image_data'], - 'format': img['image_format'], - 'alt_text': img['alt_text'], - 'position': img['position'] + 'data': clean_str(img['image_data']), + 'format': clean_str(img['image_format']) or 'png', + 'alt_text': clean_str(img['alt_text']), + 'position': clean_str(img['position']) or 'before' } for img in images] }) chapters_data.append({ 'id': chapter['id'], 'chapter_number': chapter['chapter_number'], - 'title': chapter['title'], - 'voice': chapter['voice'], + 'title': clean_str(chapter['title']), + 'voice': clean_str(chapter['voice']), 'blocks': blocks_data }) response_data = { 'id': project['id'], - 'name': project['name'], - 'created_at': project['created_at'], - 'updated_at': project['updated_at'], + 'name': clean_str(project['name']), + 'created_at': clean_str(project['created_at']), + 'updated_at': clean_str(project['updated_at']), 
'chapters': chapters_data } - # Stream the JSON in chunks. ensure_ascii=False keeps Unicode (e.g. Bangla) - # compact and avoids the JSON ballooning to 2-3x its size. + # Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars + # to be escaped (\uXXXX) — text in scripts like Bangla roughly doubles in size (6 bytes per escape vs 3 in UTF-8) but guarantees the + # stream is pure ASCII, so no proxy can mis-handle multi-byte chars + # at chunk boundaries. def generate(): - json_str = json.dumps(response_data, ensure_ascii=False) + json_str = json.dumps(response_data, ensure_ascii=True) chunk_size = 64 * 1024 # 64 KB per chunk for i in range(0, len(json_str), chunk_size): yield json_str[i:i + chunk_size] @@ -247,7 +281,13 @@ def delete_project(project_id): @project_bp.route('/api/projects//save', methods=['POST']) @login_required def save_project_content(project_id): - """Save all chapters and blocks for a project.""" + """ + Save all chapters and blocks for a project. + + Every string field is sanitized before insertion so that invalid + control characters never enter the database. This protects future + reads from the JSON corruption bug we saw on GET /api/projects/<project_id>. + """ data = request.json chapters = data.get('chapters', []) @@ -279,15 +319,18 @@ def save_project_content(project_id): INSERT INTO chapters (project_id, chapter_number, title, voice) VALUES (?, ?, ?, ?) 
''', ( - project_id, - chapter['chapter_number'], - chapter.get('title', 'Section'), - chapter.get('voice', 'af_heart') + project_id, + chapter['chapter_number'], + clean_str(chapter.get('title', 'Section')), + clean_str(chapter.get('voice', 'af_heart')) )) chapter_id = cursor.lastrowid for block in chapter.get('blocks', []): + # Clean transcription word fields before storing + transcription = clean_transcription(block.get('transcription', [])) + cursor.execute(''' INSERT INTO markdown_blocks (chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription) @@ -295,12 +338,12 @@ def save_project_content(project_id): ''', ( chapter_id, block['block_order'], - block.get('block_type', 'paragraph'), - block['content'], - block.get('tts_text'), - block.get('audio_data'), - block.get('audio_format', 'mp3'), - json.dumps(block.get('transcription', [])) + clean_str(block.get('block_type', 'paragraph')), + clean_str(block.get('content', '')), + clean_str(block.get('tts_text', '')), + clean_str(block.get('audio_data', '')), + clean_str(block.get('audio_format', 'mp3')), + json.dumps(transcription) )) block_id = cursor.lastrowid @@ -311,10 +354,10 @@ def save_project_content(project_id): VALUES (?, ?, ?, ?, ?) ''', ( block_id, - img['data'], - img.get('format', 'png'), - img.get('alt_text', ''), - img.get('position', 'before') + clean_str(img.get('data', '')), + clean_str(img.get('format', 'png')), + clean_str(img.get('alt_text', '')), + clean_str(img.get('position', 'before')) )) cursor.execute(''' @@ -387,7 +430,6 @@ def unpublish_project(project_id): if not cursor.fetchone(): return jsonify({'error': 'Project not found'}), 404 - # Only flip is_published flag — keep author/description/category for republish cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,)) db.commit()