From 4044c6aeaacc239eed71bc066f1b85f6a3e410d9 Mon Sep 17 00:00:00 2001
From: Ashim Kumar <ak@Ashims-MacBook-Pro-2.local>
Date: Sat, 23 May 2026 06:53:16 +0600
Subject: [PATCH] Fix: sanitize control characters in project data, use
 ASCII-safe streaming

---
 routes/project_routes.py | 112 +++++++++++++++++++++++++++------------
 1 file changed, 77 insertions(+), 35 deletions(-)

diff --git a/routes/project_routes.py b/routes/project_routes.py
index 7fc669d..824d840 100644
--- a/routes/project_routes.py
+++ b/routes/project_routes.py
@@ -1,5 +1,6 @@
 # routes/project_routes.py - Project Management Routes (v4.2)
 
+import re
 import json
 import base64
 from flask import Blueprint, request, jsonify, Response, stream_with_context
@@ -10,6 +11,36 @@ from auth import login_required
 project_bp = Blueprint('project', __name__)
 
 
+# ============================================
+# Helpers
+# ============================================
+
+# C0 control characters except \t \n \r, plus DEL (\x7F) — these corrupt JSON streams.
+_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
+
+
+def clean_str(s):
+    """Strip control chars from s; None -> '', non-str input is str()-converted first."""
+    if s is None:
+        return ''
+    if not isinstance(s, str):
+        s = str(s)
+    return _CONTROL_CHAR_RE.sub('', s)
+
+
+def clean_transcription(transcription):
+    """Clean 'word' fields of a transcription list in place; non-list input is returned unchanged."""
+    if isinstance(transcription, list):
+        for t in transcription:
+            if isinstance(t, dict) and 'word' in t:
+                t['word'] = clean_str(t.get('word', ''))
+    return transcription
+
+
+# ============================================
+# Routes
+# ============================================
+
 @project_bp.route('/api/projects', methods=['GET'])
 @login_required
 def list_projects():
@@ -90,8 +121,8 @@ def get_project(project_id):
     Get a project with all its chapters and blocks.
     
     Streamed response: large projects (with many audio blocks) can produce
-    10-50 MB of JSON. We stream it in chunks so that the reverse proxy
-    (Traefik in Coolify) doesn't buffer the entire payload and truncate it.
+    10-50 MB of JSON. We stream it in chunks and sanitize every string field
+    to prevent control characters from breaking JSON parsing on the client.
     """
     db = get_db()
     cursor = db.cursor()
@@ -126,47 +157,50 @@ def get_project(project_id):
             if block['transcription']:
                 try:
                     transcription = json.loads(block['transcription'])
+                    transcription = clean_transcription(transcription)
                 except (json.JSONDecodeError, TypeError):
                     transcription = []
             
             blocks_data.append({
                 'id': block['id'],
                 'block_order': block['block_order'],
-                'block_type': block['block_type'],
-                'content': block['content'],
-                'tts_text': block['tts_text'],
-                'audio_data': block['audio_data'],
-                'audio_format': block['audio_format'],
+                'block_type': clean_str(block['block_type']),
+                'content': clean_str(block['content']),
+                'tts_text': clean_str(block['tts_text']),
+                'audio_data': clean_str(block['audio_data']),
+                'audio_format': clean_str(block['audio_format']) or 'mp3',
                 'transcription': transcription,
                 'images': [{
                     'id': img['id'],
-                    'data': img['image_data'],
-                    'format': img['image_format'],
-                    'alt_text': img['alt_text'],
-                    'position': img['position']
+                    'data': clean_str(img['image_data']),
+                    'format': clean_str(img['image_format']) or 'png',
+                    'alt_text': clean_str(img['alt_text']),
+                    'position': clean_str(img['position']) or 'before'
                 } for img in images]
             })
         
         chapters_data.append({
             'id': chapter['id'],
             'chapter_number': chapter['chapter_number'],
-            'title': chapter['title'],
-            'voice': chapter['voice'],
+            'title': clean_str(chapter['title']),
+            'voice': clean_str(chapter['voice']),
             'blocks': blocks_data
         })
     
     response_data = {
         'id': project['id'],
-        'name': project['name'],
-        'created_at': project['created_at'],
-        'updated_at': project['updated_at'],
+        'name': clean_str(project['name']),
+        'created_at': clean_str(project['created_at']),
+        'updated_at': clean_str(project['updated_at']),
         'chapters': chapters_data
     }
     
-    # Stream the JSON in chunks. ensure_ascii=False keeps Unicode (e.g. Bangla)
-    # compact and avoids the JSON ballooning to 2-3x its size.
+    # Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars
+    # to be escaped (\uXXXX) — slightly larger payload but guarantees the
+    # stream is pure ASCII, so no proxy can mis-handle multi-byte chars
+    # at chunk boundaries.
     def generate():
-        json_str = json.dumps(response_data, ensure_ascii=False)
+        json_str = json.dumps(response_data, ensure_ascii=True)
         chunk_size = 64 * 1024  # 64 KB per chunk
         for i in range(0, len(json_str), chunk_size):
             yield json_str[i:i + chunk_size]
@@ -247,7 +281,13 @@ def delete_project(project_id):
 @project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
 @login_required
 def save_project_content(project_id):
-    """Save all chapters and blocks for a project."""
+    """
+    Save all chapters and blocks for a project.
+    
+    Every string field is sanitized before insertion so that invalid
+    control characters never enter the database. This protects future
+    reads from the JSON corruption bug we saw on /api/projects/<id> GET.
+    """
     data = request.json
     chapters = data.get('chapters', [])
     
@@ -279,15 +319,18 @@ def save_project_content(project_id):
             INSERT INTO chapters (project_id, chapter_number, title, voice)
             VALUES (?, ?, ?, ?)
         ''', (
-            project_id, 
-            chapter['chapter_number'], 
-            chapter.get('title', 'Section'),
-            chapter.get('voice', 'af_heart')
+            project_id,
+            chapter['chapter_number'],
+            clean_str(chapter.get('title', 'Section')),
+            clean_str(chapter.get('voice', 'af_heart'))
         ))
         
         chapter_id = cursor.lastrowid
         
         for block in chapter.get('blocks', []):
+            # Clean transcription word fields before storing
+            transcription = clean_transcription(block.get('transcription', []))
+            
             cursor.execute('''
                 INSERT INTO markdown_blocks 
                 (chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription)
@@ -295,12 +338,12 @@ def save_project_content(project_id):
             ''', (
                 chapter_id,
                 block['block_order'],
-                block.get('block_type', 'paragraph'),
-                block['content'],
-                block.get('tts_text'),
-                block.get('audio_data'),
-                block.get('audio_format', 'mp3'),
-                json.dumps(block.get('transcription', []))
+                clean_str(block.get('block_type', 'paragraph')),
+                clean_str(block.get('content', '')),
+                clean_str(block.get('tts_text', '')),
+                clean_str(block.get('audio_data', '')),
+                clean_str(block.get('audio_format', 'mp3')),
+                json.dumps(transcription)
             ))
             
             block_id = cursor.lastrowid
@@ -311,10 +354,10 @@ def save_project_content(project_id):
                     VALUES (?, ?, ?, ?, ?)
                 ''', (
                     block_id,
-                    img['data'],
-                    img.get('format', 'png'),
-                    img.get('alt_text', ''),
-                    img.get('position', 'before')
+                    clean_str(img.get('data', '')),
+                    clean_str(img.get('format', 'png')),
+                    clean_str(img.get('alt_text', '')),
+                    clean_str(img.get('position', 'before'))
                 ))
     
     cursor.execute('''
@@ -387,7 +430,6 @@ def unpublish_project(project_id):
     if not cursor.fetchone():
         return jsonify({'error': 'Project not found'}), 404
     
-    # Only flip is_published flag — keep author/description/category for republish
     cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,))
     db.commit()