Fix: sanitize control characters in project data, use ASCII-safe streaming
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
# routes/project_routes.py - Project Management Routes (v4.2)
|
# routes/project_routes.py - Project Management Routes (v4.2)
|
||||||
|
|
||||||
|
import re
|
||||||
import json
|
import json
|
||||||
import base64
|
import base64
|
||||||
from flask import Blueprint, request, jsonify, Response, stream_with_context
|
from flask import Blueprint, request, jsonify, Response, stream_with_context
|
||||||
@@ -10,6 +11,36 @@ from auth import login_required
|
|||||||
project_bp = Blueprint('project', __name__)
|
project_bp = Blueprint('project', __name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================
|
||||||
|
# Helpers
|
||||||
|
# ============================================
|
||||||
|
|
||||||
|
# C0 control characters (except \t \n \r) plus DEL (0x7F) — these corrupt
# JSON streams. C1 controls (U+0080-U+009F) are NOT matched by this pattern.
_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')


def clean_str(s):
    """Return *s* as a string with raw control characters removed.

    Args:
        s: The value to sanitize. ``None`` yields ``''``; ``bytes`` and
            ``bytearray`` are decoded as UTF-8 (undecodable bytes become
            U+FFFD); any other non-string value is converted with ``str()``.

    Returns:
        The string with every character matched by ``_CONTROL_CHAR_RE``
        removed. Tab, newline and carriage return are preserved.
    """
    if s is None:
        return ''
    if isinstance(s, (bytes, bytearray)):
        # str(b'abc') would yield the repr "b'abc'" rather than the text,
        # so binary values are decoded instead of stringified.
        s = bytes(s).decode('utf-8', errors='replace')
    elif not isinstance(s, str):
        s = str(s)
    return _CONTROL_CHAR_RE.sub('', s)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_transcription(transcription):
    """Sanitize the 'word' value of each dict entry in a transcription list.

    The list is modified in place and the same object is returned. Values
    that are not lists are returned untouched, and list entries that are
    not dicts (or that lack a 'word' key) are left as they are.
    """
    if not isinstance(transcription, list):
        return transcription

    for entry in transcription:
        if not isinstance(entry, dict):
            continue
        if 'word' in entry:
            entry['word'] = clean_str(entry['word'])

    return transcription
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================
|
||||||
|
# Routes
|
||||||
|
# ============================================
|
||||||
|
|
||||||
@project_bp.route('/api/projects', methods=['GET'])
|
@project_bp.route('/api/projects', methods=['GET'])
|
||||||
@login_required
|
@login_required
|
||||||
def list_projects():
|
def list_projects():
|
||||||
@@ -90,8 +121,8 @@ def get_project(project_id):
|
|||||||
Get a project with all its chapters and blocks.
|
Get a project with all its chapters and blocks.
|
||||||
|
|
||||||
Streamed response: large projects (with many audio blocks) can produce
|
Streamed response: large projects (with many audio blocks) can produce
|
||||||
10-50 MB of JSON. We stream it in chunks so that the reverse proxy
|
10-50 MB of JSON. We stream it in chunks and sanitize every string field
|
||||||
(Traefik in Coolify) doesn't buffer the entire payload and truncate it.
|
to prevent control characters from breaking JSON parsing on the client.
|
||||||
"""
|
"""
|
||||||
db = get_db()
|
db = get_db()
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
@@ -126,47 +157,50 @@ def get_project(project_id):
|
|||||||
if block['transcription']:
|
if block['transcription']:
|
||||||
try:
|
try:
|
||||||
transcription = json.loads(block['transcription'])
|
transcription = json.loads(block['transcription'])
|
||||||
|
transcription = clean_transcription(transcription)
|
||||||
except (json.JSONDecodeError, TypeError):
|
except (json.JSONDecodeError, TypeError):
|
||||||
transcription = []
|
transcription = []
|
||||||
|
|
||||||
blocks_data.append({
|
blocks_data.append({
|
||||||
'id': block['id'],
|
'id': block['id'],
|
||||||
'block_order': block['block_order'],
|
'block_order': block['block_order'],
|
||||||
'block_type': block['block_type'],
|
'block_type': clean_str(block['block_type']),
|
||||||
'content': block['content'],
|
'content': clean_str(block['content']),
|
||||||
'tts_text': block['tts_text'],
|
'tts_text': clean_str(block['tts_text']),
|
||||||
'audio_data': block['audio_data'],
|
'audio_data': clean_str(block['audio_data']),
|
||||||
'audio_format': block['audio_format'],
|
'audio_format': clean_str(block['audio_format']) or 'mp3',
|
||||||
'transcription': transcription,
|
'transcription': transcription,
|
||||||
'images': [{
|
'images': [{
|
||||||
'id': img['id'],
|
'id': img['id'],
|
||||||
'data': img['image_data'],
|
'data': clean_str(img['image_data']),
|
||||||
'format': img['image_format'],
|
'format': clean_str(img['image_format']) or 'png',
|
||||||
'alt_text': img['alt_text'],
|
'alt_text': clean_str(img['alt_text']),
|
||||||
'position': img['position']
|
'position': clean_str(img['position']) or 'before'
|
||||||
} for img in images]
|
} for img in images]
|
||||||
})
|
})
|
||||||
|
|
||||||
chapters_data.append({
|
chapters_data.append({
|
||||||
'id': chapter['id'],
|
'id': chapter['id'],
|
||||||
'chapter_number': chapter['chapter_number'],
|
'chapter_number': chapter['chapter_number'],
|
||||||
'title': chapter['title'],
|
'title': clean_str(chapter['title']),
|
||||||
'voice': chapter['voice'],
|
'voice': clean_str(chapter['voice']),
|
||||||
'blocks': blocks_data
|
'blocks': blocks_data
|
||||||
})
|
})
|
||||||
|
|
||||||
response_data = {
|
response_data = {
|
||||||
'id': project['id'],
|
'id': project['id'],
|
||||||
'name': project['name'],
|
'name': clean_str(project['name']),
|
||||||
'created_at': project['created_at'],
|
'created_at': clean_str(project['created_at']),
|
||||||
'updated_at': project['updated_at'],
|
'updated_at': clean_str(project['updated_at']),
|
||||||
'chapters': chapters_data
|
'chapters': chapters_data
|
||||||
}
|
}
|
||||||
|
|
||||||
# Stream the JSON in chunks. ensure_ascii=False keeps Unicode (e.g. Bangla)
|
# Stream JSON in chunks. ensure_ascii=True forces all non-ASCII chars
|
||||||
# compact and avoids the JSON ballooning to 2-3x its size.
|
# to be escaped (\uXXXX) — slightly larger payload but guarantees the
|
||||||
|
# stream is pure ASCII, so no proxy can mis-handle multi-byte chars
|
||||||
|
# at chunk boundaries.
|
||||||
def generate():
|
def generate():
|
||||||
json_str = json.dumps(response_data, ensure_ascii=False)
|
json_str = json.dumps(response_data, ensure_ascii=True)
|
||||||
chunk_size = 64 * 1024 # 64 KB per chunk
|
chunk_size = 64 * 1024 # 64 KB per chunk
|
||||||
for i in range(0, len(json_str), chunk_size):
|
for i in range(0, len(json_str), chunk_size):
|
||||||
yield json_str[i:i + chunk_size]
|
yield json_str[i:i + chunk_size]
|
||||||
@@ -247,7 +281,13 @@ def delete_project(project_id):
|
|||||||
@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
|
@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def save_project_content(project_id):
|
def save_project_content(project_id):
|
||||||
"""Save all chapters and blocks for a project."""
|
"""
|
||||||
|
Save all chapters and blocks for a project.
|
||||||
|
|
||||||
|
Every string field is sanitized before insertion so that invalid
|
||||||
|
control characters never enter the database. This protects future
|
||||||
|
reads from the JSON corruption bug we saw on /api/projects/<id> GET.
|
||||||
|
"""
|
||||||
data = request.json
|
data = request.json
|
||||||
chapters = data.get('chapters', [])
|
chapters = data.get('chapters', [])
|
||||||
|
|
||||||
@@ -281,13 +321,16 @@ def save_project_content(project_id):
|
|||||||
''', (
|
''', (
|
||||||
project_id,
|
project_id,
|
||||||
chapter['chapter_number'],
|
chapter['chapter_number'],
|
||||||
chapter.get('title', 'Section'),
|
clean_str(chapter.get('title', 'Section')),
|
||||||
chapter.get('voice', 'af_heart')
|
clean_str(chapter.get('voice', 'af_heart'))
|
||||||
))
|
))
|
||||||
|
|
||||||
chapter_id = cursor.lastrowid
|
chapter_id = cursor.lastrowid
|
||||||
|
|
||||||
for block in chapter.get('blocks', []):
|
for block in chapter.get('blocks', []):
|
||||||
|
# Clean transcription word fields before storing
|
||||||
|
transcription = clean_transcription(block.get('transcription', []))
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO markdown_blocks
|
INSERT INTO markdown_blocks
|
||||||
(chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription)
|
(chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription)
|
||||||
@@ -295,12 +338,12 @@ def save_project_content(project_id):
|
|||||||
''', (
|
''', (
|
||||||
chapter_id,
|
chapter_id,
|
||||||
block['block_order'],
|
block['block_order'],
|
||||||
block.get('block_type', 'paragraph'),
|
clean_str(block.get('block_type', 'paragraph')),
|
||||||
block['content'],
|
clean_str(block.get('content', '')),
|
||||||
block.get('tts_text'),
|
clean_str(block.get('tts_text', '')),
|
||||||
block.get('audio_data'),
|
clean_str(block.get('audio_data', '')),
|
||||||
block.get('audio_format', 'mp3'),
|
clean_str(block.get('audio_format', 'mp3')),
|
||||||
json.dumps(block.get('transcription', []))
|
json.dumps(transcription)
|
||||||
))
|
))
|
||||||
|
|
||||||
block_id = cursor.lastrowid
|
block_id = cursor.lastrowid
|
||||||
@@ -311,10 +354,10 @@ def save_project_content(project_id):
|
|||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
''', (
|
''', (
|
||||||
block_id,
|
block_id,
|
||||||
img['data'],
|
clean_str(img.get('data', '')),
|
||||||
img.get('format', 'png'),
|
clean_str(img.get('format', 'png')),
|
||||||
img.get('alt_text', ''),
|
clean_str(img.get('alt_text', '')),
|
||||||
img.get('position', 'before')
|
clean_str(img.get('position', 'before'))
|
||||||
))
|
))
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
@@ -387,7 +430,6 @@ def unpublish_project(project_id):
|
|||||||
if not cursor.fetchone():
|
if not cursor.fetchone():
|
||||||
return jsonify({'error': 'Project not found'}), 404
|
return jsonify({'error': 'Project not found'}), 404
|
||||||
|
|
||||||
# Only flip is_published flag — keep author/description/category for republish
|
|
||||||
cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,))
|
cursor.execute('UPDATE projects SET is_published = 0 WHERE id = ?', (project_id,))
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user