# audiobook-maker-pro-v4.2/routes/project_routes.py

# routes/project_routes.py - Project Management Routes (v4.2)

import re
import json
import base64
from flask import Blueprint, request, jsonify, Response, stream_with_context

from db import get_db, vacuum_db
from auth import login_required

# Blueprint on which every project-management route in this module is registered.
project_bp = Blueprint('project', __name__)


# ============================================
# Helpers
# ============================================

# Raw control characters that must never reach a JSON stream: the C0 range
# (0x00-0x1F) minus \t \n \r, plus DEL (0x7F).
_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')


def clean_str(s):
    """Return ``s`` with raw control characters removed.

    ``None`` becomes ``''``. Any other non-string value is converted with
    ``str()`` before filtering. Tab, newline and carriage return are kept.
    """
    if s is None:
        return ''
    text = s if isinstance(s, str) else str(s)
    return _CONTROL_CHAR_RE.sub('', text)


def clean_transcription(transcription):
    """Strip control characters from each entry's 'word' value, in place.

    Only list inputs are processed, and within the list only dict entries
    that have a 'word' key are touched. The object passed in is returned;
    non-list values come back unchanged.
    """
    if not isinstance(transcription, list):
        return transcription

    for entry in transcription:
        if not isinstance(entry, dict) or 'word' not in entry:
            continue
        entry['word'] = clean_str(entry['word'])
    return transcription


# ============================================
# Routes
# ============================================

@project_bp.route('/api/projects', methods=['GET'])
@login_required
def list_projects():
    """Return all projects, most recently updated first, with publishing info.

    Each entry carries chapter, block and audio-block counts computed by SQL
    subqueries. NULL thumbnail format, description, author, category and
    view count fall back to 'png', '', '', '' and 0 respectively.
    """
    conn = get_db()
    cur = conn.cursor()

    cur.execute('''
        SELECT p.id, p.name, p.created_at, p.updated_at,
               p.is_published, p.published_at, p.thumbnail_data, p.thumbnail_format,
               p.description, p.author, p.category, p.view_count,
               (SELECT COUNT(*) FROM chapters WHERE project_id = p.id) as chapter_count,
               (SELECT COUNT(*) FROM markdown_blocks mb
                JOIN chapters c ON mb.chapter_id = c.id
                WHERE c.project_id = p.id) as block_count,
               (SELECT COUNT(*) FROM markdown_blocks mb
                JOIN chapters c ON mb.chapter_id = c.id
                WHERE c.project_id = p.id AND mb.audio_data IS NOT NULL AND mb.audio_data != '') as audio_count
        FROM projects p
        ORDER BY p.updated_at DESC
    ''')

    def summarize(record):
        """Convert one result row into the JSON-ready project summary."""
        return {
            'id': record['id'],
            'name': record['name'],
            'created_at': record['created_at'],
            'updated_at': record['updated_at'],
            'chapter_count': record['chapter_count'],
            'block_count': record['block_count'],
            'audio_count': record['audio_count'],
            'is_published': bool(record['is_published']),
            'published_at': record['published_at'],
            'thumbnail_data': record['thumbnail_data'],
            'thumbnail_format': record['thumbnail_format'] or 'png',
            'description': record['description'] or '',
            'author': record['author'] or '',
            'category': record['category'] or '',
            'view_count': record['view_count'] or 0
        }

    return jsonify({'projects': [summarize(record) for record in cur.fetchall()]})


@project_bp.route('/api/projects', methods=['POST'])
@login_required
def create_project():
    """Create a new project.

    Expects a JSON object with a ``name`` string.

    Responses:
        200: ``{'success': True, 'project_id': ..., 'name': ...}``.
        400: name missing, blank or not a string, or a project with that
             name already exists.
        500: any other database error (message echoed in ``error``).
    """
    # A JSON body of null, a list or a scalar has no .get(); treat it as an
    # empty object so the request fails with 400 rather than crashing.
    payload = request.json
    if not isinstance(payload, dict):
        payload = {}

    # Only strings can be stripped; anything else counts as "no name".
    raw_name = payload.get('name')
    name = raw_name.strip() if isinstance(raw_name, str) else ''

    if not name:
        return jsonify({'error': 'Project name is required'}), 400

    db = get_db()
    cursor = db.cursor()

    try:
        cursor.execute('INSERT INTO projects (name) VALUES (?)', (name,))
        db.commit()
    except Exception as e:
        # Drop the failed transaction so the connection is left clean.
        db.rollback()
        if 'UNIQUE constraint' in str(e):
            return jsonify({'error': 'Project with this name already exists'}), 400
        return jsonify({'error': str(e)}), 500

    return jsonify({
        'success': True,
        'project_id': cursor.lastrowid,
        'name': name
    })


@project_bp.route('/api/projects/<int:project_id>', methods=['GET'])
@login_required
def get_project(project_id):
    """
    Return one project together with its chapters, blocks and block images.

    The payload can be large (audio and images travel inline), so the
    serialized JSON is sent as a streamed response in 64 KB slices. Every
    string field passes through clean_str() on the way out so that raw
    control characters cannot reach the client. Responds with 404 when the
    project does not exist.
    """
    conn = get_db()
    cur = conn.cursor()

    cur.execute('SELECT * FROM projects WHERE id = ?', (project_id,))
    project_row = cur.fetchone()
    if not project_row:
        return jsonify({'error': 'Project not found'}), 404

    def load_transcription(raw):
        """Decode a stored transcription; NULL, empty or malformed values give []."""
        if not raw:
            return []
        try:
            return clean_transcription(json.loads(raw))
        except (json.JSONDecodeError, TypeError):
            return []

    def image_payload(image_row):
        """Build the JSON-ready dict for one block image row."""
        return {
            'id': image_row['id'],
            'data': clean_str(image_row['image_data']),
            'format': clean_str(image_row['image_format']) or 'png',
            'alt_text': clean_str(image_row['alt_text']),
            'position': clean_str(image_row['position']) or 'before'
        }

    cur.execute('''
        SELECT * FROM chapters WHERE project_id = ? ORDER BY chapter_number
    ''', (project_id,))

    chapter_payloads = []
    # fetchall() materializes the rows first, so reusing the cursor for the
    # nested queries below does not disturb the outer iteration.
    for chapter_row in cur.fetchall():
        cur.execute('''
            SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order
        ''', (chapter_row['id'],))

        block_payloads = []
        for block_row in cur.fetchall():
            cur.execute('''
                SELECT * FROM block_images WHERE block_id = ? ORDER BY id
            ''', (block_row['id'],))
            image_rows = cur.fetchall()

            block_payloads.append({
                'id': block_row['id'],
                'block_order': block_row['block_order'],
                'block_type': clean_str(block_row['block_type']),
                'content': clean_str(block_row['content']),
                'tts_text': clean_str(block_row['tts_text']),
                'audio_data': clean_str(block_row['audio_data']),
                'audio_format': clean_str(block_row['audio_format']) or 'mp3',
                'transcription': load_transcription(block_row['transcription']),
                'images': [image_payload(image_row) for image_row in image_rows]
            })

        chapter_payloads.append({
            'id': chapter_row['id'],
            'chapter_number': chapter_row['chapter_number'],
            'title': clean_str(chapter_row['title']),
            'voice': clean_str(chapter_row['voice']),
            'blocks': block_payloads
        })

    payload = {
        'id': project_row['id'],
        'name': clean_str(project_row['name']),
        'created_at': clean_str(project_row['created_at']),
        'updated_at': clean_str(project_row['updated_at']),
        'chapters': chapter_payloads
    }

    def generate():
        # ensure_ascii=True escapes every non-ASCII character as \uXXXX, so
        # the stream is pure ASCII and a slice boundary can never split a
        # multi-byte character.
        serialized = json.dumps(payload, ensure_ascii=True)
        step = 64 * 1024  # 64 KB per chunk
        offset = 0
        while offset < len(serialized):
            yield serialized[offset:offset + step]
            offset += step

    return Response(
        stream_with_context(generate()),
        mimetype='application/json; charset=utf-8',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'  # Ask buffering reverse proxies to pass chunks through
        }
    )


@project_bp.route('/api/projects/<int:project_id>', methods=['PUT'])
@login_required
def update_project(project_id):
    """Rename a project and bump its ``updated_at`` timestamp.

    Expects a JSON object with a ``name`` string.

    Responses:
        200: ``{'success': True}``.
        400: name missing, blank or not a string, or another project
             already uses that name.
        404: no project with this id.
        500: any other database error (message echoed in ``error``).
    """
    # A JSON body of null, a list or a scalar has no .get(); treat it as an
    # empty object so the request fails with 400 rather than crashing.
    payload = request.json
    if not isinstance(payload, dict):
        payload = {}

    # Only strings can be stripped; anything else counts as "no name".
    raw_name = payload.get('name')
    name = raw_name.strip() if isinstance(raw_name, str) else ''

    if not name:
        return jsonify({'error': 'Project name is required'}), 400

    db = get_db()
    cursor = db.cursor()

    try:
        cursor.execute('''
            UPDATE projects SET name = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?
        ''', (name, project_id))
        db.commit()
    except Exception as e:
        # Drop the failed transaction so the connection is left clean.
        db.rollback()
        if 'UNIQUE constraint' in str(e):
            return jsonify({'error': 'A project with this name already exists'}), 400
        return jsonify({'error': str(e)}), 500

    # rowcount reflects the UPDATE above: zero rows means the id is unknown.
    if cursor.rowcount == 0:
        return jsonify({'error': 'Project not found'}), 404

    return jsonify({'success': True})


@project_bp.route('/api/projects/<int:project_id>', methods=['DELETE'])
@login_required
def delete_project(project_id):
    """Remove a project along with its chapters, blocks and block images.

    Responds with 404 when the project does not exist. After the deletes
    are committed, vacuum_db() is called.
    """
    conn = get_db()
    cur = conn.cursor()
    params = (project_id,)

    cur.execute('SELECT id FROM projects WHERE id = ?', params)
    found = cur.fetchone()
    if not found:
        return jsonify({'error': 'Project not found'}), 404

    # Delete from the leaves upward: images, then blocks, then chapters,
    # and the project row last.
    cur.execute('''
        DELETE FROM block_images WHERE block_id IN (
            SELECT mb.id FROM markdown_blocks mb
            JOIN chapters c ON mb.chapter_id = c.id
            WHERE c.project_id = ?
        )
    ''', params)

    cur.execute('''
        DELETE FROM markdown_blocks WHERE chapter_id IN (
            SELECT id FROM chapters WHERE project_id = ?
        )
    ''', params)

    cur.execute('DELETE FROM chapters WHERE project_id = ?', params)
    cur.execute('DELETE FROM projects WHERE id = ?', params)

    conn.commit()
    vacuum_db()

    return jsonify({'success': True})


def _validate_chapters(chapters):
    """Return an error message if ``chapters`` cannot be saved, else ``None``.

    Checks only what the insert loop in save_project_content() relies on:
    list/dict shapes and the required ``chapter_number`` / ``block_order`` keys.
    """
    if not isinstance(chapters, list):
        return 'chapters must be a list'
    for chapter in chapters:
        if not isinstance(chapter, dict) or 'chapter_number' not in chapter:
            return 'Each chapter must be an object with a chapter_number'
        blocks = chapter.get('blocks') or []
        if not isinstance(blocks, list):
            return 'Chapter blocks must be a list'
        for block in blocks:
            if not isinstance(block, dict) or 'block_order' not in block:
                return 'Each block must be an object with a block_order'
            images = block.get('images') or []
            if not isinstance(images, list) or not all(isinstance(img, dict) for img in images):
                return 'Block images must be a list of objects'
    return None


@project_bp.route('/api/projects/<int:project_id>/save', methods=['POST'])
@login_required
def save_project_content(project_id):
    """
    Replace all chapters, blocks and images of a project with the posted ones.

    Expects a JSON object with a ``chapters`` list. Every string field is
    sanitized with clean_str() before insertion so that raw control
    characters never enter the database.

    The save is destructive (existing rows are deleted, then re-inserted),
    so the payload is validated before anything is deleted, and any failure
    during the rewrite rolls the transaction back.

    Responses:
        200: ``{'success': True, 'message': ...}``.
        400: body is not a JSON object or the chapter structure is invalid.
        404: no project with this id.
        500: database error while saving (nothing is committed).
    """
    data = request.json
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    chapters = data.get('chapters', [])
    problem = _validate_chapters(chapters)
    if problem:
        return jsonify({'error': problem}), 400

    db = get_db()
    cursor = db.cursor()

    cursor.execute('SELECT id FROM projects WHERE id = ?', (project_id,))
    if not cursor.fetchone():
        return jsonify({'error': 'Project not found'}), 404

    try:
        # Clear existing content from the leaves upward.
        cursor.execute('''
            DELETE FROM block_images WHERE block_id IN (
                SELECT mb.id FROM markdown_blocks mb
                JOIN chapters c ON mb.chapter_id = c.id
                WHERE c.project_id = ?
            )
        ''', (project_id,))

        cursor.execute('''
            DELETE FROM markdown_blocks WHERE chapter_id IN (
                SELECT id FROM chapters WHERE project_id = ?
            )
        ''', (project_id,))

        cursor.execute('DELETE FROM chapters WHERE project_id = ?', (project_id,))

        for chapter in chapters:
            cursor.execute('''
                INSERT INTO chapters (project_id, chapter_number, title, voice)
                VALUES (?, ?, ?, ?)
            ''', (
                project_id,
                chapter['chapter_number'],
                clean_str(chapter.get('title', 'Section')),
                clean_str(chapter.get('voice', 'af_heart'))
            ))

            chapter_id = cursor.lastrowid

            for block in chapter.get('blocks') or []:
                # A missing or null transcription is stored as an empty
                # list rather than the JSON literal 'null'.
                transcription = clean_transcription(block.get('transcription') or [])

                cursor.execute('''
                    INSERT INTO markdown_blocks
                    (chapter_id, block_order, block_type, content, tts_text, audio_data, audio_format, transcription)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    chapter_id,
                    block['block_order'],
                    clean_str(block.get('block_type', 'paragraph')),
                    clean_str(block.get('content', '')),
                    clean_str(block.get('tts_text', '')),
                    clean_str(block.get('audio_data', '')),
                    clean_str(block.get('audio_format', 'mp3')),
                    json.dumps(transcription)
                ))

                block_id = cursor.lastrowid

                for img in block.get('images') or []:
                    cursor.execute('''
                        INSERT INTO block_images (block_id, image_data, image_format, alt_text, position)
                        VALUES (?, ?, ?, ?, ?)
                    ''', (
                        block_id,
                        clean_str(img.get('data', '')),
                        clean_str(img.get('format', 'png')),
                        clean_str(img.get('alt_text', '')),
                        clean_str(img.get('position', 'before'))
                    ))

        cursor.execute('''
            UPDATE projects SET updated_at = CURRENT_TIMESTAMP WHERE id = ?
        ''', (project_id,))

        db.commit()
    except Exception as e:
        # Undo the pending deletes/inserts so a failed save cannot leave the
        # project half-rewritten on this connection.
        db.rollback()
        return jsonify({'error': str(e)}), 500

    return jsonify({'success': True, 'message': 'Project saved successfully'})


# ============================================
# v4.2: Publishing Endpoints
# ============================================

@project_bp.route('/api/projects/<int:project_id>/publish', methods=['POST'])
@login_required
def publish_project(project_id):
    """Mark a project as published and store its public metadata.

    The optional JSON body may carry ``description``, ``author`` and
    ``category``; each is trimmed and defaults to ''. Responds with 404 when
    the project does not exist and 400 when none of its blocks has audio.
    """
    data = request.json or {}

    conn = get_db()
    cur = conn.cursor()

    cur.execute('SELECT id, name FROM projects WHERE id = ?', (project_id,))
    project = cur.fetchone()
    if not project:
        return jsonify({'error': 'Project not found'}), 404

    # Publishing requires at least one block with non-empty audio.
    cur.execute('''
        SELECT COUNT(*) as cnt FROM markdown_blocks mb
        JOIN chapters c ON mb.chapter_id = c.id
        WHERE c.project_id = ? AND mb.audio_data IS NOT NULL AND mb.audio_data != ''
    ''', (project_id,))
    if cur.fetchone()['cnt'] == 0:
        return jsonify({'error': 'Cannot publish: no audio generated yet'}), 400

    def trimmed(key):
        """Return the body's value for ``key`` stripped, or '' if absent/null."""
        return (data.get(key) or '').strip()

    metadata = (trimmed('description'), trimmed('author'), trimmed('category'))

    cur.execute('''
        UPDATE projects
        SET is_published = 1,
            published_at = CURRENT_TIMESTAMP,
            description = ?,
            author = ?,
            category = ?
        WHERE id = ?
    ''', metadata + (project_id,))
    conn.commit()

    return jsonify({
        'success': True,
        'message': f'"{project["name"]}" published successfully!'
    })


@project_bp.route('/api/projects/<int:project_id>/unpublish', methods=['POST'])
@login_required
def unpublish_project(project_id):
    """Clear a project's published flag.

    Only ``is_published`` is reset; author, description and category are
    left in place so the project can be republished easily. Responds with
    404 when the project does not exist.
    """
    conn = get_db()
    cur = conn.cursor()
    params = (project_id,)

    cur.execute('SELECT id FROM projects WHERE id = ?', params)
    existing = cur.fetchone()
    if not existing:
        return jsonify({'error': 'Project not found'}), 404

    cur.execute('UPDATE projects SET is_published = 0 WHERE id = ?', params)
    conn.commit()

    return jsonify({'success': True, 'message': 'Project unpublished'})


@project_bp.route('/api/projects/<int:project_id>/thumbnail', methods=['POST'])
@login_required
def upload_thumbnail(project_id):
    """Store an uploaded image as the project's thumbnail (base64 in the DB).

    Expects a multipart upload under the field name ``file``. The image type
    is judged by filename extension only; the content is not inspected.

    Responses:
        200: ``{'success': True, 'thumbnail_data': <base64>, 'thumbnail_format': ...}``.
        400: no file, unnamed or empty file, unsupported extension, or
             larger than 5 MB.
        404: no project with this id.
    """
    allowed_exts = ('.png', '.jpg', '.jpeg', '.webp', '.gif')
    max_bytes = 5 * 1024 * 1024

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    img_file = request.files['file']
    if not img_file or not img_file.filename:
        return jsonify({'error': 'Invalid file'}), 400

    filename = img_file.filename.lower()
    if not filename.endswith(allowed_exts):
        return jsonify({'error': 'File must be an image (PNG/JPG/WEBP/GIF)'}), 400

    # Read one byte past the cap: enough to detect an oversized upload
    # without pulling an arbitrarily large file into memory.
    img_bytes = img_file.read(max_bytes + 1)
    if len(img_bytes) > max_bytes:
        return jsonify({'error': 'Image too large (max 5MB)'}), 400
    if not img_bytes:
        # A zero-byte upload cannot be an image.
        return jsonify({'error': 'Invalid file'}), 400

    # Stored format is the extension, with 'jpg' normalized to 'jpeg'.
    fmt = filename.rsplit('.', 1)[-1]
    if fmt == 'jpg':
        fmt = 'jpeg'

    b64 = base64.b64encode(img_bytes).decode('utf-8')

    db = get_db()
    cursor = db.cursor()
    cursor.execute('SELECT id FROM projects WHERE id = ?', (project_id,))
    if not cursor.fetchone():
        return jsonify({'error': 'Project not found'}), 404

    cursor.execute('''
        UPDATE projects SET thumbnail_data = ?, thumbnail_format = ? WHERE id = ?
    ''', (b64, fmt, project_id))
    db.commit()

    return jsonify({
        'success': True,
        'thumbnail_data': b64,
        'thumbnail_format': fmt
    })


@project_bp.route('/api/projects/<int:project_id>/thumbnail', methods=['DELETE'])
@login_required
def delete_thumbnail(project_id):
    """Remove the project's thumbnail.

    Sets ``thumbnail_data`` to NULL; the stored thumbnail format is left
    untouched. Responds with 404 when the project does not exist, as the
    other project routes do, instead of reporting success for an unknown id.
    """
    db = get_db()
    cursor = db.cursor()

    cursor.execute('SELECT id FROM projects WHERE id = ?', (project_id,))
    if not cursor.fetchone():
        return jsonify({'error': 'Project not found'}), 404

    cursor.execute('UPDATE projects SET thumbnail_data = NULL WHERE id = ?', (project_id,))
    db.commit()
    return jsonify({'success': True})

# ============================================
# DEBUG: Identify corrupt data
# ============================================

@project_bp.route('/api/projects/<int:project_id>/debug', methods=['GET'])
@login_required
def debug_project(project_id):
    """
    Report which chapter and block fields of a project contain control characters.

    Scans chapter title/voice and the text columns of every block. For each
    dirty field the report lists up to five offending characters (position,
    code point, hex) plus a repr() sample. Responds with 404 when the
    project does not exist.
    Visit: /api/projects/<id>/debug after logging in.
    """
    conn = get_db()
    cur = conn.cursor()

    cur.execute('SELECT id, name FROM projects WHERE id = ?', (project_id,))
    project = cur.fetchone()
    if not project:
        return jsonify({'error': 'Project not found'}), 404

    permitted_codes = (9, 10, 13)  # \t, \n, \r
    chapter_fields = ('title', 'voice')
    block_fields = ('block_type', 'content', 'tts_text', 'audio_data', 'audio_format', 'transcription')

    def find_bad_chars(s):
        """Return up to five {'pos', 'code', 'hex'} dicts for control chars in ``s``."""
        if not isinstance(s, str) or not s:
            return []
        hits = []
        for pos, ch in enumerate(s):
            code = ord(ch)
            # DEL, or anything below 0x20 that is not permitted whitespace.
            if code == 127 or (code < 32 and code not in permitted_codes):
                hits.append({'pos': pos, 'code': code, 'hex': f'0x{code:02x}'})
                if len(hits) >= 5:  # cap at 5 per field
                    break
        return hits

    issues = []

    cur.execute('SELECT * FROM chapters WHERE project_id = ? ORDER BY chapter_number', (project_id,))
    for chapter in cur.fetchall():
        ch_num = chapter['chapter_number']

        for field in chapter_fields:
            bad = find_bad_chars(chapter[field])
            if not bad:
                continue
            issues.append({
                'where': f'chapter {ch_num} -> {field}',
                'bad_chars': bad,
                'sample': repr((chapter[field] or '')[:80])
            })

        cur.execute('SELECT * FROM markdown_blocks WHERE chapter_id = ? ORDER BY block_order', (chapter['id'],))
        for block in cur.fetchall():
            b_order = block['block_order']
            for field in block_fields:
                bad = find_bad_chars(block[field])
                if not bad:
                    continue
                val = block[field] or ''
                first_pos = bad[0]['pos']
                # Show 20 characters on either side of the first offender.
                issues.append({
                    'where': f'chapter {ch_num}, block {b_order} -> {field}',
                    'field_length': len(val),
                    'bad_chars': bad,
                    'sample_around_first_bad': repr(val[max(0, first_pos - 20):first_pos + 20])
                })

    return jsonify({
        'project_id': project['id'],
        'project_name': project['name'],
        'issues': issues,
        'total_issues': len(issues)
    })