# audiobook-maker-pro-v4.2/ai_processor.py

# ai_processor.py - অ্যাডভান্সড রুল-বেসড ইঞ্জিন (v2.1)
# আপনার দেয়া প্রোডাকশন-গ্রেড অ্যানালাইসিস এবং অপটিমাইজেশনের ওপর ভিত্তি করে আপডেট করা হয়েছে
# ১০০% এআই-মুক্ত, অত্যন্ত দ্রুত এবং নির্ভুল

import re

# =====================================================================
# হেল্পার ফাংশন
# =====================================================================

def _clean_markdown(text):
    """Return *text* with Markdown markup stripped out.

    Every run of ``*`` or ``_`` is removed wherever it occurs, and a leading
    heading marker (``#``...), block-quote marker (``>``) or list dash (``-``)
    is removed from the start of each line. The result is whitespace-stripped.
    Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""

    # Pass 1: emphasis markers anywhere in the text.
    without_emphasis = re.sub(r'[*_]+', '', text)

    # Pass 2: line-leading structural markers (per line, hence MULTILINE).
    without_prefixes = re.sub(
        r'^#+\s*|^>\s*|^\-\s*',
        '',
        without_emphasis,
        flags=re.MULTILINE,
    )
    return without_prefixes.strip()


def _get_body_font_size(blocks):
    """Estimate the font size of the document's body text.

    Every block that is not an image or table and has a truthy ``font_size``
    contributes its cleaned word count to the tally of its rounded font size,
    so long blocks weigh more than short ones. The size with the largest
    tally wins; on a tie the size encountered first is returned.

    Returns ``12.0`` when no block contributes to the tally.
    """
    words_per_size = {}

    for entry in blocks:
        if entry.get('type') in ['image', 'table']:
            continue
        if not entry.get('font_size'):
            continue

        rounded = round(entry['font_size'], 0)
        words = _clean_markdown(entry.get('content', '')).split()
        words_per_size[rounded] = words_per_size.get(rounded, 0) + len(words)

    if not words_per_size:
        return 12.0

    # The size carrying the most words is taken to be the body font.
    return max(words_per_size, key=words_per_size.get)


# =====================================================================
# স্টেপ ১: অ্যাডভান্সড মার্জিং (While Loop & Soft Signals)
# =====================================================================

def _should_merge(current, nxt):
    """Decide whether ``nxt`` is a continuation of ``current``.

    Hard vetoes (any one returns ``False``):
      * exactly one of the two blocks is a heading (heading1-3);
      * either block is an image or a table;
      * font sizes differ by more than 1.5 (a missing, ``None`` or zero size
        counts as 12, in line with how ``_get_body_font_size`` ignores falsy
        sizes);
      * boldness differs (compared by truthiness, so a missing ``is_bold``
        equals ``False``);
      * either block has no text left after Markdown cleaning;
      * either block is a standalone title such as "INTRODUCTION";
      * the block types differ, unless both blocks are under 10 words.

    If nothing vetoes the pair, it merges when at least one continuation
    signal is present: the first block lacks closing punctuation and the
    second starts lowercase; both are short all-caps lines; the first ends
    in ``,``, ``;`` or ``-``; or the first ends in an article, preposition
    or conjunction.

    Args:
        current: The block (dict) accumulated so far.
        nxt: The block (dict) that immediately follows it.

    Returns:
        ``True`` if the two blocks should be joined, ``False`` otherwise.
    """
    heading_types = {'heading1', 'heading2', 'heading3'}
    c_type = current.get('type', 'paragraph')
    n_type = nxt.get('type', 'paragraph')

    # A heading is never glued to a non-heading (in either order).
    if (c_type in heading_types) != (n_type in heading_types):
        return False

    if c_type in ['image', 'table'] or n_type in ['image', 'table']:
        return False

    # Typography is checked before any text work: these tests are cheap and
    # must not crash when the extractor left a field as None.
    c_size = current.get('font_size') or 12
    n_size = nxt.get('font_size') or 12
    if abs(c_size - n_size) > 1.5:
        return False

    if bool(current.get('is_bold')) != bool(nxt.get('is_bold')):
        return False

    c_clean = _clean_markdown((current.get('content') or '').strip())
    n_clean = _clean_markdown((nxt.get('content') or '').strip())
    if not c_clean or not n_clean:
        return False

    # Well-known standalone titles must stay on their own.
    standalone = {
        "INTRODUCTION", "FOREWORD", "PREFACE", "CONCLUSION",
        "CONTENTS", "TABLE OF CONTENTS", "GLOSSARY", "APPENDIX",
    }
    if c_clean.upper() in standalone or n_clean.upper() in standalone:
        return False

    # Relaxed type check: a type mismatch is tolerated only when both
    # blocks are short (under 10 words each).
    if c_type != n_type:
        both_short = len(c_clean.split()) < 10 and len(n_clean.split()) < 10
        if not both_short:
            return False

    # Soft signals.
    ends_with_punct = bool(
        re.search(r'[.!?;:]\s*["\u0027\u2018\u2019\u201C\u201D]?$', c_clean)
    )
    starts_with_lower = n_clean[0].islower()
    # "it", "is", "was", "are", "were" are deliberately absent: they can
    # legitimately end a sentence.
    prep_regex = r'\b(the|a|an|of|in|to|for|and|or|but|with|from|by|at|on)\s*$'

    # Any single strong signal is enough to merge.
    return bool(
        (not ends_with_punct and starts_with_lower)
        or (c_clean.isupper() and n_clean.isupper()
            and len(c_clean) < 80 and len(n_clean) < 80)
        or c_clean[-1] in ',;-'
        or re.search(prep_regex, c_clean, re.IGNORECASE)
    )

def _advanced_merge(blocks):
    """Join runs of broken fragments (two, three or more) into single blocks.

    Walks ``blocks`` in order. Each block is shallow-copied, then every
    following block that ``_should_merge`` accepts is folded into that copy.
    The merged ``content`` is the Markdown-cleaned text of both parts; a
    leading ``#``/``##``/``###`` heading marker of the first part is put
    back in front. All other fields keep the values of the first block.

    A fragment that ends in a hyphen attached to a letter or digit
    ("under-") is treated as a word split across blocks: the next fragment
    is appended without a space ("under-standing") instead of producing
    "under- standing". The hyphen itself is kept because it cannot be told
    apart from a genuine compound ("well-" + "known").

    A log line is printed for every merge.

    Args:
        blocks: Sequence of block dicts; the input dicts are not modified.

    Returns:
        A new list of block dicts with continuation fragments merged.
    """
    merged_blocks = []
    total = len(blocks)
    i = 0

    while i < total:
        current = dict(blocks[i])

        # Keep absorbing the following block for as long as it qualifies.
        while i < total - 1 and _should_merge(current, blocks[i + 1]):
            nxt = blocks[i + 1]
            c_text = (current.get('content') or '').strip()
            c_clean = _clean_markdown(c_text)
            n_clean = _clean_markdown((nxt.get('content') or '').strip())

            # Cleaning strips heading markers; remember which one to restore.
            prefix = ""
            for marker in ('### ', '## ', '# '):
                if c_text.startswith(marker):
                    prefix = marker
                    break

            # Word split at a hyphen: close the gap rather than insert a space.
            split_word = (
                len(c_clean) > 1
                and c_clean[-1] == '-'
                and c_clean[-2].isalnum()
                and n_clean[:1].isalnum()
            )
            joined = f"{c_clean}{n_clean}" if split_word else f"{c_clean} {n_clean}"

            current['content'] = f"{prefix}{joined}".strip()
            print(f"  🔗 ফ্র্যাগমেন্ট মার্জ করা হয়েছে: \"{c_clean[-20:]} {n_clean[:20]}\"")
            i += 1

        merged_blocks.append(current)
        i += 1

    return merged_blocks


# =====================================================================
# স্টেপ ২: ক্লাস্টার ভিত্তিক TOC ডিটেকশন (অপটিমাইজড)
# =====================================================================

def _detect_toc_region(blocks):
    """Return the indices of blocks that belong to a table-of-contents cluster.

    From each start position a window of at most 30 blocks is scanned:
      * a block of more than 20 words ends the scan;
      * a block that starts with a chapter-like keyword, starts with a
        number followed by ``.`` or ``)``, or ends in dot leaders plus a
        number counts as a TOC entry;
      * any other block under 5 words is a gap (e.g. a stray page number)
        and is skipped without ending the scan;
      * anything else ends the scan.

    A scan with three or more entries is a TOC cluster. Only the span from
    the first entry to the last entry is recorded; gap blocks before the
    first entry or after the last one are left out, so that a short heading
    directly before or after the TOC is not swallowed by it. Scanning then
    resumes right after the last entry.

    Args:
        blocks: Sequence of block dicts; only ``content`` is read.

    Returns:
        A set of integer indices into ``blocks``.
    """
    # Loop-invariant patterns, compiled once.
    chapter_re = re.compile(
        r'^(chapter|part|section|appendix|introduction|conclusion|glossary|index|preface|foreword|contents)',
        re.IGNORECASE,
    )
    numbered_re = re.compile(r'^\d+[\.\)]\s')
    leader_re = re.compile(r'(\.{3,}|…)\s*\d+$')

    toc_indices = set()
    total = len(blocks)
    i = 0

    while i < total - 2:
        entry_positions = []

        for j in range(i, min(i + 30, total)):
            clean = _clean_markdown(blocks[j].get('content', '')).strip()
            word_count = len(clean.split())

            if word_count > 20:
                break  # a real paragraph ends the cluster

            if chapter_re.match(clean) or numbered_re.match(clean) or leader_re.search(clean):
                entry_positions.append(j)
            elif word_count >= 5:
                break
            # Otherwise: a short gap block; keep scanning.

        if len(entry_positions) >= 3:
            first_entry, last_entry = entry_positions[0], entry_positions[-1]
            # The scan is contiguous, so every index between two entries
            # is either an entry or an interior gap.
            toc_indices.update(range(first_entry, last_entry + 1))
            i = last_entry + 1  # jump past the cluster instead of rescanning it
        else:
            i += 1

    return toc_indices


# =====================================================================
# স্টেপ ৩: সেকশন স্কোরিং এবং ফিল্টারিং
# =====================================================================

def _apply_section_scoring(blocks):
    """Flag section-start blocks and drop page-number blocks.

    Blocks are annotated in place with ``is_section_start`` (and, where a
    score is computed or the block is first, ``section_name``).

    Handling per block, in order:
      * image/table blocks are never section starts;
      * the block at index 0 is always a section start, named after its
        first 40 characters (or "Section 1" when it has no text);
      * empty blocks are not section starts;
      * page numbers ("316", "- 316 -", "Page 316") are marked
        ``is_page_number`` and removed from the returned list;
      * blocks inside a detected TOC region are not section starts;
      * every other block is scored, and a score of 6 or more makes it a
        section start.

    Score contributions:
      * type heading1/heading2: +5; heading3: +3
      * font size >= body + 4: +6; >= body + 2: +3 (a missing, ``None`` or
        zero size is treated as the body size)
      * bold: +3
      * all caps, 4-79 characters: +3
      * starts with a structural keyword (chapter, part, ...): +5
      * "table of contents"/"contents" title: +5 if no TOC cluster was
        found and the block is among the first 50, or if it precedes the
        detected cluster
      * more than 20 words: -10; 13-20 words, not bold, not larger than
        body + 2: -5
      * ends in ``.``, ``!`` or ``?`` (optionally followed by a quote): -3

    A log line is printed for every section start found by scoring.

    Args:
        blocks: List of block dicts; mutated in place.

    Returns:
        A new list containing every block except detected page numbers.
    """
    if not blocks:
        return []

    body_size = _get_body_font_size(blocks)
    toc_indices = _detect_toc_region(blocks)
    first_toc_index = min(toc_indices) if toc_indices else None

    for i, block in enumerate(blocks):
        if block.get('type') in ['image', 'table']:
            block['is_section_start'] = False
            continue

        clean_text = _clean_markdown((block.get('content') or '').strip())

        if i == 0:
            block['is_section_start'] = True
            title = clean_text[:40].strip() + ("..." if len(clean_text) > 40 else "")
            block['section_name'] = title or "Section 1"
            continue

        word_count = len(clean_text.split())
        if word_count == 0:
            block['is_section_start'] = False
            continue

        # Page-number filter: "316", "- 316 -", "Page 316".
        is_page_number = bool(re.match(r'^[-—–\s]*\d{1,4}[-—–\s]*$', clean_text)) or \
                         bool(re.match(r'^page\s+\d{1,4}$', clean_text, re.IGNORECASE))
        if is_page_number:
            block['is_section_start'] = False
            block['is_page_number'] = True
            continue

        # Entries inside the TOC are never section starts.
        if i in toc_indices:
            block['is_section_start'] = False
            continue

        score = 0
        is_bold = block.get('is_bold')

        # Factor A: heading type.
        if block.get('type') in ['heading1', 'heading2']:
            score += 5
        elif block.get('type') == 'heading3':
            score += 3

        # Factor B: font size relative to body text. `or` also covers a
        # font_size key that is present but None, which would otherwise
        # raise on the comparison below.
        f_size = block.get('font_size') or body_size
        if f_size >= body_size + 4:
            score += 6
        elif f_size >= body_size + 2:
            score += 3

        # Factor C: font weight.
        if is_bold:
            score += 3

        # Factor D: letter case.
        if clean_text.isupper() and 3 < len(clean_text) < 80:
            score += 3

        # Factor E: structural keywords.
        lower_text = clean_text.lower()
        if re.match(r'^(chapter|part|section|appendix|introduction|preface|prologue|epilogue|foreword|glossary|index)', lower_text):
            score += 5

        # Special rule for the "Table of Contents" title itself.
        if "table of contents" in lower_text or "contents" == lower_text:
            if first_toc_index is None:
                # No cluster detected: accept it only near the document start.
                if i < 50:
                    score += 5
            elif i <= first_toc_index:
                # A cluster exists and this title comes before it.
                score += 5

        # Penalties.
        if word_count > 20:
            score -= 10
        elif word_count > 12:
            # Penalise only when the block is neither bold nor enlarged.
            if not is_bold and f_size <= body_size + 2:
                score -= 5

        if re.search(r'[.!?]\s*["\u0027\u2018\u2019\u201C\u201D]?$', clean_text):
            score -= 3

        # Final decision.
        if score >= 6:
            block['is_section_start'] = True
            title = clean_text[:60].strip()
            if title.isupper() and len(title) > 10:
                title = _title_case_heading(title)
            block['section_name'] = title
            print(f"  📌 সেকশন চিহ্নিত করা হয়েছে: [{score} pts] {title}")
        else:
            block['is_section_start'] = False
            block['section_name'] = ""

    # Page-number blocks are removed entirely so the TTS never reads them.
    return [b for b in blocks if not b.get('is_page_number')]


def _title_case_heading(text):
    """Title-case ``text`` without capitalising contraction/possessive tails.

    ``str.title()`` treats an apostrophe as a word boundary and turns
    "KING'S" into "King'S". Here an apostrophe followed by one or two
    letters that end the word ('s, 't, 'm, 'd, 'll, 're, 've) stays part of
    the word, while a longer tail starts a new word ("O'NEIL" -> "O'Neil").
    """
    return re.sub(
        r"[^\W\d_]+(?:['\u2019][^\W\d_]{1,2}(?![^\W\d_]))*",
        lambda match: match.group(0).capitalize(),
        text,
    )


# =====================================================================
# মেইন এক্সপোর্ট ফাংশন
# =====================================================================

def process_document_smartly(blocks, metadata):
    """Run the whole rule-based pipeline over a document.

    Steps: merge broken fragments (``_advanced_merge``), then flag section
    starts and drop page numbers (``_apply_section_scoring``). Progress is
    printed to stdout.

    Args:
        blocks: List of block dicts, or ``None``/empty for an empty document.
        metadata: Accepted for interface compatibility; not read here.

    Returns:
        The processed list of blocks. When ``blocks`` is ``None`` or empty
        it is returned unchanged.
    """
    # Count defensively: the emptiness guard below also covers None, so the
    # banner must not call len() on it first.
    block_total = len(blocks) if blocks else 0

    print("\n" + "=" * 60, flush=True)
    print("🚀 অ্যাডভান্সড রুল-বেসড ইঞ্জিন (v2.1) শুরু হচ্ছে...", flush=True)
    print(f"📄 মোট {block_total} টি ব্লক বিশ্লেষণ করা হচ্ছে।", flush=True)

    if not blocks:
        return blocks

    merged_blocks = _advanced_merge(blocks)
    print(f"✂️  মার্জ করার পর মোট ব্লক সংখ্যা: {len(merged_blocks)}", flush=True)

    final_blocks = _apply_section_scoring(merged_blocks)

    section_count = sum(1 for b in final_blocks if b.get('is_section_start'))
    print(f"📑 ডকুমেন্টে মোট {section_count} টি সেকশন পাওয়া গেছে।", flush=True)
    print("=" * 60 + "\n", flush=True)

    return final_blocks