Audiobook Maker Pro v4.2 — production ready

2026-05-22 18:28:47 +06:00
commit 0617a374dd
41 changed files with 15262 additions and 0 deletions
--- a/ai_processor.py
+++ b/ai_processor.py
@@ -0,0 +1,321 @@
+# ai_processor.py - Advanced rule-based engine (v2.1)
+# Updated on the basis of the production-grade analysis and optimization you provided
+# 100% AI-free, extremely fast and accurate
+
+import re
+
+# =====================================================================
+# Helper functions
+# =====================================================================
+
+def _clean_markdown(text):
+    """Return ``text`` with basic Markdown markers stripped.
+
+    Every run of ``*`` / ``_`` characters is removed wherever it occurs,
+    then a heading (``#``), block-quote (``>``) or list (``-``) marker at
+    the start of a line is removed together with the whitespace after it.
+    Falsy input (``None`` or ``""``) yields ``""``; the result has no
+    leading or trailing whitespace.
+    """
+    if text:
+        without_emphasis = re.sub(r'[*_]+', '', text)
+        without_line_markers = re.sub(
+            r'^#+\s*|^>\s*|^\-\s*', '', without_emphasis, flags=re.MULTILINE
+        )
+        return without_line_markers.strip()
+    return ""
+
+
+def _get_body_font_size(blocks):
+    """Estimate the font size used by the document's body text.
+
+    Every block that is not an ``image`` / ``table`` and has a truthy
+    ``font_size`` votes for ``round(font_size, 0)``, weighted by the number
+    of words in its Markdown-cleaned content.  The size that collects the
+    most words is returned (a word-weighted mode, not a median).  Returns
+    ``12.0`` when no block qualifies.
+    """
+    words_per_size = {}
+    for blk in blocks:
+        if blk.get('type') in ['image', 'table']:
+            continue
+        if not blk.get('font_size'):
+            continue
+        rounded = round(blk['font_size'], 0)
+        n_words = len(_clean_markdown(blk.get('content', '')).split())
+        # Longer blocks carry proportionally more weight.
+        words_per_size[rounded] = words_per_size.get(rounded, 0) + n_words
+
+    if words_per_size:
+        # On a tie, max() keeps the first maximal key, i.e. the size that
+        # was encountered first (dicts preserve insertion order).
+        return max(words_per_size, key=words_per_size.get)
+    return 12.0
+
+
+# =====================================================================
+# Step 1: Advanced merging (While Loop & Soft Signals)
+# =====================================================================
+
+def _should_merge(current, nxt):
+    """Decide whether ``nxt`` is a continuation fragment of ``current``.
+
+    Args:
+        current: Block dict accumulated so far.
+        nxt: Block dict that immediately follows it.
+
+    Returns:
+        ``True`` when the two blocks should be joined into one block.
+
+    The blocks are never merged when any of these hold: exactly one of
+    them is a heading; either is an image/table; either has no text after
+    Markdown cleaning; either is a protected standalone header such as
+    "INTRODUCTION"; their font sizes differ by more than 1.5; their bold
+    flags differ; or their types differ while at least one of them has 10
+    or more words.  Otherwise a single "strong signal" (see below) is
+    enough to merge.
+
+    ``content``, ``font_size`` and ``is_bold`` may be missing *or* ``None``:
+    they fall back to ``""``, ``12`` and ``False`` respectively instead of
+    raising.
+    """
+    heading_types = {'heading1', 'heading2', 'heading3'}
+    c_type = current.get('type', 'paragraph')
+    n_type = nxt.get('type', 'paragraph')
+
+    # Rule: a heading block is never glued to a non-heading block.
+    if (c_type in heading_types) != (n_type in heading_types):
+        return False
+
+    if c_type in ('image', 'table') or n_type in ('image', 'table'):
+        return False
+
+    # `or ''` guards against an explicit None stored under 'content'.
+    c_clean = _clean_markdown((current.get('content') or '').strip())
+    n_clean = _clean_markdown((nxt.get('content') or '').strip())
+    if not c_clean or not n_clean:
+        return False
+
+    # Rule: protect standalone section headers from being absorbed.
+    standalone = {
+        "INTRODUCTION", "FOREWORD", "PREFACE", "CONCLUSION",
+        "CONTENTS", "TABLE OF CONTENTS", "GLOSSARY", "APPENDIX",
+    }
+    if c_clean.upper() in standalone or n_clean.upper() in standalone:
+        return False
+
+    # Typography check; a missing/None/zero size is treated as 12.
+    c_size = current.get('font_size') or 12
+    n_size = nxt.get('font_size') or 12
+    if abs(c_size - n_size) > 1.5:
+        return False
+
+    # Compare by truthiness so a missing flag equals an explicit False.
+    if bool(current.get('is_bold')) != bool(nxt.get('is_bold')):
+        return False
+
+    # Relaxed type check: a type mismatch is ignored for short blocks.
+    if c_type != n_type:
+        both_short = len(c_clean.split()) < 10 and len(n_clean.split()) < 10
+        if not both_short:
+            return False
+
+    # Soft signals.
+    ends_with_punct = bool(re.search(r'[.!?;:]\s*["\u0027\u2018\u2019\u201C\u201D]?$', c_clean))
+    starts_with_lower = n_clean[0].islower()
+    # "it", "is", "was", "are", "were" are left out on purpose because
+    # they can legitimately end a sentence.
+    prep_regex = r'\b(the|a|an|of|in|to|for|and|or|but|with|from|by|at|on)\s*$'
+
+    # Any single strong signal is enough to merge.
+    return (
+        (not ends_with_punct and starts_with_lower)
+        or (c_clean.isupper() and n_clean.isupper()
+            and len(c_clean) < 80 and len(n_clean) < 80)
+        or c_clean[-1] in ',;-'
+        or bool(re.search(prep_regex, c_clean, re.IGNORECASE))
+    )
+
+def _advanced_merge(blocks):
+    """Join runs of broken fragments (three or more included) into one block.
+
+    Walks ``blocks`` in order and folds each block into the most recently
+    kept one whenever ``_should_merge`` approves.  Kept blocks are shallow
+    copies, so the input dicts are left untouched.  A merged block keeps
+    the first fragment's fields; its ``content`` becomes the two
+    Markdown-cleaned texts joined by a space, re-prefixed with the first
+    fragment's ``#`` / ``##`` / ``###`` heading marker when it had one.
+    Every merge is reported with ``print``.
+    """
+    merged_blocks = []
+
+    for candidate in blocks:
+        if merged_blocks and _should_merge(merged_blocks[-1], candidate):
+            kept = merged_blocks[-1]
+            c_text = kept.get('content', '').strip()
+            c_clean = _clean_markdown(c_text)
+            n_clean = _clean_markdown(candidate.get('content', '').strip())
+
+            # Longest marker first so "### " is not mistaken for "# ".
+            heading_marker = next(
+                (m for m in ('### ', '## ', '# ') if c_text.startswith(m)),
+                "",
+            )
+
+            kept['content'] = f"{heading_marker}{c_clean} {n_clean}".strip()
+            print(f"  🔗 ফ্র্যাগমেন্ট মার্জ করা হয়েছে: \"{c_clean[-20:]} {n_clean[:20]}\"")
+        else:
+            merged_blocks.append(dict(candidate))
+
+    return merged_blocks
+
+
+# =====================================================================
+# Step 2: Cluster-based TOC detection (optimized)
+# =====================================================================
+
+def _detect_toc_region(blocks):
+    """Find table-of-contents clusters and return their block indices.
+
+    Starting at each position, up to 30 consecutive blocks are scanned.
+    A block counts as a TOC *entry* when its cleaned text starts with a
+    chapter-like keyword, starts with a number followed by ``.`` or ``)``,
+    or ends with a dotted leader plus page number.  Blocks of fewer than
+    5 words that are not entries are tolerated as gaps (stray page
+    numbers and the like); a block of 5+ words that is not an entry, or
+    any block of more than 20 words, ends the scan.  Three or more
+    entries in one scan form a cluster.
+
+    Only the span from the first to the last entry is marked, so short
+    blocks that merely precede or follow the entries (a title line, the
+    "Table of Contents" heading, the first short heading after the TOC)
+    are not swallowed into the region.
+
+    Args:
+        blocks: Sequence of block dicts with an optional ``content`` key.
+
+    Returns:
+        Set of indices into ``blocks`` that belong to a TOC cluster.
+    """
+    chapter_like = re.compile(
+        r'^(chapter|part|section|appendix|introduction|conclusion|glossary|index|preface|foreword|contents)',
+        re.IGNORECASE,
+    )
+    numbered = re.compile(r'^\d+[\.\)]\s')
+    dotted_leader = re.compile(r'(\.{3,}|…)\s*\d+$')
+
+    toc_indices = set()
+    i = 0
+
+    while i < len(blocks) - 2:
+        entry_indices = []
+        for j in range(i, min(i + 30, len(blocks))):
+            clean = _clean_markdown(blocks[j].get('content', '')).strip()
+            word_count = len(clean.split())
+
+            if word_count > 20:
+                break  # a long paragraph ends the cluster
+
+            if chapter_like.match(clean) or numbered.match(clean) or dotted_leader.search(clean):
+                entry_indices.append(j)
+            elif word_count >= 5:
+                break
+            # else: short gap block (e.g. a page number); keep scanning
+
+        # Three or more entries make a TOC region.
+        if len(entry_indices) >= 3:
+            # Everything between the first and last entry is either an
+            # entry or a tolerated gap, so the whole span is contiguous.
+            toc_indices.update(range(entry_indices[0], entry_indices[-1] + 1))
+            # Jump past the cluster instead of rescanning its members.
+            i = entry_indices[-1] + 1
+        else:
+            i += 1
+
+    return toc_indices
+
+
+# =====================================================================
+# Step 3: Section scoring and filtering
+# =====================================================================
+
+def _apply_section_scoring(blocks):
+    """Mark section starts using text density, font size and TOC clusters.
+
+    Each block dict is updated in place with ``is_section_start`` and,
+    for scored blocks and the first block, ``section_name``.  Blocks that
+    look like bare page numbers are flagged ``is_page_number`` and left
+    out of the returned list.
+
+    Scoring (a block becomes a section start at 6 points or more):
+      * type heading1/heading2: +5, heading3: +3
+      * font size >= body + 4: +6, >= body + 2: +3
+      * bold: +3
+      * all upper case, 4-79 characters: +3
+      * starts with a chapter-like keyword: +5
+      * "table of contents" / "contents" heading placed before the TOC
+        cluster (or within the first 50 blocks when no cluster exists): +5
+      * more than 20 words: -10; 13-20 words, not bold and not clearly
+        larger than body text: -5
+      * ends with sentence punctuation: -3
+
+    A ``content`` or ``font_size`` that is missing or ``None`` falls back
+    to ``""`` / the body font size instead of raising.
+
+    Args:
+        blocks: List of block dicts (mutated in place).
+
+    Returns:
+        A new list with the same dicts minus the page-number blocks.
+    """
+    body_size = _get_body_font_size(blocks)
+    toc_indices = _detect_toc_region(blocks)
+    # Computed once; min() over the set for every block would be wasteful.
+    first_toc_index = min(toc_indices) if toc_indices else None
+
+    for i, block in enumerate(blocks):
+        if block.get('type') in ['image', 'table']:
+            block['is_section_start'] = False
+            continue
+
+        # `or ''` guards against an explicit None stored under 'content'.
+        clean_text = _clean_markdown((block.get('content') or '').strip())
+
+        if i == 0:
+            # The very first text block always opens a section.
+            block['is_section_start'] = True
+            title = clean_text[:40].strip() + ("..." if len(clean_text) > 40 else "")
+            block['section_name'] = title or "Section 1"
+            continue
+
+        word_count = len(clean_text.split())
+        if word_count == 0:
+            block['is_section_start'] = False
+            continue
+
+        # --- Page-number filtering ---
+        # Catches the "316", "- 316 -" and "Page 316" formats.
+        is_page_number = (
+            bool(re.match(r'^[-—–\s]*\d{1,4}[-—–\s]*$', clean_text))
+            or bool(re.match(r'^page\s+\d{1,4}$', clean_text, re.IGNORECASE))
+        )
+        if is_page_number:
+            block['is_section_start'] = False
+            block['is_page_number'] = True
+            continue
+
+        # Entries inside a TOC cluster never start a section.
+        if i in toc_indices:
+            block['is_section_start'] = False
+            continue
+
+        score = 0
+        is_bold = block.get('is_bold')
+
+        # Factor A: heading type (heading3 also gets a boost).
+        if block.get('type') in ['heading1', 'heading2']:
+            score += 5
+        elif block.get('type') == 'heading3':
+            score += 3
+
+        # Factor B: font size relative to body text.  A missing or None
+        # size is treated as body size, i.e. no bonus.
+        f_size = block.get('font_size') or body_size
+        if f_size >= body_size + 4:
+            score += 6
+        elif f_size >= body_size + 2:
+            score += 3
+
+        # Factor C: font weight.
+        if is_bold:
+            score += 3
+
+        # Factor D: case.
+        if clean_text.isupper() and 3 < len(clean_text) < 80:
+            score += 3
+
+        # Factor E: keywords.
+        lower_text = clean_text.lower()
+        if re.match(r'^(chapter|part|section|appendix|introduction|preface|prologue|epilogue|foreword|glossary|index)', lower_text):
+            score += 5
+
+        # Special rule for the TABLE OF CONTENTS heading itself.
+        if "table of contents" in lower_text or lower_text == "contents":
+            if first_toc_index is None:
+                # No TOC cluster found: accept it only near the start.
+                if i < 50:
+                    score += 5
+            elif i <= first_toc_index:
+                # A cluster exists and this heading precedes it.
+                score += 5
+
+        # --- Penalties ---
+        if word_count > 20:
+            score -= 10
+        elif word_count > 12:
+            # Penalise only when neither bold nor clearly larger than body.
+            if not is_bold and f_size <= body_size + 2:
+                score -= 5
+
+        if re.search(r'[.!?]\s*["\u0027\u2018\u2019\u201C\u201D]?$', clean_text):
+            score -= 3
+
+        # Final decision.
+        if score >= 6:
+            block['is_section_start'] = True
+            title = clean_text[:60].strip()
+            if title.isupper() and len(title) > 10:
+                title = title.title()
+            block['section_name'] = title
+            print(f"  📌 সেকশন চিহ্নিত করা হয়েছে: [{score} pts] {title}")
+        else:
+            block['is_section_start'] = False
+            block['section_name'] = ""
+
+    # Drop page-number blocks entirely so the TTS never reads them aloud.
+    return [b for b in blocks if not b.get('is_page_number')]
+
+
+# =====================================================================
+# Main export function
+# =====================================================================
+
+def process_document_smartly(blocks, metadata):
+    """Run the whole document through the rule-based engine.
+
+    Fragments are merged with ``_advanced_merge`` and the result is passed
+    to ``_apply_section_scoring``; progress is reported with ``print``.
+
+    Args:
+        blocks: List of block dicts.  ``None`` or an empty list is
+            returned unchanged.
+        metadata: Accepted for interface compatibility; not used here.
+
+    Returns:
+        The merged and section-annotated blocks (page-number blocks
+        removed), or ``blocks`` itself when it is empty or ``None``.
+    """
+    # Count defensively: len(None) would raise before the emptiness guard.
+    block_count = len(blocks) if blocks else 0
+
+    print("\n" + "=" * 60, flush=True)
+    print("🚀 অ্যাডভান্সড রুল-বেসড ইঞ্জিন (v2.1) শুরু হচ্ছে...", flush=True)
+    print(f"📄 মোট {block_count} টি ব্লক বিশ্লেষণ করা হচ্ছে।", flush=True)
+
+    if not blocks:
+        # Close the banner so the log frame stays balanced.
+        print("=" * 60 + "\n", flush=True)
+        return blocks
+
+    merged_blocks = _advanced_merge(blocks)
+    print(f"✂️  মার্জ করার পর মোট ব্লক সংখ্যা: {len(merged_blocks)}", flush=True)
+
+    final_blocks = _apply_section_scoring(merged_blocks)
+
+    section_count = sum(1 for b in final_blocks if b.get('is_section_start'))
+    print(f"📑 ডকুমেন্টে মোট {section_count} টি সেকশন পাওয়া গেছে।", flush=True)
+    print("=" * 60 + "\n", flush=True)
+
+    return final_blocks