# docx_processor.py - DOCX/DOC Processing and Content Extraction # FIXED: Split paragraphs that contain soft returns () into separate blocks # FIXED: Extract text from hyperlinks in DOCX import io import re import base64 import email import email.policy import quopri from html.parser import HTMLParser from urllib.parse import unquote, urlparse # ================================================================ # FORMAT DETECTION # ================================================================ def detect_doc_format(file_bytes): if not file_bytes or len(file_bytes) < 4: return 'unknown' if file_bytes[:4] == b'PK\x03\x04': return 'docx' if file_bytes[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': return 'ole2' if file_bytes[:4] == b'\xd0\xcf\x11\xe0': return 'ole2' header = file_bytes[:64] for bom in [b'\xef\xbb\xbf', b'\xff\xfe', b'\xfe\xff']: if header.startswith(bom): header = header[len(bom):] break if header.lstrip().startswith(b'{\\rtf'): return 'rtf' header_str = header.decode('ascii', errors='ignore').strip() if header_str.upper().startswith('MIME-VERSION'): return 'mhtml' sample_512 = file_bytes[:512].decode('ascii', errors='ignore').lower() if 'mime-version' in sample_512 and 'content-type' in sample_512 and 'boundary' in sample_512: return 'mhtml' sample = file_bytes[:4096] sample_str_lower = '' for enc in ['utf-8', 'utf-16-le', 'utf-16-be', 'cp1252', 'latin-1']: try: sample_str_lower = sample.decode(enc, errors='ignore').lower().strip() if sample_str_lower: break except Exception: continue if sample_str_lower: html_markers = [ '', 'mso-', ] for marker in html_markers: if marker in sample_str_lower: return 'html' return 'unknown' # ================================================================ # IMAGE NORMALIZATION HELPERS # ================================================================ def _normalize_image_key(raw_location): if not raw_location: return '' loc = raw_location.strip() if loc.lower().startswith('cid:'): loc = loc[4:] for _ in range(3): decoded = unquote(loc) if decoded == loc: break loc = decoded try: parsed = urlparse(loc) path = parsed.path if parsed.path else loc except Exception: path = loc filename = path.replace('\\', '/').rsplit('/', 1)[-1].strip() return filename.lower() # ================================================================ # MHTML PROCESSOR # ================================================================ class MHTMLProcessor: def __init__(self, file_bytes): self._file_bytes = file_bytes self._embedded_images = {} self._ordered_images = [] def process(self): html_content = self._extract_html_from_mhtml() if not html_content: html_content = self._fallback_extract() if not html_content: return { 'metadata': {'title': '', 'author': '', 'subject': ''}, 'blocks': [{'type': 'paragraph', 'content': '⚠️ Could not extract content from MHTML file.'}], } print(f" 📷 MHTML: {len(self._ordered_images)} image parts") processor = HTMLDocProcessor( html_content, embedded_images=self._embedded_images, ordered_images=self._ordered_images ) result = processor.process() img_blocks = sum(1 for b in result.get('blocks', []) if b.get('type') == 'image') print(f"📄 MHTML processed: {len(result.get('blocks', []))} blocks ({img_blocks} images)") return result def _store_image(self, payload_bytes, content_type, content_location, content_id): fmt = content_type.split('/')[-1].lower() if fmt in ('x-wmf', 'x-emf', 'wmf', 'emf'): return if fmt == 'jpg': fmt = 'jpeg' b64_data = base64.b64encode(payload_bytes).decode('ascii') self._ordered_images.append((b64_data, fmt, content_location or content_id or '')) if content_location: self._embedded_images[content_location] = (b64_data, fmt) norm = _normalize_image_key(content_location) if norm: self._embedded_images[norm] = (b64_data, fmt) if content_id: self._embedded_images[f'cid:{content_id}'] = (b64_data, fmt) self._embedded_images[content_id] = (b64_data, fmt) def _extract_html_from_mhtml(self): try: msg = email.message_from_bytes(self._file_bytes, policy=email.policy.default) html_body = None if msg.is_multipart(): for part in msg.walk(): ct = part.get_content_type() cl = part.get('Content-Location', '').strip() cid = part.get('Content-ID', '').strip('<> ') if ct == 'text/html': payload = part.get_payload(decode=True) if payload: cs = part.get_content_charset() or 'utf-8' try: html_body = payload.decode(cs, errors='ignore') except: html_body = payload.decode('utf-8', errors='ignore') elif ct and ct.startswith('image/'): payload = part.get_payload(decode=True) if payload and len(payload) > 100: self._store_image(payload, ct, cl, cid) else: ct = msg.get_content_type() if ct in ('text/html', 'multipart/related'): payload = msg.get_payload(decode=True) if payload: cs = msg.get_content_charset() or 'utf-8' try: html_body = payload.decode(cs, errors='ignore') except: html_body = payload.decode('utf-8', errors='ignore') return html_body except Exception as e: print(f" ⚠️ MIME parsing failed: {e}") return None def _fallback_extract(self): try: text = self._file_bytes.decode('ascii', errors='ignore') bm = re.search(r'boundary="?([^\s";\r\n]+)"?', text, re.IGNORECASE) if not bm: return None boundary = bm.group(1) parts = text.split(f'--{boundary}') html_body = None for part in parts: he = part.find('\r\n\r\n') if he == -1: he = part.find('\n\n') if he == -1: continue hs = part[:he]; body = part[he:].strip() ctm = re.search(r'Content-Type:\s*([^\s;]+)', hs, re.IGNORECASE) ct = ctm.group(1).lower() if ctm else '' is_qp = bool(re.search(r'Content-Transfer-Encoding:\s*quoted-printable', hs, re.IGNORECASE)) is_b64 = bool(re.search(r'Content-Transfer-Encoding:\s*base64', hs, re.IGNORECASE)) clm = re.search(r'Content-Location:\s*(.+?)[\r\n]', hs, re.IGNORECASE) cl = clm.group(1).strip() if clm else '' cidm = re.search(r'Content-ID:\s*\s\r\n]+)>?', hs, re.IGNORECASE) cid = cidm.group(1).strip() if cidm else '' if ct == 'text/html': if is_qp: body = quopri.decodestring(body.encode('ascii', errors='ignore')).decode('utf-8', errors='ignore') elif is_b64: try: body = base64.b64decode(body).decode('utf-8', errors='ignore') except: pass if ' 100: self._store_image(pb, ct, cl, cid) except: pass return html_body except Exception as e: print(f" ⚠️ Fallback MHTML failed: {e}") return None # ================================================================ # HTML COMMENT CLEANUP # ================================================================ def _clean_html_comments(html_text): html_text = re.sub(r'(.*?)', r'\1', html_text, flags=re.DOTALL|re.IGNORECASE) html_text = re.sub(r'(.*?)', r'\1', html_text, flags=re.DOTALL|re.IGNORECASE) html_text = re.sub(r'', '', html_text, flags=re.DOTALL|re.IGNORECASE) html_text = re.sub(r'', '', html_text, flags=re.DOTALL) return html_text # ================================================================ # HTML DOC PROCESSOR # ================================================================ class HTMLDocProcessor: def __init__(self, file_bytes, embedded_images=None, ordered_images=None): if isinstance(file_bytes, str): self._html_text = file_bytes self._file_bytes = file_bytes.encode('utf-8', errors='ignore') else: self._file_bytes = file_bytes self._html_text = self._decode_html() self._embedded_images = embedded_images or {} self._ordered_images = ordered_images or [] self._used_image_indices = set() def _decode_html(self): if self._file_bytes[:3] == b'\xef\xbb\xbf': return self._file_bytes[3:].decode('utf-8', errors='ignore') if self._file_bytes[:2] == b'\xff\xfe': return self._file_bytes[2:].decode('utf-16-le', errors='ignore') if self._file_bytes[:2] == b'\xfe\xff': return self._file_bytes[2:].decode('utf-16-be', errors='ignore') sample = self._file_bytes[:4096] try: st = sample.decode('ascii', errors='ignore') cm = re.search(r'charset[="\s]+([a-zA-Z0-9\-]+)', st, re.IGNORECASE) if cm: try: return self._file_bytes.decode(cm.group(1).strip().strip('"\''), errors='ignore') except: pass except: pass for enc in ['utf-8', 'cp1252', 'latin-1']: try: return self._file_bytes.decode(enc, errors='ignore') except: continue return self._file_bytes.decode('latin-1', errors='replace') def process(self): metadata = {'title': '', 'author': '', 'subject': ''} tm = re.search(r']*>(.*?)', self._html_text, re.IGNORECASE|re.DOTALL) if tm: metadata['title'] = self._strip_tags(tm.group(1)).strip() blocks = self._extract_all_blocks() blocks = [b for b in blocks if b.get('content', '').strip() or b.get('data')] if not blocks: blocks = self._simple_extract() img_count = sum(1 for b in blocks if b.get('type') == 'image') print(f"📄 HTML-DOC processed: {len(blocks)} blocks ({img_count} images)") return {'metadata': metadata, 'blocks': blocks} def _strip_tags(self, html_str): import html as hm return hm.unescape(re.sub(r'<[^>]+>', '', html_str)) def _resolve_image_src(self, src): import html as hm if not src: return None, None src = hm.unescape(src).strip() if src.startswith('data:image'): dm = re.match(r'data:image/([^;]+);base64,(.+)', src, re.DOTALL) if dm: return dm.group(2).strip(), dm.group(1) if src in self._embedded_images: self._mark_used(self._embedded_images[src][0]); return self._embedded_images[src] ns = _normalize_image_key(src) if ns and ns in self._embedded_images: self._mark_used(self._embedded_images[ns][0]); return self._embedded_images[ns] if ns and '.' in ns: nne = ns.rsplit('.', 1)[0] if nne and nne in self._embedded_images: self._mark_used(self._embedded_images[nne][0]); return self._embedded_images[nne] if ns: for loc, (data, fmt) in self._embedded_images.items(): ln = _normalize_image_key(loc) if ln and ns and ln == ns: self._mark_used(data); return data, fmt return self._get_next_unused() def _mark_used(self, data_prefix): p = data_prefix[:60] for i, (b, f, l) in enumerate(self._ordered_images): if i not in self._used_image_indices and b[:60] == p: self._used_image_indices.add(i); return def _get_next_unused(self): for i, (b, f, l) in enumerate(self._ordered_images): if i not in self._used_image_indices: self._used_image_indices.add(i); return b, f return None, None def _extract_all_blocks(self): import html as hm blocks = [] cleaned = re.sub(r']*>.*?', '', self._html_text, flags=re.DOTALL|re.IGNORECASE) cleaned = re.sub(r']*>.*?', '', cleaned, flags=re.DOTALL|re.IGNORECASE) vml_srcs = [] for vm in re.finditer(r'', cleaned, re.DOTALL|re.IGNORECASE): for im in re.finditer(r']*?\bsrc\s*=\s*["\']([^"\']+)["\']', vm.group(1), re.IGNORECASE|re.DOTALL): vml_srcs.append((hm.unescape(im.group(1)), vm.start())) cleaned = _clean_html_comments(cleaned) cleaned = re.sub(r']+>', '', cleaned, flags=re.IGNORECASE) bm = re.search(r']*>(.*)', cleaned, re.IGNORECASE|re.DOTALL) if bm: cleaned = bm.group(1) img_entries = [] for m in re.finditer(r']*?)/?\s*>', cleaned, re.IGNORECASE|re.DOTALL): sm = re.search(r'\bsrc\s*=\s*["\']([^"\']+)["\']', m.group(1), re.IGNORECASE) if not sm: sm = re.search(r'\bsrc\s*=\s*(\S+)', m.group(1), re.IGNORECASE) if sm: img_entries.append((hm.unescape(sm.group(1)), m.start())) if not img_entries and vml_srcs: img_entries = vml_srcs self._used_image_indices = set() for src, pos in img_entries: d, f = self._resolve_image_src(src) if d: blocks.append({'type':'image','content':f"![Image](embedded-image.{f})",'data':d,'format':f,'_pos':pos}) for m in re.finditer(r'<(h[1-6])\b[^>]*>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): t = re.sub(r'\s+', ' ', self._strip_tags(m.group(2))).strip() if t: tag = m.group(1).lower() p = {'h1':'# ','h2':'## '}.get(tag,'### ') bt = {'h1':'heading1','h2':'heading2'}.get(tag,'heading3') blocks.append({'type':bt,'content':f"{p}{t}",'_pos':m.start()}) for m in re.finditer(r']*>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): md = self._parse_table(m.group(1)) if md: blocks.append({'type':'table','content':md,'_pos':m.start()}) for m in re.finditer(r']*)>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): inner = m.group(2); attrs = m.group(1) it = self._strip_tags(inner).strip() hw = not it or all(c in ' \t\n\r\xa0' for c in it) if hw: continue t = re.sub(r'[ \t]+', ' ', re.sub(r'\n\s*\n', '\n', it)).strip() if not t: continue bt = 'paragraph' cm = re.search(r'class\s*=\s*["\']?([^"\'>\s]+)', attrs, re.IGNORECASE) cn = cm.group(1) if cm else '' if 'MsoListParagraph' in cn: t = re.sub(r'^[·•●○◦‣⁃]\s*', '', re.sub(r'^\d+[.)]\s*', '', t)); bt = 'list_item' elif 'MsoTitle' in cn: bt = 'heading1' elif 'MsoSubtitle' in cn: bt = 'heading2' elif 'MsoQuote' in cn or 'MsoIntenseQuote' in cn: bt = 'quote' pm = {'heading1':'# ','heading2':'## ','list_item':'- ','quote':'> '} blocks.append({'type':bt,'content':f"{pm.get(bt,'')}{t}",'_pos':m.start()}) for m in re.finditer(r']*>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): t = re.sub(r'\s+', ' ', self._strip_tags(m.group(1))).strip() if t: blocks.append({'type':'list_item','content':f"- {t}",'_pos':m.start()}) for m in re.finditer(r']*>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): t = re.sub(r'\s+', ' ', self._strip_tags(m.group(1))).strip() if t: blocks.append({'type':'quote','content':f"> {t}",'_pos':m.start()}) for m in re.finditer(r']*)>(.*?)', cleaned, re.IGNORECASE|re.DOTALL): if re.search(r'<(?:p|h[1-6]|table|div|ul|ol)\b', m.group(2), re.IGNORECASE): continue t = re.sub(r'[ \t]+', ' ', self._strip_tags(m.group(2))).strip() if t and len(t) > 1 and not all(c in ' \t\n\r\xa0' for c in t): if not any(t in b.get('content','') for b in blocks): blocks.append({'type':'paragraph','content':t,'_pos':m.start()}) blocks.sort(key=lambda b: b.get('_pos', 0)) seen = set(); deduped = [] for b in blocks: b.pop('_pos', None) if b.get('type') == 'image': k = b.get('data','')[:60] if k and k in seen: continue if k: seen.add(k) deduped.append(b) else: c = b.get('content','').strip() if c and c not in seen: seen.add(c); deduped.append(b) return deduped def _parse_table(self, html): rows = [] for rm in re.finditer(r']*>(.*?)', html, re.IGNORECASE|re.DOTALL): cells = [] for cm in re.finditer(r']*>(.*?)', rm.group(1), re.IGNORECASE|re.DOTALL): cells.append(re.sub(r'\s+', ' ', self._strip_tags(cm.group(1))).strip().replace('|','\\|')) if cells: rows.append(cells) if not rows: return '' if all(len(r)==1 for r in rows) and len(rows)<=2: return '' lines = [] for i, r in enumerate(rows): lines.append('| '+' | '.join(r)+' |') if i == 0: lines.append('| '+' | '.join(['---']*len(r))+' |') return '\n'.join(lines) def _simple_extract(self): import html as hm blocks = []; t = self._html_text t = re.sub(r']*>.*?', '', t, flags=re.DOTALL|re.IGNORECASE) t = re.sub(r']*>.*?', '', t, flags=re.DOTALL|re.IGNORECASE) t = _clean_html_comments(t) bm = re.search(r']*>(.*)', t, re.IGNORECASE|re.DOTALL) if bm: t = bm.group(1) for tag, repl in [('br', '\n'), ('p', '\n\n'), ('div', '\n\n'), ('li', '\n'), ('tr', '\n'), ('table', '\n\n')]: t = re.sub(rf']*>', repl, t, flags=re.IGNORECASE) t = hm.unescape(re.sub(r'<[^>]+>', '', t)) for p in re.split(r'\n{2,}', t): p = re.sub(r'[ \t]+', ' ', p).strip() if p and len(p) > 1: blocks.append({'type':'paragraph','content':p}) return blocks # ================================================================ # RTF DOC PROCESSOR # ================================================================ class RTFDocProcessor: def __init__(self, file_bytes): self._file_bytes = file_bytes def process(self): blocks = []; metadata = {'title':'','author':'','subject':''} rtf = self._decode_rtf(); metadata.update(self._extract_meta(rtf)) pt = self._rtf_to_text(rtf) if pt: for p in re.split(r'\n{2,}', pt): p = p.strip() if not p: continue if len(p) < 80 and p.isupper(): blocks.append({'type':'heading2','content':f"## {p}"}) else: blocks.append({'type':'paragraph','content':p}) print(f"📄 RTF-DOC processed: {len(blocks)} blocks") return {'metadata': metadata, 'blocks': blocks} def _decode_rtf(self): d = self._file_bytes for b in [b'\xef\xbb\xbf',b'\xff\xfe',b'\xfe\xff']: if d.startswith(b): d = d[len(b):]; break try: return d.decode('ascii', errors='ignore') except: return d.decode('latin-1', errors='replace') def _extract_meta(self, rtf): m = {} for f in ['title','author','subject']: r = re.search(r'\\'+f+r'\s+([^}]+)', rtf) if r: m[f] = r.group(1).strip() return m def _rtf_to_text(self, rtf): try: from striprtf.striprtf import rtf_to_text return rtf_to_text(rtf, errors='ignore') except ImportError: pass except Exception: pass t = rtf for g in ['fonttbl','colortbl','stylesheet','info','header','footer']: t = re.sub(r'\{\\'+re.escape(g)+r'[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', '', t, flags=re.DOTALL) t = re.sub(r'\\par\b\s*','\n',t); t = re.sub(r'\\pard\b\s*','',t) t = re.sub(r'\\line\b\s*','\n',t); t = re.sub(r'\\tab\b\s*','\t',t) def hr(m): try: return bytes([int(m.group(1),16)]).decode('cp1252',errors='ignore') except: return '' t = re.sub(r"\\\'([0-9a-fA-F]{2})", hr, t) def ur(m): try: c = int(m.group(1)) if c < 0: c += 65536 return chr(c) except: return '' t = re.sub(r'\\u(-?\d+)\??', ur, t) t = re.sub(r'\\[a-zA-Z]+\d*\s?','',t); t = re.sub(r'[{}]','',t) return re.sub(r'\n{3,}','\n\n',re.sub(r' +',' ',t)).strip() # ================================================================ # DOCX PROCESSOR (using python-docx) # ================================================================ DOCX_NSMAP = { 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', } W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' R_NS = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}' class DOCXProcessor: """Process DOCX files. CRITICAL FIX: Extracts text from hyperlinks () which paragraph.runs misses. Also splits paragraphs containing (soft line breaks) into separate blocks when they represent logically distinct paragraphs. """ HEADING_PATTERNS = { 'Title':'title','Subtitle':'subtitle', 'Heading 1':'heading1','Heading 2':'heading2', 'Heading 3':'heading3','Heading 4':'heading3', 'Heading 5':'heading3','Heading 6':'heading3', 'Heading 7':'heading3','Heading 8':'heading3','Heading 9':'heading3', } QUOTE_STYLES = {'Quote','Intense Quote','Block Text'} LIST_BULLET_STYLES = {'List Bullet','List Bullet 2','List Bullet 3'} LIST_NUMBER_STYLES = {'List Number','List Number 2','List Number 3','List Continue'} def __init__(self, docx_bytes): import docx as docx_module self.doc = docx_module.Document(io.BytesIO(docx_bytes)) self._image_cache = {} self._extract_all_images() def _extract_all_images(self): try: for rel_id, rel in self.doc.part.rels.items(): if "image" in rel.reltype: try: ip = rel.target_part ib = ip.blob; ct = ip.content_type or '' fmt = 'png' if 'jpeg' in ct or 'jpg' in ct: fmt = 'jpeg' elif 'gif' in ct: fmt = 'gif' elif 'bmp' in ct: fmt = 'bmp' elif 'tiff' in ct: fmt = 'tiff' elif 'webp' in ct: fmt = 'webp' else: pn = str(ip.partname) if hasattr(ip,'partname') else '' if '.jpg' in pn or '.jpeg' in pn: fmt = 'jpeg' elif '.gif' in pn: fmt = 'gif' self._image_cache[rel_id] = (base64.b64encode(ib).decode('utf-8'), fmt) except Exception as e: print(f" ⚠️ Image {rel_id}: {e}") except Exception as e: print(f" ⚠️ Rels error: {e}") def _get_paragraph_images(self, paragraph): images = [] try: for drawing in paragraph._element.findall('.//w:drawing', DOCX_NSMAP): for blip in drawing.findall('.//a:blip', DOCX_NSMAP): eid = blip.get(f'{R_NS}embed') if eid and eid in self._image_cache: d, f = self._image_cache[eid] images.append({'data': d, 'format': f}) except Exception as e: print(f" ⚠️ Para images: {e}") return images def _get_paragraph_segments(self, paragraph): """Extract text from paragraph as a list of SEGMENTS split by . Each segment is a list of (text, is_bold, is_italic) tuples. Segments are separated by elements (soft line breaks). This allows us to split a single containing multiple logical paragraphs (joined by ) into separate blocks. """ segments = [[]] # List of segments, each segment is a list of (text, bold, italic) # Walk all direct children of the paragraph element in order # Children can be: (run), , , etc. for child in paragraph._element: tag = child.tag if tag == f'{W_NS}r': # Direct run self._process_run_element(child, segments) elif tag == f'{W_NS}hyperlink': # Hyperlink — contains children for run_elem in child.findall(f'{W_NS}r'): self._process_run_element(run_elem, segments) elif tag == f'{W_NS}smartTag': # Smart tag — contains children for run_elem in child.findall(f'{W_NS}r'): self._process_run_element(run_elem, segments) elif tag == f'{W_NS}sdt': # Structured document tag — may contain runs for run_elem in child.iter(f'{W_NS}r'): self._process_run_element(run_elem, segments) return segments def _process_run_element(self, run_elem, segments): """Process a single element, adding text to segments. If the run contains a , start a new segment. """ # Check for first — it means a line break for elem in run_elem: if elem.tag == f'{W_NS}br': # Start a new segment segments.append([]) elif elem.tag == f'{W_NS}t': if elem.text: is_bold, is_italic = self._get_run_formatting(run_elem) segments[-1].append((elem.text, is_bold, is_italic)) def _get_run_formatting(self, run_elem): """Check if a element has bold/italic formatting.""" is_bold = False is_italic = False rpr = run_elem.find(f'{W_NS}rPr') if rpr is not None: b = rpr.find(f'{W_NS}b') if b is not None: v = b.get(f'{W_NS}val') is_bold = v is None or v not in ('0', 'false') i = rpr.find(f'{W_NS}i') if i is not None: v = i.get(f'{W_NS}val') is_italic = v is None or v not in ('0', 'false') return is_bold, is_italic def _segments_to_text(self, segment): """Convert a segment (list of (text, bold, italic) tuples) to markdown string.""" parts = [] for text, is_bold, is_italic in segment: if is_bold and is_italic: parts.append(f"***{text}***") elif is_bold: parts.append(f"**{text}**") elif is_italic: parts.append(f"*{text}*") else: parts.append(text) return ''.join(parts) def _segment_plain_text(self, segment): """Get plain text from a segment.""" return ''.join(text for text, _, _ in segment) def _get_full_paragraph_plain_text(self, paragraph): """Get ALL plain text from paragraph including hyperlinks.""" texts = [] for t_elem in paragraph._element.iter(f'{W_NS}t'): if t_elem.text: texts.append(t_elem.text) return ''.join(texts).strip() def _classify_paragraph(self, paragraph): sn = paragraph.style.name if paragraph.style else '' for p, bt in self.HEADING_PATTERNS.items(): if sn == p or sn.startswith(p): return bt if sn in self.QUOTE_STYLES: return 'quote' if sn in self.LIST_BULLET_STYLES: return 'list_item' if sn in self.LIST_NUMBER_STYLES: return 'numbered_list' if sn == 'List Paragraph': return 'list_item' if 'toc' in sn.lower(): return 'list_item' return 'paragraph' def _table_to_markdown(self, table): rd = [] for r in table.rows: rd.append([c.text.replace('|','\\|').replace('\n',' ').strip() for c in r.cells]) if not rd: return "" lines = [] for i, r in enumerate(rd): lines.append('| '+' | '.join(r)+' |') if i == 0: lines.append('| '+' | '.join(['---']*len(r))+' |') return '\n'.join(lines) def _make_block(self, block_type, text): tm = { 'title':('heading1','# '),'subtitle':('heading2','## '), 'heading1':('heading1','# '),'heading2':('heading2','## '), 'heading3':('heading3','### '),'quote':('quote','> '), 'list_item':('list_item','- '),'numbered_list':('list_item','1. '), } if block_type in tm: bt, pf = tm[block_type] return {'type': bt, 'content': f"{pf}{text}"} return {'type': 'paragraph', 'content': text} def _process_element(self, element, blocks): from docx.table import Table as DocxTable from docx.text.paragraph import Paragraph as DocxParagraph if isinstance(element, DocxParagraph): plain_text = self._get_full_paragraph_plain_text(element) if not plain_text: # Image-only paragraph for img in self._get_paragraph_images(element): blocks.append({ 'type': 'image', 'content': f"![Document Image](embedded-image.{img['format']})", 'data': img['data'], 'format': img['format'], }) return # Extract images first for img in self._get_paragraph_images(element): blocks.append({ 'type': 'image', 'content': f"![Document Image](embedded-image.{img['format']})", 'data': img['data'], 'format': img['format'], }) block_type = self._classify_paragraph(element) # Get text as segments split by segments = self._get_paragraph_segments(element) # Filter out empty segments non_empty_segments = [s for s in segments if self._segment_plain_text(s).strip()] if len(non_empty_segments) <= 1: # Single segment — normal case, one block text = self._segments_to_text(non_empty_segments[0]) if non_empty_segments else '' if text.strip(): blocks.append(self._make_block(block_type, text)) else: # Multiple segments — split into separate blocks # First segment gets the paragraph's style classification # Subsequent segments are treated as paragraphs unless they look like headings for idx, seg in enumerate(non_empty_segments): seg_text = self._segments_to_text(seg) seg_plain = self._segment_plain_text(seg).strip() if not seg_plain: continue if idx == 0: # First segment keeps original type blocks.append(self._make_block(block_type, seg_text)) else: # Subsequent segments — detect if they look like headings # (short, all bold, or specific patterns) is_all_bold = all(b for _, b, _ in seg if _) is_short = len(seg_plain) < 100 if is_all_bold and is_short and not seg_plain.endswith(('.', ':', ',')): # Looks like a sub-heading blocks.append(self._make_block('heading3', seg_text)) else: blocks.append(self._make_block('paragraph', seg_text)) elif isinstance(element, DocxTable): md = self._table_to_markdown(element) if md.strip(): blocks.append({'type': 'table', 'content': md}) def process(self): blocks = []; metadata = {'title':'','author':'','subject':''} try: cp = self.doc.core_properties metadata['title'] = cp.title or '' metadata['author'] = cp.author or '' metadata['subject'] = cp.subject or '' except: pass try: for element in self.doc.iter_inner_content(): self._process_element(element, blocks) except AttributeError: print(" ⚠️ iter_inner_content() not available, using fallback") for p in self.doc.paragraphs: self._process_element(p, blocks) for t in self.doc.tables: self._process_element(t, blocks) img_count = sum(1 for b in blocks if b.get('type') == 'image') print(f"📄 DOCX processed: {len(blocks)} blocks ({img_count} images)") return {'metadata': metadata, 'blocks': blocks} # ================================================================ # OLE2 DOC PROCESSOR # ================================================================ class DOCProcessor: def __init__(self, doc_bytes): self._doc_bytes = doc_bytes def process(self): blocks = []; metadata = {'title':'','author':'','subject':''}; imgs = [] try: import olefile ole = olefile.OleFileIO(io.BytesIO(self._doc_bytes)) try: m = ole.get_metadata() for f in ['title','author','subject']: v = getattr(m,f,None) if v: metadata[f] = v.decode('utf-8',errors='ignore') if isinstance(v,bytes) else str(v) except: pass imgs = self._extract_ole_images(ole) if ole.exists('WordDocument'): t = self._extract_text(ole) if t: for p in re.split(r'\r\n|\r|\n', t): p = p.strip() if p: blocks.append({'type':'paragraph','content':p}) ole.close() except ImportError: blocks = self._basic_extract(); imgs = self._scan_images(self._doc_bytes) except Exception as e: print(f" ⚠️ OLE failed: {e}") blocks = self._basic_extract(); imgs = self._scan_images(self._doc_bytes) if not blocks: blocks = self._basic_extract() if imgs and blocks: iv = max(1, len(blocks)//(len(imgs)+1)); r = []; ii = 0 for i, b in enumerate(blocks): if ii < len(imgs) and i > 0 and i % iv == 0: r.append(imgs[ii]); ii += 1 r.append(b) while ii < len(imgs): r.append(imgs[ii]); ii += 1 blocks = r elif imgs: blocks = imgs + blocks print(f"📄 DOC (OLE2): {len(blocks)} blocks ({len(imgs)} images)") return {'metadata': metadata, 'blocks': blocks} def _extract_ole_images(self, ole): imgs = [] try: for sp in ole.listdir(): try: d = ole.openstream(sp).read() if len(d) < 100: continue if d[:3] == b'\xff\xd8\xff': imgs.append({'type':'image','content':'![Image](embedded-image.jpeg)','data':base64.b64encode(d).decode(),'format':'jpeg'}); continue if d[:8] == b'\x89PNG\r\n\x1a\n': imgs.append({'type':'image','content':'![Image](embedded-image.png)','data':base64.b64encode(d).decode(),'format':'png'}); continue if len(d) > 2048: imgs.extend(self._scan_images(d)) except: continue except: pass seen = set(); return [i for i in imgs if (k:=i.get('data','')[:80]) and k not in seen and not seen.add(k)] def _scan_images(self, data): imgs = []; pos = 0 while pos < len(data)-3: i = data.find(b'\xff\xd8\xff', pos) if i == -1: break e = data.find(b'\xff\xd9', i+3) if e == -1: break e += 2 if 2048 < e-i < 50*1024*1024: imgs.append({'type':'image','content':'![Image](embedded-image.jpeg)','data':base64.b64encode(data[i:e]).decode(),'format':'jpeg'}) pos = e pos = 0 while pos < len(data)-8: i = data.find(b'\x89PNG\r\n\x1a\n', pos) if i == -1: break e = data.find(b'IEND\xaeB`\x82', i+8) if e == -1: break e += 8 if 1024 < e-i < 50*1024*1024: imgs.append({'type':'image','content':'![Image](embedded-image.png)','data':base64.b64encode(data[i:e]).decode(),'format':'png'}) pos = e return imgs def _extract_text(self, ole): t = '' try: if ole.exists('WordDocument'): s = ole.openstream('WordDocument').read() d = s.decode('utf-16-le',errors='ignore') c = ''.join(ch for ch in d if ch in '\r\n\t' or ch.isprintable()) if len(c) > 20: return c.strip() except: pass for sp in ole.listdir(): try: d = ole.openstream(sp).read().decode('utf-16-le',errors='ignore') c = ''.join(ch for ch in d if ch.isprintable() or ch in '\r\n\t') if len(c) > len(t): t = c except: pass return t def _basic_extract(self): blocks = [] try: d = self._doc_bytes.decode('utf-16-le',errors='ignore') c = ''.join(ch for ch in d if ch.isprintable() or ch in '\r\n\t') for p in c.split('\r'): p = p.strip() if len(p) > 3: blocks.append({'type':'paragraph','content':p}) except: pass return blocks # ================================================================ # MAIN ENTRY POINT # ================================================================ def process_docx_to_markdown(file_bytes, filename=''): fmt = detect_doc_format(file_bytes) print(f" 🔍 File: {filename} | Format: {fmt} | Size: {len(file_bytes)}") pmap = {'docx':DOCXProcessor,'mhtml':MHTMLProcessor,'html':HTMLDocProcessor,'rtf':RTFDocProcessor,'ole2':DOCProcessor} if fmt in pmap: try: r = pmap[fmt](file_bytes).process() if r.get('blocks'): ic = sum(1 for b in r['blocks'] if b.get('type')=='image') print(f" ✅ {len(r['blocks'])} blocks ({ic} images)") return {'metadata':r['metadata'],'markdown_blocks':r['blocks']} except Exception as e: import traceback; print(f" ⚠️ {fmt} failed: {e}"); traceback.print_exc() for fn, PC in [('DOCX',DOCXProcessor),('MHTML',MHTMLProcessor),('HTML',HTMLDocProcessor),('RTF',RTFDocProcessor),('OLE2',DOCProcessor)]: try: r = PC(file_bytes).process() if r.get('blocks'): print(f" ✅ Parsed as {fn}") return {'metadata':r['metadata'],'markdown_blocks':r['blocks']} except: continue return {'metadata':{'title':'','author':'','subject':''},'markdown_blocks':[{'type':'paragraph','content':'⚠️ Could not extract content. Try saving as .docx.'}]}