from __future__ import annotations import hashlib import html import re from datetime import UTC, datetime from email import policy from email.parser import BytesParser from email.utils import getaddresses, parsedate_to_datetime from pathlib import Path from uuid import NAMESPACE_URL, uuid5 from .evidence import candidate_from_parsed from .models import ( Confidence, EmailEvidenceCandidate, InboundMailboxMessage, MessageClass, ParsedMailboxMessage, ) PARSER_VERSION = "mailbox-scanner-v0.1" EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.IGNORECASE) ENHANCED_STATUS_RE = re.compile(r"\b[245]\.\d{1,3}\.\d{1,3}\b") SMTP_STATUS_RE = re.compile(r"\b[245]\d{2}\b") def parse_message_bytes( raw_bytes: bytes, *, mailbox_id: str, raw_message_ref: str | None, imap_uid: str | None = None, now: datetime | None = None, ) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]: observed_at = now or datetime.now(UTC) if not raw_bytes.strip(): return _parse_failed( mailbox_id=mailbox_id, raw_message_ref=raw_message_ref, imap_uid=imap_uid, observed_at=observed_at, reason="empty_message", ) try: msg = BytesParser(policy=policy.default).parsebytes(raw_bytes) except Exception as exc: return _parse_failed( mailbox_id=mailbox_id, raw_message_ref=raw_message_ref, imap_uid=imap_uid, observed_at=observed_at, reason=f"parser_error:{type(exc).__name__}", ) message_id = _clean_header(msg.get("Message-ID")) subject = _clean_header(msg.get("Subject")) from_address = _first_address(msg.get("From")) to_addresses = [addr for _, addr in getaddresses(msg.get_all("To", []))] received_at = _parse_date(msg.get("Date")) text = _extract_text(msg) headers_text = "\n".join(f"{key}: {value}" for key, value in msg.items()) combined = f"{headers_text}\n\n{subject or ''}\n{text}" dedup_key = _message_dedup_key( mailbox_id=mailbox_id, imap_uid=imap_uid, message_id=message_id, received_at=received_at, from_address=from_address, subject=subject, body=text, ) mailbox_message_id = str(uuid5(NAMESPACE_URL, "email-connect:message:" + dedup_key)) inbound = InboundMailboxMessage( mailbox_message_id=mailbox_message_id, mailbox_id=mailbox_id, imap_uid=imap_uid, message_id_header=message_id, received_at=received_at, from_address=from_address, to_addresses=to_addresses, subject=subject, raw_headers_ref=raw_message_ref, raw_message_ref=raw_message_ref, first_seen_at=observed_at, last_seen_at=observed_at, deduplication_key=dedup_key, ) parsed = classify_message(inbound, combined, observed_at=observed_at) candidate = candidate_from_parsed( parsed, raw_message_ref=raw_message_ref, observed_at=observed_at, occurred_at=received_at, ) return inbound, parsed, candidate def parse_message_file( path: str | Path, *, mailbox_id: str, now: datetime | None = None, ) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]: file_path = Path(path) return parse_message_bytes( file_path.read_bytes(), mailbox_id=mailbox_id, raw_message_ref=str(file_path), now=now, ) def classify_message( inbound: InboundMailboxMessage, combined_text: str, *, observed_at: datetime, ) -> ParsedMailboxMessage: text = combined_text.lower() enhanced_status = _first_match(ENHANCED_STATUS_RE, combined_text) smtp_status = _first_match(SMTP_STATUS_RE, combined_text) dsn_fields = _extract_dsn_fields(combined_text) affected = dsn_fields.get("final_recipient_email") or dsn_fields.get("original_recipient_email") if affected is None: affected = _extract_affected_recipient(combined_text) original_message_id = _extract_headerish(combined_text, "Original-Message-ID") original_recipient = dsn_fields.get("original_recipient_email") or affected notes: list[str] = [] message_class = MessageClass.UNRELATED_MESSAGE confidence = Confidence.LOW reason_code: str | None = None if _contains_any(text, ["feedback loop", "abuse report", "spam complaint", "complaint notification"]): message_class = MessageClass.COMPLAINT_OR_ABUSE confidence = Confidence.HIGH reason_code = "complaint" elif _contains_any(text, ["unsubscribe", "opt-out", "opt out", "remove me"]): message_class = MessageClass.UNSUBSCRIBE_OR_OPT_OUT confidence = Confidence.MEDIUM reason_code = "unsubscribe" elif _is_out_of_office(text): message_class = MessageClass.OUT_OF_OFFICE confidence = Confidence.MEDIUM reason_code = "auto_reply" elif _is_challenge_response(text): message_class = MessageClass.CHALLENGE_RESPONSE confidence = Confidence.MEDIUM reason_code = "challenge_response" elif _contains_any(text, ["will keep trying", "delivery delayed", "message delayed", "not yet delivered"]): message_class = MessageClass.DELAYED_DELIVERY_NOTICE confidence = Confidence.HIGH reason_code = "delayed" elif _is_dsn_like(text): message_class, confidence, reason_code = _classify_dsn(text, smtp_status, enhanced_status) elif _looks_like_human_reply(inbound, text): message_class = MessageClass.HUMAN_REPLY confidence = Confidence.MEDIUM reason_code = "reply" elif _looks_return_related(text): message_class = MessageClass.UNKNOWN_RETURN_MESSAGE confidence = Confidence.LOW reason_code = "unknown_return" notes.append("Return-channel message did not match a reliable classifier.") for key in ["original_recipient", "final_recipient", "action", "diagnostic_code", "remote_mta"]: if dsn_fields.get(key): notes.append(f"{key}={dsn_fields[key]}") if enhanced_status: notes.append(f"enhanced_status={enhanced_status}") if smtp_status: notes.append(f"smtp_status={smtp_status}") parsed_id_basis = "|".join([inbound.mailbox_message_id, PARSER_VERSION, message_class.value]) return ParsedMailboxMessage( parsed_message_id=str(uuid5(NAMESPACE_URL, "email-connect:parsed:" + parsed_id_basis)), mailbox_message_id=inbound.mailbox_message_id, parser_version=PARSER_VERSION, message_class=message_class, affected_email_address=affected, original_message_id=original_message_id, original_recipient=original_recipient, smtp_status_code=smtp_status, enhanced_status_code=enhanced_status, reason_code=reason_code, confidence=confidence, parsed_at=observed_at, notes=notes, ) def _classify_dsn( text: str, smtp_status: str | None, enhanced_status: str | None, ) -> tuple[MessageClass, Confidence, str]: if _contains_any(text, ["final failure", "giving up", "could not deliver after"]): return MessageClass.FINAL_DELIVERY_FAILURE, Confidence.HIGH, "expired_without_delivery" if (enhanced_status and enhanced_status.startswith("5.")) or (smtp_status and smtp_status.startswith("5")): return MessageClass.HARD_BOUNCE, Confidence.HIGH, "hard_bounce" if _contains_any(text, ["unknown user", "mailbox not found", "user unknown", "domain not found"]): return MessageClass.HARD_BOUNCE, Confidence.HIGH, "hard_bounce" if (enhanced_status and enhanced_status.startswith("4.")) or (smtp_status and smtp_status.startswith("4")): return MessageClass.SOFT_BOUNCE, Confidence.HIGH, "soft_bounce" if _contains_any(text, ["temporary failure", "mailbox full", "greylist", "try again later"]): return MessageClass.SOFT_BOUNCE, Confidence.MEDIUM, "soft_bounce" return MessageClass.UNKNOWN_RETURN_MESSAGE, Confidence.LOW, "unknown_dsn" def _extract_text(msg) -> str: chunks: list[str] = [] html_chunks: list[str] = [] if msg.is_multipart(): for part in msg.walk(): if part.get_content_maintype() == "multipart": continue if part.get_content_type() in {"text/plain", "message/delivery-status"}: try: chunks.append(str(part.get_content())) except Exception: continue elif part.get_content_type() == "text/html": try: html_chunks.append(_html_to_text(str(part.get_content()))) except Exception: continue else: try: content = str(msg.get_content()) if msg.get_content_type() == "text/html": html_chunks.append(_html_to_text(content)) else: chunks.append(content) except Exception: payload = msg.get_payload(decode=True) if payload: chunks.append(payload.decode(errors="replace")) if chunks: return "\n".join(chunks) return "\n".join(html_chunks) def _parse_failed( *, mailbox_id: str, raw_message_ref: str | None, imap_uid: str | None, observed_at: datetime, reason: str, ) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]: dedup_key = "|".join([mailbox_id, imap_uid or "", raw_message_ref or "", reason]) mailbox_message_id = str(uuid5(NAMESPACE_URL, "email-connect:message:" + dedup_key)) inbound = InboundMailboxMessage( mailbox_message_id=mailbox_message_id, mailbox_id=mailbox_id, imap_uid=imap_uid, message_id_header=None, received_at=None, from_address=None, to_addresses=[], subject=None, raw_headers_ref=raw_message_ref, raw_message_ref=raw_message_ref, first_seen_at=observed_at, last_seen_at=observed_at, deduplication_key=dedup_key, ) parsed_id_basis = "|".join([mailbox_message_id, PARSER_VERSION, "parse_failed"]) parsed = ParsedMailboxMessage( parsed_message_id=str(uuid5(NAMESPACE_URL, "email-connect:parsed:" + parsed_id_basis)), mailbox_message_id=mailbox_message_id, parser_version=PARSER_VERSION, message_class=MessageClass.PARSE_FAILED, affected_email_address=None, original_message_id=None, original_recipient=None, smtp_status_code=None, enhanced_status_code=None, reason_code=reason, confidence=Confidence.HIGH, parsed_at=observed_at, notes=[f"parse_failure={reason}"], ) candidate = candidate_from_parsed( parsed, raw_message_ref=raw_message_ref, observed_at=observed_at, occurred_at=None, ) return inbound, parsed, candidate def _message_dedup_key( *, mailbox_id: str, imap_uid: str | None, message_id: str | None, received_at: datetime | None, from_address: str | None, subject: str | None, body: str, ) -> str: body_hash = hashlib.sha256(body.encode("utf-8", errors="replace")).hexdigest()[:16] return "|".join( [ mailbox_id, imap_uid or "", message_id or "", received_at.isoformat() if received_at else "", from_address or "", hashlib.sha256((subject or "").encode()).hexdigest()[:12], body_hash, ] ) def _clean_header(value: str | None) -> str | None: if value is None: return None cleaned = str(value).strip() return cleaned or None def _first_address(value: str | None) -> str | None: if not value: return None addresses = getaddresses([value]) return addresses[0][1] if addresses else None def _parse_date(value: str | None) -> datetime | None: if not value: return None try: parsed = parsedate_to_datetime(value) except (TypeError, ValueError): return None if parsed.tzinfo is None: return parsed.replace(tzinfo=UTC) return parsed.astimezone(UTC) def _first_match(pattern: re.Pattern[str], text: str) -> str | None: match = pattern.search(text) return match.group(0) if match else None def _extract_headerish(text: str, name: str) -> str | None: match = re.search(rf"^{re.escape(name)}:\s*(.+)$", text, re.IGNORECASE | re.MULTILINE) return match.group(1).strip() if match else None def _extract_dsn_fields(text: str) -> dict[str, str]: fields: dict[str, str] = {} for field, key in [ ("Original-Recipient", "original_recipient"), ("Final-Recipient", "final_recipient"), ("Action", "action"), ("Diagnostic-Code", "diagnostic_code"), ("Remote-MTA", "remote_mta"), ]: value = _extract_headerish(text, field) if value: fields[key] = value if field in {"Original-Recipient", "Final-Recipient"}: match = EMAIL_RE.search(value) if match: fields[f"{key}_email"] = match.group(0).lower() return fields def _extract_affected_recipient(text: str) -> str | None: for name in ["Final-Recipient", "Original-Recipient", "X-Failed-Recipients", "Failed-Recipient"]: value = _extract_headerish(text, name) if value: match = EMAIL_RE.search(value) if match: return match.group(0).lower() match = EMAIL_RE.search(text) return match.group(0).lower() if match else None def _contains_any(text: str, needles: list[str]) -> bool: return any(needle in text for needle in needles) def _is_dsn_like(text: str) -> bool: return _contains_any( text, [ "delivery status notification", "delivery failure", "undeliverable", "returned mail", "mail delivery subsystem", "final-recipient", "diagnostic-code", "status:", ], ) def _is_out_of_office(text: str) -> bool: return _contains_any( text, [ "out of office", "auto-reply", "autoreply", "automatic reply", "vacation", "abwesenheitsnotiz", "nicht im buero", "nicht im büro", ], ) def _is_challenge_response(text: str) -> bool: return _contains_any( text, [ "challenge-response", "challenge response", "sender verification", "verify your email before your message can be delivered", "confirm you are a real person", "confirm that you sent this message", "please verify yourself", ], ) def _looks_like_human_reply(inbound: InboundMailboxMessage, text: str) -> bool: subject = (inbound.subject or "").lower() if _contains_any(text, ["auto-submitted: auto-replied", "x-autoreply", "auto-generated"]): return False return subject.startswith("re:") or _contains_any(text, ["thanks", "thank you", "i confirm", "received"]) def _looks_return_related(text: str) -> bool: return _contains_any(text, ["delivery", "mailbox", "recipient", "message", "smtp", "unsubscribe", "reply"]) def _html_to_text(value: str) -> str: without_tags = re.sub(r"<[^>]+>", " ", value) return re.sub(r"\s+", " ", html.unescape(without_tags)).strip()