generated from coulomb/repo-seed
452 lines
15 KiB
Python
452 lines
15 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import html
|
|
import re
|
|
from datetime import UTC, datetime
|
|
from email import policy
|
|
from email.parser import BytesParser
|
|
from email.utils import getaddresses, parsedate_to_datetime
|
|
from pathlib import Path
|
|
from uuid import NAMESPACE_URL, uuid5
|
|
|
|
from .evidence import candidate_from_parsed
|
|
from .models import (
|
|
Confidence,
|
|
EmailEvidenceCandidate,
|
|
InboundMailboxMessage,
|
|
MessageClass,
|
|
ParsedMailboxMessage,
|
|
)
|
|
|
|
# Version tag stored on every parsed record and mixed into parsed-message IDs.
PARSER_VERSION = "mailbox-scanner-v0.1"

# Loose, case-insensitive matcher for an e-mail address embedded in free text.
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.IGNORECASE)

# Dotted three-part status code whose first part is 2, 4 or 5 (e.g. "5.1.1").
# The lookarounds reject fragments of longer dotted numbers, such as the
# "5.0.12" inside the IP address "10.5.0.12", which would otherwise be the
# first match in header-prefixed text.
ENHANCED_STATUS_RE = re.compile(r"(?<!\.)\b[245]\.\d{1,3}\.\d{1,3}\b(?!\.\d)")

# Three-digit code starting with 2, 4 or 5 (e.g. "550"). The same lookarounds
# keep IP octets such as the "250" in "10.0.0.250" from being picked up.
SMTP_STATUS_RE = re.compile(r"(?<!\.)\b[245]\d{2}\b(?!\.\d)")
|
|
|
|
|
|
def parse_message_bytes(
    raw_bytes: bytes,
    *,
    mailbox_id: str,
    raw_message_ref: str | None,
    imap_uid: str | None = None,
    now: datetime | None = None,
) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]:
    """Parse one raw e-mail message, classify it and derive an evidence candidate.

    Args:
        raw_bytes: The complete message as received.
        mailbox_id: Identifier of the mailbox the message was read from.
        raw_message_ref: Reference to the stored raw message, if any. It is
            recorded as both ``raw_headers_ref`` and ``raw_message_ref``.
        imap_uid: IMAP UID of the message, if known.
        now: Observation timestamp; defaults to the current UTC time.

    Returns:
        ``(inbound, parsed, candidate)``. ``candidate`` is whatever
        ``candidate_from_parsed`` returns for the parsed record. Blank input,
        or any error while parsing the message or reading its headers/body,
        produces the ``_parse_failed`` triple instead of raising.
    """
    observed_at = now or datetime.now(UTC)
    failure_context = {
        "mailbox_id": mailbox_id,
        "raw_message_ref": raw_message_ref,
        "imap_uid": imap_uid,
        "observed_at": observed_at,
    }
    if not raw_bytes.strip():
        return _parse_failed(**failure_context, reason="empty_message")

    try:
        msg = BytesParser(policy=policy.default).parsebytes(raw_bytes)
        # With policy.default, header values are parsed when they are fetched,
        # so a malformed header can fail here rather than in parsebytes().
        # Keeping the reads inside the try routes that to the failure record.
        message_id = _clean_header(msg.get("Message-ID"))
        subject = _clean_header(msg.get("Subject"))
        from_address = _first_address(msg.get("From"))
        # Unparseable entries come back with an empty address; drop them.
        to_addresses = [addr for _, addr in getaddresses(msg.get_all("To", [])) if addr]
        received_at = _parse_date(msg.get("Date"))
        text = _extract_text(msg)
        headers_text = "\n".join(f"{key}: {value}" for key, value in msg.items())
    except Exception as exc:
        return _parse_failed(
            **failure_context,
            reason=f"parser_error:{type(exc).__name__}",
        )

    # Text handed to the classifier: all headers, then the subject, then the body.
    combined = f"{headers_text}\n\n{subject or ''}\n{text}"
    dedup_key = _message_dedup_key(
        mailbox_id=mailbox_id,
        imap_uid=imap_uid,
        message_id=message_id,
        received_at=received_at,
        from_address=from_address,
        subject=subject,
        body=text,
    )
    # Name-based UUID: the same dedup key always yields the same message ID.
    mailbox_message_id = str(uuid5(NAMESPACE_URL, "email-connect:message:" + dedup_key))
    inbound = InboundMailboxMessage(
        mailbox_message_id=mailbox_message_id,
        mailbox_id=mailbox_id,
        imap_uid=imap_uid,
        message_id_header=message_id,
        received_at=received_at,
        from_address=from_address,
        to_addresses=to_addresses,
        subject=subject,
        raw_headers_ref=raw_message_ref,
        raw_message_ref=raw_message_ref,
        first_seen_at=observed_at,
        last_seen_at=observed_at,
        deduplication_key=dedup_key,
    )

    parsed = classify_message(inbound, combined, observed_at=observed_at)
    candidate = candidate_from_parsed(
        parsed,
        raw_message_ref=raw_message_ref,
        observed_at=observed_at,
        occurred_at=received_at,
    )
    return inbound, parsed, candidate
|
|
|
|
|
|
def parse_message_file(
    path: str | Path,
    *,
    mailbox_id: str,
    now: datetime | None = None,
) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]:
    """Read a raw message from disk and run it through ``parse_message_bytes``.

    The stringified path is passed on as ``raw_message_ref``; no IMAP UID is
    supplied. ``now`` is forwarded unchanged.
    """
    source = Path(path)
    raw_bytes = source.read_bytes()
    return parse_message_bytes(
        raw_bytes,
        mailbox_id=mailbox_id,
        raw_message_ref=str(source),
        now=now,
    )
|
|
|
|
|
|
def classify_message(
    inbound: InboundMailboxMessage,
    combined_text: str,
    *,
    observed_at: datetime,
) -> ParsedMailboxMessage:
    """Classify a message from its combined header/body text.

    Args:
        inbound: The stored message record; supplies the message ID and subject.
        combined_text: Text to inspect. Status codes and header-like fields are
            read from it as given; keyword checks run on a lowercased copy.
        observed_at: Timestamp recorded as ``parsed_at``.

    Returns:
        A ``ParsedMailboxMessage`` carrying the class, confidence, reason code,
        extracted recipient/status details and explanatory notes.

    The rules are tried in a fixed order and the first hit wins: complaint,
    unsubscribe, out-of-office, challenge-response, delayed delivery, DSN,
    human reply, then generic return-related text. Anything else stays
    ``UNRELATED_MESSAGE`` with low confidence.

    NOTE(review): ``parse_message_bytes`` builds ``combined_text`` with every
    header line included, so header names and values take part in the keyword
    checks too - confirm that is intended.
    """
    lowered = combined_text.lower()
    enhanced_status = _first_match(ENHANCED_STATUS_RE, combined_text)
    smtp_status = _first_match(SMTP_STATUS_RE, combined_text)
    dsn_fields = _extract_dsn_fields(combined_text)

    # Prefer the address from the DSN fields; otherwise scan the whole text.
    affected = dsn_fields.get("final_recipient_email") or dsn_fields.get("original_recipient_email")
    if affected is None:
        affected = _extract_affected_recipient(combined_text)
    original_message_id = _extract_headerish(combined_text, "Original-Message-ID")
    original_recipient = dsn_fields.get("original_recipient_email") or affected
    notes: list[str] = []

    # (message class, confidence, reason code); the default means "no rule matched".
    verdict = (MessageClass.UNRELATED_MESSAGE, Confidence.LOW, None)
    if _contains_any(lowered, ["feedback loop", "abuse report", "spam complaint", "complaint notification"]):
        verdict = (MessageClass.COMPLAINT_OR_ABUSE, Confidence.HIGH, "complaint")
    elif _contains_any(lowered, ["unsubscribe", "opt-out", "opt out", "remove me"]):
        verdict = (MessageClass.UNSUBSCRIBE_OR_OPT_OUT, Confidence.MEDIUM, "unsubscribe")
    elif _is_out_of_office(lowered):
        verdict = (MessageClass.OUT_OF_OFFICE, Confidence.MEDIUM, "auto_reply")
    elif _is_challenge_response(lowered):
        verdict = (MessageClass.CHALLENGE_RESPONSE, Confidence.MEDIUM, "challenge_response")
    elif _contains_any(lowered, ["will keep trying", "delivery delayed", "message delayed", "not yet delivered"]):
        verdict = (MessageClass.DELAYED_DELIVERY_NOTICE, Confidence.HIGH, "delayed")
    elif _is_dsn_like(lowered):
        verdict = _classify_dsn(lowered, smtp_status, enhanced_status)
    elif _looks_like_human_reply(inbound, lowered):
        verdict = (MessageClass.HUMAN_REPLY, Confidence.MEDIUM, "reply")
    elif _looks_return_related(lowered):
        verdict = (MessageClass.UNKNOWN_RETURN_MESSAGE, Confidence.LOW, "unknown_return")
        notes.append("Return-channel message did not match a reliable classifier.")
    message_class, confidence, reason_code = verdict

    # Record the raw evidence behind the classification.
    notes.extend(
        f"{key}={dsn_fields[key]}"
        for key in ("original_recipient", "final_recipient", "action", "diagnostic_code", "remote_mta")
        if dsn_fields.get(key)
    )
    if enhanced_status:
        notes.append(f"enhanced_status={enhanced_status}")
    if smtp_status:
        notes.append(f"smtp_status={smtp_status}")

    # The parsed ID depends on the message, the parser version and the class.
    parsed_id_basis = "|".join([inbound.mailbox_message_id, PARSER_VERSION, message_class.value])
    parsed_message_id = str(uuid5(NAMESPACE_URL, "email-connect:parsed:" + parsed_id_basis))
    return ParsedMailboxMessage(
        parsed_message_id=parsed_message_id,
        mailbox_message_id=inbound.mailbox_message_id,
        parser_version=PARSER_VERSION,
        message_class=message_class,
        affected_email_address=affected,
        original_message_id=original_message_id,
        original_recipient=original_recipient,
        smtp_status_code=smtp_status,
        enhanced_status_code=enhanced_status,
        reason_code=reason_code,
        confidence=confidence,
        parsed_at=observed_at,
        notes=notes,
    )
|
|
|
|
|
|
def _classify_dsn(
    text: str,
    smtp_status: str | None,
    enhanced_status: str | None,
) -> tuple[MessageClass, Confidence, str]:
    """Decide which kind of delivery status notification ``text`` describes.

    Args:
        text: Message text; the caller passes it already lowercased.
        smtp_status: Three-digit status code found in the message, if any.
        enhanced_status: Dotted status code found in the message, if any.

    Returns:
        ``(message_class, confidence, reason_code)``. Checks run in this
        order: explicit "gave up" wording, class-5 codes or unknown-user
        wording, class-4 codes, temporary-failure wording, then the
        low-confidence ``unknown_dsn`` fallback.
    """

    def has_status_class(digit: str) -> bool:
        # True when either status code starts with the given class digit.
        return bool(
            (enhanced_status and enhanced_status.startswith(digit + "."))
            or (smtp_status and smtp_status.startswith(digit))
        )

    if _contains_any(text, ["final failure", "giving up", "could not deliver after"]):
        return MessageClass.FINAL_DELIVERY_FAILURE, Confidence.HIGH, "expired_without_delivery"

    permanent_wording = ["unknown user", "mailbox not found", "user unknown", "domain not found"]
    if has_status_class("5") or _contains_any(text, permanent_wording):
        return MessageClass.HARD_BOUNCE, Confidence.HIGH, "hard_bounce"

    if has_status_class("4"):
        return MessageClass.SOFT_BOUNCE, Confidence.HIGH, "soft_bounce"

    # Wording alone is weaker evidence than a status code, hence MEDIUM.
    if _contains_any(text, ["temporary failure", "mailbox full", "greylist", "try again later"]):
        return MessageClass.SOFT_BOUNCE, Confidence.MEDIUM, "soft_bounce"

    return MessageClass.UNKNOWN_RETURN_MESSAGE, Confidence.LOW, "unknown_dsn"
|
|
|
|
|
|
def _extract_text(msg) -> str:
|
|
chunks: list[str] = []
|
|
html_chunks: list[str] = []
|
|
if msg.is_multipart():
|
|
for part in msg.walk():
|
|
if part.get_content_maintype() == "multipart":
|
|
continue
|
|
if part.get_content_type() in {"text/plain", "message/delivery-status"}:
|
|
try:
|
|
chunks.append(str(part.get_content()))
|
|
except Exception:
|
|
continue
|
|
elif part.get_content_type() == "text/html":
|
|
try:
|
|
html_chunks.append(_html_to_text(str(part.get_content())))
|
|
except Exception:
|
|
continue
|
|
else:
|
|
try:
|
|
content = str(msg.get_content())
|
|
if msg.get_content_type() == "text/html":
|
|
html_chunks.append(_html_to_text(content))
|
|
else:
|
|
chunks.append(content)
|
|
except Exception:
|
|
payload = msg.get_payload(decode=True)
|
|
if payload:
|
|
chunks.append(payload.decode(errors="replace"))
|
|
if chunks:
|
|
return "\n".join(chunks)
|
|
return "\n".join(html_chunks)
|
|
|
|
|
|
def _parse_failed(
    *,
    mailbox_id: str,
    raw_message_ref: str | None,
    imap_uid: str | None,
    observed_at: datetime,
    reason: str,
) -> tuple[InboundMailboxMessage, ParsedMailboxMessage, EmailEvidenceCandidate | None]:
    """Build the result triple for a message that could not be parsed.

    The inbound record carries no header data. The parsed record is marked
    ``PARSE_FAILED`` with high confidence and ``reason`` as its reason code.

    NOTE(review): the dedup key is built only from the mailbox, UID, raw
    reference and reason, so failures that share all four get the same IDs -
    confirm that is acceptable when no UID or reference is supplied.
    """
    key_parts = (mailbox_id, imap_uid or "", raw_message_ref or "", reason)
    dedup_key = "|".join(key_parts)
    mailbox_message_id = str(uuid5(NAMESPACE_URL, "email-connect:message:" + dedup_key))

    inbound = InboundMailboxMessage(
        mailbox_message_id=mailbox_message_id,
        mailbox_id=mailbox_id,
        imap_uid=imap_uid,
        message_id_header=None,
        received_at=None,
        from_address=None,
        to_addresses=[],
        subject=None,
        raw_headers_ref=raw_message_ref,
        raw_message_ref=raw_message_ref,
        first_seen_at=observed_at,
        last_seen_at=observed_at,
        deduplication_key=dedup_key,
    )

    parsed_basis = "|".join((mailbox_message_id, PARSER_VERSION, "parse_failed"))
    parsed_message_id = str(uuid5(NAMESPACE_URL, "email-connect:parsed:" + parsed_basis))
    parsed = ParsedMailboxMessage(
        parsed_message_id=parsed_message_id,
        mailbox_message_id=mailbox_message_id,
        parser_version=PARSER_VERSION,
        message_class=MessageClass.PARSE_FAILED,
        affected_email_address=None,
        original_message_id=None,
        original_recipient=None,
        smtp_status_code=None,
        enhanced_status_code=None,
        reason_code=reason,
        confidence=Confidence.HIGH,
        parsed_at=observed_at,
        notes=[f"parse_failure={reason}"],
    )

    # No occurrence time is known for a message that could not be read.
    candidate = candidate_from_parsed(
        parsed,
        raw_message_ref=raw_message_ref,
        observed_at=observed_at,
        occurred_at=None,
    )
    return inbound, parsed, candidate
|
|
|
|
|
|
def _message_dedup_key(
|
|
*,
|
|
mailbox_id: str,
|
|
imap_uid: str | None,
|
|
message_id: str | None,
|
|
received_at: datetime | None,
|
|
from_address: str | None,
|
|
subject: str | None,
|
|
body: str,
|
|
) -> str:
|
|
body_hash = hashlib.sha256(body.encode("utf-8", errors="replace")).hexdigest()[:16]
|
|
return "|".join(
|
|
[
|
|
mailbox_id,
|
|
imap_uid or "",
|
|
message_id or "",
|
|
received_at.isoformat() if received_at else "",
|
|
from_address or "",
|
|
hashlib.sha256((subject or "").encode()).hexdigest()[:12],
|
|
body_hash,
|
|
]
|
|
)
|
|
|
|
|
|
def _clean_header(value: str | None) -> str | None:
|
|
if value is None:
|
|
return None
|
|
cleaned = str(value).strip()
|
|
return cleaned or None
|
|
|
|
|
|
def _first_address(value: str | None) -> str | None:
|
|
if not value:
|
|
return None
|
|
addresses = getaddresses([value])
|
|
return addresses[0][1] if addresses else None
|
|
|
|
|
|
def _parse_date(value: str | None) -> datetime | None:
|
|
if not value:
|
|
return None
|
|
try:
|
|
parsed = parsedate_to_datetime(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
if parsed.tzinfo is None:
|
|
return parsed.replace(tzinfo=UTC)
|
|
return parsed.astimezone(UTC)
|
|
|
|
|
|
def _first_match(pattern: re.Pattern[str], text: str) -> str | None:
|
|
match = pattern.search(text)
|
|
return match.group(0) if match else None
|
|
|
|
|
|
def _extract_headerish(text: str, name: str) -> str | None:
|
|
match = re.search(rf"^{re.escape(name)}:\s*(.+)$", text, re.IGNORECASE | re.MULTILINE)
|
|
return match.group(1).strip() if match else None
|
|
|
|
|
|
def _extract_dsn_fields(text: str) -> dict[str, str]:
    """Collect delivery-status fields that appear as ``Name: value`` lines.

    Returns:
        A dict with any of the keys ``original_recipient``,
        ``final_recipient``, ``action``, ``diagnostic_code`` and
        ``remote_mta`` whose field was found with a non-empty value. For the
        two recipient fields an extra ``<key>_email`` entry holds the
        lowercased address found inside the value, when there is one.
    """
    header_to_key = {
        "Original-Recipient": "original_recipient",
        "Final-Recipient": "final_recipient",
        "Action": "action",
        "Diagnostic-Code": "diagnostic_code",
        "Remote-MTA": "remote_mta",
    }
    recipient_headers = ("Original-Recipient", "Final-Recipient")

    extracted: dict[str, str] = {}
    for header, key in header_to_key.items():
        value = _extract_headerish(text, header)
        if not value:
            continue
        extracted[key] = value
        if header not in recipient_headers:
            continue
        # Recipient values look like "rfc822; user@host"; keep the bare address too.
        found = EMAIL_RE.search(value)
        if found:
            extracted[f"{key}_email"] = found.group(0).lower()
    return extracted
|
|
|
|
|
|
def _extract_affected_recipient(text: str) -> str | None:
    """Find the address a return message is about.

    Recipient fields are tried in order of preference; the first one whose
    value contains an address wins. As a last resort the first address found
    anywhere in ``text`` is used. The result is lowercased; ``None`` means no
    address was found at all.

    NOTE(review): when ``text`` begins with the message's own headers, the
    fallback may pick up a header address rather than a failed recipient -
    confirm this is acceptable.
    """
    preferred_fields = ("Final-Recipient", "Original-Recipient", "X-Failed-Recipients", "Failed-Recipient")
    # Generator keeps the lookups lazy, so later fields are only scanned if needed.
    field_values = (_extract_headerish(text, field) for field in preferred_fields)
    for value in field_values:
        if not value:
            continue
        found = EMAIL_RE.search(value)
        if found:
            return found.group(0).lower()

    fallback = EMAIL_RE.search(text)
    if fallback is None:
        return None
    return fallback.group(0).lower()
|
|
|
|
|
|
def _contains_any(text: str, needles: list[str]) -> bool:
|
|
return any(needle in text for needle in needles)
|
|
|
|
|
|
def _is_dsn_like(text: str) -> bool:
|
|
return _contains_any(
|
|
text,
|
|
[
|
|
"delivery status notification",
|
|
"delivery failure",
|
|
"undeliverable",
|
|
"returned mail",
|
|
"mail delivery subsystem",
|
|
"final-recipient",
|
|
"diagnostic-code",
|
|
"status:",
|
|
],
|
|
)
|
|
|
|
|
|
def _is_out_of_office(text: str) -> bool:
|
|
return _contains_any(
|
|
text,
|
|
[
|
|
"out of office",
|
|
"auto-reply",
|
|
"autoreply",
|
|
"automatic reply",
|
|
"vacation",
|
|
"abwesenheitsnotiz",
|
|
"nicht im buero",
|
|
"nicht im büro",
|
|
],
|
|
)
|
|
|
|
|
|
def _is_challenge_response(text: str) -> bool:
|
|
return _contains_any(
|
|
text,
|
|
[
|
|
"challenge-response",
|
|
"challenge response",
|
|
"sender verification",
|
|
"verify your email before your message can be delivered",
|
|
"confirm you are a real person",
|
|
"confirm that you sent this message",
|
|
"please verify yourself",
|
|
],
|
|
)
|
|
|
|
|
|
def _looks_like_human_reply(inbound: InboundMailboxMessage, text: str) -> bool:
|
|
subject = (inbound.subject or "").lower()
|
|
if _contains_any(text, ["auto-submitted: auto-replied", "x-autoreply", "auto-generated"]):
|
|
return False
|
|
return subject.startswith("re:") or _contains_any(text, ["thanks", "thank you", "i confirm", "received"])
|
|
|
|
|
|
def _looks_return_related(text: str) -> bool:
|
|
return _contains_any(text, ["delivery", "mailbox", "recipient", "message", "smtp", "unsubscribe", "reply"])
|
|
|
|
|
|
def _html_to_text(value: str) -> str:
|
|
without_tags = re.sub(r"<[^>]+>", " ", value)
|
|
return re.sub(r"\s+", " ", html.unescape(without_tags)).strip()
|