From b7591f531bc203412df3a2980c3999f30fd88d49 Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 2 Jun 2026 03:07:13 +0200 Subject: [PATCH] feat: add expected recipient reporting --- README.md | 8 + config/mailbox.example.yml | 6 + docs/email-evidence-canon.md | 3 + docs/mailbox-report-tutorial.md | 139 ++++++++++++++++++ src/email_connect/adapter_contract.py | 1 + src/email_connect/cli.py | 10 ++ src/email_connect/config.py | 16 ++ src/email_connect/models.py | 2 + src/email_connect/recipients.py | 79 ++++++++++ src/email_connect/reporting.py | 43 +++++- src/email_connect/scanner.py | 121 ++++++++++++++- src/email_connect/storage.py | 52 ++++++- tests/fixtures/expected_recipients.csv | 5 + tests/fixtures/expected_recipients.txt | 4 + tests/test_recipients.py | 31 ++++ tests/test_scanner.py | 109 ++++++++++++++ ...ecipient-reporting-and-mailbox-tutorial.md | 22 +-- 17 files changed, 629 insertions(+), 22 deletions(-) create mode 100644 docs/mailbox-report-tutorial.md create mode 100644 src/email_connect/recipients.py create mode 100644 tests/fixtures/expected_recipients.csv create mode 100644 tests/fixtures/expected_recipients.txt create mode 100644 tests/test_recipients.py diff --git a/README.md b/README.md index 0892b66..9cb8089 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,18 @@ overclaiming delivery, awareness, or coordination success. PYTHONPATH=src python3 -m unittest discover -s tests PYTHONPATH=src python3 -m email_connect.cli adapter-descriptor PYTHONPATH=src python3 -m email_connect.cli scan-mailbox --config config/mailbox.example.yml --out reports/ +PYTHONPATH=src python3 -m email_connect.cli scan-mailbox --config config/mailbox.example.yml --expected-recipients tests/fixtures/expected_recipients.txt --out reports/ ``` The example config uses `tests/fixtures/mailbox` as a mailbox source. Runtime state is written to `.email-connect/state.sqlite`; generated CSV reports are written to `reports/`. +Expected recipients are optional. When provided as a text file or CSV, reports +include a `known_recipient` column, place known recipients first, and add +`undef.no_signal` diagnostics for expected recipients with no mailbox evidence. +See [Mailbox Report Tutorial](docs/mailbox-report-tutorial.md). + For a live mailbox, set `mailbox.protocol: imap`, configure host, port, folder, and credential environment variable names, then export the credentials before running `scan-mailbox`. IMAP scans select the configured folder read-only and @@ -35,6 +41,8 @@ marking messages seen are intentionally unsupported in this MVP. - SQLite state store with scan cursor, message/evidence deduplication, and endpoint quality hints. - CSV report generation, including `--report-only-new`. +- Optional expected-recipient text/CSV input, `known_recipient` report + filtering, no-evidence diagnostics, and datetime range filtering. - Golden fixture tests for hard bounce, soft bounce, delayed delivery, final failure, complaint, unsubscribe, challenge-response, unknown return, parse-failure, out-of-office, and human reply signals. diff --git a/config/mailbox.example.yml b/config/mailbox.example.yml index 5841f8f..b3e29f2 100644 --- a/config/mailbox.example.yml +++ b/config/mailbox.example.yml @@ -15,12 +15,18 @@ scan: mode: incremental max_messages_per_run: 5000 since: null + from: null + to: null include_seen: true mark_seen: false store_raw_headers: true store_raw_body: false store_raw_message_ref: true +expected_recipients: + path: null + csv_column: email + storage: path: .email-connect/state.sqlite diff --git a/docs/email-evidence-canon.md b/docs/email-evidence-canon.md index a6cfd66..e0e64f5 100644 --- a/docs/email-evidence-canon.md +++ b/docs/email-evidence-canon.md @@ -37,6 +37,7 @@ coordination runtime decides whether those facts satisfy a coordination case. | `unknown_return_message` | `notification.endpoint.unknown` | `undef.conflicting_evidence` | | `challenge_response` | `interaction.unverified_actor_interaction` | `undef.identity_uncertain` | | `parse_failed` | `diagnostic.message.parse_failed` | `undef.parse_failed` | +| expected recipient with no evidence | `diagnostic.expected_recipient.no_evidence` | `undef.no_signal` | ## Overclaim Prevention @@ -47,6 +48,8 @@ coordination runtime decides whether those facts satisfy a coordination case. - Human reply does not prove legal acceptance. - Unknown return messages remain visible. - Parse failures are diagnostic rows, not delivery or interaction outcomes. +- Expected-recipient no-evidence rows mean no mailbox evidence was found in the + inspected range, not that notification succeeded or failed. - Scanner and proxy interactions must stay below identity-bound interaction. ## Endpoint Quality Hints diff --git a/docs/mailbox-report-tutorial.md b/docs/mailbox-report-tutorial.md new file mode 100644 index 0000000..e7284ae --- /dev/null +++ b/docs/mailbox-report-tutorial.md @@ -0,0 +1,139 @@ +# Mailbox Report Tutorial + +This tutorial shows how to generate an email-channel evidence report from a +return mailbox or from the bundled fixture mailbox. + +## 1. Verify The Scanner + +Run the tests and print the adapter descriptor: + +```bash +PYTHONPATH=src python3 -m unittest discover -s tests +PYTHONPATH=src python3 -m email_connect.cli adapter-descriptor +``` + +## 2. Start With Fixtures + +The example config uses `tests/fixtures/mailbox`: + +```bash +PYTHONPATH=src python3 -m email_connect.cli scan-mailbox \ + --config config/mailbox.example.yml \ + --full-rescan \ + --out reports/ +``` + +The scanner writes a timestamped CSV file to `reports/`. + +## 3. Configure A Live IMAP Mailbox + +Copy `config/mailbox.example.yml` and set: + +```yaml +mailbox: + protocol: imap + host: imap.example.com + port: 993 + tls: true + username_env: EMAIL_CONNECT_IMAP_USER + password_env: EMAIL_CONNECT_IMAP_PASSWORD + folder: INBOX +``` + +Then export credentials: + +```bash +export EMAIL_CONNECT_IMAP_USER='mailbox@example.com' +export EMAIL_CONNECT_IMAP_PASSWORD='app-password' +``` + +IMAP scans select the folder read-only and fetch messages with `BODY.PEEK[]`. +The scanner does not mark messages seen, move messages, or delete messages. + +## 4. Add Expected Recipients + +Expected recipients are optional. A newline-separated file can look like: + +```text +missing@example.com +recipient@example.com +``` + +Run: + +```bash +PYTHONPATH=src python3 -m email_connect.cli scan-mailbox \ + --config config/mailbox.example.yml \ + --expected-recipients recipients.txt \ + --out reports/ +``` + +CSV input is also supported: + +```csv +email,name +missing@example.com,Missing User +recipient@example.com,Known Recipient +``` + +Run: + +```bash +PYTHONPATH=src python3 -m email_connect.cli scan-mailbox \ + --config config/mailbox.example.yml \ + --expected-recipients recipients.csv \ + --expected-recipient-column email \ + --out reports/ +``` + +Invalid recipient rows are ignored and printed as warnings. + +## 5. Limit The Time Range + +Use an inclusive datetime range: + +```bash +PYTHONPATH=src python3 -m email_connect.cli scan-mailbox \ + --config config/mailbox.example.yml \ + --from 2026-06-02T10:00:00Z \ + --to 2026-06-02T11:00:00Z \ + --out reports/ +``` + +`--since` remains a compatibility alias for the lower bound. When a range is +active, messages with no parseable `Date` header are excluded because the +scanner cannot confirm that they originated inside the requested window. + +## 6. Read The Report + +Key columns: + +- `known_recipient`: `true` when the address was supplied in the expected list. +- `normalized_event_type`: the email evidence or diagnostic event. +- `assessment_category` and `assessment_subclass`: advisory interpretation. +- `affected_email_address`: the endpoint the row is about. + +Known recipients appear first by default so spreadsheet filtering is easy. + +Expected recipients with no mailbox evidence appear as: + +```text +normalized_event_type: diagnostic.expected_recipient.no_evidence +assessment_category: undef +assessment_subclass: undef.no_signal +evidence_strength: none +known_recipient: true +``` + +That row means only that no mailbox evidence was found for the supplied address +inside the inspected range. It is not evidence of delivery success, delivery +failure, recipient awareness, or legal acceptance. + +## 7. Troubleshooting + +- Empty report: check folder, time range, and whether incremental cursor state + already skipped older messages. Try `--full-rescan`. +- IMAP credential error: verify the environment variable names and values. +- Missing expected rows: check the recipient file path and CSV column name. +- Unexpected no-evidence rows: confirm that the relevant mailbox evidence is + inside the configured datetime range. diff --git a/src/email_connect/adapter_contract.py b/src/email_connect/adapter_contract.py index 959be95..f28764a 100644 --- a/src/email_connect/adapter_contract.py +++ b/src/email_connect/adapter_contract.py @@ -13,6 +13,7 @@ EMITTED_EVENT_TYPES = [ "interaction.out_of_office_received", "notification.endpoint.unknown", "diagnostic.message.parse_failed", + "diagnostic.expected_recipient.no_evidence", ] diff --git a/src/email_connect/cli.py b/src/email_connect/cli.py index 42b92be..627bdc5 100644 --- a/src/email_connect/cli.py +++ b/src/email_connect/cli.py @@ -18,6 +18,10 @@ def main(argv: list[str] | None = None) -> int: scan.add_argument("--out", default=None) scan.add_argument("--full-rescan", action="store_true") scan.add_argument("--since", default=None) + scan.add_argument("--from", dest="range_from", default=None) + scan.add_argument("--to", dest="range_to", default=None) + scan.add_argument("--expected-recipients", default=None) + scan.add_argument("--expected-recipient-column", default=None) scan.add_argument("--report-only-new", action="store_true") scan.add_argument("--dry-run", action="store_true") scan.add_argument("--fixture-dir", default=None) @@ -39,6 +43,10 @@ def main(argv: list[str] | None = None) -> int: dry_run=args.dry_run, fixture_dir=args.fixture_dir, since=args.since, + range_from=args.range_from, + range_to=args.range_to, + expected_recipients_path=args.expected_recipients, + expected_recipient_column=args.expected_recipient_column, ) print(f"scan_id={result.scan.scan_id}") print(f"messages_seen={result.scan.messages_seen}") @@ -47,6 +55,8 @@ def main(argv: list[str] | None = None) -> int: print(f"evidence_events_created={result.scan.evidence_events_created}") if result.report_path: print(f"report_path={Path(result.report_path)}") + for warning in result.warnings: + print(f"warning={warning}") return 0 return 2 diff --git a/src/email_connect/config.py b/src/email_connect/config.py index da4716a..587cb28 100644 --- a/src/email_connect/config.py +++ b/src/email_connect/config.py @@ -22,6 +22,8 @@ class ScanConfig: mode: str = "incremental" max_messages_per_run: int = 5000 since: str | None = None + range_from: str | None = None + range_to: str | None = None include_seen: bool = True mark_seen: bool = False store_raw_headers: bool = True @@ -47,6 +49,12 @@ class SourceConfig: fixture_dir: str | None = None +@dataclass(frozen=True) +class ExpectedRecipientsConfig: + path: str | None = None + csv_column: str = "email" + + @dataclass(frozen=True) class AppConfig: mailbox: MailboxConfig @@ -54,6 +62,7 @@ class AppConfig: storage: StorageConfig reports: ReportsConfig source: SourceConfig = SourceConfig() + expected_recipients: ExpectedRecipientsConfig = ExpectedRecipientsConfig() def load_config(path: str | Path) -> AppConfig: @@ -63,6 +72,7 @@ def load_config(path: str | Path) -> AppConfig: storage = data.get("storage", {}) reports = data.get("reports", {}) source = data.get("source", {}) + expected_recipients = data.get("expected_recipients", {}) return AppConfig( mailbox=MailboxConfig( id=str(mailbox.get("id", "return-mailbox-default")), @@ -78,6 +88,8 @@ def load_config(path: str | Path) -> AppConfig: mode=str(scan.get("mode", "incremental")), max_messages_per_run=int(scan.get("max_messages_per_run", 5000)), since=scan.get("since"), + range_from=scan.get("from") or scan.get("range_from"), + range_to=scan.get("to") or scan.get("range_to"), include_seen=bool(scan.get("include_seen", True)), mark_seen=bool(scan.get("mark_seen", False)), store_raw_headers=bool(scan.get("store_raw_headers", True)), @@ -92,6 +104,10 @@ def load_config(path: str | Path) -> AppConfig: timestamp_timezone=str(reports.get("timestamp_timezone", "UTC")), ), source=SourceConfig(fixture_dir=source.get("fixture_dir")), + expected_recipients=ExpectedRecipientsConfig( + path=expected_recipients.get("path"), + csv_column=str(expected_recipients.get("csv_column", "email")), + ), ) diff --git a/src/email_connect/models.py b/src/email_connect/models.py index cced3eb..eab817b 100644 --- a/src/email_connect/models.py +++ b/src/email_connect/models.py @@ -57,6 +57,8 @@ class MailboxScan: evidence_events_created: int = 0 report_path: str | None = None since: datetime | None = None + range_start: datetime | None = None + range_end: datetime | None = None @dataclass(frozen=True) diff --git a/src/email_connect/recipients.py b/src/email_connect/recipients.py new file mode 100644 index 0000000..4701321 --- /dev/null +++ b/src/email_connect/recipients.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import csv +import re +from dataclasses import dataclass, field +from pathlib import Path + +EMAIL_RE = re.compile(r"^[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}$", re.IGNORECASE) + + +@dataclass(frozen=True) +class ExpectedRecipients: + addresses: tuple[str, ...] = () + invalid_entries: tuple[str, ...] = () + + +def load_expected_recipients( + path: str | Path | None, + *, + csv_column: str | None = "email", +) -> ExpectedRecipients: + if not path: + return ExpectedRecipients() + + recipient_path = Path(path) + if recipient_path.suffix.lower() == ".csv": + return _load_csv_recipients(recipient_path, csv_column=csv_column or "email") + return _load_line_recipients(recipient_path) + + +def normalize_email_address(value: str | None) -> str | None: + if value is None: + return None + normalized = value.strip().lower() + if not normalized: + return None + return normalized if EMAIL_RE.fullmatch(normalized) else None + + +@dataclass +class _RecipientCollector: + addresses: dict[str, None] = field(default_factory=dict) + invalid_entries: list[str] = field(default_factory=list) + + def add(self, value: str | None, *, source: str) -> None: + normalized = normalize_email_address(value) + if normalized: + self.addresses[normalized] = None + return + if value and value.strip(): + self.invalid_entries.append(f"{source}: {value.strip()}") + + def result(self) -> ExpectedRecipients: + return ExpectedRecipients( + addresses=tuple(self.addresses.keys()), + invalid_entries=tuple(self.invalid_entries), + ) + + +def _load_line_recipients(path: Path) -> ExpectedRecipients: + collector = _RecipientCollector() + for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + collector.add(line, source=f"{path}:{line_number}") + return collector.result() + + +def _load_csv_recipients(path: Path, *, csv_column: str) -> ExpectedRecipients: + collector = _RecipientCollector() + with path.open(newline="", encoding="utf-8") as fh: + reader = csv.DictReader(fh) + if reader.fieldnames is None: + return collector.result() + column = csv_column if csv_column in reader.fieldnames else reader.fieldnames[0] + for line_number, row in enumerate(reader, start=2): + collector.add(row.get(column), source=f"{path}:{line_number}:{column}") + return collector.result() diff --git a/src/email_connect/reporting.py b/src/email_connect/reporting.py index d901eb8..01f6509 100644 --- a/src/email_connect/reporting.py +++ b/src/email_connect/reporting.py @@ -20,6 +20,7 @@ REPORT_COLUMNS = [ "assessment_category", "assessment_subclass", "affected_email_address", + "known_recipient", "original_message_id", "original_recipient", "smtp_status_code", @@ -49,16 +50,17 @@ def write_evidence_report( scan_id: str, mailbox_id: str, generated_at: datetime | None = None, + expected_recipients: set[str] | None = None, ) -> Path: generated = generated_at or datetime.now(UTC) out_dir = Path(output_dir) out_dir.mkdir(parents=True, exist_ok=True) - path = out_dir / report_filename(generated) + path = _unique_report_path(out_dir / report_filename(generated)) with path.open("w", newline="", encoding="utf-8") as fh: writer = csv.DictWriter(fh, fieldnames=REPORT_COLUMNS) writer.writeheader() - for row in rows: + for row in _ordered_rows(rows, expected_recipients=expected_recipients or set()): writer.writerow(_report_row(row, scan_id=scan_id, mailbox_id=mailbox_id, generated_at=generated)) return path @@ -66,6 +68,7 @@ def write_evidence_report( def _report_row(row: dict, *, scan_id: str, mailbox_id: str, generated_at: datetime) -> dict: metadata = _json(row.get("metadata_json")) notes = _json(row.get("notes_json")) + known_recipient = _known_recipient(row, expected_recipients=set(row.get("_expected_recipients", []))) return { "report_generated_at": generated_at.isoformat(), "scan_id": scan_id, @@ -81,6 +84,7 @@ def _report_row(row: dict, *, scan_id: str, mailbox_id: str, generated_at: datet "assessment_category": row.get("assessment_category", ""), "assessment_subclass": row.get("assessment_subclass", ""), "affected_email_address": row.get("affected_email_address") or "", + "known_recipient": "true" if known_recipient else "false", "original_message_id": row.get("original_message_id") or "", "original_recipient": metadata.get("original_recipient", ""), "smtp_status_code": metadata.get("smtp_status_code") or "", @@ -98,6 +102,29 @@ def _report_row(row: dict, *, scan_id: str, mailbox_id: str, generated_at: datet } +def _ordered_rows(rows: list[dict], *, expected_recipients: set[str]) -> list[dict]: + enriched = [dict(row, _expected_recipients=tuple(expected_recipients)) for row in rows] + if not expected_recipients: + return enriched + return sorted( + enriched, + key=lambda row: ( + not _known_recipient(row, expected_recipients=expected_recipients), + str(row.get("affected_email_address") or ""), + str(row.get("observed_at") or ""), + str(row.get("event_type") or ""), + str(row.get("deduplication_key") or ""), + ), + ) + + +def _known_recipient(row: dict, *, expected_recipients: set[str]) -> bool: + if row.get("known_recipient") is True: + return True + address = str(row.get("affected_email_address") or "").lower() + return bool(address and address in expected_recipients) + + def _json(value: str | None) -> dict | list: if not value: return {} @@ -105,3 +132,15 @@ def _json(value: str | None) -> dict | list: return json.loads(value) except json.JSONDecodeError: return {} + + +def _unique_report_path(path: Path) -> Path: + if not path.exists(): + return path + stem = path.stem + suffix = path.suffix + for index in range(1, 1000): + candidate = path.with_name(f"{stem}-{index:02d}{suffix}") + if not candidate.exists(): + return candidate + raise RuntimeError(f"Could not allocate unique report filename for {path}") diff --git a/src/email_connect/scanner.py b/src/email_connect/scanner.py index 9f697fd..60c715c 100644 --- a/src/email_connect/scanner.py +++ b/src/email_connect/scanner.py @@ -1,7 +1,9 @@ from __future__ import annotations +import json from dataclasses import dataclass from datetime import UTC, datetime +from pathlib import Path from uuid import uuid4 from .config import AppConfig @@ -9,6 +11,7 @@ from .evidence import endpoint_quality_from_candidate from .mailbox import source_for_config from .models import MailboxScan from .parser import parse_message_bytes +from .recipients import load_expected_recipients from .reporting import write_evidence_report from .storage import StateStore @@ -17,6 +20,7 @@ from .storage import StateStore class ScanResult: scan: MailboxScan report_path: Path | None + warnings: tuple[str, ...] = () def scan_mailbox( @@ -28,10 +32,24 @@ def scan_mailbox( dry_run: bool = False, fixture_dir: str | None = None, since: str | None = None, + range_from: str | None = None, + range_to: str | None = None, + expected_recipients_path: str | None = None, + expected_recipient_column: str | None = None, ) -> ScanResult: started_at = datetime.now(UTC) scan_id = str(uuid4()) - since_at = _parse_since(since or config.scan.since) + range_start = _parse_datetime(range_from or since or config.scan.range_from or config.scan.since) + range_end = _parse_datetime(range_to or config.scan.range_to) + if range_start and range_end and range_start > range_end: + raise ValueError("scan datetime range lower bound must be before or equal to upper bound.") + since_at = range_start + expected = load_expected_recipients( + expected_recipients_path or config.expected_recipients.path, + csv_column=expected_recipient_column or config.expected_recipients.csv_column, + ) + expected_addresses = set(expected.addresses) + warnings = tuple(f"invalid expected recipient ignored: {entry}" for entry in expected.invalid_entries) source = source_for_config(config, fixture_dir_override=fixture_dir) store = StateStore(config.storage.path) @@ -58,7 +76,7 @@ def scan_mailbox( raw_message_ref=message.raw_message_ref, imap_uid=message.imap_uid, ) - if since_at and inbound.received_at and inbound.received_at < since_at: + if not _in_range(inbound.received_at, range_start=range_start, range_end=range_end): continue if dry_run: messages_parsed += 1 @@ -91,14 +109,29 @@ def scan_mailbox( report_path = None if not dry_run: + range_evidence_rows = store.evidence_rows(range_start=range_start, range_end=range_end) report_rows = store.evidence_rows( deduplication_keys=new_evidence_keys if report_only_new else None, + range_start=range_start, + range_end=range_end, ) + report_rows = [ + *report_rows, + *_no_evidence_rows( + mailbox_id=config.mailbox.id, + expected_addresses=expected_addresses, + evidence_rows=range_evidence_rows, + observed_at=datetime.now(UTC), + range_start=range_start, + range_end=range_end, + ), + ] report_path = write_evidence_report( report_rows, output_dir=output_dir or config.reports.output_dir, scan_id=scan_id, mailbox_id=config.mailbox.id, + expected_recipients=expected_addresses, ) finished_at = datetime.now(UTC) scan = MailboxScan( @@ -115,10 +148,12 @@ def scan_mailbox( evidence_events_created=evidence_created, report_path=str(report_path) if report_path else None, since=since_at, + range_start=range_start, + range_end=range_end, ) if not dry_run: store.insert_scan(scan) - return ScanResult(scan=scan, report_path=report_path) + return ScanResult(scan=scan, report_path=report_path, warnings=warnings) finally: store.close() @@ -142,7 +177,7 @@ def _enrich_candidate(candidate, inbound, parsed): ) -def _parse_since(value: str | None) -> datetime | None: +def _parse_datetime(value: str | None) -> datetime | None: if not value: return None normalized = value.strip() @@ -154,3 +189,81 @@ def _parse_since(value: str | None) -> datetime | None: if parsed.tzinfo is None: return parsed.replace(tzinfo=UTC) return parsed.astimezone(UTC) + + +def _in_range( + received_at: datetime | None, + *, + range_start: datetime | None, + range_end: datetime | None, +) -> bool: + if range_start is None and range_end is None: + return True + if received_at is None: + return False + if range_start is not None and received_at < range_start: + return False + if range_end is not None and received_at > range_end: + return False + return True + + +def _no_evidence_rows( + *, + mailbox_id: str, + expected_addresses: set[str], + evidence_rows: list[dict], + observed_at: datetime, + range_start: datetime | None, + range_end: datetime | None, +) -> list[dict]: + if not expected_addresses: + return [] + known_evidence_addresses = { + str(row.get("affected_email_address") or "").lower() + for row in evidence_rows + if row.get("affected_email_address") + } + rows = [] + for address in sorted(expected_addresses - known_evidence_addresses): + rows.append(_no_evidence_row(mailbox_id, address, observed_at, range_start=range_start, range_end=range_end)) + return rows + + +def _no_evidence_row( + mailbox_id: str, + address: str, + observed_at: datetime, + *, + range_start: datetime | None, + range_end: datetime | None, +) -> dict: + range_key = "|".join([ + range_start.isoformat() if range_start else "", + range_end.isoformat() if range_end else "", + ]) + return { + "mailbox_message_id": "", + "event_type": "diagnostic.expected_recipient.no_evidence", + "assessment_category": "undef", + "assessment_subclass": "undef.no_signal", + "affected_email_address": address, + "original_message_id": "", + "confidence": "high", + "evidence_strength": "none", + "occurred_at": "", + "observed_at": observed_at.isoformat(), + "deduplication_key": f"{mailbox_id}|expected_recipient|no_evidence|{address}|{range_key}", + "raw_message_ref": "", + "notes_json": json.dumps([ + "Expected recipient was supplied by the operator; no mailbox evidence was found in the inspected range.", + "This is not evidence of delivery success or delivery failure.", + ]), + "metadata_json": json.dumps({ + "message_class": "expected_recipient_no_evidence", + "original_recipient": address, + "range_start": range_start.isoformat() if range_start else None, + "range_end": range_end.isoformat() if range_end else None, + }), + "known_recipient": True, + } diff --git a/src/email_connect/storage.py b/src/email_connect/storage.py index b2577e1..b61125d 100644 --- a/src/email_connect/storage.py +++ b/src/email_connect/storage.py @@ -42,7 +42,9 @@ class StateStore: messages_parsed integer not null, evidence_events_created integer not null, report_path text, - since text + since text, + range_start text, + range_end text ); create table if not exists mailbox_messages ( @@ -121,8 +123,18 @@ class StateStore: ); """ ) + self._ensure_column("mailbox_scans", "range_start", "text") + self._ensure_column("mailbox_scans", "range_end", "text") self.conn.commit() + def _ensure_column(self, table: str, column: str, column_type: str) -> None: + columns = { + str(row["name"]) + for row in self.conn.execute(f"pragma table_info({table})").fetchall() + } + if column not in columns: + self.conn.execute(f"alter table {table} add column {column} {column_type}") + def upsert_message(self, message: InboundMailboxMessage) -> bool: existing = self.conn.execute( "select mailbox_message_id from mailbox_messages where deduplication_key = ?", @@ -214,7 +226,7 @@ class StateStore: def insert_scan(self, scan: MailboxScan) -> None: self.conn.execute( """ - insert or replace into mailbox_scans values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + insert or replace into mailbox_scans values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( scan.scan_id, @@ -230,6 +242,8 @@ class StateStore: scan.evidence_events_created, scan.report_path, _dt(scan.since), + _dt(scan.range_start), + _dt(scan.range_end), ), ) self.conn.commit() @@ -304,7 +318,13 @@ class StateStore: ).fetchall() return [dict(row) for row in rows] - def evidence_rows(self, *, deduplication_keys: list[str] | None = None) -> list[dict]: + def evidence_rows( + self, + *, + deduplication_keys: list[str] | None = None, + range_start: datetime | None = None, + range_end: datetime | None = None, + ) -> list[dict]: if deduplication_keys is not None: if not deduplication_keys: return [] @@ -317,9 +337,9 @@ class StateStore: """, deduplication_keys, ).fetchall() - return [dict(row) for row in rows] + return _filter_rows_by_range([dict(row) for row in rows], range_start=range_start, range_end=range_end) rows = self.conn.execute("select * from evidence_candidates order by observed_at, event_type").fetchall() - return [dict(row) for row in rows] + return _filter_rows_by_range([dict(row) for row in rows], range_start=range_start, range_end=range_end) def _dt(value: datetime | None) -> str | None: @@ -330,6 +350,28 @@ def _parse_dt(value: str | None) -> datetime | None: return datetime.fromisoformat(value) if value else None +def _filter_rows_by_range( + rows: list[dict], + *, + range_start: datetime | None, + range_end: datetime | None, +) -> list[dict]: + if range_start is None and range_end is None: + return rows + return [row for row in rows if _row_in_range(row, range_start=range_start, range_end=range_end)] + + +def _row_in_range(row: dict, *, range_start: datetime | None, range_end: datetime | None) -> bool: + occurred_at = _parse_dt(row.get("occurred_at")) + if occurred_at is None: + return False + if range_start is not None and occurred_at < range_start: + return False + if range_end is not None and occurred_at > range_end: + return False + return True + + def _merge_endpoint_quality( existing, update: EndpointQualityUpdate, diff --git a/tests/fixtures/expected_recipients.csv b/tests/fixtures/expected_recipients.csv new file mode 100644 index 0000000..2c21e5f --- /dev/null +++ b/tests/fixtures/expected_recipients.csv @@ -0,0 +1,5 @@ +email,name +optout@example.com,Opt Out +csv-absent@example.com,Missing From Mailbox +OPTOut@example.com,Duplicate Case Variant +not-an-address,Invalid diff --git a/tests/fixtures/expected_recipients.txt b/tests/fixtures/expected_recipients.txt new file mode 100644 index 0000000..a6c96df --- /dev/null +++ b/tests/fixtures/expected_recipients.txt @@ -0,0 +1,4 @@ +missing@example.com +absent@example.com +MISSING@example.com +not-an-address diff --git a/tests/test_recipients.py b/tests/test_recipients.py new file mode 100644 index 0000000..a5296a3 --- /dev/null +++ b/tests/test_recipients.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import unittest +from pathlib import Path + +from email_connect.recipients import load_expected_recipients, normalize_email_address + + +FIXTURES = Path(__file__).parent / "fixtures" + + +class RecipientTests(unittest.TestCase): + def test_normalizes_email_addresses(self) -> None: + self.assertEqual(normalize_email_address(" USER@Example.COM "), "user@example.com") + self.assertIsNone(normalize_email_address("not-an-address")) + + def test_loads_line_separated_recipients(self) -> None: + recipients = load_expected_recipients(FIXTURES / "expected_recipients.txt") + + self.assertEqual(recipients.addresses, ("missing@example.com", "absent@example.com")) + self.assertEqual(len(recipients.invalid_entries), 1) + + def test_loads_csv_recipients(self) -> None: + recipients = load_expected_recipients(FIXTURES / "expected_recipients.csv", csv_column="email") + + self.assertEqual(recipients.addresses, ("optout@example.com", "csv-absent@example.com")) + self.assertEqual(len(recipients.invalid_entries), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_scanner.py b/tests/test_scanner.py index e484e56..687d210 100644 --- a/tests/test_scanner.py +++ b/tests/test_scanner.py @@ -11,6 +11,7 @@ from email_connect.storage import StateStore FIXTURES = Path(__file__).parent / "fixtures" / "mailbox" +RECIPIENT_FIXTURES = Path(__file__).parent / "fixtures" class ScannerTests(unittest.TestCase): @@ -38,6 +39,10 @@ class ScannerTests(unittest.TestCase): self.assertEqual(full.scan.messages_new, 0) self.assertEqual(full.scan.evidence_events_created, 0) self.assertTrue(first.report_path and first.report_path.exists()) + with first.report_path.open(newline="", encoding="utf-8") as fh: + first_rows = list(DictReader(fh)) + self.assertTrue(first_rows) + self.assertTrue(all(row["known_recipient"] == "false" for row in first_rows)) self.assertTrue(full.report_path and full.report_path.exists()) with full.report_path.open(newline="", encoding="utf-8") as fh: self.assertEqual(list(DictReader(fh)), []) @@ -65,6 +70,110 @@ class ScannerTests(unittest.TestCase): self.assertEqual(rows["complained@example.com"]["suppression_state"], "suppressed") self.assertEqual(rows["optout@example.com"]["suppression_state"], "opted_out") + def test_expected_recipients_sort_first_and_get_no_evidence_rows(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + config = AppConfig( + mailbox=MailboxConfig(id="test-mailbox", protocol="fixture"), + scan=ScanConfig(), + storage=StorageConfig(path=str(root / "state.sqlite")), + reports=ReportsConfig(output_dir=str(root / "reports")), + source=SourceConfig(fixture_dir=str(FIXTURES)), + ) + result = scan_mailbox( + config, + full_rescan=True, + expected_recipients_path=str(RECIPIENT_FIXTURES / "expected_recipients.txt"), + ) + + self.assertEqual(len(result.warnings), 1) + self.assertTrue(result.report_path and result.report_path.exists()) + with result.report_path.open(newline="", encoding="utf-8") as fh: + rows = list(DictReader(fh)) + + self.assertGreater(len(rows), 2) + known_flags = [row["known_recipient"] for row in rows] + self.assertEqual(known_flags, sorted(known_flags, reverse=True)) + missing_rows = [row for row in rows if row["affected_email_address"] == "missing@example.com"] + self.assertTrue(missing_rows) + self.assertTrue(all(row["known_recipient"] == "true" for row in missing_rows)) + + absent_rows = [row for row in rows if row["affected_email_address"] == "absent@example.com"] + self.assertEqual(len(absent_rows), 1) + self.assertEqual(absent_rows[0]["normalized_event_type"], "diagnostic.expected_recipient.no_evidence") + self.assertEqual(absent_rows[0]["assessment_category"], "undef") + self.assertEqual(absent_rows[0]["assessment_subclass"], "undef.no_signal") + self.assertEqual(absent_rows[0]["evidence_strength"], "none") + + store = StateStore(config.storage.path) + try: + quality_addresses = {row["affected_email_address"] for row in store.endpoint_quality_rows()} + finally: + store.close() + self.assertNotIn("absent@example.com", quality_addresses) + + def test_csv_expected_recipients_are_supported(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + config = AppConfig( + mailbox=MailboxConfig(id="test-mailbox", protocol="fixture"), + scan=ScanConfig(), + storage=StorageConfig(path=str(root / "state.sqlite")), + reports=ReportsConfig(output_dir=str(root / "reports")), + source=SourceConfig(fixture_dir=str(FIXTURES)), + ) + result = scan_mailbox( + config, + full_rescan=True, + expected_recipients_path=str(RECIPIENT_FIXTURES / "expected_recipients.csv"), + expected_recipient_column="email", + ) + + self.assertEqual(len(result.warnings), 1) + self.assertTrue(result.report_path and result.report_path.exists()) + with result.report_path.open(newline="", encoding="utf-8") as fh: + rows = list(DictReader(fh)) + + optout_rows = [row for row in rows if row["affected_email_address"] == "optout@example.com"] + self.assertTrue(optout_rows) + self.assertTrue(all(row["known_recipient"] == "true" for row in optout_rows)) + csv_absent = [row for row in rows if row["affected_email_address"] == "csv-absent@example.com"] + self.assertEqual(csv_absent[0]["normalized_event_type"], "diagnostic.expected_recipient.no_evidence") + + def test_datetime_range_excludes_messages_outside_range(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + expected_path = root / "expected.txt" + expected_path.write_text("complained@example.com\nmissing@example.com\n", encoding="utf-8") + config = AppConfig( + mailbox=MailboxConfig(id="test-mailbox", protocol="fixture"), + scan=ScanConfig(), + storage=StorageConfig(path=str(root / "state.sqlite")), + reports=ReportsConfig(output_dir=str(root / "reports")), + source=SourceConfig(fixture_dir=str(FIXTURES)), + ) + result = scan_mailbox( + config, + full_rescan=True, + expected_recipients_path=str(expected_path), + range_from="2026-06-02T10:04:00Z", + range_to="2026-06-02T10:04:00Z", + ) + + self.assertEqual(result.scan.messages_seen, 11) + self.assertEqual(result.scan.messages_parsed, 1) + self.assertTrue(result.report_path and result.report_path.exists()) + with result.report_path.open(newline="", encoding="utf-8") as fh: + rows = list(DictReader(fh)) + + self.assertEqual({row["affected_email_address"] for row in rows}, {"complained@example.com", "missing@example.com"}) + complaint = [row for row in rows if row["affected_email_address"] == "complained@example.com"][0] + missing = [row for row in rows if row["affected_email_address"] == "missing@example.com"][0] + self.assertEqual(complaint["normalized_event_type"], "notification.channel.complaint_received") + self.assertEqual(missing["normalized_event_type"], "diagnostic.expected_recipient.no_evidence") + self.assertEqual(result.scan.range_start.isoformat(), "2026-06-02T10:04:00+00:00") + self.assertEqual(result.scan.range_end.isoformat(), "2026-06-02T10:04:00+00:00") + if __name__ == "__main__": unittest.main() diff --git a/workplans/EMAIL-WP-0003-expected-recipient-reporting-and-mailbox-tutorial.md b/workplans/EMAIL-WP-0003-expected-recipient-reporting-and-mailbox-tutorial.md index df08f9e..59e2e96 100644 --- a/workplans/EMAIL-WP-0003-expected-recipient-reporting-and-mailbox-tutorial.md +++ b/workplans/EMAIL-WP-0003-expected-recipient-reporting-and-mailbox-tutorial.md @@ -4,7 +4,7 @@ type: workplan title: "Expected Recipient Reporting and Mailbox Tutorial" domain: custodian repo: email-connect -status: active +status: finished owner: codex topic_slug: custodian created: "2026-06-02" @@ -122,9 +122,9 @@ The scanner should support an optional inclusive datetime range: Messages outside the range must be excluded before parsing and evidence generation whenever the message timestamp is available. The range should also be -usable from config. If a message has no parseable timestamp, the implementation -should either include it with a diagnostic note or exclude it only when the -behavior is explicitly documented and tested. +usable from config. If a message has no parseable timestamp while a range is +active, it is excluded because the scanner cannot confirm that it originated +inside the requested window. Existing `--since` behavior may be retained as a compatibility alias for the lower bound, but the new range should be expressed clearly in documentation. @@ -146,7 +146,7 @@ email-connect scan-mailbox --config config/mailbox.yml --from 2026-06-01T00:00:0 ```task id: EMAIL-WP-0003-T01 -status: todo +status: done priority: high state_hub_task_id: "d1cd0de0-cbd5-4e8d-8179-000ba10e5506" ``` @@ -174,7 +174,7 @@ Invalid recipient rows are visible as warnings or diagnostics. ```task id: EMAIL-WP-0003-T02 -status: todo +status: done priority: high state_hub_task_id: "3d7d3bb8-4118-4158-b874-b4e0527eaa85" ``` @@ -201,7 +201,7 @@ unknown-recipient rows by default. ```task id: EMAIL-WP-0003-T03 -status: todo +status: done priority: high state_hub_task_id: "aa737837-2f19-4fbf-9920-f98413bd9779" ``` @@ -230,7 +230,7 @@ no-signal diagnostics, not as failures or successes. ```task id: EMAIL-WP-0003-T04 -status: todo +status: done priority: medium state_hub_task_id: "731cf592-1bbe-4143-b21b-721af281528c" ``` @@ -255,7 +255,7 @@ or partial expected recipients. ```task id: EMAIL-WP-0003-T05 -status: todo +status: done priority: high state_hub_task_id: "22585e83-d995-42d9-9ab2-c383b055fbb8" ``` @@ -284,7 +284,7 @@ message timestamp falls within the range according to the documented rules. ```task id: EMAIL-WP-0003-T06 -status: todo +status: done priority: high state_hub_task_id: "f30cd5b9-5035-42b4-9eca-a104e2b26ecb" ``` @@ -313,7 +313,7 @@ and datetime range filtering. ```task id: EMAIL-WP-0003-T07 -status: todo +status: done priority: medium state_hub_task_id: "00a29cb9-ac5a-4784-a9c4-7f2d4905405c" ```