"""Public-safe dedup pipeline sketch for C++ corpora.

This sample keeps the contract narrow:
- exact duplicates are removed first
- near-duplicate candidates come from token shingles
- MinHash-style signatures feed an LSH candidate pass
- canonical representatives are chosen deterministically

It is intentionally small and dependency-free. The production lane uses
stronger batching, persistence, and provenance plumbing around the same steps.
"""

from __future__ import annotations

from dataclasses import dataclass
from hashlib import blake2b


@dataclass(frozen=True)
class NormalizedFile:
    path: str
    tokens: tuple[str, ...]


def exact_digest(file: NormalizedFile) -> str:
    payload = "\n".join(file.tokens).encode("utf-8")
    return blake2b(payload, digest_size=16).hexdigest()


def token_shingles(tokens: tuple[str, ...], *, k: int = 5) -> set[tuple[str, ...]]:
    if len(tokens) < k:
        return {tokens}
    return {tuple(tokens[idx : idx + k]) for idx in range(len(tokens) - k + 1)}


def minhash_signature(
    shingles: set[tuple[str, ...]], *, permutations: int = 128
) -> tuple[int, ...]:
    signature: list[int] = []
    ordered = sorted(" ".join(shingle) for shingle in shingles)
    for seed in range(permutations):
        signature.append(
            min(
                int(
                    blake2b(
                        f"{seed}:{item}".encode("utf-8"),
                        digest_size=8,
                    ).hexdigest(),
                    16,
                )
                for item in ordered
            )
        )
    return tuple(signature)


def lsh_buckets(
    signature: tuple[int, ...], *, bands: int = 20, rows: int = 6
) -> list[tuple[int, tuple[int, ...]]]:
    buckets: list[tuple[int, tuple[int, ...]]] = []
    for band in range(bands):
        start = band * rows
        stop = start + rows
        chunk = signature[start:stop]
        if len(chunk) == rows:
            buckets.append((band, chunk))
    return buckets


def canonical_representative(group: list[NormalizedFile]) -> NormalizedFile:
    return sorted(group, key=lambda file: file.path)[0]


def dedup_pipeline(files: list[NormalizedFile]) -> dict[str, object]:
    exact_groups: dict[str, list[NormalizedFile]] = {}
    for file in files:
        exact_groups.setdefault(exact_digest(file), []).append(file)

    exact_representatives = [
        canonical_representative(group) for group in exact_groups.values()
    ]

    near_duplicate_buckets: dict[tuple[int, tuple[int, ...]], list[str]] = {}
    for file in exact_representatives:
        shingles = token_shingles(file.tokens, k=5)
        signature = minhash_signature(shingles, permutations=120)
        for bucket in lsh_buckets(signature, bands=20, rows=6):
            near_duplicate_buckets.setdefault(bucket, []).append(file.path)

    candidate_groups = {
        bucket: sorted(paths)
        for bucket, paths in near_duplicate_buckets.items()
        if len(paths) > 1
    }

    return {
        "exact_groups": {
            digest: [file.path for file in group]
            for digest, group in exact_groups.items()
        },
        "canonical_paths": [file.path for file in exact_representatives],
        "near_duplicate_candidates": candidate_groups,
    }


def _demo() -> None:
    files = [
        NormalizedFile(
            path="upstream/json.hpp",
            tokens=("template", "<", "class", "T", ">", "struct", "json"),
        ),
        NormalizedFile(
            path="vendor/json.hpp",
            tokens=("template", "<", "class", "T", ">", "struct", "json"),
        ),
        NormalizedFile(
            path="src/hash_combine.cpp",
            tokens=("inline", "void", "hash_combine", "(", "seed", ")"),
        ),
    ]
    result = dedup_pipeline(files)
    print(result["canonical_paths"])
    print(len(result["near_duplicate_candidates"]))


if __name__ == "__main__":
    _demo()