Source code for laion_fmri.discovery

"""Discovery API backed entirely by the LAION-fMRI S3 bucket.

All queries about what is in the dataset read directly from the S3
bucket via the AWS CLI. Local filesystem state is never consulted.
"""

import warnings

from laion_fmri._bidsify import bidsify_local_key
from laion_fmri._paths import parse_roi_label
from laion_fmri._s3_engine import (
    list_common_prefixes,
    list_prefix_keys,
)
from laion_fmri._sources import LAION_FMRI_BUCKET

# Bucket key prefixes that are listed when looking for subject folders.
# ``get_subjects`` merges the ``sub-*`` entries found under each of
# these, and ``inspect_bucket`` reports a per-prefix entry count.
SUBJECT_PREFIXES = (
    "derivatives/glmsingle-tedana/",
    "derivatives/rois/",
)



[docs]
def get_subjects():
    """List every subject ID that appears in the S3 bucket.

    Each prefix in ``SUBJECT_PREFIXES`` is listed on its own and the
    ``sub-*`` entries are merged, so a subject is reported as soon as
    it shows up under at least one of the known derivative trees
    (useful while the bucket is only partially populated).

    A ``UserWarning`` is emitted when no subject is found at all.

    Returns
    -------
    list[str]
        Sorted, de-duplicated BIDS subject IDs (``sub-*``).
    """
    subject_ids = {
        entry
        for tree in SUBJECT_PREFIXES
        for entry in list_common_prefixes(LAION_FMRI_BUCKET, tree)
        if entry.startswith("sub-")
    }

    if subject_ids:
        return sorted(subject_ids)

    # Nothing matched under any prefix: tell the caller, then hand
    # back the (empty) result rather than raising.
    warnings.warn(
        f"No subjects found in s3://{LAION_FMRI_BUCKET}/ under "
        f"any of {SUBJECT_PREFIXES}. "
        "Check bucket layout and AWS credentials.",
        UserWarning,
        stacklevel=2,
    )
    return sorted(subject_ids)




[docs]
def get_rois(subject=None, category=None):
    """List the ROI names stored in the S3 bucket for one subject.

    Keys are expected under
    ``derivatives/rois/{subject}/{category}/<file>``. Only keys that
    end in ``_mask.nii.gz`` and contain ``_space-T1w_res-1pt8_`` are
    considered, and only when they sit exactly one folder below the
    subject prefix. Each remaining file name is passed through
    ``bidsify_local_key`` and ``parse_roi_label`` to obtain the ROI
    name (so a hyphenated label such as ``label-FFA-1`` is reported
    in its BIDS-clean form, ``"FFA1"``); names that cannot be parsed
    are dropped.

    A ``UserWarning`` is emitted when the result is empty.

    Parameters
    ----------
    subject : str or None
        BIDS subject ID. When None, the first subject returned by
        ``get_subjects`` is used; if there is none, an empty list is
        returned straight away.
    category : str or None
        Optional category folder to restrict the search to
        (``"face"``, ``"place"``, ...).

    Returns
    -------
    list[str]
        Sorted, de-duplicated bidsified ROI names.
    """
    if subject is None:
        available = get_subjects()
        if not available:
            return []
        subject = available[0]

    prefix = f"derivatives/rois/{subject}/"

    roi_names = set()
    for s3_key in list_prefix_keys(LAION_FMRI_BUCKET, prefix):
        # Keep only the volumetric T1w-space mask for each ROI.
        is_volume_mask = (
            s3_key.endswith("_mask.nii.gz")
            and "_space-T1w_res-1pt8_" in s3_key
        )
        if not is_volume_mask:
            continue

        # Below the subject prefix we expect exactly "<category>/<file>".
        segments = s3_key[len(prefix):].split("/")
        if len(segments) != 2:
            continue
        folder, filename = segments

        if category is not None and folder != category:
            continue

        label = parse_roi_label(bidsify_local_key(filename), subject)
        if label is not None:
            roi_names.add(label)

    if not roi_names:
        warnings.warn(
            f"No ROIs found at s3://{LAION_FMRI_BUCKET}/{prefix}. "
            "The subject may not have ROI atlases uploaded yet.",
            UserWarning,
            stacklevel=2,
        )

    return sorted(roi_names)




[docs]
def describe():
    """Print a short, human-readable overview of the S3 bucket.

    Shows the bucket URL, the number of subjects with their IDs (or
    ``(none)``), and -- when at least one subject exists -- the ROI
    names found for the first subject.
    """
    subjects = get_subjects()
    listing = ", ".join(subjects) if subjects else "none"

    print("LAION-fMRI Dataset")
    print(f"  Bucket:    s3://{LAION_FMRI_BUCKET}")
    print(f"  Subjects:  {len(subjects)} ({listing})")

    if not subjects:
        return

    # ROIs are looked up for the first subject only.
    roi_names = get_rois(subjects[0])
    if roi_names:
        print(f"  ROIs:      {', '.join(roi_names)}")




[docs]
def inspect_bucket():
    """Print a diagnostic view of the bucket layout.

    First prints the bucket URL and the sorted top-level prefixes.
    Then, for every prefix in ``SUBJECT_PREFIXES``, prints how many
    entries it contains and how many of those start with ``sub-``.
    Handy when the discovery functions return something unexpected.
    """
    print(f"Bucket: s3://{LAION_FMRI_BUCKET}")

    top_level = list_common_prefixes(LAION_FMRI_BUCKET, "")
    print(f"Top-level prefixes ({len(top_level)}):")
    for entry in sorted(top_level):
        print(f"  {entry}/")

    for tree in SUBJECT_PREFIXES:
        entries = list_common_prefixes(LAION_FMRI_BUCKET, tree)
        subject_entries = [e for e in entries if e.startswith("sub-")]
        print(
            f"{tree}: {len(entries)} entries, "
            f"{len(subject_entries)} sub-* entries"
        )