# Source code for laion_fmri.discovery
"""Discovery API backed entirely by the LAION-fMRI S3 bucket.
All queries about what is in the dataset read directly from the S3
bucket via the AWS CLI. Local filesystem state is never consulted.
"""
import warnings
from laion_fmri._bidsify import bidsify_local_key
from laion_fmri._paths import parse_roi_label
from laion_fmri._s3_engine import (
list_common_prefixes,
list_prefix_keys,
)
from laion_fmri._sources import LAION_FMRI_BUCKET
# S3 key prefixes that are listed when looking for per-subject
# ``sub-*`` entries; get_subjects() and inspect_bucket() both iterate
# over this tuple.
SUBJECT_PREFIXES = (
    "derivatives/glmsingle-tedana/",
    "derivatives/rois/",
)
[docs]
def get_subjects():
    """List every BIDS subject ID that appears in the S3 bucket.

    Each prefix in ``SUBJECT_PREFIXES`` is listed and the ``sub-*``
    names found under any of them are merged, so a subject is reported
    even when only some of the derivative trees contain it (the bucket
    may be partially populated during development).

    A ``UserWarning`` is emitted when no subject is found at all.

    Returns
    -------
    list[str]
        Sorted, de-duplicated BIDS subject IDs (``sub-*``).
    """
    # Union across all derivative trees; the set drops duplicates.
    subject_ids = {
        entry
        for root in SUBJECT_PREFIXES
        for entry in list_common_prefixes(LAION_FMRI_BUCKET, root)
        if entry.startswith("sub-")
    }
    if subject_ids:
        return sorted(subject_ids)

    # stacklevel=2 attributes the warning to the caller's line.
    warnings.warn(
        f"No subjects found in s3://{LAION_FMRI_BUCKET}/ under "
        f"any of {SUBJECT_PREFIXES}. "
        "Check bucket layout and AWS credentials.",
        UserWarning,
        stacklevel=2,
    )
    return []
[docs]
def get_rois(subject=None, category=None):
    """Return the ROI names stored for one subject in the S3 bucket.

    Keys are listed under ``derivatives/rois/{subject}/`` and are
    expected to look like ``{category}/{filename}`` below that prefix.
    Only keys ending in ``_mask.nii.gz`` that contain
    ``_space-T1w_res-1pt8_`` are considered, so each ROI is counted
    from its volumetric mask file alone.

    Each filename is passed through ``bidsify_local_key`` and then
    ``parse_roi_label``; this is presumably where hyphenated label
    values (e.g. ``label-FFA-1``) are normalized to BIDS-clean form
    (``"FFA1"``) -- confirm against those helpers.

    A ``UserWarning`` is emitted when no ROI survives the filters.

    Parameters
    ----------
    subject : str or None
        BIDS subject ID. If None, the first subject returned by
        ``get_subjects()`` is used; an empty list is returned when
        the bucket has no subjects.
    category : str or None
        Optional category filter (``"face"``, ``"place"``, ...),
        compared against the directory directly below the subject.

    Returns
    -------
    list[str]
        Sorted ROI names, without duplicates.
    """
    if subject is None:
        available = get_subjects()
        if not available:
            return []
        subject = available[0]

    prefix = f"derivatives/rois/{subject}/"
    roi_names = set()
    for key in list_prefix_keys(LAION_FMRI_BUCKET, prefix):
        is_volume_mask = (
            key.endswith("_mask.nii.gz")
            and "_space-T1w_res-1pt8_" in key
        )
        if not is_volume_mask:
            continue

        # Below the subject prefix only "<category>/<file>" is accepted;
        # shallower or deeper keys are ignored.
        pieces = key[len(prefix):].split("/")
        if len(pieces) != 2:
            continue
        folder, filename = pieces
        if category is not None and folder != category:
            continue

        label = parse_roi_label(bidsify_local_key(filename), subject)
        if label is not None:
            roi_names.add(label)

    if not roi_names:
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn(
            f"No ROIs found at s3://{LAION_FMRI_BUCKET}/{prefix}. "
            "The subject may not have ROI atlases uploaded yet.",
            UserWarning,
            stacklevel=2,
        )
    return sorted(roi_names)
[docs]
def describe():
    """Print a short, human-readable overview of the S3 bucket.

    The output names the bucket, gives the number of subjects and
    their IDs (or ``(none)``), and, when subjects exist, lists the
    ROIs reported by ``get_rois`` for the first subject.
    """
    subject_ids = get_subjects()
    print("LAION-fMRI Dataset")
    print(f" Bucket: s3://{LAION_FMRI_BUCKET}")

    # The count is printed without a newline; one of the two prints
    # below finishes the line.
    print(f" Subjects: {len(subject_ids)}", end="")
    if not subject_ids:
        print(" (none)")
        return

    print(f" ({', '.join(subject_ids)})")
    roi_names = get_rois(subject_ids[0])
    if roi_names:
        print(f" ROIs: {', '.join(roi_names)}")
[docs]
def inspect_bucket():
    """Print a diagnostic listing of the bucket for troubleshooting.

    First the immediate top-level prefixes of the bucket are printed
    in sorted order. Then, for every prefix in ``SUBJECT_PREFIXES``,
    the number of entries found below it and how many of those start
    with ``sub-`` are reported. Useful when discovery returns
    unexpected results.
    """
    print(f"Bucket: s3://{LAION_FMRI_BUCKET}")

    top_level = list_common_prefixes(LAION_FMRI_BUCKET, "")
    print(f"Top-level prefixes ({len(top_level)}):")
    for entry in sorted(top_level):
        print(f" {entry}/")

    for root in SUBJECT_PREFIXES:
        entries = list_common_prefixes(LAION_FMRI_BUCKET, root)
        subject_entries = [e for e in entries if e.startswith("sub-")]
        print(
            f"{root}: {len(entries)} entries, "
            f"{len(subject_entries)} sub-* entries"
        )