Source code for seisbench.data.ceed

import shutil
from pathlib import Path

import h5py
import pandas as pd
from tqdm import tqdm

import seisbench

from .base import WaveformBenchmarkDataset, WaveformDataWriter

try:
    from huggingface_hub import hf_hub_download
except ModuleNotFoundError:
    hf_hub_download = None



[docs]
class CEED(WaveformBenchmarkDataset):
    """
    The CEED dataset for California from Zhu et al. (2025)

    The dataset is organised in chunks named ``<area><year>``, where ``<area>`` is a two-letter
    prefix (``nc`` or ``sc``) and ``<year>`` is a year, optionally followed by a part suffix such
    as ``2019_0``. See :py:func:`available_chunks` for the full list. If a chunk has to be
    converted locally, its waveform file is fetched from the Hugging Face Hub dataset repository
    ``AI4EPS/quakeflow_<area>``, which requires the optional dependency ``huggingface_hub``.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: Keyword arguments passed on to the parent class constructor.
        """
        citation = (
            "Zhu, W., Wang, H., Rong, B., Yu, E., Zuzlewski, S., Tepp, G., "
            "... & Allen, R. M. (2025). California Earthquake Dataset for "
            "Machine Learning and Cloud Computing. arXiv preprint arXiv:2502.11500."
        )
        super().__init__(citation=citation, repository_lookup=True, **kwargs)

    @classmethod
    def available_chunks(cls, force=False, wait_for_file=False):
        """
        Returns the names of all chunks of the dataset.

        The list is hard-coded, so no file or remote lookup is performed.

        :param force: Not used by this implementation; accepted for signature compatibility.
        :param wait_for_file: Not used by this implementation; accepted for signature compatibility.
        :return: List of chunk names, all ``nc`` chunks followed by all ``sc`` chunks.
        """
        nc_chunks = [str(x) for x in range(1987, 2024)]
        # Some years of the "sc" area are split into several parts, hence the explicit suffixes
        sc_chunks = [str(x) for x in range(1999, 2019)] + [
            "2019_0",
            "2019_1",
            "2019_2",
            "2020_0",
            "2020_1",
            "2021",
            "2022",
            "2023",
        ]

        return [f"nc{x}" for x in nc_chunks] + [f"sc{x}" for x in sc_chunks]

    @staticmethod
    def _ensure_hf_hub_download_available():
        """
        Checks that the optional dependency ``huggingface_hub`` could be imported.

        :raises ModuleNotFoundError: If ``huggingface_hub`` is not installed.
        """
        # An explicit raise instead of an assert, as asserts are stripped when running with -O
        if hf_hub_download is None:
            raise ModuleNotFoundError(
                "To download this dataset, huggingface_hub must be installed. "
                "For installation instructions, "
                "see https://huggingface.co/docs/huggingface_hub/installation"
            )

    def _download_dataset(self, writer: WaveformDataWriter, chunk: str, **kwargs):
        """
        Downloads one chunk from the Hugging Face Hub and converts it in place.

        The waveform file is moved to ``writer.waveforms_path``, the metadata is compiled from
        the attributes stored in the file and written to ``writer.metadata_path``.

        :param writer: Writer providing the target paths for waveforms and metadata.
        :param chunk: Name of the chunk, i.e., a two-letter area prefix followed by the year part.
        :param kwargs: Not used.
        :raises ModuleNotFoundError: If ``huggingface_hub`` is not installed.
        """
        # Fail early with installation instructions instead of a TypeError from calling None
        self._ensure_hf_hub_download_available()

        path = writer.waveforms_path.parent

        seisbench.logger.warning(f"Start chunk {chunk} from Huggingface Hub")

        # The first two characters select the repository, the remainder the file within it
        area, year = chunk[:2], chunk[2:]
        filename = f"waveform_h5/{year}.h5"

        # download from huggingface hub
        hf_hub_download(
            repo_id=f"AI4EPS/quakeflow_{area}",
            filename=filename,
            repo_type="dataset",
            local_dir=path,
        )

        shutil.move(path / filename, writer.waveforms_path)

        # The metadata needs to be compiled before adjusting the file. The adjustment adds the
        # root level entries "data_format" and "data" that would otherwise be read as events.
        metadata = self._create_metadata(writer.waveforms_path, chunk)
        self._add_split(metadata)
        self._adjust_hdf5(writer.waveforms_path)

        metadata.to_csv(writer.metadata_path, index=False)

    @staticmethod
    def _adjust_hdf5(path: Path) -> None:
        """
        Adds the entries SeisBench expects to the hdf5 file. The file is modified in place.

        :param path: Path to the hdf5 waveform file.
        """
        with h5py.File(path, "a") as f:
            # SeisBench needs a data format group
            g = f.create_group("data_format")
            g.create_dataset("dimension_order", data="CW")
            # Add a softlink from "data/" to "/" as SeisBench expects all waveforms under "data/"
            f["data"] = h5py.SoftLink("/")

    @staticmethod
    def _add_split(metadata: pd.DataFrame) -> None:
        """
        Adds the column ``split`` to the metadata. The data frame is modified in place.

        Traces with ``source_origin_time`` greater than ``"2021"`` are assigned to ``test``,
        remaining traces with ``source_origin_time`` greater than ``"2020"`` to ``dev`` and
        all other traces to ``train``.

        :param metadata: Metadata with the column ``source_origin_time``.
        """
        # NOTE(review): the comparison presumably acts on ISO-formatted time strings,
        # where lexicographic and chronological order agree - confirm
        # Temporal split, oriented after the split from the original publication with an extra dev set
        metadata["split"] = "train"
        metadata.loc[metadata["source_origin_time"] > "2020", "split"] = "dev"
        metadata.loc[metadata["source_origin_time"] > "2021", "split"] = "test"

    def _create_metadata(self, path: Path, chunk: str) -> pd.DataFrame:
        """
        Compiles the metadata table from the attributes stored in the hdf5 file.

        Every top-level group is treated as an event and each of its members as a trace. One row
        is created per trace, combining the event attributes with the trace attributes.
        The columns are renamed to the SeisBench naming scheme.

        :param path: Path to the hdf5 waveform file.
        :param chunk: Name of the chunk. Only used for the progress bar description.
        :return: Metadata with one row per trace.
        """
        metadata = []
        with h5py.File(path, "r") as f:
            for s_event, g_event in tqdm(
                f.items(), desc=f"Compiling metadata for chunk {chunk}"
            ):
                event_metadata = dict(g_event.attrs.items())
                # The key "depth_km" is used on both event and station level, so we need to rename it here.
                # The key is removed, as otherwise the event depth would be reported as station
                # depth for traces without their own "depth_km" attribute.
                event_metadata["source_depth_km"] = event_metadata.pop("depth_km", None)
                for s_trace, g_trace in g_event.items():
                    # Trace attributes take precedence over event attributes with the same key
                    trace_metadata = {
                        **event_metadata,
                        **dict(g_trace.attrs.items()),
                        "trace_name": f"{s_event}/{s_trace}",
                    }
                    metadata.append(trace_metadata)

        renames = {
            "component": "trace_component_order",
            "sampling_rate": "trace_sampling_rate_hz",
            "begin_time": "trace_start_time",
            "end_time": "trace_end_time",
            "snr": "trace_snr_db",
            "depth_km": "station_depth_km",
            "event_id": "source_id_list",
            "event_time": "source_origin_time",
            "event_time_index": "source_origin_time_sample",
            "latitude": "source_latitude_deg",
            "longitude": "source_longitude_deg",
            "magnitude": "source_magnitude",
            "magnitude_type": "source_magnitude_type",
            "nt": "trace_npts",
            "source": "trace_source_region",
            "azimuth": "path_azimuth_deg",
            "back_azimuth": "path_back_azimuth_deg",
            "takeoff_angle": "path_takeoff_angle_deg",
            "distance_km": "path_ep_distance_km",
            "elevation_m": "station_elevation_m",
            "local_depth_m": "station_local_depth_m",
            "instrument": "station_instrument",
            "station": "station_code",
            "network": "station_network_code",
            "location": "station_location_code",
            "unit": "trace_unit",
            # P phase attributes
            "p_phase_index": "trace_p_arrival_sample",
            "p_phase_polarity": "trace_p_polarity",
            "p_phase_score": "trace_p_score",
            "p_phase_status": "trace_p_status",
            "p_phase_time": "trace_p_time",
            # S phase attributes
            "s_phase_index": "trace_s_arrival_sample",
            "s_phase_polarity": "trace_s_polarity",
            "s_phase_score": "trace_s_score",
            "s_phase_status": "trace_s_status",
            "s_phase_time": "trace_s_time",
            # These are a range of attributes in list form.
            # They lose their format when saved as csv, but at least they are documented.
            "phase_index": "trace_phase_arrival_sample_list",
            "phase_picking_channel": "trace_phase_picking_channel_list",
            "phase_polarity": "trace_phase_polarity_list",
            "phase_remark": "trace_phase_remark_list",
            "phase_score": "trace_phase_score_list",
            "phase_status": "trace_phase_status_list",
            "phase_time": "trace_phase_time_list",
            "phase_type": "trace_phase_type_list",
        }
        drops = [
            "nx",  # Number of stations for event. Can be easily recalculated.
            "dt_s",  # Redundant with sampling rate
        ]

        metadata = pd.DataFrame(metadata)

        # The dropped columns are redundant, so a file without them is no reason to fail
        metadata.drop(columns=drops, inplace=True, errors="ignore")
        metadata.rename(columns=renames, inplace=True)

        # The component order in the data is always ENZ
        metadata["trace_component_order"] = "ENZ"

        return metadata