Source code for seisbench.data.stead

from pathlib import Path

import h5py
import numpy as np
import pandas as pd

import seisbench
import seisbench.util

from .base import WaveformBenchmarkDataset, WaveformDataWriter



[docs]
class STEAD(WaveformBenchmarkDataset):
    """
    STEAD dataset from Mousavi et al.

    Using the train/test split from the EQTransformer Github repository
    train/dev split defined in SeisBench
    """

    def __init__(self, **kwargs):
        citation = (
            "Mousavi, S. M., Sheng, Y., Zhu, W., Beroza G.C., (2019). STanford EArthquake Dataset (STEAD): "
            "A Global Data Set of Seismic Signals for AI, IEEE Access, doi:10.1109/ACCESS.2019.2947848"
        )
        license = "CC BY 4.0"
        super().__init__(
            citation=citation, license=license, repository_lookup=True, **kwargs
        )

    def _download_dataset(self, writer: WaveformDataWriter, basepath=None, **kwargs):
        download_instructions = (
            "Please download STEAD following the instructions at https://github.com/smousavi05/STEAD. "
            "Provide the locations of the STEAD unpacked files (merged.csv and merged.hdf5) in the "
            "download_kwargs argument 'basepath'."
            "This step is only necessary the first time STEAD is loaded."
        )

        metadata_dict = {
            "trace_start_time": "trace_start_time",
            "trace_category": "trace_category",
            "trace_name": "trace_name",
            "p_arrival_sample": "trace_p_arrival_sample",
            "p_status": "trace_p_status",
            "p_weight": "trace_p_weight",
            "p_travel_sec": "path_p_travel_sec",
            "s_arrival_sample": "trace_s_arrival_sample",
            "s_status": "trace_s_status",
            "s_weight": "trace_s_weight",
            "s_travel_sec": "path_s_travel_sec",
            "back_azimuth_deg": "path_back_azimuth_deg",
            "snr_db": "trace_snr_db",
            "coda_end_sample": "trace_coda_end_sample",
            "network_code": "station_network_code",
            "receiver_code": "station_code",
            "receiver_type": "trace_channel",
            "receiver_latitude": "station_latitude_deg",
            "receiver_longitude": "station_longitude_deg",
            "receiver_elevation_m": "station_elevation_m",
            "source_id": "source_id",
            "source_origin_time": "source_origin_time",
            "source_origin_uncertainty_sec": "source_origin_uncertainty_sec",
            "source_latitude": "source_latitude_deg",
            "source_longitude": "source_longitude_deg",
            "source_error_sec": "source_error_sec",
            "source_gap_deg": "source_gap_deg",
            "source_horizontal_uncertainty_km": "source_horizontal_uncertainty_km",
            "source_depth_km": "source_depth_km",
            "source_depth_uncertainty_km": "source_depth_uncertainty_km",
            "source_magnitude": "source_magnitude",
            "source_magnitude_type": "source_magnitude_type",
            "source_magnitude_author": "source_magnitude_author",
        }

        path = self.path

        if basepath is None:
            raise ValueError(
                "No cached version of STEAD found. " + download_instructions
            )

        basepath = Path(basepath)

        if not (basepath / "merged.csv").is_file():
            raise ValueError(
                "Basepath does not contain file merged.csv. " + download_instructions
            )
        if not (basepath / "merged.hdf5").is_file():
            raise ValueError(
                "Basepath does not contain file merged.hdf5. " + download_instructions
            )

        self.path.mkdir(parents=True, exist_ok=True)
        seisbench.logger.warning(
            "Converting STEAD files to SeisBench format. This might take a while."
        )

        split_url = "https://github.com/smousavi05/EQTransformer/raw/master/ModelsAndSampleData/test.npy"
        seisbench.util.download_http(
            split_url, path / "test.npy", desc="Downloading test splits"
        )

        # Copy metadata and rename columns to SeisBench format
        metadata = pd.read_csv(basepath / "merged.csv")
        metadata.rename(columns=metadata_dict, inplace=True)

        # Set split
        test_split = set(np.load(path / "test.npy"))
        test_mask = metadata["trace_name"].isin(test_split)
        train_dev = metadata["trace_name"][~test_mask].values
        dev_split = train_dev[
            ::18
        ]  # Use 5% of total traces as suggested in EQTransformer Github repository
        dev_mask = metadata["trace_name"].isin(dev_split)
        metadata["split"] = "train"
        metadata.loc[dev_mask, "split"] = "dev"
        metadata.loc[test_mask, "split"] = "test"

        # Writer data format
        writer.data_format = {
            "dimension_order": "CW",
            "component_order": "ZNE",
            "sampling_rate": 100,
            "measurement": "velocity",
            "unit": "counts",
            "instrument_response": "not restituted",
        }

        writer.set_total(len(metadata))

        with h5py.File(basepath / "merged.hdf5") as f:
            gdata = f["data"]
            for _, row in metadata.iterrows():
                row = row.to_dict()
                waveforms = gdata[row["trace_name"]][()]
                waveforms = waveforms.T  # From WC to CW
                waveforms = waveforms[[2, 1, 0]]  # From ENZ to ZNE

                writer.add_trace(row, waveforms)