Source code for seisbench.data.scedc

import shutil
from collections import defaultdict

import h5py
import numpy as np

import seisbench
import seisbench.util

from .base import WaveformBenchmarkDataset



[docs]
class SCEDC(WaveformBenchmarkDataset):
    """
    SCEDC waveform archive (2000-2020).

    Splits are set using standard random sampling of :py:class:`BenchmarkDataset`.
    """

    def __init__(self, **kwargs):
        """
        Logs a warning about the size of the dataset and initializes the superclass
        with the dataset citation and ``repository_lookup=True``.

        :param kwargs: Keyword arguments passed on to the superclass constructor.
        """
        # The trailing space after "Center." keeps the publisher and the DOI apart
        # once the two adjacent string literals are concatenated.
        citation = (
            "SCEDC (2013): Southern California Earthquake Center. "
            "https://doi.org/10.7909/C3WD3xH1"
        )

        # Emitted on every instantiation, before the superclass gets the chance
        # to trigger any download.
        seisbench.logger.warning(
            "Check available storage and memory before downloading and general use "
            "of SCEDC dataset. "
            "Dataset size: waveforms.hdf5 ~660Gb, metadata.csv ~2.2Gb"
        )

        super().__init__(citation=citation, repository_lookup=True, **kwargs)

    def _download_dataset(self, **kwargs):
        """
        Intentionally does nothing.

        :param kwargs: Accepted and ignored.
        :return: None
        """
        # NOTE: SCEDC dataset is pre-compiled and stored in remote repository root for access
        pass



# TODO: Check with Zach Ross whether this dataset really differs from Ross2018JGRPick only through the class rebalancing.
#       If so, this should be stated in the SeisBench documentation and probably also be reflected in the naming.

[docs]
class Ross2018JGRFM(WaveformBenchmarkDataset):
    """
    First motion polarity dataset accompanying the publication:
    Ross, Z. E., Meier, M.‐A., & Hauksson, E. (2018). P wave arrival picking and first‐motion polarity determination
    with deep learning. Journal of Geophysical Research: Solid Earth, 123, 5120–5129.
    https://doi.org/10.1029/2017JB015251

    Besides the polarities, the dataset provides picks as well.

    .. warning::

        This dataset only contains traces for the Z component.
        It therefore ignores the default SeisBench component_order.

    """

    def __init__(self, component_order="Z", **kwargs):
        super().__init__(
            citation=(
                "Ross, Z. E., Meier, M.‐A., & Hauksson, E. (2018). "
                "P wave arrival picking and first‐motion polarity determination with deep learning. "
                "Journal of Geophysical Research: Solid Earth, 123, 5120– 5129. https://doi.org/10.1029/2017JB015251"
            ),
            repository_lookup=False,
            component_order=component_order,
            **kwargs,
        )

    def _download_dataset(self, writer, cleanup=False, blocksize=2**14):
        """
        Fetches the files of the original publication and converts them

        :param writer: WaveformWriter
        :param cleanup: If true, the original files are removed after the conversion. Defaults to false.
        :param blocksize: Number of waveforms read from disk in one go.
        :return:
        """

        raw_dir = self.path / "original"
        raw_dir.mkdir(parents=True, exist_ok=True)

        def raw_file(split_name):
            # Location of the original file belonging to a split
            return raw_dir / f"scsn_p_2000_2017_6sec_0.5r_fm_{split_name}.hdf5"

        # Polarity names, indexed by the integer label stored in the original files
        polarity_names = ("up", "down", "unknown")

        source_urls = (
            "https://service.scedc.caltech.edu/ftp/Ross_FinalTrainedModels/scsn_p_2000_2017_6sec_0.5r_fm_test.hdf5",
            "https://service.scedc.caltech.edu/ftp/Ross_FinalTrainedModels/scsn_p_2000_2017_6sec_0.5r_fm_train.hdf5",
        )

        # callback_if_uncached is only used for its cache mechanism here.
        # Concurrent accesses are already controlled by the callback_if_uncached call
        # wrapping _download_dataset, so force=True is considered safe.
        for url in source_urls:
            name = url.rsplit("/", 1)[-1]

            def fetch(target):
                seisbench.util.download_http(
                    url,
                    target,
                    desc=f"Downloading file {name}",
                )

            seisbench.util.callback_if_uncached(raw_dir / name, fetch, force=True)

        # The total number of traces is the sum over both original files
        n_traces = 0
        for split in ("train", "test"):
            with h5py.File(raw_file(split), "r") as fin:
                n_traces += fin["X"].shape[0]

        writer.set_total(n_traces)
        writer.data_format = dict(
            dimension_order="CW",
            component_order="Z",
            measurement="velocity",
            sampling_rate=100,
            unit="none/normalized",
            instrument_response="not restituted",
        )

        # Counts how often an event/channel combination occurred so far,
        # which is used to make the trace names unique
        occurrences = defaultdict(int)

        for split in ("train", "test"):
            with h5py.File(raw_file(split), "r") as fin:
                # The small arrays are read completely up front to avoid disk seeks
                labels = fin["Y"][:]
                distances = fin["dist"][:]
                event_ids = fin["evids"][:]
                magnitudes = fin["mag"][:]
                channel_codes = fin["sncls"][:]
                snrs = fin["snr"][:]

                # Every tenth training event goes to the development set
                dev_events = (
                    set(np.unique(event_ids)[::10]) if split == "train" else set()
                )

                waveforms = fin["X"]
                block = None
                for idx in range(waveforms.shape[0]):
                    offset = idx % blocksize
                    if offset == 0:
                        # Start of a new block of waveforms
                        block = waveforms[idx : idx + blocksize]
                    # Pick the waveform and add a (virtual) channel axis
                    trace = block[offset].reshape(1, -1)

                    event_id = event_ids[idx]
                    sncl = channel_codes[idx].decode()
                    key = f"{event_id}_{sncl}"
                    running_number = occurrences[key]
                    occurrences[key] = running_number + 1

                    trace_split = "dev" if event_id in dev_events else split

                    network, station, channel = sncl.split(".")
                    polarity = polarity_names[labels[idx]]

                    writer.add_trace(
                        {
                            "trace_name": f"{key}_{running_number}",
                            "trace_category": "earthquake",
                            "trace_p_arrival_sample": 300,
                            "trace_p_status": "manual",
                            "trace_snr_db": snrs[idx],
                            "trace_channel": channel,
                            "trace_polarity": polarity,
                            "station_network_code": network,
                            "station_code": station,
                            "source_magnitude": magnitudes[idx],
                            "source_id": event_id,
                            "path_ep_distance_km": distances[idx],
                            "split": trace_split,
                        },
                        trace,
                    )

            # All data of the current split gets written out
            writer.flush_hdf5()

        if cleanup:
            shutil.rmtree(raw_dir)




[docs]
class Ross2018JGRPick(WaveformBenchmarkDataset):
    """
    Pick dataset accompanying the publication:
    Ross, Z. E., Meier, M.‐A., & Hauksson, E. (2018). P wave arrival picking and first‐motion polarity determination
    with deep learning. Journal of Geophysical Research: Solid Earth, 123, 5120–5129.
    https://doi.org/10.1029/2017JB015251

    Besides the picks, the dataset provides polarities as well.

    .. warning::

        This dataset only contains traces for the Z component.
        It therefore ignores the default SeisBench component_order.

    """

    def __init__(self, component_order="Z", **kwargs):
        super().__init__(
            citation=(
                "Ross, Z. E., Meier, M.‐A., & Hauksson, E. (2018). "
                "P wave arrival picking and first‐motion polarity determination with deep learning. "
                "Journal of Geophysical Research: Solid Earth, 123, 5120– 5129. https://doi.org/10.1029/2017JB015251"
            ),
            repository_lookup=False,
            component_order=component_order,
            **kwargs,
        )

    def _download_dataset(self, writer, cleanup=False, blocksize=2**14):
        """
        Fetches the files of the original publication and converts them

        :param writer: WaveformWriter
        :param cleanup: If true, the original files are removed after the conversion. Defaults to false.
        :param blocksize: Number of waveforms read from disk in one go.
        :return:
        """

        raw_dir = self.path / "original"
        raw_dir.mkdir(parents=True, exist_ok=True)

        def raw_file(split_name):
            # Location of the original file belonging to a split
            return raw_dir / f"scsn_p_2000_2017_6sec_0.5r_pick_{split_name}.hdf5"

        # Polarity names, indexed by the integer label stored in the original files
        polarity_names = ("up", "down", "unknown")

        source_urls = (
            "https://service.scedc.caltech.edu/ftp/Ross_FinalTrainedModels/scsn_p_2000_2017_6sec_0.5r_pick_test.hdf5",
            "https://service.scedc.caltech.edu/ftp/Ross_FinalTrainedModels/scsn_p_2000_2017_6sec_0.5r_pick_train.hdf5",
        )

        # callback_if_uncached is only used for its cache mechanism here.
        # Concurrent accesses are already controlled by the callback_if_uncached call
        # wrapping _download_dataset, so force=True is considered safe.
        for url in source_urls:
            name = url.rsplit("/", 1)[-1]

            def fetch(target):
                seisbench.util.download_http(
                    url,
                    target,
                    desc=f"Downloading file {name}",
                )

            seisbench.util.callback_if_uncached(raw_dir / name, fetch, force=True)

        # The total number of traces is the sum over both original files
        n_traces = 0
        for split in ("train", "test"):
            with h5py.File(raw_file(split), "r") as fin:
                n_traces += fin["X"].shape[0]

        writer.set_total(n_traces)
        writer.data_format = dict(
            dimension_order="CW",
            component_order="Z",
            measurement="velocity",
            sampling_rate=100,
            unit="none/normalized",
            instrument_response="not restituted",
        )

        # Counts how often an event/channel combination occurred so far,
        # which is used to make the trace names unique
        occurrences = defaultdict(int)

        for split in ("train", "test"):
            with h5py.File(raw_file(split), "r") as fin:
                # The small arrays are read completely up front to avoid disk seeks
                first_motions = fin["fm"][:]
                distances = fin["dist"][:]
                event_ids = fin["evids"][:]
                magnitudes = fin["mag"][:]
                channel_codes = fin["sncls"][:]
                snrs = fin["snr"][:]

                # Every tenth training event goes to the development set
                dev_events = (
                    set(np.unique(event_ids)[::10]) if split == "train" else set()
                )

                waveforms = fin["X"]
                block = None
                for idx in range(waveforms.shape[0]):
                    offset = idx % blocksize
                    if offset == 0:
                        # Start of a new block of waveforms
                        block = waveforms[idx : idx + blocksize]
                    # Pick the waveform and add a (virtual) channel axis
                    trace = block[offset].reshape(1, -1)

                    event_id = event_ids[idx]
                    sncl = channel_codes[idx].decode()
                    key = f"{event_id}_{sncl}"
                    running_number = occurrences[key]
                    occurrences[key] = running_number + 1

                    trace_split = "dev" if event_id in dev_events else split

                    network, station, channel = sncl.split(".")
                    polarity = polarity_names[first_motions[idx]]

                    writer.add_trace(
                        {
                            "trace_name": f"{key}_{running_number}",
                            "trace_category": "earthquake",
                            "trace_p_arrival_sample": 300,
                            "trace_p_status": "manual",
                            "trace_snr_db": snrs[idx],
                            "trace_channel": channel,
                            "trace_polarity": polarity,
                            "station_network_code": network,
                            "station_code": station,
                            "source_magnitude": magnitudes[idx],
                            "source_id": event_id,
                            "path_ep_distance_km": distances[idx],
                            "split": trace_split,
                        },
                        trace,
                    )

            # All data of the current split gets written out
            writer.flush_hdf5()

        if cleanup:
            shutil.rmtree(raw_dir)




[docs]
class Ross2018GPD(WaveformBenchmarkDataset):
    """
    Pick dataset accompanying the publication:
    Zachary E. Ross, Men‐Andrin Meier, Egill Hauksson, Thomas H. Heaton;
    Generalized Seismic Phase Detection with Deep Learning.
    Bulletin of the Seismological Society of America 2018; 108 (5A): 2894–2901.
    https://doi.org/10.1785/0120180080
    """

    def __init__(self, **kwargs):
        super().__init__(
            citation=(
                "Ross, Z. E., Meier, M.‐A., Hauksson, E., & Heaton, T.(2018). "
                "Generalized Seismic Phase Detection with Deep Learning. "
                "Bulletin of the Seismological Society of America 2018;; 108 (5A): 2894–2901. "
                "https://doi.org/10.1785/0120180080"
            ),
            repository_lookup=False,
            **kwargs,
        )

    def _download_dataset(self, writer, cleanup=False, blocksize=2**14):
        """
        Fetches the file of the original publication and converts it

        :param writer: WaveformWriter
        :param cleanup: If true, the original files are removed after the conversion. Defaults to false.
        :param blocksize: Number of waveforms read from disk in one go.
        :return:
        """

        raw_dir = self.path / "original"
        raw_dir.mkdir(parents=True, exist_ok=True)

        data_url = "https://service.scedc.caltech.edu/ftp/ross_etal_2018_bssa/scsn_ps_2000_2017_shuf.hdf5"
        filename = data_url.rsplit("/", 1)[-1]
        raw_file = raw_dir / filename

        def fetch(target):
            seisbench.util.download_http(
                data_url,
                target,
                desc=f"Downloading file {filename}",
            )

        # callback_if_uncached is only used for its cache mechanism here.
        # Concurrent accesses are already controlled by the callback_if_uncached call
        # wrapping _download_dataset, so force=True is considered safe.
        seisbench.util.callback_if_uncached(raw_file, fetch, force=True)

        # The individual waveforms are small, hence a larger bucket size
        writer.bucket_size = 4096
        writer.data_format = dict(
            dimension_order="CW",
            component_order="ZNE",
            measurement="velocity",
            sampling_rate=100,
            unit="none/normalized",
            instrument_response="not restituted",
        )

        # Split by position within each group of ten consecutive traces:
        # six go to train, one to dev and three to test
        split_cycle = ("train",) * 6 + ("dev",) + ("test",) * 3

        with h5py.File(raw_file, "r") as fin:
            waveforms = fin["X"]
            writer.set_total(waveforms.shape[0])
            labels = fin["Y"][()]

            block = None
            for idx in range(waveforms.shape[0]):
                offset = idx % blocksize
                if offset == 0:
                    # Start of a new block of waveforms
                    block = waveforms[idx : idx + blocksize]
                # Transpose the waveform, then reorder the components to ZNE
                trace = block[offset].T[[2, 0, 1]]

                metadata = {"split": split_cycle[idx % 10]}

                label = labels[idx]
                if label == 0:
                    # Trace with a P pick
                    metadata["trace_category"] = "earthquake"
                    metadata["trace_p_arrival_sample"] = 300
                    metadata["trace_p_status"] = "manual"
                elif label == 1:
                    # Trace with an S pick
                    metadata["trace_category"] = "earthquake"
                    metadata["trace_s_arrival_sample"] = 300
                    metadata["trace_s_status"] = "manual"
                else:
                    metadata["trace_category"] = "noise"

                writer.add_trace(metadata, trace)

        if cleanup:
            shutil.rmtree(raw_dir)



# TODO: Write to Men-Andrin Meier regarding zero metadata columns, time format, split format

[docs]
class Meier2019JGR(WaveformBenchmarkDataset):
    """
    Southern Californian part of the dataset from Meier et al. (2019).
    Note that, because the Japanese data is missing,
    noise samples are massively overrepresented.

    Meier, M.-A., Ross, Z. E., Ramachandran, A., Balakrishna, A.,
    Nair, S., Kundzicz, P., et al. (2019). Reliable real‐time
    seismic signal/noise discrimination with machine learning.
    Journal of Geophysical Research: Solid Earth, 124.
    https://doi.org/10.1029/2018JB016661
    """

    def __init__(self, **kwargs):
        super().__init__(
            citation=(
                "Meier, M.-A., Ross, Z. E., Ramachandran, A., Balakrishna, A., "
                "Nair, S., Kundzicz, P., et al. (2019). Reliable real‐time "
                "seismic signal/noise discrimination with machine learning. "
                "Journal of Geophysical Research: Solid Earth, 124. "
                "https://doi.org/10.1029/2018JB016661"
            ),
            repository_lookup=False,
            **kwargs,
        )

    def _download_dataset(self, writer, cleanup=False, blocksize=2**14):
        """
        Fetches the file of the original publication and converts it

        :param writer: WaveformWriter
        :param cleanup: If true, the original files are removed after the conversion. Defaults to false.
        :param blocksize: Number of waveforms read from disk in one go.
        :return:
        """

        raw_dir = self.path / "original"
        raw_dir.mkdir(parents=True, exist_ok=True)

        data_url = "https://service.scedc.caltech.edu/ftp/meier_etal_2019_jgr/onsetWforms_meier19jgr_pub1_0_woJP.h5"
        filename = data_url.rsplit("/", 1)[-1]
        raw_file = raw_dir / filename

        def fetch(target):
            seisbench.util.download_http(
                data_url,
                target,
                desc=f"Downloading file {filename}",
            )

        # callback_if_uncached is only used for its cache mechanism here.
        # Concurrent accesses are already controlled by the callback_if_uncached call
        # wrapping _download_dataset, so force=True is considered safe.
        seisbench.util.callback_if_uncached(raw_file, fetch, force=True)

        # The individual waveforms are small, hence a larger bucket size
        writer.bucket_size = 4096
        writer.data_format = dict(
            dimension_order="CW",
            component_order="ZNE",
            measurement="velocity",
            sampling_rate=100,
            unit="mps",
            instrument_response="gain corrected",
        )

        # Groups of the original file in processing order, each paired with its trace category
        groups = (
            ("quake", "earthquake (local)"),
            ("noise", "noise"),
            ("tele", "earthquake (teleseismic)"),
        )

        with h5py.File(raw_file, "r") as fin:
            writer.set_total(
                sum(fin[f"{group}/wforms"].shape[1] for group, _ in groups)
            )

            for group, category in groups:
                gin = fin[group]
                meta_features = gin["numMeta"][()]
                waveforms = gin["wforms"]

                block = None
                for idx in range(meta_features.shape[1]):
                    offset = idx % blocksize
                    if offset == 0:
                        # Start of a new block of waveforms
                        block = waveforms[:, idx : idx + blocksize]
                    # Pick the waveform and reorder the components to ZNE
                    trace = block[:, offset][[2, 0, 1]]

                    # TODO: Read/define split - no "split" entry is written so far
                    row = meta_features[:, idx]

                    # Row layout as used below. Entry 5 (pickIndex) is ignored.
                    # Entries 8-10 (PGA, PGV, PGD) and 12 (back azimuth) are consistently zero
                    # and therefore skipped. The format of entry 11 (origin time) is unclear.
                    if group != "noise":
                        metadata = {
                            "trace_category": category,
                            "source_magnitude": row[0],
                            "path_hyp_distance_km": row[1],
                            "source_depth_km": row[2],
                            "trace_snr_db": row[3],
                            "trace_record_id": row[4],
                            "station_latitude_deg": row[6],
                            "station_longitude_deg": row[7],
                            "source_origin_time": row[11],
                            "trace_p_arrival_sample": 201,
                            "trace_p_status": "manual",
                        }
                    else:
                        # For noise, entries 6 and 7 (station latitude and longitude)
                        # are consistently zero as well and skipped, too.
                        metadata = {
                            "trace_category": category,
                            "trace_snr_db": row[3],
                            "trace_record_id": row[4],
                            "source_origin_time": row[11],
                        }

                    writer.add_trace(metadata, trace)

        if cleanup:
            shutil.rmtree(raw_dir)