Source code for seisbench.data.cwa

import os
import tarfile
from abc import ABC

import pandas as pd

import seisbench
import seisbench.util

from .base import WaveformBenchmarkDataset

try:
    from huggingface_hub import hf_hub_download
except ModuleNotFoundError:
    hf_hub_download = None


class CWABase(WaveformBenchmarkDataset, ABC):
    """
    An abstract class for downloading datasets.
    The CWA dataset comprises data from two seismographic networks: CWASN and TSMIP.
    The dataset spans from 2011 to 2021 and primarily includes P and S wave arrivals.
    Additionally, a subset of noise data is provided.

    Concrete subclasses must set ``src_repo_name`` to the Hugging Face Hub
    dataset repository the archives are fetched from; instantiation fails
    with an ``AssertionError`` otherwise.
    """

    # Adjacent string literals are concatenated verbatim, so every fragment
    # except the last carries its own trailing space.
    citation = (
        "Kuan-Wei Tang, Kuan-Yu Chen, Da-Yi Chen, Tai-Lin Chin, and Ting-Yu Hsu. (2024) "
        "The CWA Benchmark: A Seismic Dataset from Taiwan for Seismic Research. "
        "Seismological Research Letters 2024. "
        "doi: https://doi.org/10.1785/0220230393"
    )

    # Maps a chunk name to the archive in the source repository containing it.
    # Several yearly chunks share one multi-year archive.
    chunk2file = {
        "_2011": "merge2011_2014.tar.gz",
        "_2012": "merge2011_2014.tar.gz",
        "_2013": "merge2011_2014.tar.gz",
        "_2014": "merge2011_2014.tar.gz",
        "_2015": "merge2015_2018.tar.gz",
        "_2016": "merge2015_2018.tar.gz",
        "_2017": "merge2015_2018.tar.gz",
        "_2018": "merge2015_2018.tar.gz",
        "_2019": "merge2019_2021.tar.gz",
        "_2020": "merge2019_2021.tar.gz",
        "_2021": "merge2019_2021.tar.gz",
        "_noise1": "noise_chunk1.tar.gz",
        "_noise2": "noise_chunk2.tar.gz",
    }

    # Hugging Face Hub repository id; must be overwritten by subclasses.
    src_repo_name = None

    def __init__(self, **kwargs):
        """
        Initialize the dataset.

        :param kwargs: Keyword arguments passed on to the parent constructor.
        :raises AssertionError: If the subclass did not set ``src_repo_name``.
        """
        assert self.src_repo_name is not None, (
            "Subclass needs to overwrite src_repo_name"
        )
        super().__init__(citation=self.citation, repository_lookup=True, **kwargs)

    def _download_dataset(self, writer, chunk, **kwargs):
        """
        Download a single chunk by delegating to :py:func:`_download_pipeline`.

        :param writer: Forwarded to the pipeline, which does not use it.
        :param chunk: Chunk name, i.e., a key of ``chunk2file``.
        :param kwargs: Accepted for interface compatibility; not forwarded.
        """
        self._download_pipeline(writer, chunk)

    @staticmethod
    def _ensure_hf_hub_download_available():
        """
        Verify that the optional ``huggingface_hub`` dependency was importable.

        :raises AssertionError: If ``huggingface_hub`` is not installed.
        """
        assert hf_hub_download is not None, (
            "To download this dataset, huggingface_hub must be installed. "
            "For installation instructions, "
            "see https://huggingface.co/docs/huggingface_hub/installation"
        )

    @staticmethod
    def _download_from_huggingfaceHub(path, src_file, file_type, repo_name):
        """
        Download one file from a Hugging Face Hub dataset repository.

        :param path: Local directory the file is downloaded into.
        :param src_file: Name of the file within the repository.
        :param file_type: Human-readable description, only used for logging.
        :param repo_name: Id of the dataset repository on the Hugging Face Hub.
        :raises AssertionError: If ``huggingface_hub`` is not installed.
        """
        CWABase._ensure_hf_hub_download_available()
        seisbench.logger.warning(
            f"Start downloading {file_type} from Huggingface Hub: {repo_name}"
        )

        # download from huggingface hub
        hf_hub_download(
            repo_id=repo_name,
            filename=src_file,
            repo_type="dataset",
            local_dir=path,
        )

    def tar_file(self, filepath, savepath):
        """
        Extract a gzip-compressed tar archive.

        Extraction goes through ``seisbench.util.safe_extract_tar`` instead of
        calling ``extractall`` on the archive directly.

        :param filepath: Path of the ``.tar.gz`` archive.
        :param savepath: Directory to extract the archive into.
        """
        with tarfile.open(filepath, "r:gz") as tar:
            seisbench.util.safe_extract_tar(tar, savepath)

    @classmethod
    def available_chunks(cls, force=False, wait_for_file=False):
        """
        Return the chunk names listed in the dataset's ``chunks`` file.

        The ``chunks`` file is downloaded from the source repository if it is
        not yet present in the dataset directory.

        :param force: Not used by this implementation.
        :param wait_for_file: Not used by this implementation.
        :return: List of chunk names, one per non-blank line of the file.
        """
        path = cls._path_internal()
        chunks_path = path / "chunks"

        if not chunks_path.is_file():
            cls._download_from_huggingfaceHub(
                path, "chunks", "chunks information", cls.src_repo_name
            )

        with open(chunks_path, "r") as f:
            chunks = [x for x in f.read().split("\n") if x.strip()]

        return chunks

    def _download_pipeline(self, writer, chunk, **kwargs):
        """
        Download, unpack and post-process the archive containing ``chunk``.

        Steps: fetch the archive, extract it into the dataset directory, add a
        ``split`` column to every extracted metadata CSV that lacks one, and
        delete the archive.

        :param writer: Not used.
        :param chunk: Chunk name, e.g., ``"_2011"``; must be a key of ``chunk2file``.
        :param kwargs: Not used.
        :raises KeyError: If ``chunk`` is not a key of ``chunk2file``.
        """
        # chunk: _2011 (for example)
        to_download = self.chunk2file[chunk]
        path = self.path

        self._download_from_huggingfaceHub(
            path, to_download, "source_file", self.src_repo_name
        )

        seisbench.logger.warning("Unarchiving. This might take a few minutes.")
        self.tar_file(path / to_download, path)

        # Visits every metadata CSV in the directory, not only those from this
        # archive; _add_split leaves files that already have a split untouched.
        for file in path.iterdir():
            if file.name.startswith("metadata") and file.name.endswith(".csv"):
                self._add_split(file)

        seisbench.logger.warning("Remove the source file.")
        os.remove(path / to_download)

    @staticmethod
    def _add_split(metadata_path):
        """
        Add a ``split`` column to a metadata CSV, rewriting the file in place.

        The split is derived from the first two characters of ``trace_name``,
        read as the last two digits of a year in the 2000s: up to 2018 is
        ``"train"``, 2019 is ``"dev"``, anything later is ``"test"``.
        Files that already contain a ``split`` column are left unchanged.

        :param metadata_path: Path of the metadata CSV file.
        """

        def split_by_year(trace_name):
            # The trace name starts with a two-digit year, e.g. "11..." -> 2011
            year = int("20" + trace_name[:2])
            if year <= 2018:
                return "train"
            elif year == 2019:
                return "dev"
            else:
                return "test"

        metadata = pd.read_csv(metadata_path)

        if "split" in metadata.columns:
            return  # No action required

        metadata["split"] = metadata["trace_name"].apply(split_by_year)
        metadata.to_csv(metadata_path, index=False)
class CWA(CWABase):
    """
    CWA dataset - Events and traces.

    Data is fetched from the ``NLPLabNTUST/Merged-CWA`` dataset repository.
    """

    src_repo_name = "NLPLabNTUST/Merged-CWA"
class CWANoise(CWABase):
    """
    CWA dataset - Noise samples.

    Data is fetched from the ``NLPLabNTUST/Merged-CWA-Noise`` dataset repository.
    """

    src_repo_name = "NLPLabNTUST/Merged-CWA-Noise"