Source code for scatcluster.analysis.predictions

"""Predictions analysis module."""
import os

import matplotlib.pyplot as plt
import numpy as np
import obspy
import pandas as pd
from matplotlib import dates as mdates
from scipy.spatial.distance import euclidean
from tqdm import tqdm

from scatcluster.helper import COLORS


class Predictions:
    """Helpers for loading, ranking and plotting cluster predictions.

    The methods shown here read attributes that this class does not assign
    itself (``data_savepath``, ``data_network``, ``data_station``,
    ``data_location``, ``network_name``, ``ica``, ``ica_features``,
    ``data_times``, ``dendrogram_timestamps``) and call
    ``self.load_data_times()``; these are expected to be provided elsewhere.
    """

    def _clusters_path(self, ica_n_components, n_clusters) -> str:
        """Build the path of the ``.npz`` file holding the cluster predictions."""
        # NOTE: plain string concatenation - ``data_savepath`` must already end
        # with a path separator for this to point inside the save directory.
        return (f'{self.data_savepath}clustering/{self.data_network}_{self.data_station}_{self.data_location}_'
                f'{self.network_name}_ICA_{ica_n_components}_clusters_{n_clusters}.npz')

    def identify_predicted_cluster_from_time_window(self, time_window: str) -> int:
        """Identify the predicted cluster for a provided time window.

        Args:
            time_window (str): Time to look up, in a format understood by
                ``obspy.UTCDateTime`` (for example "YYYY-MM-DD HH:MM:SS").

        Raises:
            ValueError: If no entry of ``dendrogram_timestamps`` is earlier
                than or equal to ``time_window``.

        Returns:
            int: Predicted cluster id of the window the time falls into.
        """
        time_window_num = mdates.date2num(obspy.UTCDateTime(time_window))
        # Number of windows starting at or before the requested time.
        # NOTE(review): using this count as an index assumes the timestamps are
        # sorted in ascending order - confirm against the code that fills them.
        position_in_time = int(np.count_nonzero(self.dendrogram_timestamps <= time_window_num))
        # A count of exactly 1 is valid: the time lies in the first window.
        if position_in_time < 1:
            raise ValueError('Provided time_window is not valid')
        return self.dendrogram_predictions[position_in_time - 1]

    def df_times_for_predictions(self, n_clusters, cluster_rank=False) -> pd.DataFrame:
        """Build a dataframe of window times and their cluster predictions.

        Loads the stored predictions for ``n_clusters`` into
        ``self.dendrogram_predictions`` and stores the resulting dataframe in
        ``self.pd_times_preds``.

        Args:
            n_clusters (int): Number of clusters of the stored prediction to load.
            cluster_rank (bool, optional): Whether to add a ``cluster_rank``
                column holding the inter-cluster rank. Defaults to False.

        Raises:
            ValueError: If no predictions file exists for ``n_clusters``.

        Returns:
            pd.DataFrame: Columns ``times_unix``, ``times_YYYYMMDD`` and
            ``predictions`` (plus ``cluster_rank`` when requested).
        """
        self.load_data_times()

        clusters_path = self._clusters_path(self.ica.n_components, n_clusters)
        if not os.path.exists(clusters_path):
            raise ValueError(
                f'Clusters of size {n_clusters} does not exist. Kindly choose another n_clusters or compute using'
                f' "single_dendrogram"')
        # The context manager closes the archive; the array is fully read on access.
        with np.load(clusters_path) as archive:
            self.dendrogram_predictions = archive['predictions']

        pd_times_preds = pd.DataFrame({
            'times_unix': self.data_times,
            'times_YYYYMMDD': [mdates.num2date(x) for x in self.data_times],
            'predictions': self.dendrogram_predictions
        })

        if cluster_rank:
            print('Calculating inter-cluster rank')
            rank_by_time = self._rank_by_time()
            # Times without a computed rank keep the default rank 0.
            pd_times_preds['cluster_rank'] = np.array(
                [rank_by_time.get(t, 0) for t in pd_times_preds['times_unix']], dtype=np.int64)

        self.pd_times_preds = pd_times_preds
        return self.pd_times_preds

    def _rank_by_time(self) -> dict:
        """Map each window timestamp to its distance rank within its cluster.

        Rank 0 is the sample closest (Euclidean distance) to the cluster's
        per-feature median; higher ranks are farther away.
        """
        rank_by_time = {}
        for cluster in np.unique(self.dendrogram_predictions):
            within_cluster = self.dendrogram_predictions == cluster
            cluster_samples = self.ica_features[within_cluster]
            cluster_times = self.dendrogram_timestamps[within_cluster]

            # The per-feature median serves as the cluster centre.
            centroid = np.median(cluster_samples, axis=0)
            distances = np.array([euclidean(sample, centroid) for sample in cluster_samples])

            # Order the cluster's timestamps from closest to farthest sample.
            sorted_times = cluster_times[np.argsort(distances)]
            print(f'Processing cluster {cluster} of size {len(sorted_times)}')
            # Later entries overwrite earlier ones for a repeated timestamp.
            rank_by_time.update((t, rank) for rank, t in enumerate(sorted_times))
        return rank_by_time

    def plot_prediction_occurance(self, n_clusters):
        """Plot when each cluster occurs as vertical lines, one row per cluster.

        Args:
            n_clusters (int): Number of clusters of the prediction to load.
                Only used when times or predictions are not loaded yet;
                already-loaded predictions are plotted as they are.
        """
        times_missing = getattr(self, 'data_times', None) is None
        predictions_missing = getattr(self, 'dendrogram_predictions', None) is None
        if times_missing or predictions_missing:
            self.pd_times_preds = self.df_times_for_predictions(n_clusters)

        df_preds = pd.DataFrame({
            'DATES': [mdates.num2date(x) for x in self.data_times],
            'predictions': self.dendrogram_predictions
        })
        for clust in np.unique(df_preds['predictions']):
            dt = df_preds.loc[df_preds['predictions'] == clust]
            # Each cluster gets a unit-height band centred on its id.
            plt.vlines(dt['DATES'], clust - 0.5, clust + 0.5, color=COLORS[clust])
        plt.yticks(range(1, df_preds['predictions'].max() + 1))
        plt.ylabel('Cluster')
        plt.show()

    def preload_predictions(self, ica_n_components, n_clusters):
        """Load precomputed predictions from a NumPy ``.npz`` file.

        Args:
            ica_n_components (int): Number of ICA components used for the prediction.
            n_clusters (int): Number of clusters used for the prediction.

        Returns:
            The object returned by ``np.load`` for the predictions file. For an
            ``.npz`` archive this is an ``NpzFile`` whose ``'predictions'``
            entry holds the cluster ids; the caller should close it when done.

        Raises:
            FileNotFoundError: If the specified file does not exist.
        """
        return np.load(self._clusters_path(ica_n_components, n_clusters))