From ffa19592f98e58a11bb0aca0d091a83062b59231 Mon Sep 17 00:00:00 2001 From: Victor Mylle Date: Tue, 28 Nov 2023 15:35:35 +0000 Subject: [PATCH] Rewrote the NRVDataset to be cleaner --- src/data/dataset.py | 228 +++++------------- src/losses/crps_metric.py | 30 ++- .../autoregressive_quantiles.py | 6 +- 3 files changed, 83 insertions(+), 181 deletions(-) diff --git a/src/data/dataset.py b/src/data/dataset.py index a09497b..91ffcf9 100644 --- a/src/data/dataset.py +++ b/src/data/dataset.py @@ -1,6 +1,7 @@ import torch from torch.utils.data import Dataset, DataLoader import pandas as pd +import numpy as np class NrvDataset(Dataset): @@ -19,37 +20,44 @@ class NrvDataset(Dataset): # reset dataframe index self.dataframe.reset_index(drop=True, inplace=True) - self.nrv = torch.tensor(dataframe["nrv"].to_numpy(), dtype=torch.float32) - self.load_forecast = torch.tensor( - dataframe["load_forecast"].to_numpy(), dtype=torch.float32 - ) - self.total_load = torch.tensor( - dataframe["total_load"].to_numpy(), dtype=torch.float32 - ) - self.pv_gen_forecast = torch.tensor( - dataframe["pv_forecast"].to_numpy(), dtype=torch.float32 - ) - self.wind_gen_forecast = torch.tensor( - dataframe["wind_forecast"].to_numpy(), dtype=torch.float32 - ) - - self.quarter = torch.tensor( - dataframe["quarter"].to_numpy(), dtype=torch.float32 - ) - - self.day_of_week = torch.tensor( - dataframe["day_of_week"].to_numpy(), dtype=torch.float32 - ) - self.sequence_length = sequence_length self.predict_sequence_length = predict_sequence_length self.samples_to_skip = self.skip_samples() total_indices = set( - range(len(self.nrv) - self.sequence_length - self.predict_sequence_length) + range(len(self.dataframe) - self.sequence_length - self.predict_sequence_length) ) self.valid_indices = sorted(list(total_indices - set(self.samples_to_skip))) + self.history_features = [] + if self.data_config.LOAD_HISTORY: + self.history_features.append("total_load") + if self.data_config.PV_HISTORY: + self.history_features.append("pv_gen_forecast") + if self.data_config.WIND_HISTORY: + self.history_features.append("wind_gen_forecast") + + self.forecast_features = [] + if self.data_config.LOAD_FORECAST: + self.forecast_features.append("load_forecast") + if self.data_config.PV_FORECAST: + self.forecast_features.append("pv_gen_forecast") + if self.data_config.WIND_FORECAST: + self.forecast_features.append("wind_gen_forecast") + + # add time feature to dataframe + time_feature = np.array([0] * len(self.dataframe)) + if self.data_config.QUARTER: + time_feature += self.dataframe["quarter"] + + if self.data_config.DAY_OF_WEEK: + d_w = self.dataframe["day_of_week"] + if self.data_config.QUARTER: + d_w *= 96 + time_feature += d_w + + self.dataframe["time_feature"] = time_feature + def skip_samples(self): nan_rows = self.dataframe[self.dataframe.isnull().any(axis=1)] nan_indices = nan_rows.index @@ -80,88 +88,41 @@ class NrvDataset(Dataset): def __len__(self): return len(self.valid_indices) + def _get__all_data(self, idx: int): + history_df = self.dataframe.iloc[idx : idx + self.sequence_length] + forecast_df = self.dataframe.iloc[ + idx + self.sequence_length : idx + self.sequence_length + self.predict_sequence_length + ] + return history_df, forecast_df + def __getitem__(self, idx): actual_idx = self.valid_indices[idx] - features = [] + + history_df, forecast_df = self._get__all_data(actual_idx) + + # get nrv history features + nrv_features = torch.tensor(history_df[["nrv"]].values).reshape(-1) - if self.data_config.NRV_HISTORY: - nrv = self.nrv[actual_idx : actual_idx + self.sequence_length] - features.append(nrv.view(-1)) + # get history featues + history_features = history_df[self.history_features].values - if self.data_config.LOAD_HISTORY: - load_history = self.total_load[ - actual_idx : actual_idx + self.sequence_length - ] - features.append(load_history.view(-1)) + # combine the history features to one tensor (first one feature, then the next one, etc.) + history_features = torch.tensor(history_features).reshape(-1) - if self.data_config.PV_HISTORY: - pv_history = self.pv_gen_forecast[ - actual_idx : actual_idx + self.sequence_length - ] - features.append(pv_history.view(-1)) + # get forecast features + forecast_features = forecast_df[self.forecast_features].values + forecast_features = torch.tensor(forecast_features).view(-1) - if self.data_config.WIND_HISTORY: - wind_history = self.wind_gen_forecast[ - actual_idx : actual_idx + self.sequence_length - ] - features.append(wind_history.view(-1)) + # add last time feature of the history + time_feature = history_df["time_feature"].iloc[-1] - if self.data_config.LOAD_FORECAST: - load_forecast = self.load_forecast[ - actual_idx - + self.sequence_length : actual_idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(load_forecast.view(-1)) - - if self.data_config.PV_FORECAST: - pv_forecast = self.pv_gen_forecast[ - actual_idx - + self.sequence_length : actual_idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(pv_forecast.view(-1)) - - if self.data_config.WIND_FORECAST: - wind_forecast = self.wind_gen_forecast[ - actual_idx - + self.sequence_length : actual_idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(wind_forecast.view(-1)) - - ### Time Features ### - time_feature = 0 - if self.data_config.QUARTER: - time_feature += self.quarter[actual_idx].item() - - if self.data_config.DAY_OF_WEEK: - d_w = self.day_of_week[actual_idx].item() - if self.data_config.QUARTER: - d_w *= 96 - time_feature += d_w - - if time_feature is not None: - features.append(torch.tensor([time_feature])) - - if not features: - raise ValueError( - "No features are configured to be included in the dataset." - ) - - # Concatenate along dimension 0 to create a one-dimensional feature vector - all_features = torch.cat(features, dim=0) + ## all features + all_features = torch.cat( + [nrv_features, history_features, forecast_features, torch.tensor([time_feature])], dim=0 + ) # Target sequence, flattened if necessary - nrv_target = self.nrv[ - actual_idx - + self.sequence_length : actual_idx - + self.sequence_length - + self.predict_sequence_length - ].view(-1) + nrv_target = forecast_df["nrv"].values # check if nan values are present if torch.isnan(all_features).any(): @@ -169,78 +130,21 @@ class NrvDataset(Dataset): print(f"Actual index: {actual_idx}") raise ValueError("There are nan values in the features.") + # all features and target to float + all_features = all_features.float() + + # to tensors + nrv_target = torch.tensor(nrv_target).float() return all_features, nrv_target, idx def random_day_autoregressive(self, idx: int): - idx = self.valid_indices[idx] - features = [] + all_features, nrv_target, _ = self.__getitem__(idx) - # we already have the NRV history with the newly predicted values, so we don't need to include the last 96 values - if self.data_config.LOAD_HISTORY: - load_history = self.total_load[idx : idx + self.sequence_length] - features.append(load_history.view(-1)) + # remove the first 96 values of the features (the nrv history) + all_features = all_features[self.sequence_length :] - if self.data_config.PV_HISTORY: - pv_history = self.pv_gen_forecast[idx : idx + self.sequence_length] - features.append(pv_history.view(-1)) + return all_features, nrv_target - if self.data_config.WIND_HISTORY: - wind_history = self.wind_gen_forecast[idx : idx + self.sequence_length] - features.append(wind_history.view(-1)) - - if self.data_config.LOAD_FORECAST: - load_forecast = self.load_forecast[ - idx - + self.sequence_length : idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(load_forecast.view(-1)) - - if self.data_config.PV_FORECAST: - pv_forecast = self.pv_gen_forecast[ - idx - + self.sequence_length : idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(pv_forecast.view(-1)) - - if self.data_config.WIND_FORECAST: - wind_forecast = self.wind_gen_forecast[ - idx - + self.sequence_length : idx - + self.sequence_length - + self.predict_sequence_length - ] - features.append(wind_forecast.view(-1)) - - ### Time Features ### - time_feature = 0 - if self.data_config.QUARTER: - time_feature += self.quarter[idx] - - if self.data_config.DAY_OF_WEEK: - d_w = self.day_of_week[idx].item() - if self.data_config.QUARTER: - d_w *= 96 - time_feature += d_w - - if time_feature is not None: - features.append(torch.tensor([time_feature])) - - target = self.nrv[ - idx - + self.sequence_length : idx - + self.sequence_length - + self.predict_sequence_length - ] - - if len(features) == 0: - return None, target - - all_features = torch.cat(features, dim=0) - return all_features, target def get_batch(self, idx: list): features = [] diff --git a/src/losses/crps_metric.py b/src/losses/crps_metric.py index af55618..2821f64 100644 --- a/src/losses/crps_metric.py +++ b/src/losses/crps_metric.py @@ -1,30 +1,28 @@ import torch from torch import nn import torch +from properscoring import crps_ensemble class CRPSLoss(nn.Module): - def __init__(self, quantiles): + def __init__(self): super(CRPSLoss, self).__init__() - if not torch.is_tensor(quantiles): - quantiles = torch.tensor(quantiles, dtype=torch.float32) - self.quantiles_tensor = quantiles - def forward(self, preds, target): + # if tensor, to cpu + if isinstance(preds, torch.Tensor): + preds = preds.detach().cpu() + + if isinstance(target, torch.Tensor): + target = target.detach().cpu() + + # target squeeze -1 + target = target.squeeze(-1) + # preds shape: [batch_size, num_quantiles] - - # unsqueeze target - # target = target.unsqueeze(-1) - - mask = (preds > target).float() - self.quantiles_tensor = self.quantiles_tensor.to(preds.device) - test = self.quantiles_tensor - mask - # square them - test = test * test - crps = torch.trapz(test, x=preds) + scores = crps_ensemble(target, preds) # mean over batch - crps = torch.mean(crps) + crps = scores.mean() return crps diff --git a/src/training_scripts/autoregressive_quantiles.py b/src/training_scripts/autoregressive_quantiles.py index 779d912..28a90b8 100644 --- a/src/training_scripts/autoregressive_quantiles.py +++ b/src/training_scripts/autoregressive_quantiles.py @@ -16,7 +16,7 @@ from src.models.time_embedding_layer import TimeEmbedding #### ClearML #### clearml_helper = ClearMLHelper(project_name="Thesis/NrvForecast") -task = clearml_helper.get_task(task_name="None") +task = clearml_helper.get_task(task_name="Autoregressive Quantile Regression") #### Data Processor #### @@ -63,8 +63,8 @@ trainer = AutoRegressiveQuantileTrainer( debug=True, ) trainer.add_metrics_to_track( - [PinballLoss(quantiles), MSELoss(), L1Loss(), CRPSLoss(quantiles)] + [PinballLoss(quantiles), MSELoss(), L1Loss(), CRPSLoss()] ) trainer.early_stopping(patience=10) trainer.plot_every(5) -trainer.train(task=task, epochs=epochs, remotely=True) \ No newline at end of file +trainer.train(task=task, epochs=epochs, remotely=False) \ No newline at end of file